library("data.table")
library("dplyr")
library("lubridate")
library("ggplot2")
library("gridExtra")
library("kableExtra")
library("pander")
library("stringr")
library("tm")
library("SnowballC")
library("wordcloud")
library("RColorBrewer")
accidents <- read.csv("NYPD_Motor_Vehicle_Collisions.csv")
accidents = data.table(accidents) ## 1441945 lines
accidents = na.omit(accidents) ## 986984 lines

World cloud for the first contribution factor of vehicle

data1 = accidents %>% filter(!CONTRIBUTING.FACTOR.VEHICLE.1 %in% 
                               c("Unspecified", "1", "80", "", " "))
reason1 = data.frame(table(data1$CONTRIBUTING.FACTOR.VEHICLE.1))

wordcloud(words = reason1$Var1, freq = reason1$Freq#, type ="file"
          , scale = c(2,0.8), min.freq = 1
          , max.words = Inf, random.order = F, rot.per = 0.1
          , colors = brewer.pal(8, "Dark2"))

back to top

Please note: Driver inattention or distraction is the most important factor to vehicle collisions. The second important factor is the Failure to Yield Right-of-way. Here, the size of word shows the importance.





Trend of accidents by date

counted <- accidents %>% group_by(DATE) %>% count(TIME)
counted <- counted %>% mutate(dates = as.Date(DATE, format = "%m/%d/%Y"))
counted_V2 <- counted %>% group_by(dates) %>% summarise(incidents = sum(n))
ggplot(counted_V2, aes(dates, incidents)) + 
  geom_line() + 
  theme_minimal() + 
  xlab("Dates") + ylab("Incidents") + 
  ggtitle("Number of Accidents from 2012 to 2018")

back to top

Please note: Less accidents recent years than before, normally more accidents during the summer



Relationship between type of vehicle and accident

Using first vehicle code type and contribution factor 1

type1 <- accidents %>% group_by(VEHICLE.TYPE.CODE.1) %>% 
  count(CONTRIBUTING.FACTOR.VEHICLE.1) %>%
  arrange(desc(n))
type1[type1 == ""] <- NA
type1 <- na.omit(type1)
type1 <- data.frame(type1)
temp1 <- filter(type1, CONTRIBUTING.FACTOR.VEHICLE.1 != "Unspecified")
temp11 <- filter(type1, 
                 CONTRIBUTING.FACTOR.VEHICLE.1 != "Driver Inattention/Distraction" &
                   CONTRIBUTING.FACTOR.VEHICLE.1 != "Unspecified" & n > 50)

p1 <- ggplot(temp11, aes(x = reorder(CONTRIBUTING.FACTOR.VEHICLE.1, n), y = n, 
                         fill = VEHICLE.TYPE.CODE.1)) + 
  geom_bar(stat = "identity") + theme_minimal() +
  theme(axis.text.y = element_text(size = 6), 
        legend.key.width = unit(0.2,"cm"), legend.key.height=unit(0.2,"cm"), 
        legend.position = c(.55, .03), legend.justification = c(0, 0),
        legend.background = element_rect(fill=alpha('white', 0.4)),
        legend.text = element_text(size = 5.5),
        legend.title = element_text(size = 6)) + 
  xlab("Factor") + ylab("Incidents") + 
  ggtitle("Incidents by Factor for Vehicle 1") +
  coord_flip() + 
  guides(fill = guide_legend(ncol = 1, title = "Vehicle Type")) 
p1

temp111 <- filter(type1, 
                  CONTRIBUTING.FACTOR.VEHICLE.1 == "Driver Inattention/Distraction",
                  n > 500)
p11 <- ggplot(temp111, aes(x = reorder(VEHICLE.TYPE.CODE.1, n), y = n)) + 
  geom_bar(stat = "identity") + theme_minimal() + 
  theme(axis.text.y = element_text(size = 8)) + 
  xlab("Vehicle Type") + ylab("Incidents") + 
  ggtitle("Driver Inattention by Vehicle Type") + coord_flip()
p11

back to top

Passenger vehicle is the type of car who contribute the biggest volume in accidents, especially for driver inattention or distraction.



Using second vehicle code type and contribution factor 2

type2 <- accidents %>% group_by(VEHICLE.TYPE.CODE.2) %>% 
  count(CONTRIBUTING.FACTOR.VEHICLE.2) %>% arrange(desc(n))
type2[type2 == ""] <- NA
type2 <- na.omit(type2)
type2 <- data.frame(type2)
temp2 <- filter(type2, CONTRIBUTING.FACTOR.VEHICLE.2 != "Unspecified" & n > 150)

p2 <- ggplot(temp2, aes(x = reorder(CONTRIBUTING.FACTOR.VEHICLE.2,n), y = n, 
                        fill = VEHICLE.TYPE.CODE.2)) + 
  geom_bar(stat = "identity") + theme_minimal() +
  theme(axis.text.y = element_text(size = 8), 
        legend.key.width = unit(0.2,"cm"), legend.key.height=unit(0.2,"cm"), 
        legend.position = c(.5, .05), legend.justification = c(0, 0),
        legend.background = element_rect(fill=alpha('white', 0.6)),
        legend.text = element_text(size = 5.5),
        legend.title = element_text(size = 6)) + 
  xlab("Factor") + ylab("Incidents") + 
  ggtitle("Incidents by Factor for Vehicle 2") +
  coord_flip() + 
  guides(fill = guide_legend(ncol = 1, title = "Vehicle Type")) 
p2

back to top

Similar to the first vehicle type code, driver inattention makes up the majority of accident factors



Using third vehicle code type and contributing factor 3

type3 <- accidents %>% group_by(VEHICLE.TYPE.CODE.3) %>% 
  count(CONTRIBUTING.FACTOR.VEHICLE.3) %>% 
  arrange(desc(n))
type3[type3 == ""] <- NA
type3 <- na.omit(type3)
type3 <- data.frame(type3)
temp3 <- filter(type3, CONTRIBUTING.FACTOR.VEHICLE.3 != "Unspecified")

p3 <- ggplot(temp3, aes(x = reorder(CONTRIBUTING.FACTOR.VEHICLE.3,n), y = n, 
                        fill = VEHICLE.TYPE.CODE.3)) + 
  geom_bar(stat = "identity") + theme_minimal() + 
  theme(axis.text.y = element_text(size = 5.5), 
       legend.key.width = unit(0.2,"cm"), legend.key.height=unit(0.2,"cm") , 
       legend.position = c(0.6,.05), legend.justification = c(0, 0),
       legend.background = element_rect(fill=alpha('white', 0.4)),
       legend.text = element_text(size = 5),
       legend.title = element_text(size = 6)) + 
  xlab("Factor") + ylab("Incidents") + 
  ggtitle("Incidents by Factor for Vehicle 3") + coord_flip() + 
  guides(fill = guide_legend(ncol = 1, title = "Vehicle Type"))  
p3

back to top

Past two vehicles, the third vehicle involved in the accident is usually caused by the other vehicles already involved



Using fourth vehicle code type and contributing factor 4

type4 <- accidents %>% group_by(VEHICLE.TYPE.CODE.4) %>% 
  count(CONTRIBUTING.FACTOR.VEHICLE.4) %>%  arrange(desc(n))
type4[type4 == ""] <- NA
type4 <- na.omit(type4)
type4 <- data.frame(type4)
temp4 <- filter(type4, CONTRIBUTING.FACTOR.VEHICLE.4 != "Unspecified")

p4 <- ggplot(temp4, aes(x = reorder(CONTRIBUTING.FACTOR.VEHICLE.4, n), y = n, 
                        fill = VEHICLE.TYPE.CODE.4)) + 
  geom_bar(stat = "identity") + theme_minimal() +
  theme(axis.text.y = element_text(size = 6), 
        legend.key.width = unit(0.2,"cm"), 
        legend.key.height = unit(0.2,"cm"), 
        legend.position = c(.55, .05), 
        legend.justification = c(0, 0),
        legend.background = element_rect(fill=alpha('white', 0.6)),
        legend.text = element_text(size = 5.5),
        legend.title = element_text(size = 6)) + 
  xlab("Factor") + ylab("Incidents") + ggtitle("Incidents by Factor for Vehicle 4") +
  coord_flip() + 
  guides(fill = guide_legend(ncol = 1, title = "Vehicle Type")) 
p4

back to top

Other vehicles are usually the cause for the fourth vehicle getting involved in an accident



Using the last vehicle code type and contribution factor 5

type5 <- accidents %>% group_by(VEHICLE.TYPE.CODE.5) %>% 
  count(CONTRIBUTING.FACTOR.VEHICLE.5) %>% arrange(desc(n))
type5[type5 == ""] <- NA
type5 <- na.omit(type5)
type5 <- data.frame(type5)
temp5 <- filter(type5, CONTRIBUTING.FACTOR.VEHICLE.5 != "Unspecified")

p5 <- ggplot(temp5, aes(x = reorder(CONTRIBUTING.FACTOR.VEHICLE.5,n), y = n, 
                        fill = VEHICLE.TYPE.CODE.5)) + 
  geom_bar(stat = "identity") + theme_minimal() +
  theme(axis.text.y = element_text(size = 8), 
        legend.key.width = unit(0.2,"cm"), 
        legend.key.height = unit(0.2,"cm"),
        legend.background = element_rect(fill=alpha('white', 0.6)),
        legend.position = c(.57, .05), 
        legend.justification = c(0, 0),
        legend.text = element_text(size = 6),
        legend.title = element_text(size = 6)) + 
  xlab("Factor") + ylab("Incidents") + 
  ggtitle("Incidents by Factor for Vehicle 5") +
  coord_flip() + 
  guides(fill = guide_legend(ncol = 1, title = "Vehicle Type")) 
p5

back to top

Other vehicles are usually the cause for the fourth vehicle getting involved in an accident





Relationship between time and borough for number of person injured

accidents$hour = as.POSIXlt(as.POSIXlt(paste(Sys.Date(), accidents$TIME)))$hour
accidents$daytime = ifelse(accidents$hour >= 7 & accidents$hour < 12
                           , "Morning"
                           , ifelse(accidents$hour >= 12 & accidents$hour < 18
                                    , "Afternoon"
                                    , ifelse(accidents$hour >= 18 & accidents$hour < 24
                                             , "Evening"
                                             , "Night")))


accidents_daytime <- accidents %>% group_by(BOROUGH, daytime) %>% summarise(nb = sum(NUMBER.OF.PERSONS.INJURED, na.rm = TRUE))
ggplot(accidents_daytime, aes(x = daytime, y = BOROUGH, fill = nb)) + geom_tile() + theme_minimal()

back to top

Brooklyn is the most risky area during the whole day, Staten Island is quite safe during afternoon. Night is always less dangerous than the other time in a day.