library("data.table")
library("dplyr")
library("lubridate")
library("ggplot2")
library("gridExtra")
library("kableExtra")
library("pander")
library("stringr")
library("tm")
library("SnowballC")
library("wordcloud")
library("RColorBrewer")accidents <- read.csv("NYPD_Motor_Vehicle_Collisions.csv")
accidents = data.table(accidents) ## 1441945 lines
accidents = na.omit(accidents) ## 986984 linesdata1 = accidents %>% filter(!CONTRIBUTING.FACTOR.VEHICLE.1 %in%
c("Unspecified", "1", "80", "", " "))
reason1 = data.frame(table(data1$CONTRIBUTING.FACTOR.VEHICLE.1))
wordcloud(words = reason1$Var1, freq = reason1$Freq#, type ="file"
, scale = c(2,0.8), min.freq = 1
, max.words = Inf, random.order = F, rot.per = 0.1
, colors = brewer.pal(8, "Dark2"))Please note: Driver inattention or distraction is the most important factor to vehicle collisions. The second important factor is the Failure to Yield Right-of-way. Here, the size of word shows the importance.
counted <- accidents %>% group_by(DATE) %>% count(TIME)
counted <- counted %>% mutate(dates = as.Date(DATE, format = "%m/%d/%Y"))
counted_V2 <- counted %>% group_by(dates) %>% summarise(incidents = sum(n))
ggplot(counted_V2, aes(dates, incidents)) +
geom_line() +
theme_minimal() +
xlab("Dates") + ylab("Incidents") +
ggtitle("Number of Accidents from 2012 to 2018")Please note: Less accidents recent years than before, normally more accidents during the summer
type1 <- accidents %>% group_by(VEHICLE.TYPE.CODE.1) %>%
count(CONTRIBUTING.FACTOR.VEHICLE.1) %>%
arrange(desc(n))
type1[type1 == ""] <- NA
type1 <- na.omit(type1)
type1 <- data.frame(type1)
temp1 <- filter(type1, CONTRIBUTING.FACTOR.VEHICLE.1 != "Unspecified")
temp11 <- filter(type1,
CONTRIBUTING.FACTOR.VEHICLE.1 != "Driver Inattention/Distraction" &
CONTRIBUTING.FACTOR.VEHICLE.1 != "Unspecified" & n > 50)
p1 <- ggplot(temp11, aes(x = reorder(CONTRIBUTING.FACTOR.VEHICLE.1, n), y = n,
fill = VEHICLE.TYPE.CODE.1)) +
geom_bar(stat = "identity") + theme_minimal() +
theme(axis.text.y = element_text(size = 6),
legend.key.width = unit(0.2,"cm"), legend.key.height=unit(0.2,"cm"),
legend.position = c(.55, .03), legend.justification = c(0, 0),
legend.background = element_rect(fill=alpha('white', 0.4)),
legend.text = element_text(size = 5.5),
legend.title = element_text(size = 6)) +
xlab("Factor") + ylab("Incidents") +
ggtitle("Incidents by Factor for Vehicle 1") +
coord_flip() +
guides(fill = guide_legend(ncol = 1, title = "Vehicle Type"))
p1temp111 <- filter(type1,
CONTRIBUTING.FACTOR.VEHICLE.1 == "Driver Inattention/Distraction",
n > 500)
p11 <- ggplot(temp111, aes(x = reorder(VEHICLE.TYPE.CODE.1, n), y = n)) +
geom_bar(stat = "identity") + theme_minimal() +
theme(axis.text.y = element_text(size = 8)) +
xlab("Vehicle Type") + ylab("Incidents") +
ggtitle("Driver Inattention by Vehicle Type") + coord_flip()
p11Passenger vehicle is the type of car who contribute the biggest volume in accidents, especially for driver inattention or distraction.
type2 <- accidents %>% group_by(VEHICLE.TYPE.CODE.2) %>%
count(CONTRIBUTING.FACTOR.VEHICLE.2) %>% arrange(desc(n))
type2[type2 == ""] <- NA
type2 <- na.omit(type2)
type2 <- data.frame(type2)
temp2 <- filter(type2, CONTRIBUTING.FACTOR.VEHICLE.2 != "Unspecified" & n > 150)
p2 <- ggplot(temp2, aes(x = reorder(CONTRIBUTING.FACTOR.VEHICLE.2,n), y = n,
fill = VEHICLE.TYPE.CODE.2)) +
geom_bar(stat = "identity") + theme_minimal() +
theme(axis.text.y = element_text(size = 8),
legend.key.width = unit(0.2,"cm"), legend.key.height=unit(0.2,"cm"),
legend.position = c(.5, .05), legend.justification = c(0, 0),
legend.background = element_rect(fill=alpha('white', 0.6)),
legend.text = element_text(size = 5.5),
legend.title = element_text(size = 6)) +
xlab("Factor") + ylab("Incidents") +
ggtitle("Incidents by Factor for Vehicle 2") +
coord_flip() +
guides(fill = guide_legend(ncol = 1, title = "Vehicle Type"))
p2Similar to the first vehicle type code, driver inattention makes up the majority of accident factors
type3 <- accidents %>% group_by(VEHICLE.TYPE.CODE.3) %>%
count(CONTRIBUTING.FACTOR.VEHICLE.3) %>%
arrange(desc(n))
type3[type3 == ""] <- NA
type3 <- na.omit(type3)
type3 <- data.frame(type3)
temp3 <- filter(type3, CONTRIBUTING.FACTOR.VEHICLE.3 != "Unspecified")
p3 <- ggplot(temp3, aes(x = reorder(CONTRIBUTING.FACTOR.VEHICLE.3,n), y = n,
fill = VEHICLE.TYPE.CODE.3)) +
geom_bar(stat = "identity") + theme_minimal() +
theme(axis.text.y = element_text(size = 5.5),
legend.key.width = unit(0.2,"cm"), legend.key.height=unit(0.2,"cm") ,
legend.position = c(0.6,.05), legend.justification = c(0, 0),
legend.background = element_rect(fill=alpha('white', 0.4)),
legend.text = element_text(size = 5),
legend.title = element_text(size = 6)) +
xlab("Factor") + ylab("Incidents") +
ggtitle("Incidents by Factor for Vehicle 3") + coord_flip() +
guides(fill = guide_legend(ncol = 1, title = "Vehicle Type"))
p3Past two vehicles, the third vehicle involved in the accident is usually caused by the other vehicles already involved
type4 <- accidents %>% group_by(VEHICLE.TYPE.CODE.4) %>%
count(CONTRIBUTING.FACTOR.VEHICLE.4) %>% arrange(desc(n))
type4[type4 == ""] <- NA
type4 <- na.omit(type4)
type4 <- data.frame(type4)
temp4 <- filter(type4, CONTRIBUTING.FACTOR.VEHICLE.4 != "Unspecified")
p4 <- ggplot(temp4, aes(x = reorder(CONTRIBUTING.FACTOR.VEHICLE.4, n), y = n,
fill = VEHICLE.TYPE.CODE.4)) +
geom_bar(stat = "identity") + theme_minimal() +
theme(axis.text.y = element_text(size = 6),
legend.key.width = unit(0.2,"cm"),
legend.key.height = unit(0.2,"cm"),
legend.position = c(.55, .05),
legend.justification = c(0, 0),
legend.background = element_rect(fill=alpha('white', 0.6)),
legend.text = element_text(size = 5.5),
legend.title = element_text(size = 6)) +
xlab("Factor") + ylab("Incidents") + ggtitle("Incidents by Factor for Vehicle 4") +
coord_flip() +
guides(fill = guide_legend(ncol = 1, title = "Vehicle Type"))
p4Other vehicles are usually the cause for the fourth vehicle getting involved in an accident
type5 <- accidents %>% group_by(VEHICLE.TYPE.CODE.5) %>%
count(CONTRIBUTING.FACTOR.VEHICLE.5) %>% arrange(desc(n))
type5[type5 == ""] <- NA
type5 <- na.omit(type5)
type5 <- data.frame(type5)
temp5 <- filter(type5, CONTRIBUTING.FACTOR.VEHICLE.5 != "Unspecified")
p5 <- ggplot(temp5, aes(x = reorder(CONTRIBUTING.FACTOR.VEHICLE.5,n), y = n,
fill = VEHICLE.TYPE.CODE.5)) +
geom_bar(stat = "identity") + theme_minimal() +
theme(axis.text.y = element_text(size = 8),
legend.key.width = unit(0.2,"cm"),
legend.key.height = unit(0.2,"cm"),
legend.background = element_rect(fill=alpha('white', 0.6)),
legend.position = c(.57, .05),
legend.justification = c(0, 0),
legend.text = element_text(size = 6),
legend.title = element_text(size = 6)) +
xlab("Factor") + ylab("Incidents") +
ggtitle("Incidents by Factor for Vehicle 5") +
coord_flip() +
guides(fill = guide_legend(ncol = 1, title = "Vehicle Type"))
p5Other vehicles are usually the cause for the fourth vehicle getting involved in an accident
accidents$hour = as.POSIXlt(as.POSIXlt(paste(Sys.Date(), accidents$TIME)))$hour
accidents$daytime = ifelse(accidents$hour >= 7 & accidents$hour < 12
, "Morning"
, ifelse(accidents$hour >= 12 & accidents$hour < 18
, "Afternoon"
, ifelse(accidents$hour >= 18 & accidents$hour < 24
, "Evening"
, "Night")))
accidents_daytime <- accidents %>% group_by(BOROUGH, daytime) %>% summarise(nb = sum(NUMBER.OF.PERSONS.INJURED, na.rm = TRUE))
ggplot(accidents_daytime, aes(x = daytime, y = BOROUGH, fill = nb)) + geom_tile() + theme_minimal()Brooklyn is the most risky area during the whole day, Staten Island is quite safe during afternoon. Night is always less dangerous than the other time in a day.