The purpose of this Rmarkdown doc is to show the work I did for a GGWash post about ATE. If you notice errors or have questions, please email me at edwardpierrerodrigue [at] gmail [dot] com.
knitr::opts_chunk$set(echo = TRUE, warning=F, message=F)
knitr::opts_chunk$set(fig.width=15, fig.height=10)
# load the packages we need
library(readr)
library(dplyr)
library(ggplot2)
library(plotly)
library(leaflet)
library(sf)
library(scales)
# Load some of the data, for the month of May in 2024:
df <- read_csv("Moving_Violations_Issued_in_May_2024.csv")
# data from https://opendata.dc.gov/datasets/DCGIS::moving-violations-issued-in-may-2024/about
# Show a table detailing the counts of each type of speeding violation:
table(df$VIOLATION_PROCESS_DESC[df$VIOLATION_CODE %in% c(
"T119", "T120", "T121",
"T118",
"T122", "T123", "T276", "T277", "T278"
)])
##
## SPEED 11-15 MPH OVER THE SPEED LIMIT
## 208375
## SPEED 16-20 MPH OVER THE SPEED LIMIT
## 34770
## SPEED 21-25 MPH OVER THE SPEED LIMIT
## 7447
## SPEED 26-30 MPH OVER THE SPEED LIMIT
## 1
## SPEED UP TO TEN MPH OVER THE SPEED LIMIT
## 11
## SPEEDING IN CMV 11-14 MPH OVER SPEED LIMIT
## 1
speeding_violation_codes <- c("T119", "T120", "T121")
# There is also the "unreasonable speed" code ("T125") but it only appears a couple times, so omitting
# also omitting "T118" which is "speeding up to 10 mph over the limit" because there are almost no observations in that category.
# and omitting "T122", "T123", "T276", "T277", "T278" because there's not really any data under that code either.
# Subset the data to the speeding codes of interest for which we have complete data:
df <-
df %>%
filter(VIOLATION_CODE %in% speeding_violation_codes)
# Create various date variables:
weekend_days_in_may <- c(4, 5, 11, 12, 18, 19, 25, 26)
df$day <- as.numeric(substr(df$ISSUE_DATE, 9, 10))
df$hour <- round(df$ISSUE_TIME / 100, 0)
df$day_hour <- as.numeric(paste0(df$day, ".", df$hour))
df$weekend <- 0
df$weekend[df$day %in% weekend_days_in_may] <- 1
# Create a variable with a shorter violation code description:
df$short_desc <- ""
df$short_desc[df$VIOLATION_CODE == "T119"] <- "11-15 mph over limit"
df$short_desc[df$VIOLATION_CODE == "T120"] <- "16-20 mph over limit"
df$short_desc[df$VIOLATION_CODE == "T121"] <- "21-25 mph over limit"
Plotting total figures from the month of May 2024:
# There is a lot of speeding in DC. This graph doesn't even show all the people that are going 1-10 mph over the limit.
# Or people that are going 26+ mph over the limit. The city doesn't seem to publish this data.
df %>%
ggplot(., aes(short_desc)) +
geom_bar(fill="#109c37") +
theme_minimal() +
xlab("") +
theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust=1)) +
scale_y_continuous(labels=function(x) format(x, big.mark = ",", scientific = FALSE)) +
ggtitle("There is a lot of speeding in DC") +
geom_text(aes(label = format(after_stat(count), big.mark = ',') ), stat='count', vjust = -0.2)
# If you just sum up the "total paid" column, you get $8,531,557.
format(x = sum(df$TOTAL_PAID), big.mark = ',')
## [1] "8,531,557"
# the total amount issued was $34,966,950
format(x = sum(df$FINE_AMOUNT) + sum(df$PENALTY_1) + sum(df$PENALTY_2) + sum(df$PENALTY_3) + sum(df$PENALTY_4) + sum(df$PENALTY_5), big.mark = ',')
## [1] "34,966,950"
When do people speed?
# speeding peaks in general during mid-day and the afternoon commute, except for
# a small number of sociopaths who break the limit by 21+ mph are more prevalent very late at night.
df %>%
group_by(hour, short_desc) %>%
summarise(ticket_count = n()) %>%
plot_ly(data = ., x = ~hour, y = ~ticket_count,
color = ~short_desc,
size=3)
Looking just at U st:
## Zooming in on U St:
# Three cameras on the main U St corridor gave out an average of 39 tickets a day, or about 1.6 an hour
df %>%
filter(df$LOCATION %in% c("1400 BLK U ST NW E/B", "1000 BLK U ST NW E/B", "1500 BLK U ST NW E/B")) %>%
group_by(day) %>%
summarize(count = n()) %>%
summarize(mean_tickets_per_day = mean(count, na.rm=T))
## # A tibble: 1 × 1
## mean_tickets_per_day
## <dbl>
## 1 38.7
df %>%
filter(df$LOCATION %in% c("1400 BLK U ST NW E/B", "1000 BLK U ST NW E/B", "1500 BLK U ST NW E/B")) %>%
summarize(mean_tickets_per_hour = n() / (31*24))
## # A tibble: 1 × 1
## mean_tickets_per_hour
## <dbl>
## 1 1.61
## Three cameras on U St gave out an average 1.6 tickets per hour, although at times they
## gave out as many as 11 tickets an hour, or one every 5 and a half minutes.
## And that's not even counting people that are driving 30 in a 20. The dataset basically doesn't include
## information on people that were driving between 1 and 10 mph over the limit.
df %>%
filter(df$LOCATION %in% c("1400 BLK U ST NW E/B", "1000 BLK U ST NW E/B", "1500 BLK U ST NW E/B")) %>%
group_by(day_hour) %>%
summarize(count = n()) %>%
arrange(-count)
## # A tibble: 498 × 2
## day_hour count
## <dbl> <int>
## 1 2.12 11
## 2 26.1 11
## 3 8.7 10
## 4 8.1 9
## 5 19.8 8
## 6 20.2 8
## 7 1.1 7
## 8 6.14 7
## 9 25.2 7
## 10 28.8 7
## # ℹ 488 more rows
Looking at risk of dying while walking on some of these streets with cameras:
# impute the real speeding, assuming an exponential distribution with mean 2 mph over the minimum value in the bucket
df$imputed_speed_over <- 11
df$imputed_speed_over[df$short_desc== "16-20 mph over limit"] <- 16
df$imputed_speed_over[df$short_desc== "21-25 mph over limit"] <- 21
# look at a few streets/cameras in particular:
analyze_a_few_cameras <- function(locs_vec, speed_limit, caption, coeff, plot_title) {
cat("Camera locations:")
for (location in locs_vec) {cat(paste('\n\t', location))}
# create a variable for the speed the person was actually going, given the
# speed limit on that street
cam_data <-
df %>%
filter(LOCATION %in% locs_vec) %>%
mutate(imputed_speed = imputed_speed_over + speed_limit + runif(n=n(), 0, 4))
# group the data by the amount of speeding
if (speed_limit == 20) {
cam_data$mph_cat <-
cut(cam_data$imputed_speed,
c(30, 35, 40, 60), include.lowest = F,
labels = c('31-35', '36-40', '41-45'))
} else if (speed_limit == 25) {
cam_data$mph_cat <-
cut(cam_data$imputed_speed,
c(35, 40, 45, 60), include.lowest = F,
labels = c('36-40', '41-45', '46-50'))
}
# add on the probability of dying if hit by someone going that fast at each age.
# This is taken from this study by AAA: https://aaafoundation.org/impact-speed-pedestrians-risk-severe-injury-death/
cam_data$prob_death_70 <- NA
cam_data$prob_death_70[cam_data$mph_cat == '36-40'] <- 63
cam_data$prob_death_70[cam_data$mph_cat == '41-45'] <- 78
cam_data$prob_death_30 <- NA
cam_data$prob_death_30[cam_data$mph_cat == '36-40'] <- 28
cam_data$prob_death_30[cam_data$mph_cat == '41-45'] <- 42
# create labels for our bar chart
cam_data$label <- NA
if (speed_limit == 20) {
cam_data$prob_death_70[cam_data$mph_cat == '31-35'] <- 46
cam_data$prob_death_30[cam_data$mph_cat == '31-35'] <- 17
speed_limit_vec = c('31-35', '36-40', '41-45')
} else if (speed_limit == 25) {
cam_data$prob_death_70[cam_data$mph_cat == '46-50'] <- 86
cam_data$prob_death_30[cam_data$mph_cat == '46-50'] <- 62
speed_limit_vec =c('36-40', '41-45', '46-50')
}
for (mph_group in speed_limit_vec) {
cam_data$label[cam_data$mph_cat == mph_group] <- length(cam_data$label[cam_data$mph_cat == mph_group])
}
# create a graph of the data:
line_color= 'black'
barcolor= '#109c37'
font_size= 17
p <-
ggplot() +
geom_bar(data=cam_data, aes(x=mph_cat), fill=barcolor) +
geom_point(data=cam_data, aes(x=as.factor(mph_cat), y=prob_death_70*coeff), size=4, color=line_color) +
geom_line(data=cam_data, aes(x=mph_cat, y=prob_death_70*coeff, group=1), linewidth=2, color=line_color) +
scale_y_continuous(sec.axis = sec_axis(~.*1/coeff, name="Probability of death at this speed")) +
geom_point(data=cam_data, aes(x=as.factor(mph_cat), y=prob_death_30*coeff), size=4, color=line_color) +
geom_line(data=cam_data, aes(x=mph_cat, y=prob_death_30*coeff, group=1), linewidth=2, color=line_color) +
scale_y_continuous(sec.axis = sec_axis(~.*1/coeff, name="% chance death at this speed for 70- and 30-year olds")) +
theme_minimal() +
ylab('Number of speeding tickets issued in May 2024') +
theme(axis.title.y.left = element_text(colour = barcolor, size=font_size),
axis.title.y.right = element_text(colour = line_color, size=font_size),
plot.title = element_text(size = 18, face = "bold"),
plot.caption = element_text(hjust = 0, size=font_size),
axis.text.y.left = element_text(colour=barcolor, size=12),
axis.text.y.right = element_text(colour=line_color, size=12),
axis.text.x = element_text(size=12)) +
xlab("mph") +
ggtitle(plot_title) +
geom_text(data=cam_data,
aes(x=mph_cat, y=label, label=label,),
color=barcolor,
vjust = -0.2) +
geom_text(data=cam_data,
aes(x=mph_cat, y=prob_death_70*coeff, label=paste0(prob_death_70, "%"),),
color=line_color,
vjust = -1) +
geom_text(data=cam_data,
aes(x=mph_cat, y=prob_death_30*coeff, label=paste0(prob_death_30, "%"),),
color=line_color,
vjust = -1) +
labs(caption = caption)
if (speed_limit == 20) {
p <- p +
annotate("text", x = 3, y = 87*coeff,
label = "A 70-year-old hit by someone driving\n41-45 mph has a 78% chance of dying",
color=line_color) +
annotate("text", x = 1, y = 28*coeff,
label = "A 30-year-old hit by someone\ndriving 31-35 mph has a roughly\n one-fifth chance of dying",
color=line_color)
} else if (speed_limit == 25) {
p <- p +
annotate("text", x = 3, y = 96*coeff,
label = "A 70-year-old hit by someone driving\n46-50 mph has a 86% chance of dying",
color=line_color) +
annotate("text", x = 1, y = 40*coeff,
label = "A 30-year-old hit by someone\ndriving 36-40 mph has a roughly\n 28% chance of dying",
color=line_color)
}
cat(paste('\nThis many total tickets in May:', nrow(cam_data)))
return(p)
}
analyze_a_few_cameras(locs_vec = c('1400 BLK U ST NW E/B', '1000 BLK U ST NW E/B'),
speed_limit=25,
coeff=6.4,
caption = "",
plot_title='One example: Speeding common, lethal on U Street.\nStreet design is the culprit.')
## Camera locations:
## 1400 BLK U ST NW E/B
## 1000 BLK U ST NW E/B
## This many total tickets in May: 759
# analyze_a_few_cameras(locs_vec = c('1500 BLK 7TH ST NW S/B'),
# speed_limit=25,
# coeff=1,
# caption=paste0("A person on a bicycle was killed nearby in 2022, at 7th & Rhode Island, according to DC's Vision Zero dashboard.\n",
# "At least 3 people have sustained 'major injuries' on this same couple-block stretch since 2017.\n",
# "Speeding data sourced from Open Data DC: https://opendata.dc.gov/datasets/725d52f45e11484c89974ae87910ad62_4/explore \n",
# "Crash data sourced from DC's Vision Zero dashboard. Analysis by the author."),
# plot_title='Speeding common, lethal on 7th near Kennedy Rec Center. Street design is the culprit.')
Plotting camera locations
# https://ddot.dc.gov/release/ddot-deploying-automated-traffic-enforcement-cameras-new-locations-december
cameras_installed_at_end_of_2023 <-
c('200 BLK RHODE ISLAND AVE NW E/B',
'200 BLK RHODE ISLAND AVE NW W/B',
'500 BLK MASSACHUSETTS AVE NW E/B',
'3900 BLK GEORGIA AVE NW S/B',
'5400 BLK GEORGIA AVE NW N/B',
'1300 BLK RHODE ISLAND AVE NE W/B',
'1300 BLK RHODE ISLAND AVE NE E/B',
'1500 BLK RHODE ISLAND AVE NE NE/B',
'3100 BLK S DAKOTA AVE NE SE/B',
'5100 BLK S DAKOTA AVE NE SE/B',
'5400 BLK S DAKOTA AVE NE NW/B',
'1400 BLK PENNSYLVANIA AVE SE SE/B',
'1200 BLK BRANCH AVE SE S/B',
'1200 BLK BRANCH AVE SE N/B',
'2500 BLK PENNSYLVANIA AVE SE NW/B',
'3300 BLK MINNESOTA AVE SE N/B',
'3300 BLK MINNESOTA AVE SE S/B',
'3500 BLK MINNESOTA AVE SE N/B',
'3700 BLK MINNESOTA AVE NE NE/B',
'3900 BLK MINNESOTA AVE NE SW/B',
'2500 BLK SHERIDAN ROAD SE N/B',
'700 BLK N CAPITOL ST NE N/B',
'2500 BLK SHERIDAN ROAD SE S/B',
'4200 BLK MLK JR AVE SW S/B',
'1900 BLK INDEPENDENCE AVE SE E/B',
'2700 BLK NAYLOR ROAD SE S/B',
'4300 BLK TEXAS AVE SE S/B',
'700 BLK SOUTHERN AVE SE SW/B')
to_plot <-
df %>%
filter(is.na(LONGITUDE)==F) %>%
mutate(lat_lon_id = paste0(LATITUDE, "_", LONGITUDE)) %>%
group_by(lat_lon_id) %>%
summarise(ticket_count = n(),
LOCATION = first(LOCATION),
LATITUDE = mean(LATITUDE, na.rm = T),
LONGITUDE = mean(LONGITUDE, na.rm = T)) %>%
filter(ticket_count > 10) %>%
mutate(custom_label = paste0(LOCATION, "<br/>",
"Total tickets: ", format(ticket_count, big.mark=",")))
to_plot$color <- 'blue'
to_plot$color[to_plot$LOCATION %in% cameras_installed_at_end_of_2023] <- 'orange'
# All of the new cameras were installed east of 16th St NW, and the density of
# cameras is higher in the eastern half of the District.
to_plot %>%
leaflet(options = leafletOptions(zoomControl = TRUE,
zoomSnap = 0.25,
zoomDelta = 1)) %>%
addProviderTiles(providers$CartoDB.Positron) %>% # Add default OpenStreetMap map tiles
addCircleMarkers(lng=~LONGITUDE, lat=~LATITUDE,
label=~lapply(custom_label, htmltools::HTML),
radius=~ticket_count^.38,
color = ~color,
opacity=.8)
Handheld radar gun data:
## handheld radar gun data
hhrd <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vRRByUDHew_5iyaAWPt9JsmGnBfXqQmc1dulcnk6AeDzo37T80ufmMeIa3NNaB88ds2aTMsJ04eT8Y7/pub?gid=0&single=true&output=csv")
# total number of hours I was out there:
hhrd$start_time <- as.POSIXct(hhrd$data_collection_start_time, tz = 'EST', format='%H:%M %p')
hhrd$end_time <- as.POSIXct(hhrd$data_collection_end_time, tz = 'EST', format='%H:%M %p')
hhrd$hours_elapsed <- difftime(hhrd$end_time, hhrd$start_time, units = "hours")
hhrd %>%
mutate(id = paste(date, data_collection_start_time)) %>%
group_by(id) %>%
mutate(session_index = row_number()) %>%
filter(session_index == 1) %>%
ungroup() %>%
summarise(total_hours = sum(hours_elapsed, na.rm=T))
## # A tibble: 1 × 1
## total_hours
## <drtn>
## 1 3.85 hours
quantile(x = hhrd$speed_mph[hhrd$speed_limit_mph == 25], probs = c(.93, .99))
## 93% 99%
## 30.00 34.16
speed_vals <- 13:48
p_death_70 <- c(6.361, 7.353, 8.344, 9.336, 10.328, 11.319, 12.311, 13.303, 15.224,
17.176, 19.127, 21.079, 23.045, 25.929, 28.812, 31.696, 34.58,
37.464, 40.63, 43.86, 47.09, 50.325, 53.608, 56.89, 60.173, 63.455,
66.728, 69.569, 72.411, 75.252, 78.093, 80.417, 82.339, 84.262, 86.185, 88.107)
max_speed_index <- match(max(hhrd$speed_mph), speed_vals)
speed_vals <- speed_vals[1:max_speed_index]
p_death_70 <- p_death_70[1:max_speed_index]
ay <- list(
tickfont = list(size=11.7),
titlefont=list(size=14.6),
overlaying = "y",
nticks = 5,
side = "right",
title = "Second y axis"
)
coeff = .25
font_size = 14
ggplot() +
geom_histogram(data=hhrd,
aes(x=speed_mph, fill=as.factor(speed_limit_mph)),
color="#e9ecef", alpha=0.6, position = 'identity', bins = 29) +
geom_line(aes(x=speed_vals, y=p_death_70*coeff)) +
geom_point(aes(x=speed_vals, y=p_death_70*coeff)) +
geom_vline(aes(xintercept=20,), color='orange') +
geom_vline(aes(xintercept=25), color='skyblue') +
scale_y_continuous(breaks = seq(0, 100, by = 5),
sec.axis = sec_axis(~.*1/coeff, name="% chance death at this speed for 70-year-old person walking")) +
scale_fill_manual(values=c("orange", "skyblue")) +
scale_x_continuous(breaks=seq(from=10, to=40, by=2)) +
theme_minimal() +
labs(fill="Speed limit\non the street") +
ylab('Number of radar readings at this speed') +
xlab('Measured vehicle speed in mph') +
ggtitle('Hand-held radar gun data from Lanier Heights') +
theme(axis.title.y.left = element_text(size=font_size),
axis.title.y.right = element_text(size=font_size),
plot.title = element_text(size = 18, face = "bold"),
axis.text.y.left = element_text(size=12),
axis.text.y.right = element_text(size=12),
axis.text.x = element_text(size=12)) +
annotate(
'curve',
x = 30,
y = 20,
yend = 9.5,
xend = 29.8,
linewidth = 1,
curvature = 0.5,
arrow = arrow(length = unit(0.5, 'cm'))
) +
annotate("text", x = 35, y = 21,
size=3.5,
label = "Driver going 30 in front of\nthe Park Crest Apts (speed limit: 20).\nChance of killing 70-year-old: ~40%.") +
annotate(
'curve',
x = 38,
y = 5,
yend = 1.5,
xend = 39,
linewidth = 1,
curvature = -0.35,
arrow = arrow(length = unit(0.5, 'cm'))
) +
annotate("text", x = 36, y = 6,
size=3.5,
label = "Driver going 39 on\nAdams Mill, by the zoo.\nChance of killing 70-year-old: 66%")
Mapping cameras to wards:
# map cameras to wards
# data from here: https://opendata.dc.gov/datasets/DCGIS::wards-from-2022/about
geo <- sf::st_read( "Wards_from_2022.geojson", quiet = TRUE)
# plot(sf::st_geometry(geo))
geo <- geo %>% select(WARD, NAME, SHAPEAREA, geometry)
geo$area <- sf::st_area(geo)
sf::st_crs(geo)
## Coordinate Reference System:
## User input: WGS 84
## wkt:
## GEOGCRS["WGS 84",
## DATUM["World Geodetic System 1984",
## ELLIPSOID["WGS 84",6378137,298.257223563,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## CS[ellipsoidal,2],
## AXIS["geodetic latitude (Lat)",north,
## ORDER[1],
## ANGLEUNIT["degree",0.0174532925199433]],
## AXIS["geodetic longitude (Lon)",east,
## ORDER[2],
## ANGLEUNIT["degree",0.0174532925199433]],
## ID["EPSG",4326]]
spatial_points <- st_as_sf(x = df[is.na(df$LATITUDE)==F,],
coords = c("LONGITUDE", "LATITUDE"),
crs = sf::st_crs(geo))
joined_data <-
geo %>%
st_join(spatial_points,
join = st_intersects,
left = TRUE)
joined_data <- sf::st_drop_geometry(joined_data)
to_plot <-
joined_data %>%
group_by(WARD) %>%
summarise(ticket_count = n(),
fine_amount = sum(FINE_AMOUNT, na.rm=T),
area = max(area, na.rm=T)) %>%
mutate(tickets_per_area = ticket_count / as.numeric(area))
# household income data from here: https://opendata.dc.gov/datasets/43e7da983bd24f8cbaceb6e654f0da3d/explore?showTable=true
to_plot$mean_hh_income <- NA
to_plot$mean_hh_income[to_plot$WARD == 1] <- 165893
to_plot$mean_hh_income[to_plot$WARD == 2] <- 181167
to_plot$mean_hh_income[to_plot$WARD == 3] <- 223149
to_plot$mean_hh_income[to_plot$WARD == 4] <- 163531
to_plot$mean_hh_income[to_plot$WARD == 5] <- 123490
to_plot$mean_hh_income[to_plot$WARD == 6] <- 169118
to_plot$mean_hh_income[to_plot$WARD == 7] <- 89172
to_plot$mean_hh_income[to_plot$WARD == 8] <- 77639
to_plot$fines_div_income <- to_plot$fine_amount / to_plot$mean_hh_income
ggplot(to_plot) +
geom_bar(mapping = aes(x=WARD, y=tickets_per_area), stat='identity')
a <- ggplot(to_plot) +
geom_bar(mapping = aes(x=WARD, y=mean_hh_income), stat='identity', fill='#109c37') +
theme_minimal() +
ggtitle("Average household income ($)") +
xlab("Ward") +
ylab("") +
scale_x_continuous(breaks = 1:8) +
scale_y_continuous(labels = scales::comma)
b <- ggplot(to_plot) +
geom_bar(mapping = aes(x=WARD, y=fine_amount), stat='identity', fill='#109c37') +
theme_minimal() +
ggtitle("Speeding camera fines in May 2024") +
xlab("Ward") +
ylab("") +
scale_x_continuous(breaks = 1:8) +
scale_y_continuous(labels = scales::comma)
ggpubr::ggarrange(a, b,
labels = c("A", "B"),
ncol = 2, nrow = 1)
ggplot(to_plot) +
geom_bar(mapping = aes(x=WARD, y=fines_div_income), stat='identity', fill="#109c37") +
theme_minimal() +
ggtitle("Speeding camera fines in May 2024 divided by average household income") +
xlab("Ward") +
ylab("Fines divided by average household income") +
scale_x_continuous(breaks = 1:8)
# Major injuries by ward between 08/12/2022 and 08/12/2024:
# ward 1: 66
# ward 2: 103
# ward 3: 32
# ward 4: 49
# ward 5: 114
# ward 6: 99
# ward 7: 122
# ward 8: 128
Plotting District-wide trends in speeding camera data
###########################################################################
## SPEEDING CAMERA TREND ANALYSIS
###########################################################################
myfiles <- list.files("./speed_camera_data")
## Uncomment this to load and mutate the data from a scratch
# for (ii in 1:length(myfiles)) {
# temp <- read.csv(file = file.path("./speed_camera_data", myfiles[ii]))
#
#
# temp$year <- as.numeric(substr(temp$ISSUE_DATE, start = 1, stop = 4))
# temp$month <- as.numeric(substr(x = temp$ISSUE_DATE, start = 6, stop = 7))
#
# year <- unique(temp$year)
# month <- unique(temp$month)
#
# pct_paid_anything <- sum(temp$TOTAL_PAID > 0, na.rm=T) / sum(temp$FINE_AMOUNT > 0, na.rm=T)
#
# temp2 <-
# temp %>%
# filter(VIOLATION_CODE %in% speeding_violation_codes) %>%
# filter(ISSUE_TIME >= 2130) %>% # alternate analysis where we only look at evening crashes
# group_by(year, month, LOCATION) %>%
# summarise(num_tix = n()) %>%
# filter(num_tix > 20) # alternate analysis where we only look at evening crashes
# # filter(num_tix > 50)
#
# temp <-
# temp %>%
# filter(VIOLATION_CODE %in% speeding_violation_codes) %>%
# group_by(year, month, LOCATION) %>%
# summarise(num_tix = n(),
# amount_paid = sum(TOTAL_PAID, na.rm=T)) %>%
# ungroup() %>%
# filter(num_tix > 20) %>% # alternate analysis where we only look at evening crashes
# # filter(num_tix > 50) %>%
# summarise(total_num_tix = sum(num_tix),
# total_amtt_paid = sum(amount_paid),
# avg_num_tix = mean(num_tix),
# avg_amt_paid = mean(amount_paid),
# num_unique_locs = length(unique(LOCATION)))
#
# temp$year <- year
# temp$month <- month
# temp$pct_paid_anything <- pct_paid_anything
#
# if (ii == 1) {
# df <- temp
# df2 <- temp2
# } else {
# df <- dplyr::bind_rows(df, temp)
# df2 <- dplyr::bind_rows(df2, temp2)
# }
#
# cat(paste0(ii, " of ", length(myfiles)))
# gc()
# }
#
#
# df$month_string <- as.character(df$month)
# df$month_string[nchar(x = df$month_string) == 1] <- paste0("0", df$month_string[nchar(x = df$month_string) == 1])
# df$date = as.Date(paste(df$year, df$month_string, "01", sep = "-"))
#
#
# df2$month_string <- as.character(df2$month)
# df2$month_string[nchar(x = df2$month_string) == 1] <- paste0("0", df2$month_string[nchar(x = df2$month_string) == 1])
# df2$date = as.Date(paste(df2$year, df2$month_string, "01", sep = "-"))
# write.csv(x = df,
# file = "evening speeding.csv",
# row.names = F)
#
# write.csv(x = df2,
# file = "evening speeding detailed.csv",
# row.names = F)
# write.csv(x = df,
# file = "speeding.csv",
# row.names = F)
#
# write.csv(x = df2,
# file = "speeding detailed.csv",
# row.names = F)
df <- read.csv("speeding.csv")
df$date = as.Date(df$date)
# most recent month's data (June 2024) is not fully in yet, so we'll omit:
df <- df[df$date != as.Date("2024-06-01"),]
df2 <- read.csv("speeding detailed.csv")
df2$date = as.Date(df2$date)
# most recent month's data (June 2024) is not fully in yet, so we'll omit:
df2 <- df2[df2$date != as.Date("2024-06-01"),]
df_evening <- read.csv("evening speeding.csv")
df_evening$date = as.Date(df_evening$date)
# most recent month's data (June 2024) is not fully in yet, so we'll omit:
df_evening <- df_evening[df_evening$date != as.Date("2024-06-01"),]
df2_evening <- read.csv("evening speeding detailed.csv")
df2_evening$date = as.Date(df2_evening$date)
# most recent month's data (June 2024) is not fully in yet, so we'll omit:
df2_evening <- df2_evening[df2_evening$date != as.Date("2024-06-01"),]
a <- ggplot(df, aes(x=date, y=total_num_tix)) +
geom_line() +
scale_y_continuous(labels=scales::comma_format(), limits = c(0, max(df$total_num_tix))) +
theme_minimal() +
ylab("") +
xlab("") +
ggtitle("Total number of tickets per month")
b <- ggplot(df, aes(x=date, y=total_amtt_paid)) +
geom_line() +
scale_y_continuous(labels = scales::dollar_format(prefix="$"), limits = c(0, max(df$total_amtt_paid))) +
theme_minimal() +
ylab("") +
xlab("") +
ggtitle("Total amount paid by month")
c <- ggplot(df, aes(x=date, y=avg_num_tix)) +
geom_line() +
scale_y_continuous(labels=scales::comma_format(), limits = c(0, max(df$avg_num_tix))) +
theme_minimal() +
ylab("") +
xlab("") +
ggtitle("Average number of tickets per location by month")
d <- ggplot(df, aes(x=date, y=avg_amt_paid)) +
geom_line() +
scale_y_continuous(labels=scales::comma_format(), limits = c(0, max(df$avg_amt_paid))) +
theme_minimal() +
ylab("") +
xlab("") +
ggtitle("Average amount paid per location by month")
ggplot(df, aes(x=date, y=num_unique_locs)) +
geom_line() +
scale_y_continuous(labels=scales::comma_format(), limits = c(0, max(df$num_unique_locs))) +
theme_minimal() +
ylab("") +
xlab("") +
ggtitle("Number of locations per month")
ggpubr::ggarrange(a, b, c, d,
labels = c("A", "B", "C", "D"),
ncol = 2, nrow = 2)
c + geom_smooth(method='lm')
ggplot(df, aes(x=date, y=pct_paid_anything)) +
geom_line() +
scale_y_continuous(labels=scales::percent_format(), limits = c(0, 1), breaks=seq(0, 1, .1)) +
annotate(
'curve',
x = as.Date("2016-01-01"),
y = mean(df$pct_paid_anything[df$date <= as.Date("2020-04-01")]),
yend = mean(df$pct_paid_anything[df$date <= as.Date("2020-04-01")]),
xend = as.Date("2020-04-01"),
linewidth = 1,
curvature = 0,
color='darkgrey'
) +
annotate(
'curve',
x = as.Date("2020-04-01"),
y = mean(df$pct_paid_anything[df$date >= as.Date("2020-04-01")]),
yend = mean(df$pct_paid_anything[df$date >= as.Date("2020-04-01")]),
xend = as.Date("2024-05-01"),
linewidth = 1,
curvature = 0,
color='darkgrey'
) +
theme_minimal() +
ylab("") +
xlab("") +
ggtitle("Percent of tickets that listed a fine where the driver paid anything")
Do cameras issue fewer tickets a year after they first appear in the data set?
# do locations issue fewer tickets a year later after they first appear in the data set?
calc_change <- function(input_data, first_month, max_n_years,
omit_cameras_present_in_first_month=T,
year_option = c("omit_2020", "omit_data_after_2019", "omit_earlier_than_2022")) {
suppressWarnings(
suppressMessages(
temp3 <-
input_data %>%
group_by(LOCATION) %>%
arrange(LOCATION, year, month) %>%
mutate(row_index = row_number()) %>%
mutate(key_months = row_index %in% first_month) %>%
mutate(key_month_literal = ifelse(test=key_months, yes=month, no=NA)) %>%
mutate(
key_month_literal_min = min(key_month_literal, na.rm=T),
key_month_literal_max = max(key_month_literal, na.rm=T)
) %>%
ungroup() %>%
filter(month == key_month_literal_max | month == key_month_literal_min) %>%
group_by(LOCATION, year) %>%
summarise(first_month = min(month, na.rm=T),
date = min(date),
month = mean(month, na.rm = T),
num_tix = sum(num_tix, na.rm=T),
) %>%
ungroup() %>%
group_by(LOCATION) %>%
arrange(LOCATION, year, month) %>%
mutate(row_index_new = row_number(),
n_obs = n()) %>%
filter(n_obs > 1) %>%
filter(row_index_new <= max_n_years) %>%
mutate(min_year = min(year),
max_year = max(year)) %>%
mutate(min_year_val = num_tix[which.min(year)],
max_year_val = num_tix[which.max(year)]
)
)
)
if (year_option == "omit_2020") {
temp2 <- temp3 %>% filter(min_year != 2020 & max_year != 2020)
}
if (year_option == "omit_data_after_2019") {
temp2 <- temp3 %>% filter((min_year < 2020) & (max_year < 2020))
}
if (year_option == "omit_earlier_than_2022") {
temp2 <- temp3 %>% filter(min_year > 2021 & max_year > 2021)
}
if (year_option == "") {
temp2 <- temp3
}
if (omit_cameras_present_in_first_month) {
temp2 <-
temp2 %>%
ungroup() %>%
mutate(min_year_in_dataset = min(date)) %>%
group_by(LOCATION) %>%
mutate(min_date = min(date)) %>%
filter(min_date != min_year_in_dataset)
}
temp1 <-
temp2 %>%
group_by(LOCATION) %>%
mutate(n_obs = n()) %>%
filter(n_obs > 1) %>%
mutate(min_year = min(year),
max_year = max(year)) %>%
mutate(min_year_val = num_tix[which.min(year)],
max_year_val = num_tix[which.max(year)]
) %>%
mutate(diff = max_year_val - min_year_val) %>%
mutate(percent_change = diff / min_year_val * 100)
temp <-
temp1 %>%
filter(row_index_new == 1)
print(summary(temp$diff))
cat(paste('Total change:', sum(temp$diff)))
cat(paste('\nFirst year value:', sum(temp$min_year_val)))
cat(paste('\nLast year value:', sum(temp$max_year_val)))
cat(paste("\n% change:", round(100*sum(temp$diff) / sum(temp$min_year_val)) ))
cat(paste("\nN=", nrow(temp)))
cat('\n\n')
return(temp)
}
rv <- calc_change(df2, first_month = 3, max_n_years = 2, year_option = "",
omit_cameras_present_in_first_month = T)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -11921.0 -439.2 -55.5 -360.2 143.0 2562.0
## Total change: -57637
## First year value: 223527
## Last year value: 165890
## % change: -26
## N= 160
rv <- calc_change(df2, first_month = 3, max_n_years = 2, year_option = "omit_2020",
omit_cameras_present_in_first_month = T)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -11921.0 -536.0 -60.0 -457.3 132.0 2562.0
## Total change: -38869
## First year value: 116536
## Last year value: 77667
## % change: -33
## N= 85
rv <- calc_change(df2, first_month = 3, max_n_years = 2, year_option = "omit_data_after_2019",
omit_cameras_present_in_first_month = T)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -7614.00 -68.00 77.00 -25.51 350.00 2562.00
## Total change: -1046
## First year value: 42974
## Last year value: 41928
## % change: -2
## N= 41
rv <- calc_change(df2, first_month = 3, max_n_years = 2, year_option = "omit_earlier_than_2022",
omit_cameras_present_in_first_month = T)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -11921.0 -938.0 -341.0 -917.0 -98.5 497.0
## Total change: -29345
## First year value: 57892
## Last year value: 28547
## % change: -51
## N= 32
ggplot(rv, aes(x = percent_change, y = LOCATION)) +
geom_segment(aes(yend = LOCATION), xend = 0, colour = "grey50") +
geom_point(size = 3) +
geom_vline(aes(xintercept=0))
rv <- calc_change(df2, first_month = 2, max_n_years = 2, year_option = "",
omit_cameras_present_in_first_month = T)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -10889.0 -708.0 -132.0 -524.4 90.0 1727.0
## Total change: -84433
## First year value: 248655
## Last year value: 164222
## % change: -34
## N= 161
rv <- calc_change(df2, first_month = 2, max_n_years = 2, year_option = "omit_2020",
omit_cameras_present_in_first_month = T)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -9143.0 -813.8 -46.5 -620.8 123.2 1579.0
## Total change: -53386
## First year value: 128672
## Last year value: 75286
## % change: -41
## N= 86
rv <- calc_change(df2, first_month = 2, max_n_years = 2, year_option = "omit_data_after_2019",
omit_cameras_present_in_first_month = T)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -8426.0 -121.2 19.5 -200.6 241.0 1579.0
## Total change: -8427
## First year value: 44227
## Last year value: 35800
## % change: -19
## N= 42
rv <- calc_change(df2, first_month = 2, max_n_years = 2, year_option = "omit_earlier_than_2022",
omit_cameras_present_in_first_month = T)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -9143.0 -959.2 -521.0 -1073.2 -60.5 1048.0
## Total change: -34343
## First year value: 66619
## Last year value: 32276
## % change: -52
## N= 32
rv <- calc_change(df2_evening, first_month = 3, max_n_years = 2, year_option = "",
omit_cameras_present_in_first_month = T)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -557.00 -26.75 -2.00 -14.00 6.00 227.00
## Total change: -364
## First year value: 2874
## Last year value: 2510
## % change: -13
## N= 26
Are streets wider in wards 7 and 8?
blocks <- sf::st_read( "Roadway_Block.geojson", quiet = TRUE)
# plot(sf::st_geometry(blocks))
# View(sf::st_drop_geometry(blocks))
to_plot <-
sf::st_drop_geometry(blocks) %>%
group_by(WARD_ID, SPEEDLIMITS_OB) %>%
summarise(mean_road_width = mean(TOTALTRAVELLANEWIDTH, na.rm=T)) %>%
filter(is.na(SPEEDLIMITS_OB)==F & SPEEDLIMITS_OB > 15 & WARD_ID %in% c(3,7,8) & SPEEDLIMITS_OB < 35)
ggplot(data=to_plot,
mapping=aes(x=as.factor(SPEEDLIMITS_OB), y=mean_road_width, fill=as.factor(WARD_ID))) +
geom_col(position='dodge') +
geom_text(
aes(label = round(to_plot$mean_road_width, 0)),
colour = "black", size = 3,
vjust = -1.5, position = position_dodge(.9)
) +
theme_minimal() +
xlab("Speed limit on the street") +
ylab('Average total width of the "travel lanes" in feet') +
ggtitle('Roads are wider in Wards 7 & 8, which leads drivers to go faster') +
scale_fill_discrete(name="Ward")