ATE analysis

The purpose of this Rmarkdown doc is to show the work I did for a GGWash post about ATE. If you notice errors or have questions, please email me at edwardpierrerodrigue [at] gmail [dot] com.

knitr::opts_chunk$set(echo = TRUE, warning=F, message=F)
knitr::opts_chunk$set(fig.width=15, fig.height=10)

# load the packages we need
library(readr)
library(dplyr)
library(ggplot2)
library(plotly)
library(leaflet)
library(sf)
library(scales)


# Load some of the data, for the month of May in 2024:
df <- read_csv("Moving_Violations_Issued_in_May_2024.csv")

# data from https://opendata.dc.gov/datasets/DCGIS::moving-violations-issued-in-may-2024/about

# Show a table detailing the counts of each type of speeding violation:
table(df$VIOLATION_PROCESS_DESC[df$VIOLATION_CODE %in% c(
  "T119", "T120", "T121",
  "T118",
  "T122", "T123", "T276", "T277", "T278"
)])

## 
##       SPEED 11-15 MPH OVER THE SPEED LIMIT 
##                                     208375 
##       SPEED 16-20 MPH OVER THE SPEED LIMIT 
##                                      34770 
##       SPEED 21-25 MPH OVER THE SPEED LIMIT 
##                                       7447 
##       SPEED 26-30 MPH OVER THE SPEED LIMIT 
##                                          1 
##   SPEED UP TO TEN MPH OVER THE SPEED LIMIT 
##                                         11 
## SPEEDING IN CMV 11-14 MPH OVER SPEED LIMIT 
##                                          1

speeding_violation_codes <- c("T119", "T120", "T121") 
# There is also the "unreasonable speed" code ("T125") but it only appears a couple times, so omitting
# also omitting "T118" which is "speeding up to 10 mph over the limit" because there are almost no observations in that category.
# and omitting "T122", "T123", "T276", "T277", "T278" because there's not really any data under that code either.

# Subset the data to the speeding codes of interest for which we have complete data:
df <-
  df %>%
  filter(VIOLATION_CODE %in% speeding_violation_codes)

# Create various date variables:
weekend_days_in_may <- c(4, 5, 11, 12, 18, 19, 25, 26)
df$day <- as.numeric(substr(df$ISSUE_DATE, 9, 10))
df$hour <- round(df$ISSUE_TIME / 100, 0)
df$day_hour <- as.numeric(paste0(df$day, ".", df$hour))
df$weekend <- 0
df$weekend[df$day %in% weekend_days_in_may] <- 1

# Create a variable with a shorter violation code description:
df$short_desc <- ""
df$short_desc[df$VIOLATION_CODE == "T119"] <- "11-15 mph over limit"
df$short_desc[df$VIOLATION_CODE == "T120"] <- "16-20 mph over limit"
df$short_desc[df$VIOLATION_CODE == "T121"] <- "21-25 mph over limit"

Plotting total figures from the month of May 2024:

# There is a lot of speeding in DC. This graph doesn't even show all the people that are going 1-10 mph over the limit.
# Or people that are going 26+ mph over the limit. The city doesn't seem to publish this data.
df %>%
  ggplot(., aes(short_desc)) + 
  geom_bar(fill="#109c37") + 
  theme_minimal() +
  xlab("") +
  theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust=1)) +
  scale_y_continuous(labels=function(x) format(x, big.mark = ",", scientific = FALSE)) +
  ggtitle("There is a lot of speeding in DC") +
  geom_text(aes(label = format(after_stat(count), big.mark = ',') ), stat='count', vjust = -0.2)

# If you just sum up the "total paid" column, you get $8,531,557. 
format(x = sum(df$TOTAL_PAID), big.mark = ',')

## [1] "8,531,557"

# the total amount issued was $34,966,950
format(x = sum(df$FINE_AMOUNT) + sum(df$PENALTY_1) + sum(df$PENALTY_2) + sum(df$PENALTY_3) + sum(df$PENALTY_4) + sum(df$PENALTY_5), big.mark = ',')

## [1] "34,966,950"

When do people speed?

# speeding peaks in general during mid-day and the afternoon commute, except for
# a small number of sociopaths who break the limit by 21+ mph are more prevalent very late at night.
df %>%
  group_by(hour, short_desc) %>%
  summarise(ticket_count = n()) %>%
  plot_ly(data = ., x = ~hour, y = ~ticket_count, 
          color = ~short_desc, 
          size=3)

Looking just at U st:

## Zooming in on U St:
# Three cameras on the main U St corridor gave out an average of 39 tickets a day, or about 1.6 an hour
df %>%
  filter(df$LOCATION %in% c("1400 BLK U ST NW E/B", "1000 BLK U ST NW E/B", "1500 BLK U ST NW E/B")) %>%
  group_by(day) %>%
  summarize(count = n()) %>%
  summarize(mean_tickets_per_day = mean(count, na.rm=T))

## # A tibble: 1 × 1
##   mean_tickets_per_day
##                  <dbl>
## 1                 38.7

df %>%
  filter(df$LOCATION %in% c("1400 BLK U ST NW E/B", "1000 BLK U ST NW E/B", "1500 BLK U ST NW E/B")) %>%
  summarize(mean_tickets_per_hour = n() / (31*24))

## # A tibble: 1 × 1
##   mean_tickets_per_hour
##                   <dbl>
## 1                  1.61

## Three cameras on U St gave out an average 1.6 tickets per hour, although at times they
## gave out as many as 11 tickets an hour, or one every 5 and a half minutes.
## And that's not even counting people that are driving 30 in a 20. The dataset basically doesn't include
## information on people that were driving between 1 and 10 mph over the limit.
df %>%
  filter(df$LOCATION %in% c("1400 BLK U ST NW E/B", "1000 BLK U ST NW E/B", "1500 BLK U ST NW E/B")) %>%
  group_by(day_hour) %>%
  summarize(count = n()) %>%
  arrange(-count)

## # A tibble: 498 × 2
##    day_hour count
##       <dbl> <int>
##  1     2.12    11
##  2    26.1     11
##  3     8.7     10
##  4     8.1      9
##  5    19.8      8
##  6    20.2      8
##  7     1.1      7
##  8     6.14     7
##  9    25.2      7
## 10    28.8      7
## # ℹ 488 more rows

Looking at risk of dying while walking on some of these streets with cameras:

# impute the real speeding, assuming an exponential distribution with mean 2 mph over the minimum value in the bucket
df$imputed_speed_over <- 11
df$imputed_speed_over[df$short_desc== "16-20 mph over limit"] <- 16
df$imputed_speed_over[df$short_desc== "21-25 mph over limit"] <- 21


# look at a few streets/cameras in particular:
analyze_a_few_cameras <- function(locs_vec, speed_limit, caption, coeff, plot_title) {
  cat("Camera locations:")
  for (location in locs_vec) {cat(paste('\n\t', location))}
  # create a variable for the speed the person was actually going, given the 
  # speed limit on that street
  cam_data <-
    df %>%
    filter(LOCATION %in% locs_vec) %>%
    mutate(imputed_speed = imputed_speed_over + speed_limit + runif(n=n(), 0, 4)) 
  
  # group the data by the amount of speeding
  if (speed_limit == 20) {
    cam_data$mph_cat <- 
    cut(cam_data$imputed_speed, 
        c(30, 35, 40, 60), include.lowest = F,
        labels = c('31-35', '36-40', '41-45'))
  } else if (speed_limit == 25) {
    cam_data$mph_cat <- 
    cut(cam_data$imputed_speed, 
        c(35, 40, 45, 60), include.lowest = F,
        labels = c('36-40', '41-45', '46-50'))
  }
  
  # add on the probability of dying if hit by someone going that fast at each age.
  # This is taken from this study by AAA: https://aaafoundation.org/impact-speed-pedestrians-risk-severe-injury-death/
  cam_data$prob_death_70 <- NA
  cam_data$prob_death_70[cam_data$mph_cat == '36-40'] <- 63
  cam_data$prob_death_70[cam_data$mph_cat == '41-45'] <- 78
  
  cam_data$prob_death_30 <- NA
  cam_data$prob_death_30[cam_data$mph_cat == '36-40'] <- 28
  cam_data$prob_death_30[cam_data$mph_cat == '41-45'] <- 42
  
  # create labels for our bar chart
  cam_data$label <- NA
  if (speed_limit == 20) {
    cam_data$prob_death_70[cam_data$mph_cat == '31-35'] <- 46
    cam_data$prob_death_30[cam_data$mph_cat == '31-35'] <- 17
    speed_limit_vec = c('31-35', '36-40', '41-45')
  } else if (speed_limit == 25) {
    cam_data$prob_death_70[cam_data$mph_cat == '46-50'] <- 86
    cam_data$prob_death_30[cam_data$mph_cat == '46-50'] <- 62
    speed_limit_vec =c('36-40', '41-45', '46-50')
  }
  for (mph_group in speed_limit_vec) {
    cam_data$label[cam_data$mph_cat == mph_group] <- length(cam_data$label[cam_data$mph_cat == mph_group])
  }
  
  
  # create a graph of the data:
  line_color= 'black'
  barcolor= '#109c37'
  font_size= 17
  
  p <-
    ggplot() + 
    geom_bar(data=cam_data, aes(x=mph_cat), fill=barcolor) +
    geom_point(data=cam_data, aes(x=as.factor(mph_cat), y=prob_death_70*coeff), size=4, color=line_color) +
    geom_line(data=cam_data, aes(x=mph_cat, y=prob_death_70*coeff, group=1), linewidth=2, color=line_color) +
    scale_y_continuous(sec.axis = sec_axis(~.*1/coeff, name="Probability of death at this speed")) +
    geom_point(data=cam_data, aes(x=as.factor(mph_cat), y=prob_death_30*coeff), size=4, color=line_color) +
    geom_line(data=cam_data, aes(x=mph_cat, y=prob_death_30*coeff, group=1), linewidth=2, color=line_color) +
    scale_y_continuous(sec.axis = sec_axis(~.*1/coeff, name="% chance death at this speed for 70- and 30-year olds")) +
    theme_minimal() +
    ylab('Number of speeding tickets issued in May 2024') +
    theme(axis.title.y.left = element_text(colour = barcolor, size=font_size),
          axis.title.y.right = element_text(colour = line_color, size=font_size),
          plot.title = element_text(size = 18, face = "bold"),
          plot.caption = element_text(hjust = 0, size=font_size),
          axis.text.y.left = element_text(colour=barcolor, size=12),
          axis.text.y.right = element_text(colour=line_color, size=12),
          axis.text.x = element_text(size=12)) +
    xlab("mph") +
    ggtitle(plot_title) +
    geom_text(data=cam_data, 
              aes(x=mph_cat, y=label, label=label,),
              color=barcolor,
              vjust = -0.2) +
    geom_text(data=cam_data, 
              aes(x=mph_cat, y=prob_death_70*coeff, label=paste0(prob_death_70, "%"),),
              color=line_color,
              vjust = -1) +
    geom_text(data=cam_data, 
              aes(x=mph_cat, y=prob_death_30*coeff, label=paste0(prob_death_30, "%"),),
              color=line_color,
              vjust = -1) +
    labs(caption = caption)
  
  
  if (speed_limit == 20) {
    p <- p +
      annotate("text", x = 3, y = 87*coeff,
             label = "A 70-year-old hit by someone driving\n41-45 mph has a 78% chance of dying",
             color=line_color) +
    annotate("text", x = 1, y = 28*coeff,
             label = "A 30-year-old hit by someone\ndriving 31-35 mph has a roughly\n one-fifth chance of dying",
             color=line_color)
  } else if (speed_limit == 25) {
    p <- p +
          annotate("text", x = 3, y = 96*coeff,
             label = "A 70-year-old hit by someone driving\n46-50 mph has a 86% chance of dying",
             color=line_color) +
    annotate("text", x = 1, y = 40*coeff,
             label = "A 30-year-old hit by someone\ndriving 36-40 mph has a roughly\n 28% chance of dying",
             color=line_color)
  }
  
  
  cat(paste('\nThis many total tickets in May:', nrow(cam_data)))
  
  return(p)
}



analyze_a_few_cameras(locs_vec = c('1400 BLK U ST NW E/B', '1000 BLK U ST NW E/B'),
                      speed_limit=25,
                      coeff=6.4,
                      caption = "",
                      plot_title='One example: Speeding common, lethal on U Street.\nStreet design is the culprit.')

## Camera locations:
##   1400 BLK U ST NW E/B
##   1000 BLK U ST NW E/B
## This many total tickets in May: 759

# analyze_a_few_cameras(locs_vec = c('1500 BLK 7TH ST NW S/B'),
#                       speed_limit=25,
#                       coeff=1,
#                       caption=paste0("A person on a bicycle was killed nearby in 2022, at 7th & Rhode Island, according to DC's Vision Zero dashboard.\n",
#                                      "At least 3 people have sustained 'major injuries' on this same couple-block stretch since 2017.\n",
#                                      "Speeding data sourced from Open Data DC: https://opendata.dc.gov/datasets/725d52f45e11484c89974ae87910ad62_4/explore \n",
#                                      "Crash data sourced from DC's Vision Zero dashboard. Analysis by the author."),
#                       plot_title='Speeding common, lethal on 7th near Kennedy Rec Center. Street design is the culprit.')

Plotting camera locations

# https://ddot.dc.gov/release/ddot-deploying-automated-traffic-enforcement-cameras-new-locations-december
cameras_installed_at_end_of_2023 <-
  c('200 BLK RHODE ISLAND AVE NW E/B',
  '200 BLK RHODE ISLAND AVE NW W/B',
  '500 BLK MASSACHUSETTS AVE NW E/B',
  '3900 BLK GEORGIA AVE NW S/B',
  '5400 BLK GEORGIA AVE NW N/B',
  '1300 BLK RHODE ISLAND AVE NE W/B',
  '1300 BLK RHODE ISLAND AVE NE E/B',
  '1500 BLK RHODE ISLAND AVE NE NE/B',
  '3100 BLK S DAKOTA AVE NE SE/B',
  '5100 BLK S DAKOTA AVE NE SE/B',
  '5400 BLK S DAKOTA AVE NE NW/B',
  '1400 BLK PENNSYLVANIA AVE SE SE/B',
  '1200 BLK BRANCH AVE SE S/B',
  '1200 BLK BRANCH AVE SE N/B',
  '2500 BLK PENNSYLVANIA AVE SE NW/B',
  '3300 BLK MINNESOTA AVE SE N/B',
  '3300 BLK MINNESOTA AVE SE S/B',
  '3500 BLK MINNESOTA AVE SE N/B',
  '3700 BLK MINNESOTA AVE NE NE/B',
  '3900 BLK MINNESOTA AVE NE SW/B',
  '2500 BLK SHERIDAN ROAD SE N/B',
  '700 BLK N CAPITOL ST NE N/B',
  '2500 BLK SHERIDAN ROAD SE S/B',
  '4200 BLK MLK JR AVE SW S/B',
  '1900 BLK INDEPENDENCE AVE SE E/B',
  '2700 BLK NAYLOR ROAD SE S/B',
  '4300 BLK TEXAS AVE SE S/B',
  '700 BLK SOUTHERN AVE SE SW/B')


to_plot <-
  df %>% 
  filter(is.na(LONGITUDE)==F) %>%
  mutate(lat_lon_id = paste0(LATITUDE, "_", LONGITUDE)) %>%
  group_by(lat_lon_id) %>%
  summarise(ticket_count = n(),
            LOCATION = first(LOCATION),
            LATITUDE  = mean(LATITUDE, na.rm = T),
            LONGITUDE = mean(LONGITUDE, na.rm = T)) %>%
  filter(ticket_count > 10) %>%
  mutate(custom_label = paste0(LOCATION, "<br/>",
                               "Total tickets: ", format(ticket_count, big.mark=",")))

to_plot$color <- 'blue'
to_plot$color[to_plot$LOCATION %in% cameras_installed_at_end_of_2023] <- 'orange'


# All of the new cameras were installed east of 16th St NW, and the density of 
# cameras is higher in the eastern half of the District.
to_plot %>%
  leaflet(options = leafletOptions(zoomControl = TRUE,
                                 zoomSnap = 0.25,
                                 zoomDelta = 1)) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%  # Add default OpenStreetMap map tiles
  addCircleMarkers(lng=~LONGITUDE, lat=~LATITUDE, 
                   label=~lapply(custom_label, htmltools::HTML), 
                   radius=~ticket_count^.38,
                   color = ~color,
                   opacity=.8)

Handheld radar gun data:

## handheld radar gun data
hhrd <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vRRByUDHew_5iyaAWPt9JsmGnBfXqQmc1dulcnk6AeDzo37T80ufmMeIa3NNaB88ds2aTMsJ04eT8Y7/pub?gid=0&single=true&output=csv")


# total number of hours I was out there:
hhrd$start_time <- as.POSIXct(hhrd$data_collection_start_time, tz = 'EST', format='%H:%M %p')
hhrd$end_time <- as.POSIXct(hhrd$data_collection_end_time, tz = 'EST', format='%H:%M %p')
hhrd$hours_elapsed <- difftime(hhrd$end_time, hhrd$start_time, units = "hours")
hhrd %>%
  mutate(id = paste(date, data_collection_start_time)) %>%
  group_by(id) %>%
  mutate(session_index = row_number()) %>%
  filter(session_index == 1) %>%
  ungroup() %>%
  summarise(total_hours = sum(hours_elapsed, na.rm=T))

## # A tibble: 1 × 1
##   total_hours
##   <drtn>     
## 1 3.85 hours

quantile(x = hhrd$speed_mph[hhrd$speed_limit_mph == 25], probs = c(.93, .99))

##   93%   99% 
## 30.00 34.16

speed_vals <- 13:48
p_death_70 <- c(6.361, 7.353, 8.344, 9.336, 10.328, 11.319, 12.311, 13.303, 15.224, 
                17.176, 19.127, 21.079, 23.045, 25.929, 28.812, 31.696, 34.58, 
                37.464, 40.63, 43.86, 47.09, 50.325, 53.608, 56.89, 60.173, 63.455, 
                66.728, 69.569, 72.411, 75.252, 78.093, 80.417, 82.339, 84.262, 86.185, 88.107)

max_speed_index <- match(max(hhrd$speed_mph), speed_vals)
speed_vals <- speed_vals[1:max_speed_index]
p_death_70 <- p_death_70[1:max_speed_index]



ay <- list(
  tickfont = list(size=11.7),
  titlefont=list(size=14.6),
  overlaying = "y",
  nticks = 5,
  side = "right",
  title = "Second y axis"
)

coeff = .25
font_size = 14

ggplot() +
  geom_histogram(data=hhrd, 
                 aes(x=speed_mph, fill=as.factor(speed_limit_mph)),
                 color="#e9ecef", alpha=0.6, position = 'identity', bins = 29) +
  geom_line(aes(x=speed_vals, y=p_death_70*coeff)) +
  geom_point(aes(x=speed_vals, y=p_death_70*coeff)) +
  
  geom_vline(aes(xintercept=20,), color='orange') +
  geom_vline(aes(xintercept=25), color='skyblue') +
  
  scale_y_continuous(breaks = seq(0, 100, by = 5),
                     sec.axis = sec_axis(~.*1/coeff, name="% chance death at this speed for 70-year-old person walking")) +
  scale_fill_manual(values=c("orange", "skyblue")) +
  scale_x_continuous(breaks=seq(from=10, to=40, by=2)) +
  theme_minimal() +
  labs(fill="Speed limit\non the street") +
  ylab('Number of radar readings at this speed') +
  xlab('Measured vehicle speed in mph') +
  ggtitle('Hand-held radar gun data from Lanier Heights') +
  theme(axis.title.y.left = element_text(size=font_size),
        axis.title.y.right = element_text(size=font_size),
        plot.title = element_text(size = 18, face = "bold"),
        axis.text.y.left = element_text(size=12),
        axis.text.y.right = element_text(size=12),
        axis.text.x = element_text(size=12)) +
  annotate(
    'curve',
    x = 30,
    y = 20,
    yend = 9.5,
    xend = 29.8,
    linewidth = 1,
    curvature = 0.5,
    arrow = arrow(length = unit(0.5, 'cm'))
  ) +
  annotate("text", x = 35, y = 21,
           size=3.5,
           label = "Driver going 30 in front of\nthe Park Crest Apts (speed limit: 20).\nChance of killing 70-year-old: ~40%.") +
  annotate(
    'curve',
    x = 38,
    y = 5,
    yend = 1.5,
    xend = 39,
    linewidth = 1,
    curvature = -0.35,
    arrow = arrow(length = unit(0.5, 'cm'))
  ) +
  annotate("text", x = 36, y = 6,
           size=3.5,
           label = "Driver going 39 on\nAdams Mill, by the zoo.\nChance of killing 70-year-old: 66%")

Mapping cameras to wards:

# map cameras to wards
# data from here: https://opendata.dc.gov/datasets/DCGIS::wards-from-2022/about
geo <- sf::st_read( "Wards_from_2022.geojson", quiet = TRUE)
# plot(sf::st_geometry(geo))

geo <- geo %>% select(WARD, NAME, SHAPEAREA, geometry)
geo$area <- sf::st_area(geo)
sf::st_crs(geo)

## Coordinate Reference System:
##   User input: WGS 84 
##   wkt:
## GEOGCRS["WGS 84",
##     DATUM["World Geodetic System 1984",
##         ELLIPSOID["WGS 84",6378137,298.257223563,
##             LENGTHUNIT["metre",1]]],
##     PRIMEM["Greenwich",0,
##         ANGLEUNIT["degree",0.0174532925199433]],
##     CS[ellipsoidal,2],
##         AXIS["geodetic latitude (Lat)",north,
##             ORDER[1],
##             ANGLEUNIT["degree",0.0174532925199433]],
##         AXIS["geodetic longitude (Lon)",east,
##             ORDER[2],
##             ANGLEUNIT["degree",0.0174532925199433]],
##     ID["EPSG",4326]]

spatial_points <- st_as_sf(x = df[is.na(df$LATITUDE)==F,],                         
               coords = c("LONGITUDE", "LATITUDE"),
               crs = sf::st_crs(geo))
joined_data <- 
  geo %>%
  st_join(spatial_points,
          join = st_intersects, 
          left = TRUE)

joined_data <- sf::st_drop_geometry(joined_data)

to_plot <-
  joined_data %>%
  group_by(WARD) %>%
  summarise(ticket_count = n(),
         fine_amount          = sum(FINE_AMOUNT, na.rm=T),
         area                 = max(area, na.rm=T)) %>%
  mutate(tickets_per_area     = ticket_count / as.numeric(area))

# household income data from here: https://opendata.dc.gov/datasets/43e7da983bd24f8cbaceb6e654f0da3d/explore?showTable=true
to_plot$mean_hh_income <- NA
to_plot$mean_hh_income[to_plot$WARD == 1] <- 165893
to_plot$mean_hh_income[to_plot$WARD == 2] <- 181167
to_plot$mean_hh_income[to_plot$WARD == 3] <- 223149
to_plot$mean_hh_income[to_plot$WARD == 4] <- 163531
to_plot$mean_hh_income[to_plot$WARD == 5] <- 123490
to_plot$mean_hh_income[to_plot$WARD == 6] <- 169118
to_plot$mean_hh_income[to_plot$WARD == 7] <- 89172
to_plot$mean_hh_income[to_plot$WARD == 8] <- 77639

to_plot$fines_div_income <- to_plot$fine_amount / to_plot$mean_hh_income

ggplot(to_plot) +
  geom_bar(mapping = aes(x=WARD, y=tickets_per_area), stat='identity')

a <- ggplot(to_plot) +
  geom_bar(mapping = aes(x=WARD, y=mean_hh_income), stat='identity', fill='#109c37') +
  theme_minimal() +
  ggtitle("Average household income ($)") +
  xlab("Ward") +
  ylab("") +
  scale_x_continuous(breaks = 1:8) +
  scale_y_continuous(labels = scales::comma)

b <- ggplot(to_plot) +
  geom_bar(mapping = aes(x=WARD, y=fine_amount), stat='identity', fill='#109c37') +
  theme_minimal() +
  ggtitle("Speeding camera fines in May 2024") +
  xlab("Ward") +
  ylab("") +
  scale_x_continuous(breaks = 1:8) +
  scale_y_continuous(labels = scales::comma)


ggpubr::ggarrange(a, b, 
                  labels = c("A", "B"),
                  ncol = 2, nrow = 1)

ggplot(to_plot) +
  geom_bar(mapping = aes(x=WARD, y=fines_div_income), stat='identity', fill="#109c37") + 
  theme_minimal() +
  ggtitle("Speeding camera fines in May 2024 divided by average household income") +
  xlab("Ward") +
  ylab("Fines divided by average household income") +
  scale_x_continuous(breaks = 1:8)

# Major injuries by ward between 08/12/2022 and 08/12/2024:
# ward 1: 66
# ward 2: 103
# ward 3: 32
# ward 4: 49
# ward 5: 114
# ward 6: 99
# ward 7: 122
# ward 8: 128

Plotting District-wide trends in speeding camera data

###########################################################################
## SPEEDING CAMERA TREND ANALYSIS
###########################################################################


myfiles <- list.files("./speed_camera_data")


## Uncomment this to load and mutate the data from a scratch

# for (ii in 1:length(myfiles)) {
#   temp <- read.csv(file = file.path("./speed_camera_data", myfiles[ii]))
# 
# 
#   temp$year <- as.numeric(substr(temp$ISSUE_DATE, start = 1, stop = 4))
#   temp$month <- as.numeric(substr(x = temp$ISSUE_DATE, start = 6, stop = 7))
# 
#   year <- unique(temp$year)
#   month <- unique(temp$month)
# 
#   pct_paid_anything <- sum(temp$TOTAL_PAID > 0, na.rm=T) / sum(temp$FINE_AMOUNT > 0, na.rm=T)
# 
#   temp2 <-
#     temp %>%
#     filter(VIOLATION_CODE %in% speeding_violation_codes) %>%
#     filter(ISSUE_TIME >= 2130) %>%                                 # alternate analysis where we only look at evening crashes
#     group_by(year, month, LOCATION) %>%
#     summarise(num_tix = n()) %>%
#     filter(num_tix > 20)                                           # alternate analysis where we only look at evening crashes
#     # filter(num_tix > 50)
# 
#   temp <-
#     temp %>%
#     filter(VIOLATION_CODE %in% speeding_violation_codes) %>%
#     group_by(year, month, LOCATION) %>%
#     summarise(num_tix = n(),
#               amount_paid = sum(TOTAL_PAID, na.rm=T)) %>%
#     ungroup() %>%
#     filter(num_tix > 20) %>%                                       # alternate analysis where we only look at evening crashes
#     # filter(num_tix > 50) %>%
#     summarise(total_num_tix = sum(num_tix),
#               total_amtt_paid = sum(amount_paid),
#               avg_num_tix = mean(num_tix),
#               avg_amt_paid = mean(amount_paid),
#               num_unique_locs = length(unique(LOCATION)))
# 
#   temp$year <- year
#   temp$month <- month
#   temp$pct_paid_anything <- pct_paid_anything
# 
#   if (ii == 1) {
#     df <- temp
#     df2 <- temp2
#   } else {
#     df <- dplyr::bind_rows(df, temp)
#     df2 <- dplyr::bind_rows(df2, temp2)
#   }
# 
#   cat(paste0(ii, " of ", length(myfiles)))
#   gc()
# }
# 
# 
# df$month_string <- as.character(df$month)
# df$month_string[nchar(x = df$month_string) == 1] <- paste0("0", df$month_string[nchar(x = df$month_string) == 1])
# df$date = as.Date(paste(df$year, df$month_string, "01", sep = "-"))
# 
# 
# df2$month_string <- as.character(df2$month)
# df2$month_string[nchar(x = df2$month_string) == 1] <- paste0("0", df2$month_string[nchar(x = df2$month_string) == 1])
# df2$date = as.Date(paste(df2$year, df2$month_string, "01", sep = "-"))

# write.csv(x = df,
#           file = "evening speeding.csv",
#           row.names = F)
# 
# write.csv(x = df2,
#           file = "evening speeding detailed.csv",
#           row.names = F)

# write.csv(x = df,
#           file = "speeding.csv",
#           row.names = F)
#
# write.csv(x = df2,
#           file = "speeding detailed.csv",
#          row.names = F)


df <- read.csv("speeding.csv")
df$date = as.Date(df$date)
# most recent month's data (June 2024) is not fully in yet, so we'll omit:
df <- df[df$date != as.Date("2024-06-01"),]

df2 <- read.csv("speeding detailed.csv")
df2$date = as.Date(df2$date)
# most recent month's data (June 2024) is not fully in yet, so we'll omit:
df2 <- df2[df2$date != as.Date("2024-06-01"),]


df_evening <- read.csv("evening speeding.csv")
df_evening$date = as.Date(df_evening$date)
# most recent month's data (June 2024) is not fully in yet, so we'll omit:
df_evening <- df_evening[df_evening$date != as.Date("2024-06-01"),]

df2_evening <- read.csv("evening speeding detailed.csv")
df2_evening$date = as.Date(df2_evening$date)
# most recent month's data (June 2024) is not fully in yet, so we'll omit:
df2_evening <- df2_evening[df2_evening$date != as.Date("2024-06-01"),]


a <- ggplot(df, aes(x=date, y=total_num_tix)) +
  geom_line() +
  scale_y_continuous(labels=scales::comma_format(), limits = c(0, max(df$total_num_tix))) +
  theme_minimal() +
  ylab("") +
  xlab("") +
  ggtitle("Total number of tickets per month")

b <- ggplot(df, aes(x=date, y=total_amtt_paid)) +
  geom_line() +
  scale_y_continuous(labels = scales::dollar_format(prefix="$"), limits = c(0, max(df$total_amtt_paid))) +
  theme_minimal() +
  ylab("") +
  xlab("") +
  ggtitle("Total amount paid by month")

c <- ggplot(df, aes(x=date, y=avg_num_tix)) +
  geom_line() +
  scale_y_continuous(labels=scales::comma_format(), limits = c(0, max(df$avg_num_tix))) +
  theme_minimal() +
  ylab("") +
  xlab("") +
  ggtitle("Average number of tickets per location by month")

d <- ggplot(df, aes(x=date, y=avg_amt_paid)) +
  geom_line() +
  scale_y_continuous(labels=scales::comma_format(), limits = c(0, max(df$avg_amt_paid))) +
  theme_minimal() +
  ylab("") +
  xlab("") +
  ggtitle("Average amount paid per location by month")


ggplot(df, aes(x=date, y=num_unique_locs)) +
  geom_line() +
  scale_y_continuous(labels=scales::comma_format(), limits = c(0, max(df$num_unique_locs))) +
  theme_minimal() +
  ylab("") +
  xlab("") +
  ggtitle("Number of locations per month")

ggpubr::ggarrange(a, b, c, d, 
          labels = c("A", "B", "C", "D"),
          ncol = 2, nrow = 2)

c + geom_smooth(method='lm')

ggplot(df, aes(x=date, y=pct_paid_anything)) +
  geom_line() +
  scale_y_continuous(labels=scales::percent_format(), limits = c(0, 1), breaks=seq(0, 1, .1)) +
  annotate(
    'curve',
    x = as.Date("2016-01-01"),
    y = mean(df$pct_paid_anything[df$date <= as.Date("2020-04-01")]),
    yend = mean(df$pct_paid_anything[df$date <= as.Date("2020-04-01")]),
    xend = as.Date("2020-04-01"),
    linewidth = 1,
    curvature = 0,
    color='darkgrey'
    ) +
  annotate(
    'curve',
    x = as.Date("2020-04-01"),
    y = mean(df$pct_paid_anything[df$date >= as.Date("2020-04-01")]),
    yend = mean(df$pct_paid_anything[df$date >= as.Date("2020-04-01")]),
    xend = as.Date("2024-05-01"),
    linewidth = 1,
    curvature = 0,
    color='darkgrey'
  ) +
  theme_minimal() +
  ylab("") +
  xlab("") +
  ggtitle("Percent of tickets that listed a fine where the driver paid anything")

Do cameras issue fewer tickets a year after they first appear in the data set?

# do locations issue fewer tickets a year later after they first appear in the data set?
calc_change <- function(input_data, first_month, max_n_years, 
                        omit_cameras_present_in_first_month=T, 
                        year_option = c("omit_2020", "omit_data_after_2019", "omit_earlier_than_2022")) {
  suppressWarnings(
  suppressMessages(  
  temp3 <-
    input_data %>%
    group_by(LOCATION) %>%
    arrange(LOCATION, year, month) %>%
    mutate(row_index = row_number()) %>%
    mutate(key_months = row_index %in% first_month) %>%
    mutate(key_month_literal = ifelse(test=key_months, yes=month, no=NA)) %>% 
    mutate(
           key_month_literal_min = min(key_month_literal, na.rm=T),
           key_month_literal_max = max(key_month_literal, na.rm=T)
           ) %>% 
    ungroup() %>%
    filter(month == key_month_literal_max | month == key_month_literal_min) %>%
      group_by(LOCATION, year) %>%
      summarise(first_month = min(month, na.rm=T),
                date = min(date),
                month = mean(month, na.rm = T),
                num_tix = sum(num_tix, na.rm=T),
                ) %>%
      ungroup() %>%
      group_by(LOCATION) %>%
    arrange(LOCATION, year, month) %>%
    mutate(row_index_new = row_number(),
           n_obs     = n()) %>% 
   filter(n_obs > 1) %>%
    filter(row_index_new <= max_n_years) %>%
    mutate(min_year = min(year),
           max_year = max(year)) %>%
    mutate(min_year_val = num_tix[which.min(year)],
           max_year_val = num_tix[which.max(year)]
    )
  )
  )
  
  if (year_option == "omit_2020") {
    temp2 <- temp3 %>% filter(min_year != 2020 & max_year != 2020)
  }
  if (year_option == "omit_data_after_2019") {
    temp2 <- temp3 %>% filter((min_year < 2020) & (max_year < 2020))
  }
  if (year_option == "omit_earlier_than_2022") {
    temp2 <- temp3 %>% filter(min_year > 2021 & max_year > 2021)
  }
  if (year_option == "") {
    temp2 <- temp3
  }
  
  if (omit_cameras_present_in_first_month) {
    temp2 <-
      temp2 %>% 
      ungroup() %>%
      mutate(min_year_in_dataset = min(date)) %>%
      group_by(LOCATION) %>%
      mutate(min_date = min(date)) %>% 
      filter(min_date != min_year_in_dataset)
  }

  temp1 <-
    temp2 %>%
    group_by(LOCATION) %>%
    mutate(n_obs = n()) %>%
    filter(n_obs > 1) %>%
    mutate(min_year = min(year),
           max_year = max(year)) %>%
    mutate(min_year_val = num_tix[which.min(year)],
           max_year_val = num_tix[which.max(year)]
    ) %>%
    mutate(diff = max_year_val - min_year_val) %>%
    mutate(percent_change = diff / min_year_val * 100)
  
  temp <-
    temp1 %>%
    filter(row_index_new == 1)
  
  print(summary(temp$diff))
  cat(paste('Total change:', sum(temp$diff)))
  cat(paste('\nFirst year value:', sum(temp$min_year_val)))
  cat(paste('\nLast year value:', sum(temp$max_year_val)))
  cat(paste("\n% change:", round(100*sum(temp$diff) / sum(temp$min_year_val)) ))
  cat(paste("\nN=", nrow(temp)))
  cat('\n\n')
  return(temp)
}




rv <- calc_change(df2, first_month = 3, max_n_years = 2, year_option = "", 
                  omit_cameras_present_in_first_month = T)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -11921.0   -439.2    -55.5   -360.2    143.0   2562.0 
## Total change: -57637
## First year value: 223527
## Last year value: 165890
## % change: -26
## N= 160

rv <- calc_change(df2, first_month = 3, max_n_years = 2, year_option = "omit_2020", 
                  omit_cameras_present_in_first_month = T)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -11921.0   -536.0    -60.0   -457.3    132.0   2562.0 
## Total change: -38869
## First year value: 116536
## Last year value: 77667
## % change: -33
## N= 85

rv <- calc_change(df2, first_month = 3, max_n_years = 2, year_option = "omit_data_after_2019", 
                  omit_cameras_present_in_first_month = T)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -7614.00   -68.00    77.00   -25.51   350.00  2562.00 
## Total change: -1046
## First year value: 42974
## Last year value: 41928
## % change: -2
## N= 41

rv <- calc_change(df2, first_month = 3, max_n_years = 2, year_option = "omit_earlier_than_2022", 
                  omit_cameras_present_in_first_month = T)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -11921.0   -938.0   -341.0   -917.0    -98.5    497.0 
## Total change: -29345
## First year value: 57892
## Last year value: 28547
## % change: -51
## N= 32

ggplot(rv, aes(x = percent_change, y = LOCATION)) +
  geom_segment(aes(yend = LOCATION), xend = 0, colour = "grey50") +
  geom_point(size = 3) +
  geom_vline(aes(xintercept=0))

rv <- calc_change(df2, first_month = 2, max_n_years = 2, year_option = "", 
                  omit_cameras_present_in_first_month = T)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -10889.0   -708.0   -132.0   -524.4     90.0   1727.0 
## Total change: -84433
## First year value: 248655
## Last year value: 164222
## % change: -34
## N= 161

rv <- calc_change(df2, first_month = 2, max_n_years = 2, year_option = "omit_2020", 
                  omit_cameras_present_in_first_month = T)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -9143.0  -813.8   -46.5  -620.8   123.2  1579.0 
## Total change: -53386
## First year value: 128672
## Last year value: 75286
## % change: -41
## N= 86

rv <- calc_change(df2, first_month = 2, max_n_years = 2, year_option = "omit_data_after_2019", 
                  omit_cameras_present_in_first_month = T)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -8426.0  -121.2    19.5  -200.6   241.0  1579.0 
## Total change: -8427
## First year value: 44227
## Last year value: 35800
## % change: -19
## N= 42

rv <- calc_change(df2, first_month = 2, max_n_years = 2, year_option = "omit_earlier_than_2022", 
                  omit_cameras_present_in_first_month = T)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -9143.0  -959.2  -521.0 -1073.2   -60.5  1048.0 
## Total change: -34343
## First year value: 66619
## Last year value: 32276
## % change: -52
## N= 32

rv <- calc_change(df2_evening, first_month = 3, max_n_years = 2, year_option = "", 
                  omit_cameras_present_in_first_month = T)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -557.00  -26.75   -2.00  -14.00    6.00  227.00 
## Total change: -364
## First year value: 2874
## Last year value: 2510
## % change: -13
## N= 26

Are streets wider in wards 7 and 8?

blocks <- sf::st_read( "Roadway_Block.geojson", quiet = TRUE)
# plot(sf::st_geometry(blocks))
# View(sf::st_drop_geometry(blocks))

to_plot <-
  sf::st_drop_geometry(blocks) %>% 
  group_by(WARD_ID, SPEEDLIMITS_OB) %>%
    summarise(mean_road_width = mean(TOTALTRAVELLANEWIDTH, na.rm=T)) %>%
  filter(is.na(SPEEDLIMITS_OB)==F & SPEEDLIMITS_OB > 15 & WARD_ID %in% c(3,7,8) & SPEEDLIMITS_OB < 35)

ggplot(data=to_plot, 
       mapping=aes(x=as.factor(SPEEDLIMITS_OB), y=mean_road_width, fill=as.factor(WARD_ID))) +
  geom_col(position='dodge') +
  geom_text(
    aes(label = round(to_plot$mean_road_width, 0)),
    colour = "black", size = 3,
    vjust = -1.5, position = position_dodge(.9)
  ) +
  theme_minimal() +
  xlab("Speed limit on the street") +
  ylab('Average total width of the "travel lanes" in feet') +
  ggtitle('Roads are wider in Wards 7 & 8, which leads drivers to go faster') +
  scale_fill_discrete(name="Ward")

ATE analysis

pete rodrigue

2024-08-04