The source code for this project is at:
https://github.com/justlab/COVID_19_admin_disparities
This project is part of the following publication:
Carrión, D., Colicino, E., Pedretti, N. F., Arfer, K. B., Rush, J., DeFelice, N., & Just, A. C. (2021). Neighborhood-level disparities and subway utilization during the COVID-19 pandemic in New York City. Nature Communications, 12(1). https://doi.org/10.1038/s41467-021-24088-7

t_start = Sys.time()
# CRAN packages:
library(tidyverse)
library(sf)
library(lubridate)
library(tidycensus)
library(ggExtra)
library(ggridges)
library(ggsn)
library(ragg) 
library(rstan)
library(drc)
library(spdep)
library(broom)
library(MASS)
library(spatialreg)
library(here)
library(pdftools)
library(matrixStats)
library(egg)
library(ggpubr)
library(scales)
library(qs)
library(corrplot)
library(readxl)
library(splines)
library(magic)
library(httr)
library(jsonlite)
library(DHARMa)
library(kableExtra)
library(lwgeom)
# Github packages available via: 
#   remotes::install_github("justlab/Just_universal")  
#   remotes::install_github("justlab/MTA_turnstile")
library(Just.universal) 

Session Configuration

#### SESSION CONFIGURATION ####
options(dplyr.summarise.inform=FALSE)

# data will default to a subfolder "data/" within the working directory,
# unless 1. set by an environment variable:
data.root = Sys.getenv("COVID_DATA")
# or 2. set with an alternative path here:
if (data.root == "") data.root = here("data")
if (!dir.exists(data.root)) dir.create(data.root)
print(paste("data being downloaded into directory", dQuote(data.root)))
## [1] "data being downloaded into directory \"/home/rushj03/dev/COVID_19_admin_disparities/data\""
# Get some data from the git repository rather than downloading from original
#   source, to avoid changes in model results due to updated data
use_repo_data = TRUE

if(Sys.getenv("MTA_TURNSTILE_DATA_DIR") == ""){ 
  # set up default download location for MTA turnstile data
  mta_dir = file.path(data.root, "mta_turnstile")
  if(!dir.exists(mta_dir)) dir.create(mta_dir, recursive = TRUE)
  Sys.setenv(MTA_TURNSTILE_DATA_DIR = mta_dir)
}
library(MTA.turnstile)
## MTA.turnstile data directory: /home/rushj03/dev/COVID_19_admin_disparities/data/mta_turnstile
t_turnstile_1 = Sys.time()

# output path for figures
fig.path = here("figures")
if(!dir.exists(fig.path)) dir.create(fig.path)

export.figs = TRUE 
vector.figs = FALSE
if(export.figs) message(paste0("Saving ", ifelse(vector.figs, "PDF", "PNG") ," figures to:\n "), fig.path) else message("Not saving figures")
## Saving PNG figures to:
##  /home/rushj03/dev/COVID_19_admin_disparities/figures
# source file output directory for publication
source_path = here("figures", "source_files")
if(!dir.exists(source_path)) dir.create(source_path)

# To generate census data, you need an API key, which you can request here: https://api.census.gov/data/key_signup.html
#census_api_key("INSERT YOUR CENSUS API KEY HERE", install = TRUE) 
if(Sys.getenv("CENSUS_API_KEY")=="") warning("Census API Key Missing")

# pairmemo function cache
pairmemo.dir = file.path(data.root, "pairmemo")
dir.create(pairmemo.dir, showWarnings = F)
pm = function(...) pairmemo(
    directory = pairmemo.dir,
    n.frame = 2,
    ...)

About pairmemo:

pairmemo caches the results of functions on disk, keyed by their input parameters.
If you run this script and then change the code, we recommend clearing the pairmemo cache of any function that runs after the point of your change before re-running the script.
Each of the following commented lines clears the pairmemo cache for one function. If you do not change the input Census or Pluto data, you will likely only need to run the last one. A minimal sketch of the caching pattern follows the list.

#pairmemo.clear(get.Pluto)
#pairmemo.clear(get.qn.blocks)
#pairmemo.clear(acs.main)
#pairmemo.clear(get_essential_acs)
#pairmemo.clear(get.tract.res)
#pairmemo.clear(fit_BWQS_model)
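A minimal sketch of the caching pattern, using a hypothetical function that is not part of this analysis: wrapping a definition in pm() memoizes it to disk, so a repeat call with the same arguments reads the cached result instead of recomputing.

#pm(slow.square <- function(x) { Sys.sleep(1); x^2 })
#slow.square(4) # first call: computed and written to the cache
#slow.square(4) # second call: read back from the pairmemo cache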

Functions

#### Functions ####

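# Read a CSV and record its path in a `filename` column (used below to derive dates)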
read_w_filenames <- function(flnm) {
  read_csv(flnm) %>%
    mutate(filename = flnm)
}

extract_waic <- function (stanfit){
  # Compute WAIC and a truncated-importance-sampling LOO estimate from the
  # model's pointwise log-likelihood matrix (draws x observations).
  log_lik <- rstan::extract(stanfit, "log_lik")$log_lik
  dim(log_lik) <- if (length(dim(log_lik)) == 1) 
    c(length(log_lik), 1)
  else c(dim(log_lik)[1], prod(dim(log_lik)[2:length(dim(log_lik))]))
  S <- nrow(log_lik) # posterior draws
  n <- ncol(log_lik) # observations
  lpd <- log(colMeans(exp(log_lik))) # log pointwise predictive density
  p_waic <- colVars(log_lik)         # effective number of parameters
  elpd_waic <- lpd - p_waic
  waic <- -2 * elpd_waic
  # importance weights for LOO, truncated at sqrt(S) for numerical stability
  loo_weights_raw <- 1/exp(log_lik - max(log_lik))
  loo_weights_normalized <- loo_weights_raw/matrix(colMeans(loo_weights_raw), 
                                                   nrow = S, ncol = n, byrow = TRUE)
  loo_weights_regularized <- pmin(loo_weights_normalized, sqrt(S))
  elpd_loo <- log(colMeans(exp(log_lik) * loo_weights_regularized)/colMeans(loo_weights_regularized))
  p_loo <- lpd - elpd_loo
  pointwise <- cbind(waic, lpd, p_waic, elpd_waic, p_loo, elpd_loo)
  total <- colSums(pointwise)
  se <- sqrt(n * colVars(pointwise))
  return(list(waic = total["waic"], elpd_waic = total["elpd_waic"], 
              p_waic = total["p_waic"], elpd_loo = total["elpd_loo"], 
              p_loo = total["p_loo"]))
}

# Used to export most figures; figures that print as a side effect, like corrplot(), are not supported
write_figs <- function(fig, fig_name, fig_width, fig_height, vector_figs = vector.figs){
  fig_path = file.path(fig.path, paste0(fig_name, "_", Sys.Date()))
  if("gg" %in% class(fig)){
    ggsave(fig, filename = paste0(fig_path, ifelse(vector_figs, ".svg", ".png")), 
           width = fig_width, height = fig_height, dpi = 300)
  } else {
    if(vector_figs){
      svg(paste0(fig_path, ".svg"), pointsize = 9)
    } else {
      png(filename = paste0(fig_path, ".png"), width = fig_width, height = fig_height)
    }
    print(fig)
    dev.off()
  }
  if(vector_figs){ # convert SVG to PDF
    system(paste0("rsvg-convert -f pdf -o ", fig_path, ".pdf", " ", fig_path, ".svg"))
    unlink(paste0(fig_path, ".svg"))
  }
  message(paste0("Wrote ", fig_path, ifelse(vector_figs, ".pdf", ".png")))
}

# Download a file, update metadata records, and load it with function `f`
# File metadata is stored in a sqlite file, by default in data/downloads/meta.sqlite
download = function(url, to, f, ...){
    f(download.update.meta(url, file.path(data.root, "downloads"), to),
        ...)
}
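
A minimal sketch of how this helper is used (hypothetical URL; the real calls follow below): the file is fetched once into data/downloads, its metadata is recorded, and the local path is handed to the reader function.

#example_df <- download("https://example.com/example.csv", "example.csv", read_csv)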

Load Data

#### Load Data ####
# Check if subway turnstile data is already downloaded
lf1 = list.files(file.path(Sys.getenv("MTA_TURNSTILE_DATA_DIR"), "downloads/mta_turnstile/"), pattern = ".txt")
# Check if other data is already downloaded
lf2 = list.files(file.path(data.root, "downloads"), pattern = "ny_xwalk.csv.gz")
# Subway ridership data
Subway_ridership_by_UHF <- relative.subway.usage(2020L, "nhood")
t_turnstile_2 = Sys.time()
# get the Pluto dataset from https://www1.nyc.gov/site/planning/data-maps/open-data/dwn-pluto-mappluto.page
pm(fst = T,
get.Pluto <- function() download(
    "https://www1.nyc.gov/assets/planning/download/zip/data-maps/open-data/nyc_pluto_20v3_csv.zip",
    "pluto.zip",
    function(p)
        read_csv(unz(p, "pluto_20v3.csv"), col_types = cols(spdist2 = col_character(), 
                                                overlay2 = col_character(),
                                                zonedist4 = col_character()))[,
            c("landuse", "bbl", "numfloors", "unitstotal", "unitsres",
                "zipcode")]))
Pluto <- as.data.frame(get.Pluto())
# If you get an error in running this function, the download may be incomplete/corrupt
# In that case, delete the pluto.zip file and try again:
#unlink(file.path(data.root, "downloads", "pluto.zip"))
if (file.exists(file.path(data.root, "Bldg_Footprints.qs"))) {
    Bldg_Footprints <- qread(file.path(data.root, "Bldg_Footprints.qs"))
} else {
  Bldg_Footprints <- download(
    # https://data.cityofnewyork.us/Housing-Development/Building-Footprints/nqwf-w8eh
    "https://data.cityofnewyork.us/api/geospatial/nqwf-w8eh?method=export&format=Shapefile",
    "building_footprints.zip",
    function(p)
      st_read(paste0("/vsizip/", p)))
  qsave(Bldg_Footprints, file.path(data.root, "Bldg_Footprints.qs"))
}
ZCTA_by_boro <- download(
    "https://web.archive.org/web/20180209024431id_/https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoods.htm",
    "uhf_neighborhoods.html",
    function(p)
       {# XML::readHTMLTable doesn't identify the columns correctly.
        x = str_match_all(read_file(p), regex(dotall = T, paste0(
            '<td headers="header1"[^>]+>\\s*(.+?)</td>',
            '(.+?)',
            '(?=<td headers="header1"|</table>)')))[[1]]
        do.call(rbind, lapply(1 : nrow(x), function(i)
            data.frame(boro = x[i, 2], zip = as.integer(
                str_extract_all(x[i, 3], "\\b\\d{5}\\b")[[1]]))))})

# Download the specific day (May 7, 2020) of test results by ZCTA used in this analysis
ZCTA_test_download <- download(
  "https://raw.githubusercontent.com/nychealth/coronavirus-data/6d7c4a94d6472a9ffc061166d099a4e5d89cd3e3/tests-by-zcta.csv",
  "2020-05-07_tests-by-zcta.csv",
  identity
)

# Read the downloaded COVID-19 testing data and derive the date from the filename
ZCTA_test_series <- ZCTA_test_download %>% 
  map_df(~read_w_filenames(.)) %>%
  mutate(date = as.Date(str_extract(filename, "[:digit:]{4}-[:digit:]{2}-[:digit:]{2}"))) %>%
  dplyr::select(-filename)
## Parsed with column specification:
## cols(
##   MODZCTA = col_double(),
##   Positive = col_double(),
##   Total = col_double(),
##   zcta_cum.perc_pos = col_double()
## )
# UHF definitions by zip codes
UHF_ZipCodes <- download(
    "http://www.infoshare.org/misc/UHF.pdf",
    "uhf_zips.pdf",
    function(p)
       {x = str_match_all(pdf_text(p)[2],
            "(\\d+)\\s+(\\S.+?\\S)\\s*([0-9,]+)")[[1]]
        do.call(rbind, lapply(1 : nrow(x), function(i)
            data.frame(code = x[i, 2], name = x[i, 3], zip = as.integer(
                str_extract_all(x[i, 4], "\\b\\d{5}\\b")[[1]]))))})

# UHF shapefile 
UHF_shp <- download(
    "https://www1.nyc.gov/assets/doh/downloads/zip/uhf42_dohmh_2009.zip",
    "nyc_uhf_nhoods_shapefile.zip",
    function(p) read_sf(paste0("/vsizip/", p, "/UHF_42_DOHMH_2009")))

# NYC boroughs from NYC Open Data
NYC_basemap_shp <- download(
  "https://data.cityofnewyork.us/api/geospatial/tqmj-j8zm?method=export&format=Shapefile",
  "Borough_Boundaries.zip",
  function(p){
    unzip(p, exdir = file.path(data.root, "downloads"))
    # open data platform generates a random UUID for every download
    ufile = list.files(file.path(data.root, "downloads"), pattern = "geo_export_.*\\.shp", full.names = TRUE)
    st_read(ufile, stringsAsFactors = FALSE, quiet = TRUE) %>% st_transform(., crs = 2263)
  } 
)

# DOHMH MODZCTA Shapefile
MODZCTA_NYC_shp <- download(
  "https://data.cityofnewyork.us/api/geospatial/pri4-ifjk?method=export&format=Shapefile",
  "Modified Zip Code Tabulation Areas (MODZCTA).zip", 
  function(p) read_sf(paste0("/vsizip/", p))
)

# Food outlets 
if(use_repo_data){
  food_retail <- read_csv(here("data", "retail_food_stores_2019-06-13.csv"))
} else {
  food_retail <- download(
      "https://data.ny.gov/api/views/9a8c-vfzj/rows.csv",
      "retail_food_stores.csv",
      read_csv)
}
## Parsed with column specification:
## cols(
##   County = col_character(),
##   `License Number` = col_character(),
##   `Operation Type` = col_character(),
##   `Establishment Type` = col_character(),
##   `Entity Name` = col_character(),
##   `DBA Name` = col_character(),
##   `Street Number` = col_character(),
##   `Street Name` = col_character(),
##   `Address Line 2` = col_logical(),
##   `Address Line 3` = col_logical(),
##   City = col_character(),
##   State = col_character(),
##   `Zip Code` = col_double(),
##   `Square Footage` = col_double(),
##   Location = col_character()
## )
# Download deaths by ZCTA as of May 23rd
deaths_by23May2020_by_zcta <- download(
  "https://raw.githubusercontent.com/nychealth/coronavirus-data/8d88b2c06cf6b65676d58b28979731faa10c193c/data-by-modzcta.csv",
  "2020-05-23_data-by-modzcta.csv",
  read_csv
)
## Parsed with column specification:
## cols(
##   MODIFIED_ZCTA = col_double(),
##   NEIGHBORHOOD_NAME = col_character(),
##   BOROUGH_GROUP = col_character(),
##   COVID_CASE_COUNT = col_double(),
##   COVID_CASE_RATE = col_double(),
##   POP_DENOMINATOR = col_double(),
##   COVID_DEATH_COUNT = col_double(),
##   COVID_DEATH_RATE = col_double(),
##   PERCENT_POSITIVE = col_double()
## )
# download MODZCTA to ZCTA crosswalk, current version from repo
modzcta_to_zcta <- download(
  "https://raw.githubusercontent.com/nychealth/coronavirus-data/master/Geography-resources/ZCTA-to-MODZCTA.csv",
  "ZCTA-to-MODZCTA.csv",
  read_csv
)
## Parsed with column specification:
## cols(
##   ZCTA = col_double(),
##   MODZCTA = col_double()
## )
modzcta_to_zcta1 <- modzcta_to_zcta %>% mutate(ZCTA = as.character(ZCTA))
modzcta_to_zcta2 <- modzcta_to_zcta1 %>% mutate(MODZCTA = as.character(MODZCTA))
MODZCTAs_in_NYC <- as.character(unique(ZCTA_test_series$MODZCTA))
ZCTAs_in_NYC <- as.character(unique(modzcta_to_zcta$ZCTA))

# download ZIP to Tract crosswalk from HUD
zip_to_tract <- download( 
  "https://www.huduser.gov/portal/datasets/usps/ZIP_TRACT_062020.xlsx",
  "ZIP_TRACT_062020.xlsx",
  function(p) suppressWarnings(read_excel(path = p, col_types = c("text", "text", "numeric", "skip", "skip", "skip")))
) 

# Block to ZCTA and County crosswalk for NY
ny_xwalk <- download("https://lehd.ces.census.gov/data/lodes/LODES7/ny/ny_xwalk.csv.gz",
                     "ny_xwalk.csv.gz",
                     function(p) {
                       zzf = gzfile(p)
                       read.csv(zzf) %>% 
                         dplyr::select(cty, tabblk2010, zcta, blklondd, blklatdd) %>% 
                         mutate(tabblk2010 = as.character(tabblk2010), 
                                zcta = as.character(zcta),
                                cntyfips = as.character(cty)) %>% 
                         dplyr::select(-cty)
                     }
)

# Lookup tables to help join the many data sources, which name NYC counties and boroughs differently
NYC_counties1 <- c("Bronx","Kings","Queens","New York","Richmond")
NYC_counties1_full <- c("Bronx County","Kings County","Queens County","New York County","Richmond County")
NYC_boro_county_match <- tibble(County = c("Bronx","Kings","Queens","New York","Richmond"), 
                                boro = c("Bronx","Brooklyn","Queens","Manhattan","Staten Island"), 
                                full_county = c("Bronx County","Kings County","Queens County","New York County","Richmond County"),
                                fips = c("36005", "36047", "36081", "36061", "36085"))

# stan model script
BWQS_stan_model <- here("code", "nb_bwqs_cov.stan") 

Census Data

#### Census Data ####

# function to pull 2010 block population for Queens & Nassau counties
pm(get.qn.blocks <- function(){
  nassau_blk_pop <- get_decennial(geography = "block", variables = "P001001", 
                                  state = "NY", county = "Nassau", geometry = FALSE)
  queens_blk_pop <- get_decennial(geography = "block", variables = "P001001", 
                                  state = "NY", county = "Queens", geometry = FALSE)
  bind_rows(nassau_blk_pop, queens_blk_pop) %>% 
    dplyr::select(GEOID, value) %>% rename("pop2010" = "value")
})

# function to pull 2018 ACS data 
pm(acs.main <- function(admin_unit = c("zcta", "tract"), state_unit = c(NULL, "NY"), sf_shapes = c(TRUE, FALSE)) {
     ACS_Data <- get_acs(geography = admin_unit,
                         state = state_unit,
                         geometry = sf_shapes,
                         variables = c(medincome = "B19013_001",
                                       total_pop1 = "B01003_001",
                                       fpl_100 = "B06012_002", 
                                       fpl_100to150 = "B06012_003",
                                       median_rent = "B25031_001",
                                       total_hholds1 = "B22003_001",
                                       hholds_snap = "B22003_002",
                                       over16total_industry1 = "C24050_001",
                                       ag_industry = "C24050_002",
                                       construct_industry = "C24050_003",
                                       manufact_industry = "C24050_004",
                                       wholesaletrade_industry = "C24050_005",
                                       retail_industry = "C24050_006",
                                       transpo_and_utilities_industry = "C24050_007",
                                       information_industry = "C24050_008",
                                       finance_and_realestate_industry = "C24050_009",
                                       science_mngmt_admin_industry = "C24050_010",
                                       edu_health_socasst_industry = "C24050_011",
                                       arts_entertain_rec_accomodate_industry = "C24050_012",
                                       othsvcs_industry = "C24050_013",
                                       publicadmin_industry = "C24050_014",
                                       total_commute1 = "B08301_001",
                                       drove_commute = "B08301_002",
                                       pubtrans_bus_commute = "B08301_011",
                                       pubtrans_subway_commute = "B08301_013",
                                       pubtrans_railroad_commute = "B08301_014", # corrected from a duplicated "B08301_013"; B08301_014 is the ACS railroad category
                                       pubtrans_ferry_commute = "B08301_015",
                                       taxi_commute = "B08301_016",
                                       bicycle_commute = "B08301_018",
                                       walked_commute = "B08301_019",
                                       workhome_commute = "B08301_021",
                                       unemployed = "B23025_005",
                                       under19_noinsurance = "B27010_017",
                                       age19_34_noinsurance = "B27010_033",
                                       age35_64_noinsurance = "B27010_050",
                                       age65plus_noinsurance = "B27010_066",
                                       hisplat_raceethnic = "B03002_012",
                                       nonhispLat_white_raceethnic = "B03002_003",
                                       nonhispLat_black_raceethnic = "B03002_004",
                                       nonhispLat_amerindian_raceethnic = "B03002_005",
                                       nonhispLat_asian_raceethnic = "B03002_006",
                                       age65_plus  = "B08101_008"),
                         year = 2018,
                         output = "wide",
                         survey = "acs5")
     
     if(admin_unit=="zcta"){
       ACS_Data <- ACS_Data %>% #only pull out the estimates and cleaning variable names
         mutate(GEOID = str_sub(GEOID, -5, -1)) %>%
         filter(GEOID %in% ZCTAs_in_NYC) %>%
         dplyr::select(-NAME)  %>%
         dplyr::select(GEOID, !ends_with("M")) %>%
         rename_at(vars(ends_with("E")), .funs = list(~str_sub(., end = -2)))
     }
     
     if(admin_unit=="tract"){
       ACS_Data <- ACS_Data %>% #only pull out the estimates and cleaning variable names
         filter(substr(GEOID,1,5) %in% NYC_boro_county_match$fips) %>% # Tracts in NYC counties
         dplyr::select(-NAME)  %>%
         dplyr::select(GEOID, !ends_with("M")) %>%
         rename_at(vars(ends_with("E")), .funs = list(~str_sub(., end = -2)))
     }
     
     return(ACS_Data)
   })

# Use 2010 block population to scale Nassau County ZCTA contribution to NYC MODZCTAs
# This is the method used according to data dictionary at NYC Open Data:
#   https://data.cityofnewyork.us/Health/Modified-Zip-Code-Tabulation-Areas-MODZCTA-/pri4-ifjk
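# Toy illustration (made-up numbers): a border ZCTA whose 2010 census blocks
# hold 1,000 residents, 600 of them in Queens, gets in_NYC_prop = 600/1000 = 0.6,
# so its ACS counts are multiplied by 0.6 before aggregation to MODZCTA.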
nassau_zcta_weights <- function(zcta_acs, mz_to_z, blk_to_z){
  blk_pop = get.qn.blocks()
  
  modzcta_span = c("11429", "11411", "11004") # MODZCTA with ZCTAs from both Queens & Nassau
  border_zcta <- mz_to_z %>% filter(MODZCTA %in% modzcta_span) %>% pull(ZCTA)
  # Filter block-zcta crosswalk table to Queens-Nassau ZCTA of interest
  blk_to_z <- blk_to_z %>% filter(zcta %in% border_zcta)
  
  # Join population to block-zcta crosswalk
  xwalk_pop <- blk_to_z %>% left_join(blk_pop, by = c("tabblk2010" = "GEOID")) 
  
  # Summarise 2010 population by ZCTA to calculate proportions inside NYC
  zcta_pop_2010 <- xwalk_pop %>%
    group_by(zcta) %>%
    summarise(z_pop_2010 = sum(pop2010), .groups = "drop_last")
  queens_zcta_pop_2010 <- xwalk_pop %>%
    filter(cntyfips == "36081") %>%
    group_by(zcta) %>%
    summarise(queens_z_pop_2010 = sum(pop2010), .groups = "drop_last")
  zcta_pop_props <- queens_zcta_pop_2010 %>% 
    left_join(zcta_pop_2010, by = "zcta") %>% 
    mutate(in_NYC_prop = queens_z_pop_2010/z_pop_2010)
  
  zcta_weights <- zcta_acs %>% dplyr::select(GEOID) %>% 
    left_join(zcta_pop_props, by = c("GEOID" = "zcta")) %>%
    dplyr::select(GEOID, in_NYC_prop) %>% 
    mutate(in_NYC_prop = 
             case_when(is.na(in_NYC_prop) ~ 1,
                       TRUE               ~ in_NYC_prop))
  
  # Apply weights for all Census variables except median vars
  zcta_acs <- zcta_acs %>% left_join(zcta_weights, by = "GEOID") 
  varvec <- 1:ncol(zcta_acs)
  varvec <- varvec[-grep("GEOID|in_NYC_prop|medincome|median_rent", names(zcta_acs))]
  zcta_acs <- zcta_acs %>% mutate_at(vars(all_of(varvec)), ~ . * in_NYC_prop)
}

# function to clean ACS data
clean_acs_data_and_derive_vars <- function(df, admin_unit = c("zcta", "tract")){
  if(admin_unit=="zcta"){
    ACS_Data1a <- df %>%
      left_join(., modzcta_to_zcta1, by = c("GEOID" = "ZCTA")) %>%
      group_by(MODZCTA) %>%
      summarise_at(vars(medincome, median_rent), ~weighted.mean(., total_pop1, na.rm = T)) %>% 
      rename(zcta = "MODZCTA")
    
    ACS_Data1b <- df %>%
      left_join(., modzcta_to_zcta1, by = c("GEOID" = "ZCTA")) %>%
      group_by(MODZCTA) %>%
      summarise_at(vars(total_pop1:fpl_100to150, total_hholds1:age65_plus), ~sum(.)) %>%
      mutate_at(vars(ends_with("_commute")), ~round((./total_commute1)*100, 2)) %>% #proportion of people relying on a given mode of transit
      mutate_at(vars(ends_with("_raceethnic")), ~round((./total_pop1)*100, 2)) %>% #proportion of ppl reporting a given race/ethncity 
      mutate(not_insured = round(((under19_noinsurance + age19_34_noinsurance + age35_64_noinsurance + age65plus_noinsurance) / total_pop1)*100, 2), #proportion uninsured
             #snap_hholds = round((hholds_snap/total_hholds1)*100, 2), #proportion relying on SNAP
             #fpl_150 = round(((fpl_100+fpl_100to150)/total_pop1)*100, 2), #proportion 150% or less of FPL
             unemployed = round((unemployed/over16total_industry1)*100, 2), #proportion unemployed
             not_quarantined_jobs = round(((ag_industry+(construct_industry*.25)+wholesaletrade_industry+ #an estimate of who is still leaving the house for work based on industry
                                              (edu_health_socasst_industry*.5)+transpo_and_utilities_industry)/over16total_industry1)*100, 2)) %>%
      dplyr::select(-ends_with("_noinsurance"), -fpl_100, -fpl_100to150, -ends_with("_industry"), -hholds_snap) %>%
      rename(zcta = "MODZCTA") 
    
    ACS_Data2 <- left_join(ACS_Data1a, ACS_Data1b, by = "zcta") %>%
      mutate(zcta = as.character(zcta))
    
  } else{
    
    ACS_Data2 <- df %>%
      mutate_at(vars(ends_with("_commute")), ~round((./total_commute1)*100, 2)) %>% #proportion of people relying on a givenmode of transit
      mutate_at(vars(ends_with("_raceethnic")), ~round((./total_pop1)*100, 2)) %>% #proportion of ppl reporting a given race/ethncity 
      mutate(not_insured = round(((under19_noinsurance + age19_34_noinsurance + age35_64_noinsurance + age65plus_noinsurance) / total_pop1)*100, 2), #proportion uninsured
             unemployed = round((unemployed/over16total_industry1)*100, 2), #proportion unemployed
             not_quarantined_jobs = round(((ag_industry+(construct_industry*.25)+wholesaletrade_industry+ #an estimate of who is still leaving the house for work based on industry
                                              (edu_health_socasst_industry*.5)+transpo_and_utilities_industry)/over16total_industry1)*100, 2)) %>%
      dplyr::select(-ends_with("_noinsurance"), -ends_with("_industry"),-fpl_100, -fpl_100to150,-hholds_snap) 
  }
  
  return(ACS_Data2)
}

# Functions to pull mode of transportation for our approximation of essential workers
pm(fst = T, 
   get_essential_acs <- function(admin_unit, state_unit) {
     get_acs(geography = admin_unit, #pull down the relevant categories 
             state = state_unit,
             variables = c(ag_car1_commute = "B08126_017",
                           ag_pubtrans_commute = "B08126_047",
                           construct_car1_commute ="B08126_018",
                           construct_pubtrans_commute = "B08126_048",
                           wholesale_car1_commute = "B08126_020",
                           wholesale_pubtrans_commute = "B08126_050",
                           transpo_car1_commute = "B08126_022",
                           transpo_pubtrans_commute = "B08126_052",
                           ed_hlthcare_car1_commute = "B08126_026",
                           ed_hlthcare_pubtrans_commute = "B08126_056"),
             year = 2018, 
             output = "wide",
             survey = "acs5")
})

acs.essential <- function(admin_unit, zcta_pop = NULL, state_unit = NULL) {
  if(!admin_unit %in% c("zcta", "tract")) stop("admin_unit must be either 'zcta' or 'tract'")
  if(admin_unit == "tract" & is.null(state_unit)) stop("state_unit must be set to download tracts")
  if(!is.null(state_unit)) if(state_unit != "NY") stop("state_unit must be either NULL or 'NY'")
  
  ACS_EssentialWrkr_Commute <- get_essential_acs(admin_unit = admin_unit, state_unit = state_unit)
     
   if(admin_unit == "zcta"){
     if(is.null(zcta_pop)) stop("zcta_pop must be set for scaling MODZCTA on Queens/Nassau boundary")
     
     ACS_Essential_worker_estimates <- ACS_EssentialWrkr_Commute %>% #clean data and aggregate 
       dplyr::select(-ends_with("M"), -NAME) %>%
       mutate(GEOID = str_sub(GEOID, -5, -1)) %>%
       filter(GEOID %in% ZCTAs_in_NYC) %>%
       # scale ZCTAs by proportion of population in NYC
       right_join(zcta_pop %>% dplyr::select("GEOID", "in_NYC_prop"), by = "GEOID") %>% 
       mutate_at(vars(2:11), ~ . * in_NYC_prop) %>% 
       # summarize ZCTA to MODZCTA
       left_join(., modzcta_to_zcta1, by = c("GEOID" = "ZCTA")) %>%
       group_by(MODZCTA) %>%
       summarise_at(vars(2:11),
                    ~ sum(., na.rm = T)) %>% 
       rename(zcta = "MODZCTA") %>%
       mutate_at(vars(starts_with("ed_hlthcare")), ~ round(. / 2), 0) %>% #maintain same proportions as estimated nonquarintined jobs
       mutate_at(vars(starts_with("construct")), ~ round(. / 4), 0) %>%
       mutate(
         essentialworker_drove = rowSums(dplyr::select(., contains("car1_commute"))),
         essentialworker_pubtrans = rowSums(dplyr::select(., contains("pubtrans")))) %>%
       dplyr::select(zcta, essentialworker_drove, essentialworker_pubtrans) %>%
       mutate(zcta = as.character(zcta))
   } 
   else { # tracts
       
     ACS_Essential_worker_estimates <- ACS_EssentialWrkr_Commute %>% #clean data and aggregate 
         dplyr::select(-ends_with("M"), -NAME) %>%
         filter(substr(GEOID,1,5) %in% NYC_boro_county_match$fips) %>% # Tracts in NYC counties
         mutate_at(vars(starts_with("ed_hlthcare")), ~round(./2), 0) %>% #maintain same proportions as estimated nonquarintined jobs
         mutate_at(vars(starts_with("construct")), ~round(./4), 0) %>%
         mutate(essentialworker_drove = rowSums(dplyr::select(., contains("car1_commute"))), 
                essentialworker_pubtrans = rowSums(dplyr::select(., contains("pubtrans")))) %>%
         dplyr::select(GEOID, essentialworker_drove, essentialworker_pubtrans)   
   }
     
   return(ACS_Essential_worker_estimates)
}
# ZCTA CENSUS DATA
options(tigris_use_cache = TRUE)
ACS_Data1 <- as.data.frame(acs.main("zcta", NULL, FALSE)) #download the zcta data
ACS_Data_scaled <- nassau_zcta_weights(ACS_Data1, modzcta_to_zcta2, ny_xwalk)
ACS_Data2 <- clean_acs_data_and_derive_vars(ACS_Data_scaled, "zcta")
ACS_EssentialWrkr_Commute1 = as.data.frame(acs.essential("zcta", zcta_pop = ACS_Data_scaled, state_unit = NULL))

print(paste("The 2018 5-year ACS population range in NYC MODZCTAs is:", paste(range(ACS_Data2$total_pop1), collapse = "-")))
## [1] "The 2018 5-year ACS population range in NYC MODZCTAs is: 3028-112425"
# TRACT CENSUS DATA  
acs_tracts <- acs.main("tract", "NY", TRUE)
acs_tracts2 <- clean_acs_data_and_derive_vars(acs_tracts, "tract")
acs_tracts_commute1 = as.data.frame(acs.essential("tract", state_unit = "NY"))

t_census = Sys.time()

Grocery stores per area

#### Grocery stores per area ####
non_supermarket_strings <- c("DELI|TOBACCO|GAS|CANDY|7 ELEVEN|7-ELEVEN|LIQUOR|ALCOHOL|BAKERY|CHOCOLATE|DUANE READE|WALGREENS|CVS|RITE AID|RAVIOLI|WINERY|WINE|BEER|CAFE|COFFEE")

food_retail_filtered <- food_retail %>% 
  filter(County %in% NYC_boro_county_match$County) %>% 
  filter(str_detect(`Establishment Type`, "J") & str_detect(`Establishment Type`, "A") & str_detect(`Establishment Type`, "C") &
           !str_detect(`Establishment Type`, "H")) %>%
  filter(!str_detect(`Entity Name`, non_supermarket_strings) & !str_detect(`DBA Name`, non_supermarket_strings)) %>%
  filter(`Square Footage`>=4500) %>%
  mutate(zcta = as.character(str_extract(Location, "[:digit:]{5}"))) %>% 
  mutate(Address = case_when(
    # some locations geocode better when address includes city name
    `License Number` %in% c("638599", "712410", "706967", "710078") ~ 
      paste(paste(`Street Number`, `Street Name`), City, State, zcta, sep = ","),
    # but most geocode better without it, see limitation #4 at: http://gis.ny.gov/gisdata/inventories/details.cfm?DSID=1278
    TRUE ~ 
      paste(paste(`Street Number`, `Street Name`), State, zcta, sep = ",") )
  )

# Geocode grocers, using a cached version if available to make analysis reproducible
# The geocoding service may be updated in the future and give different results
cached_grocers = here("data", "grocers_geocode_2020-11-09.csv")
if(file.exists(cached_grocers) & use_repo_data){
  gctable <- read.csv(cached_grocers)
  failed = which(gctable$score == 0)
  message("Loaded cached geocoded grocers: ", nrow(gctable)-length(failed), "/", nrow(gctable), " have coordinates.") # 997/1037
  if(nrow(gctable) != nrow(food_retail_filtered)) warning("Cached geocoded table has different row count than non-geocoded table")
} else {
  # locations are returned in crs=26918, UTM 18N NAD83
  api = "https://gisservices.its.ny.gov/arcgis/rest/services/Locators/Street_and_Address_Composite/GeocodeServer/findAddressCandidates?f=json&maxLocations=1&SingleLine="
  message("Geocoding ", nrow(food_retail_filtered), " grocers with NY State ITS geocoder...")
  t1 = Sys.time()
  res = lapply(food_retail_filtered$Address, function(addr) {
    GET(url = paste0(api, URLencode(addr))) })
  tdiff = Sys.time() - t1
  # extract results
  geocodes = lapply(res, function(page) fromJSON(rawToChar(page$content), flatten = TRUE)$candidates)
  failed = which(sapply(geocodes,class) != "data.frame")
  geocodes[failed] <- lapply(seq_along(failed), function(void){ # seq_along() avoids 1:0 when nothing failed
    data.frame(address = NA_character_, score = 0, location.x = NA_real_, location.y = NA_real_)})
  gctable = bind_rows(geocodes)
  message("Geocoded ", nrow(food_retail_filtered)-length(failed), "/", nrow(food_retail_filtered), 
          " grocers in ", round(as.numeric(tdiff),1), " ", attributes(tdiff)$units)
  write.csv(gctable, paste0(data.root, "/grocers_geocode_", Sys.Date(), ".csv"))
}
## Loaded cached geocoded grocers: 997/1037 have coordinates.
# Count grocers by tract
gctable = filter(gctable, score > 0)
grocerSF = st_as_sf(gctable, coords = c("location.x", "location.y"), crs = 26918) %>% st_transform(crs = 2263)
tractSF = acs_tracts2[, "GEOID"] %>% st_transform(., crs = 2263)
tract_grocers = suppressWarnings(st_intersection(tractSF, grocerSF)) %>%
  st_set_geometry(., NULL) %>%
  group_by(GEOID) %>%
  summarise(grocers = n_distinct(`address`))
#nrow(tract_grocers) # 749
#sum(tract_grocers$grocers) # 993

# Count grocers by ZCTA 
zctaSF <- MODZCTA_NYC_shp %>% dplyr::select(modzcta, geometry) %>% st_transform(crs = 2263) 
zcta_grocers <- suppressWarnings(st_intersection(zctaSF, grocerSF)) %>%
  st_set_geometry(., NULL) %>%
  group_by(modzcta) %>%
  summarise(grocers = n_distinct(`address`))
# sum(zcta_grocers$grocers) # 993
# nrow(zcta_grocers) # 172
# range(zcta_grocers$grocers) # 1, 21

Subway station locations

#### Subway station locations ####
SubwayStation_shp <- as_tibble(turnstile()$stations) %>%
  st_as_sf(., coords = c("lon", "lat"), crs = 4269) %>%
  st_transform(., crs = 2263) %>%
  filter(!str_detect(ca, "PTH")) #removing New Jersey PATH stations

Residential area

#### Residential area ####
Pluto_ResOnly <- Pluto %>%
  filter(landuse>="01" & landuse<="04") %>%
  mutate(base_bbl = as.character(bbl)) %>%
  dplyr::select(-bbl)

ResBBLs <- as.character(Pluto_ResOnly$base_bbl)

# zcta-level residential area
Res_Bldg_Footprints <- Bldg_Footprints %>%  
  st_set_geometry(., NULL) %>%
  mutate(base_bbl = as.character(base_bbl)) %>%
  filter(base_bbl %in% ResBBLs &
           feat_code == "2100") %>%
  mutate(bldg_volume = shape_area * heightroof) %>%
  left_join(., Pluto_ResOnly, by = "base_bbl") %>%
  mutate(bldg_volume = if_else(is.na(bldg_volume), shape_area*numfloors*10, bldg_volume),
         res_volume = (bldg_volume/unitstotal)*unitsres, 
         zcta = as.character(zipcode)) %>%
  group_by(zcta) %>%
  summarise(total_res_volume_zcta = sum(res_volume, na.rm = TRUE)) 

# tract-level residential area 
Res_Bldg_Footprints2 <- Bldg_Footprints %>% 
  st_transform(crs = 2263) %>%
  {suppressWarnings(st_centroid(., of_largest_polygon = TRUE))} %>% # braces keep the pipe from also inserting the data as suppressWarnings()'s first argument
  mutate(base_bbl = as.character(base_bbl)) %>%
  filter(base_bbl %in% ResBBLs &
           feat_code == "2100") %>%
  mutate(bldg_volume = shape_area * heightroof) %>%
  left_join(., Pluto_ResOnly, by = "base_bbl") %>%
  mutate(bldg_volume = if_else(is.na(bldg_volume), shape_area*numfloors*10, bldg_volume),
         res_volume = (bldg_volume/unitstotal)*unitsres)
get.tract.res <- function(res, tracts) suppressWarnings(st_intersection(res, tracts))
res_bldg_tract <- get.tract.res(Res_Bldg_Footprints2, tractSF) # 72 seconds
res_bldg_tract_sum <- st_set_geometry(res_bldg_tract, NULL) %>%
  group_by(GEOID) %>%
  summarise(total_res_volume_tract = sum(res_volume, na.rm = TRUE))
#nrow(res_bldg_tract_sum) # 2132

COVID Tests

#### COVID Tests ####

MODZCTA_NYC_shp1 <- MODZCTA_NYC_shp %>%
  dplyr::select(modzcta, geometry) %>%
  rename("zcta" = "modzcta")

May7_tests <- ZCTA_test_series %>%
  filter(date=="2020-05-07") %>%
  mutate(zcta = as.character(MODZCTA)) %>%
  rename(total_tests = "Total") %>%
  dplyr::select(zcta, date, Positive, total_tests)

ZCTA_by_boro1 <- ZCTA_by_boro %>%
  mutate(boro = as.character(boro),
         zcta = as.character(zip)) %>%
  dplyr::select(-zip) %>%
  bind_rows(., 
            tibble(boro = as.character(c("Manhattan", "Manhattan", "Queens")), # correcting NAs
                     zcta = as.character(c("10069", "10282", "11109"))))

# get water mask for maps
source(here("code/water_mask.R"))
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
# Figure 3B - Map of Tests by MODZCTA

theme_smallmaps <- theme(legend.title = element_text(face = "bold", size = 9), 
                         plot.title = element_text(size = 9.5),
                         panel.background = element_rect(fill = "#cccccc"), 
                         panel.grid = element_blank(),
                         legend.background = element_rect(fill = "transparent"),
                         legend.position = c(0.22, 0.80),
                         legend.text = element_text(size = 8.5),
                         plot.margin = unit(c(4,0,4,0), "pt"),
                         legend.key.size = unit(1.1, "lines"),
                         axis.text = element_blank(),
                         axis.ticks = element_blank(),
                         axis.ticks.length = unit(0, "pt"),
                         axis.title = element_blank())

fig3b <- MODZCTA_NYC_shp1 %>%
  left_join(., May7_tests, by = "zcta") %>%
  left_join(., ACS_Data2, by = "zcta") %>%
  filter(zcta != "99999") %>%
  mutate(pos_per_100000 = (Positive/total_pop1)*100000) %>%
  ggplot() +
  geom_sf(data = basemap_water, fill = "white", lwd = 0) + 
  geom_sf(aes(fill = pos_per_100000), lwd = 0.2)+
  scalebar(MODZCTA_NYC_shp1, dist = 5, dist_unit = "km", 
           transform = TRUE, model = "WGS84", 
           st.size = 2.8, height = 0.015, border.size = 0.5, 
           anchor = c(x = -73.71, y = 40.51)) + 
  labs(fill = "Positives per 100,000") +
  ggtitle("Cumulative positive COVID tests by zip code (May 7, 2020)") +
  scale_fill_gradientn(colours=brewer_pal("YlGnBu", type = "seq")(7)) + 
  coord_sf(crs = st_crs(MODZCTA_NYC_shp1),
           xlim = c(plot_bounds$xmin, plot_bounds$xmax), 
           ylim = c(plot_bounds$ymin, plot_bounds$ymax),
           expand = FALSE) +
  theme_bw(base_size = 6) + 
  theme_smallmaps

write_csv(st_drop_geometry(MODZCTA_NYC_shp1 %>%
                             left_join(., May7_tests, by = "zcta") %>%
                             left_join(., ACS_Data2, by = "zcta") %>%
                             filter(zcta != "99999") %>%
                             mutate(pos_per_100000 = (Positive/total_pop1)*100000)) %>%
            dplyr::select(zcta, pos_per_100000),
          file.path(source_path, "fig3b.csv"))

Create data frames of all above information

#### Create data frames of all above information ####

ZCTA_ACS_COVID_shp <- MODZCTA_NYC_shp1 %>%
  st_transform(., crs = 2263) %>%
  dplyr::select(zcta, geometry) %>%
  left_join(., ACS_Data2, by = "zcta") %>%
  left_join(., May7_tests, by = "zcta") %>%
  left_join(., Res_Bldg_Footprints, by = "zcta") %>%
  left_join(., ACS_EssentialWrkr_Commute1, by = "zcta") %>%
  left_join(., zcta_grocers, by = c("zcta" = "modzcta")) %>%
  mutate(pop_density = as.numeric(total_pop1/st_area(geometry)),
         avg_hhold_size = round((total_pop1/total_hholds1), 2),
         pos_per_100000 = (Positive/total_pop1)*100000,
         testing_ratio = (total_tests/total_pop1),
         res_vol_zctadensity = as.numeric(total_res_volume_zcta/st_area(geometry)), 
         res_vol_popdensity = as.numeric(total_pop1/total_res_volume_zcta),
         pubtrans_ferrysubway_commute = pubtrans_subway_commute + pubtrans_ferry_commute,
         grocers = replace_na(grocers, 0),
         grocers_per_1000 = (grocers/total_pop1)*1000,
         pos_per_100000 = round(pos_per_100000, 0),
         valid_var = "0",
         one_over_grocers_per_1000 = if_else(is.infinite(1/grocers_per_1000), (1/.0293)+1, 1/grocers_per_1000), #max + 1 for zero values
         didnot_workhome_commute = 100 - workhome_commute,
         one_over_medincome = 1/medincome) %>%
  dplyr::select(-pubtrans_subway_commute, -pubtrans_ferry_commute) %>%
  left_join(., ZCTA_by_boro1, by = "zcta") %>%
  mutate_at(vars(starts_with("essentialworker_")), ~round((./over16total_industry1)*100, 2)) %>%
  filter(zcta != "99999") #remove na

ZCTA_ACS_COVID <- ZCTA_ACS_COVID_shp %>%
  st_set_geometry(., NULL) #remove geometry

tract_vars <- tractSF %>% # uses local CRS
  left_join(., st_set_geometry(acs_tracts2, NULL), by = "GEOID") %>%
  left_join(., acs_tracts_commute1, by = "GEOID") %>%
  left_join(., res_bldg_tract_sum, by = "GEOID") %>%
  left_join(., tract_grocers, by = "GEOID") %>%
  mutate(pop_density = as.numeric(total_pop1/st_area(geometry)),
         avg_hhold_size = round((total_pop1/total_hholds1), 2),
         res_vol_tractdensity = as.numeric(total_res_volume_tract/st_area(geometry)), 
         res_vol_popdensity = as.numeric(total_pop1/total_res_volume_tract),
         pubtrans_ferrysubway_commute = pubtrans_subway_commute + pubtrans_ferry_commute,
         grocers = replace_na(grocers, 0),
         grocers_per_1000 = (grocers/total_pop1)*1000,
         valid_var = "0",
         didnot_workhome_commute = 100 - workhome_commute,
         one_over_grocers_per_1000 = if_else(is.infinite(1/grocers_per_1000), (1/.0293)+1, 1/grocers_per_1000),
         one_over_medincome = 1/medincome) %>% 
  dplyr::select(-pubtrans_subway_commute, -pubtrans_ferry_commute) %>%
  mutate_at(vars(starts_with("essentialworker_")), ~round((./over16total_industry1)*100, 2))

Tract to ZCTA weighted assignment

#### Tract to ZCTA weighted assignment ####
# Shares of residential addresses in ZCTAs by Tract are later used to select
# representative tract-level SES values at a specified location on the ECDF
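A small sketch of the weighted selection this enables, with toy values (matrixStats is loaded above): a weighted median picks the tract value sitting at the weighted 50th percentile of the ECDF.

#weightedMedian(c(10, 20, 30), w = c(0.1, 0.2, 0.7)) # pulled toward the heavily weighted tract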

modzcta_to_zcta_chr <- data.frame(ZCTA = as.character(modzcta_to_zcta$ZCTA), 
                                  MODZCTA = as.character(modzcta_to_zcta$MODZCTA))

# Calculate the proportion of population each combined ZCTA contributes to MODZCTA
modzcta_to_zcta_pop <- modzcta_to_zcta_chr %>%
  left_join(ACS_Data_scaled[, c("GEOID", "total_pop1")], by = c("ZCTA" = "GEOID")) %>% 
  group_by(MODZCTA) %>%
  mutate(sum_pop = sum(total_pop1)) %>%
  ungroup() %>%
  mutate(pop_prop = total_pop1/sum_pop) %>% 
  arrange(MODZCTA)

# Sum ZCTA->Tract RES_RATIO weights when multiple ZCTA combine to a single MODZCTA
modzcta_to_tract <- modzcta_to_zcta_pop %>%
  filter(MODZCTA != "99999") %>% 
  # Weights of each ZCTA are scaled by their proportion of MODZCTA population
  left_join(zip_to_tract, by = c("ZCTA" = "ZIP")) %>%
  mutate(scaled_res_ratio = RES_RATIO * pop_prop) %>% 
  # note: MODZCTA 10001 & 10007 contain a ZCTA with a small share of population
  # but a zero share of RES_RATIO. This makes their sum of scaled_res_ratio by
  # MODZCTA less than 1, but this will not affect the weighted median or 3Q values
  # by tract.
  group_by(MODZCTA, TRACT) %>%
  dplyr::summarize(SUM_RES_RATIO = sum(scaled_res_ratio), .groups = "drop") %>%
  # Weights are then multiplied by each tract's persons-per-household ratio (tract_popdens)
  left_join(tract_vars, by = c("TRACT" = "GEOID")) %>%
  dplyr::select(MODZCTA, TRACT, SUM_RES_RATIO, total_pop1, total_hholds1) %>%
  filter(total_hholds1 > 0) %>% 
  mutate(tract_popdens = total_pop1/total_hholds1) %>% 
  mutate(W_RES_RATIO = SUM_RES_RATIO * tract_popdens) %>%
  dplyr::select(MODZCTA, TRACT, W_RES_RATIO)
modzcta_to_tract
## # A tibble: 3,020 x 3
##    MODZCTA TRACT       W_RES_RATIO
##    <chr>   <chr>             <dbl>
##  1 10001   36061005600     0.0307 
##  2 10001   36061005800     0.0648 
##  3 10001   36061007100     0      
##  4 10001   36061007600     0.250  
##  5 10001   36061008400     0.00422
##  6 10001   36061009100     0.154  
##  7 10001   36061009300     0.145  
##  8 10001   36061009500     0.231  
##  9 10001   36061009700     0.250  
## 10 10001   36061009900     0.253  
## # … with 3,010 more rows
# length(unique(modzcta_to_tract$MODZCTA)) #  177
# length(unique(modzcta_to_tract$TRACT))   # 2111
# length(unique(tractSF$GEOID))            # 2167

# check res_ratio against tracts with no population
tract_modzcta_pop <- modzcta_to_tract %>%
  left_join(acs_tracts2, by = c("TRACT" = "GEOID")) %>%
  dplyr::select(MODZCTA, TRACT, total_pop1, W_RES_RATIO)

# checking for tracts with no population but res_ratio > 0
# (not possible now that the weighted ratio uses population)
#tract_modzcta_pop %>% filter(total_pop1 == 0 & W_RES_RATIO > 0) %>% nrow() # 0

# check to make sure no MODZCTA have all zeroes for all tract res_ratios
all_zero_ratio <- tract_modzcta_pop %>% group_by(MODZCTA) %>% filter(all(W_RES_RATIO == 0))
if(nrow(all_zero_ratio) > 0){
  warning("The following MODZCTA have all zeroes for weighted tract residents:\n ", 
          paste(all_zero_ratio$MODZCTA, collapse = ","))
} 
modzcta_to_tract2 <- dplyr::select(tract_modzcta_pop, -total_pop1)

### Preparation done --- Now for the Analysis ###

Part 1: Creation of BWQS Neighborhood Infection Risk Scores

#### Part 1: Creation of BWQS Neighborhood Infection Risk Scores ####
# Step 1: Create univariate scatterplots with negative binomial linear smoothers
# to make sure the directions of association are consistent for all variables
# Visualize the non-linear relationship between infection rate and test_ratio (not adjusted for social inequity)
# infections versus testing ratio with smoothers; this covariate is not quantiled so the shape of association is important
ggplot(ZCTA_ACS_COVID, aes(x = testing_ratio, y = pos_per_100000)) + geom_point() +
  ylab("infections per 100,000") +
  geom_smooth(color = "red", formula = y ~ x, method = "glm.nb") +
  stat_smooth(color = "green", method = "gam", formula = y ~ s(x), method.args = list(family = "nb"), se = F) +
  geom_smooth(method = "glm.nb", formula = y ~ splines::ns(x, 3), se = FALSE) + theme_bw()

# 10 social variables
plotsocial <- ggplot(ZCTA_ACS_COVID, aes(y = pos_per_100000)) + 
  geom_point() + geom_smooth(color = "red", formula = y ~ x, method = "glm.nb") + ylab("infections per 100,000") + theme_bw()
plotsocial %+% aes(x = one_over_medincome)

plotsocial %+% aes(x = not_insured)

plotsocial %+% aes(x = one_over_grocers_per_1000)

plotsocial %+% aes(x = unemployed)

plotsocial %+% aes(x = res_vol_popdensity)

plotsocial %+% aes(x = didnot_workhome_commute)

plotsocial %+% aes(x = not_quarantined_jobs)

plotsocial %+% aes(x = avg_hhold_size)

plotsocial %+% aes(x = essentialworker_drove)

plotsocial %+% aes(x = essentialworker_pubtrans)

SES_vars <- names(ZCTA_ACS_COVID %>% dplyr::select(one_over_medincome, not_insured, one_over_grocers_per_1000, unemployed, 
                                                    not_quarantined_jobs, essentialworker_pubtrans, essentialworker_drove, 
                                                    didnot_workhome_commute, res_vol_popdensity, avg_hhold_size))
# Step 2a: Examine relationships between explanatory variables to confirm no pairwise correlation exceeds 0.9, as high collinearity could bias BWQS
Cors_SESVars <- cor(x = ZCTA_ACS_COVID %>% dplyr::select(all_of(SES_vars)), method = "kendall")
Cors_SESVars1 <- as.data.frame(Cors_SESVars)
Cors_SESVars1$var1 <- row.names(Cors_SESVars1)
Cors_SESVars2 <- gather(data = Cors_SESVars1, key = "var2", value = "correlation", -var1) %>%
  filter(var1 != var2) 

Cors_SESVars2 %>% arrange(desc(correlation)) %>% distinct(correlation, .keep_all = T) 
##                        var1                     var2 correlation
## 1     essentialworker_drove     not_quarantined_jobs   0.6121033
## 2  essentialworker_pubtrans       one_over_medincome   0.5432472
## 3                unemployed       one_over_medincome   0.5341512
## 4               not_insured       one_over_medincome   0.5270258
## 5  essentialworker_pubtrans               unemployed   0.5243542
## 6   didnot_workhome_commute    essentialworker_drove   0.5086620
## 7            avg_hhold_size     not_quarantined_jobs   0.4787113
## 8   didnot_workhome_commute     not_quarantined_jobs   0.4763680
## 9            avg_hhold_size  didnot_workhome_commute   0.4598744
## 10       res_vol_popdensity       one_over_medincome   0.4581407
## 11       res_vol_popdensity essentialworker_pubtrans   0.4544725
## 12       res_vol_popdensity              not_insured   0.4382568
## 13           avg_hhold_size    essentialworker_drove   0.4198843
##  [ reached 'max' / getOption("max.print") -- omitted 32 rows ]
if(export.figs) {
  fig_path = file.path(fig.path, paste0("sfig1_", Sys.Date()))
  if(vector.figs){
    svg(paste0(fig_path, ".svg"), pointsize = 9, width = 7, height = 6)
  } else {
    png(filename = paste0(fig_path, ".png"), width = 300*7, height = 300*6, pointsize = 30)
  }
  corrplot(Cors_SESVars, p.mat = Cors_SESVars, insig = "p-value", type = "lower", 
         sig.level = -1, tl.col = "black", tl.srt = 45)
  dev.off()
  if(vector.figs){
    system(paste0("rsvg-convert -f pdf -o ", fig_path, ".pdf", " ", fig_path, ".svg"))
    unlink(paste0(fig_path, ".svg"))
  }
}

Supp. Fig. 1

# Step 2b: Examine univariable Kendall correlations of all selected variables with the outcome
bind_cols(Variables = SES_vars,
          
        ZCTA_ACS_COVID %>%
            dplyr::select(all_of(SES_vars), pos_per_100000) %>%
            summarise_at(vars(all_of(SES_vars)), list(~cor(., pos_per_100000, method = "kendall"))) %>%
            t() %>%
            as_tibble(),
        
        ZCTA_ACS_COVID %>%
          dplyr::select(all_of(SES_vars), pos_per_100000) %>%
          summarise_at(vars(all_of(SES_vars)), 
                       list(~cor.test(., pos_per_100000, method = "kendall")$p.value)) %>%
          t() %>%
          as_tibble()) %>%
  
  mutate(`Correlation (Tau)` = round(V1...2, 3),
         `p value` = as.character(ifelse(V1...3 < 0.0001, "< 0.0001", round(V1...3, 3)))) %>%
  dplyr::select(-V1...2, -V1...3) 
## Warning: The `x` argument of `as_tibble.matrix()` must have column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## New names:
## * V1 -> V1...2
## * V1 -> V1...3
## # A tibble: 10 x 3
##    Variables                 `Correlation (Tau)` `p value`
##    <chr>                                   <dbl> <chr>    
##  1 one_over_medincome                      0.322 < 0.0001 
##  2 not_insured                             0.236 < 0.0001 
##  3 one_over_grocers_per_1000               0.268 < 0.0001 
##  4 unemployed                              0.294 < 0.0001 
##  5 not_quarantined_jobs                    0.542 < 0.0001 
##  6 essentialworker_pubtrans                0.175 0.001    
##  7 essentialworker_drove                   0.483 < 0.0001 
##  8 didnot_workhome_commute                 0.453 < 0.0001 
##  9 res_vol_popdensity                      0.108 0.033    
## 10 avg_hhold_size                          0.463 < 0.0001
# Step 3: Prepare data for BWQS and pass to stan for model fitting 
pm(fit_BWQS_model <- function(data_list, stan_model_path){
  fitted_model <- stan(
    file = stan_model_path,
    data = data_list,
    chains = 1,
    warmup = 2500,
    iter = 20000,
    thin = 10,
    refresh = 0,
    algorithm = "NUTS",
    seed = 1234,
    control = list(max_treedepth = 20,
                   adapt_delta = 0.999999999999999))
  return(fitted_model)
})

Compute_Bayes_R2 <- function(fit) {
  # Bayesian R^2 (cf. Gelman et al. 2019): posterior fit variance divided by
  # fit variance plus residual variance, where the negative binomial residual
  # variance is mu + mu^2/phi.
  y_pred = exp(extract(fit,"eta")$eta)
  var_fit = apply(y_pred, 1, var)
  mean_fit = apply(y_pred, 1, mean)
  phi = extract(fit,"phi")$phi
  var_res = mean_fit  + (mean_fit^2)/phi
  r2 = var_fit / (var_fit + var_res)
  return(list(meanr2 = mean(r2),
              medianr2 = median(r2),
              lowerr2 = quantile(r2, .025),
              upperr2 = quantile(r2, .975)))
}

prep_BWQS_data <- function(df, ses_varnames){
  y = as.numeric(df$pos_per_100000)
  X <- df %>%
    dplyr::select(all_of(ses_varnames))
  K <- ns(df$testing_ratio, df = 3)
  for (vname in ses_varnames)
    X[[vname]] <- ecdf(X[[vname]])(X[[vname]]) * 10
  data <-as.data.frame(cbind(y,X)) # Aggregate data in a data.frame
  
  data_list = list(N      = NROW(data),
                   N_new  = NROW(data),
                   C      = NCOL(X),
                   K      = NCOL(K),
                   XC     = cbind(as.matrix(X)),
                   XK     = cbind(K),
                   XC_new = cbind(as.matrix(X)),
                   XK_new = cbind(K),
                   Dalp   = rep(1,length(ses_varnames)),
                   y      = as.vector(data$y))
  return(list(data_list = data_list, X = X, K = K))
}
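
The ecdf() step above is what puts every variable on a common 0-10 quantile scale before weighting; a toy illustration with made-up values:

#x <- c(5, 1, 9, 3)
#ecdf(x)(x) * 10 # 7.5 2.5 10.0 5.0 -- each value's empirical quantile, scaled to 0-10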

m1data <- prep_BWQS_data(ZCTA_ACS_COVID, SES_vars)

t_dataprep = Sys.time()

# fit our primary model -- negative binomial
m1 <- fit_BWQS_model(m1data$data_list, BWQS_stan_model)

t_m1 = Sys.time()

BWQS model diagnostics

#### BWQS model diagnostics ####

# print m1 for model diagnostics (n_eff as % of 1750, Rhat, trace plots, acf)
# m1
min(summary(m1)$summary[,"n_eff"]/1750) # effective sample size is at worst ~69% of the 1750 post-warmup draws ((20000-2500)/10 after thinning)
## [1] 0.6940459
traceplot(m1, pars = c("beta1", "W"))

stan_ac(m1, pars = c("beta1", "W"))
## Warning: Ignoring unknown parameters: fun.y
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`

extract_waic(m1)
## $waic
##     waic 
## 2370.164 
## 
## $elpd_waic
## elpd_waic 
## -1185.082 
## 
## $p_waic
##   p_waic 
## 15.05826 
## 
## $elpd_loo
##  elpd_loo 
## -1185.086 
## 
## $p_loo
##    p_loo 
## 15.06246
round(Compute_Bayes_R2(m1)$meanr2, 2)
## [1] 0.93
round(Compute_Bayes_R2(m1)$lowerr2, 2)
## 2.5% 
## 0.92
round(Compute_Bayes_R2(m1)$upperr2, 2)
## 97.5% 
##  0.95
# residual analysis with DHARMa
residuals_qq <- createDHARMa(simulatedResponse = t(extract(m1,"y_new")$y_new), observedResponse = m1data$data_list$y)
## No fitted predicted response provided, using the mean of the simulations
#plotQQunif(residuals_qq, testOutliers = F, testDispersion = F) #base graphics
ks_unif <- testUniformity(residuals_qq)
residuals_qq_unif <- gap::qqunif(residuals_qq$scaledResiduals,pch=2,bty="n", logscale = F, col = "black", cex = 0.6, main = "QQ plot residuals", cex.main = 1)
sfig3_qq <- ggplot(data.frame(x = residuals_qq_unif$x, y = residuals_qq_unif$y)) +
  geom_abline(linetype = "dashed", color = "blue") +
  scale_x_continuous(limits = c(0, 1), expand = c(0, 0)) +
  scale_y_continuous(limits = c(0, 1), expand = c(0, 0)) +
  geom_point(aes(x, y), size = 0.8) +
  annotate("text", x = 0.02, y = 0.9, hjust = 0, size = 3.2,
           label = paste0("Kolmogorov-Smirnov test for\ncomparison of distributions: p=",
                          round(ks_unif$p.value, 2))) +
  coord_equal(ratio = 1) +
  labs(x = "Expected", y = "Observed") +
  theme_bw() + theme(plot.margin = unit(c(5.5, 11, 5.5, 5.5), "points"))
if(export.figs){
  write_figs(sfig3_qq, "sfig3", fig_width = 3.5, fig_height = 3.4)
}
## Wrote /home/rushj03/dev/COVID_19_admin_disparities/figures/sfig3_2021-06-17.png

Supp. Fig. 3

write_csv(as_tibble(residuals_qq_unif), path = file.path(source_path, "sfig3.csv"))
# examine parameter estimates
exp(mean(extract(m1, "beta1")$beta1))
## [1] 1.077078
vars = c("phi", "beta0", "beta1", paste0("delta", 1:3), SES_vars)
parameters_to_drop <- c("phi", paste0("delta", 1:3), "beta0", "beta1")
number_of_coefficients <- length(vars)

BWQS_params <- bind_cols(as_tibble(summary(m1)$summary[1:number_of_coefficients,c(1,4,6,8:10)]), label = vars)

BWQS_params %>% filter(label == "beta1") %>% mutate_at(vars(1:4), ~exp(.)) # exponentiate the mean, both CrI bounds, and the median
## # A tibble: 1 x 7
##    mean `2.5%` `50%` `97.5%` n_eff  Rhat label
##   <dbl>  <dbl> <dbl>   <dbl> <dbl> <dbl> <chr>
## 1  1.08   1.07  1.08    1.09 1706.  1.00 beta1
BWQS_fits <- BWQS_params %>%
  rename(lower = "2.5%",
         upper = "97.5%") %>%
  filter(!label %in% parameters_to_drop) %>%
  arrange(desc(mean)) %>%
  mutate(group = factor(if_else(label == "one_over_medincome"|label =="not_insured"|label == "unemployed", "Finances &\nAccess to care",
                         if_else(label == "one_over_grocers_per_1000", "Food\nAccess",
                                 if_else(str_detect(label, "essential")|label == "not_quarantined_jobs"|label=="didnot_workhome_commute", "Commuting and\nEssential Work",
                                         if_else(label == "avg_hhold_size"|label == "res_vol_popdensity", "Population\nDensity", "Unmatched")))),
                        levels = c("Commuting and\nEssential Work", "Finances &\nAccess to care", "Population\nDensity", "Food\nAccess")))

labels1 <- c("one_over_medincome" = "1/\nMedian income", 
             "not_insured" = "Uninsured", 
             "unemployed" = "Unemployed", 
             "one_over_grocers_per_1000" = "1/\nGrocers per 1000",
             "not_quarantined_jobs" = "Essential Workers", 
             "essentialworker_pubtrans" = "Essential Worker:\nPublic Transit", 
             "essentialworker_drove" = "Essential Worker:\nDriving Commute", 
             "didnot_workhome_commute" = "1/\nWork from home", 
             "res_vol_popdensity" = "Population Density\nby Residential Volume", 
             "avg_hhold_size" = "Average people\nper household")

labels2 <- c("phi" = "Overdispersion", 
             "beta0" = "Intercept", 
             "beta1" = "BWQS term",
             "delta1" = "Testing ratio: spline term 1",
             "delta2" = "Testing ratio: spline term 2",
             "delta3" = "Testing ratio: spline term 3",
             labels1)

Supplemental Table 2

# Supplemental Table 2: Parameter estimates, credible intervals, and diagnostics from main BWQS infections model
BWQS_params %>% bind_cols(., "terms" = labels2) %>%
  dplyr::select(-label) %>%
  mutate_at(vars(1:6), ~round(., 3)) %>%
  mutate_at(vars(5), ~round(., 0)) %>% 
  mutate(`95% CrI`=paste0("(",format(`2.5%`, nsmall=3),", ",format(`97.5%`, nsmall=3),")")) %>% 
  mutate(terms = 
           case_when(terms=="BWQS term" ~ "COVID-19 inequity index",
                     TRUE               ~ terms)) %>%
  dplyr::select(-`2.5%`, -`97.5%`) %>%
  rename(median=`50%`) %>%
  dplyr::select(terms, mean, `95% CrI`, median, Rhat, n_eff) %>%
  kbl(align=rep('r', 6), font_size = 9.5) %>%
  kable_classic(full_width = F, html_font = "Arial")
| terms | mean | 95% CrI | median | Rhat | n_eff |
|---|---:|---|---:|---:|---:|
| Overdispersion | 104.050 | (81.448, 130.015) | 103.552 | 0.999 | 1636 |
| Intercept | 6.261 | (6.173, 6.352) | 6.261 | 1.001 | 1643 |
| COVID-19 inequity index | 0.074 | (0.063, 0.086) | 0.074 | 1.000 | 1706 |
| Testing ratio: spline term 1 | 0.984 | (0.900, 1.072) | 0.986 | 1.002 | 1905 |
| Testing ratio: spline term 2 | 2.029 | (1.809, 2.241) | 2.029 | 1.001 | 1661 |
| Testing ratio: spline term 3 | 1.123 | (1.015, 1.233) | 1.125 | 1.000 | 1792 |
| 1/ Median income | 0.098 | (0.009, 0.219) | 0.093 | 0.999 | 1590 |
| Uninsured | 0.181 | (0.067, 0.306) | 0.179 | 0.999 | 1496 |
| Unemployed | 0.026 | (0.001, 0.077) | 0.021 | 1.000 | 1509 |
| 1/ Grocers per 1000 | 0.053 | (0.002, 0.138) | 0.046 | 1.000 | 1766 |
| Essential Workers | 0.062 | (0.002, 0.181) | 0.050 | 1.000 | 1767 |
| Essential Worker: Public Transit | 0.050 | (0.002, 0.140) | 0.043 | 0.999 | 1534 |
| Essential Worker: Driving Commute | 0.167 | (0.033, 0.287) | 0.171 | 0.999 | 1690 |
| 1/ Work from home | 0.088 | (0.010, 0.196) | 0.083 | 1.002 | 1706 |
| Population Density by Residential Volume | 0.115 | (0.018, 0.218) | 0.113 | 1.000 | 1702 |
| Average people per household | 0.159 | (0.038, 0.293) | 0.159 | 1.000 | 1453 |
write_csv(BWQS_fits, file.path(source_path, "fig2.csv"))
# create a figure with parameter estimates for the weights
fig2 <- ggplot(data=BWQS_fits, aes(x= reorder(label, mean), y=mean, ymin=lower, ymax=upper)) +
  geom_pointrange() + 
  coord_flip() + 
  xlab("") + 
  ylab("Variable Weight: Mean (95% credible intervals)") +
  scale_x_discrete(labels = labels1) + 
  theme_bw(base_size = 18) +
  facet_grid(group~., scales = "free", space = "free") +
  theme(strip.text.x = element_text(size = 14))
if(export.figs){ 
  write_figs(fig2, "fig2", fig_width = 8, fig_height = 8)
}  
## Wrote /home/rushj03/dev/COVID_19_admin_disparities/figures/fig2_2021-06-17.png

Fig. 2

# Step 4: Construct the summative COVID-19 inequity index value for each ZCTA
# Use the variable-specific weight on the quantiled splits to create a 10 point ZCTA-level infection risk score  
BWQS_weights <- as.numeric(summary(m1)$summary[(length(vars)-length(SES_vars) + 1):number_of_coefficients,c(1)])

ZCTA_ACS_COVID2 <- m1data$X * BWQS_weights[col(ZCTA_ACS_COVID)] 

BWQS_index <- ZCTA_ACS_COVID2 %>% 
  dplyr::mutate(BWQS_index = rowSums(.)) %>% 
  dplyr::select(BWQS_index) 
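
# The index is just the weighted row sum of the quantile scores,
# BWQS_index_i = sum_j W_j * q_ij. A minimal sketch of the same construction
# as a matrix product (assuming m1data$X holds the ZCTA-by-variable quantile
# scores used in the model, as above):
index_check <- as.numeric(as.matrix(m1data$X) %*% BWQS_weights)
# all.equal(index_check, BWQS_index$BWQS_index)  # the two constructions should agree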

# Visualize the relationship between BWQS and test_ratio
ggplot(data.frame(BWQS_index = BWQS_index$BWQS_index, testing_ratio = ZCTA_ACS_COVID$testing_ratio), aes(testing_ratio, BWQS_index)) + geom_point() + 
  geom_smooth() + geom_smooth(color = "red", method = "lm")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

# BWQS is correlated with testing_ratio
cor.test(BWQS_index$BWQS_index, ZCTA_ACS_COVID$testing_ratio, method = "spearman")  
## 
##  Spearman's rank correlation rho
## 
## data:  BWQS_index$BWQS_index and ZCTA_ACS_COVID$testing_ratio
## S = 473508, p-value = 2.703e-12
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##      rho 
## 0.487643
# Partial regression plot for BWQS index
# shows a nice linear relationship of BWQS index with infections after adjustment for testing
nb_testing_ns3df <- glm.nb(m1data$data_list$y ~ m1data$K)
yresid <- resid(nb_testing_ns3df)
bwqs_testing_ns3df <- lm(BWQS_index$BWQS_index ~ m1data$K)
bwqsresid <- resid(bwqs_testing_ns3df)
ggplot(data.frame(yresid, bwqsresid), aes(bwqsresid, yresid)) + geom_point() + 
  geom_smooth(formula = y ~ x, method = "lm", se = F) + 
  ylab("residual log infection rate\n(adjusted for testing)") + xlab("residual BWQS infection risk index\n(adjusted for testing)")

summary(lm(yresid ~ bwqsresid))
## 
## Call:
## lm(formula = yresid ~ bwqsresid)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.63000 -0.35184  0.01091  0.32491  1.93895 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.05117    0.04792  -1.068    0.287    
## bwqsresid    0.48676    0.02997  16.242   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6376 on 175 degrees of freedom
## Multiple R-squared:  0.6012, Adjusted R-squared:  0.5989 
## F-statistic: 263.8 on 1 and 175 DF,  p-value: < 2.2e-16
rm(nb_testing_ns3df, yresid, bwqs_testing_ns3df, bwqsresid)

# Visualize the full model observed vs predicted -- shows a close relationship (which is why R2 is so high)
preddf <- data.frame(pred = colMeans(extract(m1, "y_new")$y_new), y = m1data$data_list$y)
ggplot(preddf, aes(pred, y)) + 
  geom_point() +  
  geom_abline(linetype = "dashed", color = "grey10") + 
  coord_fixed(ratio = 1) + 
  scale_x_continuous("Predicted Infections per 100,000", label=comma, limits = range(preddf)) + 
  scale_y_continuous("Infections per 100,000", label=comma) + 
  theme_bw()

rm(preddf)

# calculate credible interval over the mean predicted infections
# at the median testing_ratio
sim_out <- data.frame(extract(m1, pars = c("beta0", "beta1", "delta")))
# m1data$K is the 3-df spline basis for testing_ratio; predict() evaluates the basis at the median
median_testing <- predict(m1data$K, median(ZCTA_ACS_COVID$testing_ratio))
# calculate the spline term for the median testing_ratio
sim_out$deltamedian <- with(sim_out, median_testing[1] * delta.1 + 
                              median_testing[2] * delta.2 + 
                              median_testing[3] * delta.3)
# make a sequence of BWQS values
xseqlength <- 300
bwqs_seq <- seq(from = min(BWQS_index$BWQS_index), 
                to = max(BWQS_index$BWQS_index), 
                length.out = xseqlength)
# sim_matrix: posterior draws (rows) x grid points (columns), on the log scale
sim_matrix <- sim_out$beta0 %o% rep(1, xseqlength) + 
  sim_out$beta1 %o% bwqs_seq + 
  sim_out$deltamedian %o% rep(1, xseqlength)
sim_bwqs_df <- data.frame(bwqs_seq, 
                     lower = exp(colQuantiles(sim_matrix, probs = 0.025)),
                     upper = exp(colQuantiles(sim_matrix, probs = 0.975)),
                     mean = exp(colMeans(sim_matrix)))
rm(median_testing, xseqlength, bwqs_seq, sim_matrix)

# calculate credible interval over the mean predicted infections
# at the median COVID-19 inequity index
sim_out$beta1median <- sim_out$beta1 * median(BWQS_index$BWQS_index)
# make a sequence of testing_ratio values
xseqlength <- 300
testing_seq <- data.frame(x = seq(from = min(ZCTA_ACS_COVID$testing_ratio),
                                  to = max(ZCTA_ACS_COVID$testing_ratio),
                                  length.out = xseqlength))
# evaluate the three spline basis functions at each grid point (columns X1-X3)
testing_seq <- data.frame(x = testing_seq$x, t(sapply(testing_seq$x, FUN = function(x) predict(m1data$K, x))))
sim_matrix <- 
  sim_out$beta0 %o% rep(1, xseqlength) + 
  sim_out$beta1median %o% rep(1, xseqlength) + 
  sim_out$delta.1 %o% testing_seq$X1 + 
  sim_out$delta.2 %o% testing_seq$X2 + 
  sim_out$delta.3 %o% testing_seq$X3
sim_testing_df <- data.frame(x = testing_seq$x, 
                     lower = exp(colQuantiles(sim_matrix, probs = 0.025)),
                     upper = exp(colQuantiles(sim_matrix, probs = 0.975)),
                     mean = exp(colMeans(sim_matrix)))
rm(sim_out, xseqlength, testing_seq, sim_matrix)

# Visualize the relationship between BWQS index and infection rate at the median testing_ratio
write_csv(sim_bwqs_df, file.path(source_path, "fig1_a.csv"))
write_csv(bind_cols(BWQS_index, y = m1data$data_list$y), file.path(source_path, "fig1_b.csv"))

BWQS_scatter <- ggplot(sim_bwqs_df, aes(x = bwqs_seq)) + 
  geom_ribbon(aes(ymin = lower, ymax = upper), fill = 'grey75') + 
  geom_line(aes(y = mean)) + 
  geom_point(data = data.frame(BWQS_index, y = m1data$data_list$y), aes(BWQS_index, y), alpha = 0.5) + 
  scale_x_continuous("COVID-19 inequity index") + 
  scale_y_continuous("Infections per 100,000", label=comma) + 
  theme_bw(base_size = 18)
BWQS_scatter <- ggExtra::ggMarginal(BWQS_scatter, type = "histogram", fill = "grey40", margins = "both", 
                                    xparams = list(binwidth = 0.5), yparams = list(binwidth = 200))
if(export.figs) {
  write_figs(BWQS_scatter, "fig1", fig_width = 96*5, fig_height = 96*5)
}
## Wrote /home/rushj03/dev/COVID_19_admin_disparities/figures/fig1_2021-06-17.png

Fig. 1

# Visualize the relationship between testing_ratio and infection rate at the median BWQS
write_csv(sim_testing_df, file.path(source_path, "sfig2.csv"))
testing_scatter <- ggplot(sim_testing_df, aes(x = x)) + 
  geom_ribbon(aes(ymin = lower, ymax = upper), fill = 'grey75') + 
  geom_line(aes(y = mean)) + 
  geom_point(data = data.frame(testing_ratio = ZCTA_ACS_COVID$testing_ratio, y = m1data$data_list$y), 
             aes(testing_ratio, y), alpha = 0.5) + 
  scale_x_continuous("Testing ratio") + 
  scale_y_continuous("Infections per 100,000", label=comma) + 
  theme_bw(base_size = 18)
testing_scatter <- ggExtra::ggMarginal(testing_scatter, type = "histogram", fill = "grey40", margins = "both",
                                       xparams = list(binwidth = 0.005), yparams = list(binwidth = 200))
if(export.figs) {
  write_figs(testing_scatter, "sfig2", fig_width = 96*5, fig_height = 96*5)
}
## Wrote /home/rushj03/dev/COVID_19_admin_disparities/figures/sfig2_2021-06-17.png

Supp. Fig. 2

# Step 5: Visualize the spatial distribution of ZCTA-level infection risk scores 

ZCTA_BWQS_COVID_shp <- ZCTA_ACS_COVID_shp %>%
  bind_cols(., BWQS_index)

# reproject to WGS84 to be compatible with scalebar

ZCTA_BWQS_COVID_shp <- st_transform(ZCTA_BWQS_COVID_shp, 4326)

write_csv(st_drop_geometry(ZCTA_BWQS_COVID_shp %>% dplyr::select(zcta, BWQS_index)), file.path(source_path, "fig3a.csv"))
# Figure 3A - BWQS Index by ZCTA
fig3a <- ggplot() + 
  geom_sf(data = basemap_water, fill = "white", lwd = 0) + 
  geom_sf(data = ZCTA_BWQS_COVID_shp, aes(fill = BWQS_index), lwd = 0.2) + 
  scalebar(ZCTA_BWQS_COVID_shp, dist = 5, dist_unit = "km", 
           transform = TRUE, model = "WGS84", 
           st.size = 2.8, height = 0.015, border.size = 0.5, st.dist = 0.011,
           anchor = c(x = -73.71, y = 40.495)) + 
  scale_fill_gradientn(colours=brewer_pal("YlGnBu", type = "seq")(7)) + 
  coord_sf(crs = st_crs(ZCTA_BWQS_COVID_shp),
           xlim = c(plot_bounds$xmin, plot_bounds$xmax), 
           ylim = c(plot_bounds$ymin, plot_bounds$ymax),
            expand = FALSE) +
  theme_bw(base_size = 6) + 
  labs(fill = "COVID-19 Inequity Index") +
  annotate("text", x = -73.709, y = 40.48, label = "water mask © OpenStreetMap contributors",
           size = theme_get()$text[["size"]]/4, hjust = "right") + 
  theme(legend.title = element_text(face = "bold", size = 9), 
        panel.background = element_rect(fill = "#cccccc"), 
        legend.background = element_rect(fill = "transparent"),
        panel.grid = element_blank(),
        legend.position = c(0.125, 0.90),
        legend.text = element_text(size = 8.5),
        legend.key.size = unit(1.1, "lines"),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title.y = element_blank(),
        axis.title.x = element_blank())
# Step 6: Compare quantile distribution of ZCTA-level BWQS scores by the race/ethnic composition of residents  
Demographics <- ACS_Data_scaled %>% 
  dplyr::select(GEOID, ends_with("_raceethnic"), total_pop1) %>%
  left_join(., modzcta_to_zcta1, by = c("GEOID" = "ZCTA")) %>%
  group_by(MODZCTA) %>%
  summarise_at(vars(ends_with("_raceethnic"), total_pop1), ~sum(.)) %>%
  mutate(zcta = as.character(MODZCTA),
         other_raceethnic = total_pop1 - (hisplat_raceethnic + nonhispLat_white_raceethnic + nonhispLat_black_raceethnic + 
                                            nonhispLat_amerindian_raceethnic + nonhispLat_asian_raceethnic))
  
ZCTA_BQWS <- ZCTA_BWQS_COVID_shp %>%
  st_set_geometry(., NULL) %>%
  dplyr::select(zcta, BWQS_index)

Demographics_for_ridges <- Demographics %>%
  left_join(., ZCTA_BQWS, by = "zcta") %>%
  dplyr::select(-total_pop1) %>%
  gather(key = "Race/Ethnicity", value = "Population", hisplat_raceethnic:other_raceethnic) %>%
  mutate(`Race/Ethnicity` = case_when(`Race/Ethnicity` == "hisplat_raceethnic" ~ "Hispanic, LatinX",
                                      `Race/Ethnicity` == "nonhispLat_black_raceethnic" ~ "Black",
                                      `Race/Ethnicity` == "nonhispLat_white_raceethnic" ~ "white",
                                      `Race/Ethnicity` == "nonhispLat_asian_raceethnic" ~ "Asian",
                                      TRUE ~ "other"),
         `Race/Ethnicity` = factor(`Race/Ethnicity`, levels = c("white", "Asian", "other", "Hispanic, LatinX", "Black")),
         Population = as.numeric(Population)) 

(Demographics_for_ridges %>%
  group_by(`Race/Ethnicity`) %>%
  summarise(weighted.mean(BWQS_index, Population),
            weightedMedian(BWQS_index, Population)))
## # A tibble: 5 x 3
##   `Race/Ethnicity` `weighted.mean(BWQS_index, Po… `weightedMedian(BWQS_index, P…
##   <fct>                                     <dbl>                          <dbl>
## 1 white                                      4.34                           4.69
## 2 Asian                                      5.48                           6.20
## 3 other                                      5.14                           5.66
## 4 Hispanic, LatinX                           6.10                           6.61
## 5 Black                                      6.26                           6.51
write_csv(Demographics_for_ridges, file.path(source_path, "fig4.csv"))
fig4 <- ggplot(Demographics_for_ridges,
       aes(x = BWQS_index, y = `Race/Ethnicity`)) + 
  xlab("COVID-19 inequity index")+
  geom_density_ridges(
    aes(height = ..density..,  
        weight = Population / sum(Population)),
    scale = 0.95,
    stat ="density") + 
  scale_y_discrete(expand = c(0, 0)) + 
  expand_limits(y = 6) + 
  theme_bw(base_size = 18) + 
  theme(legend.position = "none")
if(export.figs){
  write_figs(fig4, "fig4", fig_width = 8, fig_height = 5)
} 
## Wrote /home/rushj03/dev/COVID_19_admin_disparities/figures/fig4_2021-06-17.png

Fig. 4

Below_25th_zctas <- ZCTA_BQWS %>%
  filter(BWQS_index < quantile(BWQS_index, .25))
Race_Ethnicity_below25th <- Demographics %>%
  filter(zcta %in% Below_25th_zctas$zcta) %>%
  dplyr::select(-zcta, -MODZCTA) %>% 
  summarise_all(~sum(.)) %>%
  mutate_at(vars(ends_with("_raceethnic")), ~round((./total_pop1)*100, 2)) %>%
  mutate(Group = "Below 25th percentile BWQS")

Between_25thand75th_zctas <- ZCTA_BQWS %>%
  filter(BWQS_index > quantile(BWQS_index, .25) & BWQS_index < quantile(BWQS_index, .75))

Race_Ethnicity_btw_25th75th <- Demographics %>%
  filter(zcta %in% Between_25thand75th_zctas$zcta) %>%
  dplyr::select(-zcta, -MODZCTA) %>% 
  summarise_all(~sum(.)) %>%
  mutate_at(vars(ends_with("_raceethnic")), ~round((./total_pop1)*100, 2)) %>%
  mutate(Group = "Between 25-75th percentile BWQS")

Above_75th_zctas <- ZCTA_BQWS %>%
  filter(BWQS_index > quantile(BWQS_index, .75))

Race_Ethnicity_above_75th <- Demographics %>%
  filter(zcta %in% Above_75th_zctas$zcta) %>%
  dplyr::select(-zcta, -MODZCTA) %>% 
  summarise_all(~sum(.)) %>%
  mutate_at(vars(ends_with("_raceethnic")), ~round((./total_pop1)*100, 2)) %>%
  mutate(Group = "Above 75th percentile BWQS")

Race_Ethnicity_all <- Demographics %>%
  dplyr::select(-zcta, -MODZCTA) %>% 
  summarise_all(~sum(.)) %>%
  mutate_at(vars(ends_with("_raceethnic")), ~round((./total_pop1)*100, 2)) %>%
  mutate(Group = "NYC Population")

Demographics_by_BWQS <- bind_rows(Race_Ethnicity_all, Race_Ethnicity_below25th, Race_Ethnicity_btw_25th75th, Race_Ethnicity_above_75th) %>%
  mutate(other = other_raceethnic + nonhispLat_amerindian_raceethnic)  %>%
  dplyr::select(-other_raceethnic, -nonhispLat_amerindian_raceethnic, - total_pop1) %>%
  rename("Hispanic, LatinX" = "hisplat_raceethnic",
         "Black" = "nonhispLat_black_raceethnic", 
          "white" = "nonhispLat_white_raceethnic", 
         "Asian" = "nonhispLat_asian_raceethnic") %>%
  gather(., "Race/Ethnicity", "Proportion", 1:4,6) %>%
  mutate(`Race/Ethnicity` = factor(`Race/Ethnicity`, levels = c("other", "Asian", "white", "Hispanic, LatinX", "Black")),
         Group = factor(Group, levels = c("NYC Population", "Below 25th percentile BWQS", "Between 25-75th percentile BWQS", "Above 75th percentile BWQS")))

# shortened group labels for the x axis
labels_demographics <- c("NYC Population" = "NYC Population", "Below 25th percentile BWQS" = "Below 25th\n%ile index", 
                         "Between 25-75th percentile BWQS" = "Between 25-75th\n%ile index", "Above 75th percentile BWQS" = "Above 75th\n%ile index")

write_csv(Demographics_by_BWQS, file.path(source_path, "sfig4.csv"))
sfig4 <- ggplot(Demographics_by_BWQS, aes(fill=`Race/Ethnicity`, y=Proportion, x=Group)) + 
    geom_rect(data = subset(Demographics_by_BWQS, Group=="NYC Population"), 
            aes(xmin=as.numeric(Group)-.35,xmax=as.numeric(Group)+.35, ymin=0, ymax=100, fill="gray85"), color = "gray", alpha = .1) +
  geom_bar(data = subset(Demographics_by_BWQS, Group!="NYC Population"), position="stack", stat="identity", width = .75) +
  geom_bar(data = subset(Demographics_by_BWQS, Group=="NYC Population"), position="stack", stat="identity", width = .45) +
  scale_fill_manual(breaks = c("other", "Asian", "white", "Hispanic, LatinX", "Black"), 
                    values = c("#984ea3","#ff7f00","gray85", "#4daf4a", "#e41a1c", "#377eb8"))  +
  geom_text(aes(label=ifelse(Proportion >= 5, paste0(sprintf("%.0f", Proportion),"%"),"")),
            position=position_stack(vjust=0.5), colour="black", size = 8) + 
  scale_y_continuous("Percentage", expand = c(0,0), labels = function(x) paste0(x, "%")) + 
  scale_x_discrete(limits = c( "NYC Population", "Below 25th percentile BWQS", "Between 25-75th percentile BWQS", "Above 75th percentile BWQS"), 
                   labels = labels_demographics) + 
  theme_bw(base_size = 16) +
  theme(legend.text = element_text(size = 14),
        axis.text.y = element_text(size = 16),
        axis.text.x = element_text(size = 16, color = "black"), 
        axis.title.x = element_blank()) 
if(export.figs){
  write_figs(sfig4, "sfig4", fig_width = 12, fig_height = 6)
} 
## Wrote /home/rushj03/dev/COVID_19_admin_disparities/figures/sfig4_2021-06-17.png

Supp. Fig. 4

t_postm1 = Sys.time()

BWQS by Tract

#### BWQS by Tract ####

get_tract_vars_by_zcta <- function(tract_vars, modzcta_tract_crosswalk, whichq){
  whichq = str_to_lower(whichq)
  if(whichq %in% c("median", "q2", "2q", "2")){
    qname = "median"
    qnum = 2
  } else if(whichq %in% c("q3", "3q", "3")){
    qname = "3Q"
    qnum = 3
  } else { stop("unhandled quartile selected: ", whichq)}
  
  ses_zcta <- tract_vars %>% 
    st_set_geometry(., NULL) %>%
    left_join(., modzcta_tract_crosswalk, by = c("GEOID" = "TRACT")) %>% 
    filter(!is.na(MODZCTA)) %>% 
    group_by(MODZCTA) %>%
    summarise(essentialworker_drove_rename = Hmisc::wtd.quantile(essentialworker_drove, W_RES_RATIO)[[qnum]],
              essentialworker_pubtrans_rename = Hmisc::wtd.quantile(essentialworker_pubtrans, W_RES_RATIO)[[qnum]],
              not_quarantined_jobs_rename = Hmisc::wtd.quantile(not_quarantined_jobs, W_RES_RATIO)[[qnum]],
              didnot_workhome_commute_rename = Hmisc::wtd.quantile(didnot_workhome_commute, W_RES_RATIO)[[qnum]],
              not_insured_rename = Hmisc::wtd.quantile(not_insured, W_RES_RATIO)[[qnum]],
              one_over_medincome_rename = Hmisc::wtd.quantile(one_over_medincome, W_RES_RATIO)[[qnum]],
              unemployed_rename = Hmisc::wtd.quantile(unemployed, W_RES_RATIO)[[qnum]],
              avg_hhold_size_rename = Hmisc::wtd.quantile(avg_hhold_size, W_RES_RATIO)[[qnum]],
              res_vol_popdensity_rename = Hmisc::wtd.quantile(res_vol_popdensity, W_RES_RATIO)[[qnum]],
              one_over_grocers_per_1000_rename = Hmisc::wtd.quantile(one_over_grocers_per_1000, W_RES_RATIO)[[qnum]]) 
  
  ses_zcta %>% rename_with(~ gsub("rename", qname, .x))
}
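
# With dplyr >= 1.0, the ten near-identical wtd.quantile() lines above could be
# collapsed into a single across() call -- a sketch, not used in the pipeline:
# ses_cols <- c("essentialworker_drove", "essentialworker_pubtrans",
#               "not_quarantined_jobs", "didnot_workhome_commute", "not_insured",
#               "one_over_medincome", "unemployed", "avg_hhold_size",
#               "res_vol_popdensity", "one_over_grocers_per_1000")
# summarise(across(all_of(ses_cols),
#                  ~ Hmisc::wtd.quantile(.x, W_RES_RATIO)[[qnum]],
#                  .names = paste0("{.col}_", qname)))
# The .names spec appends the quantile name directly, which would also make the
# trailing rename_with() step unnecessary.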

# Select median tract values by MODZCTA, weighted by residential address share and population density by tract
SES_zcta_median <- get_tract_vars_by_zcta(tract_vars, modzcta_to_tract2, "median")
SES_vars_median = names(SES_zcta_median)[names(SES_zcta_median) != "MODZCTA"]

# join SES to testing data: positive/100k and testing_ratio
SES_zcta_median_testing <- ZCTA_ACS_COVID %>%
  dplyr::select(zcta, pos_per_100000, testing_ratio) %>%
  left_join(SES_zcta_median, by = c("zcta" = "MODZCTA"))

Warnings about undefined values in log_lik[1:177] and y_new[1:177] in the two tract-level models below have been hidden. The undefined values occurred only during an early part of the warmup period, where they would otherwise print a long list of warnings; they do not affect the posterior draws, as confirmed by the trace plots and the potential scale-reduction statistics (R_hat).

# BWQS using median at the tract level 
m2data_median <- prep_BWQS_data(SES_zcta_median_testing, SES_vars_median)
m2_median <- fit_BWQS_model(data_list = m2data_median$data_list, stan_model_path = BWQS_stan_model)
# Check for NA or Inf values in the post-warmup draws
m2_thinned_draws = m2_median@sim$samples[[1]]
m2_args = m2_median@stan_args[[1]]
m2_post_draws = (m2_args$warmup/m2_args$thin +1):(m2_args$iter/m2_args$thin)
range(m2_post_draws)
## [1]  251 2000
m2_check = sapply(m2_thinned_draws, function(pv) any(is.na(pv[m2_post_draws])) | any(is.infinite(pv[m2_post_draws])))
# Do any post-warmup draws have NA or Inf values? 
any(m2_check) 
## [1] FALSE
traceplot(m2_median, pars = c("beta1", "W"))
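
# The same convergence check numerically (a sketch): rstan's summary() reports
# split-R_hat for every parameter; the largest value should be close to 1.
max(summary(m2_median)$summary[, "Rhat"], na.rm = TRUE)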

Parameter estimates, credible intervals, and diagnostics from model using median tract values by MODZCTA

BWQS_params_m2 <- bind_cols(as_tibble(summary(m2_median)$summary[1:number_of_coefficients,c(1,4,6,8:10)]), label = vars)
BWQS_params_m2 %>% bind_cols(., "terms" = labels2) %>%
  dplyr::select(-label) %>%
  mutate_at(vars(1:6), ~round(., 3)) %>%
  mutate_at(vars(5), ~round(., 0)) %>% 
  mutate(`95% CrI`=paste0("(",format(`2.5%`, nsmall=3),", ",format(`97.5%`, nsmall=3),")")) %>% 
  mutate(terms = 
           case_when(terms=="BWQS term" ~ "COVID-19 inequity index",
                     TRUE               ~ terms)) %>%
  dplyr::select(-`2.5%`, -`97.5%`) %>%
  rename(median=`50%`) %>%
  dplyr::select(terms, mean, `95% CrI`, median, Rhat, n_eff) %>%
  kbl(align=rep('r', 6), font_size = 9.5) %>%
  kable_classic(full_width = F, html_font = "Arial")
| terms | mean | 95% CrI | median | Rhat | n_eff |
|---|---:|---|---:|---:|---:|
| Overdispersion | 107.824 | (84.252, 134.502) | 107.535 | 1.004 | 1799 |
| Intercept | 6.259 | (6.175, 6.342) | 6.259 | 1.000 | 1853 |
| COVID-19 inequity index | 0.079 | (0.067, 0.091) | 0.079 | 1.000 | 1803 |
| Testing ratio: spline term 1 | 0.950 | (0.865, 1.039) | 0.950 | 1.000 | 1451 |
| Testing ratio: spline term 2 | 1.995 | (1.780, 2.213) | 1.995 | 0.999 | 1897 |
| Testing ratio: spline term 3 | 1.147 | (1.034, 1.257) | 1.148 | 0.999 | 1776 |
| 1/ Median income | 0.141 | (0.026, 0.248) | 0.142 | 1.000 | 1879 |
| Uninsured | 0.062 | (0.004, 0.147) | 0.057 | 1.000 | 1741 |
| Unemployed | 0.064 | (0.002, 0.178) | 0.054 | 0.999 | 1563 |
| 1/ Grocers per 1000 | 0.056 | (0.003, 0.141) | 0.051 | 1.000 | 1863 |
| Essential Workers | 0.186 | (0.087, 0.289) | 0.185 | 0.999 | 1950 |
| Essential Worker: Public Transit | 0.090 | (0.010, 0.186) | 0.089 | 1.000 | 1786 |
| Essential Worker: Driving Commute | 0.046 | (0.002, 0.122) | 0.041 | 1.000 | 1761 |
| 1/ Work from home | 0.286 | (0.190, 0.389) | 0.287 | 1.001 | 1686 |
| Population Density by Residential Volume | 0.040 | (0.002, 0.108) | 0.035 | 1.000 | 1793 |
| Average people per household | 0.028 | (0.001, 0.072) | 0.025 | 1.001 | 1722 |
t_m2 = Sys.time()

# Select third quartile tract values by MODZCTA, weighted by residential address share and population density by tract
SES_zcta_3q <- get_tract_vars_by_zcta(tract_vars, modzcta_to_tract2, "3q")
SES_vars_3q = names(SES_zcta_3q)[names(SES_zcta_3q) != "MODZCTA"]
SES_zcta_3q_testing <- ZCTA_ACS_COVID %>%
  dplyr::select(zcta, pos_per_100000, testing_ratio) %>%
  left_join(SES_zcta_3q, by = c("zcta" = "MODZCTA"))
#BWQS using the 3rd quartile at the tract level 
m2data_3q <- prep_BWQS_data(SES_zcta_3q_testing, SES_vars_3q)
m2_3q <- fit_BWQS_model(data_list = m2data_3q$data_list, stan_model_path = BWQS_stan_model)
# Check for NA or Inf values in the post-warmup draws
m2_3q_thinned_draws = m2_3q@sim$samples[[1]]
m2_3q_args = m2_3q@stan_args[[1]]
m2_3q_post_draws = (m2_3q_args$warmup/m2_3q_args$thin +1):(m2_3q_args$iter/m2_3q_args$thin)
range(m2_3q_post_draws)
## [1]  251 2000
m2_3q_check = sapply(m2_3q_thinned_draws, function(pv) any(is.na(pv[m2_3q_post_draws])) | any(is.infinite(pv[m2_3q_post_draws])))
# Do any post-warmup draws have NA or Inf values? 
any(m2_3q_check) 
## [1] FALSE
traceplot(m2_3q, pars = c("beta1", "W"))

Parameter estimates, credible intervals, and diagnostics from model using third quartile tract values by MODZCTA

BWQS_params_m2_3q <- bind_cols(as_tibble(summary(m2_3q)$summary[1:number_of_coefficients,c(1,4,6,8:10)]), label = vars)
BWQS_params_m2_3q %>% bind_cols(., "terms" = labels2) %>%
  dplyr::select(-label) %>%
  mutate_at(vars(1:6), ~round(., 3)) %>%
  mutate_at(vars(5), ~round(., 0)) %>% 
  mutate(`95% CrI`=paste0("(",format(`2.5%`, nsmall=3),", ",format(`97.5%`, nsmall=3),")")) %>% 
  mutate(terms = 
           case_when(terms=="BWQS term" ~ "COVID-19 inequity index",
                     TRUE               ~ terms)) %>%
  dplyr::select(-`2.5%`, -`97.5%`) %>%
  rename(median=`50%`) %>%
  dplyr::select(terms, mean, `95% CrI`, median, Rhat, n_eff) %>%
  kbl(align=rep('r', 6), font_size = 9.5) %>%
  kable_classic(full_width = F, html_font = "Arial")
| terms | mean | 95% CrI | median | Rhat | n_eff |
|---|---:|---|---:|---:|---:|
| Overdispersion | 109.752 | (85.968, 137.586) | 109.024 | 1.001 | 1689 |
| Intercept | 6.254 | (6.172, 6.341) | 6.255 | 1.000 | 1491 |
| COVID-19 inequity index | 0.081 | (0.069, 0.093) | 0.081 | 0.999 | 1725 |
| Testing ratio: spline term 1 | 0.948 | (0.861, 1.033) | 0.947 | 1.002 | 1778 |
| Testing ratio: spline term 2 | 1.987 | (1.774, 2.187) | 1.988 | 1.000 | 1666 |
| Testing ratio: spline term 3 | 1.157 | (1.057, 1.264) | 1.157 | 1.002 | 1654 |
| 1/ Median income | 0.180 | (0.066, 0.280) | 0.183 | 0.999 | 1592 |
| Uninsured | 0.090 | (0.009, 0.181) | 0.090 | 1.000 | 1717 |
| Unemployed | 0.048 | (0.001, 0.141) | 0.038 | 1.000 | 1717 |
| 1/ Grocers per 1000 | 0.067 | (0.005, 0.155) | 0.062 | 1.000 | 1942 |
| Essential Workers | 0.180 | (0.085, 0.274) | 0.178 | 0.999 | 1933 |
| Essential Worker: Public Transit | 0.078 | (0.006, 0.171) | 0.075 | 1.002 | 1745 |
| Essential Worker: Driving Commute | 0.036 | (0.001, 0.101) | 0.031 | 1.000 | 1674 |
| 1/ Work from home | 0.248 | (0.154, 0.345) | 0.247 | 0.999 | 1663 |
| Population Density by Residential Volume | 0.050 | (0.003, 0.122) | 0.046 | 0.999 | 1811 |
| Average people per household | 0.023 | (0.001, 0.065) | 0.020 | 1.000 | 1549 |
t_m2_3q = Sys.time() 

Summarize_BWQS_performance <- function(model, df){
  model_type = deparse(substitute(model))
  waic = extract_waic(model)$waic[[1]]
  bayesr2 = Compute_Bayes_R2(model)$meanr2
  rmse = sqrt(mean((df$pos_per_100000 - colMeans(extract(model, "y_new")$y_new))^2))
  effect_estimate = exp(mean(extract(model, "beta1")$beta1))
  
  summarized <- bind_cols(model_name = as.character(model_type), waic = waic,
                          bayesr2 = bayesr2, rmse = rmse, effect_estimate = effect_estimate)
  
  return(summarized)
}

bind_rows(Summarize_BWQS_performance(m1, SES_zcta_median_testing),
          Summarize_BWQS_performance(m2_median, SES_zcta_median_testing),
          Summarize_BWQS_performance(m2_3q, SES_zcta_3q_testing))
## # A tibble: 3 x 5
##   model_name  waic bayesr2  rmse effect_estimate
##   <chr>      <dbl>   <dbl> <dbl>           <dbl>
## 1 m1         2370.   0.934  184.            1.08
## 2 m2_median  2362.   0.936  183.            1.08
## 3 m2_3q      2359.   0.937  183.            1.08

Part 2: Compare capacity to socially distance (as measured by transit data) by neighborhood-level risk scores

#### Part 2: Compare capacity to socially distance (as measured by transit data) by neighborhood-level risk scores ####  

UHF_ZipCodes1 <- UHF_ZipCodes %>%
  mutate(zcta = as.character(zip),
         uhf = as.character(code)) %>%
  dplyr::select(zcta, uhf)
  
ZCTA_BWQS_score <- ZCTA_BWQS_COVID_shp %>%
  st_drop_geometry() %>%
  dplyr::select(zcta, BWQS_index)
                
UHF_BWQS <- ZCTA_ACS_COVID %>%
  left_join(., UHF_ZipCodes1, by = "zcta") %>%
  left_join(., ZCTA_BWQS_score, by = "zcta") %>%
  group_by(uhf) %>%
  summarise(uhf_weighted_bwqs = weighted.mean(BWQS_index, total_pop1)) # population weighting
UHF_BWQS_COVID_shp <- UHF_shp %>% 
  mutate(uhf = as.character(UHFCODE)) %>%
  left_join(., UHF_BWQS, by = "uhf") %>%
  mutate(Risk = factor(ifelse(uhf_weighted_bwqs > median(uhf_weighted_bwqs, na.rm = T), #median split of neighborhood risk
                              "High", "Low"), levels = c("High", "Low")))

ggplot(UHF_BWQS_COVID_shp) + 
  geom_sf(aes(fill = uhf_weighted_bwqs))

Subway_ridership_by_UHF %>% filter(place == "all" & date >= "2020-01-29" & date <= "2020-04-30") %>% 
  ggplot() + geom_point(aes(date, usage.median.ratio, color = place))

Mean_Ridership <- Subway_ridership_by_UHF %>% #Use the mean ridership to identify the proper function for a nonlinear model
  filter(date >= "2020-01-29" & date <= "2020-04-30" & place=="all") %>%   
  arrange(date) %>%
  mutate(time_index = time(date))

Mean_Ridership %>% 
  ggplot() + geom_point(aes(date, usage.median.ratio))

# The four-parameter Weibull (type 2) dose-response function fits these data best
fit_drm_w2.4 <- drm(usage.median.ratio ~ time_index, fct =  W2.4(), data = Mean_Ridership)

# suppress a deprecation warning about vector recycling raised in predict.drc.R
handler <- function(w) if(any(grepl("Recycling array of length 1 in array-vector arithmetic is deprecated.", w))){
  invokeRestart("muffleWarning") } 

DRM_mean_predictions <- bind_cols(Mean_Ridership,
                                  as_tibble(withCallingHandlers(predict(fit_drm_w2.4, interval = "confidence"), warning = handler ))) 

# Supplementary Figure 6
write_csv(DRM_mean_predictions %>% dplyr::select(date, usage.median.ratio, Lower, Upper, Prediction), file.path(source_path, "sfig6.csv"))
sfig6 <- ggplot(DRM_mean_predictions, aes(x = date)) + 
  geom_point(aes(y = usage.median.ratio)) + 
  geom_ribbon(aes(ymin = Lower, ymax = Upper), fill = "grey50", alpha = .5) +
  geom_line(aes(y = Prediction), color = "red") + 
  theme_bw(base_size = 16) +
  xlab("Date") +
  scale_y_continuous("Relative Subway Ridership (%)", labels = scales::percent)
if(export.figs){
  write_figs(sfig6, "sfig6", fig_width = 8, fig_height = 5)
} 
## Wrote /home/rushj03/dev/COVID_19_admin_disparities/figures/sfig6_2021-06-17.png

Supp. Fig. 6

# create a dataframe for the analysis 
service_changes_in_lowsubway_areas <- tibble(
  date = as.Date(c("2020-02-01", "2020-02-02", "2020-02-08", "2020-02-09", "2020-02-15", "2020-02-16",
                   "2020-02-22", "2020-02-23", "2020-02-29", "2020-03-01", "2020-03-07", "2020-03-08",
                   "2020-02-01", "2020-02-02", "2020-02-08", "2020-02-09", "2020-02-16", "2020-02-16")),
  place = c(rep("403", 12), rep("502", 6)))

Subway_BWQS_df <- Subway_ridership_by_UHF %>%
  left_join(., st_set_geometry(UHF_BWQS_COVID_shp, NULL), by = c("place" = "uhf")) %>%
  filter(!is.na(place) & date >= "2020-01-29" & date <= "2020-04-30") %>%
  group_by(place) %>%
  arrange(date) %>%
  mutate(time_index = time(date)) %>%
  filter(!is.na(Risk) & date != "2020-02-17") %>% # removing Presidents' Day national holiday 
  anti_join(., service_changes_in_lowsubway_areas, by = c("date", "place")) %>% # removing low outliers due to service changes in low subway density areas
  ungroup()
  
fit_drm_all <- drm(usage.median.ratio ~ time_index, fct = W2.4(), data = Subway_BWQS_df)
plot(fit_drm_all)

fit_drm_interact <- drm(usage.median.ratio ~ time_index, curveid = Risk, fct = W2.4(), data = Subway_BWQS_df, separate = T) # used for inference
fit_drm_interact1 <- drm(usage.median.ratio ~ time_index, curveid = Risk, fct = W2.4(), data = Subway_BWQS_df, separate = F) # used for plotting; the separate = T fit does not plot correctly
fit_drm_risk_high <- drm(usage.median.ratio ~ time_index, curveid = Risk, fct = W2.4(), data = Subway_BWQS_df, subset = Risk=="High")
fit_drm_risk_low <- drm(usage.median.ratio ~ time_index, curveid = Risk, fct = W2.4(), data = Subway_BWQS_df, subset = Risk=="Low")
anova(fit_drm_all, fit_drm_interact) # comparing the mean only model to the interaction model 
## 
## 1st model
##  fct:      W2.4()
## 2nd model
##  fct:      NULL
## ANOVA table
## 
##           ModelDf     RSS Df F value p value
## 1st model    3291 21.2853                   
## 2nd model    3287  9.7026  4  980.99    0.00
summary(fit_drm_interact)
## 
## Model fitted: Weibull (type 2) (4 parms)
## 
## Parameter estimates:
## 
##           Estimate  Std. Error t-value   p-value    
## b:Low  -11.3040270   0.3538881 -31.942 < 2.2e-16 ***
## c:Low    0.0948493   0.0032067  29.578 < 2.2e-16 ***
## d:Low    0.9771003   0.0028346 344.709 < 2.2e-16 ***
## e:Low   44.8234506   0.0974580 459.926 < 2.2e-16 ***
## b:High -10.9365546   0.3342480 -32.720 < 2.2e-16 ***
## c:High   0.1600456   0.0033492  47.786 < 2.2e-16 ***
## d:High   0.9644644   0.0027734 347.756 < 2.2e-16 ***
## e:High  46.5678161   0.1056268 440.871 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error:
## 
##  0.07296489 (3287 degrees of freedom)
confint(fit_drm_interact)
##               2.5 %      97.5 %
## b:Low  -11.99789039 -10.6101636
## c:Low    0.08856194   0.1011366
## d:Low    0.97154263   0.9826580
## e:Low   44.63236612  45.0145351
## b:High -11.59190996 -10.2811993
## c:High   0.15347887   0.1666124
## d:High   0.95902665   0.9699022
## e:High  46.36071510  46.7749171
compParm(fit_drm_interact, "b", "-")
## 
## Comparison of parameter 'b' 
## 
##          Estimate Std. Error t-value p-value
## Low-High -0.36747    0.48678 -0.7549  0.4504
compParm(fit_drm_interact, "c", "-")
## 
## Comparison of parameter 'c' 
## 
##            Estimate Std. Error t-value   p-value    
## Low-High -0.0651964  0.0046368  -14.06 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
slopes <- as_tibble(coef(fit_drm_interact), rownames = "vars") %>%
  separate(col = vars, into = c("vars", "BWQS_risk"), sep = ":") %>%
  spread(., vars, value) %>%
  mutate(slope = -b*(-d/(4*e)))

as_tibble(confint(fit_drm_interact), rownames = "vars") %>%
  separate(col = vars, into = c("vars", "BWQS_risk"), sep = ":") %>%
  filter(vars == "b") %>%
  right_join(., slopes, by = "BWQS_risk") %>%
  mutate(slope_lower_ci = -`2.5 %`*(-d/(4*e)),
  slope_higher_ci = -`97.5 %`*(-d/(4*e))) %>%
  dplyr::select(BWQS_risk, slope, slope_lower_ci, slope_higher_ci)
## # A tibble: 2 x 4
##   BWQS_risk   slope slope_lower_ci slope_higher_ci
##   <chr>       <dbl>          <dbl>           <dbl>
## 1 Low       -0.0616        -0.0654         -0.0578
## 2 High      -0.0566        -0.0600         -0.0532
fit_drm_predictions <- as_tibble(withCallingHandlers(predict(fit_drm_interact1, as.data.frame(Subway_BWQS_df %>% dplyr::select(usage.median.ratio, time_index, Risk)), 
                                                             interval = "confidence"), warning = handler))
Subway_BWQS_df1 <- bind_cols(Subway_BWQS_df, fit_drm_predictions) 

Subway_BWQS_df2 <- Subway_BWQS_df1 %>%
  filter(date>"2020-02-16") %>% # subsetting for visualization
  mutate(Risk = if_else(Risk == "High", "High (above median)", "Low (below median)"))

write_csv(Subway_BWQS_df2 %>% dplyr::select(date, usage.median.ratio, Risk, Lower, Upper, Prediction), file.path(source_path, "fig5.csv"))
fig5 <- ggplot() + 
  geom_jitter(data = Subway_BWQS_df2, aes(x = date, y = usage.median.ratio, color = Risk), alpha = .5, position = position_jitter(height = 0, width = 0.4))+ 
  geom_ribbon(data = subset(Subway_BWQS_df2, Risk == "High (above median)"), aes(x = date, ymin = Lower, ymax = Upper), fill = "grey50") +
  geom_ribbon(data = subset(Subway_BWQS_df2, Risk == "Low (below median)"), aes(x = date, ymin = Lower, ymax = Upper), fill = "grey50") +
  geom_line(data = Subway_BWQS_df2, aes(x = date, y = Prediction, color = Risk)) +
  scale_x_date("Date", date_minor_breaks = "1 week") + 
  scale_y_continuous("Relative Subway Ridership (%)", labels = scales::percent) + 
  geom_vline(xintercept = date("2020-03-22"),color = "grey30", lty = 2) + 
  theme_bw(base_size = 16) +
  labs(colour="COVID-19 inequity index") +
  theme(legend.title = element_text(face = "bold", size = 12), legend.position = c(0.8, 0.7))  
if(export.figs){
  write_figs(fig5, "fig5", fig_width = 8, fig_height = 6)
} 
## Wrote /home/rushj03/dev/COVID_19_admin_disparities/figures/fig5_2021-06-17.png

Fig. 5

# which UHF neighborhoods were dropped?
included_uhf <- Subway_BWQS_df %>% distinct(UHFCODE)
notincluded_uhf_shp <- UHF_BWQS_COVID_shp %>%
  filter(!UHFCODE %in% included_uhf$UHFCODE & 
           UHFCODE !=0) %>%
  mutate(NotIncluded = "*")

write_csv(UHF_BWQS_COVID_shp, file.path(source_path, "sfig5_a.csv"))
write_csv(SubwayStation_shp, file.path(source_path, "sfig5_b.csv"))
write_csv(notincluded_uhf_shp, file.path(source_path, "sfig5_c.csv"))
# Supplementary Figure 5
sfig5 <- ggplot() + 
  geom_sf(data = basemap_water, fill = "white", lwd = 0) + 
  geom_sf(data = NYC_basemap_shp, size = 0.2) +
  geom_sf(data = subset(UHF_BWQS_COVID_shp, !is.na(Risk)), aes(fill = Risk), size = 0.2) + 
  labs(fill = "COVID-19 inequity index\ngrouping") + 
  geom_sf(data = SubwayStation_shp, size = 0.6) +
  geom_sf_text(data = notincluded_uhf_shp, aes(label = NotIncluded), size = 9, color = "grey15") +
  coord_sf(crs = st_crs(ZCTA_BWQS_COVID_shp),
           xlim = c(plot_bounds$xmin, plot_bounds$xmax), 
           ylim = c(plot_bounds$ymin, plot_bounds$ymax),
           expand = FALSE) +
  annotate("text", x = -73.709, y = 40.48, label = "water mask © OpenStreetMap contributors",
           size = theme_get()$text[["size"]]/6, hjust = "right") +
  theme_bw() + 
  theme_smallmaps + 
  theme(legend.position = c(0.22, 0.86), plot.margin = margin(0, 0, 0, 0, "pt"))
if(export.figs){
  write_figs(sfig5, "sfig5", fig_width = 4, fig_height = 4)
}
## Wrote /home/rushj03/dev/COVID_19_admin_disparities/figures/sfig5_2021-06-17.png

Supp. Fig. 5

Part 3: Spatial analysis of mortality in relation to BWQS scores

#### Part 3: Spatial analysis of mortality in relation to BWQS scores  ####
# Step 1: Create dataframes with the relevant information 
deaths_by23May2020_by_zcta1 <- deaths_by23May2020_by_zcta %>%
  left_join(., modzcta_to_zcta, by = c("MODIFIED_ZCTA" = "MODZCTA")) %>%
  mutate(zcta = as.character(MODIFIED_ZCTA)) %>%
  rename("deaths_count" = "COVID_DEATH_COUNT") %>%
  dplyr::select(zcta, deaths_count, COVID_DEATH_RATE) %>%
  distinct(zcta, .keep_all = T)

ZCTA_BWQS_COVID_shp1 <- ZCTA_ACS_COVID_shp %>% 
  left_join(.,ZCTA_BWQS_score, by = "zcta") %>%
  left_join(., deaths_by23May2020_by_zcta1, by = "zcta") %>%
  mutate(prop_65plus = age65_plus/total_pop1,
         zcta = as.numeric(zcta)) 

write_csv(st_drop_geometry(ZCTA_BWQS_COVID_shp1 %>% dplyr::select(zcta, COVID_DEATH_RATE)), file.path(source_path, "fig3c.csv"))
# Figure 3C - Mortality
fig3c <- ZCTA_BWQS_COVID_shp1 %>% 
  ggplot() +
  geom_sf(data = basemap_water, fill = "white", lwd = 0) + 
  geom_sf(aes(fill = COVID_DEATH_RATE), lwd = 0.2)+
  scalebar(MODZCTA_NYC_shp1, dist = 5, dist_unit = "km", 
           transform = TRUE, model = "WGS84", 
           st.size = 2.8, height = 0.015, border.size = 0.5,
           anchor = c(x = -73.71, y = 40.51)) + 
  labs(fill = "Mortality per 100,000") +
  ggtitle("Cumulative COVID mortality by zip code (May 23, 2020)") +
  scale_fill_gradientn(colours=brewer_pal("YlGnBu", type = "seq")(7)) + 
  coord_sf(crs = st_crs(ZCTA_BWQS_COVID_shp),
           xlim = c(plot_bounds$xmin, plot_bounds$xmax), 
           ylim = c(plot_bounds$ymin, plot_bounds$ymax),
           expand = FALSE) +
  theme_bw(base_size = 6) + 
  theme_smallmaps


# Combine subfigures to make Figure 3: maps of BWQS, Tests, and Mortality by ZCTA
fig3a_2 = fig3a + labs(tag = "a") + theme(plot.tag.position = c(-0.018, 0.985), plot.tag = element_text(face = "bold", size = 15))
fig3b_2 = fig3b + labs(tag = "b") + theme(plot.tag.position = c(-0.028, 0.93), plot.tag = element_text(face = "bold", size = 15))
fig3c_2 = fig3c + labs(tag = "c") + theme(plot.tag.position = c(-0.028, 0.93), plot.tag = element_text(face = "bold", size = 15))

plotres = 300
if(export.figs) {
  if(vector.figs){
    fig3_path = file.path(fig.path, paste0("fig3_combined_", Sys.Date(), ".svg"))
    svg(filename = fig3_path, width = 1.9*4.25, height = 1.9*6)
  } else {
    agg_png(filename = file.path(fig.path, paste0("fig3_combined_", Sys.Date(), ".png")), 
            width = plotres*4.25, height = plotres*6, scaling = 2)
  }
  grid.arrange(arrangeGrob(fig3a_2, ncol = 1, nrow = 1), 
               arrangeGrob(fig3b_2, fig3c_2, ncol = 2, nrow = 1),
               nrow = 2, ncol = 1, heights = c(1.9,1))
  dev.off()
  if(vector.figs){
    system(paste0("rsvg-convert -f pdf -o ", str_sub(fig3_path,1,-5), ".pdf", " ", fig3_path))
    unlink(fig3_path)
  } 
}

Fig. 3

# Step 2: Run negative binomial model with spatial filtering  

# Step 2a: Identify the neighbor relationships 
spdat.sens <- as_Spatial(ZCTA_BWQS_COVID_shp1)
ny.nb6 <- knearneigh(sp::coordinates(spdat.sens), k=6)
ny.nb6 <- knn2nb(ny.nb6)
ny.nb6 <- make.sym.nb(ny.nb6)
ny.wt6 <- nb2listw(ny.nb6, style="W")
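
# Quick sanity check on the neighbor structure (a sketch): spdep::card() counts
# links per region, and make.sym.nb() only adds links, so every ZCTA should
# retain at least its 6 nearest neighbors.
summary(card(ny.nb6))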

# Step 2b: Fit the model to identify the component of the data with substantial spatial autocorrelation
fit.nb.ny.sens <- glm.nb(deaths_count ~ offset(log(total_pop1)) + BWQS_index, spdat.sens)
lm.morantest(fit.nb.ny.sens, listw = ny.wt6)
## 
##  Global Moran I for regression residuals
## 
## data:  
## model: glm.nb(formula = deaths_count ~ offset(log(total_pop1)) +
## BWQS_index, data = spdat.sens, init.theta = 6.644443327, link = log)
## weights: ny.wt6
## 
## Moran I statistic standard deviate = 3.2076, p-value = 0.0006692
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##      0.126081335     -0.001798209      0.001589415
me.fit.sens <- spatialreg::ME(deaths_count~offset(log(total_pop1))+BWQS_index,
                              spdat.sens@data, family=negative.binomial(fit.nb.ny.sens$theta), listw = ny.wt6, verbose=T, alpha=.1, nsim = 999)
## eV[,16], I: 0.05661728 ZI: NA, pr(ZI): 0.06
## eV[,23], I: 0.01195511 ZI: NA, pr(ZI): 0.297
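# ME() adds Moran eigenvectors one at a time (here vec16, then vec23), choosing
# the eigenvector that most reduces residual autocorrelation and stopping once
# the permutation p-value pr(ZI) rises above alpha = 0.1.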
# Step 2c: Pull out these fits and visualize the autocorrelation
fits.sens <- data.frame(fitted(me.fit.sens))
spdat.sens$me16 <- fits.sens$vec16
spdat.sens$me23 <- fits.sens$vec23
spplot(spdat.sens, "me16", at=quantile(spdat.sens$me16, p=seq(0,1,length.out = 7)))

spplot(spdat.sens, "me23", at=quantile(spdat.sens$me23, p=seq(0,1,length.out = 7)))

# Step 2d: Include the fits in our regression model as an additional parameter
clean.nb.sens <- glm.nb(deaths_count ~ offset(log(total_pop1)) + BWQS_index + fitted(me.fit.sens), spdat.sens@data)
tidy(clean.nb.sens) %>% mutate(estimate_exp = exp(estimate))
## # A tibble: 4 x 6
##   term                     estimate std.error statistic  p.value estimate_exp
##   <chr>                       <dbl>     <dbl>     <dbl>    <dbl>        <dbl>
## 1 (Intercept)                -7.29     0.0842    -86.6  0.           0.000681
## 2 BWQS_index                  0.180    0.0151     11.9  8.78e-33     1.20    
## 3 fitted(me.fit.sens)vec16   -0.922    0.384      -2.40 1.64e- 2     0.398   
## 4 fitted(me.fit.sens)vec23   -1.78     0.383      -4.65 3.25e- 6     0.169
as_tibble(confint(clean.nb.sens), rownames = "vars")%>% mutate_at(vars(2:3), .funs = list(~exp(.)))
## Waiting for profiling to be done...
## # A tibble: 4 x 3
##   vars                      `2.5 %` `97.5 %`
##   <chr>                       <dbl>    <dbl>
## 1 (Intercept)              0.000576 0.000807
## 2 BWQS_index               1.16     1.23    
## 3 fitted(me.fit.sens)vec16 0.188    0.841   
## 4 fitted(me.fit.sens)vec23 0.0785   0.361
lm.morantest(clean.nb.sens, resfun = residuals, listw=ny.wt6)
## 
##  Global Moran I for regression residuals
## 
## data:  
## model: glm.nb(formula = deaths_count ~ offset(log(total_pop1)) +
## BWQS_index + fitted(me.fit.sens), data = spdat.sens@data, init.theta =
## 7.95088153, link = log)
## weights: ny.wt6
## 
## Moran I statistic standard deviate = 1.6812, p-value = 0.04636
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##      0.065068136     -0.002561584      0.001618272
spdat.sens$nb_residuals <- clean.nb.sens$residuals
spplot(spdat.sens, "nb_residuals", at=quantile(spdat.sens$nb_residuals, p=seq(0,1,length.out = 10)))

pal_sfig10 <- brewer_pal(type = "div", palette = "RdYlBu")(5)[2:5] 
pal_sfig10[[2]] <- "#fafadf" # make yellow less saturated

write_csv(bind_cols(zcta = spdat.sens[["zcta"]], nb_residuals = spdat.sens[["nb_residuals"]]), file.path(source_path, "sfig10.csv"))
sfig10 <- ggplot() + 
  geom_sf(data = basemap_water, fill = "white", lwd = 0) + 
  geom_sf(data = st_as_sf(spdat.sens), aes(fill = cut_width(nb_residuals, width = 1)), lwd = 0.2) + 
  scale_fill_discrete(type = pal_sfig10) + 
  coord_sf(crs = st_crs(MODZCTA_NYC_shp1),
           xlim = c(plot_bounds$xmin, plot_bounds$xmax), 
           ylim = c(plot_bounds$ymin, plot_bounds$ymax),
           expand = FALSE) + 
  labs(fill = "Residuals\n(Standard Deviations)") + 
  annotate("text", x = -73.709, y = 40.48, label = "water mask © OpenStreetMap contributors",
           size = theme_get()$text[["size"]]/6, hjust = "right") +
  theme_bw() + 
  theme(panel.background = element_rect(fill = "#cccccc"), 
        panel.grid = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title.y = element_blank(),
        axis.title.x = element_blank(),
        legend.title = element_text(face = "bold", size = 9),
        legend.text = element_text(size = 8.5),
        plot.margin = unit(c(0.1,0.1,0.1,0.1), "in")
        )
if(export.figs){
  write_figs(sfig10, "sfig10", fig_width = 5, fig_height = 3.2)
} 
## Wrote /home/rushj03/dev/COVID_19_admin_disparities/figures/sfig10_2021-06-17.png

Supp. Fig. 10

Sensitivity Analyses

#### Sensitivity Analyses ####
# half-Cauchy prior for overdispersion parameter
BWQS_NB_halfcauchy_stan_model <- here("code", "nb_bwqs_halfcauchy.stan") 
m1_halfcauchy <- fit_BWQS_model(m1data$data_list, BWQS_NB_halfcauchy_stan_model)
# beta1 effect estimate using half-cauchy prior for overdispersion
exp(summary(m1_halfcauchy)$summary[3,c(1,4,8)])
##     mean     2.5%    97.5% 
## 1.076923 1.065495 1.088503
# main model (inverse gamma prior for overdispersion)
exp(summary(m1)$summary[3,c(1,4,8)])
##     mean     2.5%    97.5% 
## 1.077078 1.065505 1.089368
# no meaningful difference in effect estimate or credible interval!

# BWQS weights -- stability of the rankings 
as_tibble(extract(m1, c("W[1]","W[2]", "W[3]", "W[4]", "W[5]", "W[6]", "W[7]", "W[8]", "W[9]", "W[10]"))) %>%
  rownames_to_column("iteration") %>%
  gather(weight, value, "W[1]":"W[10]") %>%
  group_by(iteration) %>%
  slice(which.max(value)) %>%
  ungroup() %>%
    group_by(weight) %>%
    dplyr::summarise(times_largest_weight = n()) %>%
    mutate(pct_largest_weight = round((times_largest_weight/1750)*100, 2)) # 1750 post-warmup draws
## # A tibble: 9 x 3
##   weight times_largest_weight pct_largest_weight
##   <chr>                 <int>              <dbl>
## 1 W[1]                     72              4.11 
## 2 W[10]                   455             26    
## 3 W[2]                    561             32.1  
## 4 W[4]                      5              0.290
## 5 W[5]                     13              0.74 
## 6 W[6]                      1              0.06 
## 7 W[7]                    523             29.9  
## 8 W[8]                     48              2.74 
## 9 W[9]                     72              4.11
# using essential workers, testing ratio, and median income
glm_esstl_and_income <- glm.nb(pos_per_100000 ~ not_quarantined_jobs + medincome + testing_ratio, data = ZCTA_ACS_COVID)
broom::tidy(glm_esstl_and_income)
## # A tibble: 4 x 5
##   term                    estimate   std.error statistic   p.value
##   <chr>                      <dbl>       <dbl>     <dbl>     <dbl>
## 1 (Intercept)           6.34       0.0701          90.4  0.       
## 2 not_quarantined_jobs  0.0185     0.00248          7.46 8.96e- 14
## 3 medincome            -0.00000250 0.000000359     -6.96 3.36e- 12
## 4 testing_ratio        19.5        0.888           22.0  4.07e-107
enframe(predict(glm_esstl_and_income)) %>% mutate(value = exp(value))
## # A tibble: 177 x 2
##    name  value
##    <chr> <dbl>
##  1 1     1293.
##  2 2     1301.
##  3 3      877.
##  4 4      870.
##  5 5      710.
##  6 6      787.
##  7 7      677.
##  8 8     1200.
##  9 9      987.
## 10 10     891.
## # … with 167 more rows
# Just using proportion uninsured and median income
glm_insurance_and_income <- glm.nb(pos_per_100000 ~ not_insured + medincome, data = ZCTA_ACS_COVID)
broom::tidy(glm_insurance_and_income)
## # A tibble: 3 x 5
##   term           estimate   std.error statistic  p.value
##   <chr>             <dbl>       <dbl>     <dbl>    <dbl>
## 1 (Intercept)  8.14       0.120          68.0   0.      
## 2 not_insured -0.00420    0.00857        -0.490 6.24e- 1
## 3 medincome   -0.00000737 0.000000891    -8.26  1.41e-16
# PCA of social variables 
pca_socialvars <- prcomp(ZCTA_ACS_COVID %>% dplyr::select(all_of(SES_vars)), center = T, scale. = T)
PC1 <- enframe(pca_socialvars$rotation[,1]) %>% arrange(desc(value)) %>% rename("label" = "name")

df_for_PCA_analysis <- ZCTA_ACS_COVID %>% bind_cols(., PC1 = pca_socialvars$x[,1])
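
# Context for using only the first component (a sketch): summary.prcomp()
# reports the proportion of variance each component explains.
summary(pca_socialvars)$importance[, 1:3]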

# Compare BWQS ranks to Principal Components ranks
BWQS_fits %>%
  dplyr::select(mean, label) %>%
  mutate(rank_bwqs = rank(mean)) %>%
  left_join(., PC1, by = "label") %>%
  mutate(rank_pca = rank(value)) %>%
  dplyr::select(2, 3, 5, 1, 4)
## # A tibble: 10 x 5
##    label                     rank_bwqs rank_pca   mean  value
##    <chr>                         <dbl>    <dbl>  <dbl>  <dbl>
##  1 not_insured                      10        4 0.181  0.317 
##  2 essentialworker_drove             9        2 0.167  0.201 
##  3 avg_hhold_size                    8        9 0.159  0.382 
##  4 res_vol_popdensity                7        3 0.115  0.216 
##  5 one_over_medincome                6        8 0.0976 0.363 
##  6 didnot_workhome_commute           5        7 0.0884 0.359 
##  7 not_quarantined_jobs              4       10 0.0617 0.397 
##  8 unemployed                        3        6 0.0532 0.358 
##  9 essentialworker_pubtrans          2        5 0.0500 0.335 
## 10 one_over_grocers_per_1000         1        1 0.0264 0.0890
glm_princomp <- glm.nb(pos_per_100000 ~ PC1 + ns(testing_ratio, df = 3), data = df_for_PCA_analysis)
summary(glm_princomp)
## 
## Call:
## glm.nb(formula = pos_per_100000 ~ PC1 + ns(testing_ratio, df = 3), 
##     data = df_for_PCA_analysis, init.theta = 94.08962057, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.5314  -0.5865   0.0748   0.5553   3.0714  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                6.635831   0.047047  141.05   <2e-16 ***
## PC1                        0.071807   0.004999   14.36   <2e-16 ***
## ns(testing_ratio, df = 3)1 0.980427   0.042311   23.17   <2e-16 ***
## ns(testing_ratio, df = 3)2 2.032569   0.106295   19.12   <2e-16 ***
## ns(testing_ratio, df = 3)3 1.135794   0.053268   21.32   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(94.0896) family taken to be 1)
## 
##     Null deviance: 2796.19  on 176  degrees of freedom
## Residual deviance:  179.89  on 172  degrees of freedom
## AIC: 2381.1
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  94.1 
##           Std. Err.:  10.7 
## 
##  2 x log-likelihood:  -2369.08
exp(confint(glm_princomp))
## Waiting for profiling to be done...
##                                 2.5 %     97.5 %
## (Intercept)                694.185891 836.816961
## PC1                          1.063885   1.085095
## ns(testing_ratio, df = 3)1   2.456061   2.892820
## ns(testing_ratio, df = 3)2   6.173665   9.430225
## ns(testing_ratio, df = 3)3   2.804839   3.461578
enframe(predict(glm_princomp)) %>% mutate(glm_princomp = exp(value)) %>% dplyr::select(glm_princomp)
## # A tibble: 177 x 1
##    glm_princomp
##           <dbl>
##  1        1339.
##  2        1089.
##  3         727.
##  4         942.
##  5         741.
##  6         819.
##  7         891.
##  8        1065.
##  9         919.
## 10         809.
## # … with 167 more rows
enframe(predict(glm_esstl_and_income)) %>% mutate(glm_esstl_and_income = exp(value)) %>% dplyr::select(glm_esstl_and_income)
## # A tibble: 177 x 1
##    glm_esstl_and_income
##                   <dbl>
##  1                1293.
##  2                1301.
##  3                 877.
##  4                 870.
##  5                 710.
##  6                 787.
##  7                 677.
##  8                1200.
##  9                 987.
## 10                 891.
## # … with 167 more rows
Compare_Metrics <- bind_cols(ZCTA_ACS_COVID,
                             glm_princomp = enframe(predict(glm_princomp)) %>% mutate(glm_princomp = exp(value)) %>% dplyr::select(glm_princomp),
                             glm_esstl_and_income = enframe(predict(glm_esstl_and_income)) %>% mutate(glm_esstl_and_income = exp(value)) %>% dplyr::select(glm_esstl_and_income),
                             BWQS_index = colMeans(extract(m1, "y_new")$y_new)) # posterior-mean predicted infections from the BWQS model

# comparison of RMSE and Kendall's tau metrics for three different modeling approaches
# (note the parentheses below: sqrt(mean(e)^2) is the absolute mean error, not sqrt(mean(e^2)))
# Supplemental Table 4
bind_rows(Compare_Metrics %>% 
            summarise_at(vars(c("BWQS_index", "glm_princomp", "glm_esstl_and_income")), 
                         ~cor(pos_per_100000, .x, method = "kendall")) %>%
            mutate(parameter = "kendalls"),
          Compare_Metrics %>% 
            summarise_at(vars(c("BWQS_index", "glm_princomp", "glm_esstl_and_income")), 
                         ~sqrt(mean(pos_per_100000-.x)^2)) %>%
            mutate(parameter = "RMSE"))
## # A tibble: 2 x 4
##   BWQS_index glm_princomp glm_esstl_and_income parameter
##        <dbl>        <dbl>                <dbl> <chr>    
## 1      0.869        0.850                0.823 kendalls 
## 2      2.67         2.14                 8.10  RMSE

Subway analyses

#### Subway analyses ####
# Different BWQS groupings for subway analysis (<.25, .25-.75, >.75)

UHF_BWQS_COVID_3split_shp <- UHF_shp %>% 
  mutate(uhf = as.character(UHFCODE)) %>%
  left_join(., UHF_BWQS, by = "uhf") %>%
  mutate(Risk = factor(ifelse(uhf_weighted_bwqs < quantile(uhf_weighted_bwqs, probs = .25, na.rm = T), "Low",
                              if_else(uhf_weighted_bwqs > quantile(uhf_weighted_bwqs, probs = .75, na.rm = T), "High", "Mid")),
                       levels = c("High", "Mid", "Low")))


Subway_BWQS_3split_df <- Subway_ridership_by_UHF %>%
  left_join(., st_set_geometry(UHF_BWQS_COVID_3split_shp, NULL), by = c("place" = "uhf")) %>%
  filter(!is.na(place) & date >= "2020-01-29" & date <= "2020-04-30") %>%
  group_by(place) %>%
  arrange(date) %>%
  mutate(time_index = time(date)) %>%
  filter(!is.na(Risk) & date != "2020-02-17") %>% #removing Presidents' Day national holiday 
  anti_join(., service_changes_in_lowsubway_areas, by = c("date", "place")) #removing low outliers due to service changes in low subway density areas

fit_drm_risk_3split <- drm(usage.median.ratio ~ time_index, curveid = Risk, fct = W2.4(), data = Subway_BWQS_3split_df, separate = F)
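# W2.4() is a four-parameter Weibull (type 2) curve: b controls steepness,
# c and d are the lower and upper plateaus, and e locates the decline in time.
# fit_drm_all (fit earlier with a single shared curve) is the nested reference
# model for the ANOVA below.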
anova(fit_drm_all, fit_drm_risk_3split)
## 
## 1st model
##  fct:      W2.4()
##  pmodels: 1 (for all parameters)
## 2nd model
##  fct:      W2.4()
##  pmodels: Risk (for all parameters)
## ANOVA table
## 
##           ModelDf    RSS Df F value p value
## 1st model    3291 21.285                   
## 2nd model    3283 15.418  8  156.17    0.00
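The F-test strongly favors the second model (F = 156.17, p < 0.001): allowing separate Weibull parameters for each BWQS risk group fits the ridership decline significantly better than a single shared curve.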
summary(fit_drm_risk_3split)
## 
## Model fitted: Weibull (type 2) (4 parms)
## 
## Parameter estimates:
## 
##           Estimate  Std. Error t-value   p-value    
## b:Low  -12.0674637   0.4806596 -25.106 < 2.2e-16 ***
## b:Mid  -11.0552063   0.3517647 -31.428 < 2.2e-16 ***
## b:High -10.8711772   0.3998416 -27.189 < 2.2e-16 ***
## c:Low    0.0728684   0.0038963  18.702 < 2.2e-16 ***
## c:Mid    0.1293654   0.0033929  38.128 < 2.2e-16 ***
## c:High   0.1751586   0.0040680  43.058 < 2.2e-16 ***
## d:Low    0.9804785   0.0035690 274.718 < 2.2e-16 ***
## d:Mid    0.9599509   0.0028982 331.228 < 2.2e-16 ***
##  [ reached getOption("max.print") -- omitted 4 rows ]
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error:
## 
##  0.06852921 (3283 degrees of freedom)
compParm(fit_drm_risk_3split, "b", operator = "-")
## 
## Comparison of parameter 'b' 
## 
##          Estimate Std. Error t-value p-value  
## Low-Mid  -1.01226    0.59563 -1.6995 0.08932 .
## Low-High -1.19629    0.62523 -1.9134 0.05579 .
## Mid-High -0.18403    0.53255 -0.3456 0.72969  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
compParm(fit_drm_risk_3split, "c", operator = "-")
## 
## Comparison of parameter 'c' 
## 
##            Estimate Std. Error  t-value   p-value    
## Low-Mid  -0.0564971  0.0051665 -10.9352 < 2.2e-16 ***
## Low-High -0.1022903  0.0056329 -18.1595 < 2.2e-16 ***
## Mid-High -0.0457932  0.0052972  -8.6448 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
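The group contrasts localize where the curves differ: the steepness parameter b is statistically indistinguishable across groups, while the lower plateau c differs sharply, with high-index neighborhoods settling at roughly 18% of baseline ridership versus about 7% in low-index neighborhoods.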
plot(fit_drm_risk_3split)

fit_drm_risk_3split_high <- drm(usage.median.ratio ~ time_index, curveid = Risk, fct = W2.4(), data = Subway_BWQS_3split_df, subset = Risk=="High")
fit_drm_risk_3split_mid <- drm(usage.median.ratio ~ time_index, curveid = Risk, fct = W2.4(), data = Subway_BWQS_3split_df, subset = Risk=="Mid")
fit_drm_risk_3split_low <- drm(usage.median.ratio ~ time_index, curveid = Risk, fct = W2.4(), data = Subway_BWQS_3split_df, subset = Risk=="Low")

summary(fit_drm_risk_3split_high)
## 
## Model fitted: Weibull (type 2) (4 parms)
## 
## Parameter estimates:
## 
##                  Estimate  Std. Error t-value   p-value    
## b:(Intercept) -10.8693720   0.3278018 -33.158 < 2.2e-16 ***
## c:(Intercept)   0.1751430   0.0033359  52.502 < 2.2e-16 ***
## d:(Intercept)   0.9758825   0.0027014 361.256 < 2.2e-16 ***
## e:(Intercept)  46.8619040   0.1050027 446.292 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error:
## 
##  0.05619323 (1008 degrees of freedom)
summary(fit_drm_risk_3split_mid)
## 
## Model fitted: Weibull (type 2) (4 parms)
## 
## Parameter estimates:
## 
##                  Estimate  Std. Error t-value   p-value    
## b:(Intercept) -11.0643597   0.3961231 -27.932 < 2.2e-16 ***
## c:(Intercept)   0.1294098   0.0038142  33.928 < 2.2e-16 ***
## d:(Intercept)   0.9599361   0.0032584 294.602 < 2.2e-16 ***
## e:(Intercept)  45.9148150   0.1186594 386.946 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error:
## 
##  0.07705059 (1359 degrees of freedom)
summary(fit_drm_risk_3split_low)
## 
## Model fitted: Weibull (type 2) (4 parms)
## 
## Parameter estimates:
## 
##                  Estimate  Std. Error t-value   p-value    
## b:(Intercept) -12.0693043   0.4732119 -25.505 < 2.2e-16 ***
## c:(Intercept)   0.0728774   0.0038346  19.005 < 2.2e-16 ***
## d:(Intercept)   0.9804786   0.0035126 279.135 < 2.2e-16 ***
## e:(Intercept)  44.2647902   0.1135436 389.848 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error:
## 
##  0.06744523 (916 degrees of freedom)
confint(fit_drm_risk_3split_high)
##                     2.5 %      97.5 %
## b:(Intercept) -11.5126240 -10.2261199
## c:(Intercept)   0.1685968   0.1816891
## d:(Intercept)   0.9705815   0.9811834
## e:(Intercept)  46.6558551  47.0679529
confint(fit_drm_risk_3split_mid)
##                     2.5 %      97.5 %
## b:(Intercept) -11.8414388 -10.2872806
## c:(Intercept)   0.1219274   0.1368923
## d:(Intercept)   0.9535440   0.9663282
## e:(Intercept)  45.6820397  46.1475904
confint(fit_drm_risk_3split_low)
##                      2.5 %       97.5 %
## b:(Intercept) -12.99800978 -11.14059877
## c:(Intercept)   0.06535177   0.08040294
## d:(Intercept)   0.97358495   0.98737217
## e:(Intercept)  44.04195432  44.48762611
# b is not itself a slope, but a shape parameter; the mid-decline slope is recovered below as -b * (-d / (4 * e))
slopes_3split <- as_tibble(coef(fit_drm_risk_3split_high), rownames = "vars") %>%
  bind_cols(BWQS_risk = "high") %>%
  bind_rows(as_tibble(coef(fit_drm_risk_3split_mid), rownames = "vars") %>% bind_cols(BWQS_risk = "mid")) %>%
  bind_rows(as_tibble(coef(fit_drm_risk_3split_low), rownames = "vars") %>% bind_cols(BWQS_risk = "low")) %>%
  mutate(vars = str_remove(vars, fixed(":(Intercept)"))) %>%
  spread(., vars, value) %>%
  mutate(slope = -b*(-d/(4*e))) 

as_tibble(confint(fit_drm_risk_3split_high), rownames = "vars") %>%
  bind_cols(BWQS_risk = "high") %>%
  bind_rows(as_tibble(confint(fit_drm_risk_3split_mid), rownames = "vars") %>% bind_cols(BWQS_risk = "mid")) %>%
  bind_rows(as_tibble(confint(fit_drm_risk_3split_low), rownames = "vars") %>% bind_cols(BWQS_risk = "low")) %>%
  mutate(vars = str_remove(vars, fixed(":(Intercept)"))) %>%
  filter(vars == "b") %>%
  right_join(., slopes_3split, by = "BWQS_risk") %>%
  mutate(slope_lower_ci = -`2.5 %`*(-d/(4*e)),
         slope_higher_ci = -`97.5 %`*(-d/(4*e))) %>%
  dplyr::select(BWQS_risk, slope, slope_lower_ci, slope_higher_ci)
## # A tibble: 3 x 4
##   BWQS_risk   slope slope_lower_ci slope_higher_ci
##   <chr>       <dbl>          <dbl>           <dbl>
## 1 high      -0.0566        -0.0599         -0.0532
## 2 mid       -0.0578        -0.0619         -0.0538
## 3 low       -0.0668        -0.0720         -0.0617
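The slope formula above is the conventional mid-decline approximation for a four-parameter sigmoid (maximum slope ≈ rate × range / 4, with effective rate b/e near the inflection). As a parameterization-free cross-check, the steepest daily decline can also be read directly off the fitted curve by numerical differentiation. This is a minimal sketch, not part of the original analysis; it assumes fit_drm_risk_3split_high and Subway_BWQS_3split_df from above, and it need not agree exactly with the analytic approximation:

# Profile the fitted high-risk curve on a fine time grid and locate the
# steepest (most negative) first-difference slope.
time_grid <- seq(min(Subway_BWQS_3split_df$time_index),
                 max(Subway_BWQS_3split_df$time_index), length.out = 2000)
curve_hat <- predict(fit_drm_risk_3split_high,
                     newdata = data.frame(time_index = time_grid, Risk = "High"))
dydx <- diff(curve_hat) / diff(time_grid)  # first differences approximate f'(t)
min(dydx)                  # steepest daily change in relative ridership
time_grid[which.min(dydx)] # day index at which the decline is fastest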
fit_drm_predictions_3split <- as_tibble(withCallingHandlers(predict(fit_drm_risk_3split_high, interval = "confidence"), warning = handler) %>% 
                                   bind_cols(., Risk = "High")) %>%
  bind_rows(., as_tibble(withCallingHandlers(predict(fit_drm_risk_3split_mid, interval = "confidence"), 
                                             warning = handler ) %>% bind_cols(., Risk = "Mid"))) %>%
  bind_rows(., as_tibble(withCallingHandlers(predict(fit_drm_risk_3split_low, interval = "confidence"), 
                                             warning = handler ) %>% bind_cols(., Risk = "Low")))

Subway_BWQS_3split_df1 <- bind_cols(Subway_BWQS_3split_df %>% arrange(Risk, date), 
                             fit_drm_predictions_3split %>% dplyr::select(-Risk)) 


Subway_BWQS_3split_df2 <- Subway_BWQS_3split_df1 %>%
  filter(date>"2020-02-16") %>% # subsetting for visualization
  mutate(Risk = factor(if_else(Risk == "High", "High (≥ 75%ile)", 
                        if_else(Risk=="Mid", "Mid (IQR)", "Low (≤ 25%ile)")),
                       levels = c("High (≥ 75%ile)", "Mid (IQR)", "Low (≤ 25%ile)")))

# Supplementary Figure 7
write_csv(Subway_BWQS_3split_df2 %>% dplyr::select(date, usage.median.ratio, Risk, Lower, Upper, Prediction), file.path(source_path, "sfig7.csv"))
## Adding missing grouping variables: `place`
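(The "Adding missing grouping variables" message is benign: Subway_BWQS_3split_df2 is still grouped by place, so dplyr::select() silently retains the grouping column in the exported CSV.)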
sfig7 <- ggplot() + 
  geom_jitter(data = Subway_BWQS_3split_df2, aes(x = date, y = usage.median.ratio, color = as.factor(Risk)), alpha = .5, position = position_jitter(height = 0, width = 0.4))+ 
  geom_ribbon(data = subset(Subway_BWQS_3split_df2, Risk == "High (≥ 75%ile)"), aes(x = date, ymin = Lower, ymax = Upper), fill = "grey50") +
  geom_ribbon(data = subset(Subway_BWQS_3split_df2, Risk == "Mid (IQR)"), aes(x = date, ymin = Lower, ymax = Upper), fill = "grey50") +
  geom_ribbon(data = subset(Subway_BWQS_3split_df2, Risk == "Low (≤ 25%ile)"), aes(x = date, ymin = Lower, ymax = Upper), fill = "grey50") +
  geom_line(data = Subway_BWQS_3split_df2, aes(x = date, y = Prediction, color = as.factor(Risk))) +
  scale_x_date("Date", date_minor_breaks = "1 week") +
  scale_y_continuous("Relative Subway Ridership (%)", labels = scales::percent) + 
  geom_vline(xintercept = date("2020-03-22"),color = "grey30", lty = 2) + 
  theme_bw(base_size = 16) +
  labs(colour="COVID-19 inequity index") +
  theme(legend.title = element_text(face = "bold", size = 12), legend.position = c(0.8, 0.7))  
if(export.figs){ 
  write_figs(sfig7, "sfig7", fig_width = 8, fig_height = 6)
}
## Wrote /home/rushj03/dev/COVID_19_admin_disparities/figures/sfig7_2021-06-17.png

Supp. Fig. 7

# ZCTA level BWQS for subway 
Subway_ridership_by_ZCTA <- relative.subway.usage(2020, by = "zcta")
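# As the column name suggests, usage.median.ratio gives each day's ridership
# relative to the area's baseline median usage.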

# There are some clear high outliers that need to be removed at the ZCTA level
# Remove the ZCTA-days listed below (April 2020 weekend dates for Bronx ZCTA 10469)

high_april_bronx_subway <- tibble(date = as.Date(c("2020-04-04", "2020-04-05", "2020-04-11", "2020-04-12", "2020-04-18", "2020-04-19", "2020-04-25", "2020-04-26")),
                                  place = rep("10469", 8))

Subway_BWQS_ZCTA_df <- Subway_ridership_by_ZCTA %>%
  left_join(., ZCTA_BWQS_score, by = c("place" = "zcta")) %>%
  filter(!is.na(place) & date >= "2020-01-29" & date <= "2020-04-30") %>%
  group_by(place) %>%
  arrange(date) %>%
  mutate(time_index = time(date)) %>%
  ungroup() %>%
  filter(!is.na(BWQS_index) & date != "2020-02-17") %>% #removing Presidents' Day national holiday 
  anti_join(., high_april_bronx_subway, by = c("date", "place")) %>% # removing high weekend outliers for ZCTA 10469 (defined above)
  mutate(Risk = factor(if_else(BWQS_index < quantile(BWQS_index, probs = .25), "Low",
                              if_else(BWQS_index > quantile(BWQS_index, probs = .75), "High", "Mid")),
                       levels = c("High", "Mid", "Low")))

fit_drm_risk_3split_zcta_high <- drm(usage.median.ratio ~ time_index, curveid = Risk, fct = W2.4(), data = Subway_BWQS_ZCTA_df, subset = Risk=="High")
fit_drm_risk_3split_zcta_mid <- drm(usage.median.ratio ~ time_index, curveid = Risk, fct = W2.4(), data = Subway_BWQS_ZCTA_df, subset = Risk=="Mid")
fit_drm_risk_3split_zcta_low <- drm(usage.median.ratio ~ time_index, curveid = Risk, fct = W2.4(), data = Subway_BWQS_ZCTA_df, subset = Risk=="Low")


summary(fit_drm_risk_3split_zcta_high)
## 
## Model fitted: Weibull (type 2) (4 parms)
## 
## Parameter estimates:
## 
##                  Estimate  Std. Error t-value   p-value    
## b:(Intercept) -10.9339014   0.3239343 -33.754 < 2.2e-16 ***
## c:(Intercept)   0.1657780   0.0032882  50.416 < 2.2e-16 ***
## d:(Intercept)   0.9645988   0.0026566 363.090 < 2.2e-16 ***
## e:(Intercept)  46.8781965   0.1028828 455.647 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error:
## 
##  0.09138835 (2748 degrees of freedom)
summary(fit_drm_risk_3split_zcta_mid)
## 
## Model fitted: Weibull (type 2) (4 parms)
## 
## Parameter estimates:
## 
##                  Estimate  Std. Error t-value   p-value    
## b:(Intercept) -11.3144667   0.3824104 -29.587 < 2.2e-16 ***
## c:(Intercept)   0.1357604   0.0035994  37.718 < 2.2e-16 ***
## d:(Intercept)   0.9779780   0.0030449 321.182 < 2.2e-16 ***
## e:(Intercept)  45.9310266   0.1089925 421.415 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error:
## 
##  0.1489131 (5700 degrees of freedom)
summary(fit_drm_risk_3split_zcta_low)
## 
## Model fitted: Weibull (type 2) (4 parms)
## 
## Parameter estimates:
## 
##                  Estimate  Std. Error t-value   p-value    
## b:(Intercept) -11.8041674   0.3454714 -34.168 < 2.2e-16 ***
## c:(Intercept)   0.0619249   0.0028874  21.447 < 2.2e-16 ***
## d:(Intercept)   0.9885724   0.0026769 369.299 < 2.2e-16 ***
## e:(Intercept)  43.9325609   0.0871976 503.827 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error:
## 
##  0.08812425 (2756 degrees of freedom)
# b is not itself a slope, but a shape parameter; the mid-decline slope is recovered below as -b * (-d / (4 * e))
slopes_zcta <- as_tibble(coef(fit_drm_risk_3split_zcta_high), rownames = "vars") %>%
  bind_cols(BWQS_risk = "high") %>%
  bind_rows(as_tibble(coef(fit_drm_risk_3split_zcta_mid), rownames = "vars") %>% bind_cols(BWQS_risk = "mid")) %>%
  bind_rows(as_tibble(coef(fit_drm_risk_3split_zcta_low), rownames = "vars") %>% bind_cols(BWQS_risk = "low")) %>%
  mutate(vars = str_remove(vars, fixed(":(Intercept)"))) %>%
  spread(., vars, value) %>%
  mutate(slope = -b*(-d/(4*e))) 

as_tibble(confint(fit_drm_risk_3split_zcta_high), rownames = "vars") %>%
  bind_cols(BWQS_risk = "high") %>%
  bind_rows(as_tibble(confint(fit_drm_risk_3split_zcta_mid), rownames = "vars") %>% bind_cols(BWQS_risk = "mid")) %>%
  bind_rows(as_tibble(confint(fit_drm_risk_3split_zcta_low), rownames = "vars") %>% bind_cols(BWQS_risk = "low")) %>%
  mutate(vars = str_remove(vars, fixed(":(Intercept)"))) %>%
  filter(vars == "b") %>%
  right_join(., slopes_zcta, by = "BWQS_risk") %>%
  mutate(slope_lower_ci = -`2.5 %`*(-d/(4*e)),
         slope_higher_ci = -`97.5 %`*(-d/(4*e))) %>%
  dplyr::select(BWQS_risk, slope, slope_lower_ci, slope_higher_ci)
## # A tibble: 3 x 4
##   BWQS_risk   slope slope_lower_ci slope_higher_ci
##   <chr>       <dbl>          <dbl>           <dbl>
## 1 high      -0.0562        -0.0595         -0.0530
## 2 mid       -0.0602        -0.0642         -0.0562
## 3 low       -0.0664        -0.0702         -0.0626
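At the ZCTA level the ordering matches the UHF-level results above: relative ridership fell fastest in low-index areas (slope ≈ -0.066 per day) and most slowly in high-index areas (≈ -0.056 per day).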
fit_drm_predictions_zcta <- as_tibble(withCallingHandlers(predict(fit_drm_risk_3split_zcta_high, interval = "confidence"), warning = handler) %>% 
                                   bind_cols(., Risk = "High")) %>%
  bind_rows(., as_tibble(withCallingHandlers(predict(fit_drm_risk_3split_zcta_mid, interval = "confidence"), 
                                             warning = handler ) %>% bind_cols(., Risk = "Mid"))) %>%
  bind_rows(., as_tibble(withCallingHandlers(predict(fit_drm_risk_3split_zcta_low, interval = "confidence"), 
                                             warning = handler ) %>% bind_cols(., Risk = "Low"))) # %>% mutate(time_index = row_number()) 

Subway_BWQS_ZCTA_df1 <- bind_cols(Subway_BWQS_ZCTA_df %>% arrange(Risk, date), 
                             fit_drm_predictions_zcta %>% dplyr::select(-Risk)) 


Subway_BWQS_ZCTA_df2 <- Subway_BWQS_ZCTA_df1 %>%
  filter(date>"2020-02-16") %>% # subsetting for visualization
  mutate(Risk = factor(if_else(Risk == "High", "High (≥ 75%ile)", 
                        if_else(Risk=="Mid", "Mid (IQR)", "Low (≤ 25%ile)")), 
                       levels = c("High (≥ 75%ile)", "Mid (IQR)", "Low (≤ 25%ile)")))

# Supplementary Figure 8
write_csv(Subway_BWQS_ZCTA_df2 %>% dplyr::select(date, usage.median.ratio, Lower, Upper, Prediction, Risk), file.path(source_path, "sfig8.csv"))
sfig8 <- ggplot() + 
  geom_jitter(data = Subway_BWQS_ZCTA_df2, aes(x = date, y = usage.median.ratio, color = Risk), size = 0.3, alpha = 0.3, position = position_jitter(height = 0, width = 0.42))+ 
  geom_ribbon(data = subset(Subway_BWQS_ZCTA_df2, Risk == "High (≥ 75%ile)"), aes(x = date, ymin = Lower, ymax = Upper), fill = "grey50") +
  geom_ribbon(data = subset(Subway_BWQS_ZCTA_df2, Risk == "Mid (IQR)"), aes(x = date, ymin = Lower, ymax = Upper), fill = "grey50") +
  geom_ribbon(data = subset(Subway_BWQS_ZCTA_df2, Risk == "Low (≤ 25%ile)"), aes(x = date, ymin = Lower, ymax = Upper), fill = "grey50") +
  geom_line(data = Subway_BWQS_ZCTA_df2, aes(x = date, y = Prediction, color = Risk)) +
  scale_x_date("Date", date_minor_breaks = "1 week") + 
  scale_y_continuous("Relative Subway Ridership (%)", labels = scales::percent) + 
  geom_vline(xintercept = date("2020-03-22"),color = "grey30", lty = 2) + 
  theme_bw(base_size = 16) +
  labs(colour="COVID-19 inequity index") +
  theme(legend.title = element_text(face = "bold", size = 12), legend.position = c(0.8, 0.7))  
if(export.figs){ 
  write_figs(sfig8, "sfig8", fig_width = 8, fig_height = 6)
}
## Wrote /home/rushj03/dev/COVID_19_admin_disparities/figures/sfig8_2021-06-17.png

Supp. Fig. 8

# Supplementary Figure 9: compare MTA turnstile data to Google mobility reports
source(here("code/mta_vs_google.R"), echo = FALSE)
write_csv(comp_long, file.path(source_path, "sfig9.csv"))
mobplot

Appendix

#### Appendix #### 
t_final = Sys.time()

Runtimes

## [1] "t_start:        2021-06-17 10:16:03"
## [1] "t_turnstile_1:  2021-06-17 10:16:10"
## [1] "t_turnstile_2:  2021-06-17 10:16:22"
## [1] "t_census:       2021-06-17 10:16:48"
## [1] "t_dataprep:     2021-06-17 10:19:58"
## [1] "t_m1:           2021-06-17 10:19:58"
## [1] "t_postm1:       2021-06-17 10:20:08"
## [1] "t_m2:           2021-06-17 10:20:12"
## [1] "t_m2_3q:        2021-06-17 10:20:16"
## [1] "t_final:        2021-06-17 10:21:39"

Time part 1: Script start through loading of MTA Turnstile package:

print(t_turnstile_1 - t_start)
## Time difference of 7.07228 secs

Time part 2: Downloading (if this is the first run) and processing of subway turnstile data by neighborhood:
This run used cached downloads of MTA turnstile data. The time to run this section including downloads is around 14 minutes.

print(t_turnstile_2 - t_turnstile_1)
## Time difference of 12.21892 secs

Time part 3: Downloading of all other data and processing of Census data:
This run used cached downloads of data. The time to run this section including downloads is around 5 minutes.

print(t_census - t_turnstile_2)
## Time difference of 25.54943 secs

Time part 4: All other data processing, modeling, and figure generation:
This run used cached model output for at least one of the three BWQS models. The time to run this section without any cached model output is around 35 minutes.

print(t_final - t_census)
## Time difference of 4.842822 mins

Total runtime:

print(t_final - t_start)
## Time difference of 5.590166 mins

Session Info

sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────────────────────────
##  setting  value                       
##  version  R version 4.0.2 (2020-06-22)
##  os       Ubuntu 18.04.5 LTS          
##  system   x86_64, linux-gnu           
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       America/New_York            
##  date     2021-06-17                  
## 
## ─ Packages ───────────────────────────────────────────────────────────────────────────────────────
##  ! package        * version    date       lib source                                 
##  P abind          * 1.4-5      2016-07-21 [?] CRAN (R 4.0.2)                         
##  P acepack          1.4.1      2016-10-29 [?] CRAN (R 4.0.2)                         
##  P askpass          1.1        2019-01-13 [?] CRAN (R 4.0.2)                         
##  P assertthat       0.2.1      2019-03-21 [?] CRAN (R 4.0.2)                         
##  P backports        1.1.8      2020-06-17 [?] CRAN (R 4.0.2)                         
##  P base64enc        0.1-3      2015-07-28 [?] CRAN (R 4.0.2)                         
##  P BBmisc           1.11       2017-03-10 [?] CRAN (R 4.0.2)                         
##  P bit              4.0.4      2020-08-04 [?] CRAN (R 4.0.2)                         
##  P bit64            4.0.5      2020-08-30 [?] CRAN (R 4.0.2)                         
##  P bitops           1.0-6      2013-08-17 [?] CRAN (R 4.0.2)                         
##  P blob             1.2.1      2020-01-20 [?] CRAN (R 4.0.2)                         
##  P boot             1.3-25     2020-04-26 [?] CRAN (R 4.0.0)                         
##  P broom          * 0.5.6      2020-04-20 [?] CRAN (R 4.0.2)                         
##  P callr            3.4.3      2020-03-28 [?] CRAN (R 4.0.2)                         
##  P car              3.0-8      2020-05-21 [?] CRAN (R 4.0.2)                         
##  P carData          3.0-4      2020-05-22 [?] CRAN (R 4.0.2)                         
##  P cellranger       1.1.0      2016-07-27 [?] CRAN (R 4.0.2)                         
##  P checkmate        2.0.0      2020-02-06 [?] CRAN (R 4.0.2)                         
##  P class            7.3-17     2020-04-26 [?] CRAN (R 4.0.0)                         
##  P classInt         0.4-3      2020-04-07 [?] CRAN (R 4.0.2)                         
##  P cli              2.0.2      2020-02-28 [?] CRAN (R 4.0.2)                         
##  P cluster          2.1.0      2019-06-19 [?] CRAN (R 4.0.0)                         
##  P coda             0.19-3     2019-07-05 [?] CRAN (R 4.0.2)                         
##  P codetools        0.2-16     2018-12-24 [?] CRAN (R 4.0.0)                         
##  P colorspace       1.4-1      2019-03-18 [?] CRAN (R 4.0.2)                         
##  P corrplot       * 0.84       2017-10-16 [?] CRAN (R 4.0.2)                         
##  P crayon           1.3.4      2017-09-16 [?] CRAN (R 4.0.2)                         
##  P curl             4.3        2019-12-02 [?] CRAN (R 4.0.2)                         
##  P data.table     * 1.13.0     2020-07-24 [?] CRAN (R 4.0.2)                         
##  P DBI              1.1.0      2019-12-15 [?] CRAN (R 4.0.2)                         
##  P dbplyr           1.4.4      2020-05-27 [?] CRAN (R 4.0.2)                         
##  P deldir           0.1-25     2020-02-03 [?] CRAN (R 4.0.2)                         
##  P DHARMa         * 0.3.3.0    2020-09-08 [?] CRAN (R 4.0.2)                         
##  P digest           0.6.25     2020-02-23 [?] CRAN (R 4.0.2)                         
##  P dplyr          * 1.0.0      2020-05-29 [?] CRAN (R 4.0.2)                         
##  P drc            * 3.0-1      2016-08-30 [?] CRAN (R 4.0.2)                         
##  P e1071            1.7-3      2019-11-26 [?] CRAN (R 4.0.2)                         
##  P egg            * 0.4.5      2019-07-13 [?] CRAN (R 4.0.2)                         
##  P ellipsis         0.3.1      2020-05-15 [?] CRAN (R 4.0.2)                         
##  P evaluate         0.14       2019-05-28 [?] CRAN (R 4.0.2)                         
##  P expm             0.999-4    2019-03-21 [?] CRAN (R 4.0.2)                         
##  P fansi            0.4.1      2020-01-08 [?] CRAN (R 4.0.2)                         
##  P farver           2.0.3      2020-01-16 [?] CRAN (R 4.0.2)                         
##  P fastmap          1.0.1      2019-10-08 [?] CRAN (R 4.0.2)                         
##  P fastmatch        1.1-0      2017-01-28 [?] CRAN (R 4.0.2)                         
##  P forcats        * 0.5.0      2020-03-01 [?] CRAN (R 4.0.2)                         
##  P foreach          1.5.0      2020-03-30 [?] CRAN (R 4.0.2)                         
##  P foreign          0.8-79     2020-04-26 [?] CRAN (R 4.0.0)                         
##  P Formula          1.2-3      2018-05-03 [?] CRAN (R 4.0.2)                         
##  P fs               1.4.2      2020-06-30 [?] CRAN (R 4.0.2)                         
##  P fst            * 0.9.2      2020-04-01 [?] CRAN (R 4.0.2)                         
##  P gap              1.2.2      2020-02-02 [?] CRAN (R 4.0.2)                         
##  P gdata            2.18.0     2017-06-06 [?] CRAN (R 4.0.2)                         
##  P generics         0.0.2      2018-11-29 [?] CRAN (R 4.0.2)                         
##  P ggExtra        * 0.9        2019-08-27 [?] CRAN (R 4.0.2)                         
##  P ggmap            3.0.0      2019-02-05 [?] CRAN (R 4.0.2)                         
##  P ggplot2        * 3.3.2      2020-06-19 [?] CRAN (R 4.0.2)                         
##  P ggpubr         * 0.4.0      2020-06-27 [?] CRAN (R 4.0.2)                         
##  P ggridges       * 0.5.2      2020-01-12 [?] CRAN (R 4.0.2)                         
##  P ggsignif         0.6.0      2019-08-08 [?] CRAN (R 4.0.2)                         
##  P ggsn           * 0.5.0      2019-02-18 [?] CRAN (R 4.0.2)                         
##  P glue             1.4.2      2020-08-27 [?] CRAN (R 4.0.2)                         
##  P gmodels          2.18.1     2018-06-25 [?] CRAN (R 4.0.2)                         
##  P gridExtra      * 2.3        2017-09-09 [?] CRAN (R 4.0.2)                         
##  P gtable           0.3.0      2019-03-25 [?] CRAN (R 4.0.2)                         
##  P gtools           3.8.2      2020-03-31 [?] CRAN (R 4.0.2)                         
##  P haven            2.3.1      2020-06-01 [?] CRAN (R 4.0.2)                         
##  P here           * 0.1        2017-05-28 [?] CRAN (R 4.0.2)                         
##  P highr            0.8        2019-03-20 [?] CRAN (R 4.0.2)                         
##  P Hmisc            4.4-0      2020-03-23 [?] CRAN (R 4.0.2)                         
##  P hms              0.5.3      2020-01-08 [?] CRAN (R 4.0.2)                         
##  P htmlTable        2.0.0      2020-06-21 [?] CRAN (R 4.0.2)                         
##  P htmltools        0.5.0      2020-06-16 [?] CRAN (R 4.0.2)                         
##  P htmlwidgets      1.5.1      2019-10-08 [?] CRAN (R 4.0.2)                         
##  P httpuv           1.5.4      2020-06-06 [?] CRAN (R 4.0.2)                         
##  P httr           * 1.4.1      2019-08-05 [?] CRAN (R 4.0.2)                         
##  P inline           0.3.15     2018-05-18 [?] CRAN (R 4.0.2)                         
##  P iterators        1.0.12     2019-07-26 [?] CRAN (R 4.0.2)                         
##  P jpeg             0.1-8.1    2019-10-24 [?] CRAN (R 4.0.2)                         
##  P jsonlite       * 1.7.0      2020-06-25 [?] CRAN (R 4.0.2)                         
##  P Just.universal * 0.0.0.9000 2020-11-24 [?] github (justlab/Just_universal@45762c3)
##  P kableExtra     * 1.3.1      2020-10-22 [?] CRAN (R 4.0.2)                         
##  P KernSmooth       2.23-17    2020-04-26 [?] CRAN (R 4.0.0)                         
##  P knitr            1.29       2020-06-23 [?] CRAN (R 4.0.2)                         
##  P labeling         0.3        2014-08-23 [?] CRAN (R 4.0.2)                         
##  P later            1.1.0.1    2020-06-05 [?] CRAN (R 4.0.2)                         
##  P lattice          0.20-41    2020-04-02 [?] CRAN (R 4.0.0)                         
##  P latticeExtra     0.6-29     2019-12-19 [?] CRAN (R 4.0.2)                         
##  P LearnBayes       2.15.1     2018-03-18 [?] CRAN (R 4.0.2)                         
##  P lifecycle        0.2.0      2020-03-06 [?] CRAN (R 4.0.2)                         
##  P lme4             1.1-23     2020-04-07 [?] CRAN (R 4.0.2)                         
##  P loo              2.2.0      2019-12-19 [?] CRAN (R 4.0.2)                         
##  P lubridate      * 1.7.9      2020-06-08 [?] CRAN (R 4.0.2)                         
##  P lwgeom         * 0.2-5      2020-06-12 [?] CRAN (R 4.0.2)                         
##  P magic          * 1.5-9      2018-09-17 [?] CRAN (R 4.0.2)                         
##  P magrittr         1.5        2014-11-22 [?] CRAN (R 4.0.2)                         
##  P maptools         1.0-1      2020-05-14 [?] CRAN (R 4.0.2)                         
##  P MASS           * 7.3-51.6   2020-04-26 [?] CRAN (R 4.0.0)                         
##  P Matrix         * 1.2-18     2019-11-27 [?] CRAN (R 4.0.0)                         
##  P matrixStats    * 0.56.0     2020-03-13 [?] CRAN (R 4.0.2)                         
##  P memoise          1.1.0      2017-04-21 [?] CRAN (R 4.0.2)                         
##  P mgcv             1.8-31     2019-11-09 [?] CRAN (R 4.0.0)                         
##  P mime             0.9        2020-02-04 [?] CRAN (R 4.0.2)                         
##  P miniUI           0.1.1.1    2018-05-18 [?] CRAN (R 4.0.2)                         
##  P minqa            1.2.4      2014-10-09 [?] CRAN (R 4.0.2)                         
##  P modelr           0.1.8      2020-05-19 [?] CRAN (R 4.0.2)                         
##  P MTA.turnstile  * 0.0.0.9001 2020-11-24 [?] Github (justlab/MTA_turnstile@544eba3) 
##  P multcomp         1.4-13     2020-04-08 [?] CRAN (R 4.0.2)                         
##  P munsell          0.5.0      2018-06-12 [?] CRAN (R 4.0.2)                         
##  P mvtnorm          1.1-1      2020-06-09 [?] CRAN (R 4.0.2)                         
##  P nlme             3.1-147    2020-04-13 [?] CRAN (R 4.0.0)                         
##  P nloptr           1.2.2.1    2020-03-11 [?] CRAN (R 4.0.2)                         
##  P nnet             7.3-14     2020-04-26 [?] CRAN (R 4.0.0)                         
##  P openxlsx         4.1.5      2020-05-06 [?] CRAN (R 4.0.2)                         
##  P ParamHelpers     1.14       2020-03-24 [?] CRAN (R 4.0.2)                         
##  P pdftools       * 2.3.1      2020-05-22 [?] CRAN (R 4.0.2)                         
##  P pillar           1.4.4      2020-05-05 [?] CRAN (R 4.0.2)                         
##  P pkgbuild         1.0.8      2020-05-07 [?] CRAN (R 4.0.2)                         
##  P pkgconfig        2.0.3      2019-09-22 [?] CRAN (R 4.0.2)                         
##  P plotrix          3.7-8      2020-04-16 [?] CRAN (R 4.0.2)                         
##  P plyr             1.8.6      2020-03-03 [?] CRAN (R 4.0.2)                         
##  P png              0.1-7      2013-12-03 [?] CRAN (R 4.0.2)                         
##  P prettyunits      1.1.1      2020-01-24 [?] CRAN (R 4.0.2)                         
##  P processx         3.4.2      2020-02-09 [?] CRAN (R 4.0.2)                         
##  P promises         1.1.1      2020-06-09 [?] CRAN (R 4.0.2)                         
##  P ps               1.3.3      2020-05-08 [?] CRAN (R 4.0.2)                         
##  P purrr          * 0.3.4      2020-04-17 [?] CRAN (R 4.0.2)                         
##  P qpdf             1.1        2019-03-07 [?] CRAN (R 4.0.2)                         
##  P qs             * 0.22.1     2020-06-10 [?] CRAN (R 4.0.2)                         
##  P R6               2.4.1      2019-11-12 [?] CRAN (R 4.0.2)                         
##  P ragg           * 0.4.0      2020-10-05 [?] CRAN (R 4.0.2)                         
##  P RApiSerialize    0.1.0      2014-04-19 [?] CRAN (R 4.0.2)                         
##  P rappdirs         0.3.1      2016-03-28 [?] CRAN (R 4.0.2)                         
##  P raster           3.3-7      2020-06-27 [?] CRAN (R 4.0.2)                         
##  P RColorBrewer     1.1-2      2014-12-07 [?] CRAN (R 4.0.2)                         
##  P Rcpp             1.0.4.6    2020-04-09 [?] CRAN (R 4.0.2)                         
##  P RcppParallel     5.0.2      2020-06-24 [?] CRAN (R 4.0.2)                         
##  P readr          * 1.3.1      2018-12-21 [?] CRAN (R 4.0.2)                         
##  P readxl         * 1.3.1      2019-03-13 [?] CRAN (R 4.0.2)                         
##    renv             0.12.2     2020-11-04 [1] CRAN (R 4.0.2)                         
##  P reprex           0.3.0      2019-05-16 [?] CRAN (R 4.0.2)                         
##  P rgdal            1.5-12     2020-06-26 [?] CRAN (R 4.0.2)                         
##  P RgoogleMaps      1.4.5.3    2020-02-12 [?] CRAN (R 4.0.2)                         
##  P rio              0.5.16     2018-11-26 [?] CRAN (R 4.0.2)                         
##  P rjson            0.2.20     2018-06-08 [?] CRAN (R 4.0.2)                         
##  P rlang            0.4.6      2020-05-02 [?] CRAN (R 4.0.2)                         
##  P rmarkdown        2.5        2020-10-21 [?] CRAN (R 4.0.2)                         
##  P rpart            4.1-15     2019-04-12 [?] CRAN (R 4.0.0)                         
##  P rprojroot        1.3-2      2018-01-03 [?] CRAN (R 4.0.2)                         
##  P RSQLite        * 2.2.1      2020-09-30 [?] CRAN (R 4.0.2)                         
##  P rstan          * 2.19.3     2020-02-11 [?] CRAN (R 4.0.2)                         
##  P rstatix          0.6.0      2020-06-18 [?] CRAN (R 4.0.2)                         
##  P rstudioapi       0.11       2020-02-07 [?] CRAN (R 4.0.2)                         
##  P rvest            0.3.5      2019-11-08 [?] CRAN (R 4.0.2)                         
##  P sandwich         2.5-1      2019-04-06 [?] CRAN (R 4.0.2)                         
##  P scales         * 1.1.1      2020-05-11 [?] CRAN (R 4.0.2)                         
##  P sessioninfo      1.1.1      2018-11-05 [?] CRAN (R 4.0.2)                         
##  P sf             * 0.9-6      2020-09-13 [?] CRAN (R 4.0.2)                         
##  P shiny            1.5.0      2020-06-23 [?] CRAN (R 4.0.2)                         
##  P sp             * 1.4-2      2020-05-20 [?] CRAN (R 4.0.2)                         
##  P spatialreg     * 1.1-5      2019-12-01 [?] CRAN (R 4.0.2)                         
##  P spData         * 0.3.5      2020-04-06 [?] CRAN (R 4.0.2)                         
##  P spdep          * 1.1-5      2020-06-29 [?] CRAN (R 4.0.2)                         
##  P StanHeaders    * 2.21.0-6   2020-08-16 [?] CRAN (R 4.0.2)                         
##  P statmod          1.4.34     2020-02-17 [?] CRAN (R 4.0.2)                         
##  P stringfish       0.12.1     2020-06-05 [?] CRAN (R 4.0.2)                         
##  P stringi          1.5.3      2020-09-09 [?] CRAN (R 4.0.2)                         
##  P stringr        * 1.4.0      2019-02-10 [?] CRAN (R 4.0.2)                         
##  P survival         3.1-12     2020-04-10 [?] CRAN (R 4.0.0)                         
##  P systemfonts      0.3.2      2020-09-29 [?] CRAN (R 4.0.2)                         
##  P textshaping      0.1.2      2020-10-08 [?] CRAN (R 4.0.2)                         
##  P TH.data          1.0-10     2019-01-21 [?] CRAN (R 4.0.2)                         
##  P tibble         * 3.0.1      2020-04-20 [?] CRAN (R 4.0.2)                         
##  P tidycensus     * 0.10.2     2020-09-28 [?] CRAN (R 4.0.2)                         
##  P tidyr          * 1.1.0      2020-05-20 [?] CRAN (R 4.0.2)                         
##  P tidyselect       1.1.0      2020-05-11 [?] CRAN (R 4.0.2)                         
##  P tidyverse      * 1.3.0      2019-11-21 [?] CRAN (R 4.0.2)                         
##  P tigris           0.9.4      2020-04-01 [?] CRAN (R 4.0.2)                         
##  P units            0.6-7      2020-06-13 [?] CRAN (R 4.0.2)                         
##  P utf8             1.1.4      2018-05-24 [?] CRAN (R 4.0.2)                         
##  P uuid             0.1-4      2020-02-26 [?] CRAN (R 4.0.2)                         
##  P vctrs            0.3.1      2020-06-05 [?] CRAN (R 4.0.2)                         
##  P viridisLite      0.3.0      2018-02-01 [?] CRAN (R 4.0.2)                         
##  P webshot          0.5.2      2019-11-22 [?] CRAN (R 4.0.2)                         
##  P withr            2.2.0      2020-04-20 [?] CRAN (R 4.0.2)                         
##  P xfun             0.15       2020-06-21 [?] CRAN (R 4.0.2)                         
##  P xgboost          1.1.1.1    2020-06-14 [?] CRAN (R 4.0.2)                         
##  P xml2             1.3.2      2020-04-23 [?] CRAN (R 4.0.2)                         
##  P xtable           1.8-4      2019-04-21 [?] CRAN (R 4.0.2)                         
##  P yaml             2.2.1      2020-02-01 [?] CRAN (R 4.0.2)                         
##  P zip            * 2.0.4      2019-09-01 [?] CRAN (R 4.0.2)                         
##  P zoo              1.8-8      2020-05-02 [?] CRAN (R 4.0.2)                         
## 
## [1] /home/rushj03/dev/COVID_19_admin_disparities/renv/library/R-4.0/x86_64-pc-linux-gnu
## [2] /data/scratch/rtemp/RtmpeGn2vn/renv-system-library
## 
##  P ── Loaded and on-disk path mismatch.