# CoVID-19 county death analysis with ABSMs # downloaded from The Public Health Disparities Geocoding Project # https://www.hsph.harvard.edu/thegeocodingproject/covid-19-resources/ # Code replicates analyses from # Chen JT, Krieger N. Revealing the unequal burden of COVID-19 by income, race/ethnicity, # and household crowding: US county vs ZIP code analyses. Harvard Center for Population and Development # Studies Working Paper Series, Volume 19, Number 1. April 21, 2020. # updated: May 5, 2020 # Jarvis T. Chen, ScD # 15 May 2020 library(tidycensus) library(tidyverse) library(dplyr) library(Hmisc) # setwd("~/") # census_api_key() # get pop denominator estimates raw.acstotal <- get_acs(geography="county", variables=c("B01003_001E", "B02001_001E", "B02001_002E", "B02001_003E", "B02001_004E", "B02001_005E", "B02001_006E", "B02001_007E", "B02001_008E", "B02001_009E", "B02001_010E", "B03001_001E", "B03001_003E", "B01001H_001E"), year=2018, output="wide", cache_table=T) %>% mutate(percBlack = B02001_003E/B02001_001E, percHisp = B03001_003E/B03001_001E, pop_total = B01003_001E, pop_white = B02001_002E, pop_black = B02001_003E, pop_amind = B02001_004E, pop_api = B02001_005E + B02001_006E, pop_hisp = B03001_003E, pop_wnh = B01001H_001E, percColor = (B01003_001E - B01001H_001E)/B01003_001E) %>% dplyr::select(GEOID, percBlack, percHisp, percColor, pop_total, pop_white, pop_black, pop_amind, pop_api, pop_hisp, pop_wnh) # this code is to compute ICEwbinc and poverty from ACS data raw.white <- get_acs(geography="county",table="B19001A", year=2018,output="wide", cache_table=T) raw.black <- get_acs(geography="county",table="B19001B", year=2018,output="wide", cache_table=T) raw.all <- get_acs(geography="county", table="B19001", year=2018, output="wide", cache_table=T) raw.pov <- get_acs(geography="county",table="B17001", year=2018, output="wide", cache_table=T) raw.crowd <- get_acs(geography="county",table="B25014", year=2018, output="wide", cache_table=T) d.merge <- left_join(raw.white, raw.black, by="GEOID") d.merge <- left_join(d.merge, raw.all, by="GEOID") d.merge <- left_join(d.merge, raw.pov, by="GEOID") d.merge <- left_join(d.merge, raw.crowd, by="GEOID") d.merge <- select(d.merge, c("GEOID","B19001_001E","B19001A_014E","B19001A_015E","B19001A_016E","B19001A_017E", "B19001B_002E","B19001B_003E","B19001B_004E","B19001B_005E", "B17001_001E","B17001_002E", "B25014_001E", "B25014_005E","B25014_006E","B25014_007E", "B25014_011E","B25014_012E","B25014_013E")) %>% mutate(ICEwbinc=((B19001A_014E + B19001A_015E + B19001A_016E + B19001A_017E) - (B19001B_002E + B19001B_003E + B19001B_004E + B19001B_005E))/B19001_001E, cntyPov = B17001_002E/B17001_001E, cntyCrowd = (B25014_005E + B25014_006E + B25014_007E + B25014_011E + B25014_012E + B25014_013E) / B25014_001E, cntySevereCrowd = (B25014_006E + B25014_007E + B25014_012E + B25014_013E) / B25014_001E) %>% dplyr::select(c("GEOID","ICEwbinc","cntyPov", "cntyCrowd","cntySevereCrowd")) d.acs <- left_join(raw.acstotal, d.merge, by="GEOID") %>% mutate(apINDPOV=factor(case_when(0<=cntyPov & cntyPov <0.05 ~ "0-4.9%", 0.05<=cntyPov & cntyPov<0.10 ~ "5-9.9%", 0.10<=cntyPov & cntyPov<0.15 ~ "10-14.9%", 0.15<=cntyPov & cntyPov<0.2 ~ "15-19.9%", 0.20<=cntyPov & cntyPov<=1 ~ "20-100%"), levels=c("0-4.9%","5-9.9%","10-14.9%","15-19.9%","20-100%")), qINDPOV=cut(cntyPov, wtd.quantile(cntyPov, weights=pop_total, probs=c(0,0.2,0.4,0.6,0.8,1))), qICEwbinc=cut(ICEwbinc, wtd.quantile(ICEwbinc, weights=pop_total, probs=c(0,0.2,0.4,0.6,0.8,1))), qcrowd = cut(cntyCrowd, wtd.quantile(cntyCrowd, weights=pop_total, probs=c(0,0.2,0.4,0.6,0.8,1))), qSevereCrowd = cut(cntySevereCrowd, wtd.quantile(cntySevereCrowd, weights=pop_total, probs=c(0,0.2,0.4,0.6,0.8,1))), qpercColor = cut(percColor, wtd.quantile(percColor, weights=pop_total, probs=c(0,0.2,0.4,0.6,0.8,1))), qpercBlack = cut(percBlack, wtd.quantile(percBlack, weights=pop_total, probs=c(0,0.2,0.4,0.6,0.8,1)))) %>% ungroup() # Data are available from USAfacts, including constituent counties of NYC # https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/ covid.usafacts <- read.csv("covid_deaths_usafacts_050520.csv") %>% mutate(GEOID=ifelse(countyFIPS<9999, paste0("0",countyFIPS), paste0(countyFIPS)), stfips=substr(GEOID,1,2)) %>% filter(!GEOID %in% c("00","01")) %>% select(GEOID, stfips, countyFIPS, County.Name, State, stateFIPS, X5.5.20) covid.merge <- left_join(covid.usafacts, d.acs, by="GEOID") %>% mutate(deaths=X5.5.20, deathRate = 100000*deaths/(104*pop_total/365.25)) # function to compute rates for any ABSM # f.summarize.absm takes the name of the ABSM variable and reflevel (reference group) # express rates per 100,000 person years # to do that, multiply the population count by 104 days and divide by 365.25 f.summarize.absm <- function(var, reflevel){ this.absm <- enquo(var) this.summary <- covid.merge %>% group_by(!!this.absm) %>% summarise(numcases=sum(deaths), denom=sum(pop_total)) %>% mutate(denomtime = 104*denom/365.25, eventRate = numcases/denomtime, varEventRate = numcases/denomtime^2, eventRatelo95 = eventRate - 1.96*sqrt(varEventRate), eventRateup95 = eventRate + 1.96*sqrt(varEventRate), dummyMerge=1, qval=factor(!!this.absm), absm=paste(enexpr(var)) ) ref.thisabsm <- this.summary %>% filter(!!this.absm==reflevel) %>% mutate(ref.ir = eventRate, ref.var = varEventRate) %>% select(dummyMerge, ref.ir, ref.var) %>% ungroup() that.summary <- left_join(this.summary, ref.thisabsm, by="dummyMerge") %>% mutate(IRD = eventRate - ref.ir, IRR = eventRate / ref.ir, varIRD = varEventRate + ref.var, varlogirr = (varEventRate/eventRate^2) + (ref.var/ref.ir^2), IRDlo95 = IRD - 1.96*sqrt(varIRD), IRDup95 = IRD + 1.96*sqrt(varIRD), IRRlo95 = exp(log(IRR) - 1.96*sqrt(varlogirr)), IRRup95 = exp(log(IRR) + 1.96*sqrt(varlogirr))) %>% mutate(var.log.irr=ifelse(!!this.absm==reflevel,NA,varlogirr), IRDlo95=ifelse(!!this.absm==reflevel,NA,IRDlo95), IRDup95=ifelse(!!this.absm==reflevel,NA,IRDup95), IRRlo95=ifelse(!!this.absm==reflevel,NA,IRRlo95), IRRup95=ifelse(!!this.absm==reflevel,NA,IRRup95)) %>% select(absm, qval, numcases, denom, eventRate, eventRatelo95, eventRateup95, IRD, IRDlo95, IRDup95, IRR, IRRlo95, IRRup95) return(that.summary) } covid.apINDPOV <- f.summarize.absm(apINDPOV,levels(covid.merge$apINDPOV)[1]) covid.qINDPOV <- f.summarize.absm(qINDPOV,levels(covid.merge$qINDPOV)[1]) covid.qICEwbinc <- f.summarize.absm(qICEwbinc,levels(covid.merge$qICEwbinc)[5]) covid.qcrowd <- f.summarize.absm(qcrowd,levels(covid.merge$qcrowd)[1]) covid.qpercBlack <- f.summarize.absm(qpercBlack,levels(covid.merge$qpercBlack)[1]) covid.qpercColor <- f.summarize.absm(qpercColor,levels(covid.merge$qpercColor)[1]) # write the results out to a csv file # re-express rates as rates per 100,000 all.results <- bind_rows(covid.apINDPOV, covid.qICEwbinc, covid.qcrowd, covid.qpercColor) %>% mutate(crudeRate = 100000*eventRate, crudeRatelo95 = 100000*eventRatelo95, crudeRateup95 = 100000*eventRateup95, IRD = 100000*IRD, IRDlo95 = 100000*IRDlo95, IRDup95 = 100000*IRDup95) %>% dplyr::select(-eventRate, -eventRatelo95, -eventRateup95) write.csv(all.results, "us_county_results_050520.csv") # plotting code f.absm.plot <- function(data, var, this.label){ ddata <- subset(data, !is.na(data$qval)) rate.labels <- paste0(round(c(100000*ddata$eventRate),2)) count.labels <- paste0("(",formatC(ddata$numcases, big.mark=",")," / ",formatC(ddata$denom,big.mark=",",format="d"),")") this.plot <- ggplot(ddata, aes(x=qval, y=100000*eventRate)) + geom_point() + geom_errorbar(aes(ymin=100000*eventRatelo95, ymax=100000*eventRateup95, width=0.1)) + ylab("COVID-19 death rate per 100,000 population") + xlab(paste0("county ",paste(enexpr(this.label)))) + ggtitle(paste0("US COVID-19 deaths per 100,000 population \nby county ", paste(enexpr(this.label))," (as of 4.16.2020)")) + annotate("text", x=c(1:5), y=2+c(100000*ddata$eventRateup95), label=rate.labels) + annotate("text", x=c(1:5), y=1+c(100000*ddata$eventRateup95), label=count.labels, size=3) ggsave(paste0("cov19_county_deaths_", enexpr(var),"_05052020.pdf"), this.plot, width=8, height=6) } f.absm.plot(covid.apINDPOV, apINDPOV, this.label="% below poverty (categories)") f.absm.plot(covid.qINDPOV, qINDPOV, this.label="% below poverty (quintiles)") f.absm.plot(covid.qICEwbinc, qICEwbinc, this.label="Index of Concentration at the Extremes (white/black race + income)") f.absm.plot(covid.qcrowd, qcrowd, this.label="% crowding (>1 person per room)") f.absm.plot(covid.qpercColor, qpercColor, this.label="% population of color") f.absm.plot(covid.qpercBlack, qpercBlack, this.label="% Black population") # tabulation of number of zip codes write.csv(cbind(table(covid.merge$apINDPOV), table(covid.merge$qINDPOV), table(covid.merge$qICEwbinc), table(covid.merge$qcrowd), table(covid.merge$qpercBlack), table(covid.merge$qpercColor)), "us_countiesInABSMCategories_050520.csv") sum(covid.merge$deaths) # Create a version of the plots all in one figure # Note that you have to fiddle with the position of the annotations (how much to offset the labels # on the y axis). library(ggpubr) f.absm.tweet <- function(data, var, this.label, label.chars, this.ylim){ this.theme <- theme( axis.title.x = element_text(size = 7), axis.text.x = element_text(size = 6), axis.title.y = element_text(size = 9), axis.text.y = element_text(size=7)) ddata <- subset(data, !is.na(data$qval)) rate.labels <- paste0(round(c(100000*ddata$eventRate),1)) count.labels <- paste0(formatC(ddata$numcases, big.mark=",")) denom.labels <- paste0(formatC(ddata$denom,big.mark=",",format="d")) this.plot <- ggplot(ddata, aes(x=qval, y=100000*eventRate)) + geom_point(shape=18, size=3, color=these.reds) + scale_x_discrete(labels=label.chars) + this.theme + ylim(this.ylim) + geom_errorbar(aes(ymin=100000*eventRatelo95, ymax=100000*eventRateup95, width=0.1), color=these.reds) + ylab("") + xlab(paste0("county ",paste(enexpr(this.label)))) + annotate("text", x=c(1:5), y=9+c(100000*ddata$eventRateup95), label=rate.labels, size=3) + annotate("text", x=c(1:5), y=6+c(100000*ddata$eventRateup95), label=count.labels, size=2) + annotate("text", x=c(1:5), y=3+c(100000*ddata$eventRateup95), label=denom.labels, size=2) return(this.plot) } this.ylim <- 100000*c(min(bind_rows(covid.apINDPOV, covid.qICEwbinc, covid.qcrowd, covid.qpercColor)$eventRatelo95, na.rm=T), max(bind_rows(covid.apINDPOV, covid.qICEwbinc, covid.qcrowd, covid.qpercColor)$eventRateup95, na.rm=T))+ c(-1, 15) these.reds <- c(1,1,1,1,1) label.chars <- c("0-4.9","5-9.9","10-14.9","15-19.9","20+") a <- f.absm.tweet(covid.apINDPOV, apINDPOV, this.label="% below poverty", label.chars, this.ylim) label.chars <- c("(-0.522,0.114]", "(0.114,0.159]", "(0.159,0.205]", "(0.205,0.283]", "(0.283,0.536]") b <- f.absm.tweet(covid.qICEwbinc, qICEwbinc, this.label="Index of Concentration at the Extremes (high income white vs. low income black)", label.chars, this.ylim) label.chars <- c("0-1.5","1.5-2.1","2.1-3.1","3.1-4.9","4.9+") c <- f.absm.tweet(covid.qcrowd, qcrowd, this.label="% crowding (>1 person per room)", label.chars, this.ylim) label.chars <- c("0-17.2","17.2-30.0","30.2-44.3","44.3-61.0","61+") d <- f.absm.tweet(covid.qpercColor, qpercColor, this.label="% population of color", label.chars, this.ylim) together <- ggarrange(a,b,c,d, ncol=4) ggsave("cov19_county_deaths_050520.pdf", together, width=10.5, height=6) # output the rates in the counties with the highest death counts covid.top25 <- covid.merge %>% arrange(desc(deaths)) %>% select(GEOID, County.Name, State, deaths, pop_total, deathRate, cntyPov, ICEwbinc, cntyCrowd, percColor) write.csv(covid.top25, "covid_county_death_sorted.csv")