# Example of analysis of excess mortality linked to ZIP code social metrics
# downloaded from The Public Health Disparities Geocoding Project
# https://www.hsph.harvard.edu/thegeocodingproject/covid-19-resources/
#
# Replicates analyses used in:
# Chen JT, Waterman PD, Krieger N. COVID-19 and the unequal surge in mortality rates in Massachusetts,
# by city/town and ZIP Code measures of poverty, household crowding, race/ethnicity, and racialized economic
# segregation. Harvard Center for Population and Development Studies Working Paper Series, Volume 19, Number 2. May 9, 2020.
#
# This code updates the analyses in the working paper by extending to April 28, 2020 and
# calculating rates for one-week rather than two-week periods.
# Jarvis T. Chen, ScD
# 18 May 2020

# Assumes that we have:
# 1. Dataset of deaths in 2020: we show an example of cleaning ma.death.raw.2020 below.
# 2. Dataset of deaths in 2015-2019: assume we have ma.death2019, ma.death2018, ma.death2017, ma.death2016, ma.death2015
# 3. Dataset of population denominators by ZIP code tabulation area (ZCTA) and age: this is called ma.acs.pop2 below.
# 4. Dataset of ZCTA area-based socioeconomic measures (ABSMs): this is called d.acs2018 below.

library(tidycensus)
library(tidyverse)
library(dplyr)
library(Hmisc)   # for wtd.quantile(), used when forming population-weighted quintiles below

# an example of data cleaning of 2020 death data
ma.death2020 <- select(ma.death.raw.2020,
                       "GENDER", "DOD", "DOB", "RES_CITY", "RES_STATE", "RES_ZIP", "AGE") %>%
  rename(Sex = GENDER, citytown = "RES_CITY") %>%
  filter(RES_STATE == "MASSACHUSETTS") %>%
  mutate(GEOID = ifelse(RES_ZIP <= 9999, paste0("0", RES_ZIP), paste0(RES_ZIP)),
         dthdate = as.Date(DOD, "%Y-%m-%d", tz = "US/Eastern"),
         bthdate = as.Date(DOB, "%Y-%m-%d", tz = "US/Eastern"),
         ageAtDth = as.numeric(dthdate - bthdate)/365.25,
         year = "2020",
         # assign 7-day periods (upper bounds inclusive, so no days are dropped)
         oneweek = case_when("2020-01-01" <= dthdate & dthdate <= "2020-01-07" ~ 1,
                             "2020-01-08" <= dthdate & dthdate <= "2020-01-14" ~ 2,
                             "2020-01-15" <= dthdate & dthdate <= "2020-01-21" ~ 3,
                             "2020-01-22" <= dthdate & dthdate <= "2020-01-28" ~ 4,
                             "2020-01-29" <= dthdate & dthdate <= "2020-02-04" ~ 5,
                             "2020-02-05" <= dthdate & dthdate <= "2020-02-11" ~ 6,
                             "2020-02-12" <= dthdate & dthdate <= "2020-02-18" ~ 7,
                             "2020-02-19" <= dthdate & dthdate <= "2020-02-25" ~ 8,
                             "2020-02-26" <= dthdate & dthdate <= "2020-03-03" ~ 9,
                             "2020-03-04" <= dthdate & dthdate <= "2020-03-10" ~ 10,
                             "2020-03-11" <= dthdate & dthdate <= "2020-03-17" ~ 11,
                             "2020-03-18" <= dthdate & dthdate <= "2020-03-24" ~ 12,
                             "2020-03-25" <= dthdate & dthdate <= "2020-03-31" ~ 13,
                             "2020-04-01" <= dthdate & dthdate <= "2020-04-07" ~ 14,
                             "2020-04-08" <= dthdate & dthdate <= "2020-04-14" ~ 15,
                             "2020-04-15" <= dthdate & dthdate <= "2020-04-21" ~ 16,
                             "2020-04-22" <= dthdate & dthdate <= "2020-04-28" ~ 17),
         twoweek = floor(oneweek/2),
         agecat = case_when(ageAtDth < 5                   ~ "Age0-4",
                            5  <= ageAtDth & ageAtDth < 10 ~ "Age5-9",
                            10 <= ageAtDth & ageAtDth < 15 ~ "Age10-14",
                            15 <= ageAtDth & ageAtDth < 20 ~ "Age15-19",
                            20 <= ageAtDth & ageAtDth < 25 ~ "Age20-24",
                            25 <= ageAtDth & ageAtDth < 30 ~ "Age25-29",
                            30 <= ageAtDth & ageAtDth < 35 ~ "Age30-34",
                            35 <= ageAtDth & ageAtDth < 45 ~ "Age35-44",
                            45 <= ageAtDth & ageAtDth < 55 ~ "Age45-54",
                            55 <= ageAtDth & ageAtDth < 65 ~ "Age55-64",
                            65 <= ageAtDth & ageAtDth < 75 ~ "Age65-74",
                            75 <= ageAtDth & ageAtDth < 85 ~ "Age75-84",
                            ageAtDth >= 85                 ~ "Age85+")) %>%
  filter(oneweek <= 17 & twoweek >= 0) %>%
  select(Sex, agecat, ageAtDth, GEOID, dthdate, citytown, RES_STATE, year, oneweek, twoweek)
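# A minimal sketch (an assumption, not the project's actual ABSM construction) of how a
# ZCTA-level poverty measure and total population like those in d.acs2018 might be pulled
# with tidycensus. Requires a Census API key via census_api_key(); the real d.acs2018 also
# includes crowding, race/ethnicity, and ICE measures not shown here.
if (FALSE) {
  acs.pov.sketch <- get_acs(geography = "zcta", year = 2018, survey = "acs5", output = "wide",
                            variables = c(pov_denom = "B17001_001",   # pop for whom poverty status is determined
                                          pov_num   = "B17001_002",   # income below the poverty level
                                          pop_total = "B01003_001")) %>%  # total population
    transmute(GEOID,
              pop_total = pop_totalE,
              zctaPov = ifelse(pov_denomE > 0, pov_numE/pov_denomE, NA_real_))
}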
# Example:
# extract MA zip code list from all years of data
# in order to match with US ZCTA ABSM dataset
ma.zips2020 <- names(table(ma.death2020$GEOID))
ma.zips2019 <- names(table(ma.death2019$GEOID))
ma.zips2018 <- names(table(ma.death2018$GEOID))
ma.zips2017 <- names(table(ma.death2017$GEOID))
ma.zips2016 <- names(table(ma.death2016$GEOID))
ma.zips2015 <- names(table(ma.death2015$GEOID))

ma.allzips <- full_join(data.frame(GEOID=ma.zips2020, in2020=1),
                        data.frame(GEOID=ma.zips2019, in2019=1), by="GEOID") %>%
  full_join(data.frame(GEOID=ma.zips2018, in2018=1), by="GEOID") %>%
  full_join(data.frame(GEOID=ma.zips2017, in2017=1), by="GEOID") %>%
  full_join(data.frame(GEOID=ma.zips2016, in2016=1), by="GEOID") %>%
  full_join(data.frame(GEOID=ma.zips2015, in2015=1), by="GEOID")

# restrict ABSM datasets to the full list of MA zip codes
d.acs2018.ma <- d.acs2018 %>%
  right_join(ma.allzips, by="GEOID") %>%
  mutate(year = 2018,
         apINDPOV = factor(case_when(0    <= zctaPov & zctaPov < 0.05 ~ "0-4.9%",
                                     0.05 <= zctaPov & zctaPov < 0.10 ~ "5-9.9%",
                                     0.10 <= zctaPov & zctaPov < 0.2  ~ "10-19.9%",
                                     0.20 <= zctaPov & zctaPov <= 1   ~ "20-100%"),
                           levels=c("0-4.9%","5-9.9%","10-19.9%","20-100%")),
         qINDPOV = cut(zctaPov,
                       wtd.quantile(zctaPov, weights=pop_total, probs=c(0,0.2,0.4,0.6,0.8,1)),
                       include.lowest=T),
         qICEwnhinc = cut(ICEwnhinc,
                          wtd.quantile(ICEwnhinc, weights=pop_total, probs=c(0,0.2,0.4,0.6,0.8,1)),
                          include.lowest=T),
         qcrowd = cut(zctaCrowd,
                      wtd.quantile(zctaCrowd, weights=pop_total, probs=c(0,0.2,0.4,0.6,0.8,1)),
                      include.lowest=T),
         qpercColor = cut(percColor,
                          wtd.quantile(percColor, weights=pop_total, probs=c(0,0.2,0.4,0.6,0.8,1)),
                          include.lowest=T),
         qpercBlack = cut(percBlack,
                          wtd.quantile(percBlack, weights=pop_total, probs=c(0,0.2,0.4,0.6,0.8,1)),
                          include.lowest=T))
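# Toy illustration (not part of the analysis) of the population-weighted quintile cuts used
# above: wtd.quantile() weights the cutpoints by ZCTA population, so each quintile holds
# roughly 20% of the population rather than 20% of ZCTAs.
toy <- data.frame(poverty = seq(0.01, 0.40, length.out = 20),
                  pop     = rep(c(8000, 4000, 2000, 1000), each = 5))  # larger ZCTAs have lower poverty here
toy.breaks <- wtd.quantile(toy$poverty, weights = toy$pop, probs = c(0, 0.2, 0.4, 0.6, 0.8, 1))
table(cut(toy$poverty, toy.breaks, include.lowest = TRUE))
# compare with unweighted quintiles: quantile(toy$poverty, probs = c(0, 0.2, 0.4, 0.6, 0.8, 1))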
# link with ABSM dataset
# use d.acs2018.ma, which is centered on year 2016
ma.merge2020 <- left_join(ma.death2020, d.acs2018.ma, by="GEOID")
ma.merge2019 <- left_join(ma.death2019, d.acs2018.ma, by="GEOID")
ma.merge2018 <- left_join(ma.death2018, d.acs2018.ma, by="GEOID")
ma.merge2017 <- left_join(ma.death2017, d.acs2018.ma, by="GEOID")
ma.merge2016 <- left_join(ma.death2016, d.acs2018.ma, by="GEOID")
ma.merge2015 <- left_join(ma.death2015, d.acs2018.ma, by="GEOID")

# combine 2015 to 2019 data
ma.merge1519 <- bind_rows(ma.merge2019, ma.merge2018, ma.merge2017, ma.merge2016, ma.merge2015) %>%
  mutate(year = as.numeric(year.x)) %>%
  select(-year.x, -year.y)

# These are ZIP codes that appear in the FACT OF DEATH file
# but have no corresponding ZCTA in the ACS files.
# They likely represent ZIP codes that are not ZCTAs (e.g. post office boxes).
d.acs2018.weird <- anti_join(ma.allzips, d.acs2018, by="GEOID")
d.cases.weird2020 <- inner_join(d.acs2018.weird, ma.death2020, by="GEOID")
d.cases.weird1519 <- inner_join(d.acs2018.weird, ma.merge1519, by="GEOID")

# read in the year 2000 standard million for age standardization
# We obtained this from SEER: available at https://seer.cancer.gov/stdpopulations/
# SEER gives 19 age categories, which we collapse into categories matching what is available in
# the US Census American Community Survey
seer.age19 <- read_fwf("stdpop.19ages.txt", fwf_widths(c(3,3,8), c("standard","age","std_raw"))) %>%
  filter(standard=="201") %>%
  mutate(agecat = recode(age,
                         '000'="Age0-4",   '001'="Age0-4",
                         '002'="Age5-9",   '003'="Age10-14",
                         '004'="Age15-19", '005'="Age20-24",
                         '006'="Age25-29", '007'="Age30-34",
                         '008'="Age35-44", '009'="Age35-44",
                         '010'="Age45-54", '011'="Age45-54",
                         '012'="Age55-64", '013'="Age55-64",
                         '014'="Age65-74", '015'="Age65-74",
                         '016'="Age75-84", '017'="Age75-84",
                         '018'="Age85+"),
         std.pop = as.numeric(std_raw)) %>%
  group_by(agecat) %>%
  summarise(std = sum(std.pop))

ma.pop.for.agestd <- left_join(ma.acs.pop2, d.acs2018.ma, by="GEOID")

these.agecat <- paste0('Age', c('0-4','5-9','10-14','15-19','20-24','25-29',
                                '30-34','35-44','45-54','55-64','65-74','75-84','85+'))

# check the total deaths for the 2020 and 2015-2019 data
sum(table(ma.merge2020$oneweek))
sum(table(ma.merge1519$oneweek))

# Function to summarize excess deaths, age standardized rates, rate difference, and rate ratios
# by categories of ZCTA ABSMs
f.ts.agestd <- function(this.absm){

  absm <- enquo(this.absm)

  # get levels of the absm
  num.lev.absm <- length(levels(d.acs2018.ma[[paste(enexpr(this.absm))]]))
  absm.levels <- levels(d.acs2018.ma[[paste(enexpr(this.absm))]])

  # make a template for oneweek by absm
  tweek <- data.frame(agecat = rep(these.agecat, num.lev.absm*17),
                      oneweek = rep(rep(c(1:17), each=13), num.lev.absm),
                      absm.name = factor(rep(levels(d.acs2018.ma[[paste(enexpr(this.absm))]]), each=17*13),
                                         levels=absm.levels))
  names(tweek)[3] <- paste(enexpr(this.absm))

  # sum the number of deaths over a weekly period, within categories of absm
  this.data2020 <- ma.merge2020 %>%
    group_by(agecat, oneweek, !!absm) %>%
    summarise(numdeaths = n())

  # aggregate denominators in the 2020 file
  this.denom2020 <- ma.pop.for.agestd %>%
    group_by(agecat, !!absm) %>%
    summarise(denom = sum(acs.pop)) %>%
    right_join(tweek, by=c("agecat", paste(enexpr(this.absm))))

  # merge numerators and denominators for 2020
  merge2020 <- right_join(this.data2020, this.denom2020,
                          by=c("oneweek", "agecat", paste(enquo(absm))[2]))

  # sum the average number of deaths over a weekly period over 5 years of data and within categories of absm
  this.data1519 <- ma.merge1519 %>%
    group_by(agecat, oneweek, !!absm) %>%
    summarise(numdeaths = n()/5)

  # aggregate denominators in the 2015-2019 file
  this.denom1519 <- ma.pop.for.agestd %>%
    group_by(agecat, !!absm) %>%
    summarise(denom = sum(acs.pop)) %>%
    right_join(tweek, by=c("agecat", paste(enexpr(this.absm))))

  # merge numerators and denominators for 2015-2019
  # divide by five so this is the average number of deaths and average population over 5 years
  merge1519 <- right_join(this.data1519, this.denom1519,
                          by=c("oneweek", "agecat", paste(enquo(absm))[2]))

  # compute excess on a weekly basis
  # compute rates per 100000 person-years
  # to do this, calculate for person-weeks and re-multiply to get person-years.
  # Note that we adjust the size of the person-time denominator
  # for oneweek=9, which corresponds to the week containing February 29 when there is a leap year.
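  # Direct age standardization, as implemented below: for each week and ABSM stratum, the
  # standardized rate is the std-weighted average of the age-specific rates,
  #   IR.std = sum_i( w_i * d_i / PT_i ) / sum_i( w_i ),
  # where i indexes age groups, w_i is the year-2000 standard million weight (std), d_i the
  # deaths, and PT_i the person-years (denom_i / 52.17857, i.e. one week of the ACS population).
  # The variance terms treat each d_i as Poisson (var(d_i) = d_i), accumulating
  # sum_i( w_i^2 * d_i / PT_i^2 ) together with the weight sums (sum.wt, sum.wt.sq), and 95%
  # limits are taken as estimate +/- 1.96*SE (on the log scale for the rate ratio).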
  this.ma.compare <- full_join(merge2020, merge1519,
                               by=c("oneweek", "agecat", paste(enquo(absm))[2]),
                               suffix=c(".2020",".1519")) %>%
    left_join(seer.age19, by="agecat") %>%
    filter(!is.na(!!absm)) %>%
    mutate(denom.2020 = ifelse(oneweek==9, denom.2020*15/14, denom.2020),
           denom.1519 = ifelse(oneweek==9, denom.1519*71/70, denom.1519),
           rate2020 = std*numdeaths.2020/(denom.2020/52.17857),
           var.rate2020 = std^2*numdeaths.2020/(denom.2020/52.17857)^2,
           rate1519 = std*numdeaths.1519/(denom.1519/52.17857),
           var.rate1519 = std^2*numdeaths.1519/(denom.1519/52.17857)^2,
           adjdeath.2020 = std*numdeaths.2020,
           adjdeath.1519 = std*numdeaths.1519,
           w.sq = std^2) %>%
    group_by(oneweek, !!absm) %>%
    summarise(raw.deaths1519 = sum(numdeaths.1519, na.rm=T),
              raw.deaths2020 = sum(numdeaths.2020, na.rm=T),
              raw.pop1519 = sum(denom.1519, na.rm=T),
              raw.pop2020 = sum(denom.2020, na.rm=T),
              sum.w.iri2020 = sum(rate2020, na.rm=T),
              sum.w.iri1519 = sum(rate1519, na.rm=T),
              sum.w.var.iri2020 = sum(var.rate2020, na.rm=T),
              sum.w.var.iri1519 = sum(var.rate1519, na.rm=T),
              sum.wt = sum(std),
              sum.wt.sq = sum(w.sq),
              sum.w.death2020 = sum(adjdeath.2020, na.rm=T),
              sum.w.death1519 = sum(adjdeath.1519, na.rm=T)) %>%
    mutate(ir.std.2020 = sum.w.iri2020/sum.wt,
           var.ir.std.2020 = sum.w.var.iri2020/sum.wt.sq,
           ir.std.2020.lo95 = ir.std.2020 - 1.96*sqrt(var.ir.std.2020),
           ir.std.2020.up95 = ir.std.2020 + 1.96*sqrt(var.ir.std.2020),
           ir.std.1519 = sum.w.iri1519/sum.wt,
           var.ir.std.1519 = sum.w.var.iri1519/sum.wt.sq,
           ir.std.1519.lo95 = ir.std.1519 - 1.96*sqrt(var.ir.std.1519),
           ir.std.1519.up95 = ir.std.1519 + 1.96*sqrt(var.ir.std.1519),
           death.std.2020 = sum.w.death2020/sum.wt,
           death.std.1519 = sum.w.death1519/sum.wt,
           death.std.excess = death.std.2020 - death.std.1519,
           ird.std = ir.std.2020 - ir.std.1519,
           var.ird.std = var.ir.std.2020 + var.ir.std.1519,
           ird.std.lo95 = ird.std - 1.96*sqrt(var.ird.std),
           ird.std.up95 = ird.std + 1.96*sqrt(var.ird.std),
           irr.std = ir.std.2020/ir.std.1519,
           var.irr.std = (var.ir.std.2020/ir.std.2020^2) + (var.ir.std.1519/ir.std.1519^2),
           irr.std.lo95 = exp(log(irr.std) - 1.96*sqrt(var.irr.std)),
           irr.std.up95 = exp(log(irr.std) + 1.96*sqrt(var.irr.std)),
           raw.excess = raw.deaths2020 - raw.deaths1519) %>%
    ungroup() %>%
    select(oneweek, !!absm, raw.excess,
           raw.deaths1519, raw.pop1519, death.std.1519, ir.std.1519, ir.std.1519.lo95, ir.std.1519.up95,
           raw.deaths2020, raw.pop2020, death.std.2020, ir.std.2020, ir.std.2020.lo95, ir.std.2020.up95,
           death.std.excess, ird.std, ird.std.lo95, ird.std.up95,
           irr.std, irr.std.lo95, irr.std.up95)

  return(this.ma.compare)
}

# The ABSM variable tends to be coerced from factor to character,
# so there is extra code here to restore the factor levels.
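# A possible convenience wrapper (an assumption, not used in the original script) for the
# repeated restore-the-factor-levels step below; absm.name is the ABSM column name as a string.
f.restore.levels <- function(df, absm.name){
  df %>%
    mutate(qval = factor(.data[[absm.name]], levels=levels(d.acs2018.ma[[absm.name]])),
           qval.order = as.numeric(qval),
           qabsm = absm.name) %>%
    arrange(oneweek, qval)
}
# e.g. ma.std.compare.apINDPOV <- f.restore.levels(f.ts.agestd(apINDPOV), "apINDPOV")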
ma.std.compare.apINDPOV <- f.ts.agestd(apINDPOV)
ma.std.compare.apINDPOV <- mutate(ma.std.compare.apINDPOV,
                                  qval = factor(apINDPOV, levels=levels(d.acs2018.ma$apINDPOV)),
                                  qval.order = as.numeric(qval),
                                  qabsm = "apINDPOV") %>%
  arrange(oneweek, qval)

ma.std.compare.qcrowd <- f.ts.agestd(qcrowd)
ma.std.compare.qcrowd <- mutate(ma.std.compare.qcrowd,
                                qval = factor(qcrowd, levels=levels(d.acs2018.ma$qcrowd)),
                                qval.order = as.numeric(qval),
                                qabsm = "qcrowd") %>%
  arrange(oneweek, qval)

ma.std.compare.qpercColor <- f.ts.agestd(qpercColor)
ma.std.compare.qpercColor <- mutate(ma.std.compare.qpercColor,
                                    qval = factor(qpercColor, levels=levels(d.acs2018.ma$qpercColor)),
                                    qval.order = as.numeric(qval),
                                    qabsm = "qpercColor") %>%
  arrange(oneweek, qval)

ma.std.compare.qpercBlack <- f.ts.agestd(qpercBlack)
ma.std.compare.qpercBlack <- mutate(ma.std.compare.qpercBlack,
                                    qval = factor(qpercBlack, levels=levels(d.acs2018.ma$qpercBlack)),
                                    qval.order = as.numeric(qval),
                                    qabsm = "qpercBlack") %>%
  arrange(oneweek, qval)

ma.std.compare.qICEwnhinc <- f.ts.agestd(qICEwnhinc)
ma.std.compare.qICEwnhinc <- mutate(ma.std.compare.qICEwnhinc,
                                    qval = factor(qICEwnhinc, levels=levels(d.acs2018.ma$qICEwnhinc)),
                                    qval.order = as.numeric(qval),
                                    qabsm = "qICEwnhinc") %>%
  arrange(oneweek, qval)

# combine all the results for re-sorting, and rescale the standardized rates
# to per 100,000 person-years
ma.std.compare.all <- bind_rows(ma.std.compare.apINDPOV,
                                ma.std.compare.qcrowd,
                                ma.std.compare.qpercColor,
                                ma.std.compare.qpercBlack,
                                ma.std.compare.qICEwnhinc) %>%
  mutate(ir.std.1519 = 100000*ir.std.1519,
         ir.std.1519.lo95 = 100000*ir.std.1519.lo95,
         ir.std.1519.up95 = 100000*ir.std.1519.up95,
         ir.std.2020 = 100000*ir.std.2020,
         ir.std.2020.lo95 = 100000*ir.std.2020.lo95,
         ir.std.2020.up95 = 100000*ir.std.2020.up95,
         ird.std = 100000*ird.std,
         ird.std.lo95 = 100000*ird.std.lo95,
         ird.std.up95 = 100000*ird.std.up95,
         raw.diff = raw.deaths2020 - raw.deaths1519) %>%
  select(qabsm, qval, oneweek, raw.excess,
         ir.std.1519, ir.std.1519.lo95, ir.std.1519.up95,
         ir.std.2020, ir.std.2020.lo95, ir.std.2020.up95,
         ird.std, ird.std.lo95, ird.std.up95,
         irr.std, irr.std.lo95, irr.std.up95, qval.order) %>%
  mutate(qabsm = factor(qabsm))
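# Optional check (not part of the original outputs): cumulative raw excess deaths over the
# surge weeks (week 11 onward, i.e. March 11 through April 28) by ABSM category.
ma.std.compare.all %>%
  filter(oneweek >= 11) %>%
  group_by(qabsm, qval) %>%
  summarise(cum.raw.excess = sum(raw.excess, na.rm=TRUE))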
# output all of the results to a csv file
write.csv(arrange(ma.std.compare.all, qabsm, qval.order, oneweek), "ma_zcta_oneweek_april28.csv")

# create labels for the weekly periods for graphing
oneweek.labels <- c("01-01,\n01-07", "01-08,\n01-14", "01-15,\n01-21", "01-22,\n01-28",
                    "01-29,\n02-04", "02-05,\n02-11", "02-12,\n02-18", "02-19,\n02-25",
                    "02-26,\n03-03", "03-04,\n03-10", "03-11,\n03-17", "03-18,\n03-24",
                    "03-25,\n03-31", "04-01,\n04-07", "04-08,\n04-14", "04-15,\n04-21",
                    "04-22,\n04-28")

library(gridExtra)
library(grid)
library(ggpubr)
library(RColorBrewer)   # for brewer.pal()

# plot in the style of the Boston Globe graphics
f.globestyle <- function(sumobject, year, which.rate, this.absm, absm.label, numlev){

  rate <- enquo(which.rate)
  absm <- enquo(this.absm)

  this.ylim <- 100000*c(min(sumobject$ir.std.1519, sumobject$ir.std.2020),
                        max(sumobject$ir.std.1519, sumobject$ir.std.2020))

  this.plot <- ggplot(sumobject, aes(x=oneweek, y=100000*!!rate, group=!!absm, color=!!absm)) +
    geom_point() +
    geom_line() +
    ylim(this.ylim) +
    scale_x_discrete(name="Weeks", limits=oneweek.labels) +
    scale_color_manual(name=paste(enexpr(absm.label)), values=these.colors[c((6-numlev):5)]) +
    ylab("Rate") +
    ggtitle(paste0("By ", absm.label, ", ", year)) +
    theme(axis.text.x=element_text(size=4),
          axis.text.y=element_text(size=8),
          axis.title.x=element_text(size=8),
          axis.title.y=element_text(size=8),
          plot.title=element_text(size=9),
          legend.position="none")

  return(this.plot)
}

these.colors <- brewer.pal(5, "Set1")

globe.apINDPOV.1519 <- f.globestyle(ma.std.compare.apINDPOV, year="2015-2019",
                                    which.rate=ir.std.1519, apINDPOV, "% poverty", numlev=4)
globe.apINDPOV.2020 <- f.globestyle(ma.std.compare.apINDPOV, year="2020",
                                    which.rate=ir.std.2020, apINDPOV, "% poverty", numlev=4)

globe.qcrowd.1519 <- f.globestyle(ma.std.compare.qcrowd, year="2015-2019",
                                  which.rate=ir.std.1519, qcrowd, "% crowding", numlev=5)
globe.qcrowd.2020 <- f.globestyle(ma.std.compare.qcrowd, year="2020",
                                  which.rate=ir.std.2020, qcrowd, "% crowding", numlev=5)

globe.qpercColor.1519 <- f.globestyle(ma.std.compare.qpercColor, year="2015-2019",
                                      which.rate=ir.std.1519, qpercColor, "% population of color", numlev=5)
globe.qpercColor.2020 <- f.globestyle(ma.std.compare.qpercColor, year="2020",
                                      which.rate=ir.std.2020, qpercColor, "% population of color", numlev=5)

# reverse the color palette for the ICE plots
these.colors <- rev(these.colors)

globe.qICEwnhinc.1519 <- f.globestyle(ma.std.compare.qICEwnhinc, year="2015-2019",
                                      which.rate=ir.std.1519, qICEwnhinc,
                                      "Index of Concentration at the Extremes", numlev=5)
globe.qICEwnhinc.2020 <- f.globestyle(ma.std.compare.qICEwnhinc, year="2020",
                                      which.rate=ir.std.2020, qICEwnhinc,
                                      "Index of Concentration at the Extremes", numlev=5)

# arrange the plots on a grid
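# The helper below follows the widely circulated "shared legend" pattern for ggplot2/gridExtra
# (e.g. from the ggplot2 wiki): it extracts the legend grob from the first plot, strips the
# legends from all plots, and re-attaches a single legend at the bottom or right of the grid.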
grid_arrange_shared_legend <- function(...,
                                       ncol = length(list(...)),
                                       nrow = 1,
                                       position = c("bottom", "right")) {

  plots <- list(...)
  position <- match.arg(position)
  g <- ggplotGrob(plots[[1]] + theme(legend.position = position))$grobs
  legend <- g[[which(sapply(g, function(x) x$name) == "guide-box")]]
  lheight <- sum(legend$height)
  lwidth <- sum(legend$width)
  gl <- lapply(plots, function(x) x + theme(legend.position = "none"))
  gl <- c(gl, ncol = ncol, nrow = nrow)

  combined <- switch(position,
                     "bottom" = arrangeGrob(do.call(arrangeGrob, gl),
                                            legend,
                                            ncol = 1,
                                            heights = unit.c(unit(1, "npc") - lheight, lheight)),
                     "right" = arrangeGrob(do.call(arrangeGrob, gl),
                                           legend,
                                           ncol = 2,
                                           widths = unit.c(unit(1, "npc") - lwidth, lwidth)))
  return(combined)
}

globe.apINDPOV <- grid_arrange_shared_legend(globe.apINDPOV.1519, globe.apINDPOV.2020,
                                             position="bottom", ncol = 2, nrow = 1)
globe.qcrowd <- grid_arrange_shared_legend(globe.qcrowd.1519, globe.qcrowd.2020,
                                           position="bottom", ncol = 2, nrow = 1)
globe.qpercColor <- grid_arrange_shared_legend(globe.qpercColor.1519, globe.qpercColor.2020,
                                               position="bottom", ncol = 2, nrow = 1)
globe.qICEwnhinc <- grid_arrange_shared_legend(globe.qICEwnhinc.1519, globe.qICEwnhinc.2020,
                                               position="bottom", ncol = 2, nrow = 1)

together <- ggarrange(globe.apINDPOV, globe.qcrowd, globe.qpercColor, globe.qICEwnhinc, ncol=1)

ggsave("globestyle_zcta_april28.pdf", together, width=8, height=9.5)