Question 1: Find counties with more than 100 new cases per 100,000 residents of the past 14 days
Find the 5 worst cumulative counties, 5 counties with most new case, list of safe counties, and a text report
#state of intrest
state.interest= "California"

newCases<- covid %>% 
  filter(state== state.interest) %>% 
  group_by(county, date) %>% 
  summarise(totalCases=sum(cases))  %>% 
  mutate(NewCases= totalCases- lag(totalCases)) %>% 
  ungroup()
create 2 tables: 5 counties with most cumulative cases and 5 counties with most new cases
#table: 5 most cumulative cases
  
mostCasesTbl<- newCases %>%
  filter(date==max(date)) %>% 
  select(county,totalCases) %>%  
  slice_max(totalCases,n=5)  
  
  
#make table nicer
knitr::kable(mostCasesTbl, caption= "Total Cases by County", col.names = c("County", "Total Cases"), 
format.args = list(big.mark = ","))
Total Cases by County
County Total Cases
Los Angeles 253,985
Riverside 55,073
Orange 52,121
San Bernardino 50,699
San Diego 42,742
# table: 5 counties with most new cases

mostNewCases<- newCases %>% 
  filter(date==max(date)) %>%  
  select(county, NewCases) %>% 
  slice_max(NewCases, n=5)


#make table nicer
knitr::kable(mostNewCases, caption= "New Cases by County", col.names = c("County", "New Cases"), 
format.args = list(big.mark = ","))
New Cases by County
County New Cases
Los Angeles 809
San Diego 265
Orange 185
Fresno 159
San Bernardino 156
#use stateCases data set that is filtered to state.interest 
stCases<- covid %>% 
  filter(state == state.interest) %>% 
  rename(FIPStxt = fips)


stateCasesPopJoin<-left_join(stCases,popEst,by = "FIPStxt")
Create 2 tables: 5 counties with most cases per capita, 5 counties with most new cases per capita.
#Table1: most cumlative cases per capita

mostCasesPerCapita<- stateCasesPopJoin %>% 
  group_by(county, POP_ESTIMATE_2019,date) %>%  
  summarise(totalCases=sum(cases)) %>% 
  mutate(NewCases= totalCases- lag(totalCases)) %>% 
  ungroup() %>% 
  filter(date== max(date)) %>% 
  mutate(casesPerCapita= totalCases/POP_ESTIMATE_2019) %>% 
  select(county,casesPerCapita) %>% 
  slice_max(casesPerCapita, n=5)


#make table nicer

knitr::kable(mostCasesPerCapita, caption= "Top 5 Highest Covid Cases by County Per Capitia", col.names = c("County", "Cases Per Capita"))
Top 5 Highest Covid Cases by County Per Capitia
County Cases Per Capita
Imperial 0.0622134
Kings 0.0464038
Kern 0.0341423
Tulare 0.0324199
Merced 0.0307584
#table 2: most current cases per capita
newCasesPerCapita<- stateCasesPopJoin %>% 
  group_by(county, POP_ESTIMATE_2019, date) %>% 
  summarise(totalCases=sum(cases)) %>% 
  mutate(NewCases= totalCases- lag(totalCases)) %>% 
  ungroup() %>% 
  filter(date==max(date)) %>% 
  mutate(newCasesPerCap = NewCases/POP_ESTIMATE_2019) %>% 
  select(county, newCasesPerCap) %>% 
  slice_max(newCasesPerCap, n=5)


#make table nicer

knitr::kable(newCasesPerCapita, caption= "Top 5 Highest New Covid Cases by County Per Capitia", col.names = c("County", "New Cases Per Capita"))
Top 5 Highest New Covid Cases by County Per Capitia
County New Cases Per Capita
Kings 0.0002615
San Benito 0.0002388
Monterey 0.0002027
Lake 0.0001708
Fresno 0.0001591
# 14 day data
time.length = 13

twoWeek<- stateCasesPopJoin %>% 
  filter(date >= max(date)-time.length) %>% 
  group_by(county,POP_ESTIMATE_2019,date) %>% 
  summarise(totalcase= sum(cases)) %>% 
  mutate(lagCases=totalcase-lag(totalcase,time.length)) %>% 
  ungroup() %>% 
  mutate(CasesPer = 100000*(lagCases/POP_ESTIMATE_2019)) %>% 
  filter(CasesPer>100) %>% 
  select(county,CasesPer) %>% 
  arrange(-CasesPer)

 
  knitr::kable(twoWeek, caption= "CA Counties With More Than 100 New Cases In The Past 14 days, per 100,000 Residents", col.names = c("County", "New Cases Per 100,000 Residents"))
CA Counties With More Than 100 New Cases In The Past 14 days, per 100,000 Residents
County New Cases Per 100,000 Residents
Kings 494.9653
Imperial 306.2660
Glenn 281.7596
Butte 275.1088
Madera 233.2721
Monterey 233.1470
Stanislaus 229.9059
San Benito 224.4937
Tulare 222.0101
Fresno 211.4901
Colusa 204.2048
San Joaquin 193.0071
Merced 183.3045
Sonoma 182.2647
Yuba 180.5054
Sutter 159.8416
Kern 156.7426
San Bernardino 140.2239
Sacramento 137.3660
Calaveras 135.0615
Modoc 124.4203
Contra Costa 122.4940
San Diego 122.0670
Los Angeles 121.6941
San Mateo 118.4493
Ventura 116.3112
Marin 115.9080
Yolo 114.7392
Santa Clara 114.1166
Orange 112.8258
Santa Barbara 111.7584
Santa Cruz 109.4384
Alameda 107.9381
#find total cases
mostCasesTbl1<- newCases %>%
  filter(date==max(date)) %>% 
  select(county,totalCases) %>% 
  summarise(stateTotal=sum(totalCases))

#find total new, last 14 days
twoWeek1<- stateCasesPopJoin %>% 
  filter(date >= max(date)-time.length) %>% 
  group_by(county,POP_ESTIMATE_2019,date) %>% 
  summarise(totalcase= sum(cases)) %>% 
  mutate(lagCases=totalcase-lag(totalcase,time.length)) %>% 
  ungroup() %>% 
  select(county,lagCases) %>% 
  filter(lagCases>=0) %>% 
  summarise(totNewCase= sum(lagCases))

#find the safe counties
twoWeek2<- stateCasesPopJoin %>% 
  filter(date >= max(date)-time.length) %>% 
  group_by(county,POP_ESTIMATE_2019,date) %>% 
  summarise(totalcase= sum(cases)) %>% 
  mutate(lagCases=totalcase-lag(totalcase,time.length)) %>% 
  ungroup() %>% 
  mutate(CasesPer = 100000*(lagCases/POP_ESTIMATE_2019)) %>% 
  filter(CasesPer<100) %>% 
  select(county,CasesPer) %>% 
  arrange(CasesPer)


knitr::kable(twoWeek2, caption= "Safe California Counties", col.names = c("County", "New Cases Per 100,000 People"))
Safe California Counties
County New Cases Per 100,000 People
Alpine 0.00000
Sierra 0.00000
Mono 6.92329
Mariposa 11.62588
Plumas 15.95151
Trinity 16.28002
Del Norte 21.57342
Shasta 23.32297
El Dorado 34.74329
Nevada 36.08842
Lassen 39.25032
Siskiyou 41.34224
Humboldt 43.52381
Tuolumne 56.90370
Solano 68.80483
Placer 78.57826
Tehama 79.89675
San Luis Obispo 85.12562
Riverside 87.59197
Napa 90.74805
Lake 91.63483
San Francisco 91.65684
Inyo 94.24026
Amador 95.59267
Mendocino 97.98384
# 10:Results) Total number of cases in California is 621,981 and the total number of new cases in the past 14 days is 112,474. The total number of safe counties, (<100 new cases per 100,000 ppl) is 13 counties in California.
Question 2:
In this question, we are going to look at the story of 4 states and the impact scale can have on data interprtation. The states include: New York, California, Louisiana, and Florida.
#create state level daily new and rolling 7 day average 
states.of.interest <-  c("New York", "California", "Louisiana", "Florida")
stateLvl<- covid %>% 
 filter(state %in% states.of.interest) %>% 
 group_by(state,date) %>% 
  summarise(cases=sum(cases))  %>% 
 mutate(DailyNewCases = c(NA, diff(cases)),
         roll7=rollmean(DailyNewCases,7,fill = NA, align = "right")) %>% 
  pivot_longer(cols = c(DailyNewCases,roll7),names_to = "type",values_to="values")

#create ggplot of daily new cases and 7 day average
plot01Lab2<- ggplot(data = stateLvl, aes(x=date, y=values))+
  geom_line(aes(col = state))+
  labs(x="Date", y="Case Count",
       title= "New Daily COVID-19 Case Count and 7 Day Rolling Average",
       caption = "Data From NY Times")+
  facet_grid(type~state, scales = "free_y")+
  theme_bw()+
  ggsave(plot = last_plot(), file= "../img/COVIDPlot.png")
plot01Lab2

#NY Times style
stateLvls<- covid %>% 
 filter(state %in% states.of.interest) %>% 
 group_by(state,date) %>% 
  summarise(cases=sum(cases))  %>% 
 mutate(DailyNewCases = c(NA, diff(cases)),
         roll7=rollmean(DailyNewCases,7,fill = NA, align = "right")) 

#NY Times style ggplot of new daily cases
nyTimes01<-  ggplot(data=stateLvls, aes(x = date)) +  
  geom_col(aes(y = DailyNewCases), col = NA, fill = "#F5B8B5") +  
  geom_line(aes(y = roll7), col = "darkred", size = 1) +  
  theme_bw()+
  labs(title = paste("New Daily COVID-19 Case Count and 7 Day Rolling Average", state.interest), x="Date", y= "Case Count") +  
  facet_wrap(~state)+
  ggsave(plot = last_plot(), file= "../img/NYTIIMESCOVIDPlot.png")
nyTimes01

#use region data to get common key
region<- data.frame(State = state.abb, state = state.name)

#statelvl1 has roll7, daily new cases, and total cases
stateLv1<-  covid %>% 
 filter(state %in% states.of.interest) %>% 
 group_by(state,date) %>% 
  summarise(cases=sum(cases))  %>% 
 mutate(DailyNewCases = c(NA, diff(cases)),
         roll7=rollmean(DailyNewCases,7,fill = NA, align = "right")) %>% 
  ungroup() 
 

  
#table join region(aka state abbr) to stateLvl1
stateLvl2<-left_join(stateLv1,region,by = "state") 


#clean up population data to match at the stateLvl2, trick with divide by 2 to negate pop double counts
popEst1<- popEst %>%   
  group_by(State) %>% 
  summarise(totalPop<- sum(POP_ESTIMATE_2019)/2)


#table join the state_abb covid data to usda population data, now has correct state pop associated with covid cases
#table also has new cases/pop and rolling7 per capita
#recreating facet_grid to show total population 
stateLvl3<-left_join(stateLvl2,popEst1,by = "State") %>% 
  rename(state_pop = "totalPop <- sum(POP_ESTIMATE_2019)/2") %>% 
  mutate(NewCasesPerCap= DailyNewCases/state_pop) %>% 
  mutate(roll7PerCap=roll7/state_pop) %>% 
  pivot_longer(cols = c(NewCasesPerCap,roll7PerCap),names_to ="type",values_to ="values")

#make adjusted for population plot
stateLvl3plot<- ggplot(data = stateLvl3, aes(x=date,y=values))+
  geom_line(aes(color=state))+
  scale_color_manual(values= c("California" = "red", "Florida" = "green", 
      "Louisiana" = "blue", "New York"= "Cyan3")) + 
  labs(x="Date", y="Case Count",
       title= "New Daily COVID-19 Case Count and 7 Day Rolling Average",subtitle = "Adjusted Per Capita",
       caption = "Data From NY Times")+
  facet_grid(type~state, scales = "free_y")+
  theme_bw()+
  ggsave(plot = last_plot(), file= "../img/COVIDPlotAdjust.png")

stateLvl3plot

#NY Times style
stateLvls3<-left_join(stateLvl2,popEst1,by = "State") %>% 
  rename(state_pop = "totalPop <- sum(POP_ESTIMATE_2019)/2") %>% 
  mutate(NewCasesPerCap= DailyNewCases/state_pop) %>% 
  mutate(roll7PerCap=roll7/state_pop) 

#NY Times style ggplot per capita
nyTimes02<-  ggplot(data=stateLvls3, aes(x = date)) +  
  geom_col(aes(y = NewCasesPerCap), col = NA, fill = "#F5B8B5") +  
  geom_line(aes(y = roll7PerCap), col = "darkred", size = 1) +  
  theme_bw() +  
  labs(title = paste("New Daily COVID-19 Case Count and 7 Day Rolling Average"),subtitle = "Adjusted Per Capita", x= "Date", y= "Case Count") + 
  facet_wrap(~state)+
  ggsave(plot = last_plot(), file= "../img/NYTIMEScovidPerCap.png")
nyTimes02

Results: Scaling COVID-19 cases based on the respective state population makes the relative amount of cases appear to be less severe in some states and more severe in others. This occurs because we are comparing only raw case data values. California, a highly populated state is likely to have more cases than Lousiana, a less populated state. When state cases are adjusted for population, using cases per capita the relative proportion of people within each state contracting COVID-19 becomes more clear.
Question 3: Exploring spatial data
#read in county spatial data
counties = read_csv("../data/county-centroids.csv")

#Join to COVID-19 Data
#get daily cumulative cases
#find weighted mean
covidSpatial<-inner_join(covid,counties,by = "fips") %>%  
  group_by(date) %>% 
  summarise(wmX= sum(LON*cases)/sum(cases),wmY= sum(LAT*cases)/sum(cases),cases=sum(cases)) %>% 
  mutate(month=format(date,"%m"))


#Create weighted mean plot
weightedMeanPlot<- ggplot(data = covidSpatial)+
  borders("state",fill="light gray", colour = "light blue")+
  scale_color_viridis_d()+
  geom_point(aes(x=wmX, y=wmY, color= month))+
  theme_linedraw()+
  labs(color= "month",
       size = "Cases (1,000,000)",
       x="Longitude",
       y="Latitude",
       title = "Weighted Mean Center of COVID-19 Outbreak in the US",
       caption = "NY Times Data")+
      ggsave(plot = last_plot(), file= "../img/COVIDweightedMean.png")

weightedMeanPlot

Results: The movement of the weighted mean center of COVID-19 cases in the United States moves west/southwest over the progression of the year 2020. This is inline with case reportings as the virus hit the east coast, NYC in particular, very hard in the beginning then spread west to almost all heavily populated cities.