# Create a list of the clean datasets we will use throughout analysisdatasets <-list()# Check out summary from OneDrive that has our variable namesget_str(data_paper_meta)# Pull vars to reduce metrics downmeta <- data_paper_meta %>%filter(!is.na(variable_name),!variable_name %in%c('tbd', '', 'NONE'), analyze !='',!is.na(analyze) ) %>%select( variable_name, resolution,starts_with('n_'), latest_year, dimension:metric, weighting:desirable, )get_str(meta)# Load neast metrics for data paper, both county and stateget_str(data_paper_metrics)all <- data_paper_metrics %>%filter_fips('neast') %>%inner_join(select(meta, variable_name, resolution)) %>%mutate(across(c(year, value), as.numeric))get_str(all)## Cleaning# "annualAvgWklyWageNAICS11" should have NAs instead of zeroesall$value[all$variable_name =="annualAvgWklyWageNAICS11"& all$value ==0] <-NA# Save to dataset listdatasets$dp_metrics_all <- all# Pull out county and state separately, based on spec in metadatadp_county <- all %>%filter_fips('counties') %>%filter(resolution !='state') %>%select(-resolution)get_str(dp_county)unique(dp_county$fips)datasets$dp_metrics_county <- dp_countydp_state <- all %>%filter_fips('states') %>%filter(resolution =='state') %>%select(-resolution)get_str(dp_state)unique(dp_state$fips)datasets$dp_metrics_state <- dp_state# Get another DF of weighting variablesweights <- metrics %>%filter(variable_name %in% weighting$variable_name) %>%filter_fips('neast')get_str(weights)# Save to dataset listdatasets$dp_weights <- weights# Pivot wider for transformations, then alphabetize columnsdp_county_wide <- dp_county %>%# Remove one weird doubled up valuefilter(!( fips =='42'& year ==2022& variable_name =='hayYieldMeasuredInTonsAcre' )) %>%pivot_wider(id_cols =c(fips, year),values_from ='value',names_from ='variable_name' ) %>%select(fips, year, sort(names(.)[2:length(names(.))]))get_str(dp_county_wide)# Save to datasetsdatasets$dp_county_wide <- dp_county_wide
2 Missing Data
Explore missing data for each individual metric. Note that there is a baseline of missing for every one because of the issues with Connecticut counties. This is a bit awkward.
Code
skimr::skim(dp_county_wide)# Missing years are county areas, that's fine# Other missing values are because years don't match up, that's fineget_str(dp_county_wide)vars <-names(dp_county_wide[, 3:ncol(dp_county_wide)])out <-map_dbl(vars, ~ { years <- dp_county_wide %>%drop_na(.x) %>%distinct(year) %>%pull(year) df <- dp_county_wide %>%select(year, fips, all_of(.x)) %>%filter(year %in% years) %>%complete(year, fips) prop_mis <-sum(is.na(df[.x])) /nrow(df)return(prop_mis)})# Graph thismiss <-bind_cols(vars, out) %>%setNames(c('var', 'prop_miss'))miss_plot <- miss %>%ggplot(aes(y =reorder(var, prop_miss), x = prop_miss)) +geom_col(color ='black',fill ='#154734' ) +labs(x ='Proportion Missing',y ='Metric' ) +theme_bw()
3 Distributions
Exploring distributions using latest year available for each metric.
3.1 County Data
Code
# Get skews of variablesdat <- dp_county %>%get_latest_year() %>%pivot_wider(id_cols = fips,values_from ='value',names_from ='variable_name' )# get_str(dat)skewed <- psych::describe(dat[, -1]) %>%as.data.frame() %>%rownames_to_column('variable_name') %>% dplyr::select(variable_name, skew) %>% dplyr::filter(abs(skew) >2) %>%pull(variable_name)plots <-map(names(dat)[-1], \(var){# color based on skewnessif (var %in% skewed) { fill <-'red' color <-'darkred' } else { fill <-'#154724' color <-'black' }# Make plot for variable dat %>%ggplot(aes(x =!!sym(var))) +geom_density(fill = fill,color = color,alpha =0.5 ) +theme_classic() +theme(plot.margin =unit(c(rep(0.5, 4)), 'cm'))}) # Arrange them in 4 columnsggarrange(plotlist = plots,ncol =4,nrow =11)