This page explores correlations among metrics across counties using the latest available time point for each metric.
1 Wrangling
Here, filter our dataset to latest time points for each metric, filter to county data only, and join with a crosswalk that matches variable names to metric names. We then make an ordered framework that we use to arrange axes in a correlation matrix below.
Code
# metric crosswalkcrosswalk <- SMdocs::dp_meta %>%select(variable_name, metric)# Get df readyget_str(SMdata::metrics)df <- SMdata::metrics %>%filter(variable_name %in% SMdocs::dp_meta$variable_name) %>% SMdocs::get_latest_year() %>% SMdata::filter_fips('new') %>%mutate(variable_name =str_split_i(variable_name, '_', 1)) %>%left_join(crosswalk) %>%select(-variable_name) %>%pivot_wider(id_cols = fips,values_from = value,names_from = metric )projecter::get_str(df)# Get an ordered framework from which we will pull metrics in alphabetical order# and get the counts of metrics per dimension that we will use to draw lines# between dimensionsprojecter::get_str(dp_meta)ordered_framework <- dp_meta %>%arrange(dimension, metric) %>%filter(metric %in%names(df))ordered_framework
2 Figure
This figure shows absolute values of Spearman rank correlation coefficients among county level metrics. Green cells show significant correlations, while grey cells are insignificant. Darker green cells are stronger correlations and lighter green cells are weaker (but still significant).
Also, credits to Isabella and Tessa for making and improving this figure substantially.
Code
# Get line placements to divide dimensions# Reverse them to match alphabetical order of dimensions top to bottom# Get cumulative sums to space them out across graph# Then add 0.5 to each to put them in between cellscounts <- ordered_framework %>%group_by(dimension) %>%summarize(count =n()) %>%pull(count)hline_placements <- counts %>%rev() %>%cumsum() %>% {. +0.5}hline_placementsvline_placements <- counts %>%cumsum() %>% {. +0.5}vline_placements# Reorder our df in proper metric ordermetric_order <- ordered_framework %>%pull(metric)df <- df %>%select(fips, any_of(metric_order)) %>%select(rev(everything()))get_str(df)# Create matrix of valuesmat <- df %>%na.omit() %>%select(-fips) %>%as.matrix()# Get correlationscor <-rcorr(mat, type ='spearman')# Melt correlation values and rename columnscor_r <- reshape::melt(cor$r) %>%setNames(c('var_1', 'var_2', 'value'))# Save p valuescor_p <-melt(cor$P)p.value <- cor_p$value# Absolute values of correlationscor_r <- cor_r %>%mutate(value =abs(value))# Pulling out significant p valuescor_r <- cor_r %>%mutate(p_value =ifelse(var_1 == var_2, 1, cor_p$value)) %>%# Make diagonal line non-significantmutate(significant =ifelse(p_value <=0.05, TRUE, FALSE))# Reverse the order to use for x-axis labels (start at Economics)reversed_metric_order <-rev(metric_order)# Create a reversed version of the cor_r data frame for the x-axiscor_r <- cor_r %>%mutate(var_1 =factor(var_1, levels = reversed_metric_order) )# Remove last cols []get_str(cor_r)cor_r <- cor_r %>%filter(var_1 != var_2)# filter(!(var_1 == "Average Weekly Wages" & var_2 == "Violent Crime Rate")) %>%# filter(!(var_1 == "Violent Crime Rate" & var_2 == "Average Weekly Wages"))get_str(cor_r)# Only set values for lower triangle of the matrixcor_r <- cor_r %>%mutate(row_idx =as.integer(factor(var_1, levels = reversed_metric_order)),col_idx =as.integer(factor(var_2, levels = reversed_metric_order)),value =ifelse(row_idx <= col_idx, NA, value) # Set upper triangle to NA ) %>%select(-row_idx, -col_idx) # Remove temporary columns# Pasting dimension colors to metricscolored_labels <- ordered_framework %>%mutate(label =paste0("<span style='color:", dp_text_palette[dimension],"'>",to_title_case(str_replace_all(metric, "_", " ")),"</span>" ) ) %>%select(metric, label) %>%deframe()## Removing topmost y axis label and rightmost x axis label# Create modified label vectorsx_labels <- colored_labelsx_labels[metric_order[length(metric_order)]] <-""# remove last x-axis labely_labels <- colored_labelsy_labels[reversed_metric_order[length(reversed_metric_order)]] <-""# Blank topmost y-axis label# Create tick vectors (TRUE = show tick, FALSE = hide tick)x_ticks <-rep(TRUE, length(metric_order))x_ticks[length(metric_order)] <-FALSE# Hide rightmost x-axis ticky_ticks <-rep(TRUE, length(reversed_metric_order))y_ticks[length(reversed_metric_order)] <-FALSE# Hide topmost y-axis tick# Dummy data frame for dimension legenddimension_legend_df <-tibble(x =-Inf,y =-Inf,dimension =factor(names(dp_text_palette), levels =names(dp_text_palette)))# Make heatmapcorplot <-ggplot(cor_r, aes(x = var_1, y = var_2)) +# Color in significant correlationsgeom_tile(data = cor_r %>%filter(!is.na(value) & significant), aes(fill = value) ) +# Make non-significant correlations whitegeom_tile(data = cor_r %>%filter(!is.na(value) &!significant), fill ="gray90" ) +# Grid lines for lower trianglegeom_tile(data = cor_r %>%filter(!is.na(value)),color ="gray60",linewidth =0.2,fill =NA ) +geom_point(data = dimension_legend_df,aes(x = x, y = y, color = dimension),shape =15,size =5,inherit.aes =FALSE,show.legend =TRUE,alpha =0# fully transparent points, so they don't show up ) +scale_fill_gradient2(low ="white", high ="#1b7837", midpoint =0,name ="Correlation",na.value ="transparent" ) +scale_color_manual(values = dp_text_palette,labels =str_to_title(names(dp_text_palette)),name ="Dimension" ) +# Legend overridesguides(fill =guide_colorbar(title ="Correlation", order =1 ),color =guide_legend(override.aes =list(shape =15, size =5, alpha =1),nrow =2,order =2 ) ) +# Horizontal Linesgeom_segment(x =0,xend = vline_placements[4],y = hline_placements[1], yend = hline_placements[1] ) +geom_segment(x =0,xend = vline_placements[3],y = hline_placements[2], yend = hline_placements[2] ) +geom_segment(x =0,xend = vline_placements[2],y = hline_placements[3], yend = hline_placements[3] ) +geom_segment(x =0,xend = vline_placements[1],y = hline_placements[4], yend = hline_placements[4] ) +# Vertical Linesgeom_segment(x = vline_placements[1],xend = vline_placements[1],y =0, yend = hline_placements[4] ) +geom_segment(x = vline_placements[2],xend = vline_placements[2],y =0, yend = hline_placements[3] ) +geom_segment(x = vline_placements[3],xend = vline_placements[3],y =0, yend = hline_placements[2] ) +geom_segment(x = vline_placements[4],xend = vline_placements[4],y =0, yend = hline_placements[1] ) +# Correct x and y axis labels with the orderscale_x_discrete(labels = x_labels,limits = metric_order ) +scale_y_discrete(labels = y_labels,limits = reversed_metric_order ) +theme(axis.text.x =element_markdown(angle =45, hjust =1),axis.text.y =element_markdown(),axis.text =element_text(family ="Times New Roman"),axis.title =element_blank(),axis.ticks.length.x =unit(ifelse(x_ticks, 0.15, 0), "cm"),axis.ticks.length.y =unit(ifelse(y_ticks, 0.15, 0), "cm"),plot.title =element_text(hjust =0),legend.position ="top",legend.title.position ="top",legend.justification ="center",legend.title =element_text(size =10,family ="Times New Roman",hjust =0.5 ),legend.text =element_text(size =9, family ="Times New Roman"),legend.key =element_rect(fill ="transparent", colour =NA),legend.background =element_rect(fill ="transparent", colour =NA),plot.background =element_rect(fill ="transparent", colour =NA),panel.background =element_rect(fill ="transparent", colour =NA) )ggsave(plot = corplot,filename ='outputs/fig_corplot.png',height =9,width =9,dpi =300,bg ='white')
Reading the tea leaves here, it looks like we have lots of strong correlations within the Health dimension. This might suggest that the health metrics are particularly cohesive. It might also suggest that they are redundant, and we could use fewer metrics to measure the dimension. In contrast, the Economics dimension, for example, has very few correlations within its metrics. Again, this might signify a lack of theoretical coherence among metrics or that it is indeed a diverse dimension that requires disparate data sources to measure.
As for significant inter-dimension correlations, there are surprisingly few. This is probably a good thing on average, as strong correlations between dimensions would mean double counting of similar constructs. On the other hand, we might want to see more of these if we are to expect that changes in one dimension cause subsequent changes in another. For example, stricter environmental policies might lead to better environmental outcomes, but we would also want to see any changes it causes in the Production and Economic dimensions.
3 LaTeX Table
Making \(\LaTeX\) table for paper.
Code
# Latex tablehead(cor_r)head(cor_p)df <-full_join(cor_r, cor_p, by =join_by(var_1 == Var1, var_2 == Var2)) %>%select(1:4) %>%setNames(c('var_1', 'var_2', 'r', 'p'))get_str(df)# Format and add starsdf <- df %>%filter(!is.na(r)) %>%mutate(stars =case_when( p <0.001~'***', p >0.001& p <0.01~'***', p >0.01& p <0.05~'**',.default ='' ),r =format(round(r, 3), nsmall =3),r =paste0(r, stars) ) %>%select(-c(stars, p))get_str(df)# Pivotdf <- df %>%pivot_wider(names_from = var_2,values_from = r,id_cols = var_1 )get_str(df)# Make names short, remove var header, turn NA into ''truncate_strings <-function(x) {if (str_length(x) >20) {paste0(str_sub(x, end =20), '...') } else { x }}df <- df %>%mutate(var_1 =as.character(var_1),var_1 =map_vec(var_1, truncate_strings),across(!var_1, ~ifelse(is.na(.x), '', .x)) ) %>%setNames(c(map(names(.), truncate_strings)))get_str(df)colnames(df)[1] <-' 'get_str(df)# Paste rotatebox around column namescolnames(df)[2:ncol(df)] <-paste0('\\makecell{\\rotatebox{90}{',colnames(df)[2:ncol(df)],'}}')df %>%kbl(format ="latex",caption ='Correlation matrix of county level metrics',label ='tab_correlations',booktabs =TRUE,escape =FALSE# align = 'c' ) %>%kable_styling(font_size =8,bootstrap_options =c("condensed" ),latex_options ='scale_down' ) %>%landscape() %>%footnote(general_title ="",general ='\\\\textit{Note: } Spearman correlations among county level metrics using latest time points available. Connecticut counties are not included in analyses. ***: p < 0.001, **: p < 0.01, *: p < 0.05. ',threeparttable =TRUE,escape =FALSE ) %>%save_kable(file ='outputs/tab_correlations.tex' )
4 Long Table
Also including a long-format interactive table that shows p-values and coefficients. Use this for reporting in paper.
Code
# Create long table with p values and starstab <- cor_r %>%rename(rho = value, p = p_value) %>%filter(!is.na(rho)) %>%mutate(sig =ifelse(p <0.05, '*', ''))tab %>%get_reactable(defaultColDef =colDef(format =colFormat(digits =3) ) )