Exploratory Data Analysis

Code
options(scipen = 999)

pacman::p_load(
  dplyr,
  htmltools,
  stringr,
  tidyr,
  purrr,
  broom,
  skimr,
  ggplot2,
  ggpubr,
  tibble,
  Hmisc,
  reshape2,
  viridisLite,
  conflicted,
  readr
)

# GitHub packages
pacman::p_load_gh(
  'ChrisDonovan307/projecter',
  'Food-Systems-Research-Institute/SMdata'
)

# Functions
source('dev/get_reactable.R')
source('dev/get_missing.R')
source('dev/data_pipeline_functions.R')

conflicts_prefer(
  dplyr::select(),
  dplyr::filter(),
  dplyr::pull(),
  dplyr::summarize(),
  stats::lag(),
  base::setdiff(),
  .quiet = TRUE
)

1 Wrangling

Code
# Create a list of the clean datasets we will use throughout analysis
datasets <- list()

# Check out summary from OneDrive that has our variable names
get_str(data_paper_meta)

# Pull vars to reduce metrics down
meta <- data_paper_meta %>% 
  filter(
    !is.na(variable_name),
    !variable_name %in% c('tbd', '', 'NONE'),
    analyze != '',
    !is.na(analyze)
  ) %>% 
  select(
    variable_name,
    resolution,
    starts_with('n_'),
    latest_year,
    dimension:metric,
    weighting:desirable,
  )
get_str(meta)

# Load neast metrics for data paper, both county and state
get_str(data_paper_metrics)
all <- data_paper_metrics %>% 
  filter_fips('neast') %>% 
  inner_join(select(meta, variable_name, resolution)) %>% 
  mutate(across(c(year, value), as.numeric))
get_str(all)

## Cleaning
# "annualAvgWklyWageNAICS11" should have NAs instead of zeroes
all$value[all$variable_name == "annualAvgWklyWageNAICS11" & all$value == 0] <- NA

# Save to dataset list
datasets$dp_metrics_all <- all

# Pull out county and state separately, based on spec in metadata
dp_county <- all %>% 
  filter_fips('counties') %>% 
  filter(resolution != 'state') %>% 
  select(-resolution)
get_str(dp_county)
unique(dp_county$fips)
datasets$dp_metrics_county <- dp_county

dp_state <- all %>% 
  filter_fips('states') %>% 
  filter(resolution == 'state') %>% 
  select(-resolution)
get_str(dp_state)
unique(dp_state$fips)
datasets$dp_metrics_state <- dp_state

# Get another DF of weighting variables
weights <- metrics %>%
  filter(variable_name %in% weighting$variable_name) %>%
  filter_fips('neast')
get_str(weights)
 
# Save to dataset list
datasets$dp_weights <- weights
  
# Pivot wider for transformations, then alphabetize columns
dp_county_wide <- dp_county %>% 
  # Remove one weird doubled up value
  filter(!(
    fips == '42' & year == 2022 & variable_name == 'hayYieldMeasuredInTonsAcre'
  )) %>%
  pivot_wider(
    id_cols = c(fips, year),
    values_from = 'value',
    names_from = 'variable_name'
  ) %>% 
  select(fips, year, sort(names(.)[2:length(names(.))]))
get_str(dp_county_wide)

# Save to datasets
datasets$dp_county_wide <- dp_county_wide

2 Missing Data

Explore missing data for each individual metric. Note that there is a baseline of missing for every one because of the issues with Connecticut counties. This is a bit awkward.

Code
skimr::skim(dp_county_wide)
# Missing years are county areas, that's fine
# Other missing values are because years don't match up, that's fine
get_str(dp_county_wide)
vars <- names(dp_county_wide[, 3:ncol(dp_county_wide)])

out <- map_dbl(vars, ~ {
  years <- dp_county_wide %>% 
    drop_na(.x) %>% 
    distinct(year) %>% 
    pull(year)
  df <- dp_county_wide %>% 
    select(year, fips, all_of(.x)) %>% 
    filter(year %in% years) %>% 
    complete(year, fips)
  prop_mis <- sum(is.na(df[.x])) / nrow(df)
  return(prop_mis)
})

# Graph this
miss <- bind_cols(vars, out) %>% 
  setNames(c('var', 'prop_miss'))
miss_plot <- miss %>% 
  ggplot(aes(y = reorder(var, prop_miss), x = prop_miss)) +
  geom_col(
    color = 'black',
    fill = '#154734'
  ) +
  labs(
    x = 'Proportion Missing',
    y = 'Metric'
  ) +
  theme_bw()

3 Distributions

Exploring distributions using latest year available for each metric.

3.1 County Data

Code
# Get skews of variables
dat <- dp_county %>% 
  get_latest_year() %>% 
  pivot_wider(
    id_cols = fips,
    values_from = 'value',
    names_from = 'variable_name'
  )
# get_str(dat)

skewed <- psych::describe(dat[, -1]) %>% 
  as.data.frame() %>% 
  rownames_to_column('variable_name') %>% 
  dplyr::select(variable_name, skew) %>% 
  dplyr::filter(abs(skew) > 2) %>% 
  pull(variable_name)

plots <- map(names(dat)[-1], \(var){
  # color based on skewness
  if (var %in% skewed) {
    fill <- 'red'
    color <- 'darkred'
  } else {
    fill <- '#154724'
    color <- 'black'
  }
  
  # Make plot for variable
  dat %>% 
    ggplot(aes(x = !!sym(var))) + 
    geom_density(
      fill = fill,
      color = color,
      alpha = 0.5
    ) +
    theme_classic() +
    theme(plot.margin = unit(c(rep(0.5, 4)), 'cm'))
}) 

# Arrange them in 4 columns
ggarrange(
  plotlist = plots,
  ncol = 4,
  nrow = 11
)

3.2 State Data

Code
source('dev/get_distributions.R')
dat <- dp_state %>% 
  filter(!(
    fips == '42' & year == 2022 & variable_name == 'hayYieldMeasuredInTonsAcre'
  )) %>% 
  get_latest_year() %>% 
  pivot_wider(
    id_cols = fips,
    values_from = 'value',
    names_from = 'variable_name'
  )
get_distributions(
  df = dat,
  n_col = 4,
  n_row = 5
)

Back to top