Exploratory Data Analysis

Code
options(scipen = 999)

pacman::p_load(
  dplyr,
  htmltools,
  stringr,
  tidyr,
  purrr,
  broom,
  skimr,
  ggplot2,
  ggpubr,
  tibble,
  Hmisc,
  reshape2,
  viridisLite,
  conflicted,
  readr,
  reactable
)

# GitHub packages
pacman::p_load_current_gh(
  'ChrisDonovan307/projecter',
  'Food-Systems-Research-Institute/SMdata'
)

# Load SMdocs
devtools::load_all()

conflicts_prefer(
  dplyr::select(),
  dplyr::filter(),
  dplyr::pull(),
  dplyr::summarize(),
  stats::lag(),
  base::setdiff(),
  .quiet = TRUE
)

1 Missing Data

In this section, we explore the missingness of our collected data. We do not dive into metrics for which there are no data available at all in certain years; rather, we dig into the data that are available to see whether they systematically miss counties based on county characteristics. We first explore how many missing data points there are per metric, and then look by county to explore how many metrics are available for that resolution. We exclude Connecticut from these analyses because of changes to their county system that make it prohibitively difficult to compare.

1.1 By Metric

First, we calculate how many data points there should be for each metric, based on the number of counties (not including CT) and the number of years the metric covers. Then we can get determine the percentage of missing data per metric.

Code
vars <- meta_vars(dp_metrics_county)

# Years covered by each metric, multiplied by number of counties: 209 (not CT)
n_counties <- 209

# Get years, number of data points, total possible number of points based on 
# years and 209 counties, then the percentage missing
metric_cov <- map(vars, ~ {
  var_df <- dp_metrics_county %>%
    filter(
      variable_name == .x,
      str_detect(fips, '^09', negate = TRUE)
    )
  n_years <- var_df %>% 
    pull(year) %>%
    unique() %>%
    length()
  n_points <- nrow(var_df)
  df <- data.frame(
    variable_name = .x,
    n_years = n_years, 
    n_points = n_points
  )
  return(df)
}) %>% 
  bind_rows() %>% 
  mutate(
    n_possible = n_years * n_counties,
    perc_miss = round(100 - ((n_points / n_possible) * 100), 2)
  )

missy_metric_df <- metric_cov %>% 
  filter(perc_miss > 5)
missy_metrics <- missy_metric_df$variable_name
print_missy_metrics <- paste0(missy_metrics, collapse = ', ')
  
get_reactable(metric_cov)

The metrics with > 5% missing are: consEasementAcres, propInvasive, propOpsPrecision, sizeDiversity, treeDiversity. Now let’s see in which counties they are missing data:

Code
non_ct_fips <- fips_key %>% 
  filter(
    str_length(fips) == 5,
    str_detect(fips, '^09', negate = TRUE)
  ) %>% 
  pull(fips)

missing_table <- dp_metrics_county %>% 
  filter(
    variable_name %in% missy_metrics,
    str_detect(fips, '^09', negate = TRUE)
  ) %>% 
  group_by(variable_name, year) %>%
  summarize(
    fips_present = c(fips),
    fips_missing = list(setdiff(non_ct_fips, fips_present)),
  ) %>% 
  select(-fips_present) %>%
  unique() %>% 
  mutate(
    n_fips_missing = lengths(fips_missing)
  )

# Table
get_reactable(
  missing_table,
  defaultColDef = reactable::colDef(minWidth = 100),
  columns = list(
    fips_missing = reactable::colDef(minWidth = 500)
  )
)

The propInvasive metric (proportion of invasive species based on the USFS Forest Inventory Analysis) is missing a substantial amount of data in 2000 and 2001, but it quickly subsides in the following years. Few other metrics show a substantial amount of missing data across counties.

1.2 By County

Now we look at which counties are missing the most metrics, ranked in order for each of the top five most missing metrics.

Code
out <- map(missy_metrics, ~ {
  missing_table %>% 
    filter(variable_name == .x) %>% 
    pull(fips_missing) %>% 
    unlist() %>% 
    table() %>% 
    as.data.frame() %>% 
    setNames(c('fips', 'times_missing')) %>% 
    arrange(desc(times_missing))
}) %>% 
  setNames(c(missy_metrics))

# Add county names
out <- map(out, ~ {
  left_join(.x, select(fips_key, -state_code))
})

Conservation Easement Acres:

Code
get_reactable(out$consEasementAcres, fullWidth = FALSE)

Proportion of Invasive Species:

Code
get_reactable(out$propInvasive)

Proportion of farms with precision ag:

Code
get_reactable(out$propOpsPrecision)

Tree Size Diversity:

Code
get_reactable(out$sizeDiversity)

Tree Species Diversity:

Code
get_reactable(out$treeDiversity)

1.3 Overall

Finally, let’s look at the overall proportion of missing data for each metric at the county level:

Code
skimr::skim(SMdocs::dp_metrics_county_wide)
# Missing years are county areas, that's fine
# Other missing values are because years don't match up, that's fine
get_str(dp_metrics_county_wide)
vars <- names(dp_metrics_county_wide[, 3:ncol(dp_metrics_county_wide)])

out <- map_dbl(vars, ~ {
  years <- dp_metrics_county_wide %>% 
    drop_na(.x) %>% 
    distinct(year) %>% 
    pull(year)
  df <- dp_metrics_county_wide %>% 
    select(year, fips, all_of(.x)) %>% 
    filter(year %in% years) %>% 
    complete(year, fips)
  prop_mis <- sum(is.na(df[.x])) / nrow(df)
  return(prop_mis)
})

# Graph it
miss <- bind_cols(vars, out) %>% 
  setNames(c('var', 'prop_miss'))
miss_plot <- miss %>% 
  ggplot(aes(y = reorder(var, prop_miss), x = prop_miss)) +
  geom_col(
    color = 'black',
    fill = '#154734'
  ) +
  labs(
    x = 'Proportion Missing',
    y = 'Metric'
  ) +
  theme_bw()
miss_plot

We can see that plantRar (Rarefied richness of plant observations from iNaturalist) is missing a substantial amount of data. This is because we only did these calculations in counties with at least 100 recorded observations per year. The purpose here was to avoid systematic bias toward populated counties by rarefying or downsampling the data. We are also missing a substantial number of observations from CDC surveys on social isolation and social support.

2 Distributions

Here we explore distributions of each metric using the latest year available for each variable. Graphs in red have a skew > 2. We use these exploratory graphs to get a sense of the data and inform our analyses.

2.1 County Data

Code
# Get skews of variables
# get_str(dp_metrics_county)
dat <- dp_metrics_county %>% 
  get_latest_year() %>% 
  pivot_wider(
    id_cols = fips,
    values_from = 'value',
    names_from = 'variable_name'
  )
# get_str(dat)

skewed <- psych::describe(dat[, -1]) %>% 
  as.data.frame() %>% 
  mutate(across(everything(), as.numeric)) %>% 
  tibble::rownames_to_column('variable_name') %>% 
  dplyr::select(variable_name, skew) %>% 
  dplyr::filter(abs(skew) > 1) %>% 
  dplyr::pull(variable_name)

plots <- map(names(dat)[-1], \(var){
  # color based on skewness
  if (var %in% skewed) {
    fill <- 'red'
    color <- 'darkred'
  } else {
    fill <- '#154724'
    color <- 'black'
  }
  
  # Make plot for variable
   dat %>% 
    select(!!sym(var)) %>% 
    mutate(!!sym(var) := as.numeric(!!sym(var))) %>% 
    na.omit() %>% 
    ggplot(aes(x = !!sym(var), group = 1)) + 
    geom_density(
      fill = fill,
      color = color,
      alpha = 0.5
    ) +
    theme_classic() +
    scale_x_continuous(n.breaks = 5) +
    theme(plot.margin = unit(c(rep(0.5, 4)), 'cm'))
}) 

# Arrange them in 4 columns
# TODO: Turn this into a package function
n_col <- 4
n_plots <- length(plots)
n_rows <- ceiling(n_plots/n_col)
ggarrange(
  plotlist = plots,
  ncol = n_col,
  nrow = n_rows
)

2.2 State Data

Code
state_dat <- dp_metrics_state %>% 
  filter(!(
    fips == '42' & year == 2022 & variable_name == 'hayYieldMeasuredInTonsAcre'
  )) %>% 
  get_latest_year() %>% 
  pivot_wider(
    id_cols = fips,
    values_from = 'value',
    names_from = 'variable_name'
  )
state_plots <- SMdocs::get_distributions(df = state_dat)
  
# Arrange with proper number of rows based on given number of cols
n_col <- 4
n_plots <- length(state_plots)
n_rows <- ceiling(n_plots/n_col)
ggpubr::ggarrange(
  plotlist = state_plots,
  ncol = n_col,
  nrow = n_rows
)

Back to top