Metric Analysis

Code
options(scipen = 999)

pacman::p_load(
  conflicted,
  dplyr,
  htmltools,
  stringr,
  tidyr,
  reactable,
  purrr,
  broom,
  knitr,
  kableExtra,
  tictoc,
  furrr,
  parallelly,
  splm,
  spdep,
  spatialreg,
  sf,
  skimr,
  ggplot2,
  ggpubr,
  readr,
  lmtest,
  sandwich,
  performance
)

pacman::p_load_gh(
  'ChrisDonovan307/projecter', 
  'Food-Systems-Research-Institute/SMdata'
)

# Functions
source('dev/get_reactable.R')
source('dev/get_missing.R')

conflicts_prefer(
  dplyr::select(),
  dplyr::filter(),
  dplyr::pull(),
  stats::lag(),
  base::setdiff(),
  .quiet = TRUE
)

results <- list()
datasets <- readRDS('data/data_paper/datasets.rds')

This page explores time series data at the state and county levels, then runs regressions to see whether metrics are improving or declining over time at the county level.

1 Time Series

Plot time series for each variable that has at least two time points, with one line per county and a loess smooth overlaid. These are raw, unweighted values.

1.1 County Data

Code
# Prepare vector of county variables that are time series
get_str(datasets$dp_metrics_county)
vars <- unique(datasets$dp_metrics_county$variable_name)
series_vars <- map(vars, ~ {
  n_years <- datasets$dp_metrics_county %>% 
    filter(variable_name == .x) %>% 
    pull(year) %>% 
    n_distinct()
  if (n_years > 1) .x else NULL
}) %>% 
  compact() %>%
  unlist()
datasets$series_vars <- series_vars

A few hinky things are going on here. Voter turnout is apparently flat across all counties; it looks like this is because it only covers 2023 to 2025 and comes from a secondary source that pulled it from the 5-year census. This needs to be fixed.

Land value per acre also appears janky, but that is because Richmond County, NY (Staten Island) has a land value of $2.5 million per acre while most other counties are below $100k. We should probably remove this as an outlier.
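
A minimal sketch to confirm both issues, assuming the long format shown above (fips, year, variable_name, value):

Code
# Which years does voter turnout actually cover?
datasets$dp_metrics_county %>%
  filter(variable_name == 'voterTurnout') %>%
  count(year)

# Largest land values per acre; Richmond County, NY is fips 36085
datasets$dp_metrics_county %>%
  filter(variable_name == 'landValPerAcre') %>%
  mutate(value = as.numeric(value)) %>%
  arrange(desc(value)) %>%
  slice_head(n = 5)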

Code
plots <- map(series_vars, ~ {
  datasets$dp_metrics_county %>%
    filter(variable_name == .x) %>%
    ggplot(aes(x = year, y = value)) +
    geom_line(
      aes(group = fips),
      alpha = 0.3
    ) +
    geom_smooth(
      method = 'loess',
      lwd = 2
      # col = '#154734'
    ) +
    labs(
      x = 'Year',
      title = .x
    ) +
    theme_classic()
})

ggarrange(
  plotlist = plots,
  ncol = 4,
  nrow = 11
)

1.2 State Data

Code
# get_str(dp_state)
vars <- unique(datasets$dp_metrics_state$variable_name)
state_series_vars <- map(vars, ~ {
  n_years <- datasets$dp_metrics_state %>% 
    filter(variable_name == .x) %>% 
    pull(year) %>% 
    n_distinct()
  if (n_years > 1) .x else NULL
}) %>% 
  compact() %>%
  unlist()

# Save vector of series vars for later
datasets$state_series_vars <- state_series_vars

plots <- map(state_series_vars, ~ {
  datasets$dp_metrics_state %>%
    filter(variable_name == .x) %>%
    ggplot(aes(x = year, y = value)) +
    geom_line(
      aes(group = fips),
      alpha = 0.3
    ) +
    geom_smooth(
      method = 'loess',
      lwd = 2
      # col = '#154734'
    ) +
    labs(
      x = 'Year',
      title = .x
    ) +
    theme_classic()
})

ggarrange(
  plotlist = plots,
  ncol = 3,
  nrow = 4
)

2 Linear Models

First we try linear models with time as the independent variable, normalizing the dataset before doing anything else.

Code
get_str(datasets$dp_metrics_county)

# Function to min max then scale to 0 to 100 for percent change
min_max <- function(x) {
  normed <- (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
  normed * 100
}

datasets$dp_metrics_county_normed <- datasets$dp_metrics_county %>%
  drop_na(value) %>% 
  group_by(variable_name) %>% 
  mutate(normed_value = min_max(value)) %>% 
  ungroup()
get_str(datasets$dp_metrics_county_normed)  
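
As a quick sanity check on the scaler, the minimum maps to 0, the maximum to 100, and everything else proportionally in between (hypothetical values):

Code
min_max(c(2, 4, 6))
#> [1]   0  50 100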

2.1 Regressions

First create a weights crosswalk that links variable_names to weighting variables.

Code
weights_crosswalk <- data.frame(
  metric = c(
    'GDP',
    'GDP from agriculture',
    'Total number of producers',
    'Total number of operations',
    'Population (annual)',
    'Population (5 year)',
    'Population (5 year, smooth)',
    'Acres of land',
    'Acres of water',
    'Area under agriculture (CDL)',
    'Number employed in agriculture',
    'Acres operated',
    'Acres operated, smooth',
    'Acres of cropland ag land',
    'Acres of woodland ag land',
    'Acres of pasture ag land',
    'Total farm income',
    'Total sales from commodities'
  ),
  variable_name = c(
    'gdp',
    'gdpFromAg',
    'nProducers',
    'nOperations',
    'populationAnnual',
    'population5Year',
    'population5YearSmooth',
    'landAcres',
    'waterAcres',
    'cdlAgAcres',
    'annualAvgEmplvlNAICS11',
    'acresOperated',
    'acresOperatedSmooth',
    'agLandCropAcres',
    'agLandWoodAcres',
    'agLandPastureAcres',
    'totalFarmIncome',
    'totalSalesCommodities'
  )
)
weights_crosswalk

# Link weight vars to the appropriate variables
weights_crosswalk <- data_paper_meta %>% 
  select(variable_name, weighting) %>% 
  filter(variable_name %in% datasets$dp_metrics_county_normed$variable_name) %>% 
  left_join(weights_crosswalk, by = join_by(weighting == metric)) %>% 
  setNames(c('variable_name', 'weight_metric', 'weight_variable')) %>% 
  select(-weight_metric) %>% 
  na.omit()
get_str(weights_crosswalk)
weights_crosswalk

Run regressions for all time series variables where time points > 1.

Code
# Get a crosswalk to add state as a fixed effect
state_link <- fips_key %>% 
  filter(str_length(fips) == 2, fips != '00') %>% 
  select(fips, state_code)

res <- map(datasets$series_vars, ~ {
  tryCatch(
    {
      df <- datasets$dp_metrics_county_normed %>% 
        filter(variable_name == .x) %>% 
        mutate(
          value = as.numeric(value),
          year = as.numeric(year),
          state_fips = str_sub(fips, end = 2)
        ) %>% 
        left_join(state_link, by = join_by(state_fips == fips))
      
      # Add weighting variable
      weight_var <- weights_crosswalk$weight_variable[weights_crosswalk$variable_name == .x]
      weight_df <- datasets$dp_weights
      if (length(weight_var) > 0 && !is.na(weight_var)) {
        weight_df <- weight_df %>% 
          filter(variable_name == weight_var)
      }
      weight_df <- weight_df %>% 
        mutate(
          value = as.numeric(value), 
          year = as.numeric(year)
        ) %>% 
        rename(
          weight_value = value, 
          weight_name = variable_name,
          weight_year = year
        )
      # get_str(weight_df)
      
      # Depending on whether weight has a year, do appropriate join
      # Note that we might be losing some years here if weighting var doesn't
      # cover the variable enough
      if (all(is.na(weight_df$weight_year))) {
        df <- df %>% 
          inner_join(weight_df, by = join_by(fips))
      } else {
        df <- df %>% 
          inner_join(weight_df, by = join_by(fips, year == weight_year))
      }
      
      # Reduce to county fips only. Remove dupes 
      df <- df %>%
        filter(str_length(fips) == 5) %>% 
        distinct()
      
      # Weighted OLS 
      model <- lm(
        normed_value ~ year + state_code, 
        data = df,
        weights = weight_value
      )
      return(model)
    },
    error = function(e) {
      message(paste('Error with var', .x))
    }
  )
}) %>% 
  setNames(datasets$series_vars) %>%
  discard(is.null)
get_str(res, 1)
get_str(res)

# Robust SEs (lmtest was loaded above)
coeftest(res[[1]])

# Save immediate output
saveRDS(res, 'data/objects/weighted_ols_outs.rds')

2.2 Residuals

Check residuals of outputs:

Code
res <- readRDS('data/objects/weighted_ols_outs.rds')
res_plots <- map(res, ~ {
  try(performance::check_model(.x))
}) %>% 
  purrr::keep(~ length(.x) > 1)
saveRDS(res_plots, 'data/objects/weighted_ols_residuals.rds')

(Residual plots are currently giving us trouble; a fallback sketch is below.)
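
As a fallback while check_model() misbehaves, base R diagnostics can be drawn straight from each fit. A minimal sketch for the first model, assuming res is the list of weighted OLS fits saved above:

Code
# Fallback diagnostics: residuals vs. fitted values and a QQ plot
fit <- res[[1]]
par(mfrow = c(1, 2))
plot(fitted(fit), resid(fit), xlab = 'Fitted', ylab = 'Residuals')
abline(h = 0, lty = 2)
qqnorm(resid(fit))
qqline(resid(fit))
par(mfrow = c(1, 1))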

Now that we have regression outputs, let’s put them in a table to see which ones change significantly over time.

Code
# Put together one table of results from all models, add stars
res <- readRDS('data/objects/weighted_ols_outs.rds')
get_str(res)

# Check for NA outputs
res[[1]]$coefficients[['year']]
coefs <- map(res, ~ .x$coefficients[['year']])
names(coefs[is.na(coefs)])

# Mobility disability is NA because there is only 1 year after we join with 
# weighting variable. So we need to just throw it out if we can't get smoothed
# population for 2024.
res <- res %>% 
  discard(~ is.na(.x$coefficients[[2]]))

# Make a table of regression outputs, including robust standard errors
res_df <- imap(res, ~ {
  vcov <- vcovHC(.x, type = 'HC0')
  df <- coeftest(.x, vcov = vcov) %>% 
    tidy() %>% 
    filter(term == 'year') %>% 
    mutate(term = .y)
  # Use the same robust vcov for the confidence intervals
  conf_ints <- coefci(.x, vcov. = vcov)['year', ]
  df$ci_low <- conf_ints[1]
  df$ci_high <- conf_ints[2]
  df
}) %>% 
  bind_rows() %>% 
  mutate(sig = ifelse(p.value < 0.05, '*', ''))
res_df

# How many time series metrics total
(n_metrics <- nrow(res_df))

# How many change over time significantly
(
  n_change <- res_df %>% 
    filter(sig == '*') %>% 
    nrow()
)

# Percentage significant
(perc_sig <- mean(res_df$sig == '*', na.rm = TRUE) * 100)

Note that we only have two years for the mobility disability metric, 2023 and 2024, but we don’t have a smoothed 5-year population for 2024, only 2023. After the weight join we are left with a single year, which means there is no variation in year, so we drop this metric from the analysis.

Out of 34 metrics with > 1 time point, we have 23 metrics that change significantly over time (67.65%).

We still have to make sense of the directional values of our metrics. These were defined in the Aggregation section of the main analysis branch.

Code
# Of those that are significant, what are trends
# leaving in all of them, even if not significant though
raw_outputs <- res_df %>% 
  mutate(trend = case_when(
    estimate > 0 & sig == '*' ~ 'increasing',
    sig == '' ~ NA_character_,
    .default = 'decreasing'
  ))

# Bring in directional values. Keep only the ones without question marks, 
# clearly higher or lower
get_str(data_paper_meta)
directions <- data_paper_meta %>% 
  filter(str_detect(desirable, 'higher|lower'), !is.na(variable_name)) %>% 
  mutate(desirable = str_remove_all(desirable, '\\?')) %>% 
  select(variable_name, desirable)
reverse <- directions %>% 
  filter(desirable == 'lower') %>% 
  pull(variable_name)

# Reduce to only vars that have a direction.
# Here we lose a few metrics
model_outputs <- raw_outputs %>% 
  filter(term %in% directions$variable_name) %>% 
  mutate(
    reverse = ifelse(term %in% reverse, TRUE, FALSE),
    outcome = case_when(
      reverse == FALSE & trend == 'increasing' ~ 'better',
      reverse == FALSE & trend == 'decreasing' ~ 'worse',
      reverse == TRUE & trend == 'increasing' ~ 'worse',
      reverse == TRUE & trend == 'decreasing' ~ 'better',
      .default = NA
    ),
    across(where(is.numeric), ~ format(round(.x, 3), nsmall = 3))
  )
model_outputs

# See what we lost
(lost <- setdiff(raw_outputs$term, model_outputs$term))
(n_lost <- length(lost))
(print_lost <- lost %>% paste0(collapse = ', '))

# proportion getting better or worse
(tab <- table(model_outputs$outcome))
(percs <- round(prop.table(tab), 3) * 100)

59.1% are getting better and 40.9% are getting worse. We drop 3 metrics here that don’t have proper directions (vacancyRate, populationChangePerc, annualPrecipMM).

Check out the table with outputs for each regression below; it can be filtered by significance and outcome. Trend is just whether the metric is going up or down; outcome is whether that change is good or bad.

2.3 Table

Code
# Add framework info
framed_outputs <- data_paper_meta %>% 
  select(dimension, index, indicator, metric, variable_name) %>% 
  right_join(model_outputs, by = join_by(variable_name == term))

# Create interactive table for Quarto
get_reactable(
  framed_outputs, 
  defaultPageSize = 10,
  columns = list(
   variable_name = colDef(minWidth = 200),
   sig = colDef(minWidth = 50),
   reverse = colDef(minWidth = 75),
   statistic = colDef(minWidth = 75),
   outcome = colDef(minWidth = 75)
  )
)

2.4 Figure

The figure below shows 95% confidence intervals for the year coefficient of each metric, with state included as a covariate.

Code
# get_str(framed_outputs)
lm_figure <- framed_outputs %>% 
  mutate(
    across(c(estimate, starts_with('ci')), as.numeric),
    outcome = ifelse(is.na(outcome), 'no change', outcome) %>% 
      snakecase::to_title_case()
  ) %>% 
  filter(!variable_name %in% c('voterTurnout', 'mobilityDisability')) %>% 
  ggplot(aes(x = estimate, y = reorder(metric, estimate))) +
  geom_point(aes(color = outcome)) +
  geom_errorbarh(aes(xmin = ci_low, xmax = ci_high, color = outcome)) +
  geom_vline(xintercept = 0) +
  scale_color_manual(values = c('darkgreen', 'grey', 'purple')) +
  theme_minimal() +
  labs(
    x = 'Average Relative Percent Change per Year',
    y = 'Metric',
    color = 'Outcome'
  ) + 
  scale_x_continuous(n.breaks = 8)
ggsave(
  'outputs/fig_lm.png',
  plot = lm_figure,
  width = 8,
  height = 5,
  units = 'in',
  bg = 'white'
)
lm_figure

3 Spatial Regressions

3.1 One Model

We use spatialreg for spatially weighted regressions, though not panels. First we try a single example with the Gini coefficient. Note that because the Census data span the change in administrative boundaries in CT, we can’t use CT at all here.

Code
get_str(datasets$dp_metrics_county)

dat <- datasets$dp_metrics_county %>% 
  pivot_wider(
    id_cols = fips:year,
    names_from = variable_name,
    values_from = value
  )
get_str(dat)

# Remove CT, drop NA ginis, then complete fips-year combinations
dat <- dat %>% 
  select(fips, year, gini) %>% 
  mutate(gini_normed = min_max(gini)) %>% 
  filter(str_detect(fips, '^09', negate = TRUE)) %>%
  drop_na(gini) %>% 
  complete(fips, year)
get_str(dat)

# List weights
lw <- neast_counties_2024 %>% 
  right_join(dat) %>% 
  select(fips, year, gini) %>% 
  poly2nb() %>% 
  nb2listw(zero.policy = TRUE)

# Regular weights
get_str(weights_crosswalk)
weight_var <- weights_crosswalk$weight_variable[weights_crosswalk$variable_name == 'gini']
weight_df <- datasets$dp_weights

weight_df <- weight_df %>% 
  mutate(
    value = as.numeric(value), 
    year = as.numeric(year)
  ) %>% 
  rename(
    weight_value = value, 
    weight_name = variable_name,
    weight_year = year
  ) %>% 
  filter(weight_name == weight_var, str_length(fips) == 5)
get_str(weight_df)

get_str(dat)
dat <- left_join(dat, weight_df, by = join_by(fips, year == weight_year)) %>% 
  distinct()
get_str(dat)

# Run regression
sar <- errorsarlm(
  gini_normed ~ year,
  data = dat,
  listw = lw,
  weights = weight_value
)
Code
summary(sar)

Call:errorsarlm(formula = gini_normed ~ year, data = dat, listw = lw, 
    weights = weight_value)

Residuals:
     Min       1Q   Median       3Q      Max 
-36.0447 -17.4249 -11.0732  -3.2657  54.2701 

Type: error 
Coefficients: (asymptotic standard errors) 
              Estimate Std. Error z value Pr(>|z|)
(Intercept) -938.81700  309.60177 -3.0323 0.002427
year           0.49176    0.15341  3.2055 0.001348

Lambda: 0.50321, LR test value: 31.78, p-value: 0.000000017264
Approximate (numerical Hessian) standard error: 0.080826
    z-value: 6.2258, p-value: 0.00000000047902
Wald statistic: 38.761, p-value: 0.00000000047902

Log likelihood: -2891.242 for error model
ML residual variance (sigma squared): 66033000, (sigma: 8126.1)
Number of observations: 627 
Number of parameters estimated: 4 
AIC: 5790.5, (AIC for lm: 5820.3)

3.2 All Models

Use errorsarlm on all metrics.

Code
get_str(datasets$dp_metrics_county)
state_link <- fips_key %>% 
  filter(str_length(fips) == 2, fips != '00') %>% 
  select(fips, state_code)

dat <- datasets$dp_metrics_county %>% 
  pivot_wider(
    id_cols = fips:year,
    names_from = variable_name,
    values_from = value
  ) %>% 
  filter(str_detect(fips, '^09', negate = TRUE)) %>% 
  mutate(link = str_sub(fips, end = 2)) %>% 
  left_join(state_link, by = join_by(link == fips)) %>% 
  select(-link)
get_str(dat)

# Get vector of vars
(vars <- names(dat)[!names(dat) %in% c('fips', 'year', 'state_code')])

tic()
get_time()
plan(multisession, workers = availableCores(omit = 2))
out <- future_map(vars, \(var) {
  cat('\n\nStarting', var)
  
  tryCatch({
    # Reduce to relevant variable, remove NAs
    df <- dat %>% 
      select(fips, year, !!sym(var), state_code) %>% 
      drop_na(!!sym(var)) %>% 
      mutate(normed = min_max(!!sym(var)))
 
    # Regular weights
    weight_var <- weights_crosswalk$weight_variable[weights_crosswalk$variable_name == var]
    if (length(weight_var) > 0 && !is.na(weight_var)) {
      weight_df <- datasets$dp_weights %>% 
        mutate(
          value = as.numeric(value), 
          year = as.numeric(year)
        ) %>% 
        rename(
          weight_value = value, 
          weight_name = variable_name,
          weight_year = year
        ) %>% 
        filter(weight_name == weight_var, str_length(fips) == 5)
      
      # Join weights back to df, depending on whether weight has a year
      if (all(is.na(weight_df$weight_year))) {
        df <- df %>% 
          inner_join(weight_df, by = join_by(fips))
      } else {
        df <- df %>% 
          inner_join(weight_df, by = join_by(fips, year == weight_year))
      }
      
      cat('\nJoined to weights:', weight_var)
    } else {
      cat('\nNo weighting variable')
    }
    
    # List weights
    lw <- neast_counties_2024 %>% 
      right_join(df) %>% 
      select(fips, year, normed) %>% 
      poly2nb() %>% 
      nb2listw(zero.policy = TRUE)   
    cat('\nCreated lw')
    
    # Assign weights arg if variable is weighted. Otherwise NULL
    if (length(weight_var) > 0 && !is.na(weight_var)) {
      weights_arg <- df[['weight_value']]
    } else {
      weights_arg <- NULL
    }
    
    # Run error sar model with appropriate (or no) weighting
    model <- errorsarlm(
      normed ~ as.numeric(year) + state_code,
      data = df,
      listw = lw,
      weights = weights_arg
    ) 
    cat('\nRan model')
   
    return(model)
    },
    error = function(e) {
      return(paste0('Error with ', var, ': ', e))
    }
  )
}, .progress = FALSE) %>% 
  setNames(c(vars))
plan(sequential)
toc()
# 6 to 12 minutes or so

# Check results
get_str(out)
get_str(out[[1]])
map(out, class)

saveRDS(out, 'data/objects/errorsarlm_outs.rds')

3.2.1 Reactable Table

Code
sar_outs <- readRDS('data/objects/errorsarlm_outs.rds')
start <- length(sar_outs)
sar_outs <- sar_outs[names(sar_outs) != 'state_code'] %>% 
  keep(~ inherits(.x, 'Sarlm'))
end <- length(sar_outs)
lost <- start - end
# 39 down to 35 here

sar_df <- imap(sar_outs, ~ {
  tryCatch(
    {
      df <- tidy(.x)
      est <- df[2, 'estimate'] %>% as.numeric
      se <- df[2, 'std.error'] %>% as.numeric
      alpha <- 0.05
      z <- qnorm(1 - alpha/2)
      confints <- c(est - z * se, est + z * se)
      
      out <- df %>% 
        filter(str_detect(term, 'year')) %>% 
        mutate(
          variable_name = .y,
          conf_low = confints[1],
          conf_high = confints[2]
        ) %>% 
        select(-term) %>% 
        mutate(sig = ifelse(p.value < 0.05, '*', ''))  
      return(out)
    },
      error = function(e) {
        message('Error')
        print(e)
    }
  )
}) %>% 
  keep(is.data.frame) %>% 
  bind_rows()
get_str(sar_df)
# sar_df

# 35 down to 32. Where did others go
(lost <- setdiff(names(sar_outs), sar_df$variable_name))
check <- sar_outs[names(sar_outs) %in% lost]
# Not defined because of singularities

sar_raw_outputs <- sar_df %>% 
  mutate(trend = case_when(
    estimate > 0 & sig == '*' ~ 'increasing',
    sig == '' ~ NA_character_,
    .default = 'decreasing'
  ))
# get_str(sar_raw_outputs)
# sar_raw_outputs

# Bring in directional values. Keep only the ones without question marks, 
# clearly higher or lower
get_str(data_paper_meta)
directions <- data_paper_meta %>% 
  filter(str_detect(desirable, 'higher|lower'), !is.na(variable_name)) %>% 
  mutate(desirable = str_remove_all(desirable, '\\?')) %>% 
  select(variable_name, desirable)
reverse <- directions %>% 
  filter(desirable == 'lower') %>% 
  pull(variable_name)

# Reduce to only vars that have a direction.
# Here we lose a few metrics
sar_model_outputs <- sar_raw_outputs %>% 
  filter(variable_name %in% directions$variable_name) %>% 
  mutate(
    reverse = ifelse(variable_name %in% reverse, TRUE, FALSE),
    outcome = case_when(
      reverse == FALSE & trend == 'increasing' ~ 'better',
      reverse == FALSE & trend == 'decreasing' ~ 'worse',
      reverse == TRUE & trend == 'increasing' ~ 'worse',
      reverse == TRUE & trend == 'decreasing' ~ 'better',
      .default = NA
    ),
    outcome = ifelse(sig != '*', NA, outcome),
    across(where(is.numeric), ~ format(round(.x, 3), nsmall = 3))
  )
sar_model_outputs

(n_metrics <- nrow(sar_model_outputs))

# See what we lost
(lost <- setdiff(sar_raw_outputs$variable_name, sar_model_outputs$variable_name))
(n_lost <- length(lost))
(print_lost <- lost %>% paste0(collapse = ', '))

# proportion getting better or worse
(tab <- get_table(sar_model_outputs$outcome))
(percs <- round(prop.table(tab), 3) * 100)

We have 29 metrics modeled. 48.3% are getting better, 31% are getting worse, and 20.7% are stable. We drop 3 metrics here that don’t have proper directions (vacancyRate, populationChangePerc, annualPrecipMM).

Code
sar_framed_outputs <- data_paper_meta %>% 
  select(dimension, index, indicator, metric, variable_name) %>% 
  right_join(sar_model_outputs, by = join_by(variable_name))

## Save some outputs for paper
# Raw csv for reference
write_csv(
  sar_framed_outputs,
  'outputs/errorsar_reg_outputs.csv'
)

## Create interactive table for Quarto
get_reactable(
  sar_framed_outputs, 
  defaultPageSize = 10,
  columns = list(
   variable_name = colDef(minWidth = 200),
   sig = colDef(minWidth = 50),
   reverse = colDef(minWidth = 75),
   statistic = colDef(minWidth = 75),
   outcome = colDef(minWidth = 75)
  )
)

3.2.2 LaTeX Table

Saving table to \(\LaTeX\) for paper:

Code
get_str(sar_framed_outputs)
tibble [29 × 15] (S3: tbl_df/tbl/data.frame)
 $ dimension    : chr [1:29] "economics" "economics" "economics" "economics" "..
 $ index        : chr [1:29] "community economy" "community economy" "communi"..
 $ indicator    : chr [1:29] "access to land" "business failure rate" "wealth"..
 $ metric       : chr [1:29] "Land and building value per acre" "Change in ag"..
 $ variable_name: chr [1:29] "landValPerAcre" "otyAnnualAvgEstabsCountPctChgN"..
 $ estimate     : chr [1:29] " 0.009" " 0.046" " 1.207" " 0.464" " 0.069" " 1"..
 $ std.error    : chr [1:29] "0.002" "0.010" "0.042" "0.112" "0.017" "0.078" "..
 $ statistic    : chr [1:29] "  6.193" "  4.532" " 28.711" "  4.142" "  4.084"..
 $ p.value      : chr [1:29] "0.000" "0.000" "0.000" "0.000" "0.000" "0.000" "..
 $ conf_low     : chr [1:29] " 0.006" " 0.026" " 1.125" " 0.244" " 0.036" " 0"..
 $ conf_high    : chr [1:29] " 0.012" " 0.066" " 1.290" " 0.683" " 0.103" " 1"..
 $ sig          : chr [1:29] "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "..
 $ trend        : chr [1:29] "increasing" "increasing" "increasing" "increasi"..
 $ reverse      : logi [1:29] FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRU..
 $ outcome      : chr [1:29] "better" "better" "better" "worse" "worse" "wors"..
Code
# Reduce to specific columns
latex_df <- sar_framed_outputs %>% 
  select(metric, estimate:p.value, sig, outcome) %>% 
  mutate(p.value = ifelse(sig == '*', paste0(p.value, '*'), p.value)) %>% 
  select(-sig) %>% 
  setNames(c(names(.) %>% snakecase::to_title_case() %>% str_remove(' Value')))
get_str(latex_df)
tibble [29 × 6] (S3: tbl_df/tbl/data.frame)
 $ Metric   : chr [1:29] "Land and building value per acre" "Change in agricu"..
 $ Estimate : chr [1:29] " 0.009" " 0.046" " 1.207" " 0.464" " 0.069" " 1.074"..
 $ Std Error: chr [1:29] "0.002" "0.010" "0.042" "0.112" "0.017" "0.078" "0.0"..
 $ Statistic: chr [1:29] "  6.193" "  4.532" " 28.711" "  4.142" "  4.084" " "..
 $ P        : chr [1:29] "0.000*" "0.000*" "0.000*" "0.000*" "0.000*" "0.000*"..
 $ Outcome  : chr [1:29] "better" "better" "better" "worse" "worse" "worse" ""..
Code
# Save to latex
latex_df %>% 
  kbl(
    format = 'latex',
    caption = 'Coefficients for spatial error models of year on metric',
    label = 'tab_errorsarlm',
    booktabs = TRUE
  ) %>% 
  kable_styling(
    font_size = 9
  ) %>% 
  save_kable(
    file = 'outputs/tab_errorsarlm.tex'
  )

3.2.3 Figure

Code
errorsar_figure <- sar_framed_outputs %>% 
  mutate(
    across(c(estimate:conf_high), as.numeric),
    outcome = ifelse(is.na(outcome), 'no change', outcome) %>% 
      snakecase::to_title_case()
  ) %>% 
  filter(str_detect(metric, 'easements|wages', negate = TRUE)) %>%
  ggplot(aes(x = estimate, y = reorder(metric, estimate))) +
  geom_point(aes(color = outcome)) +
  geom_errorbarh(aes(xmin = conf_low, xmax = conf_high, color = outcome)) +
  geom_vline(xintercept = 0) +
  scale_color_manual(values = c('darkgreen', 'grey', 'purple')) +
  theme_minimal() +
  labs(
    x = 'Average Relative Percent Change per Year',
    y = 'Metric',
    color = 'Outcome'
  ) + 
  scale_x_continuous(n.breaks = 8)
ggsave(
  'outputs/fig_errorsarlm.png',
  plot = errorsar_figure,
  width = 8,
  height = 5,
  units = 'in',
  bg = 'white'
)
errorsar_figure

4 Spatial Panels

4.1 Test Run

Trying the splm package for spatially weighted panel models. This is not going well, though: there is so much missing data that it is very difficult to get a balanced panel.
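
Before fitting anything, we can quantify the problem. Here is a sketch of per-variable panel coverage, i.e. the share of counties observed in every year, assuming the long county data and series_vars from above:

Code
# Share of counties with a complete panel, per series variable
panel_coverage <- map(datasets$series_vars, ~ {
  df <- datasets$dp_metrics_county %>%
    filter(variable_name == .x) %>%
    drop_na(value)
  n_years <- n_distinct(df$year)
  df %>%
    count(fips) %>%
    summarize(variable_name = .x, prop_balanced = mean(n == n_years))
}) %>%
  list_rbind() %>%
  arrange(prop_balanced)
panel_coverage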

Code
get_str(datasets$dp_metrics_county)

dat <- datasets$dp_metrics_county %>% 
  pivot_wider(
    id_cols = fips:year,
    names_from = variable_name,
    values_from = value
  )
get_str(dat)

# Reduce to gini, drop CT, and make our df a plm object
df <- dat %>% 
  select(fips, year, gini) %>%
  filter(str_detect(fips, '^09', negate = TRUE)) %>%
  na.omit() %>%
  plm::pdata.frame()
get_str(df)

df %>% 
  group_by(year, fips) %>% 
  summarize(n()) %>% 
  arrange(year, fips) %>% 
  print(n = 500)

# Get balanced fips
all_years <- sort(unique(df$year))
balanced_fips <- df %>%
  group_by(fips) %>%
  filter(setequal(sort(year), all_years)) %>%
  pull(fips) %>%
  unique()
df <- df %>%
  filter(fips %in% balanced_fips)

# Get our listw based on counties
lw <- neast_counties_2024 %>% 
  filter(fips %in% df$fips) %>% 
  poly2nb() %>% 
  nb2listw(zero.policy = TRUE)

# Pooling model
pool <- spml(
  gini ~ as.numeric(year),
  data = df, 
  listw = lw,
  model = 'pooling'
)
summary(pool)

# Try FE error model
fe_err <- spml(
  gini ~ as.numeric(year),
  data = df, 
  listw = lw,
  model = 'within'
)
summary(fe_err)

# RE error model
re_err <- spml(
  gini ~ as.numeric(year), 
  data = df, 
  listw = lw,
  model = 'random'
)
summary(re_err)

# Try lagsarlm
lagsar <- lagsarlm(
  # otyAnnualAvgEstabsCountPctChgNAICS11 ~ as.numeric(year),
  gini ~ as.numeric(year),
  data = df,
  listw = lw
)
summary(lagsar)

# # Check to use RE or FE, see whether x_it related to a_i (idiosyncratic error)
# test <- sphtest(fe_err, re_err)
# test
# # Not sig: use RE, more efficient
# 
# # Check RE output
# summary(re_err)

4.2 All Models

Code
get_str(datasets$dp_county_wide)
dat <- datasets$dp_county_wide %>% 
  select(fips, year, any_of(datasets$series_vars))
get_str(dat)

# Get vector of vars
vars <- names(dat)[!names(dat) %in% c('fips', 'year')]

# Map over each var: balance the panel, build spatial weights, run pooled model
out <- map(vars, \(var) {
  
  cat('\n\nStarting', var)
  
  tryCatch({
    # Reduce to relevant variable, remove NAs
    df <- dat %>% 
      select(fips, year, !!sym(var)) %>% 
      filter(str_detect(fips, '^09', negate = TRUE)) %>% 
      na.omit()
    
    # 1. Only use fips that produce balanced panel
    balanced_fips <- df %>%
      group_by(fips) %>%
      filter(n() == length(unique(df$year))) %>%
      pull(fips) %>%
      unique()

    df <- df %>%
      filter(fips %in% balanced_fips)

    # 2. Create lw object
    lw <- neast_counties_2024 %>%
      filter(fips %in% balanced_fips) %>%
      poly2nb() %>%
      nb2listw(zero.policy = TRUE)
    
    cat('\nCreated lw')
    
    # Make df a plm object
    df <- plm::pdata.frame(df)
    
    # 3. Run pooled panel model
    formula <- as.formula(paste(var, "~ as.numeric(year)"))
    model <- spml(
      formula,
      data = df,
      listw = lw,
      index = c('fips', 'year'),
      model = 'pooling',
      spatial.error = 'b'
      # lag = TRUE
    )
   
    cat('\nRan model')
   
    return(list('model' = model, 'fips' = balanced_fips))
    },
    error = function(e) {
      return(paste0('Error with ', var, ': ', e))
    }
  )
  
}, .progress = FALSE) %>% 
 setNames(c(vars))
out
get_str(out)
get_str(out[[1]])
summary(out[[1]]$model)
map(out, ~ .x['fips'])
map(out, ~ summary(.x[[1]]))

# Save for posterity
saveRDS(out, 'data/objects/spatial_regression_outs.rds')

4.3 All Models Weighted

Code
# TODO: fix this. we broke it here somewhere.
get_str(datasets$dp_county_wide)
dat <- datasets$dp_county_wide %>% 
  select(fips, year, any_of(datasets$series_vars))
get_str(dat)

# Get vector of vars
vars <- names(dat)[!names(dat) %in% c('fips', 'year')]


## Normalization
get_str(dat)
dat <- dat %>% 
  mutate(across(all_of(vars), ~ min_max(.x)))
get_str(dat)  
map(dat, ~ range(.x, na.rm = TRUE))

 
## Regressions
out <- map(vars, \(var) {
  cat('\n\nStarting', var)
  
  tryCatch({
    # Reduce to relevant variable, remove NAs
    df <- dat %>% 
      select(fips, year, !!sym(var)) %>% 
      filter(str_detect(fips, '^09', negate = TRUE)) %>% 
      na.omit()
    
    ## Add weights
    weight_var <- weights_crosswalk$weight_variable[weights_crosswalk$variable_name == var]
    weight_df <- datasets$dp_weights
    if (length(weight_var) > 0 && !is.na(weight_var)) {
      weight_df <- weight_df %>%
        filter(variable_name == weight_var)
    }
    weight_df <- weight_df %>%
      mutate(
        value = as.numeric(value),
        year = as.numeric(year)
      ) %>%
      rename(
        weight_value = value,
        weight_name = variable_name,
        weight_year = year
      )
    # Do join depending on whether weight var has a year
    if (all(is.na(weight_df$weight_year))) {
      df <- df %>%
        inner_join(weight_df, by = join_by(fips))
    } else {
      df <- df %>%
        inner_join(weight_df, by = join_by(fips, year == weight_year))
    }
    # Weight the variable appropriately
    var_weighted <- paste0(var, '_weighted')
    df <- df %>%
      mutate(!!sym(var_weighted) := !!sym(var) * sqrt(weight_value)) %>% 
      drop_na(!!sym(var_weighted))
    
    ##  Only use fips that produce balanced panel
    balanced_fips <- df %>%
      group_by(fips) %>%
      filter(n() == length(unique(df$year))) %>%
      pull(fips) %>%
      unique()
    df <- df %>%
      filter(fips %in% balanced_fips)
    
    ## Create lw object
    lw <- neast_counties_2024 %>%
      filter(fips %in% balanced_fips) %>%
      poly2nb() %>%
      nb2listw(zero.policy = TRUE)
    cat('\nCreated lw')
    
    ## Run pooled panel model
    df <- plm::pdata.frame(df)
    formula <- as.formula(paste(var_weighted, "~ as.numeric(year)"))
    model <- spml(
      formula,
      data = df,
      listw = lw,
      index = c('fips', 'year'),
      model = 'pooling'
    )
   
    cat('\nRan model')
   
    return(list('model' = model, 'fips' = balanced_fips))
    },
    error = function(e) {
      return(paste0('Error with ', var, ': ', e))
    }
  )
}, .progress = FALSE) %>% 
 setNames(c(vars))
out
out[[2]]
out$cropDiversity # check for smoothed weighting var
get_str(out)
get_str(out[[1]])
summary(out$gini$model)
map(out, ~ .x['fips'])
map(out, ~ summary(.x[[1]]))
# Getting errors because there are NO balanced fips

# Save for posterity
saveRDS(out, 'data/objects/spatial_panel_outs.rds')

Check how they turned out:

Code
out <- readRDS('data/objects/spatial_panel_outs.rds')
get_str(out)

# How many were successful
(perc_success <- mean(map_lgl(out, is.list)) * 100)

# Check out errors only
errors <- out %>% 
  keep(is.character)
errors
(error_vars <- names(errors))

8 models failed: otyAnnualAvgEstabsCountPctChgNAICS11, otyAnnualAvgWklyWagePctChgNAICS11, vacancyRate, voterTurnout, plantRar, incomeAgTourismRecPropTotal, retailSalesPropWholesale, mobilityDisability.

4.4 Results

Check out the successful models:

4.4.1 Table

Code
model_outs <- out %>% 
  keep(is.list)

# Make a table
spatial_table <- imap(model_outs, ~ {
  tryCatch({
    
    # Pull model output results and coefficients
    sum <- summary(.x[[1]])
    coefs <- sum$CoefTable[2, ]
    
    # Add confidence intervals
    conf_ints <- confint(.x[[1]])[2, ] %>% 
      as.vector()
  
    df <- data.frame(
      n = length(.x$fips),
      # type = .x[[1]]$type,
      var = .y,
      # phi = sum$errcomp[1],
      rho = sum$ErrCompTable[[1]],
      est = coefs[1],
      se = coefs[2],
      t = coefs[3],
      ci_low = conf_ints[1],
      ci_high = conf_ints[2],
      p = coefs[4]
    ) 
    
    # Figure out good or bad outcome
    # If sig, then check reverse. else blank
    # If reverse, reverse it, else same
    # If up, good, else bad
    df <- df %>% 
      mutate(
        sig = ifelse(p < 0.05, TRUE, FALSE),
        posneg = case_when(
          est > 0 ~ 'pos',
          est < 0 ~ 'neg',
          .default = ''
        ),
        reverse = ifelse(.y %in% reverse, TRUE, FALSE),
        direction = case_when(
          sig == TRUE & reverse == TRUE & posneg == 'pos' ~ 'neg',
          sig == TRUE & reverse == TRUE & posneg == 'neg' ~ 'pos',
          sig == TRUE & reverse == FALSE ~ as.character(posneg),
          .default = ''
        ),
        outcome = case_when(
          sig == FALSE ~ '',
          sig == TRUE & direction == 'pos' ~ 'better',
          sig == TRUE & direction == 'neg' ~ 'worse',
          .default = '-'
        )
      ) %>% 
      select(-c(posneg, direction))
    return(df)
  },
    error = function(e) {
      return(paste('Error:', e))
    }
  )
}) %>% 
  bind_rows()
# get_str(spatial_table)

## Round off and make table for Quarto
spatial_table %>% 
  mutate(
    across(c(rho:p), ~ format(round(.x, 3), nsmall = 3))
  ) %>% 
  get_reactable(
    defaultColDef = colDef(minWidth = 50),
    columns = list(
      n = colDef(minWidth = 50),
      var = colDef(minWidth = 200),
      est = colDef(
        minWidth = 100,
        name = 'β'
      ),
      outcome = colDef(minWidth = 75),
      rho = colDef(name = 'ρ'),
      p = colDef(name = 'p-value', minWidth = 100)
    )
  )

The issue here is that we have to drop many counties for some variables. The n column shows the number of counties included in each regression; some are well below half of the total, and this may not be a great way to assess change in those cases. A quick retention check follows the column key below.

  • n: Number of counties included in regression
  • var: Metric
  • \(\rho\): Spatial autocorrelation in residual errors (omitted variables)
  • \(\beta\): Regression coefficient of year on the variable
  • se: standard error of \(\beta\)
  • t: test statistic
  • p-value: p-value
  • sig: whether p is < 0.05
  • reverse: whether the metric has been reversed so that lower numbers are better and higher numbers are worse (like food insecurity)
  • outcome: if the regression is significant, is it getting better or worse? Accounts for the ‘reverse’ variable
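
To make the retention issue concrete, here is a quick sketch comparing each model’s n to the total number of counties, assuming neast_counties_2024 holds one row per county and excluding CT as the models do:

Code
# Share of counties retained per model
n_total <- neast_counties_2024 %>%
  filter(str_detect(fips, '^09', negate = TRUE)) %>%
  nrow()
spatial_table %>%
  mutate(prop_counties = round(n / n_total, 2)) %>%
  select(var, n, prop_counties) %>%
  arrange(prop_counties)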

4.4.2 Figure

Code
# get_str(spatial_framed_outputs)
# get_str(spatial_table)
spatial_framed_outputs <- data_paper_meta %>% 
  select(dimension, index, indicator, metric, variable_name) %>% 
  right_join(spatial_table, by = join_by(variable_name == var))
# get_str(spatial_framed_outputs)

spatial_figure <- spatial_framed_outputs %>% 
  mutate(
    across(c(est, starts_with('ci')), as.numeric),
    outcome = case_when(
      outcome == '' ~ 'no change',
      .default = outcome
    ) %>% 
      snakecase::to_title_case()
  ) %>% 
  ggplot(aes(x = est, y = reorder(metric, est))) +
  geom_point(aes(color = outcome)) +
  geom_errorbarh(aes(xmin = ci_low, xmax = ci_high, color = outcome)) +
  geom_vline(xintercept = 0) +
  scale_color_manual(values = c('darkgreen', 'grey', 'purple')) +
  theme_minimal() +
  labs(
    x = 'Average Relative Weighted Percent Change per Year',
    y = 'Metric',
    color = 'Outcome'
  )
ggsave(
  'outputs/fig_spml.png',
  plot = spatial_figure,
  width = 8,
  height = 5,
  units = 'in',
  bg = 'white'
)
spatial_figure
