Metric Trends

Caution

This is a mildly slapdash page working out spatial regressions for time series metrics from the refined framework variables. There are some cleaning issues to address with some variables, and the scripts are wildly disorganized.

Code
options(scipen = 999)

pacman::p_load(
  dplyr,
  htmltools,
  stringr,
  tidyr,
  reactable,
  purrr,
  broom,
  caret,
  knitr,
  tictoc,
  furrr,
  parallelly,
  splm,
  spdep,
  sf
)


# Functions
source('dev/data_pipeline_functions.R')
source('dev/get_reactable.R')
source('dev/filter_fips.R')

conflicts_prefer(
  dplyr::select(),
  dplyr::filter(),
  dplyr::pull(),
  stats::lag(),
  base::setdiff(),
  .quiet = TRUE
)
Code
# Load metrics
sm_data <- readRDS('data/sm_data.rds')
metrics <- sm_data$metrics
meta <- sm_data$metadata

# Load frame
frame <- readRDS('data/frameworks/new_frame.rds')

This page explores what data we have available at the county level specifically. It is branching off of the main “Analysis” pages because they are state-level analyses and I want to leave it there for posterity.

Note that there are some cleaning issues with this dataset, and the scripts are wildly disorganized. The important thing is that we figured out how to iterate through a few dozen variables and run spatial error models at the county level to see if they are getting better or worse over time.

1 Explore Metrics and Resolution

Check how many metrics we have:

Code
vars <- frame %>% 
  filter(variable_name != 'NONE') %>% 
  pull(variable_name)
count <- length(vars)

We have 130 metrics overall

Check which metrics are available at state vs county levels.

Code
get_str(metrics)

# Filter to vars, remove state fips codes
var_metrics <- metrics %>%
  filter(
    variable_name %in% vars
  )
get_str(var_metrics)
length(unique(var_metrics$variable_name))

# Check how many are left
out <- var_metrics %>% 
  filter(str_length(fips) == 5) %>% 
  pull(variable_name) %>% 
  unique() %>% 
  length()
out
perc_county <- out / length(unique(var_metrics$variable_name))
# 73 (58%) are at the county level. Better

79 (60.8%) are available at county level.

Make a table to explore what we have at each level:

Code
## What do we ONLY have at the state level?
county_metrics <- var_metrics %>% 
  filter(str_length(fips) == 5) %>% 
  pull(variable_name) %>%
  unique()
state_metrics <- var_metrics %>% 
  filter(str_length(fips) == 2) %>% 
  pull(variable_name) %>%
  unique()
(state_only <- setdiff(state_metrics, county_metrics))
length(state_only)

# Back to frame for context
state_only_frame <- frame %>% 
  filter(variable_name %in% state_only)
state_only_frame


## Get the ones that are available at county level
get_str(frame)
county_df <- frame %>% 
  filter(
    !variable_name %in% state_only_frame$variable_name,
    variable_name != 'NONE'
  )
get_str(county_df)

# Save just the county level variables here for analyses
county_vars <- county_df$variable_name

# We actually want a DF with a column that specifies the difference
df <- frame %>% 
  filter(variable_name != 'NONE') %>% 
  mutate(has_county = case_when(
    variable_name %in% county_df$variable_name ~ TRUE,
    .default = FALSE
  ))
get_str(df)

# Save this somewhere for some reason
write.csv(df, 'data/frontiers/metric_resolution.csv')
Code
get_reactable(df, defaultPageSize = 5)

3 Moran’s Tests

Caution

Turns out this whole section is moot. Moran test and spatialreg packages can only do cross-sections. So just skip down to the Spatial Regressions section where we do it with spml instead.

Moran’s test for spatial autocorrelation, determines whether we need to run spatial regressions. Turns out this is a bit hinky with spotty data. Starting with a single run on a single year to see how it works:

Code
get_str(series_metrics)
get_str(refined_vars)

# get single metric
dat <- series_metrics %>% 
  filter(
    str_length(fips) == 5,
    variable_name == 'gini'
  ) %>% 
  pivot_wider(
    id_cols = fips:year,
    names_from = variable_name,
    values_from = value
  ) %>% 
  mutate(across(c(year, gini), as.numeric))
get_str(dat)

## Check how many counties are in each year
dat %>% 
  group_by(year) %>% 
  summarize(count = n())
# they switch in 2022. Let's just take up through 2021 to make this work for now 
# OLS model

# Pull county polygons 2024
counties <- readRDS('data/sm_spatial.rds')[['ne_counties_2021']]
counties

# Combine data with polygons in one file
dat_sf <- dat %>% 
  filter(year < 2022) %>% 
  left_join(counties, by = 'fips')
get_str(dat_sf)

# Get neighbor list from counties
nb <- poly2nb(counties)
nb
class(nb)
summary(nb)


## Check Moran's stat for each year?
lw <- nb2listw(nb)

# Moran test for each year
years <- sort(unique(dat_sf$year))
results <- map(years, ~ {
  dat_sf %>% 
    filter(year == .x) %>% 
    pull(gini) %>% 
    moran.test(lw)
})
get_str(results[[1]])
any(map_dbl(results, ~ .x$p.value) < 0.05)
# No spatial autocorrelation here I guess?

Looks like it works, also looks like there is no spatial correlation within this variable in this year.

Try running Moran test for each variable in each year systematically:

Code
county_metrics <- series_metrics %>% 
  filter(
    str_length(fips) == 5,
    !variable_name %in% c('population')
  )
get_str(county_metrics)

# Get wide(r) df of all metrics
dat <- county_metrics %>% 
  pivot_wider(
    id_cols = fips:year,
    names_from = variable_name,
    values_from = value
  ) %>% 
  mutate(across(!fips, as.numeric))
get_str(dat)

# Check how many counties are in each year
dat %>% 
  group_by(year) %>% 
  summarize(count = n())
# 2022 has some old and some new CT counties! Sheesh

# Let's pull 2021 counties, filter to only those in the data
counties <- readRDS('data/sm_spatial.rds')[['ne_counties_2021']]
counties

# Combine county spatial files with data, which will also filter to old counties
dat <- left_join(dat, counties) %>% 
  select(-aland, -awater)
get_str(dat)


## For each variable, for each year: run moran test
(vars <- names(dat)[!names(dat) %in% c(
  'fips', 
  'year', 
  'geometry', 
  'womenEarnPercMenFPS',
  'expHiredLaborPercOpExp'
)])

get_time()
plan(multisession, workers = availableCores(omit = 2))
out <- future_map(vars, \(var) {
  
  # Get unique years and fips for that var
  pars <- dat %>% 
    select(fips, year, !!sym(var), geometry) %>% 
    na.omit() %>% 
    select(year, fips)
  
  # Single out unique years, we use fips later
  years <- sort(unique(pars[['year']]))
    
  # For each year, get counties, lw, and moran test
  year_results <- map(years, \(yr) {
    # Get fips in that year
    fips_subset <- pars %>% 
      filter(year == yr) %>% 
      pull(fips)
    
    # Use fips to get county subset for that year
    county_subset <- dat %>% 
      filter(
        year == yr,
        fips %in% fips_subset
      ) %>% 
      st_as_sf() %>% 
      st_make_valid() %>% 
      filter(!st_is_empty(geometry))
    
    # Now we can get lw from counties that year
    tryCatch({
      lw <- nb2listw(poly2nb(county_subset))
    }, error = function(e) {
      print(e)
      return(paste('Error in poly2nb:', e$message))
    })
    
    # FINALLY get Moran test for that var in that year
    # Catch errors because of missing values...
    tryCatch({
      test <- county_subset %>% 
        filter(year == yr) %>% 
        pull(!!sym(var)) %>% 
        moran.test(lw)
    }, error = function(e) {
      print(paste0('Error in ', var, ', year ', yr, e))
      return(paste('Error in Moran:', e$message))
    })
  }) %>% 
    setNames(c(paste0('y', years)))
}, .progress = TRUE) %>% 
  setNames(c(vars))
plan(sequential)

get_str(out)
# saveRDS(out, 'data/objects/moran_outs.rds')

Yeesh, that was hairy. Let’s explore those results now:

Code
out <- readRDS('data/objects/moran_outs.rds')
get_str(out)
get_str(out[[1]])

# Pull out stat and p value for each set of results
df_list <- map(out, \(var) {
  imap(var, \(year, yr_string) {
    if (class(year) == 'htest') {
      data.frame(
        'year' = yr_string,
        'moran' = year$statistic,
        'p' = year$p.value
      )
    } 
  }) %>% 
    bind_rows()
}) %>% 
  imap_dfr(~ mutate(.x, var = .y))
rownames(df_list) <- NULL
get_str(df_list)

# Make a nicer table, rounding off, remove the ys, remove rownames
tab <- df_list %>% 
  mutate(
    year = str_remove(year, 'y')
  )
tab  

# Summary stats for each var
sum <- tab %>% 
  group_by(var) %>% 
  summarize(prop_sig = mean(p < 0.05)) %>% 
  arrange(desc(prop_sig)) %>% 
  mutate(prop_sig = format(round(prop_sig, 3), nsmall = 3))
Code
get_reactable(sum)

prop_sig is the proportion of years within that variable that are significantly spatially correlated. Looks like many variables are spatially autocorrelated, but not all. Wonder whether it’s worth running spatial regressions on all of them, or only the variables (and years) that are significantly spatially correlated.

Problems with Moran’s test (and spatial regressions generally) right now:

  • County arrangements change between years. Newer variables might be on Connecticut’s governance regions, giving us 68 counties in New England rather than 67. The switch happened in 2022, but not every metric reflects this shift at the same time! Isn’t that fun
  • Still some issues with different object lengths between spatial list weights and datasets in the moran tests
  • Would probably be better off running Moran Monte Carlos instead of Moran tests, but that is going to up the run time substantially and I don’t think it matters all that much here.

4 Spatial Panel Regressions

Run spatial regression for all vars. Might be worth looking into whether there is a version of Moral test that can be run across years. Until then, just running each one with spatial error model.

4.1 Try it once

Try splm on just one variable first:

Code
get_str(series_metrics)

dat <- series_metrics %>% 
  filter(
    variable_name != 'population',
    str_length(fips) == 5
  ) %>% 
  pivot_wider(
    id_cols = fips:year,
    names_from = variable_name,
    values_from = value
  )

# Get rid of geometry, make our df a plm object
df <- dat %>% 
  select(fips, year, rentMedianPercHH) %>% 
  filter(str_detect(fips, '^09', negate = TRUE)) %>%
  na.omit() %>% 
  mutate(rentMedianPercHH = as.numeric(rentMedianPercHH)) %>% 
  plm::pdata.frame()
get_str(df)

# Get our listw based on counties kept (not CT)
lw <- counties %>% 
  filter(fips %in% df$fips) %>% 
  poly2nb() %>% 
  nb2listw(zero.policy = TRUE)

# Try FE error model with rentMedianPercHH
fe_err <- spml(
  rentMedianPercHH ~ as.numeric(year), 
  data = df, 
  listw = lw,
  model = 'within'
)
summary(fe_err)

# RE error model
re_err <- spml(
  rentMedianPercHH ~ as.numeric(year), 
  data = df, 
  listw = lw,
  model = 'random'
)
summary(re_err)

# Check to use RE or FE, see whether x_it related to a_i (idiosyncratic error)
test <- sphtest(fe_err, re_err)
test
# Not sig: use RE, more efficient

# Check RE output
summary(re_err)

4.2 Do it for all vars

So the workflow for each variable is:

  1. Make sure panel is balanced, i.e. see if Connecticut will be a problem. Adjust accordingly.
  2. Create list weight object based on the counties that are represented by in the variable. I think we have to do this for each variable individually… should double check though.
  3. Run FE and RE model with year as numeric.
  4. Hausman test with FE and RE model. If sig, report FE, if not, report RE

Try it:

Code
get_str(dat)

# Make variables numeric
dat <- dat %>% 
  mutate(across(3:last_col(), as.numeric))
get_str(dat)

# Get vector of vars
vars <- names(dat)[!names(dat) %in% c('fips', 'year')]

# Map over each var and do steps 1 through 5
plan(multisession, cores = availableCores(omit = 2))
out <- future_map(vars, \(var) {
  
  cat('\n\nStarting', var)
  
  tryCatch({
    # Reduce to relevant variable, remove NAs
    df <- dat %>% 
      select(fips, year, !!sym(var)) %>% 
      drop_na()
    
    # 1. Only use fips that produce balanced panel
    balanced_fips <- df %>% 
      group_by(fips) %>% 
      filter(n() == length(unique(df$year))) %>% 
      pull(fips) %>% 
      unique()
    
    df <- df %>% 
      filter(fips %in% balanced_fips)
    
    # 2. Create lw object
    lw <- counties %>% 
      filter(fips %in% balanced_fips) %>% 
      # filter(fips %in% df$fips) %>% 
      poly2nb() %>% 
      nb2listw(zero.policy = TRUE)
    cat('\nCreated lw')
    
    # Make df a plm object
    df <- plm::pdata.frame(df)
    
    # 3. Run FE and RE
    formula <- as.formula(paste(var, "~ as.numeric(year)"))
    fe <- spml(
      formula,
      data = df, 
      listw = lw,
      model = 'within'
    )
    re <- spml(
      formula,
      data = df, 
      listw = lw,
      model = 'random'
    )
    cat('\nRan models')
    
    # 4. Hausman test. If sig, use FE
    test <- sphtest(fe, re)
    if (test$p.value < 0.05) {
      model <- fe
    } else {
      model <- re
    }
    cat('\nRan Hausman')
    
    return(list('model' = model, 'fips' = balanced_fips))
      
    },
    error = function(e) {
      return(paste0('Error with ', var, ': ', e))
    }
  )
  
}, .progress = FALSE) %>% 
 setNames(c(vars))
plan(sequential)

# Save for posterity
saveRDS(out, 'data/objects/spatial_regression_outs.rds')
Code
out <- readRDS('data/objects/spatial_regression_outs.rds')

# How many were successful
perc_success <- mean(map(out, class) == 'list') * 100

get_str(out[[1]])
get_str(out)
out

# Check out errors only
errors <- out %>% 
  keep(is.character)
errors
error_vars <- names(errors)

92.1% of regressions were successful. The last few were some fucked up metrics for one reason or another. They include: infantMortality, disconnectedYouth, voterTurnout, expHiredLaborPercOpExp, salesCrop.

Check out the successful models:

Code
model_outs <- out %>% 
  keep(is.list)
# get_str(model_outs)
# get_str(model_outs[[1]])
# get_str(model_outs[[1]][[1]])

# Check model type
# model_outs[[1]][[1]]$type
# table(map_chr(model_outs, ~ .x[[1]]$type))
# They are all RE models - no FE


## Test out outcomes
# sum <- summary(model_outs[[1]][[1]])
# coefs <- sum$CoefTable[2, ]

# Make a table
spatial_table <- imap(model_outs, ~ {
  tryCatch({
    
    # Pull model output results and coefficients
    sum <- summary(.x[[1]])
    coefs <- sum$CoefTable[2, ]
    
    df <- data.frame(
      n = length(.x$fips),
      type = .x[[1]]$type,
      var = .y,
      phi = sum$errcomp[1],
      rho = sum$errcomp[2],
      est = coefs[1],
      se = coefs[2],
      t = coefs[3],
      p = coefs[4]
    ) 
    
    # Figure out good or bad outcome
    # If sig, then check reverse. else blank
    # If reverse, reverse it, else same
    # If up, good, else bad
    df <- df %>% 
      mutate(
        sig = ifelse(p < 0.05, TRUE, FALSE),
        posneg = case_when(
          est > 0 ~ 'pos',
          est < 0 ~ 'neg',
          .default = ''
        ),
        reverse = ifelse(.y %in% reverse, TRUE, FALSE),
        direction = case_when(
          sig == TRUE & reverse == TRUE & posneg == 'pos' ~ 'neg',
          sig == TRUE & reverse == TRUE & posneg == 'neg' ~ 'pos',
          sig == TRUE & reverse == FALSE ~ as.character(posneg),
          .default = ''
        ),
        outcome = case_when(
          sig == FALSE ~ '',
          sig == TRUE & direction == 'pos' ~ 'better',
          sig == TRUE & direction == 'neg' ~ 'worse',
          .default = '-'
        )
      ) %>% 
      select(-c(posneg, direction))
    return(df)
  },
    error = function(e) {
      return(paste('Error:', e))
    }
  )
}) %>% 
  bind_rows()
# get_str(spatial_table)

# Round off and make table
spatial_table %>% 
  mutate(
    across(c(phi:p), ~ format(round(.x, 3), nsmall = 3)),
    type = case_when(
      str_detect(type, 'random') ~ 'RE',
      .default = NA
    )
  ) %>% 
  get_reactable(
    defaultColDef = colDef(minWidth = 50),
    columns = list(
      n = colDef(minWidth = 25),
      var = colDef(minWidth = 200),
      est = colDef(
        minWidth = 100,
        name = 'β'
      ),
      outcome = colDef(minWidth = 75),
      rho = colDef(name = 'ρ'),
      phi = colDef(name = 'φ'),
      p = colDef(name = 'p-value'),
      'p-value' = colDef(minWidth = 100)
    )
  )
  • n: Number of counties included in regression
  • type: FE = fixed effects (every county has \(\beta\)), RE = random effects (counties have different \(\beta\)s)
  • var: Metric
  • \(\phi\): Spatial autocorrelation in idiosyncratic attributes (counties)
  • \(\rho\): Spatial autocorrelation in residual errors (omitted variables)
  • \(\beta\): Regression coefficient of year on the variable
  • se: standard error of \(\beta\)
  • t: test statistic
  • p-value: p-value
  • sig: whether p is < 0.05
  • reverse: whether the metric has been reversed so that lower numbers are better and higher numbers are worse (like food insecurity)
  • outcome: if the regression is significant, is it getting better or worse? Accounts for the ‘reverse’ variable
Back to top