Validation

1 Introduction

Here we take the scores built from raw values with min-max normalization and geometric aggregation, and see how they hold up to validation against external variables and a PCA.
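As a reminder of what those scores are, here is a minimal sketch of the aggregation idea with made-up numbers: indicators are min-max scaled to [0, 1] and then combined within a dimension by geometric mean. The actual scores are computed elsewhere and simply loaded below; the toy data frame and column names here are purely illustrative.

Code
library(dplyr)

# Toy numbers standing in for raw indicator values (purely illustrative)
toy <- tibble(
  state = c('VT', 'NH', 'ME'),
  indic_a = c(10, 25, 40),
  indic_b = c(0.2, 0.5, 0.9)
)

min_max  <- function(x) (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
geo_mean <- function(x) exp(mean(log(x), na.rm = TRUE))

toy %>% 
  mutate(across(starts_with('indic'), min_max)) %>%                 # min-max to [0, 1]
  rowwise() %>% 
  mutate(dimen_toy = geo_mean(c_across(starts_with('indic')))) %>%  # geometric aggregation
  ungroup()
# Note: an indicator that lands exactly at 0 after min-max scaling zeroes out
# the geometric mean, so the real pipeline presumably rescales to avoid that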

External variables:

  • Food Security Index (Feeding America, Map the Meal Gap)
  • Life expectancy, or premature age-adjusted mortality (UW County Health Rankings)
  • Food Environment Index (UW County Health Rankings)
  • Happiness Score (WalletHub - if anyone knows of a better metric for this, I’m all ears)
Code
# Load sm_data
sm_data <- readRDS('data/sm_data.rds')

# Load state fips key to join other datasets
state_key <- sm_data[['state_key']] %>% 
  select(state, state_code)

# Load cleaned aggregated data for all levels of regression
raw_minmax_geo <- readRDS('data/raw_minmax_geo.rds')
get_str(raw_minmax_geo)

# Reduce to just dimension scores, and remove prefix
dimension_scores <- raw_minmax_geo %>% 
  select(state, starts_with('dimen')) %>% 
  setNames(c(str_remove(names(.), 'dimen_')))
get_str(dimension_scores)

# Pull validation variables out of sm_data, wrangle them to match metrics_df
# Also including covariates, gdp and population
validation_vars <- sm_data$metadata %>% 
  select(variable_name, metric, definition, source) %>% 
  filter(variable_name %in% c(
    'foodInsecurity',
    'communityEnvRank',
    'happinessScore',
    'wellbeingRank',
    'workEnvRank',
    'foodEnvironmentIndex',
    'lifeExpectancy',
    'population',
    'gdpCurrent'
  )) %>% 
  pull(variable_name)
validation_vars  
 
# Get subset of metrics for our validation variables, get latest year only
validation_metrics <- sm_data$metrics %>% 
  filter(
    variable_name %in% validation_vars, 
    !is.na(value), 
    str_length(fips) == 2
  ) %>% 
  get_latest_year()
get_str(validation_metrics)
# All are available in 2024

# Pivot wider, also get rid of trailing year
validation_metrics <- validation_metrics %>% 
  pivot_wider(
    id_cols = fips,
    names_from = variable_name,
    values_from = value
  ) %>% 
  setNames(c(str_remove(names(.), '_[0-9]{4}'))) %>% 
  mutate(across(!fips, as.numeric))
get_str(validation_metrics)
# 00 US is missing a lot obviously
# 11 DC is the other one with missing data
# We will just filter down to 50 states to match metrics_df

# Combine validation variables with our dimension scores using state key as the 
# bridge. Also remove DC (don't have validation metrics there)
key <- sm_data$state_key %>% 
  select(state, fips = state_code)
dat <- dimension_scores %>% 
  left_join(key) %>% 
  left_join(validation_metrics) %>% 
  as.data.frame() %>% 
  filter(state != 'DC') %>% 
  select(-fips)

# Make a GDP per capita variable from current-dollar GDP and population
# (gdpCurrent is in millions of dollars; gdp_per_cap comes out in thousands
# of dollars per person)
dat <- dat %>% 
  mutate(gdp_per_cap = ((gdpCurrent / population) * 1e6) / 1000)

# Check it out
get_str(dat)
skimr::skim(dat)
# Looks good

# Save this for other pages
saveRDS(dat, 'data/metrics_df_with_vals_and_covars.rds')

2 Regression

2.1 Food Insecurity

Code
lm1 <- lm(
  foodInsecurity ~ economics + environment + health + production + social,
  data = dat
)
Dependent variable: Food Insecurity Index

|                     | (1) Dimensions Only        | (2) With GDP per Capita    | (3) GDP and Pop. Weights   |
|---------------------|----------------------------|----------------------------|----------------------------|
| Constant            | 0.157*** (0.126, 0.188)    | 0.162*** (0.123, 0.202)    | 0.163*** (0.128, 0.199)    |
| economics           | -0.002 (-0.072, 0.069)     | -0.005 (-0.077, 0.068)     | -0.038 (-0.103, 0.027)     |
| environment         | 0.062 (0.001, 0.122)       | 0.059 (-0.003, 0.121)      | 0.034 (-0.025, 0.094)      |
| health              | -0.185*** (-0.257, -0.112) | -0.175*** (-0.260, -0.089) | -0.187*** (-0.270, -0.103) |
| production          | -0.012 (-0.058, 0.033)     | -0.011 (-0.057, 0.036)     | 0.014 (-0.013, 0.041)      |
| social              | 0.012 (-0.058, 0.083)      | 0.011 (-0.061, 0.083)      | -0.007 (-0.069, 0.055)     |
| gdp_per_cap         |                            | -0.0001 (-0.001, 0.0003)   | 0.0002 (-0.0002, 0.001)    |
| WLS                 | No                         | No                         | Yes                        |
| Robust              | No                         | No                         | No                         |
| Observations        | 50                         | 50                         | 50                         |
| R2                  | 0.546                      | 0.548                      | 0.629                      |
| Adjusted R2         | 0.494                      | 0.485                      | 0.578                      |
| Residual Std. Error | 0.017 (df = 44)            | 0.017 (df = 43)            | 32.875 (df = 43)           |
| F Statistic         | 10.574*** (df = 5; 44)     | 8.679*** (df = 6; 43)      | 12.166*** (df = 6; 43)     |

Note: *p<0.05; **p<0.01; ***p<0.001
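Only the first specification is shown in code; the second and third columns of the table presumably add GDP per capita as a covariate and fit a population-weighted least squares model, roughly along these lines (a sketch against the `dat` object built above; `lm1_gdp` and `lm1_wls` are hypothetical names):

Code
# Column (2): add GDP per capita as a covariate
lm1_gdp <- lm(
  foodInsecurity ~ economics + environment + health + production + social + gdp_per_cap,
  data = dat
)

# Column (3): same predictors, weighted least squares with population weights
lm1_wls <- lm(
  foodInsecurity ~ economics + environment + health + production + social + gdp_per_cap,
  data = dat,
  weights = population
)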
Code
check_model(lm1)

Code
lmtest::bptest(lm1)

    studentized Breusch-Pagan test

data:  lm1
BP = 12.068, df = 5, p-value = 0.03387
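Since the Breusch-Pagan test comes back significant here, heteroskedasticity-robust standard errors are worth a look; a minimal sketch using the same HC3 estimator applied to the life expectancy model below:

Code
library(lmtest)
library(sandwich)

# Coefficient tests for lm1 with HC3 heteroskedasticity-robust standard errors
coeftest(lm1, vcov = vcovHC(lm1, type = 'HC3'))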

2.2 Life Expectancy

Code
lm2 <- lm(
  lifeExpectancy ~ economics + environment + health + production + social,
  data = dat
)
Dependent variable: Life Expectancy

|                     | (1) Dimensions Only         | (2) With GDP per Capita     | (3) GDP and Pop. Weights   |
|---------------------|-----------------------------|-----------------------------|----------------------------|
| Constant            | 71.837*** (70.057, 73.618)  | 69.820*** (67.760, 71.881)  | 67.969*** (65.677, 70.260) |
| economics           | 2.385 (-1.688, 6.457)       | 3.416 (-0.360, 7.191)       | 2.618 (-1.533, 6.769)      |
| environment         | -7.892*** (-11.387, -4.397) | -6.921*** (-10.170, -3.671) | -6.209** (-10.022, -2.396) |
| health              | 19.661*** (15.473, 23.849)  | 15.990*** (11.525, 20.455)  | 8.808** (3.463, 14.152)    |
| production          | 2.012 (-0.601, 4.626)       | 1.301 (-1.127, 3.729)       | 1.689 (-0.060, 3.439)      |
| social              | -2.153 (-6.240, 1.934)      | -1.639 (-5.386, 2.107)      | 2.588 (-1.390, 6.566)      |
| gdp_per_cap         |                             | 0.037** (0.014, 0.060)      | 0.074*** (0.049, 0.100)    |
| WLS                 | No                          | No                          | Yes                        |
| Robust              | No                          | No                          | No                         |
| Observations        | 50                          | 50                          | 50                         |
| R2                  | 0.805                       | 0.841                       | 0.837                      |
| Adjusted R2         | 0.783                       | 0.819                       | 0.814                      |
| Residual Std. Error | 0.957 (df = 44)             | 0.874 (df = 43)             | 2,099.744 (df = 43)        |
| F Statistic         | 36.380*** (df = 5; 44)      | 37.979*** (df = 6; 43)      | 36.684*** (df = 6; 43)     |

Note: *p<0.05; **p<0.01; ***p<0.001
Code
check_model(lm2)

Code
lmtest::bptest(lm2)

    studentized Breusch-Pagan test

data:  lm2
BP = 4.9045, df = 5, p-value = 0.4276
Code
life_exp_vcov <- vcovHC(lm2, type = 'HC3')

2.3 Food Environment Index

Code
lm3 <- lm(
  foodEnvironmentIndex ~ economics + environment + health + production + social,
  data = dat
)
Dependent variable: Food Environment Index

|                     | (1) Dimensions Only       | (2) With GDP per Capita    | (3) GDP and Pop. Weights   |
|---------------------|---------------------------|----------------------------|----------------------------|
| Constant            | 3.180*** (1.905, 4.455)   | 2.498** (0.898, 4.098)     | 1.633 (0.004, 3.262)       |
| economics           | 0.058 (-2.859, 2.974)     | 0.406 (-2.526, 3.339)      | 3.630* (0.678, 6.581)      |
| environment         | -0.979 (-3.482, 1.524)    | -0.651 (-3.175, 1.873)     | -0.450 (-3.161, 2.261)     |
| health              | 12.208*** (9.208, 15.207) | 10.967*** (7.498, 14.435)  | 10.203*** (6.403, 14.003)  |
| production          | -0.016 (-1.888, 1.855)    | -0.257 (-2.143, 1.629)     | -0.512 (-1.756, 0.732)     |
| social              | -1.060 (-3.987, 1.867)    | -0.886 (-3.796, 2.024)     | -0.870 (-3.698, 1.959)     |
| gdp_per_cap         |                           | 0.012 (-0.005, 0.030)      | 0.016 (-0.003, 0.034)      |
| WLS                 | No                        | No                         | Yes                        |
| Robust              | No                        | No                         | No                         |
| Observations        | 50                        | 50                         | 50                         |
| R2                  | 0.769                     | 0.779                      | 0.810                      |
| Adjusted R2         | 0.743                     | 0.748                      | 0.784                      |
| Residual Std. Error | 0.686 (df = 44)           | 0.679 (df = 43)            | 1,492.885 (df = 43)        |
| F Statistic         | 29.359*** (df = 5; 44)    | 25.247*** (df = 6; 43)     | 30.644*** (df = 6; 43)     |

Note: *p<0.05; **p<0.01; ***p<0.001
Code
bptest(lm3)

    studentized Breusch-Pagan test

data:  lm3
BP = 13.607, df = 5, p-value = 0.01831
Code
check_model(lm3)

The Breusch-Pagan test suggests some heteroskedasticity in the Food Environment Index regression after all, and the residual plots hint at non-linear relationships. Health is a significant predictor across all three specifications (economics only reaches significance in the weighted model), with a pretty healthy \(R^2\).

Let’s try this one again with a random forest instead of a linear model:

Code
# Get a version of dat without irrelevant variables
dat_ml <- dat %>% 
  select(
    economics, 
    environment, 
    health, 
    production, 
    social, 
    foodEnvironmentIndex,
    gdp_per_cap
  )

# Split data 60/40
set.seed(42)
indices <- createDataPartition(dat_ml$foodEnvironmentIndex, p = 0.60, list = FALSE)
training_data <- dat_ml[indices, ]
testing_data <- dat_ml[-indices,]

# trainControl's index argument expects training-row indices, so use returnTrain = TRUE
my_folds <- createFolds(
  training_data$foodEnvironmentIndex, k = 5, list = TRUE, returnTrain = TRUE
)

# Control
my_control <- trainControl(
  method = 'cv',
  number = 5,
  verboseIter = TRUE,
  index = my_folds
)

# Check for zero variance or near zero variance indicators
nearZeroVar(dat, names = TRUE, saveMetrics = TRUE)
# All clear

# Also let's start a list with other results for preso
# hyperparameters, etc
ml_out <- list()

2.3.1 GLMnet

Code
set.seed(42)
food_env_glmnet <- train(
  foodEnvironmentIndex ~ economics + environment + health + production + social + gdp_per_cap,
  data = training_data, 
  tuneGrid = expand.grid(
    alpha = seq(0.1, 1, length = 5),
    lambda = seq(0.0001, 0.1, length = 100)
  ),
  method = "glmnet",
  trControl = my_control,
  preProcess = c('zv', 'center', 'scale')
)
get_str(food_env_glmnet)

# Pull out best tune
ml_out$glmnet_best_tune <- food_env_glmnet$bestTune
Code
importance <- varImp(food_env_glmnet, scale = TRUE)

# Save for preso 
saveRDS(importance, 'preso/plots/val3_food_env_glmnet_importance.rds')

# Predict on the test set and get performance metrics
pred <- predict(food_env_glmnet, testing_data)
ml_out$glmnet_performance <- postResample(
  pred = pred, 
  obs = testing_data$foodEnvironmentIndex
) %>% 
  round(3)
# ml_out$glmnet_performance

# varImp() returns a caret object, so pull out its importance data frame and
# move the feature names into a column before plotting
ml_out$glmnet_imp_plot <- importance$importance %>% 
  tibble::rownames_to_column('Feature') %>% 
  ggplot(aes(x = Overall, y = reorder(Feature, Overall))) +
  geom_col(
    color = 'royalblue',
    fill = 'lightblue'
  ) +
  theme_classic() +
  labs(x = 'Importance', y = 'Feature')

plot(importance)

2.3.2 Random Forest

Code
set.seed(42)
food_env_rf <- train(
  foodEnvironmentIndex ~ production + social + health + economics + environment + gdp_per_cap,
  data = training_data, 
  tuneLength = 7,
  method = "ranger",
  trControl = my_control,
  importance = 'impurity'
)

get_str(food_env_rf)

# Pull out best tune
ml_out$rf_best_tune <- food_env_rf$bestTune

OOB prediction error (MSE): 0.6360507
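For a rough comparison with the held-out RMSE reported by postResample() below, the OOB mean squared error can be put on the same scale, assuming caret has stored the underlying ranger fit in finalModel as it normally does:

Code
# ranger reports OOB error as MSE for regression; take the square root to
# compare it with an RMSE
sqrt(food_env_rf$finalModel$prediction.error)
# sqrt(0.636) is roughly 0.80 here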

Code
importance <- varImp(food_env_rf, scale = TRUE)

# Save for preso
saveRDS(importance, 'preso/plots/val3_rf_importance.rds')

# Get RMSE, R-squared, and MAE on the test set
pred <- predict(food_env_rf, testing_data)
ml_out$rf_performance <- postResample(
  pred = pred, 
  obs = testing_data$foodEnvironmentIndex
) %>% 
  round(3)
# ml_out$rf_performance

imp <- importance$importance %>% 
  as.data.frame() %>% 
  tibble::rownames_to_column() %>% 
  setNames(c('Feature', 'Importance')) %>% 
  mutate(Importance = round(Importance, 2)) %>% 
  arrange(desc(Importance)) %>% 
  mutate(
    Feature = Feature %>% 
      str_to_title() %>% 
      str_replace('Gdp_per_cap', 'GDP per capita')
  )
  

ml_out$rf_imp_plot <- imp %>% 
  ggplot(aes(
    x = Importance, 
    y = reorder(Feature, Importance),
    text = paste0(
      '<b>Variable:</b> ', Feature, '\n',
      '<b>Importance:</b> ', Importance
    )
  )) +
  geom_col(
    color = 'royalblue',
    fill = 'lightblue'
  ) +
  theme_classic() +
  labs(
    x = 'Importance',
    y = 'Feature'
  )

# Save all results for preso
saveRDS(ml_out, 'preso/data/ml_out.rds')

# Show plot
ml_out$rf_imp_plot
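With both models fit, the held-out results collected in ml_out can be lined up directly:

Code
# Side-by-side test-set performance (RMSE, R-squared, MAE) for the two models
rbind(
  glmnet        = ml_out$glmnet_performance,
  random_forest = ml_out$rf_performance
)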

2.4 Happiness Score

Code
lm4 <- lm(
  happinessScore ~ economics + environment + health + production + social,
  data = dat
)
Dependent variable: Happiness Index

|                     | (1) Dimensions Only           | (2) With GDP per Capita       | (3) GDP and Pop. Weights     |
|---------------------|-------------------------------|-------------------------------|------------------------------|
| Constant            | 32.347*** (23.503, 41.191)    | 25.028*** (14.265, 35.791)    | 25.015*** (14.267, 35.763)   |
| economics           | 23.038* (2.804, 43.271)       | 26.779* (7.056, 46.502)       | 24.625* (5.152, 44.098)      |
| environment         | -34.924*** (-52.287, -17.561) | -31.401*** (-48.376, -14.425) | -26.400** (-44.286, -8.513)  |
| health              | 57.152*** (36.346, 77.959)    | 43.831*** (20.505, 67.156)    | 40.807** (15.736, 65.878)    |
| production          | 13.655* (0.671, 26.638)       | 11.074 (-1.610, 23.759)       | 6.970 (-1.238, 15.178)       |
| social              | -2.149 (-22.454, 18.156)      | -0.286 (-19.858, 19.287)      | 3.405 (-15.257, 22.067)      |
| gdp_per_cap         |                               | 0.134* (0.013, 0.254)         | 0.130* (0.010, 0.250)        |
| WLS                 | No                            | No                            | Yes                          |
| Robust              | No                            | No                            | No                           |
| Observations        | 50                            | 50                            | 50                           |
| R2                  | 0.655                         | 0.689                         | 0.717                        |
| Adjusted R2         | 0.616                         | 0.646                         | 0.677                        |
| Residual Std. Error | 4.757 (df = 44)               | 4.568 (df = 43)               | 9,849.609 (df = 43)          |
| F Statistic         | 16.709*** (df = 5; 44)        | 15.882*** (df = 6; 43)        | 18.125*** (df = 6; 43)       |

Note: *p<0.05; **p<0.01; ***p<0.001
Code
check_model(lm4)

3 PCA

Let’s use PCA to see how our indicators are associated with a set of orthogonal components. Ideally, we might like to see that each indicator is associated strongly with a single component (simple structure) that corresponds to the dimension.
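One rough way to put a number on simple structure, once a solution is in hand, is to count how many indicators have a single dominant loading. A small helper sketch below does this; the 0.4 and 0.32 cutoffs are just common rules of thumb, and `fit` stands for any psych::pca result like the ones created later in this section.

Code
# Proportion of indicators with a clear primary loading: largest absolute
# loading at least `primary` and second-largest below `cross`
simple_structure_rate <- function(fit, primary = 0.4, cross = 0.32) {
  loads <- abs(unclass(fit$loadings))
  sorted <- t(apply(loads, 1, sort, decreasing = TRUE))
  mean(sorted[, 1] >= primary & sorted[, 2] < cross)
}
# e.g. simple_structure_rate(pca_outs$Promax) once the PCAs below are run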

3.1 Component Extraction

First we determine how many components to extract. This is a bit subjective, so we will use a few methods.

Code
raw_minmax_geo <- readRDS('data/raw_minmax_geo.rds')
framework <- readRDS('data/filtered_frame.rds')

# Filter down to just indicators for PCA
pca_dat <- raw_minmax_geo %>% 
  select(starts_with('indic')) %>% 
  setNames(c(str_remove(names(.), 'indic_'))) %>% 
  as.data.frame()
# get_str(pca_dat)

# Explore how many factors to extract
VSS(pca_dat, n = 8, fm = 'pc', rotate = 'Promax')


Very Simple Structure
Call: vss(x = x, n = n, rotate = rotate, diagonal = diagonal, fm = fm, 
    n.obs = n.obs, plot = plot, title = title, use = use, cor = cor)
VSS complexity 1 achieves a maximimum of 0.53  with  2  factors
VSS complexity 2 achieves a maximimum of 0.75  with  3  factors

The Velicer MAP achieves a minimum of 0.04  with  6  factors 

BIC achieves a minimum of  Inf  with    factors
Sample Size adjusted BIC achieves a minimum of  Inf  with    factors

Statistics by number of factors 
  vss1 vss2   map dof chisq prob sqresid  fit RMSEA BIC SABIC complex eChisq
1 0.42 0.00 0.069   0    NA   NA    85.4 0.42    NA  NA    NA      NA     NA
2 0.53 0.67 0.059   0    NA   NA    48.6 0.67    NA  NA    NA      NA     NA
3 0.53 0.75 0.055   0    NA   NA    30.7 0.79    NA  NA    NA      NA     NA
4 0.53 0.75 0.046   0    NA   NA    19.6 0.87    NA  NA    NA      NA     NA
5 0.47 0.73 0.039   0    NA   NA    13.2 0.91    NA  NA    NA      NA     NA
6 0.48 0.73 0.039   0    NA   NA    10.5 0.93    NA  NA    NA      NA     NA
7 0.46 0.72 0.041   0    NA   NA     8.5 0.94    NA  NA    NA      NA     NA
8 0.47 0.68 0.040   0    NA   NA     6.5 0.96    NA  NA    NA      NA     NA
  SRMR eCRMS eBIC
1   NA    NA   NA
2   NA    NA   NA
3   NA    NA   NA
4   NA    NA   NA
5   NA    NA   NA
6   NA    NA   NA
7   NA    NA   NA
8   NA    NA   NA
Code
set.seed(42)
fa.parallel(pca_dat, fm = 'ml')

Parallel analysis suggests that the number of factors =  5  and the number of components =  5 

MAP suggests 6, VSS suggests 2 or 3, and parallel analysis suggests 5. Not half bad. I think we are justified in going with 5.

Code
# Oblique rotations: promax, oblimin, simplimax, cluster
rotations <- c(
  'Promax',
  'oblimin',
  'simplimax',
  'cluster'
)
pca_outs <- map(rotations, ~ {
  pca_dat %>% 
    # scale() %>% 
    pca(nfactors = 5, rotate = .x)
}) %>% 
  setNames(c(rotations))

# Save a scree plot for preso (eigenvalues are the same for every rotation)
png(
  filename = 'preso/plots/scree.png',
  width = 800,
  height = 600,
  units = 'px',
  res = 150
)
plot(
  pca_outs$simplimax$values,
  xlab = 'Number of Components',
  ylab = 'Eigen Values'
)
abline(h = 1)
dev.off()
png 
  2 
Code
# Now actually show it 
plot(
  pca_outs$simplimax$values,
  xlab = 'Number of Components',
  ylab = 'Eigen Values'
)
abline(h = 1)

The scree plot makes a reasonably convincing case for five components, as the slope falls off substantially after the fifth.

3.2 Run PCA

Code
pca_tables <- map(pca_outs, ~ {
  .x$loadings %>% 
    unclass() %>% 
    as.data.frame() %>% 
    select(order(colnames(.))) %>%
    mutate(
      across(everything(), ~ round(.x, 3)),
      # Blank out small loadings and flag moderate ones, judging size by
      # absolute value so that strong negative loadings are not dropped
      across(everything(), ~ case_when(
        abs(.x) < 0.20 ~ '',
        abs(.x) < 0.32 ~ '.',
        .default = as.character(.x)
      ))
    ) %>% 
    rownames_to_column() %>% 
    rename(indicator = 1) %>% 
    mutate(
      dimension = framework$dimension[match(indicator, framework$indicator)]
    ) %>% 
    select(indicator, dimension, everything())
})
get_str(pca_tables)

# Save it for preso
saveRDS(pca_tables, 'preso/data/pca_tables.rds')
Code
get_reactable(
  pca_tables$Promax
)

Note that we are using the promax rotation here as it seemed most interpretable, but we have created and saved all the other available oblique rotations as well.
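As a quick check that the choice of rotation isn't driving the interpretation, the loading matrices can be compared across rotations with a congruence coefficient, for example:

Code
# Tucker congruence between the Promax and oblimin loading matrices;
# values near 1 for matched components mean the two rotations agree
psych::factor.congruence(
  pca_outs$Promax$loadings,
  pca_outs$oblimin$loadings
)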

It looks like our economics indicators are quite scattered, while the environment indicators mostly stick to components 3 and 4. Health is well centered on component 3, although the “physical health tbd” indicator has meaningful loadings on three components. Production and social are also scattered, and participatory governance is likewise associated with three components.
