Roy Model: Sector Choice with a Latent Ability Factor

Overview

In a Roy model, individuals choose the sector that offers the highest utility, and only their sector-specific wage is observed. An unobserved ability factor affects both test scores and potential wages, and the choice is driven by the implied gain. factorana can jointly estimate:

A measurement system for ability (multiple test scores).
Potential outcomes in each sector (wages, partially observed).
A discrete choice of sector (probit).

All three components share the same latent ability factor, and the model is identified because test scores contain information about ability independent of sector choice and realised wages.

The methodology is described in Heckman, Humphries and Veramendi (2016) doi:10.1016/j.jeconom.2015.12.001 and (2018) doi:10.1086/698760.

Simulate data

library(factorana)

set.seed(108)
n <- 500

# Observed covariates and the unobserved ability factor
x1 <- rnorm(n)           # shifts wages
x2 <- rnorm(n)           # shifts wages and sector choice
f  <- rnorm(n, sd = 1)   # latent ability (unobserved in practice)

# Test scores measure ability with error (shared scale across tests)
T1 <- 2.0 + 1.0 * f + rnorm(n, 0, 0.5)
T2 <- 1.5 + 1.2 * f + rnorm(n, 0, 0.6)
T3 <- 1.0 + 0.8 * f + rnorm(n, 0, 0.4)

# Potential wages in each sector
wage0 <- 2.0 + 0.5 * x1 + 0.3 * x2 + 0.5 * f + rnorm(n, 0, 0.6)
wage1 <- 2.5 + 0.6 * x1 +             1.0 * f + rnorm(n, 0, 0.7)

# Sector choice: higher ability -> more likely to pick sector 1
z_sector <- 0.0 + 0.4 * x2 + 0.8 * f
sector <- as.numeric(runif(n) < pnorm(z_sector))

# Only the wage in the chosen sector is observed
wage <- ifelse(sector == 1, wage1, wage0)

dat <- data.frame(
  intercept = 1,
  x1 = x1, x2 = x2,
  T1 = T1, T2 = T2, T3 = T3,
  wage = wage,
  sector = sector,
  eval_tests = 1L,            # always observe all three tests
  eval_wage0 = 1L - sector,   # wage0 observed iff sector = 0
  eval_wage1 = sector,        # wage1 observed iff sector = 1
  eval_sector = 1L            # sector choice always observed
)

Model specification

Only one latent factor (ability). Every component loads on that factor, so loading_normalization is a scalar here.

fm <- define_factor_model(n_factors = 1, n_types = 1)

Measurement equations (tests)

Fix T1’s loading at 1.0 to pin the factor scale; leave T2 and T3 free.

mc_T1 <- define_model_component(
  name = "T1", data = dat, outcome = "T1", factor = fm,
  covariates = "intercept", model_type = "linear",
  loading_normalization = 1.0,
  evaluation_indicator = "eval_tests"
)
mc_T2 <- define_model_component(
  name = "T2", data = dat, outcome = "T2", factor = fm,
  covariates = "intercept", model_type = "linear",
  loading_normalization = NA_real_,
  evaluation_indicator = "eval_tests"
)
mc_T3 <- define_model_component(
  name = "T3", data = dat, outcome = "T3", factor = fm,
  covariates = "intercept", model_type = "linear",
  loading_normalization = NA_real_,
  evaluation_indicator = "eval_tests"
)

Potential outcomes (wages)

Each wage equation is a linear model evaluated only on the subsample that chose that sector. evaluation_indicator tells factorana which rows contribute to which component’s likelihood.

mc_wage0 <- define_model_component(
  name = "wage0", data = dat, outcome = "wage", factor = fm,
  covariates = c("intercept", "x1", "x2"), model_type = "linear",
  loading_normalization = NA_real_,
  evaluation_indicator = "eval_wage0"
)
mc_wage1 <- define_model_component(
  name = "wage1", data = dat, outcome = "wage", factor = fm,
  covariates = c("intercept", "x1"), model_type = "linear",
  loading_normalization = NA_real_,
  evaluation_indicator = "eval_wage1"
)

Sector-choice equation

A probit with a free factor loading (positive ability raises the probability of choosing sector 1 when the simulated value is positive).

mc_sector <- define_model_component(
  name = "sector", data = dat, outcome = "sector", factor = fm,
  covariates = c("intercept", "x2"), model_type = "probit",
  loading_normalization = NA_real_,
  evaluation_indicator = "eval_sector"
)

Assemble the system

ms <- define_model_system(
  components = list(mc_T1, mc_T2, mc_T3, mc_wage0, mc_wage1, mc_sector),
  factor     = fm
)

Estimation

ctrl <- define_estimation_control(n_quad_points = 8, num_cores = 1)

fit <- estimate_model_rcpp(
  model_system = ms,
  data         = dat,
  control      = ctrl,
  optimizer    = "nlminb",
  parallel     = FALSE,
  verbose      = FALSE
)

fit$convergence   # 0 == strict convergence
#> [1] 0
fit$loglik
#> [1] -2524.132

Results

components_table(fit, digits = 3)
#> 
#> Factor Model Results by Component
#> ======================================================================================================================== 
#> 
#> Parameter             factor           T1           T2           T3        wage0        wage1       sector
#> ---------------------------------------------------------------------------------------------------------- 
#> Intercept                        1.917***     1.370***     0.918***     1.976***     2.390***       -0.054
#>                                   (0.042)      (0.050)      (0.034)      (0.049)      (0.069)      (0.070)
#> beta_sector_x2                                                                                    0.415***
#>                                                                                                    (0.066)
#> beta_wage0_x1                                                           0.494***                          
#>                                                                          (0.040)                          
#> beta_wage0_x2                                                           0.323***                          
#>                                                                          (0.040)                          
#> beta_wage1_x1                                                                        0.619***             
#>                                                                                       (0.057)             
#> Sigma                            0.550***     0.601***     0.439***     0.612***     0.769***             
#>                                   (0.022)      (0.027)      (0.018)      (0.028)      (0.041)             
#> factor_var_1        0.688***                                                                              
#>                      (0.048)                                                                              
#> T2_loading_1                                  1.224***                                                    
#>                                                (0.047)                                                    
#> T3_loading_1                                               0.817***                                       
#>                                                             (0.033)                                       
#> wage0_loading_1                                                         0.495***                          
#>                                                                          (0.054)                          
#> wage1_loading_1                                                                      0.949***             
#>                                                                                       (0.072)             
#> sector_loading_1                                                                                   0.865***
#>                                                                                                    (0.095)
#> 
#> ---------------------------------------------------------------------------------------------------------- 
#> N                          0          500          500          500          258          242          500
#> Log-likelihood: -2524.13
#> 
#> Standard errors in parentheses
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1

Estimated loadings on the ability factor track the simulated values: the test equations recover (1.0, 1.2, 0.8); the wage equations recover roughly 0.5 in sector 0 and 1.0 in sector 1, capturing the positive selection on ability into the high-return sector.

Robust standard errors

This is a single-stage full-information fit, so sandwich (Huber-White) robust standard errors are exact (no first-stage uncertainty to account for). robust_se() returns them for the whole parameter vector:

se_robust <- robust_se(fit, dat, type = "robust")

free <- fit$free_idx
data.frame(
  parameter = names(fit$estimates)[free],
  se_model  = round(fit$std_errors[free], 3),
  se_robust = round(se_robust[free], 3),
  row.names = NULL
)
#>           parameter se_model se_robust
#> 1      factor_var_1    0.048     0.054
#> 2      T1_intercept    0.042     0.066
#> 3          T1_sigma    0.022     0.022
#> 4      T2_intercept    0.050     0.078
#> 5      T2_loading_1    0.047     0.047
#> 6          T2_sigma    0.027     0.030
#> 7      T3_intercept    0.034     0.053
#> 8      T3_loading_1    0.033     0.032
#> 9          T3_sigma    0.018     0.017
#> 10  wage0_intercept    0.049     0.056
#> 11         wage0_x1    0.040     0.035
#> 12         wage0_x2    0.040     0.040
#> 13  wage0_loading_1    0.054     0.050
#> 14      wage0_sigma    0.028     0.031
#> 15  wage1_intercept    0.069     0.084
#> 16         wage1_x1    0.057     0.053
#> 17  wage1_loading_1    0.072     0.069
#> 18      wage1_sigma    0.041     0.046
#> 19 sector_intercept    0.070     0.082
#> 20        sector_x2    0.066     0.071
#> 21 sector_loading_1    0.095     0.093

If individuals are grouped (for example by region or cohort), use vcov_factorana(fit, dat, type = "cluster", cluster = "region") for cluster-robust inference. See ?vcov_factorana for details.

Recover factor scores

Given the fitted model, we can back out each individual’s posterior mean ability.

fscores <- estimate_factorscores_rcpp(
  fit, dat, control = ctrl, parallel = FALSE, verbose = FALSE
)
head(fscores[, c("obs_id", "factor_1", "se_factor_1", "converged")])
#> Factor Score Estimates
#> ======================
#> 
#> Observations: 6
#> Factors: 1
#> Converged: 6 (100.0%)
#> 
#> Summary Statistics:
#>   Factor 1: mean=-0.137, sd=0.751, mean_se=0.287
#> 
#> First 6 rows:
#>   obs_id    factor_1 se_factor_1 converged
#> 1      1 -0.37442741   0.2880698      TRUE
#> 2      2  0.91719444   0.2804158      TRUE
#> 3      3 -0.07723296   0.2886384      TRUE
#> 4      4  0.50339290   0.2887276      TRUE
#> 5      5 -0.69199514   0.2891442      TRUE
#> 6      6 -1.09977613   0.2899790      TRUE

# Correlation of estimated scores with the true (simulated) ability
cor(fscores$factor_1, f)
#> [1] 0.9583385

Two-stage estimation (optional)

For larger applications it is often convenient to estimate the measurement system first, then freeze those parameters and estimate the structural part. See ?estimate_model_rcpp (argument previous_stage in define_model_system()) and ?set_adaptive_quadrature_cpp for the adaptive integration that makes the second stage cheap.