Introduction
The summclust
package allows to compute leverage statistics for clustered errors and fast CRV3(J)
variance-covariance matrices as described in MacKinnon, J.G., Nielsen, M.Ø., Webb, M.D., 2022. Leverage, influence, and the jackknife in clustered regression models: Reliable inference using summclust.
It is a post-estimation command and currently supports methods for objects of type lm
(from stats
) and fixest
(from the fixest
package).
The summclust
function
library(summclust)
library(lmtest)
library(haven)
nlswork <- read_dta("http://www.stata-press.com/data/r9/nlswork.dta")
# drop NAs at the moment
nlswork <- nlswork[, c("ln_wage", "grade", "age", "birth_yr", "union", "race", "msp", "ind_code")]
nlswork <- na.omit(nlswork)
lm_fit <- lm(
ln_wage ~ as.factor(grade) + as.factor(age) + as.factor(birth_yr) + union + race + msp,
data = nlswork)
summclust_res <- summclust(
obj = lm_fit,
cluster = ~ind_code,
params = c("msp", "union")
)
# CRV3-based inference - exactly matches output of summclust-stata
tidy(summclust_res)
#> coef tstat se p_val conf_int_l conf_int_u
#> union 0.2039597 2.440122 0.08358587 0.03281561 0.01998847 0.387930980
#> msp -0.0275151 -1.956404 0.01406412 0.07628064 -0.05847002 0.003439815
summary(summclust_res)
#> summclust.lm(obj = lm_fit, cluster = ~ind_code, params = c("msp",
#> "union"))
#>
#> Number of observations: 19130
#> Number of clusters: 7
#>
#> coef tstat se p_val conf_int_l conf_int_u
#> union 0.2039597 2.440122 0.08358587 0.03281561 0.01998847 0.387930980
#> msp -0.0275151 -1.956404 0.01406412 0.07628064 -0.05847002 0.003439815
#>
#> N_G leverage partial-leverage-msp partial-leverage-union
#> Min. 38.000000 0.09332052 0.0006662968 0.001622359
#> 1st Qu. 172.000000 0.70440923 0.0048899422 0.009133996
#> Median 995.500000 3.51549151 0.0379535242 0.056682344
#> Mean 1594.166667 5.41666667 0.0833333333 0.083333333
#> 3rd Qu. 2047.250000 6.41132962 0.1004277711 0.106083114
#> Max. 6335.000000 20.28918187 0.3597669210 0.312994532
#> coefvar 1.187237 1.15296458 1.3313784379 1.141326117
#> beta-msp beta-union
#> Min. -0.03320040 0.1624754
#> 1st Qu. -0.02893131 0.1994694
#> Median -0.02776470 0.2045197
#> Mean -0.02691999 0.2053997
#> 3rd Qu. -0.02610221 0.2056569
#> Max. -0.01583453 0.2754228
#> coefvar 0.16289813 0.1279443
To visually inspect the leverage statistics, use the plot
method
plot(summclust_res)
#> $residual_leverage
#>
#> $coef_leverage
#>
#> $coef_beta
Using summclust
with coefplot
and fixest
Note that you can also use CVR3 and CRV3J covariance matrices computed via summclust
or its vcov_CR3J
method with the lmtest()
and fixest
packages.
library(lmtest)
vcov3J <-
vcov_CR3J(
lm_fit,
cluster = ~ ind_code
)
all.equal(
vcov3J,
summclust_res$vcov
)
#> [1] "Attributes: < Names: 2 string mismatches >"
#> [2] "Attributes: < Length mismatch: comparison on first 2 components >"
#> [3] "Attributes: < Component 1: Modes: character, numeric >"
#> [4] "Attributes: < Component 1: Lengths: 1, 2 >"
#> [5] "Attributes: < Component 1: target is character, current is numeric >"
#> [6] "Attributes: < Component 2: Modes: numeric, list >"
#> [7] "Attributes: < Component 2: target is numeric, current is list >"
#> [8] "target is vcov_CR3J, current is matrix"
df <- length(summclust_res$cluster) - 1
# with lmtest
CRV1 <- lmtest::coeftest(lm_fit, sandwich::vcovCL(lm_fit, ~ind_code), df = df)
CRV3 <- lmtest::coeftest(lm_fit, vcov3J, df = df)
CRV1[c("union", "race", "msp"),]
#> Estimate Std. Error t value Pr(>|t|)
#> union 0.20395972 0.061167499 3.334446 0.0066585766
#> race -0.08619813 0.016150418 -5.337207 0.0002384275
#> msp -0.02751510 0.009293046 -2.960827 0.0129561148
CRV3[c("union", "race", "msp"),]
#> Estimate Std. Error t value Pr(>|t|)
#> union 0.20395972 0.08358587 2.440122 0.032815614
#> race -0.08619813 0.01904684 -4.525586 0.000864074
#> msp -0.02751510 0.01406412 -1.956404 0.076280639
stats::confint(CRV1)[c("union", "race", "msp"),]
#> 2.5 % 97.5 %
#> union 0.06933097 0.338588481
#> race -0.12174496 -0.050651302
#> msp -0.04796896 -0.007061245
stats::confint(CRV3)[c("union", "race", "msp"),]
#> 2.5 % 97.5 %
#> union 0.01998847 0.387930980
#> race -0.12811995 -0.044276312
#> msp -0.05847002 0.003439815
library(fixest)
feols_fit <- feols(
ln_wage ~ i(grade) + i(age) + i(birth_yr) + union + race + msp,
data = nlswork
)
# Store vcov into the fixest object
feols_fit <- summary(
feols_fit,
vcov = vcov_CR3J(feols_fit, cluster = ~ ind_code)
)
# Now it just works with fixest functions
fixest::coeftable(feols_fit, keep = c("msp", "union", "race"))
#> Estimate Std. Error t value Pr(>|t|)
#> union 0.20395972 0.08358587 2.440122 1.469134e-02
#> race -0.08619813 0.01904684 -4.525586 6.059226e-06
#> msp -0.02751510 0.01406412 -1.956404 5.043213e-02
The p-value and confidence intervals for fixest::coeftable()
differ from lmtest::coeftest()
and summclust::coeftable()
. This is due to the fact that fixest::coeftable()
uses a different degree of freedom for the t-distribution used in these calculation (I believe it uses t(N-1)).