# note that state = 1 for NJ and 0 for PA
# also note that post = 1 for 1993 and 0 for 1992
# NJ is the treated group, so state = 1 means treat = 1
Card and Krueger (1994) - Minimum wage and employment
Code
# looks like fulltime is a character! let's try to make it numeric
ckdata$fulltime_num <- as.numeric(ckdata$fulltime)
Warning: NAs introduced by coercion
Code
ckdata$parttime_num <-as.numeric(ckdata$parttime)
Warning: NAs introduced by coercion
Card and Krueger (1994) - Minimum wage and employment
Code
# said there are NAs in the data, so let's see where they are
ckdata %>%
  filter(is.na(fulltime_num)) %>%
  dplyr::select(fulltime, fulltime_num)
# A tibble: 18 × 2
fulltime fulltime_num
<chr> <dbl>
1 . NA
2 . NA
3 . NA
4 . NA
5 . NA
6 . NA
7 . NA
8 . NA
9 . NA
10 . NA
11 . NA
12 . NA
13 . NA
14 . NA
15 . NA
16 . NA
17 . NA
18 . NA
Code
# ah, so they are "."! Those are missing values in Stata, so leave them as missing.
Card and Krueger (1994) - Minimum wage and employment
Code
reg1 <- feols(fulltime_num ~ state + post + state:post, data = ckdata, vcov = "HC1")
summary(reg1)
Card and Krueger (1994) - Minimum wage and employment
Code
reg1 <- feols(fulltime_num ~ state + post + state:post, data = ckdata, vcov = "HC1")
reg2 <- feols(parttime_num ~ state + post + state:post, data = ckdata, vcov = "HC1")

table <- etable(reg1, reg2,
                # standard errors, digits, fit statistics, put SE below coefficients (the norm)
                vcov = "HC1", digits = 3, fitstat = "", se.below = TRUE,
                # change significance codes to the norm
                signif.code = c("***" = 0.01, "**" = 0.05, "*" = 0.1),
                # rename the variables
                dict = c("Constant" = "Intercept", "state" = "Treat",
                         "post" = "Post", "state:post" = "Treat x Post"))
table
# drop some rows
table <- table[-c(1:2, 11:nrow(table)), ]
# rename columns
colnames(table) <- c("", "Full-time", "Part-time")
Card and Krueger (1994) - Minimum wage and employment
Code
kable(table, align = "lcc", booktabs = TRUE, linesep = "", escape = FALSE, row.names = FALSE) %>%
  column_spec(1, width = "4cm") %>%
  column_spec(c(2:3), width = "3cm") %>%
  kable_styling() %>%
  footnote("* p < 0.1, ** p < 0.05, *** p < 0.01.",
           general_title = "", footnote_as_chunk = TRUE) %>%
  footnote("Note: Robust standard errors in parentheses.",
           general_title = "", footnote_as_chunk = TRUE)
|              | Full-time | Part-time |
|--------------|-----------|-----------|
| Constant     | 9.99***   | 19.7***   |
|              | (1.22)    | (1.09)    |
| Treat        | -2.27*    | -1.04     |
|              | (1.30)    | (1.23)    |
| Post         | -2.27     | 0.103     |
|              | (1.57)    | (1.60)    |
| Treat x Post | 2.99*     | -0.405    |
|              | (1.68)    | (1.80)    |
* p < 0.1, ** p < 0.05, *** p < 0.01.
Note: Robust standard errors in parentheses.
Card and Krueger (1994) - Poisson regression!
Code
reg1 <- feglm(fulltime_num ~ state + post + state:post, data = ckdata, vcov = "HC1", family = "poisson")
reg2 <- feglm(parttime_num ~ state + post + state:post, data = ckdata, vcov = "HC1", family = "poisson")

table <- etable(reg1, reg2,
                # standard errors, digits, fit statistics, put SE below coefficients (the norm)
                vcov = "HC1", digits = 3, fitstat = "", se.below = TRUE,
                # change significance codes to the norm
                signif.code = c("***" = 0.01, "**" = 0.05, "*" = 0.1),
                # rename the variables
                dict = c("Constant" = "Intercept", "state" = "Treat",
                         "post" = "Post", "state:post" = "Treat x Post"))
table
# key variables: state, year, cdl ("treatment"), and homicide_c (outcome)
# homicide_c to rate (per 100,000 people)
df$homicide_c <- (df$homicide_c/df$population)*100000
# and log
df$homicide_c <- log(df$homicide_c)
Implementing WCB in R
Code
# Note: this is not differences-in-differences.
# Just an example of the wild cluster bootstrap with fwildclusterboot
reg1 <- feols(homicide_c ~ cdl, data = df, cluster = "state")
summary(reg1)
reg1 <- feols(homicide_c ~ cdl, data = df, cluster = "state")

boot_reg <- boottest(reg1,
                     clustid = c("state"),
                     param = "cdl",
                     B = 10000,
                     type = "rademacher" # default weighting, by the way
                     )
boot_reg
boottest.fixest(object = reg1, param = "cdl", B = 10000, clustid = c("state"),
type = "rademacher")
p value: 5e-04
confidence interval: 0.1541 0.5249
test statistic 3.6528
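The second set of results below comes from the same regression, but bootstrapping with two-way clustering by state and year and Webb weights; a sketch of the call (reconstructed from the printed output below) looks like this:
Code
boot_reg2 <- boottest(reg1,
                      clustid = c("state", "year"), # two-way clustering
                      param = "cdl",
                      B = 10000,
                      type = "webb" # Webb weights instead of the default Rademacher
                      )
boot_reg2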
boottest.fixest(object = reg1, param = "cdl", B = 10000, clustid = c("state",
"year"), type = "webb")
p value: 0.1748
confidence interval: -0.3707 1.4389
test statistic 1.9382
Some thoughts on clustering
If you have more than ~30 clusters, you can probably just cluster at the group level
We see that the standard errors are very similar in the state examples above
Otherwise, consider using an alternative approach
Also important when clusters have wildly different sample sizes, or when treated clusters are relatively few (Moulton, 1990)
Alternative approach with only one treated cluster: randomization inference
Randomization inference in Buchmueller, DiNardo, and Valletta (2011)
They are interested in the change in insurance coverage in Hawaii relative to other states: \[\begin{gather} Y_{ist}=X_{ist}\beta^t+Z_{st}\gamma^t+H_{it}\delta^t+\phi_{st}+\eta_{it} \end{gather}\]
They calculate the change as: \(\Delta=\delta^1-\delta^0\)
The idea: see where the Hawaii effect sits in the distribution of the same effect across all US states
“Placebo” effects
Note that this is not true “randomization” inference
I’ll show you an example with one of my papers in a minute
Randomization inference in Buchmueller, DiNardo, and Valletta (2011)
Randomization inference in Merfeld (2023)
I’m interested in the effects of pollution on agricultural productivity in India
I have villages, which are nested within districts
I cluster on villages
Alternative: randomly assign pollution to villages within the same district and compare my effects to the distribution of effects
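A minimal sketch of this placebo idea in R, using simulated villages and districts (toy data and effect sizes, not the paper's actual data or code): reshuffle pollution across villages within each district, re-estimate the effect each time, and see where the real estimate sits in the placebo distribution.
Code
library(dplyr)
library(fixest)

set.seed(1)

# toy data: villages nested within districts (purely illustrative)
ri_df <- data.frame(district = rep(1:50, each = 20),
                    village = 1:1000,
                    pollution = rnorm(1000))
ri_df$productivity <- -0.1*ri_df$pollution + rnorm(1000)

# the "real" estimate
real_est <- coef(feols(productivity ~ pollution, data = ri_df))["pollution"]

# placebo distribution: reshuffle pollution across villages within each district and re-estimate
placebo_est <- replicate(500, {
  placebo_df <- ri_df %>%
    group_by(district) %>%
    mutate(pollution = sample(pollution)) %>%
    ungroup()
  coef(feols(productivity ~ pollution, data = placebo_df))["pollution"]
})

# where does the real estimate sit in the placebo distribution?
mean(abs(placebo_est) >= abs(real_est))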
Randomization inference in Merfeld (2023)
Placebo tests for the parallel trends assumption
Above, we looked at the parallel trends assumption graphically in Richardson and Troost (2009)
Another common way is to look at leads of treatment
In my paper, for example, pollution next year should not affect agricultural productivity this year
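A sketch of a lead test using the castle-doctrine data from earlier in these slides (an illustrative specification, not the one from my paper): create a one-year lead of the treatment dummy within each state and check that its coefficient is close to zero.
Code
library(dplyr)
library(fixest)

# one-year lead of treatment within each state
df <- df %>%
  arrange(state, year) %>%
  group_by(state) %>%
  mutate(cdl_lead1 = dplyr::lead(cdl)) %>%
  ungroup()

# if there are no anticipation effects or differential pre-trends,
# the coefficient on the lead should be close to zero
feols(homicide_c ~ cdl + cdl_lead1 + log(population) + unemployrt | state + year,
      data = df, cluster = "state")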
Leads of pollution in Merfeld (2023)
|                                     | (1)       | (2)       |
|-------------------------------------|-----------|-----------|
| particulate matter (one-year lead)  | -0.033    |           |
|                                     | (0.067)   |           |
| particulate matter (two-year lead)  |           | -0.070    |
|                                     |           | (0.052)   |
| weather (expanded)                  | No        | No        |
| fixed effects:                      |           |           |
| village                             | Yes       | Yes       |
| year                                | Yes       | Yes       |
| F                                   | 592       | 783       |
| observations                        | 1,161,265 | 1,055,562 |
* p < 0.1, ** p < 0.05, *** p < 0.01.
Note: Standard errors are in parentheses and are clustered at the village level.
Convincing the reader is like writing a good story
When you’re writing a diff-in-diff paper, think about the possible threats to your identification strategy
Then, think about how you can convince the reader that your strategy is valid
Use placebos: is there somewhere we shouldn’t expect an effect?
In the case of my paper, the leads convinced some seminar participants!
You can likewise think of heterogeneity we would expect to see, and test for that!
Fixed and random effects
Before moving on to some of the new literature…
Let’s talk about fixed and random effects
Fixed effects will be important for the upcoming discussions
Some nice (but old) slides from Oscar Torres-Reyna here.
Panel data
Both fixed and random effects are used in panel data
Panel data: data with multiple observations for each unit
Examples: individuals, firms, countries, etc.
In our previous example of homicide and castle doctrine laws: unit is the state!
Panel data
Fixed effects
Fixed effects are a way to control for omitted variables
However there is a key assumption: the omitted variable is time-invariant
Fixed effects are also called “within” effects
Why? Because we are looking at the variation within each unit
The regression is of the form: \[\begin{gather} y_{it} = \alpha_i + \beta x_{it} + \varepsilon_{it}, \end{gather}\]
where \(\alpha_i\) is the fixed effect for unit \(i\). Note the subscript! No \(t\)!
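A minimal illustration with fixest and the castle-doctrine data (a sketch, not any paper's specification): absorbing the state fixed effect with | state gives the same slope as OLS on within-state demeaned variables, which is why this is called the "within" estimator.
Code
library(fixest)
library(dplyr)

# "within" (fixed effects) estimator: absorb the state fixed effect
fe_reg <- feols(homicide_c ~ cdl | state, data = df, cluster = "state")

# equivalent by hand: demean outcome and treatment within each state
df_demeaned <- df %>%
  group_by(state) %>%
  mutate(homicide_dm = homicide_c - mean(homicide_c, na.rm = TRUE),
         cdl_dm = cdl - mean(cdl, na.rm = TRUE)) %>%
  ungroup()

ols_dm <- feols(homicide_dm ~ cdl_dm, data = df_demeaned, cluster = "state")

# the slopes match (standard errors differ slightly due to degrees of freedom)
coef(fe_reg)["cdl"]
coef(ols_dm)["cdl_dm"]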
Time-invariant assumption
The key assumption is that the omitted variable is time-invariant
For example, in the case of the homicide data, we might think that there is some time-invariant variable that affects homicide rates
However, this is a strong assumption
Moreover, do you think it is more likely to hold in short or long panels?
Just a quick note: wild cluster bootstrap still works!
Code
# need to use a numeric variable for the bootstrap. sid is in our data, thankfully.
reg1 <- feols(homicide_c ~ cdl + log(population) + log(income) + unemployrt | sid,
              data = df, cluster = c("sid"))
reg1
boot_reg <- boottest(reg1,
                     clustid = c("sid"), # note that it requires a numeric variable!
                     param = "cdl",
                     B = 10000
                     )
boot_reg
boottest.fixest(object = reg1, param = "cdl", B = 10000, clustid = c("sid"))
p value: 0.2757
confidence interval: -0.0338 0.1599
test statistic 1.1917
Fixed effects are the norm with “differences-in-differences”
It’s not quite the same as the canonical differences-in-differences model
We redefine treatment for the same unit
Before treatment, the value is zero, and after it is one
Note that this is different from the canonical model, where the value is zero for the comparison group and one for the treated group at all points in time
In fact, the regression we just saw is like a differences-in-differences model of this form!
In practice, we often tend to add time fixed effects, too:
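A minimal sketch of such a two-way fixed effects regression with fixest, using the castle-doctrine data from earlier (illustrative controls, not a definitive specification):
Code
library(fixest)

# state (unit) and year (time) fixed effects
twfe <- feols(homicide_c ~ cdl + log(population) + log(income) + unemployrt | state + year,
              data = df, cluster = "state")
summary(twfe)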
Random effects
Before turning to recent issues discovered with the two-way fixed effects estimator, let’s talk about random effects
Random effects are a way to capture the heterogeneity across units
The key is that this heterogeneity is random and uncorrelated with the predictors in the model
This is really a way to capture the variance across units
In practice, this absorbs some of the variance, increasing precision (but at the cost of the assumption above)
Random effects in R
Code
library(lme4)

reg1 <- feols(homicide_c ~ cdl + log(population) + log(income) + unemployrt, data = df)

# note iid standard errors for simplicity for random effects and compare to reg1
# (can use other packages to change the vcov calculation if we think assumptions aren't exactly true...)
reg2 <- lmer(homicide_c ~ cdl + log(population) + log(income) + unemployrt + (1 | state) + (1 | year),
             data = df)

reg3 <- feols(homicide_c ~ cdl + log(population) + log(income) + unemployrt | state + year,
              data = df, cluster = "state")
Random effects in R
|                   | None    | RE      | FE      |
|-------------------|---------|---------|---------|
| Castle laws       | 0.157   | 0.050   | 0.076   |
|                   | (0.065) | (0.030) | (0.056) |
| Population (log)  | 0.305   | 0.273   | -0.974  |
|                   | (0.021) | (0.066) | (0.484) |
| Income (log)      | -1.062  | 0.018   | 0.283   |
|                   | (0.140) | (0.191) | (0.308) |
| Unemployment rate | -0.007  | -0.027  | -0.009  |
|                   | (0.011) | (0.006) | (0.012) |
Random effects
Some things about random effects relative to vanilla OLS (not fixed effects):
The random effects estimator is asymptotically more efficient than OLS if there is unit-level heterogeneity: \(\mathbb{E}(V_{RE})<\mathbb{E}(V_{OLS})\)
In practice with finite samples, not necessarily
In expectation, coefficients are the same: \(\mathbb{E}(\beta_{RE}) = \mathbb{E}(\beta_{OLS})\)
Random effects is estimated using (feasible) generalized least squares, which essentially reweights the observations
In practice with finite samples, this means the coefficients will not be exactly the same
Fixed or random effects?
In expectation, coefficients are the same: \[\mathbb{E}(\beta_{RE}) = \mathbb{E}(\beta_{OLS})\]
So, how might we test for whether we should use random or fixed effects?
We can test whether coefficients change “significantly” when using fixed vs. random effects!
This is called a Hausman test
Note: In practice, economists rarely do this. They tend to assume fixed effects unless they have a good reason to assume otherwise. Not necessarily true in other disciplines.
If RE is consistent, FE is, too; but RE is more efficient, asymptotically
\(H_0\): Random effects is consistent (i.e. “okay” to use)
The test statistic is asymptotically \(\chi^2\) distributed with \(k\) degrees of freedom, where \(k\) is the number of coefficients in the model
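A sketch of the Hausman test in R using the plm package (a different package from the fixest code used elsewhere in these slides; the castle-doctrine variables are used purely for illustration):
Code
library(plm)

# same specification estimated by fixed ("within") and random effects
fe_mod <- plm(homicide_c ~ cdl + log(population) + log(income) + unemployrt,
              data = df, index = c("state", "year"), model = "within")
re_mod <- plm(homicide_c ~ cdl + log(population) + log(income) + unemployrt,
              data = df, index = c("state", "year"), model = "random")

# Hausman test: H0 is that random effects is consistent
phtest(fe_mod, re_mod)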
Triple differences
We have discussed differences-in-differences
We can also have triple differences
For example, if we have three groups and two time periods
The Card and Krueger paper simply compared employment rates in fast food restaurants in New Jersey and Pennsylvania before and after the minimum wage increase in New Jersey
This is a double difference
Can you think of a group that theoretically would not be affected by the minimum wage change?
How about high-wage workers?
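A sketch of what the triple-difference regression looks like with fixest, using a small simulated data set (hypothetical treat, post, and lowwage indicators; not the Card and Krueger data):
Code
library(fixest)

set.seed(1)

# toy data: treated vs. control state, pre vs. post, low-wage (affected) vs. high-wage (unaffected)
sim <- expand.grid(id = 1:500, treat = 0:1, post = 0:1, lowwage = 0:1)
sim$y <- 10 + 1*sim$treat + 0.5*sim$post - 2*sim$lowwage +
  1.5*sim$treat*sim$post*sim$lowwage +  # effect only for low-wage workers in the treated state, post
  rnorm(nrow(sim))

# the triple interaction compares the DiD for the affected group (low wage)
# to the DiD for the unaffected group (high wage)
feols(y ~ treat*post*lowwage, data = sim, vcov = "HC1")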
Triple differences
Differences-in-differences can be extended into higher “differences”
For example, Muralidharan and Prakash study a bicycle program for girls in Bihar, India
Reducing the gender gap by providing girls in secondary school with a bicycle
Higher differences
Using higher differences is quite common
Common problem: large standard errors
Triple difference in this paper:
Gender by age by location
The program
Program in Bihar
Third largest (by population) state in India
Quite poor
Secondary schools can be located far from households
Along with a large gender gap in schooling, this motivates the program itself
“Chief Minister’s Bicycle Program”
“Conditional in-kind transfer”
Data
Data was not collected specifically for the program
Indian District Level Health Survey (DLHS)
DLHS is nationally representative
720,000 households across 601 districts
Identification
Compare girls eligible for the program based on age
The survey was conducted 18 months after the start of the program
Compare girls who were just barely eligible for the program based on age
Could just compare boys to girls based on age
Problem: possible non-parallel trends
Pre-trends a problem?
Solution? Triple difference bringing in a second state, Jharkhand
Triple difference bringing in a second state, Jharkhand
Main results: enrollment/completion of grade 9
Expected heterogeneity
One way to convince people that your causal results are true is to make arguments about heterogeneity
For example, with this paper distance should be important
For those who live very close to a school, bicycles shouldn’t matter
For those who live very far, same
So what is the heterogeneity we EXPECT to see?
Treatment effect should be largest at “middle” distances
Expected heterogeneity by distance
Effects on grade 10 board exams
Bias in TWFE
Bias in TWFE
Recently, a number of papers have shown that the two-way fixed effects estimator can be… problematic
We have been discussing differences-in-differences with TWFE of the following form: \[\begin{gather} y_{it} = \alpha_i + \delta_{t} + \beta D_{it} + \gamma x_{it} + \varepsilon_{it}, \end{gather}\] where \(D_{it}\) is a dummy variable for treatment.
If there are only two time periods and one group receives treatment in only one period, this is not a problem!
The Card and Krueger setup is not an issue with TWFE
TWFE and differential treatment timing
The real issue is when treatment is staggered across time
For example, if treatment is introduced in different years for different states
It turns out this is the case with the castle doctrine law!
| year | treated |
|------|---------|
| 2000 | 0.00    |
| 2001 | 0.00    |
| 2002 | 0.00    |
| 2003 | 0.00    |
| 2004 | 0.00    |
| 2005 | 0.00    |
| 2006 | 0.02    |
| 2007 | 0.28    |
| 2008 | 0.36    |
| 2009 | 0.40    |
| 2010 | 0.42    |
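A sketch of how shares like these can be computed from the castle-doctrine data, assuming cdl is the 0/1 treatment dummy:
Code
library(dplyr)

# share of states with a castle-doctrine law in effect, by year
df %>%
  group_by(year) %>%
  summarize(treated = mean(cdl, na.rm = TRUE))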
Goodman-Bacon (2021), Journal of Econometrics
Goodman-Bacon lays out the problem (as does Scott in CI)
Suppose we have three groups and three time periods
Group 1 is treated before period 2 (Goodman-Bacon calls this group k)
Group 2 is treated before period 3 (Goodman-Bacon calls this group l)
Group 3 is never treated (Goodman-Bacon calls this group U)
If we are willing to assume treatment effect homogeneity, then we have no problems!
Are you willing to assume this?
Goodman-Bacon (2021)
Goodman-Bacon shows that the overall TWFE estimate is a weighted average of treatment effects from every possible 2x2 comparison, where the comparison (“control”) group is one whose treatment status does not change over the relevant window:
Group 1 vs group 2
Group 1 vs group 3
Group 2 vs group 1
Group 2 vs group 3
These weights are a function of two things:
Group sizes
Variance in treatment
Goodman-Bacon (2021)
He shows there are three relevant 2x2 comparisons (pairs in which one group’s treatment status changes while the other’s does not):
A treated group (\(k\) or \(l\)) vs. the untreated group \(U\): \(\hat{\beta}_{kU}^{2x2}\)
The earlier-treated group \(k\) vs. the later-treated group \(l\), using only periods before \(l\) is treated: \(\hat{\beta}_{kl}^{2x2,k}\)
The later-treated group \(l\) vs. the earlier-treated group \(k\), using only periods after \(k\) is treated: \(\hat{\beta}_{kl}^{2x2,l}\)
Here, \(k\) and \(l\) are treated groups, and \(U\) is the untreated group.
Note the first term includes two separate comparisons (both treated groups vs. untreated group)
Goodman-Bacon (2021)
The DD estimator is a weighted average of all these comparisons.
Generalizing to \(K\) time periods: \[\begin{gather} \hat{\beta}^{DD}=\sum_{k\neq U}s_{kU}\hat{\beta}_{kU}^{2x2}+\sum_{k\neq U}\sum_{l>k}\left[s_{kl}^k\hat{\beta}_{kl}^{2x2,k}+s_{kl}^l\hat{\beta}_{kl}^{2x2,l} \right], \end{gather}\] where \(s_{ij}\) is the weight for the comparison between groups \(i\) and \(j\).
Goodman-Bacon (2021)
The weights: \[\begin{align} s_{kU}&=\frac{(n_k+n_U)^2n_{kU}(1-n_{kU})\bar{D}_k(1-\bar{D}_k)}{\hat{V}^D} \\
s_{kl}^k&=\frac{\left((n_k+n_l)(1-\bar{D}_l)\right)^2n_{kl}(1-n_{kl})\frac{\bar{D}_k-\bar{D}_l}{1-\bar{D}_l}\frac{1-\bar{D}_k}{1-\bar{D}_l}}{\hat{V}^D} \\
s_{kl}^l&=\frac{\left((n_k+n_l)(\bar{D}_k)\right)^2n_{kl}(1-n_{kl})\frac{\bar{D}_l}{\bar{D}_k}\frac{\bar{D}_k-\bar{D}_l}{\bar{D}_k}}{\hat{V}^D} \end{align}\]
Note how the variance of treatment affects the weights! “Changing the number or spacing of time periods changes the weights” (Goodman-Bacon).
Even if the treatment effect is constant, changing the length of the panel can change the weighted average if different groups have different treatment effects.
de Chaisemartin and D’Haultfœuille (2020), American Economic Review
de Chaisemartin and D’Haultfœuille (2020) more explicitly show the problem with weights.
The problem is an extrapolation problem
Essentially, “the regression predicts a treatment probability larger than the one in that cell” (de Chaisemartin and D’Haultfœuille)
Note that if the treatment effect is constant, then the weighted average is always the same, no matter the weights
But if the treatment effect is not constant, then the weighted average can be different from the true average
Back to Goodman-Bacon
Let’s return to Goodman-Bacon’s formulation
What happens if we have a “control” group in a later period that is treated in an earlier period? \[\begin{align} \begin{split} \hat{\delta}_{lk}^{2x2} = &ATT_{l,POST(l)} \\
+ &\Delta Y_l^0(Post(l), MID) - \Delta Y_k^0(Post(l), MID) \\
- &(ATT_k(Post) - ATT_k(Mid)) \end{split} \end{align}\]
The first line is what we want
The second line is parallel-trends bias
The third line is bias due to heterogeneity in time!
Even with parallel trends, this third line can cause deviations from the true ATT
Back to Goodman-Bacon
In other words, what causes the problem?
The fact that already-treated groups can be used as control groups for later-treated groups
This means that changing “effects” of the intervention can bias this later comparison
Goodman-Bacon (2021)
Enough math, what to do?
That’s enough math. Let’s talk about what we can actually do!
Cheng and Hoekstra (2013), Journal of Human Resources
The “castle doctrine” laws essentially make lethal force “more” legal
Recall the changes we made: turn homicide into a rate (per 100,000 people) and take the log:
Code
library(haven) # to load .dta files

df <- read_dta("week5files/castle.dta")
head(df)

# key variables: state, year, cdl ("treatment"), and homicide_c (outcome)
# homicide_c to rate (per 100,000 people)
df$homicide_c <- (df$homicide_c/df$population)*100000
# and log
df$homicide_c <- log(df$homicide_c)
Cheng and Hoekstra (2013), Journal of Human Resources
I made some changes to the data, as well
I’ve turned the “treatment” variable (cdl) into a dummy variable
We have the issue we ran into above:
Treatment is staggered across time
This means that some already-treated units will serve as controls for later-treated units!
Treatment timing
Treatment timing by individual state
Two-way fixed effects with fixest, as simple as possible
Code
# state fe, note the weights!
reg1 <- feols(homicide_c ~ cdl + log(population) + unemployrt | state,
              data = df, cluster = c("state"), weights = df$population)

# state and year fe
reg2 <- feols(homicide_c ~ cdl + log(population) + unemployrt | state + year,
              data = df, cluster = c("state"), weights = df$population)

# with state linear trends, note the syntax
reg3 <- feols(homicide_c ~ cdl + log(population) + unemployrt | state + year + state[year],
              data = df, cluster = c("state"), weights = df$population)
Two-way fixed effects with fixest
|                 | reg1      | reg2    | reg3    |
|-----------------|-----------|---------|---------|
| Treat           | 0.057     | 0.089** | 0.097*  |
|                 | (0.034)   | (0.033) | (0.036) |
| Pop. (log)      | -0.544**  | -0.768* | -0.543  |
|                 | (0.168)   | (0.370) | (0.960) |
| Unemp. rate     | -0.024*** | -0.009  | -0.003  |
|                 | (0.004)   | (0.010) | (0.011) |
| Fixed-Effects:  |           |         |         |
| state           | Yes       | Yes     | Yes     |
| year            | No        | Yes     | Yes     |
| Varying Slopes: |           |         |         |
| year (state)    | No        | No      | Yes     |
| Observations    | 550       | 550     | 550     |
* p < 0.1, ** p < 0.05, *** p < 0.01.
Note: Standard errors clustered at the state level are in parentheses.
Event studies
Event studies are a way to look at the effect of treatment over time
We can see if the effect is immediate, or if it takes time to “kick in”
We can also see whether there are any pre-trends
Effectively, what we want to do is redefine the time period to be relative to treatment
For example, if treatment is introduced in 2005, we want to redefine 2005 as year 0, 2004 as year -1, etc.
Let’s do this now
Event studies
Code
# first, find the year of treatment by state
df <- df %>%
  # group by state ("panel" identifier)
  group_by(state) %>%
  mutate(year_treat = ifelse(cdl==1, year, NA),
         # first year with treatment
         year_treat = min(year_treat, na.rm = TRUE),
         # create new time variable called event_year
         event_year = year - year_treat) %>%
  # ungroup
  ungroup()

# note that states NEVER treated are -Inf
table(df$event_year)
# now find the average homicide rate by event_year
df_event <- df %>%
  # drop the missings
  filter(event_year > -11) %>%
  group_by(event_year) %>%
  # note: weighted.mean()'s weight argument is w
  summarize(homicide_c = weighted.mean(homicide_c, w = population, na.rm = TRUE))
ggplot(df_event) +
  geom_line(aes(x = event_year, y = homicide_c)) +
  theme_bw() +
  labs(x = "Years relative to treatment", y = "Homicide rate (log)")
What do we really want to see?
We don’t really want the means, though
What do we want instead?
We want the effect of treatment over time
We want to essentially plot coefficients
Calculating year-specific coefficients
Code
# note that the year BEFORE treatment, -1, IS THE OMITTED CATEGORY
# i() is a fixest-specific way to create factors/dummies
reg1 <- feols(homicide_c ~ i(event_year, ref = -1) + log(population) + unemployrt | state + year,
              data = df, cluster = c("state"), weights = df$population)
reg1$coefficients
# It's a vector. We can extract the coefficients we want by subsetting with []
# get coefficients
coef <- c(reg1$coefficients[1:9], 0, reg1$coefficients[10:14])
# confidence intervals
lower <- c(confint(reg1)[1:9, 1], 0, confint(reg1)[10:14, 1])
upper <- c(confint(reg1)[1:9, 2], 0, confint(reg1)[10:14, 2])
# create minimum/maximum years
years <- c(-10:4)
Plot these coefficients using geom_point
Code
ggplot() +
  geom_point(aes(x = years, y = coef)) +
  geom_vline(xintercept = -0.5, linetype = "dashed", color = "red") +
  labs(x = "Years to treatment", y = "Coefficient (relative to T = -1)") +
  # change x axis to be every year
  scale_x_continuous(breaks = years) +
  theme_bw()
Plot these coefficients using geom_point
Let’s remove the first three years because of small sample sizes
Add the confidence intervals using geom_errorbar
Code
ggplot() +
  geom_point(aes(x = years, y = coef)) +
  geom_errorbar(aes(x = years, ymin = lower, ymax = upper), alpha = 0.2, width = 0.1) +
  geom_vline(xintercept = -0.5, linetype = "dashed", color = "red") +
  labs(x = "Years to treatment", y = "Coefficient (relative to T = -1)") +
  # change x axis to be every year
  scale_x_continuous(breaks = years) +
  theme_bw()
Add the confidence intervals using geom_errorbar
Add the confidence intervals using geom_line, if you prefer
Add the confidence intervals using geom_ribbon, if you prefer
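A sketch of the geom_ribbon version, reusing the years, coef, lower, and upper objects created above:
Code
library(ggplot2)

ggplot() +
  geom_point(aes(x = years, y = coef)) +
  # shaded confidence band instead of error bars
  geom_ribbon(aes(x = years, ymin = lower, ymax = upper), alpha = 0.2) +
  geom_vline(xintercept = -0.5, linetype = "dashed", color = "red") +
  labs(x = "Years to treatment", y = "Coefficient (relative to T = -1)") +
  scale_x_continuous(breaks = years) +
  theme_bw()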
Let’s use the “Bacon decomposition” to look at weighting
This won’t allow us to calculate standard errors
Think of this as “diagnostics”
We can see how different 2x2 cells have different weights, sometimes markedly so
We can also see that different cells have different treatment estimates
Let’s try the “Bacon decomposition” - Note the treatment variable must be binary
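A sketch with the bacondecomp package (the simple specification without covariates or weights; illustrative only):
Code
library(bacondecomp)

# decompose the simple TWFE estimate into all 2x2 comparisons and their weights
df_bacon <- bacon(homicide_c ~ cdl,
                  data = df,
                  id_var = "state",
                  time_var = "year")
head(df_bacon)

# the weighted average of the 2x2 estimates equals the TWFE coefficient
# from the same simple (no covariates, no weights) specification
sum(df_bacon$estimate * df_bacon$weight)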
# compare to TWFE estimate
# No weights since Bacon decomp doesn't allow them
feols(homicide_c ~ cdl + log(population) + unemployrt | state + year,
      data = df, cluster = c("state"))
We can plot the average effects for the different groups
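A sketch of that plot with ggplot2, reusing the df_bacon object from the decomposition sketch above (the type column labels the kind of 2x2 comparison):
Code
library(ggplot2)

ggplot(df_bacon, aes(x = weight, y = estimate, shape = type, color = type)) +
  geom_point(size = 2) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(x = "Weight", y = "2x2 DD estimate") +
  theme_bw()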
Let’s estimate effects using the did2s function from Kyle Butts
bacondecomp is good for diagnostics, but we really want to estimate the ATT with standard errors
Code
library(did2s)
yname: the outcome variable
first_stage: formula for first stage, can include fixed effects and covariates, but do not include treatment variable(s)!
second_stage: This should be the treatment variable or in the case of event studies, treatment variables.
treatment: This has to be the 0/1 treatment variable that marks when treatment turns on for a unit. If you suspect anticipation, see note above for accounting for this.
cluster_var: Which variables to cluster on
How does this actually work?
The main problem with TWFE is the “residualization” of the treatment variable
did2s uses the implementation from Gardner (2021), which is similar to Borusyak et al. (2021)
Estimate the fixed effects SEPARATELY to avoid residualization of the treatment indicator
This is a “two-step” estimator
Implemented with a generalized method of moments (GMM) estimator to correct standard errors
Let’s estimate effects using the did2s function from Kyle Butts
Code
library(did2s)

# note that we can use fixest syntax, with FE and with i()!
# can also add weights
did2s <- did2s(data = df,
               yname = "homicide_c",
               first_stage = "log(population) + unemployrt | state + year",
               second_stage = "cdl",
               treatment = "cdl",
               cluster_var = "state",
               weights = "population")

# let's compare it to the vanilla TWFE
twfe <- feols(homicide_c ~ cdl + log(population) + unemployrt | state + year,
              data = df, cluster = c("state"), weights = df$population)
We’ve learned how to estimate two-way fixed effects models with fixest
We’ve also learned how they can be biased if treatment is staggered
We saw how to use did2s to estimate the ATT
We also saw how to use bacondecomp to look at the weights
A lingering question: TWFE with continuous treatment variables
Callaway et al. (2021) and de Chaisemartin and D’Haultfœuille (2023) have some ideas
IVs with TWFE?
Note that you could in theory do this by hand! IV estimates are just a ratio of reduced form and first stage. Could bootstrap the ratio.
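A sketch of the by-hand idea with simulated cross-sectional data (a purely mechanical illustration with hypothetical variables y, d, and z): the IV estimate is the reduced-form coefficient divided by the first-stage coefficient, and a simple pairs bootstrap gives a standard error for that ratio.
Code
library(fixest)

set.seed(1)

# toy data: z is the instrument, d the treatment, y the outcome
n <- 1000
sim <- data.frame(z = rbinom(n, 1, 0.5))
sim$d <- 0.5*sim$z + rnorm(n)
sim$y <- 2*sim$d + rnorm(n)

# IV (Wald) estimate as a ratio of reduced form to first stage
rf <- coef(feols(y ~ z, data = sim))["z"]
fs <- coef(feols(d ~ z, data = sim))["z"]
rf / fs

# bootstrap the ratio
boot_ratio <- replicate(1000, {
  b <- sim[sample(nrow(sim), replace = TRUE), ]
  coef(feols(y ~ z, data = b))["z"] / coef(feols(d ~ z, data = b))["z"]
})
sd(boot_ratio) # bootstrap standard error of the IV estimate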
Synthetic control
Increasing cigarette taxes and discouraging smoking in California (in 1988)
Increasing cigarette taxes and discouraging smoking in California (in 1988)
What if we wanted to figure out whether the law changed smoking in California?
A key problem, similar to Card and Krueger:
There is really only one treated unit
Many people live in California, obviously, but they are all equally affected by the law
Differences-in-differences is problematic here
A single treated cluster!
So what are we to do?
Synthetic control
This is where synthetic control comes in
Abadie and Gardeazabal (2003), American Economic Review
Abadie, Diamond, and Hainmueller (2010), Journal of the American Statistical Association
The basic idea:
We want to create a “synthetic” California that is a weighted average of the other states
We want to weight the other states so that they look like California before the law
We can then compare the synthetic California to the real California after the law
Synthetic control
Synthetic control doesn’t just match on pre-treatment outcomes
It also matches on pre-treatment trends and covariates
This is what makes it different from pure matching
Main requirement:
Many pre-treatment periods (Abadie, Diamond, and Hainmueller, 2010)
A little formalization
Let’s say we are interested in outcome \(Y_{jt}\) for unit \(j\) at time \(t\)
“Treated” group is \(j=1\)
In the post period, we estimate the effect of the intervention as \[\begin{gather} Y_{1t} - \sum_{j=2}^Jw_j^*Y_{jt}, \end{gather}\]
where \(w_j\) are time-invariant weights that we will estimate in the pre period.
A little formalization
In an ideal world, we would estimate the weights such that \[\begin{gather} \sum_{j=2}^Jw_j^*Y_{j1} = Y_{11},\;\sum_{j=2}^Jw_j^*Y_{j2} = Y_{12},\;\ldots,\;\sum_{j=2}^Jw_j^*Y_{jT_0} = Y_{1T_0}, \end{gather}\] where \(T_0\) is the number of pre-treatment periods.
In practice, however, this will never hold exactly.
Instead, we will have to choose weights such that the differences are as small as possible.
Our job is to estimate \(W\), the vector of weights for each of the candidate control units.
A little formalization
Consider a set of variables (which can include pre-treatment outcomes) \(X\), where \(X_1\) is the treated unit and \(X_0\) are the control units.
We will minimize \[\begin{gather} \lVert X_1-X_0W\rVert = \sqrt{(X_1-X_0W)'V(X_1-X_0W)}, \end{gather}\] where \(V\) is a diagonal matrix of weights for different variables.
Essentially there are two types of weights:
Weights for different variables
Weights for different units
A little formalization
Estimating \(V\) is important
The most common approach is to minimize the mean squared prediction error (Cunningham, 2022): \[\begin{gather} \sum_{t=1}^{T_0}\left(Y_{1t}-\sum_{j=2}^Jw_j^*Y_{jt}\right)^2, \end{gather}\] where, again, \(T_0\) is the number of pre-treatment periods.
Thankfully, the canned R packages do all of this for us!
It’s nonetheless good to have an understanding of what they’re doing
Synthetic control in R with the tidysynth package
Code
library(tidysynth)

# We are going to use the "smoking" data from Abadie, Diamond, and Hainmueller (2010)
# One issue: note how many missing values there are! We'll talk more on the next slide.
summary(df)
state year cigsale lnincome beer
Min. : 1 Min. :1970 Min. : 40.7 Min. : 9.397 Min. : 2.50
1st Qu.:10 1st Qu.:1977 1st Qu.:100.9 1st Qu.: 9.739 1st Qu.:20.90
Median :20 Median :1985 Median :116.3 Median : 9.861 Median :23.30
Mean :20 Mean :1985 Mean :118.9 Mean : 9.862 Mean :23.43
3rd Qu.:30 3rd Qu.:1993 3rd Qu.:130.5 3rd Qu.: 9.973 3rd Qu.:25.10
Max. :39 Max. :2000 Max. :296.2 Max. :10.487 Max. :40.40
NA's :195 NA's :663
age15to24 retprice state_str california
Min. :0.1294 Min. : 27.3 Length:1209 Min. :0.00000
1st Qu.:0.1658 1st Qu.: 50.0 Class :character 1st Qu.:0.00000
Median :0.1781 Median : 95.5 Mode :character Median :0.00000
Mean :0.1755 Mean :108.3 Mean :0.02564
3rd Qu.:0.1867 3rd Qu.:158.4 3rd Qu.:0.00000
Max. :0.2037 Max. :351.2 Max. :1.00000
NA's :390
A note: using this package is not straightforward. The website has code you can simply copy-paste.
We have to create a “synthetic control object”… kind of a pain, to be honest
Code
smoking_out <- df %>%
  # initial the synthetic control object
  synthetic_control(outcome = cigsale, # outcome
                    unit = state, # unit index in the panel data
                    time = year, # time index in the panel data
                    i_unit = mean(df$state[df$california==1]), # unit where the intervention occurred
                    i_time = 1988, # time period when the intervention occurred
                    generate_placebos = T) %>% # generate placebo synthetic controls (for inference)
  # Generate the aggregate predictors used to fit the weights
  # average log income, retail price of cigarettes, and proportion of the
  # population between 15 and 24 years of age from 1980 - 1988
  generate_predictor(time_window = 1980:1988,
                     ln_income = mean(lnincome, na.rm = T),
                     ret_price = mean(retprice, na.rm = T),
                     youth = mean(age15to24, na.rm = T)) %>%
  # average beer consumption in the donor pool from 1984 - 1988
  generate_predictor(time_window = 1984:1988,
                     beer_sales = mean(beer, na.rm = T)) %>%
  # Lagged cigarette sales
  generate_predictor(time_window = 1975,
                     cigsale_1975 = cigsale) %>%
  generate_predictor(time_window = 1980,
                     cigsale_1980 = cigsale) %>%
  generate_predictor(time_window = 1988,
                     cigsale_1988 = cigsale) %>%
  # Generate the fitted weights for the synthetic control
  generate_weights(optimization_window = 1970:1988, # time to use in the optimization task
                   margin_ipop = .02, sigf_ipop = 7, bound_ipop = 6) %>% # optimizer options
  # Generate the synthetic control
  generate_control()
Let’s first look at the weights
Code
# If you get the first step above, the rest is easy
smoking_out %>% plot_weights()
Code
# but what are the states!?
Let’s first look at the weights
Code
# Grab the weights
weights <- smoking_out %>% grab_unit_weights()

# merge in state names
weights <- weights %>%
  mutate(unit = as.numeric(unit)) %>%
  left_join(df %>% select(unit = state, state_str) %>% distinct(), by = "unit")

# arrange by weight, in descending order
weights %>% arrange(-weight)

# note they sum to one
print(paste0("Sum of weights: ", sum(weights$weight)))