D = 0, or after treatment and D = 1. If we control for time, we’re effectively controlling for treatment.
#Create our data
# Simulate 10,000 observations in which treatment switches on for everyone
# from 2007 onward, so D is completely determined by year. The true treatment
# effect on Y is 2, and Y also rises by .5 per year.
diddata <- tibble(year = sample(2002:2010, 10000, replace = TRUE)) %>%
  mutate(D = year >= 2007) %>%
  # n() is the number of rows, so the noise vector always matches the sample size
  mutate(Y = 2 * D + .5 * year + rnorm(n()))
#Now, control for year
# Subtract the within-year mean from D and from Y, then ungroup() so the
# by-year grouping does not carry over into later steps.
diddata <- diddata %>%
  group_by(year) %>%
  mutate(D.r = D - mean(D), Y.r = Y - mean(Y)) %>%
  ungroup()
#What's the difference with and without treatment?
# The raw gap mixes the treatment effect with the time trend in Y.
diddata %>% group_by(D) %>% summarize(Y = mean(Y))
## # A tibble: 2 x 2
## D Y
## <lgl> <dbl>
## 1 FALSE 1002.
## 2 TRUE 1006.
#And controlling for time?
# D never varies within a year, so D.r is 0 in every row and only one group
# remains: no variation in treatment is left once year is controlled for.
diddata %>% group_by(D.r) %>% summarize(Y = mean(Y.r))
## # A tibble: 1 x 2
## D.r Y
## <dbl> <dbl>
## 1 0 1.84e-15
#Create our data
# Each observation is randomly assigned to one of two groups; the treatment
# (true effect of 2 on Y) switches on from 2007 onward for the treated group only.
diddata <- tibble(year = sample(2002:2010, 10000, replace = TRUE),
                  group = sample(c('TreatedGroup', 'UntreatedGroup'), 10000, replace = TRUE)) %>%
  mutate(after = (year >= 2007)) %>%
  #Only let the treatment be applied to the treated group
  mutate(D = after * (group == 'TreatedGroup')) %>%
  # n() is the number of rows, so the noise vector always matches the sample size
  mutate(Y = 2 * D + .5 * year + rnorm(n()))
#Now, get before-after differences for both groups
# ungroup() so `means` is a plain four-row tibble (one row per group-by-after cell)
means <- diddata %>%
  group_by(group, after) %>%
  summarize(Y = mean(Y)) %>%
  ungroup()
#Before-after difference for untreated, has time effect only
# `after` is logical, so filter on it directly instead of comparing it with 1/0
bef.aft.untreated <- filter(means, group == 'UntreatedGroup', after)$Y -
  filter(means, group == 'UntreatedGroup', !after)$Y
#Before-after for treated, has time and treatment effect
bef.aft.treated <- filter(means, group == 'TreatedGroup', after)$Y -
  filter(means, group == 'TreatedGroup', !after)$Y
#Difference-in-Difference! Take the Time + Treatment effect, and remove the Time effect
DID <- bef.aft.treated - bef.aft.untreated
DID
## [1] 1.976004
#Create our data
# Same design as a two-group before/after comparison, except that the treated
# group's Y is also 1 higher in every year (a fixed group difference), on top
# of the true treatment effect of 2 from 2007 onward.
diddata <- tibble(year = sample(2002:2010, 10000, replace = TRUE),
                  group = sample(c('TreatedGroup', 'UntreatedGroup'), 10000, replace = TRUE)) %>%
  mutate(after = (year >= 2007)) %>%
  #Only let the treatment be applied to the treated group
  mutate(D = after * (group == 'TreatedGroup')) %>%
  # n() is the number of rows, so the noise vector always matches the sample size
  mutate(Y = 2 * D + .5 * year + (group == 'TreatedGroup') + rnorm(n()))
#Now, get before-after differences for both groups
# ungroup() so `means` is a plain four-row tibble (one row per group-by-after cell)
means <- diddata %>%
  group_by(group, after) %>%
  summarize(Y = mean(Y)) %>%
  ungroup()
#Before-after difference for untreated, has time effect only
# `after` is logical, so filter on it directly instead of comparing it with 1/0
bef.aft.untreated <- filter(means, group == 'UntreatedGroup', after)$Y -
  filter(means, group == 'UntreatedGroup', !after)$Y
#Before-after for treated, has time and treatment effect
# The fixed group difference is in both the before and after means, so it cancels here
bef.aft.treated <- filter(means, group == 'TreatedGroup', after)$Y -
  filter(means, group == 'TreatedGroup', !after)$Y
#Difference-in-Difference! Take the Time + Treatment effect, and remove the Time effect
DID <- bef.aft.treated - bef.aft.untreated
DID
## [1] 1.953753
# Load the Mariel data. The steps below rely on load() making a data frame
# called df available (presumably stored in mariel.RData -- confirm).
load('mariel.RData')
# Build the analysis variables: log hourly wage, an "after treatment" flag
# (year >= 81) and a "treated group" flag (smsarank == 26, labelled miami).
df <- df %>%
  mutate(lwage = log(hourwage)) %>%
  mutate(after = year >= 81,
         miami = smsarank == 26)
# Cell means of lwage and unemp for each after-by-miami combination;
# these four rows are all that the difference-in-difference needs.
means <- summarize(group_by(df, after, miami),
                   lwage = mean(lwage),
                   unemp = mean(unemp))
means
## # A tibble: 4 x 4
## # Groups: after [2]
## after miami lwage unemp
## <lgl> <lgl> <dbl> <dbl>
## 1 FALSE FALSE 1.88 0.0619
## 2 FALSE TRUE 1.74 0.0547
## 3 TRUE FALSE 1.84 0.0794
## 4 TRUE TRUE 1.72 0.0854
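Before-after change in lwage in Miami: means$lwage[4] - means$lwage[2] = -0.019. Uh oh!
Before-after change in lwage outside Miami: means$lwage[3] - means$lwage[1] = -0.046
Before-after change in unemp in Miami: means$unemp[4] - means$unemp[2] = 0.031
Before-after change in unemp outside Miami: means$unemp[3] - means$unemp[1] = 0.018
*This is called the “parallel trends” assumption.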
Load the data with read.csv('http://nickchk.com/eitc.csv').
Create after for years 1994+, and treated if they have any children.
Get the average of work within year and treated.
Use plot(,type='l',ylim=c(.4,.6)) to plot average work separately against year for treated (blue), then points to add untreated (red).
Any concerns they’re already trending together/apart in 1991-1993?
Then get the difference-in-difference estimate for work.
# Load the EITC data and flag the post-period (1994 onward) and the treated
# group (anyone with children).
df <- read.csv('http://nickchk.com/eitc.csv') %>%
  mutate(after = year >= 1994,
         treated = children > 0)
# Average of work for each treated-by-year cell, used for the trend plot
plotdata <- df %>%
  group_by(treated, year) %>%
  summarize(work = mean(work)) %>%
  ungroup()
# `treated` is logical, so filter on it directly instead of comparing with 1/0
treated.trend <- filter(plotdata, treated)
untreated.trend <- filter(plotdata, !treated)
plot(treated.trend$year, treated.trend$work,
     col = 'blue', type = 'l', ylim = c(.4, .6),
     xlab = 'year', ylab = 'average work')
# lines() adds the second series as a line, which is what points(type='l') drew
lines(untreated.trend$year, untreated.trend$work, col = 'red')
# Mark the first "after" year
abline(v = 1994)
# They don't appear to be trending away or towards each other before 1994. Good!
#Now DID:
did <- df %>%
  group_by(treated, after) %>%
  summarize(work = mean(work)) %>%
  ungroup()
# Pick each cell by its treated/after values rather than by row position, so
# the result does not depend on how the summary rows happen to be ordered.
untreat.diff <- filter(did, !treated, after)$work - filter(did, !treated, !after)$work
treat.diff <- filter(did, treated, after)$work - filter(did, treated, !after)$work
did.estimate <- treat.diff - untreat.diff
# Print the estimate so the output shown below is actually produced
did.estimate
## [1] 0.04687313