`Y` to be for any given value of X.
When `X` is a logical or a factor, regression and our method are exactly the same! This will make doing most of our methods easy!
library(Ecdat)
# Bring the Airline data set into the workspace.
data(Airline)

# Our method: within each airline, subtract that airline's own mean from
# output and from cost, keeping only the within-airline variation.
AirlineOurMethod <- mutate(
  group_by(Airline, airline),
  output.r = output - mean(output),
  cost.r = cost - mean(cost)
)

# Regression: take the residuals left after regressing each variable on
# airline treated as a factor.
AirlineReg <- mutate(
  Airline,
  output.reg = residuals(lm(output ~ factor(airline))),
  cost.reg = residuals(lm(cost ~ factor(airline)))
)

# Print the two correlations side by side for comparison.
c(
  with(AirlineOurMethod, cor(output.r, cost.r)),
  with(AirlineReg, cor(output.reg, cost.reg))
)
## [1] 0.9272297 0.9272297
load('mariel.RData')
# (some data-cleaning omitted here, see the code for the slides)

# Difference in difference with our method: average lwage (and unemp)
# within every after-by-miami cell.
means <- summarize(
  group_by(df, after, miami),
  lwage = mean(lwage),
  unemp = mean(unemp)
)
# (row 4 - row 2) minus (row 3 - row 1) of the cell means.
# Assumes the four rows come back ordered by after, then miami, with
# FALSE before TRUE - confirm against the omitted cleaning code.
with(means, (lwage[4] - lwage[2]) - (lwage[3] - lwage[1]))

# ...or by regression, using an "interaction term"
lm(lwage ~ after * miami, data = df)
## [1] 0.02740653
##
## Call:
## lm(formula = lwage ~ after * miami, data = df)
##
## Coefficients:
## (Intercept) afterTRUE miamiTRUE
## 1.88186 -0.04606 -0.14674
## afterTRUE:miamiTRUE
## 0.02741
`Z`. Just now, we explain it using regression!
library(AER)
# Instrumental-variables example on the CigarettesSW data set, first with
# "our method" and then with regression.
data(CigarettesSW)
#data-cleaning code to perform our version of IV omitted here
# NOTE(review): priceexp and packsexp are not created anywhere in the visible
# code; presumably the omitted cleaning adds them to CigarettesSW - confirm.
cor(CigarettesSW$priceexp,CigarettesSW$packsexp)
#And now with regression
# Load the data set again under the same name; any columns added by the
# omitted cleaning are presumably gone from this point on.
data(CigarettesSW)
# First step: fitted values from regressing packs on cigtax.
x.explained.with.z <- predict(lm(packs~cigtax,data=CigarettesSW))
# Second step: regress price on those fitted values.
# NOTE(review): here packs is the variable explained by cigtax and price is
# the outcome - confirm these are the intended roles. The printed intercept
# is ~0, which suggests the data were centered before the output was
# produced; that step is not visible here.
lm(price~x.explained.with.z,data=CigarettesSW)
## [1] -0.9711096
##
## Call:
## lm(formula = price ~ x.explained.with.z, data = CigarettesSW)
##
## Coefficients:
## (Intercept) x.explained.with.z
## 3.155e-16 -1.461e+00
A random forest (used below) explains `Y` with `X` in a way very similar to our method - it breaks up `X` into bins and takes the average of `Y` within each bin. It doesn’t fit a line.
library(AER)
# Load GoldSilver and convert it to a plain data frame so it can be used
# with mutate() and the model formulas below.
data(GoldSilver)
GoldSilver <- as.data.frame(GoldSilver)

library(randomForest)
# Random forest explaining gold with silver.
rf <- randomForest(gold ~ silver, data = GoldSilver)

# Store both sets of predictions next to the data: one from the forest,
# one from a linear regression of gold on silver.
GoldSilver <- mutate(
  GoldSilver,
  rf.predict = predict(rf),
  reg.predict = predict(lm(gold ~ silver, data = GoldSilver))
)
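- Create a data set `dat` with:
  - `a`: all even numbers from 2 to 100 twice, i.e. 2, 2, 4, 4… (hint: `rep()`)
  - `b`: randomly selected from ‘Hi’, ‘Hello’, and ‘Goodbye’. Make it a factor.
- `arrange()` the data by `a`
- Get the sum of `a`
- Count how many values of `b` are ‘Hi’ or ‘Hello’ using `%in%`
- Use `mutate` to create `c` as a logical equal to 1 if `b` is ‘Goodbye’ OR if `a > 90` OR if `a <= 10`
- Get the proportion of variance in `c` explained by `b`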
# Practice answer: build `dat`, then summarise it.
#   a: the even numbers 2..100, each appearing twice (sorted by arrange()).
#   b: a random greeting. The exercise asks for a factor, and data.frame()
#      only converts strings to factors by default in R < 4.0, so wrap the
#      sample in factor() explicitly.
#   c: TRUE when b is 'Goodbye', or a is above 90, or a is at most 10.
dat <- data.frame(
  a = rep((1:50) * 2, 2),
  b = factor(sample(c('Hi', 'Hello', 'Goodbye'), 100, replace = TRUE))
) %>%
  arrange(a) %>%
  mutate(c = (b == 'Goodbye') | (a > 90) | (a <= 10))

# Sum of a.
sum(dat$a)
# Number of rows where b is 'Hi' or 'Hello'.
sum(dat$b %in% c('Hi', 'Hello'))

# Proportion of the variance of c explained by b: remove the within-b mean
# of c, then compare residual variance with total variance.
# dat is left grouped by b after this step.
dat <- dat %>%
  group_by(b) %>%
  mutate(c.res = c - mean(c))
1 - var(dat$c.res) / var(dat$c)
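- Create a vector with `x <- rexp(3000)`
- Plot the density of `x`, with proper labels
- Do you expect `mean(x)` or `median(x)` to be higher?
- Add `abline`s in different colors to your plot to check
- Create a `stargazer` table to describe `as.data.frame(x)`
- Use `quantile` to get the 10th, 20th, … 100th percentile of `x`
- What in the real world might be distributed like `x`?
x <- rexp(3000)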
# Density of x, with labelled axes and a title.
plot(density(x), xlab = 'X', ylab = 'Density', main = 'Distribution of X')

# Because we have a small number of huge values, the mean should be larger.
# Mark the mean (blue) and the median (red) with vertical lines to check.
abline(v = c(mean(x), median(x)), col = c('blue', 'red'))

# Text summary table of x.
library(stargazer)
stargazer(as.data.frame(x), type = 'text')

# 10th, 20th, ..., 100th percentiles of x.
quantile(x, probs = 1:10 / 10)

# Something unequally distributed, with a few big winners, like income or
# wealth, might be distributed like x
- Take `dat` from the first practice, and add `d = rnorm(100) + .3*a` (may need to `ungroup()`)
- Create a `table` and a `prop.table` for `b`
- Make a `barplot` for the count of `b`, and one for the proportion
- Plot the density of `d` when `c == FALSE` and use `lines()` to overlay different-colored density when `c == TRUE`
- Get the difference in mean `d` between `c == FALSE` and `c == TRUE`
- Make a scatterplot with `a` on the x-axis and `d` on the y-axis
- Use `cut(,breaks=8)` to get prop. of var. of `d` explained by `a`
# Drop the grouping left over from the first practice, then add d.
dat <- mutate(ungroup(dat), d = rnorm(100) + .3 * a)

# Counts and proportions of b, as tables and as bar plots.
table(dat$b)
prop.table(table(dat$b))
barplot(table(dat$b))
barplot(prop.table(table(dat$b)))

# Density of d when c is FALSE (red), overlaid with the density when c is
# TRUE (blue).
plot(density((filter(dat, c == FALSE))$d), col = 'red')
lines(density((filter(dat, c == TRUE))$d), col = 'blue')

# Mean of d within each value of c, then second row minus first row.
meandiff <- summarize(group_by(dat, c), d = mean(d))
with(meandiff, d[2] - d[1])

# Scatterplot with a on the x-axis and d on the y-axis.
plot(dat$a, dat$d)

# Proportion of the variance of d explained by a: cut a into 8 bins, remove
# each bin's mean of d, and compare residual variance with total variance.
# dat is left grouped by the bin column after this step.
dat <- mutate(group_by(dat, cut(a, breaks = 8)), d.res = d - mean(d))
with(dat, 1 - var(d.res) / var(d))