# Y to be for any given value of X. When X is a logical or a factor,
# regression and our method are exactly the same! This will make doing most
# of our methods easy!
library(Ecdat)
data(Airline)

# NOTE(review): %>%, group_by(), mutate() and ungroup() are dplyr functions and
# no library(dplyr)/library(tidyverse) call is visible in this section --
# confirm it is attached earlier in the script.

# Our method: within each airline, subtract that airline's mean from output
# and from cost. ungroup() afterwards so the grouping does not leak into any
# later use of AirlineOurMethod.
AirlineOurMethod <- Airline %>%
  group_by(airline) %>%
  mutate(
    output.r = output - mean(output),
    cost.r = cost - mean(cost)
  ) %>%
  ungroup()

# Regression: keep the residuals from regressing each variable on
# factor(airline).
AirlineReg <- Airline %>%
  mutate(
    output.reg = residuals(lm(output ~ factor(airline))),
    cost.reg = residuals(lm(cost ~ factor(airline)))
  )

# Correlation of the two residualized variables under each approach; the
# printed result shows the two numbers are the same.
c(
  cor(AirlineOurMethod$output.r, AirlineOurMethod$cost.r),
  cor(AirlineReg$output.reg, AirlineReg$cost.reg)
)
## [1] 0.9272297 0.9272297
load('mariel.RData')
# (some data-cleaning omitted here, see the code for the slides)
# NOTE(review): `df` is not created in the visible code -- presumably it comes
# from the omitted cleaning step; confirm.

# Then we can do our difference in difference with our method:
# average lwage and unemp within each after/miami cell ...
means <- df %>%
  group_by(after, miami) %>%
  summarize(
    lwage = mean(lwage),
    unemp = mean(unemp)
  )
# ... and subtract the row 1 -> row 3 change from the row 2 -> row 4 change.
# (Rows are sorted by after, then miami, so rows 2 and 4 are the two
# miami == TRUE cells when all four cells are present.)
with(means, (lwage[4] - lwage[2]) - (lwage[3] - lwage[1]))
## [1] 0.02740653

# or by regression, using an "interaction term"
lm(lwage ~ after * miami, data = df)
##
## Call:
## lm(formula = lwage ~ after * miami, data = df)
##
## Coefficients:
## (Intercept) afterTRUE miamiTRUE
## 1.88186 -0.04606 -0.14674
## afterTRUE:miamiTRUE
## 0.02741
# Z. The only difference is that now we explain it using regression!
library(AER)
data(CigarettesSW)
# data-cleaning code to perform our version of IV omitted here
# NOTE(review): priceexp and packsexp are not created in the visible code --
# presumably the omitted cleaning adds them; confirm.
with(CigarettesSW, cor(priceexp, packsexp))
## [1] -0.9711096

# And now with regression (the data set is reloaded first)
data(CigarettesSW)
# Fitted values from regressing packs on cigtax.
# NOTE(review): cigtax is not defined in the visible code -- confirm where it
# is created.
x.explained.with.z <- predict(
  lm(packs ~ cigtax, data = CigarettesSW)
)
# Regress price on those fitted values.
# NOTE(review): the ~0 intercept in the printed output below suggests the
# slides ran this on centered variables -- confirm against the full code.
lm(price ~ x.explained.with.z, data = CigarettesSW)
##
## Call:
## lm(formula = price ~ x.explained.with.z, data = CigarettesSW)
##
## Coefficients:
## (Intercept) x.explained.with.z
## 3.155e-16 -1.461e+00
# Y with X in a way very similar to our method - it breaks up X into bins and
# takes the average of Y within each bin. It doesn't fit a line.
library(AER)
data(GoldSilver)
# Convert to a data frame before modelling
GoldSilver <- as.data.frame(GoldSilver)
library(randomForest)
# Predict gold from silver two ways and store both sets of predictions:
# a random forest and a straight-line regression.
rf <- randomForest(gold ~ silver, data = GoldSilver)
GoldSilver <- GoldSilver %>%
  mutate(
    rf.predict = predict(rf),
    reg.predict = predict(lm(gold ~ silver, data = GoldSilver))
  )

# Practice 1 ----
# Create a data set `dat` with:
#   - a: all even numbers from 2 to 100 twice, i.e. 2, 2, 4, 4... (hint: rep())
#   - b: randomly selected from 'Hi', 'Hello', and 'Goodbye'. Make it a factor.
# Then:
#   - arrange() the data by a
#   - get the sum of a
#   - count how many b are 'Hi' or 'Hello' using %in%
#   - use mutate to create c as a logical equal to 1 if b is 'Goodbye'
#     OR if a > 90 OR if a <= 10
#   - get the proportion of variance in c explained by b

# factor() is explicit because data.frame() no longer converts strings to
# factors by default (R >= 4.0), and the exercise asks for a factor.
dat <- data.frame(
  a = rep(1:50 * 2, 2),
  b = factor(sample(c("Hi", "Hello", "Goodbye"), 100, replace = TRUE))
) %>%
  arrange(a) %>%
  mutate(c = (b == "Goodbye") | (a > 90) | (a <= 10))
sum(dat$a)
sum(dat$b %in% c("Hi", "Hello"))
# Residual of c after removing the mean of c within each value of b;
# ungroup() so later steps do not silently operate by group.
dat <- dat %>%
  group_by(b) %>%
  mutate(c.res = c - mean(c)) %>%
  ungroup()
# Share of the variance of c that is explained by b
1 - var(dat$c.res) / var(dat$c)

# Practice 2 ----
#   - create x <- rexp(3000)
#   - plot the density of x, with proper labels
#   - do you expect mean(x) or median(x) to be higher?
#   - add ablines in different colors to your plot to check
#   - make a stargazer table to describe as.data.frame(x)
#   - use quantile to get the 10th, 20th, ... 100th percentile of x
#   - what kind of real-world variable might be distributed like x?
x <- rexp(3000)
# Draw the estimated density of x with a title and labelled axes
plot(
  density(x),
  main = 'Distribution of X',
  xlab = 'X',
  ylab = 'Density'
)
# Because we have a small number of huge values, the mean should be larger.
# Mark the mean in blue and the median in red to check.
abline(v = c(mean(x), median(x)), col = c('blue', 'red'))
library(stargazer)
# Text-format stargazer table describing x
stargazer(as.data.frame(x), type = 'text')
# 10th, 20th, ..., 100th percentiles of x
quantile(x, probs = 1:10 / 10)
# Something unequally distributed, with a few big winners, like income or
# wealth, might be distributed like x

# Practice 3 ----
#   - take dat from the first practice, and add d = rnorm(100) + .3*a
#     (may need to ungroup())
#   - make a table and a prop.table for b
#   - make a barplot for the count of b, and one for the proportion
#   - plot the density of d when c == FALSE and use lines() to overlay a
#     different-colored density when c == TRUE
#   - get the difference in mean d between c == FALSE and c == TRUE
#   - make a scatterplot with a on the x-axis and d on the y-axis
#   - use cut(,breaks=8) to get prop. of var. of d explained by a

# This statement must sit on its own line: when it was fused onto the comment
# above, d was never created and every later step that reads d failed.
dat <- dat %>%
  ungroup() %>%
  mutate(d = rnorm(100) + .3 * a)

# Counts and proportions of b, as tables and as bar plots
table(dat$b)
prop.table(table(dat$b))
barplot(table(dat$b))
barplot(prop.table(table(dat$b)))

# Density of d where c is FALSE (red), with the c == TRUE density overlaid (blue)
plot(density((filter(dat, c == FALSE))$d), col = 'red')
lines(density((filter(dat, c == TRUE))$d), col = 'blue')

# Mean of d within each value of c; row 1 is c == FALSE and row 2 is c == TRUE,
# so this is the TRUE-minus-FALSE difference in means.
meandiff <- dat %>%
  group_by(c) %>%
  summarize(d = mean(d))
meandiff$d[2] - meandiff$d[1]

plot(dat$a, dat$d)

# Residual of d after removing the mean of d within each of 8 bins of a;
# ungroup() so dat is not left grouped by the bin column.
dat <- dat %>%
  group_by(cut(a, breaks = 8)) %>%
  mutate(d.res = d - mean(d)) %>%
  ungroup()
# Share of the variance of d that is explained by (binned) a
1 - var(dat$d.res) / var(dat$d)