coins <- sample(c("Heads","Tails"),501,replace=T)
mean(coins=='Heads') =
0.493014barplot(prop.table(table(coins)))
#A blank vector to hold our results
propHeads <- c()
#Let's run this simulation 2000 times
for (i in 1:2000) {
#Re-create data using the true model
coinsdraw <- sample(c("Heads","Tails"),501,replace=T)
#Re-perform our analysis
result <- mean(coinsdraw=="Heads")
#And store the result
propHeads[i] <- result
}
#Let's see what we get on average
stargazer(as.data.frame(propHeads),type='text')
##
## ============================================================
## Statistic N Mean St. Dev. Min Pctl(25) Pctl(75) Max
## ------------------------------------------------------------
## propHeads 2,000 0.500 0.022 0.427 0.485 0.515 0.589
## ------------------------------------------------------------
#And let's look at the distribution of our findings
plot(density(propHeads),xlab="Proportion Heads",main="Mean of 501 Coin Flips over 2000 Samples")
abline(v=mean(propHeads),col='red')
sample()
as before to pick random data from different categories like Heads and Tails, or integers (say, 1:10
)help(Distributions)
.runif(thismanyobs,min,max)
will draw thismanyobs
observations from the range of min
to max
.runif(thismanyobs)
will assume min=0
and max=1
uniformdata <- runif(5)
uniformdata
## [1] 0.1267818 0.8207126 0.2619525 0.4291249 0.1961915
uniformdata <- runif(2000)
hist(uniformdata,xlab="Random Value",main="Distribution of Random Data from Uniform Distribution",probability=TRUE)
rnorm(thismanyobs,mean,sd)
will draw thismanyobs
observations from a normal distribution with mean mean
and standard deviation sd
rnorm(thismanyobs)
will assume mean=0
and sd=1
normaldata <- rnorm(5)
normaldata
## [1] -2.2990547 0.1119890 -0.3365734 2.2590853 -0.7877652
normaldata <- rnorm(2000)
hist(normaldata,xlab="Random Value",main="Distribution of Random Data from Normal Distribution",probability=TRUE)
# 400 people equally likely to be M or F
simdata <- tibble(gender = sample(c("Male","Female"),400,replace=T)) %>%
# Height is normally distributed with mean 5.5 and sd 1
#and men are .9 of a foot taller
mutate(heightft = .9*(gender == "Male")+rnorm(400,5.5))
simdata %>% group_by(gender) %>% summarize(height = mean(heightft))
## # A tibble: 2 x 2
## gender height
## <chr> <dbl>
## 1 Female 5.45
## 2 Male 6.42
heightdiff <- c()
for (i in 1:2500) {
simdata <- tibble(gender = sample(c("Male","Female"),400,replace=T)) %>%
mutate(heightft = .9*(gender == "Male")+rnorm(400,5.5))
heightdiff[i] <- mean(filter(simdata,gender=="Male")$heightft)-
mean(filter(simdata,gender=="Female")$heightft)
}
stargazer(as.data.frame(heightdiff),type='text')
##
## =============================================================
## Statistic N Mean St. Dev. Min Pctl(25) Pctl(75) Max
## -------------------------------------------------------------
## heightdiff 2,500 0.903 0.101 0.564 0.836 0.970 1.245
## -------------------------------------------------------------
# Is your company in tech? Let's say 30% of firms are
df <- tibble(tech = sample(c(0,1),500,replace=T,prob=c(.7,.3))) %>%
#Tech firms on average spend $3mil more defending IP lawsuits
mutate(IP.spend = 3*tech+runif(500,min=0,max=4)) %>%
#Tech firms also have higher profits. But IP lawsuits lower profits
mutate(log.profit = 2*tech - .3*IP.spend + rnorm(500,mean=2))
# Now let's check for how profit and IP.spend are correlated!
cor(df$log.profit,df$IP.spend)
## [1] 0.2088494
IPcorr <- c()
for (i in 1:1000) {
df <- tibble(tech = sample(c(0,1),500,replace=T,prob=c(.7,.3))) %>%
mutate(IP.spend = 3*tech+runif(500,min=0,max=4)) %>%
mutate(log.profit = 2*tech - .3*IP.spend + rnorm(500,mean=2))
IPcorr[i] <- cor(df$log.profit,df$IP.spend)
}
stargazer(as.data.frame(IPcorr),type='text')
##
## ============================================================
## Statistic N Mean St. Dev. Min Pctl(25) Pctl(75) Max
## ------------------------------------------------------------
## IPcorr 1,000 0.141 0.040 0.020 0.113 0.169 0.299
## ------------------------------------------------------------
prob
option in sample
to generate 300 coin flips weighted to be 55% heads. Calculate heads prop.dat
: 1500 obs of married
(0 or 1), educ
(unif 0 to 16, plus 2*married
), log.wage
(normal mean 5 plus .1*educ
plus 2*married
)cor
between educ
and log.wage
each time.plot
and then points
to plot the married==0
and married==1
data in different colors.co <- sample(c("Heads","Tails"),300,replace=T,prob=c(.55,.45))
mean(co=="Heads")
headsprop <- c()
for (i in 1:2000) {
co <- sample(c("Heads","Tails"),300,replace=T,prob=c(.55,.45))
headsprop[i] <- mean(co=="Heads")
}
mean(headsprop > .5)
educCor <- c()
for (j in 1:1000) {
dat <- tibble(married=sample(0:1,1500,replace=T)) %>%
mutate(educ = 2*married + runif(1500,0,16)) %>%
mutate(log.wage = .1*educ + 2*married + rnorm(1500,5))
educCor[j] <- cor(dat$educ,dat$log.wage)
}
mean(educCor)
#This is still wrong - we're ignoring what married does, and so OVERSTATING the correlation
plot(filter(dat,married==0)$educ,filter(dat,married==0)$log.wage,col='red',
ylim=range(c(min(dat$log.wage),max(dat$log.wage))),
xlim=range(c(min(dat$educ),max(dat$educ))),
xlab="Education",
ylab="Log Wages")
points(filter(dat,married==1)$educ,filter(dat,married==1)$log.wage,col='blue')
#Note the xlim and ylim options so we aren't cut off.