`prop.table`), differences in means (`group_by() %>% summarize()`), correlation (`cor`), and graphically with scatterplots (`plot(xvar,yvar)`) and overlaid densities (`plot(density())` followed by `lines(density())`).
X explain Y” as “what would I expect Y to look like, given a certain value of X?”
addata <- read.csv('http://www.nickchk.com/ad_spend_and_gdp.csv')
plot(addata$AdSpending,addata$GDP,
xlab='US Ad Spend/Year (Mil.)',ylab='US GDP (Bil.)')
X, see what Y looks like.
Y for different values of X.
`summarize()`
`data(midwest)` and get the average poverty rate by county in each state
library(tidyverse)
# Load the county-level midwest data
data(midwest)
# Collapse to one row per state holding the mean county poverty rate
midwest %>%
  group_by(state) %>%
  summarize(percbelowpoverty = mean(percbelowpoverty))
## # A tibble: 5 x 2
## state percbelowpoverty
## <chr> <dbl>
## 1 IL 13.1
## 2 IN 10.3
## 3 MI 14.2
## 4 OH 13.0
## 5 WI 11.9
`group_by()` leading into `mutate()` instead of `summarize()`
# Keep every county row, but attach its state's mean poverty rate to it
midwest <- midwest %>%
  group_by(state) %>%
  mutate(avebystate = mean(percbelowpoverty))
# Preview the county value next to the state average
midwest %>%
  select(state, county, percbelowpoverty, avebystate) %>%
  head()
## # A tibble: 6 x 4
## # Groups: state [1]
## state county percbelowpoverty avebystate
## <chr> <chr> <dbl> <dbl>
## 1 IL ADAMS 13.2 13.1
## 2 IL ALEXANDER 32.2 13.1
## 3 IL BOND 12.1 13.1
## 4 IL BOONE 7.21 13.1
## 5 IL BROWN 13.5 13.1
## 6 IL BUREAU 10.4 13.1
`avebystate` now represents the part of poverty that is explained by state
midwest <- mutate(midwest,residual = percbelowpoverty - avebystate)
# Preview the county value, the state average, and what is left over
midwest %>%
  select(state, county, percbelowpoverty, avebystate, residual) %>%
  head()
## # A tibble: 6 x 5
## # Groups: state [1]
## state county percbelowpoverty avebystate residual
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 IL ADAMS 13.2 13.1 0.0750
## 2 IL ALEXANDER 32.2 13.1 19.2
## 3 IL BOND 12.1 13.1 -1.01
## 4 IL BOONE 7.21 13.1 -5.87
## 5 IL BROWN 13.5 13.1 0.444
## 6 IL BUREAU 10.4 13.1 -2.68
# Variance of poverty, and variance of what remains after removing state means
with(midwest, c(var(percbelowpoverty), var(residual)))
## [1] 26.52410 24.75809
# Ratio of residual variance to total variance
with(midwest, var(residual) / var(percbelowpoverty))
## [1] 0.9334186
`summarize(X = mean(X))` works great!
midwest <- midwest %>% group_by(as.factor(percollege)) %>%
mutate(avebycoll = mean(percbelowpoverty)) %>%
mutate(collresidual = percbelowpoverty - avebycoll) %>% ungroup()
# Preview the means and residuals obtained by grouping on each percollege value
midwest %>%
  select(state, county, percbelowpoverty, avebycoll, collresidual) %>%
  head()
## # A tibble: 6 x 5
## state county percbelowpoverty avebycoll collresidual
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 IL ADAMS 13.2 13.2 0
## 2 IL ALEXANDER 32.2 32.2 0
## 3 IL BOND 12.1 12.1 0
## 4 IL BOONE 7.21 7.21 0
## 5 IL BROWN 13.5 13.5 0
## 6 IL BUREAU 10.4 10.4 0
# One minus the ratio of residual variance to total variance
with(midwest, 1 - var(collresidual) / var(percbelowpoverty))
## [1] 1
X up into bins, and taking the mean of Y within those bins using `mutate()`.
`cut()` function with the `breaks` option, which splits the continuous variable into `breaks` bins of equal length
midwest <- midwest %>% mutate(collbins = cut(percollege,breaks=10))
# Preview each county's percollege value alongside the bin it was assigned to
midwest %>%
  select(county, percbelowpoverty, percollege, collbins) %>%
  head()
## # A tibble: 6 x 4
## county percbelowpoverty percollege collbins
## <chr> <dbl> <dbl> <fct>
## 1 CLARK 13.4 17.8 (15.5,19.6]
## 2 ST CLAIR 10.9 17.6 (15.5,19.6]
## 3 FOREST 21.8 13.6 (11.4,15.5]
## 4 DAVIESS 15.5 13.2 (11.4,15.5]
## 5 CHAMPAIGN 8.83 13.9 (11.4,15.5]
## 6 SHEBOYGAN 6.49 20.8 (19.6,23.6]
`mutate` to take means within these bins
midwest <- midwest %>% mutate(collbins = cut(percollege,10)) %>%
group_by(collbins) %>% mutate(avebycoll = mean(percbelowpoverty)) %>% ungroup()
# Preview each county's poverty rate next to the mean for its percollege bin
midwest %>%
  select(county, percbelowpoverty, percollege, avebycoll) %>%
  head()
## # A tibble: 6 x 4
## county percbelowpoverty percollege avebycoll
## <chr> <dbl> <dbl> <dbl>
## 1 CLARK 13.4 17.8 11.5
## 2 ST CLAIR 10.9 17.6 11.5
## 3 FOREST 21.8 13.6 13.9
## 4 DAVIESS 15.5 13.2 13.9
## 5 CHAMPAIGN 8.83 13.9 13.9
## 6 SHEBOYGAN 6.49 20.8 10.5
# Raw data in the default color, with the bin means overlaid in red
with(midwest, {
  plot(percollege, percbelowpoverty,
       xlab = "Percent College", ylab = "Percent below Poverty")
  points(percollege, avebycoll, col = "red")
})
# Variance left over after subtracting the bin means
with(midwest, var(percbelowpoverty - avebycoll))
## [1] 22.04112
# Total variance of the poverty rate
with(midwest, var(percbelowpoverty))
## [1] 26.5241
# One minus the ratio of the two variances above
with(midwest, 1 - var(percbelowpoverty - avebycoll) / var(percbelowpoverty))
## [1] 0.1690152
# Repeat the binned-mean calculation for several bin counts and print,
# for each, one minus the ratio of residual variance to total variance
for (brks in c(2, 10, 20, 50)) {
  midwest %>%
    group_by(cut(percollege, breaks = brks)) %>%
    mutate(avebycoll = mean(percbelowpoverty)) %>%
    ungroup() %>%
    summarize(r2 = 1 - var(percbelowpoverty - avebycoll) / var(percbelowpoverty)) %>%
    pull(r2) %>%
    print()
}
## [1] 0.005783466
## [1] 0.1690152
## [1] 0.2114173
## [1] 0.3168701
`BudgetFood` (rename: `BF`) data from `Ecdat` and examine it
`wfood` with `town` and then with `totexp` (`breaks=10`). For each:
`plot` the raw data and add red `points` for the explained values
`totexp`, also do a `plot` of residuals with a red horizontal line at 0
library(Ecdat)
# Load the BudgetFood data and look at its documentation and structure
data(BudgetFood)
help(BudgetFood)
str(BudgetFood)

# Work with the data under a shorter name
BF <- BudgetFood

# Explain wfood with town: attach each town's mean wfood to every row and
# compute the part of wfood the town mean does not account for
BF <- BF %>%
  group_by(town) %>%
  mutate(
    avebytown = mean(wfood),
    townresid = wfood - avebytown
  ) %>%
  # Drop the grouping so it does not silently carry over into later steps
  ungroup()

# One minus the ratio of residual variance to total variance
1 - var(BF$townresid) / var(BF$wfood)

# Raw data, with the town means overlaid in red
plot(BF$town, BF$wfood, xlab = 'Town', ylab = 'Food as Pct. of Expenditure')
points(BF$town, BF$avebytown, col = 'red')
# Explain wfood with totexp by cutting totexp into 10 equal-width bins and
# taking the mean of wfood within each bin
BF <- BF %>%
  # BF may still be grouped (e.g. by town) from an earlier step. cut() must see
  # the full range of totexp, otherwise the bins are built separately per group.
  ungroup() %>%
  mutate(expbins = cut(totexp, breaks = 10)) %>%
  group_by(expbins) %>%
  mutate(
    avebyexp = mean(wfood),
    expresid = wfood - avebyexp
  ) %>%
  ungroup()

# One minus the ratio of residual variance to total variance
1 - var(BF$expresid) / var(BF$wfood)

# Raw data, with the bin means overlaid in red
plot(BF$totexp, BF$wfood, xlab = 'Total Expenditure', ylab = 'Food as Pct. of Expenditure')
points(BF$totexp, BF$avebyexp, col = 'red')

# Residuals against totexp, with a red horizontal reference line at 0
plot(BF$totexp, BF$expresid, xlab = 'Total Expenditure', ylab = 'Residuals')
abline(h = 0, col = 'red')