Cost x Earnings
pdata <- ed_df %>%
rename(
cost = latest.cost.attendance.academic_year,
earnings =latest.earnings.10_yrs_after_entry.median) %>%
select(cost,earnings) %>%
filter(!is.na(cost) & !is.na(earnings)) %>%
mutate(bin = ntile(cost,10)) %>%
arrange(bin) %>%
group_by(bin) %>%
mutate(min_cost = min(cost),
max_cost = max(cost)) %>%
mutate(bin_name = paste0(dollar(min_cost,scale = .001,suffix = "k",decimal.mark = ",")
,"-"
,dollar(max_cost,scale = .001,suffix = "k",decimal.mark = ",")))
# fix factor order
pdata <- pdata %>%
mutate(bin_name = factor(bin_name,levels = unique(pdata$bin_name)))
hcboxplot(x = pdata$earnings, var = pdata$bin_name,
outliers = FALSE) %>%
hc_chart(type = "column") # to put box vertical
#model simple linear regression
lm1 <- summary(lm(earnings ~ cost,data = pdata))
knitr::kable(data.frame(lm1$r.squared),align="l")
knitr::kable(
data.frame(lm1$coefficients) %>%
tibble::rownames_to_column("Variable") %>%
mutate(Pr...t.. = format.pval(Pr...t..,eps=.001)) %>%
rename("P Value" = Pr...t..))
| (Intercept) |
2.485317e+04 |
344.8485807 |
72.06981 |
< 0.001 |
| cost |
4.766705e-01 |
0.0115599 |
41.23479 |
< 0.001 |
Size x Earnings
pdata <- ed_df %>%
rename(
size = latest.student.size,
earnings =latest.earnings.10_yrs_after_entry.median) %>%
select(size,earnings) %>%
filter(!is.na(size) & !is.na(earnings)) %>%
mutate(bin = ntile(size,10)) %>%
arrange(bin) %>%
group_by(bin) %>%
mutate(min_size = min(size),
max_size = max(size)) %>%
mutate(bin_name = ifelse(max_size<1000,
paste0(min_size,"-",max_size),
paste0(number(min_size,scale = .001,suffix = "k")
,"-"
,number(max_size,scale = .001,suffix = "k"))))
# fix factor order
pdata <- pdata %>%
mutate(bin_name = factor(bin_name,levels = unique(pdata$bin_name)))
hcboxplot(x = pdata$earnings, var = pdata$bin_name,
outliers = FALSE,color="black") %>%
hc_chart(type = "column") # to put box vertical
#model simple linear regression
lm2 <- summary(lm(earnings ~ size,data = pdata))
knitr::kable(data.frame(lm2$r.squared),align="l")
knitr::kable(
data.frame(lm2$coefficients) %>%
tibble::rownames_to_column("Variable") %>%
mutate(Pr...t.. = format.pval(Pr...t..,eps=.001)) %>%
rename("P Value" = Pr...t..))
| (Intercept) |
3.241548e+04 |
198.1432135 |
163.59620 |
< 0.001 |
| size |
5.679296e-01 |
0.0293352 |
19.36003 |
< 0.001 |
# wanted to plot points, but too many for highcharter to handle in browser
# pdata %>%
# hchart("point",hcaes(x="size",y="earnings")) %>%
# hc_add_theme(thm)
UG Enrollment Size
This data represents undergraduate enrollment size by institution.
pal <- colorNumeric(
palette = c("orange","red"),
domain = ed_df$latest.student.size)
# discretify continuous variables
# qpal <- colorQuantile("Blues", ed_df$latest.student.enrollment.all, n = 7)
pdata <- ed_df %>%
filter(!is.na(latest.student.size))
leaflet() %>%
addTiles() %>%
addCircleMarkers(
data = pdata,
lat = ~location.lat, lng = ~location.lon,
color = ~pal(latest.student.size),
weight = 1,
fillOpacity = .6,
radius = ~(latest.student.size / 2500),
stroke = FALSE,
popup = ~paste0("<strong>",school.name,"</strong><br>",
"UG Enrollment: ",latest.student.size),
labelOptions = labelOptions(
style = list("font-weight" = "normal", padding = "3px 8px"),
textsize = "15px",
direction = "auto"))
# create side-by-side bar charts for each institution type
lapply(unique(ed_df$pred_award),function(pred_award_level){
# set each chart with different color bars
color_select = color_temp[
(ed_df %>% filter(pred_award==pred_award_level) %>% distinct(school.degrees_awarded.predominant))$school.degrees_awarded.predominant+1
]
ed_df %>%
group_by(pred_award,school.name) %>%
# shorthand to rename(), filter(), select(), then arrange...
summarise(Enrollment = median(latest.student.size,na.rm=TRUE)) %>%
arrange(desc(Enrollment)) %>%
slice(1:10) %>%
filter(pred_award == pred_award_level) %>%
hchart("bar",hcaes(x="school.name",y="Enrollment"),color=color_select) %>%
hc_add_theme(thm) %>%
hc_title(text = pred_award_level) %>%
hc_xAxis(title = NA)
}) %>%
hw_grid(rowheight = 500,
ncol = ceiling(length(unique(ed_df$pred_award))/2))
Median Earnings 10 Year Post-Entry
This data should be interpreted with extreme caustion as is. Important differences exist between institutions such as the highest degree awarded (e.g., associates vs PHD), the types of programs offered (e.g., Medicine vs Liberal Arts), and cost of attendance to name a few. This is to say that this map is not a tool to choose where one should go to college to earn top dollar.
pal <- colorNumeric(
palette = c("orange","red"),
domain = ed_df$latest.earnings.10_yrs_after_entry.median)
# discretify continuous variables
# qpal <- colorQuantile("Blues", ed_df$latest.student.enrollment.all, n = 7)
pdata <- ed_df %>%
filter(!is.na(latest.earnings.10_yrs_after_entry.median))
leaflet() %>%
addTiles() %>%
addCircleMarkers(
data = pdata,
lat = ~location.lat, lng = ~location.lon,
color = ~pal(latest.earnings.10_yrs_after_entry.median),
weight = 1,
fillOpacity = .6,
radius = ~(latest.earnings.10_yrs_after_entry.median / 4500),
stroke = FALSE,
popup = ~paste0("<strong>",school.name,"</strong><br>",
"10 Year Post-Enroll Income ($): ",latest.earnings.10_yrs_after_entry.median),
labelOptions = labelOptions(
style = list("font-weight" = "normal", padding = "3px 8px"),
textsize = "15px",
direction = "auto"))
# create side-by-side bar charts for each institution type
lapply(unique(ed_df$pred_award),function(pred_award_level){
# set each chart with different color bars
color_select = color_temp[
(ed_df %>% filter(pred_award==pred_award_level) %>% distinct(school.degrees_awarded.predominant))$school.degrees_awarded.predominant+1
]
ed_df %>%
group_by(pred_award,school.name) %>%
# shorthand to select, rename, and remove NAs
# more naturally rename(), filter(), select(), then arrange...
summarise(Median = median(latest.earnings.10_yrs_after_entry.median,na.rm=TRUE)) %>%
arrange(desc(Median)) %>%
slice(1:10) %>%
filter(pred_award == pred_award_level) %>%
hchart("bar",hcaes(x="school.name",y="Median"),color=color_select) %>%
hc_add_theme(thm) %>%
hc_title(text = pred_award_level) %>%
hc_xAxis(title = NA)
}) %>%
hw_grid(rowheight = 500,
ncol = ceiling(length(unique(ed_df$pred_award))/2))