Exploring Higher Education Data:

Cost x Earnings

pdata <- ed_df %>%
  rename(
    cost = latest.cost.attendance.academic_year,
    earnings =latest.earnings.10_yrs_after_entry.median) %>%
  select(cost,earnings) %>%
  filter(!is.na(cost) & !is.na(earnings)) %>%
  mutate(bin = ntile(cost,10)) %>%
  arrange(bin) %>%
  group_by(bin) %>%
  mutate(min_cost = min(cost),
         max_cost = max(cost)) %>%
  mutate(bin_name = paste0(dollar(min_cost,scale = .001,suffix = "k",decimal.mark = ",")
                           ,"-"
                           ,dollar(max_cost,scale = .001,suffix = "k",decimal.mark = ",")))
# fix factor order
pdata <- pdata %>%
  mutate(bin_name = factor(bin_name,levels = unique(pdata$bin_name)))
  
hcboxplot(x = pdata$earnings, var = pdata$bin_name, 
          outliers = FALSE) %>% 
  hc_chart(type = "column") # to put box vertical
#model simple linear regression
lm1 <- summary(lm(earnings ~ cost,data = pdata))

knitr::kable(data.frame(lm1$r.squared),align="l")
lm1.r.squared
0.3368742
knitr::kable(
  data.frame(lm1$coefficients) %>%
    tibble::rownames_to_column("Variable") %>%
    mutate(Pr...t.. = format.pval(Pr...t..,eps=.001)) %>%
    rename("P Value" = Pr...t..))
Variable Estimate Std..Error t.value P Value
(Intercept) 2.485317e+04 344.8485807 72.06981 < 0.001
cost 4.766705e-01 0.0115599 41.23479 < 0.001

Size x Earnings

pdata <- ed_df %>%
  rename(
    size = latest.student.size,
    earnings =latest.earnings.10_yrs_after_entry.median) %>%
  select(size,earnings) %>%
  filter(!is.na(size) & !is.na(earnings)) %>%
  mutate(bin = ntile(size,10)) %>%
  arrange(bin) %>%
  group_by(bin) %>%
  mutate(min_size = min(size),
         max_size = max(size)) %>%
  mutate(bin_name = ifelse(max_size<1000,
                           paste0(min_size,"-",max_size),
                           paste0(number(min_size,scale = .001,suffix = "k")
                              ,"-"
                              ,number(max_size,scale = .001,suffix = "k"))))
# fix factor order
pdata <- pdata %>%
  mutate(bin_name = factor(bin_name,levels = unique(pdata$bin_name)))

hcboxplot(x = pdata$earnings, var = pdata$bin_name,
          outliers = FALSE,color="black") %>% 
  hc_chart(type = "column") # to put box vertical
#model simple linear regression
lm2 <- summary(lm(earnings ~ size,data = pdata))

knitr::kable(data.frame(lm2$r.squared),align="l")
lm2.r.squared
0.0700737
knitr::kable(
  data.frame(lm2$coefficients) %>%
    tibble::rownames_to_column("Variable") %>%
    mutate(Pr...t.. = format.pval(Pr...t..,eps=.001)) %>%
    rename("P Value" = Pr...t..))
Variable Estimate Std..Error t.value P Value
(Intercept) 3.241548e+04 198.1432135 163.59620 < 0.001
size 5.679296e-01 0.0293352 19.36003 < 0.001
# wanted to plot points, but too many for highcharter to handle in browser
# pdata %>% 
#   hchart("point",hcaes(x="size",y="earnings")) %>% 
#   hc_add_theme(thm)

UG Enrollment Size

This data represents undergraduate enrollment size by institution.

pal <- colorNumeric(
  palette = c("orange","red"),
  domain = ed_df$latest.student.size)

# discretify continuous variables
# qpal <- colorQuantile("Blues", ed_df$latest.student.enrollment.all, n = 7)

pdata <- ed_df %>%
  filter(!is.na(latest.student.size))
  
leaflet() %>% 
  addTiles() %>%
  addCircleMarkers(
    data = pdata,
    lat = ~location.lat, lng = ~location.lon,
    color = ~pal(latest.student.size),
    weight = 1,
    fillOpacity = .6,
    radius = ~(latest.student.size / 2500),
    stroke = FALSE,
    popup = ~paste0("<strong>",school.name,"</strong><br>",
                   "UG Enrollment: ",latest.student.size),
    labelOptions = labelOptions(
    style = list("font-weight" = "normal", padding = "3px 8px"),
    textsize = "15px",
    direction = "auto"))
# create side-by-side bar charts for each institution type
lapply(unique(ed_df$pred_award),function(pred_award_level){
  
  # set each  chart with different color bars
  color_select = color_temp[
    (ed_df %>% filter(pred_award==pred_award_level) %>% distinct(school.degrees_awarded.predominant))$school.degrees_awarded.predominant+1
  ]
  
  ed_df %>%
    group_by(pred_award,school.name) %>%
    # shorthand to rename(), filter(), select(), then arrange...
    summarise(Enrollment = median(latest.student.size,na.rm=TRUE)) %>%
    arrange(desc(Enrollment)) %>%
    slice(1:10) %>%
    filter(pred_award == pred_award_level) %>%
    hchart("bar",hcaes(x="school.name",y="Enrollment"),color=color_select) %>%
    hc_add_theme(thm) %>%
    hc_title(text = pred_award_level) %>%
    hc_xAxis(title = NA)
}) %>%
  hw_grid(rowheight = 500,
          ncol = ceiling(length(unique(ed_df$pred_award))/2))

Median Earnings 10 Year Post-Entry

This data should be interpreted with extreme caustion as is. Important differences exist between institutions such as the highest degree awarded (e.g., associates vs PHD), the types of programs offered (e.g., Medicine vs Liberal Arts), and cost of attendance to name a few. This is to say that this map is not a tool to choose where one should go to college to earn top dollar.

pal <- colorNumeric(
  palette = c("orange","red"),
  domain = ed_df$latest.earnings.10_yrs_after_entry.median)

# discretify continuous variables
# qpal <- colorQuantile("Blues", ed_df$latest.student.enrollment.all, n = 7)

pdata <- ed_df %>%
  filter(!is.na(latest.earnings.10_yrs_after_entry.median))
  
leaflet() %>% 
  addTiles() %>%
  addCircleMarkers(
    data = pdata,
    lat = ~location.lat, lng = ~location.lon,
    color = ~pal(latest.earnings.10_yrs_after_entry.median),
    weight = 1,
    fillOpacity = .6,
    radius = ~(latest.earnings.10_yrs_after_entry.median / 4500),
    stroke = FALSE,
    popup = ~paste0("<strong>",school.name,"</strong><br>",
                   "10 Year Post-Enroll Income ($): ",latest.earnings.10_yrs_after_entry.median),
    labelOptions = labelOptions(
    style = list("font-weight" = "normal", padding = "3px 8px"),
    textsize = "15px",
    direction = "auto"))
# create side-by-side bar charts for each institution type
lapply(unique(ed_df$pred_award),function(pred_award_level){
  
  # set each  chart with different color bars
  color_select = color_temp[
    (ed_df %>% filter(pred_award==pred_award_level) %>% distinct(school.degrees_awarded.predominant))$school.degrees_awarded.predominant+1
  ]
  
  ed_df %>%
    group_by(pred_award,school.name) %>%
    # shorthand to select, rename, and remove NAs
    # more naturally rename(), filter(), select(), then arrange...
    summarise(Median = median(latest.earnings.10_yrs_after_entry.median,na.rm=TRUE)) %>%
    arrange(desc(Median)) %>%
    slice(1:10) %>%
    filter(pred_award == pred_award_level) %>%
    hchart("bar",hcaes(x="school.name",y="Median"),color=color_select) %>%
    hc_add_theme(thm) %>%
    hc_title(text = pred_award_level) %>%
    hc_xAxis(title = NA)
}) %>%
  hw_grid(rowheight = 500,
          ncol = ceiling(length(unique(ed_df$pred_award))/2))