# This R script creates a visualization of progression through mathematics courses for a cohort of students in Texas # There are two required data files; each contains aggregate characteristics for all year-to-year mathematics paths for a single cohort of students: # 1. The primary data file includes year-to-year transfers overall ("mathematics_paths.csv") # 2. The secondary data file includes year-to-year transfers separately for EDS and non-EDS students ("mathematics_paths_by_eds.csv") # This script cleans the raw data, applies formatting, and provides the code for plotting # This script includes code to create the following versions of the course pathways visualization: # Main Figure - as presented # Supplementary Figure 1 - path structure only, without shading by economic disadvantage # Supplementary Figure 2 - alternate representation of economic disadvantage (more detail) # Supplemenatry Figure 3 - including representation of "exit" pathways # Supplementary Figure 4 - panel plot by EDS ############################ # Section 0: Preliminaries # ############################ library(readr) library(dplyr) library(ggplot2) library(tidyr) ## Themes/Fonts windowsFonts(`Garamond` = windowsFont("Garamond")) theme_set(theme_bw()) ## NOTE: this script depends on the source data files (`mathematics_paths.csv` and `mathematics_paths_by_eds.csv`) being placed in the working directory ################################# # Section 1: Load Analysis File # ################################# # A) Load Raw Transitions File ############################## ## NOTES: These data are partially prepared- they include only paths with a sufficient number of cases for reporting, # but they include inconsistent labeling and some infrequent categories that will not be included in the # parsimonious visualization paths_df <- read_csv("mathematics_paths.csv") %>% mutate(dest_year = orig_year + 1) %>% select(n, orig_year, dest_year, orig_math_state, dest_math_state, mn_female, mn_eds_g8) # Here are the raw mathematics course options str(paths_df$orig_math_state) table(paths_df$orig_math_state) str(paths_df$dest_math_state) table(paths_df$dest_math_state) ## Investigate frequencies of missing values (entrances to or exits from the data) ### For origin states: paths_df %>% group_by(orig_year, orig_math_state) %>% summarize(N = sum(n)) %>% spread(orig_year, N) ## NOTE: entrances are somewhat prevalent in middle school but by construction do not exist in high school. ## This is because cohort is defined by 8th grade status ## A small number of students transition from No Mathematics or Independent Study to a course in 12th grade... paths_df %>% filter(orig_year == 2014) %>% filter(orig_math_state %in% c("Ind. Study", "No Mathematics")) %>% group_by(orig_math_state, dest_math_state) %>% summarize(N = sum(n)) %>% spread(dest_math_state, N) ## NOTE: so it is only Ind. Study -> Pre-Calc, and few students ### For destination states: paths_df %>% group_by(orig_year, dest_math_state) %>% summarize(N = sum(n)) %>% spread(orig_year, N) ## NOTES: "Ind. Study" is a common 12th grade destination, as is "Other Mathematics" ## These are similar as destinations from Alg II and Pre-Calc. paths_df %>% filter(orig_year %in% c(2013, 2014)) %>% filter(dest_math_state %in% c("Ind. Study", "Other Mathematics")) %>% group_by(orig_math_state, dest_math_state) %>% summarize(N = sum(n)) %>% spread(dest_math_state, N) # B) Data Cleaning Decisions ############################ # ISSUE: Several small categories that will want to treat the same below # DECISION: Move Ind. Study to "Other Mathematics" paths_df$orig_math_state <- ifelse(paths_df$orig_math_state == "Ind. Study", "Other Mathematics", paths_df$orig_math_state) paths_df$dest_math_state <- ifelse(paths_df$dest_math_state == "Ind. Study", "Other Mathematics", paths_df$dest_math_state) # ISSUE: Data includes 'entrance' rows with missing origin information paths_df %>% filter(is.na(orig_math_state)) %>% group_by(orig_year, dest_math_state) %>% summarize(N = sum(n)) %>% spread(orig_year, N) # DECISION: drop all entrances paths_df <- paths_df %>% filter(orig_math_state != "") # ISSUE: grade 5 to 6 transition is not in data because all students are in grade level # DECISION: for vizualization purposes, create implied grade 5 to grade 6 path g6_df <- paths_df %>% filter(orig_year == 2009) %>% summarise(n = sum(n), mn_female = mean(mn_female, wgt = n), mn_eds_g8 = mean(mn_eds_g8, wgt = n) ) %>% mutate(orig_year = 2008, dest_year = 2009, orig_math_state = "Grade", dest_math_state = "Grade") paths_df <- bind_rows(g6_df, paths_df) # ISSUE: the grade/basic-8th -> grade/basic-9th is missing, presumably due to differences in records between # grades and 8 and 9 that lead these students to appear as new to the data # DECISION: create grade/basic-8th to grade/basic-9th based on grade/basic-9th to alg i-10th new_row_df <- paths_df %>% filter(orig_year == 2012, dest_year == 2013, orig_math_state == "Grade", dest_math_state == "Alg I") %>% mutate(orig_year = 2011, dest_year = 2012, orig_math_state = "Grade", dest_math_state = "Grade") new_row_df paths_df <- bind_rows(paths_df, new_row_df) # ISSUE: small number of no math to no math transitions exist, which will not be represented in figure # DECISION: drop the no math to no math transitions paths_df <- paths_df %>% filter(!(orig_math_state %in% c("Drop-Out","No Mathematics") & dest_math_state %in% c("Drop-Out","No Mathematics"))) # C) Create Math States Values and Labels ######################################### unique(paths_df$orig_math_state) unique(paths_df$dest_math_state) length(unique(c(paths_df$orig_math_state, paths_df$dest_math_state))) # Map mathematics status options (`state`, which includings no mathematics) to numerical values ## Existing state labels state_list <- c("Drop-Out", "No Mathematics", "Grade", "Alg I", "Math Models", "Geom", "Alg II", "Pre-Calc", "AP Stats", "AP Calc AB", "AP Calc BC") ## Desired state labels state_labels <- c("Drop Out", "No Mathematics", "Grade Level/Basic", "Algebra I", "Models/Application", "Geometry", "Algebra II", "Pre-calculus", "AP Statistics", "Calculus AB", "Calculus BC") ## Desired numerical levels state_levels <- c(-2, -1, 0,1,1.5,2,3,4,4.5,5,6) ## Now apply desired values to origin and destination variables for (i in 1:length(state_levels)){ paths_df$orig_math_state <- ifelse(paths_df$orig_math_state == state_list[i], state_levels[i], paths_df$orig_math_state) paths_df$dest_math_state <- ifelse(paths_df$dest_math_state == state_list[i], state_levels[i], paths_df$dest_math_state) } paths_df$orig_math_state <- as.numeric(paths_df$orig_math_state) paths_df$dest_math_state <- as.numeric(paths_df$dest_math_state) ## Check results table(paths_df$orig_math_state) table(paths_df$dest_math_state) ###################### # Section 2: Plot(s) # ###################### ## Parameters to set output file size h <- 6 w <- 8 ## FIGURE (main): Plot of paths with SES information ## NOTE: colors for gradient based on colorbrewer2.org values ## for single color, use `palette` option: "Greens" ## for two color, use `palette` option: "RdBu" ## to use full range of colors, use `palette` option: "Spectral" (preferred except not color-blind safe)) fig_gg <- paths_df %>% filter(dest_math_state >= 0) %>% arrange(orig_year, n) %>% ggplot(aes(x=orig_year, y=orig_math_state)) + geom_segment(aes(xend = dest_year, yend = dest_math_state, size = n/1000, color = mn_eds_g8), alpha = .8, lineend = "round") + scale_size_area("Students\n(1000s)", max_size = 10) + scale_y_continuous("", minor_breaks = NULL, breaks = state_levels, labels = state_labels, position = "right") + scale_x_continuous("Grade", minor_breaks = NULL, breaks = 2009:2015, labels = 6:12) + scale_color_distiller("Economically\nDisadvantaged", labels = scales::percent, palette = "RdBu", direction = -1) + theme(legend.justification = c(0,1), legend.position = c(.03,.97), legend.box = "horizontal", legend.background = element_rect(linetype = "solid", size=.5,colour = "black"), text = element_text(size=16, family = "serif"), axis.text.y = element_text(color = c("black","black","gray50","black","black","black","gray50","black","black")), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank() ) + coord_fixed(xlim = c(2008.75,2015)) ggsave("fig_main.png", fig_gg, width = w, height = h) # SUPPLEMENTAL FIGURE 1: Basic plot of overall paths, omitting mathematics exits sf1_gg <- paths_df %>% filter(dest_math_state >= 0) %>% arrange(orig_year, n) %>% ggplot(aes(x=orig_year, y=orig_math_state)) + geom_segment(aes(xend = dest_year, yend = dest_math_state, size = n/1000), alpha = .8, lineend = "round") + scale_size_area("Students\n(1000s)", max_size = 10) + scale_y_continuous("", minor_breaks = NULL, breaks = state_levels, labels = state_labels, position = "right") + scale_x_continuous("Grade", minor_breaks = NULL, breaks = 2009:2015, labels = 6:12) + theme(legend.justification = c(0,1), legend.position = c(.03,.97), legend.box = "horizontal", legend.background = element_rect(linetype = "solid", size=.5,colour = "black"), text = element_text(size=16, family = "serif"), axis.text.y = element_text(color = c("black","black","gray50","black","black","black","gray50","black","black")), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank() ) + coord_fixed(xlim = c(2008.75,2015)) ggsave("sf1_baseline.png", sf1_gg, width = w, height = h) # SUPPLEMENTAL FIGURE 2: Main plot with greater EDS detail sf2_gg <- paths_df %>% filter(dest_math_state >= 0) %>% arrange(orig_year, n) %>% ggplot(aes(x=orig_year, y=orig_math_state)) + geom_segment(aes(xend = dest_year, yend = dest_math_state, size = n/1000, color = mn_eds_g8), alpha = .8, lineend = "round") + scale_size_area("Students\n(1000s)", max_size = 10) + scale_y_continuous("", minor_breaks = NULL, breaks = state_levels, labels = state_labels, position = "right") + scale_x_continuous("Grade", minor_breaks = NULL, breaks = 2009:2015, labels = 6:12) + scale_color_distiller("Economically\nDisadvantaged", labels = scales::percent, palette = "Spectral", direction = -1) + theme(legend.justification = c(0,1), legend.position = c(.03,.97), legend.box = "horizontal", legend.background = element_rect(linetype = "solid", size=.5,colour = "black"), text = element_text(size=16, family = "serif"), axis.text.y = element_text(color = c("black","black","gray50","black","black","black","gray50","black","black")), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank() ) + coord_fixed(xlim = c(2008.75,2015)) ggsave("sf2_spectral.png", sf2_gg, width = w, height = h) # SUPPLEMENTAL FIGURE 3: Including representation of exiting mathematics (no math enrollment or drop-out) ## function to modify dest_ values for exits (provide length in units and angle in radians) dest_exits <- function(df=paths_df, exit_length = .25, exit_angle = -75 * pi / 180) { nonexits <- filter(df, dest_math_state >= 0) exits <- df %>% filter(dest_math_state < 0) %>% mutate(dest_year = orig_year + exit_length * cos(exit_angle), dest_math_state = orig_math_state + exit_length * sin(exit_angle)) bind_rows(nonexits, exits) } # TESTING: # filter(paths_df, dest_math_state == -10) %>% dest_exits() ## FIGURE VERSION 3 (with exits): baseline figure with exits included sf3_gg <- paths_df %>% dest_exits() %>% arrange(orig_year, n) %>% ggplot(aes(x=orig_year, y=orig_math_state)) + geom_segment(aes(xend = dest_year, yend = dest_math_state, size = n/1000, color = mn_eds_g8), alpha = .8, lineend = "round") + scale_color_distiller("Economically\nDisadvantaged", labels = scales::percent, palette = "RdBu", direction = -1) + geom_point(data = filter(paths_df, dest_math_state < 0) %>% dest_exits(), aes(x = dest_year, y = dest_math_state), shape = 25, fill = "white", color = "black", size = 2) + scale_size_area("Students\n(1000s)", max_size = 10) + scale_y_continuous("", minor_breaks = NULL, breaks = state_levels, labels = state_labels, position = "right") + scale_x_continuous("Grade", minor_breaks = NULL, breaks = 2009:2015, labels = 6:12) + theme(legend.justification = c(0,1), legend.position = c(.03,.97), legend.box = "horizontal", legend.background = element_rect(linetype = "solid", size=.5,colour = "black"), text = element_text(size=16, family = "serif"), axis.text.y = element_text(color = c("black","black","gray50","black","black","black","gray50","black","black")), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank() ) + coord_fixed(xlim = c(2008.75,2015)) ggsave("sf3_exits.png", sf3_gg, width = w, height = h) ### SUPPLEMENT FIGURE 4: Seaparate EDS panels (kludgy: would not have set things up this way if planning to make panels in the first place) ## Load Data File (keeping only paths with at least 500, half of overall) by_eds_df <- read_csv("mathematics_paths_by_eds.csv") %>% mutate(dest_year = orig_year + 1) %>% select(eds_g8, n, orig_year, dest_year, orig_math_state, dest_math_state, mn_female) %>% filter(n > 200) ## Apply Same Cleaning Decisions as above #### DECISION: Move Ind. Study to "Other Mathematics" by_eds_df$orig_math_state <- ifelse(by_eds_df$orig_math_state == "Ind. Study", "Other Mathematics", by_eds_df$orig_math_state) by_eds_df$dest_math_state <- ifelse(by_eds_df$dest_math_state == "Ind. Study", "Other Mathematics", by_eds_df$dest_math_state) ### DECISION: drop all entrances by_eds_df <- by_eds_df %>% filter(orig_math_state != "") ### DECISION: for vizualization purposes, create implied grade 5 to grade 6 path by_eds_g6_df <- by_eds_df %>% filter(orig_year == 2009) %>% group_by(eds_g8) %>% summarise(n = sum(n), mn_female = mean(mn_female, wgt = n) ) %>% mutate(orig_year = 2008, dest_year = 2009, orig_math_state = "Grade", dest_math_state = "Grade") by_eds_df <- bind_rows(by_eds_g6_df, by_eds_df) ### DECISION: create grade/basic-8th to grade/basic-9th based on grade/basic-9th to alg i-10th by_eds_new_row_df <- by_eds_df %>% filter(orig_year == 2012, dest_year == 2013, orig_math_state == "Grade", dest_math_state == "Alg I") %>% mutate(orig_year = 2011, dest_year = 2012, orig_math_state = "Grade", dest_math_state = "Grade") by_eds_df <- bind_rows(by_eds_df, by_eds_new_row_df) ### DECISION: drop the no math to no math transitions by_eds_df <- by_eds_df %>% filter(!(orig_math_state %in% c("Drop-Out","No Mathematics") | dest_math_state %in% c("Drop-Out","No Mathematics"))) ### Create Math States Values and Labels, as above for (i in 1:length(state_levels)){ by_eds_df$orig_math_state <- ifelse(by_eds_df$orig_math_state == state_list[i], state_levels[i], by_eds_df$orig_math_state) by_eds_df$dest_math_state <- ifelse(by_eds_df$dest_math_state == state_list[i], state_levels[i], by_eds_df$dest_math_state) } by_eds_df$orig_math_state <- as.numeric(by_eds_df$orig_math_state) by_eds_df$dest_math_state <- as.numeric(by_eds_df$dest_math_state) ### Create factor of eds_g8 by_eds_df <- by_eds_df %>% mutate(eds_g8 = factor(eds_g8, levels = c(0,1), labels = c("Not Economically Disadvantaged","Economically Disadvantaged"))) ## Finally, now ready to plot sf4_gg <- by_eds_df %>% filter(dest_math_state >= 0) %>% arrange(orig_year, n) %>% ggplot(aes(x=orig_year, y=orig_math_state)) + geom_segment(aes(xend = dest_year, yend = dest_math_state, size = n/1000), alpha = .8, lineend = "round") + scale_color_gradient2(midpoint = .5, mid = "gray80") + scale_size_area("Students\n(1000s)", max_size = 10) + scale_y_continuous("", minor_breaks = NULL, breaks = state_levels, labels = state_labels, position = "right") + scale_x_continuous("Grade", minor_breaks = NULL, breaks = 2009:2015, labels = 6:12) + theme(legend.justification = c(0,1), legend.position = c(.03,.97), legend.box = "horizontal", legend.background = element_rect(linetype = "solid", size=.5,colour = "black"), text = element_text(size=16, family = "serif"), axis.text.y = element_text(color = c("black","black","gray50","black","black","black","gray50","black","black")), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank() ) + facet_grid(~eds_g8) + coord_fixed(xlim = c(2008.75,2015)) ggsave("sf4_panels.png", sf4_gg, width = 10, height = 5) ## BONUS FIGURE: Plot of paths by GENDER sf5_gg <- paths_df %>% filter(dest_math_state >= 0) %>% arrange(orig_year, n) %>% ggplot(aes(x=orig_year, y=orig_math_state)) + geom_segment(aes(xend = dest_year, yend = dest_math_state, size = n/1000), alpha = .8, lineend = "round") + scale_size_area("Students\n(1000s)", max_size = 10) + scale_y_continuous("", minor_breaks = NULL, breaks = state_levels, labels = state_labels, position = "right") + scale_x_continuous("Grade", minor_breaks = NULL, breaks = 2009:2015, labels = 6:12) + scale_color_distiller("Female", labels = scales::percent, palette = "RdBu", limits = c(.25,.65)) + theme(legend.justification = c(0,1), legend.position = c(.03,.97), legend.box = "horizontal", legend.background = element_rect(linetype = "solid", size=.5,colour = "black"), text = element_text(size=16, family = "serif"), axis.text.y = element_text(color = c("black","black","gray50","black","black","black","gray50","black","black")), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank() ) + coord_fixed(xlim = c(2008.75,2015)) ggsave("sf5_gender.png", sf5_gg, width = 7, height = 6) ## END