##READING FILES
chem1_f18 =     read.csv("~/Teaching/Grades_and_SRT/Fall2018/chem1331_f18.csv",header = TRUE)
chem1_f19 =     read.csv("~/Teaching/Grades_and_SRT/Fall2019/chem1331_f19_grades_canvas.csv",header = TRUE)
chem1_f20 =     read.csv("~/Teaching/Grades_and_SRT/Fall2020/chem1331_f20_grades_canvas.csv",header = TRUE)
chem1_f21 =     read.csv("~/Teaching/Grades_and_SRT/Fall2021/chem1331_f21_grades_canvas.csv",header = TRUE)
chem1_f22 =     read.csv("~/Teaching/Grades_and_SRT/Fall2022/chem1331_f22_grades_canvas.csv",header = TRUE)
chem1_f23 =     read.csv("~/Teaching/Grades_and_SRT/Fall2023/chem1331_f23_grades_canvas.csv",header = TRUE)
chem1_f24 =     read.csv("~/Teaching/Grades_and_SRT/Fall2024/chem1331_f24_grades_canvas.csv",header = TRUE)

chem2_s19 =     read.csv("~/Teaching/Grades_and_SRT/Spring2019/chem1333_s19.csv", header = TRUE)
chem2_s20 =     read.csv("~/Teaching/Grades_and_SRT/Spring2020/chem1333_s20_grades_canvas.csv", header = TRUE)
chem2_s21 =     read.csv("~/Teaching/Grades_and_SRT/Spring2021/chem1333_s21_grades_canvas.csv", header = TRUE)
chem2_s22 =     read.csv("~/Teaching/Grades_and_SRT/Spring2022/chem1333_s22_grades_canvas.csv", header = TRUE)
chem2_s23 =     read.csv("~/Teaching/Grades_and_SRT/Spring2023/chem1333_s23_grades_canvas.csv", header = TRUE)
chem2_s24 =     read.csv("~/Teaching/Grades_and_SRT/Spring2024/chem1333_s24_grades_canvas.csv", header = TRUE)

chem3_f19 =     read.csv("~/Teaching/Grades_and_SRT/Fall2019/chem2231_f19_grades_canvas.csv",header = TRUE)
chem3_f20 =     read.csv("~/Teaching/Grades_and_SRT/Fall2020/chem2131_f20_grades_canvas.csv",header = TRUE)
chem3_f21 =     read.csv("~/Teaching/Grades_and_SRT/Fall2021/chem2131_f21_grades_canvas.csv",header = TRUE)
chem3_f22 =     read.csv("~/Teaching/Grades_and_SRT/Fall2022/chem2131_f22_grades_canvas.csv",header = TRUE)
chem3_f23 =     read.csv("~/Teaching/Grades_and_SRT/Fall2023/chem2131_f23_grades_canvas.csv",header = TRUE)
chem3_f24 =     read.csv("~/Teaching/Grades_and_SRT/Fall2024/chem2131_f24_grades_canvas.csv",header = TRUE)

chem4_s20 =     read.csv("~/Teaching/Grades_and_SRT/Spring2020/chem2333_s20_grades_canvas.csv", header = TRUE)
chem4_s21 =     read.csv("~/Teaching/Grades_and_SRT/Spring2021/chem2335_s21_grades_canvas.csv", header = TRUE)
chem4_s22 =     read.csv("~/Teaching/Grades_and_SRT/Spring2022/chem2335_s22_grades_canvas.csv", header = TRUE)
chem4_s23 =     read.csv("~/Teaching/Grades_and_SRT/Spring2023/chem2335_s23_grades_canvas.csv", header = TRUE)
chem4_s24 =     read.csv("~/Teaching/Grades_and_SRT/Spring2024/chem2335_s24_grades_canvas.csv", header = TRUE)
chem4_s25 =     read.csv("~/Teaching/Grades_and_SRT/Spring2025/chem2335_s25_grades_tempo_canvas.csv", header = TRUE)

#we need to merge the summer courses, ngm-f23 and chem1330 into the corresponding cohorts. Just add the id columns and the last 4 columns for grades
chem0_f23 =     read.csv("~/Teaching/Grades_and_SRT/Fall2023/chem1330_f23_grades_canvas.csv",header = TRUE)
chem0_f24 =     read.csv("~/Teaching/Grades_and_SRT/Fall2024/chem1330_f24_grades_canvas.csv",header = TRUE)
chem1_ngm_f23 =  read.csv("~/Teaching/Grades_and_SRT/Fall2023/chem1331_f23_ngm_grades_canvas.csv",header = TRUE)

  
dfs = list(
  chem1_f18 = chem1_f18,
  chem2_s19 = chem2_s19,
  chem3_f19 = chem3_f19,
  chem4_s20 = chem4_s20,
  
  chem1_f19 = chem1_f19,
  chem2_s20 = chem2_s20,
  chem3_f20 = chem3_f20,
  chem4_s21 = chem4_s21,
  
  chem1_f20 = chem1_f20,
  chem2_s21 = chem2_s21,
  chem3_f21 = chem3_f21,
  chem4_s22 = chem4_s22,
  
  chem1_f21 = chem1_f21,
  chem2_s22 = chem2_s22,
  chem3_f22 = chem3_f22,
  chem4_s23 = chem4_s23,
  
  chem1_f22 = chem1_f22,
  chem2_s23 = chem2_s23,
  chem3_f23 = chem3_f23,
  chem4_s24 = chem4_s24,
  
  chem1_f23 = chem1_f23,
  chem2_s24 = chem2_s24,
  chem3_f24 = chem3_f24,
  chem4_s25 = chem4_s25
               
)
getFinalScore = function(dfs){
  #For some reason, some Final.Score is lower than Current.Score. Using the highest
  for (i in seq_along(dfs)) {
    df <- dfs[[i]]
    didItChange=FALSE
    times=0
    for (j in 1:nrow(df)) {
      if (df[j, "Current.Score"] > df[j, "Final.Score"]) {
        df[j, "Final.Score"] <- df[j, "Current.Score"] 
        didItChange=TRUE
        times=times+1
      }
    }
    #if (didItChange){ print(paste("The dataframe",names(dfs)[i]," Current and Final score are not the same in",times,"students"))}
    dfs[[i]] <- df
  }
  return(dfs)
}
dfs = getFinalScore(dfs)

1 All students

We started teaching the new curriculum in Fall 2018. The first cohort (f18) finished in Spring 2020

Let’s consider how students flew through the curriculum and how they did. Some students postpone taking Chem3 or Chem4, we are taking their grade into account even if they didn’t take the course during their sophomore year.

###PREPARING FILES

combined_df <- dfs %>%
  map(~ select(.x, SIS.Login.ID, Final.Score)) %>%
  reduce(full_join, by = "SIS.Login.ID")
whos_that_student <- dfs %>%
  map(~ select(.x,Student, SIS.Login.ID, Final.Score)) %>%
  reduce(full_join, by = "SIS.Login.ID")


# Rename the Final.Score columns with the names of the original dataframes
names(combined_df)[-1] <- names(dfs)
names(whos_that_student)[-1] <- names(dfs)

# Rename the SIS.Login.ID column to student
names(combined_df)[1] <- "student"

f18cohort <- combined_df %>%
  filter(!is.na(chem1_f18))
f19cohort <- combined_df %>%
  filter(!is.na(chem1_f19))
f20cohort <- combined_df %>%
  filter(!is.na(chem1_f20))
f21cohort <- combined_df %>%
  filter(!is.na(chem1_f21))
f22cohort <- combined_df %>%
  filter(!is.na(chem1_f22))
f23cohort <- combined_df %>%
  filter(!is.na(chem1_f23))

merge_columns <- function(df, prefix) {
  columns <- grep(paste0("^", prefix), names(df), value = TRUE)
  merged_column <- ifelse(rowSums(!is.na(df[columns])) == 0, NA, 
                          do.call(pmax, c(df[columns], na.rm = TRUE)))
  return(merged_column)
}

returnMergedColumns = function(df){
  # Apply the function to merge chem1, chem2, chem3, and chem4 columns
  df <- df %>%
    mutate(
      chem1 = merge_columns(., "chem1"),
      chem2 = merge_columns(., "chem2"),
      chem3 = merge_columns(., "chem3"),
      chem4 = merge_columns(., "chem4")
    ) %>%
    select(student, chem1, chem2, chem3, chem4)
  return(df)
}
prepareAlluvial = function(df){
  new_df <- df %>%
  mutate_at(vars(chem1:chem4), ~case_when(
    is.na(.) ~ "OUT",
    . > 90 ~ "A",
    . >= 80 & . <= 90 ~ "B",
    . >= 70 & . < 80 ~ "C",
    . >= 60 & . < 70 ~ "D",
    TRUE ~ "F"
  ))
long_df <- pivot_longer(new_df, cols = starts_with("chem"), names_to = "course", values_to = "letterGrade")

# Rename columns
colnames(long_df) <- c("student", "course", "letterGrade")
return(long_df)
  
}

f18cohort = returnMergedColumns(f18cohort)
f18letter = prepareAlluvial(f18cohort)

f19cohort = returnMergedColumns(f19cohort)
f19letter = prepareAlluvial(f19cohort)

f20cohort = returnMergedColumns(f20cohort)
f20letter = prepareAlluvial(f20cohort)

f21cohort = returnMergedColumns(f21cohort)
f21letter = prepareAlluvial(f21cohort)

f22cohort = returnMergedColumns(f22cohort)
f22letter = prepareAlluvial(f22cohort)

f23cohort = returnMergedColumns(f23cohort)
f23letter = prepareAlluvial(f23cohort)

#f18cohort %>% select(-1) %>% arrange(chem2) %>% write.csv("f18cohort.csv", row.names = FALSE)
#f19cohort %>% select(-1) %>% arrange(chem2) %>% write.csv("f19cohort.csv", row.names = FALSE)
#f20cohort %>% select(-1) %>% arrange(chem2) %>% write.csv("f20cohort.csv", row.names = FALSE)
#f21cohort %>% select(-1) %>% arrange(chem2) %>% write.csv("f21cohort.csv", row.names = FALSE)
#write.csv(f22cohort[order(f22cohort$chem2_s23),],"f22cohort.csv")
## REPRESENT
makeAlluvial_Cohort = function(df,thisTitle){
  ggplot(df,
           aes(x = course, stratum = letterGrade, alluvium = student, fill = letterGrade)) +
      scale_x_discrete(expand = c(.1, .1)) +
      geom_flow() +
      geom_stratum(alpha = .5) +
      theme_minimal() +
      geom_text(stat = "stratum",
                aes(label = percent(after_stat(prop), accuracy = .1)))+
      labs(title = thisTitle, x = "", y = "Number of students")
}
makeAlluvial_Cohort2 = function(df,thisTitle){
  ggplot(df,
           aes(x = course, stratum = letterGrade, alluvium = student, fill = letterGrade)) +
      scale_x_discrete(expand = c(.1, .1)) +
      geom_flow() +
      geom_stratum(alpha = .5) +
      theme_minimal() +
      geom_text(stat = "stratum",
                aes(label = after_stat(count) ))+
      labs(title = thisTitle, x = "", y = "Number of students")
}
create_summary_table <- function(df) {
    summary_table <- df %>%
    group_by(course, letterGrade, .drop = TRUE) %>%
    summarize(count = n(), .groups = 'drop') %>%
    ungroup() %>%
    complete(course, letterGrade, fill = list(count = 0))

  # Pivot the data to have courses as columns
  summary_table <- summary_table %>%
    pivot_wider(names_from = course, values_from = count)
  
  df_sum <- summary_table %>%
  bind_rows(
    data.frame(
      letterGrade = "Total",
      chem1 = sum(.$chem1),
      chem2 = sum(.$chem2),
      chem3 = sum(.$chem3),
      chem4 = sum(.$chem4)
    )
  )

# Calculate percentages and add "%" character
df_percent <- df_sum %>%
  mutate(across(starts_with("chem"), ~ scales::percent(./sum(.)*2 , accuracy = 0.1), .names = "{col}%"))

  return(df_percent)
}

allCohorts = rbind(f18letter,f19letter)
allCohorts = rbind(allCohorts,f20letter)
allCohorts = rbind(allCohorts,f21letter)
allCohorts = rbind(allCohorts,f22letter)
allCohorts = rbind(allCohorts,f23letter)
allCohorts = allCohorts[!duplicated(allCohorts, fromLast = TRUE),]
makeAlluvial_Cohort(allCohorts,"All cohorts")

summary_table <- create_summary_table(allCohorts)
kable(summary_table,caption = "All cohort numbers")
All cohort numbers
letterGrade chem1 chem2 chem3 chem4 chem1% chem2% chem3% chem4%
A 387 309 158 139 34.7% 27.7% 14.2% 12.5%
B 465 326 184 200 41.7% 29.2% 16.5% 17.9%
C 187 171 121 70 16.8% 15.3% 10.8% 6.3%
D 46 27 30 12 4.1% 2.4% 2.7% 1.1%
F 31 6 6 5 2.8% 0.5% 0.5% 0.4%
OUT 0 277 617 690 0.0% 24.8% 55.3% 61.8%
Total 1116 1116 1116 1116 100.0% 100.0% 100.0% 100.0%

1.1 Retention across the curriculum and across the years

calculate_percentage <- function(df) {
  df %>%
    mutate(not_out = ifelse(letterGrade != "OUT", 1, 0)) %>%
    group_by(course) %>%
    summarise(
      raw_count  = sum(not_out),
      percentage = mean(not_out) * 100
    )
}

# Apply the function to each dataframe
f18_percentage <- calculate_percentage(f18letter)
f19_percentage <- calculate_percentage(f19letter)
f20_percentage <- calculate_percentage(f20letter)
f21_percentage <- calculate_percentage(f21letter)
f22_percentage <- calculate_percentage(f22letter)
f23_percentage <- calculate_percentage(f23letter)
allPercentage <- bind_rows(f18_percentage,
                         f19_percentage,
                         f20_percentage,
                         f21_percentage,
                         f22_percentage,
                         f23_percentage,
                          .id = "df_id"
                         )%>%
  mutate(df_id = case_when(
    df_id == "1" ~ "Fall18",
    df_id == "2" ~ "Fall19",
    df_id == "3" ~ "Fall20",
    df_id == "4" ~ "Fall21",
    df_id == "5" ~ "Fall22",
    df_id == "6" ~ "Fall23",
    TRUE ~ df_id  # Keep other values unchanged
  ))

ggplot(allPercentage, aes(x = course, y = percentage, group = df_id, color = df_id)) +
  geom_line() +
  geom_point() +
  labs(x = "Course", y = "% students",
       title = "Retention across the Chemistry Curriculum") +
  scale_color_discrete(name = "Cohort first semester") +
  theme_minimal()

allPercentage_wide <- allPercentage %>%
  mutate(raw_pct = paste0(raw_count, " (", round(percentage, 1), "%)")) %>%
  select(course, df_id, raw_pct) %>%
  pivot_wider(
    id_cols = course,
    names_from = df_id,
    values_from = raw_pct
  )


# Print the resulting table
kable(allPercentage_wide, format = "markdown",digits = 1, caption = "Raw and percent numbers of students throughout the curriculum since 2018")
Raw and percent numbers of students throughout the curriculum since 2018
course Fall18 Fall19 Fall20 Fall21 Fall22 Fall23
chem1 181 (100%) 182 (100%) 223 (100%) 195 (100%) 208 (100%) 151 (100%)
chem2 129 (71.3%) 134 (73.6%) 162 (72.6%) 146 (74.9%) 155 (74.5%) 123 (81.5%)
chem3 78 (43.1%) 91 (50%) 93 (41.7%) 78 (40%) 95 (45.7%) 70 (46.4%)
chem4 66 (36.5%) 80 (44%) 73 (32.7%) 64 (32.8%) 81 (38.9%) 64 (42.4%)

1.2 Probability of success and retention based on previous grades

all_letters <- bind_rows(
  f18letter, f19letter, f20letter,
  f21letter, f22letter, f23letter,
  .id = "semester"
)
# Step 2: Reshape wide so each course is its own column
wide_all_letters <- all_letters %>%
  pivot_wider(names_from = course, values_from = letterGrade)

make_transition_table <- function(data, from_course, to_course) {
  
  transition <- data %>%
    count(.data[[from_course]], .data[[to_course]]) %>%
    group_by(.data[[from_course]]) %>%
    mutate(percentage = 100 * n / sum(n)) %>%
    ungroup() %>%
    select(-n) %>%
    pivot_wider(
      names_from = !!sym(to_course), 
      values_from = percentage, 
      values_fill = 0
    ) %>%
    rename(!!from_course := 1) %>%
    mutate(across(-1, ~ paste0(sprintf("%.1f", .x), "%")))
  
  return(transition)
}

kable(make_transition_table(wide_all_letters, "chem1", "chem2"),format = "markdown",caption = "Percent chance to transition from CHEM1 to CHEM2 - All cohorts")
Percent chance to transition from CHEM1 to CHEM2 - All cohorts
chem1 A B C D OUT F
A 59.2% 24.1% 3.1% 0.3% 13.3% 0.0%
B 15.8% 41.8% 19.4% 2.3% 20.5% 0.2%
C 2.1% 18.8% 32.3% 6.8% 37.5% 2.6%
D 0.0% 5.9% 15.7% 3.9% 72.5% 2.0%
F 0.0% 0.0% 0.0% 0.0% 100.0% 0.0%

A way to read the above table is that 59.2% of students who got an A in chem1 will get an A in chem2. Notice that the lower the grade in CHEM1 the higher the chance to be OUT.

kable(make_transition_table(wide_all_letters, "chem2", "chem3"),format = "markdown",caption = "Percent chance to transition from CHEM2 to CHEM3 - All cohorts")
Percent chance to transition from CHEM2 to CHEM3 - All cohorts
chem2 A B C D OUT F
A 45.2% 24.8% 1.6% 0.3% 28.1% 0.0%
B 5.4% 30.2% 23.6% 3.6% 36.6% 0.6%
C 0.6% 4.6% 21.8% 10.3% 60.9% 1.7%
D 0.0% 3.7% 7.4% 0.0% 85.2% 3.7%
F 0.0% 0.0% 0.0% 0.0% 100.0% 0.0%
OUT 0.0% 0.0% 0.0% 0.0% 100.0% 0.0%
kable(make_transition_table(wide_all_letters, "chem3", "chem4"),format = "markdown",caption = "Percent chance to transition from CHEM3 to CHEM4 - All cohorts")
Percent chance to transition from CHEM3 to CHEM4 - All cohorts
chem3 A B C OUT D F
A 60.4% 25.8% 1.3% 12.6% 0.0% 0.0%
B 19.4% 57.0% 11.8% 10.8% 0.5% 0.5%
C 1.6% 39.0% 29.3% 24.4% 4.1% 1.6%
D 3.2% 3.2% 22.6% 51.6% 12.9% 6.5%
F 0.0% 0.0% 0.0% 100.0% 0.0% 0.0%
OUT 0.6% 0.9% 0.5% 97.6% 0.3% 0.0%

1.3 Cohort 1: Starting in Fall2018

Took: * CHEM1 in F18 * CHEM2 in S19 * CHEM3 in F19 or later * CHEM4 in S20 or later

makeAlluvial_Cohort(f18letter,"F18 all students cohort")

summary_table <- create_summary_table(f18letter)
kable(summary_table,caption = "F18 numbers")
F18 numbers
letterGrade chem1 chem2 chem3 chem4 chem1% chem2% chem3% chem4%
A 90 38 18 16 49.7% 21.0% 9.9% 8.8%
B 71 50 36 34 39.2% 27.6% 19.9% 18.8%
C 13 34 19 14 7.2% 18.8% 10.5% 7.7%
D 2 5 4 2 1.1% 2.8% 2.2% 1.1%
F 5 2 1 0 2.8% 1.1% 0.6% 0.0%
OUT 0 52 103 115 0.0% 28.7% 56.9% 63.5%
Total 181 181 181 181 100.0% 100.0% 100.0% 100.0%

1.4 Cohort 2: Starting in Fall2019

Took: * CHEM1 in F19 * CHEM2 in S20 * CHEM3 in F20 or later * CHEM4 in S21 or later

makeAlluvial_Cohort(f19letter,"F19 all students cohort")

summary_table <- create_summary_table(f19letter)
kable(summary_table,caption = "F19 numbers")
F19 numbers
letterGrade chem1 chem2 chem3 chem4 chem1% chem2% chem3% chem4%
A 52 29 23 13 28.6% 15.9% 12.6% 7.1%
B 95 59 31 46 52.2% 32.4% 17.0% 25.3%
C 25 39 25 19 13.7% 21.4% 13.7% 10.4%
D 7 6 10 2 3.8% 3.3% 5.5% 1.1%
F 3 1 2 0 1.6% 0.5% 1.1% 0.0%
OUT 0 48 91 102 0.0% 26.4% 50.0% 56.0%
Total 182 182 182 182 100.0% 100.0% 100.0% 100.0%

1.5 Cohort 3: Started in Fall2020

Took: * CHEM1 in F20 * CHEM2 in S21 * CHEM3 in F21 or later * CHEM4 in S22 or later

makeAlluvial_Cohort(f20letter,"F20 all students cohort")

summary_table <- create_summary_table(f20letter)
kable(summary_table,caption = "F20 numbers")
F20 numbers
letterGrade chem1 chem2 chem3 chem4 chem1% chem2% chem3% chem4%
A 48 59 30 27 21.5% 26.5% 13.5% 12.1%
B 80 62 34 30 35.9% 27.8% 15.2% 13.5%
C 65 35 23 12 29.1% 15.7% 10.3% 5.4%
D 22 5 6 3 9.9% 2.2% 2.7% 1.3%
F 8 1 0 1 3.6% 0.4% 0.0% 0.4%
OUT 0 61 130 150 0.0% 27.4% 58.3% 67.3%
Total 223 223 223 223 100.0% 100.0% 100.0% 100.0%

1.6 Cohort 4: Started in Fall2021

Took:

  • CHEM1 in F21
  • CHEM2 in S22
  • CHEM3 in F22 or later
  • CHEM4 in S23
makeAlluvial_Cohort(f21letter,"F21 all students cohort")

summary_table <- create_summary_table(f21letter)
kable(summary_table,caption = "F21 numbers")
F21 numbers
letterGrade chem1 chem2 chem3 chem4 chem1% chem2% chem3% chem4%
A 71 58 31 31 36.4% 29.7% 15.9% 15.9%
B 76 52 22 24 39.0% 26.7% 11.3% 12.3%
C 28 29 21 5 14.4% 14.9% 10.8% 2.6%
D 14 5 2 2 7.2% 2.6% 1.0% 1.0%
F 6 2 2 2 3.1% 1.0% 1.0% 1.0%
OUT 0 49 117 131 0.0% 25.1% 60.0% 67.2%
Total 195 195 195 195 100.0% 100.0% 100.0% 100.0%

1.7 Cohort 5. Started in Fall 2022

Took:

  • CHEM1 in F22
  • CHEM2 in S23
  • CHEM3 in F23
  • CHEM4 in S24
makeAlluvial_Cohort(f22letter,"F22 all students cohort")

summary_table <- create_summary_table(f22letter)
kable(summary_table,caption = "F22 numbers")
F22 numbers
letterGrade chem1 chem2 chem3 chem4 chem1% chem2% chem3% chem4%
A 60 54 24 28 28.8% 26.0% 11.5% 13.5%
B 102 69 39 40 49.0% 33.2% 18.8% 19.2%
C 37 26 24 9 17.8% 12.5% 11.5% 4.3%
D 4 5 8 2 1.9% 2.4% 3.8% 1.0%
F 5 1 0 2 2.4% 0.5% 0.0% 1.0%
OUT 0 53 113 127 0.0% 25.5% 54.3% 61.1%
Total 208 208 208 208 100.0% 100.0% 100.0% 100.0%

1.8 Cohort 6. Started in Fall 2023

Took:

  • CHEM1 in F23
  • CHEM2 in S24
  • CHEM3 in F24
  • CHEM4 in S25

At the moment of this analysis some students dropped or have not taken ochem2 or chem4, so they may take it in the next year and the results may change

makeAlluvial_Cohort(f22letter,"F23 all students cohort")

summary_table <- create_summary_table(f23letter)
kable(summary_table,caption = "F23 numbers")
F23 numbers
letterGrade chem1 chem2 chem3 chem4 chem1% chem2% chem3% chem4%
A 69 72 33 24 45.7% 47.7% 21.9% 15.9%
B 50 39 24 28 33.1% 25.8% 15.9% 18.5%
C 24 11 11 11 15.9% 7.3% 7.3% 7.3%
D 2 1 1 1 1.3% 0.7% 0.7% 0.7%
F 6 0 1 0 4.0% 0.0% 0.7% 0.0%
OUT 0 28 81 87 0.0% 18.5% 53.6% 57.6%
Total 151 151 151 151 100.0% 100.0% 100.0% 100.0%

1.9 The fate of C- students

If a student gets less than 74%, what advise should we give them?

Cstud_s23 = chem2_s23[which(chem2_s23$Unposted.Final.Score < 74),]
Cstud_s23 = Cstud_s23
summary_table <- f18cohort %>%
  filter(chem2 < 74) %>%
  summarize(
    students_below_74 = n(),
    avg_chem3_score = mean(chem3, na.rm = TRUE),
    avg_chem4_score = mean(chem4, na.rm = TRUE)
  )

# Print summary table
print(summary_table)
##   students_below_74 avg_chem3_score avg_chem4_score
## 1                18         75.0475            74.6

2 Only students who took the 4 semesters

Now let’s just consider only the students who took the four semesters.

There are some students who transferred in GenChem2, so there are some premed-like students that continued to Biochemistry even if they look to be “OUT” of the curriculum.

TODO: Check Biochem lists.

## PREPARE FILES
select_and_add_name <- function(df, name) {
  if (any(is.na(df$Final.Score))) {
    print(paste("Data frame", name, "contains NA values in the 'grade' column"))
  }
  selected_df <- df %>% 
    #select(ID, Final.Score) %>%
    select(SIS.Login.ID, Final.Score) %>%
    mutate(Final.Score = as.numeric(Final.Score))  # Convert "grade" to numeric type
  selected_df$name <- name
  return(selected_df)
}

# Apply the function to each dataframe in the list
dfs_with_name <- Map(select_and_add_name, dfs, names(dfs))

# Merge all data frames into one
merged_df <- bind_rows(dfs_with_name)

names(merged_df)[names(merged_df)=="SIS.Login.ID"] = "student"
allCohort <- separate(merged_df, name, into = c("course", "semester"), sep = "_")
allCohort$letterGrade = ifelse(allCohort$Final.Score > 90, "A",
                         ifelse(allCohort$Final.Score >= 80, "B",
                                ifelse(allCohort$Final.Score >= 70, "C",
                                       ifelse(allCohort$Final.Score >= 60, "D", "F"))))

# filter just in case they took it twice, just take the highest score
allCohort <- allCohort %>%
  group_by(student, course) %>%
  filter(Final.Score == max(Final.Score))

# What students took the whole sequence
fullFour <- allCohort %>%
  group_by(student) %>%
  filter(all(c("chem1", "chem2", "chem3", "chem4") %in% course))

# filter just in case they took it twice, just take the highest score
fullFour <- fullFour %>%
  group_by(student, course) %>%
  filter(Final.Score == max(Final.Score))
#Writing just in case

## REPRESENTING
selectSemester = function(df,thisSem){
  selected_students <- df %>%
    filter(course == "chem1" & semester == thisSem) %>%
    pull(student) %>%
    unique()

  # Filter the dataframe to include all rows for selected students
  thisSem_df <- df %>%
    filter(student %in% selected_students)
  return(thisSem_df) 
}
plotAlluvialCohort = function(df,thisSem,thisTitle){
  
  thisSem_df = selectSemester(df,thisSem)
  ggplot(thisSem_df,
           aes(x = course, stratum = letterGrade, alluvium = student, fill = letterGrade)) +
      scale_x_discrete(expand = c(.1, .1)) +
      geom_flow() +
      geom_stratum(alpha = .5) +
      theme_minimal() +
      geom_text(stat = "stratum",
                aes(label = percent(after_stat(prop), accuracy = .1)))+
      labs(title = thisTitle, x = "", y = "Number of students")
}
makeAlluvial_Cohort(fullFour,"All cohorts. Students who took the four semesters")

summary_table <- create_summary_table(fullFour)
kable(summary_table,caption = "All cohort numbers. Students who took the four semesters")
All cohort numbers. Students who took the four semesters
letterGrade chem1 chem2 chem3 chem4 chem1% chem2% chem3% chem4%
A 207 196 139 135 50.4% 47.7% 33.8% 32.8%
B 177 170 165 194 43.1% 41.4% 40.1% 47.2%
C 25 42 92 67 6.1% 10.2% 22.4% 16.3%
D 2 3 15 10 0.5% 0.7% 3.6% 2.4%
F 0 0 0 5 0.0% 0.0% 0.0% 1.2%
Total 411 411 411 411 100.0% 100.0% 100.0% 100.0%

2.1 Cohort 1: Started in Fall2018

plotAlluvialCohort(fullFour,"f18","AllFour: Cohort taking CHEM1 in Fall 2018")

summary_table <- create_summary_table(selectSemester(fullFour,"f18"))
kable(summary_table,caption = "All4 F18 cohort numbers")
All4 F18 cohort numbers
letterGrade chem1 chem2 chem3 chem4 chem1% chem2% chem3% chem4%
A 46 23 16 15 71.9% 35.9% 25.0% 23.4%
B 18 32 32 33 28.1% 50.0% 50.0% 51.6%
C 0 8 14 14 0.0% 12.5% 21.9% 21.9%
D 0 1 2 2 0.0% 1.6% 3.1% 3.1%
Total 64 64 64 64 100.0% 100.0% 100.0% 100.0%

2.2 Cohort 2: Started in Fall2019

plotAlluvialCohort(fullFour,"f19","AllFour: Cohort taking CHEM1 in Fall 2019")

summary_table <- create_summary_table(selectSemester(fullFour,"f19"))
kable(summary_table,caption = "All4 F18 cohort numbers")
All4 F18 cohort numbers
letterGrade chem1 chem2 chem3 chem4 chem1% chem2% chem3% chem4%
A 30 21 23 12 38.5% 26.9% 29.5% 15.4%
B 45 40 29 46 57.7% 51.3% 37.2% 59.0%
C 3 16 20 18 3.8% 20.5% 25.6% 23.1%
D 0 1 6 2 0.0% 1.3% 7.7% 2.6%
Total 78 78 78 78 100.0% 100.0% 100.0% 100.0%

2.3 Cohort 3: Started in Fall2020

plotAlluvialCohort(fullFour,"f20","AllFour: Cohort taking CHEM1 in Fall 2020")

summary_table <- create_summary_table(selectSemester(fullFour,"f20"))
kable(summary_table,caption = "All4 F20 cohort numbers")
All4 F20 cohort numbers
letterGrade chem1 chem2 chem3 chem4 chem1% chem2% chem3% chem4%
A 27 40 24 27 37.5% 55.6% 33.3% 37.5%
B 34 25 28 29 47.2% 34.7% 38.9% 40.3%
C 10 7 17 12 13.9% 9.7% 23.6% 16.7%
D 1 0 3 3 1.4% 0.0% 4.2% 4.2%
F 0 0 0 1 0.0% 0.0% 0.0% 1.4%
Total 72 72 72 72 100.0% 100.0% 100.0% 100.0%

2.4 Cohort 4: Started in Fall2021

plotAlluvialCohort(fullFour,"f21","AllFour: Cohort taking CHEM1 in Fall 2021")

summary_table <- create_summary_table(selectSemester(fullFour,"f21"))
kable(summary_table,caption = "All4 F21 cohort numbers")
All4 F21 cohort numbers
letterGrade chem1 chem2 chem3 chem4 chem1% chem2% chem3% chem4%
A 33 29 24 29 57.9% 50.9% 42.1% 50.9%
B 20 23 18 20 35.1% 40.4% 31.6% 35.1%
C 3 5 13 5 5.3% 8.8% 22.8% 8.8%
D 1 0 2 1 1.8% 0.0% 3.5% 1.8%
F 0 0 0 2 0.0% 0.0% 0.0% 3.5%
Total 57 57 57 57 100.0% 100.0% 100.0% 100.0%

2.5 Cohort 5: Started in Fall2022

plotAlluvialCohort(fullFour,"f22","AllFour: Cohort taking CHEM1 in Fall 2022")

summary_table <- create_summary_table(selectSemester(fullFour,"f22"))
kable(summary_table,caption = "All4 F22 cohort numbers")
All4 F22 cohort numbers
letterGrade chem1 chem2 chem3 chem4 chem1% chem2% chem3% chem4%
A 32 38 21 28 40.5% 48.1% 26.6% 35.4%
B 41 34 37 40 51.9% 43.0% 46.8% 50.6%
C 6 6 19 8 7.6% 7.6% 24.1% 10.1%
D 0 1 2 1 0.0% 1.3% 2.5% 1.3%
F 0 0 0 2 0.0% 0.0% 0.0% 2.5%
Total 79 79 79 79 100.0% 100.0% 100.0% 100.0%

2.6 Cohort 6: Started in Fall2023

plotAlluvialCohort(fullFour,"f23","AllFour: Cohort taking CHEM1 in Fall 2023")

summary_table <- create_summary_table(selectSemester(fullFour,"f23"))
kable(summary_table,caption = "All4 F23 cohort numbers")
All4 F23 cohort numbers
letterGrade chem1 chem2 chem3 chem4 chem1% chem2% chem3% chem4%
A 39 45 31 24 63.9% 73.8% 50.8% 39.3%
B 19 16 21 26 31.1% 26.2% 34.4% 42.6%
C 3 0 9 10 4.9% 0.0% 14.8% 16.4%
D 0 0 0 1 0.0% 0.0% 0.0% 1.6%
Total 61 61 61 61 100.0% 100.0% 100.0% 100.0%

3 Demographics and pre-UMR

#discover demographics
discover =  read.csv("~/Teaching/Grades_and_SRT/discover_chem1331_f18-f23_.csv",header = TRUE)

discf18 = discover[which(discover$Semester == "Fall 2018"),]
discf19 = discover[which(discover$Semester == "Fall 2019"),]
discf20 = discover[which(discover$Semester == "Fall 2020"),]
discf21 = discover[which(discover$Semester == "Fall 2021"),]
discf22 = discover[which(discover$Semester == "Fall 2022"),]
discf23 = discover[which(discover$Semester == "Fall 2023"),]

3.1 Sex

library(dplyr)

# Filter out blank or NA values in DEM_Sex
cleaned <- discover %>%
  filter(!is.na(DEM_Sex), DEM_Sex %in% c("F", "M"))
ggplot(cleaned, aes(x = Semester, fill = DEM_Sex)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Semester", y = "Proportion", fill = "Sex") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Create contingency table
semester_table <- table(cleaned$Semester, cleaned$DEM_Sex)

# Run chi-square test
chisq.test(semester_table)
## 
##  Pearson's Chi-squared test
## 
## data:  semester_table
## X-squared = 3.996, df = 5, p-value = 0.55

3.2 Student of color

cleaned <- discover %>%
  filter(!is.na(DEM_Student.of.Color), DEM_Student.of.Color %in% c("Y", "N"))
ggplot(cleaned, aes(x = Semester, fill = DEM_Student.of.Color)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Semester", y = "Proportion", fill = "Student.of.Color") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Create contingency table
semester_table <- table(cleaned$Semester, cleaned$DEM_Student.of.Color)

# Run chi-square test
chisq.test(semester_table)
## 
##  Pearson's Chi-squared test
## 
## data:  semester_table
## X-squared = 7.8073, df = 5, p-value = 0.1672

3.3 First generation

# Replace empty or NA values with "Missing"
plot_data <- discover %>%
  filter(DEM_Sex %in% c("F", "M")) %>%
  mutate(
    DEM_First.Generation = ifelse(
      is.na(DEM_First.Generation) | DEM_First.Generation == "", 
      "Missing", 
      DEM_First.Generation
    )
  )

# Plot
ggplot(plot_data, aes(x = Semester, fill = DEM_First.Generation)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Semester", y = "Proportion", fill = "First Gen") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

chi_data <- discover %>%
  filter(DEM_First.Generation %in% c("Y", "N"))

# Contingency table
firstgen_table <- table(chi_data$Semester, chi_data$DEM_First.Generation)

# Chi-square test
chi_square_result = chisq.test(firstgen_table)

chi_square_result
## 
##  Pearson's Chi-squared test
## 
## data:  firstgen_table
## X-squared = 21.616, df = 5, p-value = 0.0006192
library(corrplot)
residuals <- chi_square_result$residuals
# Normalize the residuals to make them suitable for plotting
  normalized_residuals <- residuals / sqrt(chi_square_result$expected)
  corrplot(normalized_residuals, is.corr = FALSE, method = "circle",  tl.col = "black", tl.srt = 45,mar = c(3, 3, 3, 3)) 

3.4 HS GPA value

# Filter for non-missing GPA values
gpa_data <- discover %>%
  filter(!is.na(DEM_HS_GPA_scale_unknown))

# Boxplot
ggplot(gpa_data, aes(x = Semester, y = DEM_HS_GPA_scale_unknown)) +
  geom_boxplot() +
  labs(x = "Semester", y = "High School GPA") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Run ANOVA
anova_result <- aov(DEM_HS_GPA_scale_unknown ~ Semester, data = gpa_data)
summary(anova_result)
##              Df Sum Sq Mean Sq F value Pr(>F)  
## Semester      5   1.49  0.2980    2.53  0.028 *
## Residuals   602  70.92  0.1178                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
gpa_summary <- discover %>%
  group_by(Semester) %>%
  summarise(
    total = n(),
    non_missing = sum(!is.na(DEM_HS_GPA_scale_unknown)),
    percent_available = round(100 * non_missing / total, 1),
    mean_gpa = mean(DEM_HS_GPA_scale_unknown, na.rm = TRUE),
    median_gpa = median(DEM_HS_GPA_scale_unknown, na.rm = TRUE),
    sd_gpa = sd(DEM_HS_GPA_scale_unknown, na.rm = TRUE)
  )
library(kableExtra)

gpa_summary %>%
  arrange(Semester) %>%
  kable(digits = 2, caption = "GPA Summary by Semester") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover"))
GPA Summary by Semester
Semester total non_missing percent_available mean_gpa median_gpa sd_gpa
Fall 2018 181 10 5.5 3.40 3.50 0.25
Fall 2019 182 153 84.1 3.65 3.74 0.34
Fall 2020 229 199 86.9 3.63 3.72 0.34
Fall 2021 195 162 83.1 3.58 3.69 0.38
Fall 2022 208 11 5.3 3.53 3.59 0.30
Fall 2023 161 73 45.3 3.70 3.70 0.29

3.4.1 HSGPA and First generation

gpa_fg <- discover %>%
  filter(DEM_First.Generation %in% c("Y", "N"),
         !is.na(DEM_HS_GPA_scale_unknown))

ttest_result <- t.test(DEM_HS_GPA_scale_unknown ~ DEM_First.Generation, data = gpa_fg)
p_value <- ttest_result$p.value

ggplot(gpa_fg, aes(x = DEM_First.Generation, y = DEM_HS_GPA_scale_unknown, fill = DEM_First.Generation)) +
  geom_violin(trim = FALSE, alpha = 0.5) +
  geom_boxplot(width = 0.1, outlier.shape = NA, alpha = 0.8) +
  labs(x = "First Generation (Y/N)", y = "High School GPA") +
  theme_minimal() +
  theme(legend.position = "none")

gpa_fg %>%
  group_by(DEM_First.Generation) %>%
  summarise(
    count = n(),
    mean_gpa = mean(DEM_HS_GPA_scale_unknown),
    sd_gpa = sd(DEM_HS_GPA_scale_unknown),
    median_gpa = median(DEM_HS_GPA_scale_unknown)
  ) %>%
  mutate(`t-test p-value` = ifelse(row_number() == 1, round(p_value, 4), "")) %>%
  kable(digits = 2, caption = "GPA by First Generation Status with t-test") %>%
  kable_styling(full_width = FALSE)
GPA by First Generation Status with t-test
DEM_First.Generation count mean_gpa sd_gpa median_gpa t-test p-value
N 179 3.61 0.37 3.70 0.476
Y 242 3.58 0.35 3.65

For each semester

library(dplyr)
library(tidyr)
library(broom)
library(kableExtra)

# Filter usable data
gpa_by_semester <- discover %>%
  filter(DEM_First.Generation %in% c("Y", "N"),
         !is.na(DEM_HS_GPA_scale_unknown))

# Run only on semesters with >=2 values in both Y and N groups
gpa_comparison <- gpa_by_semester %>%
  group_by(Semester) %>%
  filter(sum(DEM_First.Generation == "Y") >= 2,
         sum(DEM_First.Generation == "N") >= 2) %>%
  group_modify(~ {
    data = .
    test = t.test(DEM_HS_GPA_scale_unknown ~ DEM_First.Generation, data = data)
    tidy_test = broom::tidy(test)
    
    summary_stats <- data %>%
      group_by(DEM_First.Generation) %>%
      summarise(
        count = n(),
        mean = mean(DEM_HS_GPA_scale_unknown),
        sd = sd(DEM_HS_GPA_scale_unknown),
        .groups = "drop"
      ) %>%
      pivot_wider(
        names_from = DEM_First.Generation,
        values_from = c(count, mean, sd),
        names_glue = "{.value}_{DEM_First.Generation}"
      )
    
    bind_cols(summary_stats, tibble(
      p_value = tidy_test$p.value
    ))
  }) %>%
  ungroup()

# Print table
gpa_comparison %>%
  mutate(p_value = round(p_value, 4)) %>%
  kable(
    digits = 2,
    caption = "GPA by First Generation Status for Each Semester (≥2 students in each group)"
  ) %>%
  kable_styling(full_width = FALSE)
GPA by First Generation Status for Each Semester (≥2 students in each group)
Semester count_N count_Y mean_N mean_Y sd_N sd_Y p_value
Fall 2019 28 86 3.58 3.66 0.38 0.35 0.38
Fall 2020 54 54 3.57 3.54 0.37 0.32 0.64
Fall 2021 64 76 3.58 3.55 0.38 0.39 0.69
Fall 2022 3 5 3.62 3.52 0.07 0.42 0.64
Fall 2023 25 20 3.84 3.54 0.28 0.26 0.00

3.4.2 HSGPA and SoC

gpa_fg <- discover %>%
  filter(DEM_Student.of.Color %in% c("Y", "N"),
         !is.na(DEM_HS_GPA_scale_unknown))

ttest_result <- t.test(DEM_HS_GPA_scale_unknown ~ DEM_Student.of.Color, data = gpa_fg)
p_value <- ttest_result$p.value

ggplot(gpa_fg, aes(x = DEM_Student.of.Color, y = DEM_HS_GPA_scale_unknown, fill = DEM_Student.of.Color)) +
  geom_violin(trim = FALSE, alpha = 0.5) +
  geom_boxplot(width = 0.1, outlier.shape = NA, alpha = 0.8) +
  labs(x = "Student of Color (Y/N)", y = "High School GPA") +
  theme_minimal() +
  theme(legend.position = "none")

gpa_fg %>%
  group_by(DEM_Student.of.Color) %>%
  summarise(
    count = n(),
    mean_gpa = mean(DEM_HS_GPA_scale_unknown),
    sd_gpa = sd(DEM_HS_GPA_scale_unknown),
    median_gpa = median(DEM_HS_GPA_scale_unknown)
  ) %>%
  mutate(`t-test p-value` = ifelse(row_number() == 1, round(p_value, 4), "")) %>%
  kable(digits = 2, caption = "GPA by Student of Color Status with t-test") %>%
  kable_styling(full_width = FALSE)
GPA by Student of Color Status with t-test
DEM_Student.of.Color count mean_gpa sd_gpa median_gpa t-test p-value
N 376 3.67 0.32 3.74 1e-04
Y 232 3.56 0.37 3.59
# Filter usable data
gpa_by_semester <- discover %>%
  filter(DEM_Student.of.Color %in% c("Y", "N"),
         !is.na(DEM_HS_GPA_scale_unknown))

# Run only on semesters with >=2 values in both Y and N groups
gpa_comparison <- gpa_by_semester %>%
  group_by(Semester) %>%
  filter(sum(DEM_Student.of.Color == "Y") >= 2,
         sum(DEM_Student.of.Color == "N") >= 2) %>%
  group_modify(~ {
    data = .
    test = t.test(DEM_HS_GPA_scale_unknown ~ DEM_Student.of.Color, data = data)
    tidy_test = broom::tidy(test)
    
    summary_stats <- data %>%
      group_by(DEM_Student.of.Color) %>%
      summarise(
        count = n(),
        mean = mean(DEM_HS_GPA_scale_unknown),
        sd = sd(DEM_HS_GPA_scale_unknown),
        .groups = "drop"
      ) %>%
      pivot_wider(
        names_from = DEM_Student.of.Color,
        values_from = c(count, mean, sd),
        names_glue = "{.value}_{DEM_Student.of.Color}"
      )
    
    bind_cols(summary_stats, tibble(
      p_value = tidy_test$p.value
    ))
  }) %>%
  ungroup()

# Print table
gpa_comparison %>%
  mutate(p_value = round(p_value, 4)) %>%
  kable(
    digits = 2,
    caption = "GPA by Student of Color Status for Each Semester (≥2 students in each group)"
  ) %>%
  kable_styling(full_width = FALSE)
GPA by Student of Color Status for Each Semester (≥2 students in each group)
Semester count_N count_Y mean_N mean_Y sd_N sd_Y p_value
Fall 2018 3 7 3.40 3.40 0.19 0.29 0.98
Fall 2019 103 50 3.70 3.55 0.30 0.39 0.02
Fall 2020 124 75 3.67 3.57 0.33 0.35 0.04
Fall 2021 95 67 3.61 3.54 0.36 0.40 0.23
Fall 2022 5 6 3.48 3.57 0.31 0.31 0.62
Fall 2023 46 27 3.75 3.62 0.25 0.34 0.11

3.4.3 HSGPA and Sex

gpa_fg <- discover %>%
  filter(DEM_Sex %in% c("M", "F"),
         !is.na(DEM_HS_GPA_scale_unknown))

ttest_result <- t.test(DEM_HS_GPA_scale_unknown ~ DEM_Sex, data = gpa_fg)
p_value <- ttest_result$p.value

ggplot(gpa_fg, aes(x = DEM_Sex, y = DEM_HS_GPA_scale_unknown, fill = DEM_Sex)) +
  geom_violin(trim = FALSE, alpha = 0.5) +
  geom_boxplot(width = 0.1, outlier.shape = NA, alpha = 0.8) +
  labs(x = "Student of Color (Y/N)", y = "High School GPA") +
  theme_minimal() +
  theme(legend.position = "none")

gpa_fg %>%
  group_by(DEM_Sex) %>%
  summarise(
    count = n(),
    mean_gpa = mean(DEM_HS_GPA_scale_unknown),
    sd_gpa = sd(DEM_HS_GPA_scale_unknown),
    median_gpa = median(DEM_HS_GPA_scale_unknown)
  ) %>%
  mutate(`t-test p-value` = ifelse(row_number() == 1, round(p_value, 4), "")) %>%
  kable(digits = 2, caption = "GPA by Sex with t-test") %>%
  kable_styling(full_width = FALSE)
GPA by Sex with t-test
DEM_Sex count mean_gpa sd_gpa median_gpa t-test p-value
F 489 3.65 0.34 3.72 2e-04
M 119 3.51 0.37 3.58
# Filter usable data
gpa_by_semester <- discover %>%
  filter(DEM_Sex %in% c("M", "F"),
         !is.na(DEM_HS_GPA_scale_unknown))

# Run only on semesters with >=2 values in both Y and N groups
gpa_comparison <- gpa_by_semester %>%
  group_by(Semester) %>%
  filter(sum(DEM_Sex == "M") >= 2,
         sum(DEM_Sex == "F") >= 2) %>%
  group_modify(~ {
    data = .
    test = t.test(DEM_HS_GPA_scale_unknown ~ DEM_Sex, data = data)
    tidy_test = broom::tidy(test)
    
    summary_stats <- data %>%
      group_by(DEM_Sex) %>%
      summarise(
        count = n(),
        mean = mean(DEM_HS_GPA_scale_unknown),
        sd = sd(DEM_HS_GPA_scale_unknown),
        .groups = "drop"
      ) %>%
      pivot_wider(
        names_from = DEM_Sex,
        values_from = c(count, mean, sd),
        names_glue = "{.value}_{DEM_Sex}"
      )
    
    bind_cols(summary_stats, tibble(
      p_value = tidy_test$p.value
    ))
  }) %>%
  ungroup()

# Print table
gpa_comparison %>%
  mutate(p_value = round(p_value, 4)) %>%
  kable(
    digits = 2,
    caption = "GPA by Sex for Each Semester (≥2 students in each group)"
  ) %>%
  kable_styling(full_width = FALSE)
GPA by Sex for Each Semester (≥2 students in each group)
Semester count_F count_M mean_F mean_M sd_F sd_M p_value
Fall 2018 5 5 3.48 3.32 0.19 0.31 0.37
Fall 2019 116 37 3.69 3.54 0.31 0.38 0.03
Fall 2020 167 32 3.65 3.57 0.33 0.40 0.30
Fall 2021 136 26 3.61 3.41 0.37 0.38 0.02
Fall 2022 8 3 3.57 3.41 0.33 0.20 0.37
Fall 2023 57 16 3.73 3.60 0.31 0.21 0.06

3.5 ACT value

# ACT Math
discover %>%
  filter(!is.na(DEM_ACT.MATH)) %>%
  ggplot(aes(x = Semester, y = DEM_ACT.MATH)) +
  geom_boxplot(fill = "#2c7fb8", alpha = 0.7) +
  labs(title = "ACT Math Score by Semester", y = "ACT Math", x = "Semester") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# ACT Composite
discover %>%
  filter(!is.na(DEM_ACT.Composite)) %>%
  ggplot(aes(x = Semester, y = DEM_ACT.Composite)) +
  geom_boxplot(fill = "#7fcdbb", alpha = 0.7) +
  labs(title = "ACT Composite Score by Semester", y = "ACT Composite", x = "Semester") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Function to summarize and test
summarize_act <- function(data, variable) {
  total_n <- nrow(data)
  data_valid <- data %>%
    filter(!is.na(.data[[variable]]))

  # Summary by semester
  summary_tbl <- data_valid %>%
    group_by(Semester) %>%
    summarise(
      count = n(),
      mean = mean(.data[[variable]], na.rm = TRUE),
      sd = sd(.data[[variable]], na.rm = TRUE),
      .groups = "drop"
    )

  # ANOVA
  aov_result <- aov(as.formula(paste(variable, "~ Semester")), data = data_valid)
  p_val <- tidy(aov_result)$p.value[1]

  # Add cohort % and p-value
  summary_tbl %>%
    mutate(
      percent_present = round(100 * count / total_n, 1),
      p_value = round(p_val, 4)
    )
}

# Run for both ACT variables
act_math_tbl <- summarize_act(discover, "DEM_ACT.MATH")
act_comp_tbl <- summarize_act(discover, "DEM_ACT.Composite")

# ACT Math Table
act_math_tbl %>%
  kable(digits = 2, caption = "ACT Math by Semester with ANOVA and Coverage %") %>%
  kable_styling(full_width = FALSE)
ACT Math by Semester with ANOVA and Coverage %
Semester count mean sd percent_present p_value
Fall 2018 165 24.31 3.47 14.3 0.02
Fall 2019 167 23.96 3.64 14.4 0.02
Fall 2020 211 23.44 4.05 18.3 0.02
Fall 2021 107 23.19 3.98 9.3 0.02
Fall 2022 95 23.04 4.20 8.2 0.02
Fall 2023 64 22.66 4.73 5.5 0.02
# ACT Composite Table
act_comp_tbl %>%
  kable(digits = 2, caption = "ACT Composite by Semester with ANOVA and Coverage %") %>%
  kable_styling(full_width = FALSE)
ACT Composite by Semester with ANOVA and Coverage %
Semester count mean sd percent_present p_value
Fall 2018 167 24.11 3.22 14.4 0.51
Fall 2019 167 23.98 3.58 14.4 0.51
Fall 2020 211 23.56 4.02 18.3 0.51
Fall 2021 107 23.50 4.11 9.3 0.51
Fall 2022 95 23.41 4.47 8.2 0.51
Fall 2023 65 23.38 4.23 5.6 0.51

4 With Biochemistry

chem5_f21 = read.csv("~/Teaching/Grades_and_SRT/BIOC3321\ Grade\ Spreadsheet\ F21\ F22\ F23/Grades-BIOC_3321_(001)_Fall_2021.csv")
chem5_f22 = read.csv("~/Teaching/Grades_and_SRT/BIOC3321\ Grade\ Spreadsheet\ F21\ F22\ F23/Grades-BIOC_3321_(001)_Fall_2022.csv")
chem5_f23 = read.csv("~/Teaching/Grades_and_SRT/BIOC3321\ Grade\ Spreadsheet\ F21\ F22\ F23/Grades-BIOC_3321_(001)_Fall_2023.csv")
chem5_f21 = head(tail(chem5_f21, -1), -1)
chem5_f22 = head(tail(chem5_f22, -1), -1)
chem5_f23 = tail(chem5_f23, -1)
dfs_bio = list(
  chem5_f21 = chem5_f21,
  chem5_f22 = chem5_f22,
  chem5_f23 = chem5_f23
)
dfs_bio = getFinalScore(dfs_bio)

#merged_df = merge(f19cohort,chem5_f21, by.x = "student", by.y = "SIS.Login.ID", all.x = TRUE)
#f19cohort$chem5 = merged_df$Final.Score
addBiochemStudents = function(df1,df2,df3,df4){
  #df1 is the original cohort, df2,3, and 4 are the biochem 
  df1 <- df1 %>%
    left_join(select(df2, SIS.Login.ID, Final.Score), by = c("student" = "SIS.Login.ID")) %>%
    rename(chem5_f21 = Final.Score)
  df1 <- df1 %>%
    left_join(select(df3, SIS.Login.ID, Final.Score), by = c("student" = "SIS.Login.ID")) %>%
    rename(chem5_f22 = Final.Score)
  df1 <- df1 %>%
    left_join(select(df4, SIS.Login.ID, Final.Score), by = c("student" = "SIS.Login.ID")) %>%
    rename(chem5_f23 = Final.Score)

  df1 <- df1 %>%
  mutate(chem5 = coalesce(chem5_f21, chem5_f22, chem5_f23)) %>%
  select(-chem5_f21, -chem5_f22, -chem5_f23)  # Remove intermediate columns
  df1$chem5 = as.numeric(df1$chem5) 
  return(df1)

}
addFinalExamBiochemStudents = function(df1,df2,df3,df4){
  #df1 is the original cohort, df2,3, and 4 are the biochem 
  df1 <- df1 %>%
    left_join(select(df2, SIS.Login.ID, Final.Exam.Final.Score), by = c("student" = "SIS.Login.ID")) %>%
    rename(chem5_f21_final = Final.Exam.Final.Score)
  df1 <- df1 %>%
    left_join(select(df3, SIS.Login.ID, Final.Exam.Final.Score), by = c("student" = "SIS.Login.ID")) %>%
    rename(chem5_f22_final = Final.Exam.Final.Score)
  df1 <- df1 %>%
    left_join(select(df4, SIS.Login.ID, Final.Exam.Final.Score), by = c("student" = "SIS.Login.ID")) %>%
    rename(chem5_f23_final = Final.Exam.Final.Score)

  df1 <- df1 %>%
  mutate(chem5_final = coalesce(chem5_f21_final, chem5_f22_final, chem5_f23_final)) %>%
  select(-chem5_f21_final, -chem5_f22_final, -chem5_f23_final)  # Remove intermediate columns
  
  df1$chem5_final = as.numeric(df1$chem5_final) 
  
  return(df1)

}

f19cohort = addBiochemStudents(f19cohort,chem5_f21,chem5_f22,chem5_f23)
f20cohort = addBiochemStudents(f20cohort,chem5_f21,chem5_f22,chem5_f23)
f21cohort = addBiochemStudents(f21cohort,chem5_f21,chem5_f22,chem5_f23)

f19cohort = addFinalExamBiochemStudents(f19cohort,chem5_f21,chem5_f22,chem5_f23)
f20cohort = addFinalExamBiochemStudents(f20cohort,chem5_f21,chem5_f22,chem5_f23)
f21cohort = addFinalExamBiochemStudents(f21cohort,chem5_f21,chem5_f22,chem5_f23)

f19complete = f19cohort[complete.cases(f19cohort),]
f20complete = f20cohort[complete.cases(f20cohort),]
f21complete = f21cohort[complete.cases(f21cohort),]

f19complete$student = sub("@umn.edu","",f19complete$student)
f20complete$student = sub("@umn.edu","",f20complete$student)
f21complete$student = sub("@umn.edu","",f21complete$student)

4.1 Scatter plot matrix

#install.packages("GGally")

p <- ggpairs(f19complete[, c(2:5,7)], title = "Fall19 cohort vs Biochem final exam")
print(p)

p <- ggpairs(f20complete[, c(2:5,7)], title = "Fall20 cohort vs Biochem final exam")
print(p)

p <- ggpairs(f21complete[, c(2:5,7)], title = "Fall21 cohort vs Biochem final exam")
print(p)

4.2 Correlation Heat map

#install.packages("ggcorrplot")
library(ggcorrplot)

# Calculate correlation matrix
corr_matrix <- cor(f19complete[, c(2:5,7)])

# Plot heatmap
ggcorrplot(corr_matrix, lab = TRUE, title = "Fall 19 cohort - Biochem final")