##READING FILES
# Each read.csv() call loads one course/term grade export into its own data
# frame. Object names follow <course>_<term>, e.g. chem1_f18 = chem1, Fall 2018.
# The paths are absolute to the author's home directory.

# chem1 (files named chem1331), Fall terms 2018-2024.
chem1_f18 =     read.csv("~/Teaching/Grades_and_SRT/Fall2018/chem1331_f18.csv",header = TRUE)
chem1_f19 =     read.csv("~/Teaching/Grades_and_SRT/Fall2019/chem1331_f19_grades_canvas.csv",header = TRUE)
chem1_f20 =     read.csv("~/Teaching/Grades_and_SRT/Fall2020/chem1331_f20_grades_canvas.csv",header = TRUE)
chem1_f21 =     read.csv("~/Teaching/Grades_and_SRT/Fall2021/chem1331_f21_grades_canvas.csv",header = TRUE)
chem1_f22 =     read.csv("~/Teaching/Grades_and_SRT/Fall2022/chem1331_f22_grades_canvas.csv",header = TRUE)
chem1_f23 =     read.csv("~/Teaching/Grades_and_SRT/Fall2023/chem1331_f23_grades_canvas.csv",header = TRUE)
chem1_f24 =     read.csv("~/Teaching/Grades_and_SRT/Fall2024/chem1331_f24_grades_canvas.csv",header = TRUE)

# chem2 (files named chem1333), Spring terms 2019-2024.
chem2_s19 =     read.csv("~/Teaching/Grades_and_SRT/Spring2019/chem1333_s19.csv", header = TRUE)
chem2_s20 =     read.csv("~/Teaching/Grades_and_SRT/Spring2020/chem1333_s20_grades_canvas.csv", header = TRUE)
chem2_s21 =     read.csv("~/Teaching/Grades_and_SRT/Spring2021/chem1333_s21_grades_canvas.csv", header = TRUE)
chem2_s22 =     read.csv("~/Teaching/Grades_and_SRT/Spring2022/chem1333_s22_grades_canvas.csv", header = TRUE)
chem2_s23 =     read.csv("~/Teaching/Grades_and_SRT/Spring2023/chem1333_s23_grades_canvas.csv", header = TRUE)
chem2_s24 =     read.csv("~/Teaching/Grades_and_SRT/Spring2024/chem1333_s24_grades_canvas.csv", header = TRUE)

# chem3, Fall terms 2019-2024.
# NOTE(review): the f19 file is named chem2231 while later terms use chem2131 -
# confirm this is the real file name and not a typo.
chem3_f19 =     read.csv("~/Teaching/Grades_and_SRT/Fall2019/chem2231_f19_grades_canvas.csv",header = TRUE)
chem3_f20 =     read.csv("~/Teaching/Grades_and_SRT/Fall2020/chem2131_f20_grades_canvas.csv",header = TRUE)
chem3_f21 =     read.csv("~/Teaching/Grades_and_SRT/Fall2021/chem2131_f21_grades_canvas.csv",header = TRUE)
chem3_f22 =     read.csv("~/Teaching/Grades_and_SRT/Fall2022/chem2131_f22_grades_canvas.csv",header = TRUE)
chem3_f23 =     read.csv("~/Teaching/Grades_and_SRT/Fall2023/chem2131_f23_grades_canvas.csv",header = TRUE)
chem3_f24 =     read.csv("~/Teaching/Grades_and_SRT/Fall2024/chem2131_f24_grades_canvas.csv",header = TRUE)

# chem4, Spring terms 2020-2025.
# NOTE(review): the s20 file is named chem2333 while later terms use chem2335,
# and the s25 file carries a "tempo" infix - confirm both are intended.
chem4_s20 =     read.csv("~/Teaching/Grades_and_SRT/Spring2020/chem2333_s20_grades_canvas.csv", header = TRUE)
chem4_s21 =     read.csv("~/Teaching/Grades_and_SRT/Spring2021/chem2335_s21_grades_canvas.csv", header = TRUE)
chem4_s22 =     read.csv("~/Teaching/Grades_and_SRT/Spring2022/chem2335_s22_grades_canvas.csv", header = TRUE)
chem4_s23 =     read.csv("~/Teaching/Grades_and_SRT/Spring2023/chem2335_s23_grades_canvas.csv", header = TRUE)
chem4_s24 =     read.csv("~/Teaching/Grades_and_SRT/Spring2024/chem2335_s24_grades_canvas.csv", header = TRUE)
chem4_s25 =     read.csv("~/Teaching/Grades_and_SRT/Spring2025/chem2335_s25_grades_tempo_canvas.csv", header = TRUE)
  
# Named list of every course/term table used in the cohort analysis.
# Each line is one entering cohort followed through chem1 -> chem4.
# The element order matters: later code uses names(dfs) positionally to label
# score columns. chem1_f24 is read above but is not part of this list.
dfs <- list(
  chem1_f18 = chem1_f18, chem2_s19 = chem2_s19, chem3_f19 = chem3_f19, chem4_s20 = chem4_s20,
  chem1_f19 = chem1_f19, chem2_s20 = chem2_s20, chem3_f20 = chem3_f20, chem4_s21 = chem4_s21,
  chem1_f20 = chem1_f20, chem2_s21 = chem2_s21, chem3_f21 = chem3_f21, chem4_s22 = chem4_s22,
  chem1_f21 = chem1_f21, chem2_s22 = chem2_s22, chem3_f22 = chem3_f22, chem4_s23 = chem4_s23,
  chem1_f22 = chem1_f22, chem2_s23 = chem2_s23, chem3_f23 = chem3_f23, chem4_s24 = chem4_s24,
  chem1_f23 = chem1_f23, chem2_s24 = chem2_s24, chem3_f24 = chem3_f24, chem4_s25 = chem4_s25
)
# Raise Final.Score to Current.Score wherever Current.Score is higher.
#
# In some exports Final.Score is lower than Current.Score; the higher of the
# two is used as the student's final score.
#
# dfs: a list of data frames, each with Current.Score and Final.Score columns.
# Returns the same list with Final.Score updated in every data frame.
#
# Rows where either score is missing are left untouched (comparing against NA
# would otherwise abort with "missing value where TRUE/FALSE needed").
# Scores are compared as numbers even when the columns were read as text, so
# "9.5" is not treated as larger than "85". Column types are preserved because
# the replacement values are copied from the original Current.Score column.
getFinalScore <- function(dfs) {
  as_number <- function(x) suppressWarnings(as.numeric(as.character(x)))

  for (i in seq_along(dfs)) {
    df <- dfs[[i]]
    current <- as_number(df[["Current.Score"]])
    final <- as_number(df[["Final.Score"]])

    use_current <- !is.na(current) & !is.na(final) & current > final
    if (any(use_current)) {
      df[["Final.Score"]][use_current] <- df[["Current.Score"]][use_current]
    }
    dfs[[i]] <- df
  }
  dfs
}
# Overwrite dfs with the corrected tables (Final.Score raised to Current.Score
# where the latter is higher).
dfs = getFinalScore(dfs)

1 All students

We started teaching the new curriculum in Fall 2018. The first cohort (f18) finished in Spring 2020

Let’s consider how students flowed through the curriculum and how they did. Some students postpone taking Chem3 or Chem4; we take their grades into account even if they didn’t take the course during their sophomore year.

###PREPARING FILES

# Build one wide table: one row per SIS.Login.ID, one score column per
# course/term, each column named after its entry in `dfs`.
# The score column is renamed *before* joining so the labels cannot drift out
# of step with the join order.
score_tables <- Map(
  function(df, df_name) {
    scores <- select(df, SIS.Login.ID, Final.Score)
    names(scores)[names(scores) == "Final.Score"] <- df_name
    scores
  },
  dfs, names(dfs)
)
combined_df <- reduce(score_tables, full_join, by = "SIS.Login.ID")

# Lookup table to identify a student by name: Student, SIS.Login.ID, then the
# same score columns as combined_df.
# The previous version joined Student and Final.Score from every table (48
# non-key columns) and then assigned the 24 names of `dfs` to all columns but
# the first; the names were silently recycled and the SIS.Login.ID column was
# relabelled as a score column. Here the first name seen for each login is kept.
student_names <- dfs %>%
  map(~ select(.x, Student, SIS.Login.ID)) %>%
  bind_rows() %>%
  distinct(SIS.Login.ID, .keep_all = TRUE)
whos_that_student <- full_join(student_names, combined_df, by = "SIS.Login.ID")

# Rename the SIS.Login.ID column to student
names(combined_df)[1] <- "student"

# A cohort is every student with a (non-missing) chem1 score in the given
# Fall term; all of that student's other course columns are kept.
f18cohort <- filter(combined_df, !is.na(chem1_f18))
f19cohort <- filter(combined_df, !is.na(chem1_f19))
f20cohort <- filter(combined_df, !is.na(chem1_f20))
f21cohort <- filter(combined_df, !is.na(chem1_f21))
f22cohort <- filter(combined_df, !is.na(chem1_f22))
f23cohort <- filter(combined_df, !is.na(chem1_f23))

# Collapse all columns whose name starts with `prefix` into a single vector
# holding, for each row, the highest non-missing value.
#
# df:     data frame with the per-term score columns.
# prefix: column-name prefix, e.g. "chem1" (used as the regex "^chem1").
# Returns a vector with one element per row of `df`; a row is NA only when
# every matching column is NA. If no column matches the prefix, an all-NA
# vector is returned instead of failing inside pmax().
merge_columns <- function(df, prefix) {
  columns <- grep(paste0("^", prefix), names(df), value = TRUE)
  if (length(columns) == 0) {
    return(rep(NA_real_, nrow(df)))
  }
  # pmax(..., na.rm = TRUE) already yields NA for rows where all inputs are NA.
  do.call(pmax, c(unname(as.list(df[columns])), na.rm = TRUE))
}

# Reduce a wide cohort table to one best score per course.
#
# df: cohort table with a `student` column and per-term columns whose names
#     start with chem1, chem2, chem3 and chem4.
# Returns a table with columns student, chem1, chem2, chem3, chem4, where each
# course column is produced by merge_columns() from the per-term columns.
returnMergedColumns <- function(df) {
  best_scores <- df
  # Every merge reads the untouched input `df`, so a newly added column is
  # never picked up by a later prefix match.
  for (course_name in c("chem1", "chem2", "chem3", "chem4")) {
    best_scores[[course_name]] <- merge_columns(df, course_name)
  }
  select(best_scores, student, chem1, chem2, chem3, chem4)
}
# Convert a cohort's numeric course scores into long-format letter grades.
#
# df: table with columns student, chem1, chem2, chem3, chem4 (numeric scores,
#     NA when the course was not taken).
# Returns a long table with columns student, course, letterGrade. A missing
# score becomes "OUT"; A is strictly above 90, so a score of exactly 90 is a B.
prepareAlluvial <- function(df) {
  # Conditions are checked top to bottom, so each branch only needs its lower
  # bound: anything above it has already been claimed by an earlier branch.
  score_to_letter <- function(score) {
    case_when(
      is.na(score) ~ "OUT",
      score > 90 ~ "A",
      score >= 80 ~ "B",
      score >= 70 ~ "C",
      score >= 60 ~ "D",
      TRUE ~ "F"
    )
  }

  graded <- mutate(df, across(chem1:chem4, score_to_letter))
  long_df <- pivot_longer(
    graded,
    cols = starts_with("chem"),
    names_to = "course",
    values_to = "letterGrade"
  )

  # Rename columns
  colnames(long_df) <- c("student", "course", "letterGrade")
  long_df
}

# For every entering cohort: collapse the per-term columns to one score per
# course (fXXcohort), then derive the long letter-grade table (fXXletter).
f18cohort <- returnMergedColumns(f18cohort)
f19cohort <- returnMergedColumns(f19cohort)
f20cohort <- returnMergedColumns(f20cohort)
f21cohort <- returnMergedColumns(f21cohort)
f22cohort <- returnMergedColumns(f22cohort)
f23cohort <- returnMergedColumns(f23cohort)

f18letter <- prepareAlluvial(f18cohort)
f19letter <- prepareAlluvial(f19cohort)
f20letter <- prepareAlluvial(f20cohort)
f21letter <- prepareAlluvial(f21cohort)
f22letter <- prepareAlluvial(f22cohort)
f23letter <- prepareAlluvial(f23cohort)

#f18cohort %>% select(-1) %>% arrange(chem2) %>% write.csv("f18cohort.csv", row.names = FALSE)
#f19cohort %>% select(-1) %>% arrange(chem2) %>% write.csv("f19cohort.csv", row.names = FALSE)
#f20cohort %>% select(-1) %>% arrange(chem2) %>% write.csv("f20cohort.csv", row.names = FALSE)
#f21cohort %>% select(-1) %>% arrange(chem2) %>% write.csv("f21cohort.csv", row.names = FALSE)
#write.csv(f22cohort[order(f22cohort$chem2_s23),],"f22cohort.csv")

## REPRESENT
# Alluvial plot of letter grades across courses, labelled with percentages.
#
# df:        long table with columns course, letterGrade and student.
# thisTitle: plot title.
# Returns a ggplot object; each stratum is labelled with its share (`prop`)
# formatted to one decimal place.
makeAlluvial_Cohort <- function(df, thisTitle) {
  grade_flow <- aes(
    x = course, stratum = letterGrade, alluvium = student, fill = letterGrade
  )
  share_label <- aes(label = percent(after_stat(prop), accuracy = .1))

  ggplot(df, grade_flow) +
    scale_x_discrete(expand = c(.1, .1)) +
    geom_flow() +
    geom_stratum(alpha = .5) +
    theme_minimal() +
    geom_text(stat = "stratum", share_label) +
    labs(title = thisTitle, x = "", y = "Number of students")
}
# Alluvial plot of letter grades across courses, labelled with raw counts.
#
# df:        long table with columns course, letterGrade and student.
# thisTitle: plot title.
# Returns a ggplot object; same layers as makeAlluvial_Cohort, but each
# stratum shows its `count` instead of a percentage.
makeAlluvial_Cohort2 <- function(df, thisTitle) {
  grade_flow <- aes(
    x = course, stratum = letterGrade, alluvium = student, fill = letterGrade
  )
  count_label <- aes(label = after_stat(count))

  ggplot(df, grade_flow) +
    scale_x_discrete(expand = c(.1, .1)) +
    geom_flow() +
    geom_stratum(alpha = .5) +
    theme_minimal() +
    geom_text(stat = "stratum", count_label) +
    labs(title = thisTitle, x = "", y = "Number of students")
}
# Count students per letter grade and course, with totals and percentages.
#
# df: long table with (at least) columns course and letterGrade; any existing
#     grouping is replaced.
# Returns a table with one row per letter grade plus a final "Total" row, one
# count column per course, and one "<course>%" column giving that count as a
# share of the course total.
#
# Course columns are discovered from the data rather than hard-coded, and each
# percentage is divided by the course total directly (the previous version
# divided by the column sum including the Total row and multiplied by 2).
create_summary_table <- function(df) {
  counts <- df %>%
    group_by(course, letterGrade, .drop = TRUE) %>%
    summarize(count = n(), .groups = "drop") %>%
    # Grade/course combinations with no students still get a row (count 0).
    complete(course, letterGrade, fill = list(count = 0)) %>%
    # Courses become columns.
    pivot_wider(names_from = course, values_from = count)

  course_cols <- setdiff(names(counts), "letterGrade")

  totals <- counts %>%
    summarise(across(all_of(course_cols), sum)) %>%
    mutate(letterGrade = "Total")

  bind_rows(counts, totals) %>%
    mutate(across(
      all_of(course_cols),
      ~ scales::percent(.x / totals[[cur_column()]], accuracy = 0.1),
      .names = "{.col}%"
    ))
}

# Stack the letter-grade tables of all cohorts.
allCohorts <- rbind(
  f18letter, f19letter, f20letter, f21letter, f22letter, f23letter
)
# Drop fully identical rows, keeping the last occurrence of each.
allCohorts <- allCohorts[!duplicated(allCohorts, fromLast = TRUE), ]
makeAlluvial_Cohort(allCohorts, "All cohorts")

summary_table <- create_summary_table(allCohorts)
kable(summary_table, caption = "All cohort numbers")

All cohort numbers
letterGrade	chem1	chem2	chem3	chem4	chem1%	chem2%	chem3%	chem4%
A	387	309	158	139	34.7%	27.7%	14.2%	12.5%
B	465	326	184	200	41.7%	29.2%	16.5%	17.9%
C	187	171	121	70	16.8%	15.3%	10.8%	6.3%
D	46	27	30	12	4.1%	2.4%	2.7%	1.1%
F	31	6	6	5	2.8%	0.5%	0.5%	0.4%
OUT	0	277	617	690	0.0%	24.8%	55.3%	61.8%
Total	1116	1116	1116	1116	100.0%	100.0%	100.0%	100.0%

1.1 Retention across the curriculum and across the years

# Share of a cohort that has a grade (is not "OUT") in each course.
#
# df: long table with columns course and letterGrade.
# Returns one row per course with `percentage`, the percentage (0-100) of rows
# whose letterGrade is not "OUT".
calculate_percentage <- function(df) {
  by_course <- group_by(df, course)
  # The mean of a logical vector is the fraction of TRUE values.
  summarise(by_course, percentage = mean(letterGrade != "OUT") * 100)
}

# Retention per course for every cohort.
f18_percentage <- calculate_percentage(f18letter)
f19_percentage <- calculate_percentage(f19letter)
f20_percentage <- calculate_percentage(f20letter)
f21_percentage <- calculate_percentage(f21letter)
f22_percentage <- calculate_percentage(f22letter)
f23_percentage <- calculate_percentage(f23letter)

# Stack the per-cohort tables; the argument names become the df_id labels.
allPercentage <- bind_rows(
  Fall18 = f18_percentage,
  Fall19 = f19_percentage,
  Fall20 = f20_percentage,
  Fall21 = f21_percentage,
  Fall22 = f22_percentage,
  Fall23 = f23_percentage,
  .id = "df_id"
)

# One line per cohort showing the share of students still enrolled per course.
retention_plot <- ggplot(
  allPercentage,
  aes(x = course, y = percentage, group = df_id, color = df_id)
)
retention_plot +
  geom_line() +
  geom_point() +
  labs(x = "Course", y = "% students",
       title = "Retention across the Chemistry Curriculum") +
  scale_color_discrete(name = "Cohort first semester") +
  theme_minimal()

# Wide layout: one row per course, one column per cohort.
spread_df <- pivot_wider(
  allPercentage,
  names_from = df_id,
  values_from = percentage
)

# Print the resulting table
kable(spread_df, format = "markdown",digits = 1, caption = "Percentage of the initial cohort taking the course ")

Percentage of the initial cohort taking the course
course	Fall18	Fall19	Fall20	Fall21	Fall22	Fall23
chem1	100.0	100.0	100.0	100.0	100.0	100.0
chem2	71.3	73.6	72.6	74.9	74.5	81.5
chem3	43.1	50.0	41.7	40.0	45.7	46.4
chem4	36.5	44.0	32.7	32.8	38.9	42.4

1.2 Cohort 1: Starting in Fall2018

Took:

* CHEM1 in F18
* CHEM2 in S19
* CHEM3 in F19 or later
* CHEM4 in S20 or later

# Alluvial plot (percent labels) for the Fall 2018 cohort, including students
# marked "OUT" in courses they never took.
makeAlluvial_Cohort(f18letter,"F18 all students cohort")

# Grade counts and percentages per course, printed as a captioned table.
summary_table <- create_summary_table(f18letter)
kable(summary_table,caption = "F18 numbers")

F18 numbers
letterGrade	chem1	chem2	chem3	chem4	chem1%	chem2%	chem3%	chem4%
A	90	38	18	16	49.7%	21.0%	9.9%	8.8%
B	71	50	36	34	39.2%	27.6%	19.9%	18.8%
C	13	34	19	14	7.2%	18.8%	10.5%	7.7%
D	2	5	4	2	1.1%	2.8%	2.2%	1.1%
F	5	2	1	0	2.8%	1.1%	0.6%	0.0%
OUT	0	52	103	115	0.0%	28.7%	56.9%	63.5%
Total	181	181	181	181	100.0%	100.0%	100.0%	100.0%

1.3 Cohort 2: Starting in Fall2019

Took:

* CHEM1 in F19
* CHEM2 in S20
* CHEM3 in F20 or later
* CHEM4 in S21 or later

# Alluvial plot (percent labels) for the Fall 2019 cohort, including students
# marked "OUT" in courses they never took.
makeAlluvial_Cohort(f19letter,"F19 all students cohort")

# Grade counts and percentages per course, printed as a captioned table.
summary_table <- create_summary_table(f19letter)
kable(summary_table,caption = "F19 numbers")

F19 numbers
letterGrade	chem1	chem2	chem3	chem4	chem1%	chem2%	chem3%	chem4%
A	52	29	23	13	28.6%	15.9%	12.6%	7.1%
B	95	59	31	46	52.2%	32.4%	17.0%	25.3%
C	25	39	25	19	13.7%	21.4%	13.7%	10.4%
D	7	6	10	2	3.8%	3.3%	5.5%	1.1%
F	3	1	2	0	1.6%	0.5%	1.1%	0.0%
OUT	0	48	91	102	0.0%	26.4%	50.0%	56.0%
Total	182	182	182	182	100.0%	100.0%	100.0%	100.0%

1.4 Cohort 3: Started in Fall2020

Took:

* CHEM1 in F20
* CHEM2 in S21
* CHEM3 in F21 or later
* CHEM4 in S22 or later

# Alluvial plot (percent labels) for the Fall 2020 cohort, including students
# marked "OUT" in courses they never took.
makeAlluvial_Cohort(f20letter,"F20 all students cohort")

# Grade counts and percentages per course, printed as a captioned table.
summary_table <- create_summary_table(f20letter)
kable(summary_table,caption = "F20 numbers")

F20 numbers
letterGrade	chem1	chem2	chem3	chem4	chem1%	chem2%	chem3%	chem4%
A	48	59	30	27	21.5%	26.5%	13.5%	12.1%
B	80	62	34	30	35.9%	27.8%	15.2%	13.5%
C	65	35	23	12	29.1%	15.7%	10.3%	5.4%
D	22	5	6	3	9.9%	2.2%	2.7%	1.3%
F	8	1	0	1	3.6%	0.4%	0.0%	0.4%
OUT	0	61	130	150	0.0%	27.4%	58.3%	67.3%
Total	223	223	223	223	100.0%	100.0%	100.0%	100.0%

1.5 Cohort 4: Started in Fall2021

Took:

CHEM1 in F21
CHEM2 in S22
CHEM3 in F22 or later
CHEM4 in S23

# Alluvial plot (percent labels) for the Fall 2021 cohort, including students
# marked "OUT" in courses they never took.
makeAlluvial_Cohort(f21letter,"F21 all students cohort")

# Grade counts and percentages per course, printed as a captioned table.
summary_table <- create_summary_table(f21letter)
kable(summary_table,caption = "F21 numbers")

F21 numbers
letterGrade	chem1	chem2	chem3	chem4	chem1%	chem2%	chem3%	chem4%
A	71	58	31	31	36.4%	29.7%	15.9%	15.9%
B	76	52	22	24	39.0%	26.7%	11.3%	12.3%
C	28	29	21	5	14.4%	14.9%	10.8%	2.6%
D	14	5	2	2	7.2%	2.6%	1.0%	1.0%
F	6	2	2	2	3.1%	1.0%	1.0%	1.0%
OUT	0	49	117	131	0.0%	25.1%	60.0%	67.2%
Total	195	195	195	195	100.0%	100.0%	100.0%	100.0%

1.6 Cohort 5. Started in Fall 2022

Took:

CHEM1 in F22
CHEM2 in S23
CHEM3 in F23
CHEM4 in S24

At the moment of this analysis some students dropped or have not taken ochem2 or chem4, so they may take it in the next year and the results may change

# Alluvial plot (percent labels) for the Fall 2022 cohort, including students
# marked "OUT" in courses they never took.
makeAlluvial_Cohort(f22letter,"F22 all students cohort")

# Grade counts and percentages per course, printed as a captioned table.
summary_table <- create_summary_table(f22letter)
kable(summary_table,caption = "F22 numbers")

F22 numbers
letterGrade	chem1	chem2	chem3	chem4	chem1%	chem2%	chem3%	chem4%
A	60	54	24	28	28.8%	26.0%	11.5%	13.5%
B	102	69	39	40	49.0%	33.2%	18.8%	19.2%
C	37	26	24	9	17.8%	12.5%	11.5%	4.3%
D	4	5	8	2	1.9%	2.4%	3.8%	1.0%
F	5	1	0	2	2.4%	0.5%	0.0%	1.0%
OUT	0	53	113	127	0.0%	25.5%	54.3%	61.1%
Total	208	208	208	208	100.0%	100.0%	100.0%	100.0%

1.7 The fate of C- students

If a student gets less than 74%, what advice should we give them?

# Spring 2023 chem2 students whose Unposted.Final.Score is below 74.
# which() drops rows where the score is missing instead of returning NA rows.
# NOTE(review): Cstud_s23 is not used by the summary below, which looks at the
# Fall 2018 cohort; it also reads Unposted.Final.Score while the rest of the
# analysis uses Final.Score - confirm both are intended.
Cstud_s23 <- chem2_s23[which(chem2_s23$Unposted.Final.Score < 74), ]

# For Fall 2018 cohort students who scored below 74 in chem2: how many there
# are, and their average chem3 and chem4 scores (students without a score in
# those courses are ignored in the averages).
summary_table <- f18cohort %>%
  filter(chem2 < 74) %>%
  summarize(
    students_below_74 = n(),
    avg_chem3_score = mean(chem3, na.rm = TRUE),
    avg_chem4_score = mean(chem4, na.rm = TRUE)
  )

# Print summary table
print(summary_table)

##   students_below_74 avg_chem3_score avg_chem4_score
## 1                18         75.0475            74.6

2 Only students who took the 4 semesters

Now let’s just consider only the students who took the four semesters.

There are some students who transferred in GenChem2, so there are some premed-like students that continued to Biochemistry even if they look to be “OUT” of the curriculum.

TODO: Check Biochem lists.

## PREPARE FILES
# Reduce one course/term table to login, numeric final score and a label.
#
# df:   data frame with SIS.Login.ID and Final.Score columns.
# name: label stored in the new `name` column (e.g. "chem1_f18").
# Returns a data frame with columns SIS.Login.ID, Final.Score (numeric), name.
#
# The missing-score check runs after the numeric conversion, so it also reports
# scores that could not be parsed as numbers, and the message names the column
# that is actually inspected.
select_and_add_name <- function(df, name) {
  selected_df <- df %>%
    select(SIS.Login.ID, Final.Score) %>%
    mutate(Final.Score = as.numeric(Final.Score))

  if (any(is.na(selected_df$Final.Score))) {
    print(paste("Data frame", name, "contains NA values in the 'Final.Score' column"))
  }

  # rep() keeps this valid for a table with zero rows.
  selected_df$name <- rep(name, nrow(selected_df))
  selected_df
}

# Label every course/term table with its list name, then stack them into one
# long table with one row per student and course attempt.
dfs_with_name <- Map(select_and_add_name, dfs, names(dfs))
merged_df <- bind_rows(dfs_with_name)
colnames(merged_df)[colnames(merged_df) == "SIS.Login.ID"] <- "student"

# Split the label, e.g. "chem1_f18" -> course "chem1", semester "f18".
allCohort <- merged_df %>%
  separate(name, into = c("course", "semester"), sep = "_")

# Letter grades, built from the lowest band upwards; A is strictly above 90.
# A missing score stays NA at every step.
final_score <- allCohort$Final.Score
letter <- ifelse(final_score >= 60, "D", "F")
letter <- ifelse(final_score >= 70, "C", letter)
letter <- ifelse(final_score >= 80, "B", letter)
letter <- ifelse(final_score > 90, "A", letter)
allCohort$letterGrade <- letter

# Keep exactly one row per student and course: the attempt with the highest
# score. Attempts without a score are dropped first; otherwise a single NA
# makes max() return NA and removes every attempt for that student/course.
# with_ties = FALSE keeps one row even when two attempts have the same score,
# so a student is never counted twice in a course.
allCohort <- allCohort %>%
  filter(!is.na(Final.Score)) %>%
  group_by(student, course) %>%
  slice_max(Final.Score, n = 1, with_ties = FALSE) %>%
  ungroup()

# What students took the whole sequence
# (allCohort already holds one row per student and course, so no second
# de-duplication pass is needed here).
fullFour <- allCohort %>%
  group_by(student) %>%
  filter(all(c("chem1", "chem2", "chem3", "chem4") %in% course)) %>%
  ungroup()

#Writing just in case

## REPRESENTING
# All rows (every course) for the students who took chem1 in a given semester.
#
# df:      long table with columns student, course and semester.
# thisSem: semester code of the chem1 term that defines the cohort, e.g. "f18".
# Returns the rows of `df` belonging to those students.
selectSemester <- function(df, thisSem) {
  chem1_rows <- filter(df, course == "chem1" & semester == thisSem)
  cohort_students <- unique(pull(chem1_rows, student))
  filter(df, student %in% cohort_students)
}
# Alluvial plot (percent labels) for the cohort that took chem1 in `thisSem`.
#
# df:        long table with columns student, course, semester, letterGrade.
# thisSem:   semester code of the chem1 term that defines the cohort.
# thisTitle: plot title.
# Returns a ggplot object.
#
# The plot layers were a copy of makeAlluvial_Cohort(); delegating to it keeps
# the two plots from drifting apart.
plotAlluvialCohort <- function(df, thisSem, thisTitle) {
  makeAlluvial_Cohort(selectSemester(df, thisSem), thisTitle)
}
# Alluvial plot and grade table restricted to students with a score in all
# four courses (fullFour), across every cohort.
makeAlluvial_Cohort(fullFour,"All cohorts. Students who took the four semesters")

summary_table <- create_summary_table(fullFour)
kable(summary_table,caption = "All cohort numbers. Students who took the four semesters")

All cohort numbers. Students who took the four semesters
letterGrade	chem1	chem2	chem3	chem4	chem1%	chem2%	chem3%	chem4%
A	207	196	139	135	50.4%	47.7%	33.8%	32.8%
B	177	170	165	194	43.1%	41.4%	40.1%	47.2%
C	25	42	92	67	6.1%	10.2%	22.4%	16.3%
D	2	3	15	10	0.5%	0.7%	3.6%	2.4%
F	0	0	0	5	0.0%	0.0%	0.0%	1.2%
Total	411	411	411	411	100.0%	100.0%	100.0%	100.0%

2.1 Cohort 1: Started in Fall2018

# Four-course completers whose chem1 attempt was in Fall 2018: plot and table.
plotAlluvialCohort(fullFour,"f18","AllFour: Cohort taking CHEM1 in Fall 2018")

summary_table <- create_summary_table(selectSemester(fullFour,"f18"))
kable(summary_table,caption = "All4 F18 cohort numbers")

All4 F18 cohort numbers
letterGrade	chem1	chem2	chem3	chem4	chem1%	chem2%	chem3%	chem4%
A	46	23	16	15	71.9%	35.9%	25.0%	23.4%
B	18	32	32	33	28.1%	50.0%	50.0%	51.6%
C	0	8	14	14	0.0%	12.5%	21.9%	21.9%
D	0	1	2	2	0.0%	1.6%	3.1%	3.1%
Total	64	64	64	64	100.0%	100.0%	100.0%	100.0%

2.2 Cohort 2: Started in Fall2019

# Four-course completers whose chem1 attempt was in Fall 2019: plot and table.
plotAlluvialCohort(fullFour, "f19", "AllFour: Cohort taking CHEM1 in Fall 2019")

# The caption previously read "F18" although the table is built for "f19".
summary_table <- create_summary_table(selectSemester(fullFour, "f19"))
kable(summary_table, caption = "All4 F19 cohort numbers")

All4 F19 cohort numbers
letterGrade	chem1	chem2	chem3	chem4	chem1%	chem2%	chem3%	chem4%
A	30	21	23	12	38.5%	26.9%	29.5%	15.4%
B	45	40	29	46	57.7%	51.3%	37.2%	59.0%
C	3	16	20	18	3.8%	20.5%	25.6%	23.1%
D	0	1	6	2	0.0%	1.3%	7.7%	2.6%
Total	78	78	78	78	100.0%	100.0%	100.0%	100.0%

2.3 Cohort 3: Started in Fall2020

# Four-course completers whose chem1 attempt was in Fall 2020: plot and table.
plotAlluvialCohort(fullFour,"f20","AllFour: Cohort taking CHEM1 in Fall 2020")

summary_table <- create_summary_table(selectSemester(fullFour,"f20"))
kable(summary_table,caption = "All4 F20 cohort numbers")

All4 F20 cohort numbers
letterGrade	chem1	chem2	chem3	chem4	chem1%	chem2%	chem3%	chem4%
A	27	40	24	27	37.5%	55.6%	33.3%	37.5%
B	34	25	28	29	47.2%	34.7%	38.9%	40.3%
C	10	7	17	12	13.9%	9.7%	23.6%	16.7%
D	1	0	3	3	1.4%	0.0%	4.2%	4.2%
F	0	0	0	1	0.0%	0.0%	0.0%	1.4%
Total	72	72	72	72	100.0%	100.0%	100.0%	100.0%

2.4 Cohort 4: Started in Fall2021

# Four-course completers whose chem1 attempt was in Fall 2021: plot and table.
plotAlluvialCohort(fullFour,"f21","AllFour: Cohort taking CHEM1 in Fall 2021")

summary_table <- create_summary_table(selectSemester(fullFour,"f21"))
kable(summary_table,caption = "All4 F21 cohort numbers")

All4 F21 cohort numbers
letterGrade	chem1	chem2	chem3	chem4	chem1%	chem2%	chem3%	chem4%
A	33	29	24	29	57.9%	50.9%	42.1%	50.9%
B	20	23	18	20	35.1%	40.4%	31.6%	35.1%
C	3	5	13	5	5.3%	8.8%	22.8%	8.8%
D	1	0	2	1	1.8%	0.0%	3.5%	1.8%
F	0	0	0	2	0.0%	0.0%	0.0%	3.5%
Total	57	57	57	57	100.0%	100.0%	100.0%	100.0%

2.5 Cohort 5: Started in Fall2022

# Four-course completers whose chem1 attempt was in Fall 2022: plot and table.
plotAlluvialCohort(fullFour,"f22","AllFour: Cohort taking CHEM1 in Fall 2022")

summary_table <- create_summary_table(selectSemester(fullFour,"f22"))
kable(summary_table,caption = "All4 F22 cohort numbers")

All4 F22 cohort numbers
letterGrade	chem1	chem2	chem3	chem4	chem1%	chem2%	chem3%	chem4%
A	32	38	21	28	40.5%	48.1%	26.6%	35.4%
B	41	34	37	40	51.9%	43.0%	46.8%	50.6%
C	6	6	19	8	7.6%	7.6%	24.1%	10.1%
D	0	1	2	1	0.0%	1.3%	2.5%	1.3%
F	0	0	0	2	0.0%	0.0%	0.0%	2.5%
Total	79	79	79	79	100.0%	100.0%	100.0%	100.0%

3 With Biochemistry

# Biochemistry (BIOC 3321) grade exports for Fall 2021-2023.
# The folder name contains spaces. Inside an R string they are written as
# plain spaces: "\ " is not a valid escape sequence and stops the script from
# parsing.
biochem_dir <- "~/Teaching/Grades_and_SRT/BIOC3321 Grade Spreadsheet F21 F22 F23"
chem5_f21 <- read.csv(file.path(biochem_dir, "Grades-BIOC_3321_(001)_Fall_2021.csv"))
chem5_f22 <- read.csv(file.path(biochem_dir, "Grades-BIOC_3321_(001)_Fall_2022.csv"))
chem5_f23 <- read.csv(file.path(biochem_dir, "Grades-BIOC_3321_(001)_Fall_2023.csv"))

# Drop the first row of every table, and also the last row for f21 and f22.
# NOTE(review): presumably these are non-student rows in the export; f23 keeps
# its last row - confirm that difference is intended.
chem5_f21 <- head(tail(chem5_f21, -1), -1)
chem5_f22 <- head(tail(chem5_f22, -1), -1)
chem5_f23 <- tail(chem5_f23, -1)

dfs_bio <- list(
  chem5_f21 = chem5_f21,
  chem5_f22 = chem5_f22,
  chem5_f23 = chem5_f23
)
# NOTE(review): the corrected tables stay inside dfs_bio; the code further down
# passes chem5_f21/chem5_f22/chem5_f23 directly, so this correction is not
# applied there - confirm which one is intended.
dfs_bio <- getFinalScore(dfs_bio)

#merged_df = merge(f19cohort,chem5_f21, by.x = "student", by.y = "SIS.Login.ID", all.x = TRUE)
#f19cohort$chem5 = merged_df$Final.Score
# Add each student's Biochemistry final score to a cohort table.
#
# df1:           cohort table with a `student` column holding the login ID.
# df2, df3, df4: Biochemistry tables (Fall 2021, 2022, 2023) with columns
#                SIS.Login.ID and Final.Score.
# Returns df1 with one extra numeric column `chem5`: the score from the first
# of the three terms in which the student has a usable value, NA if none.
#
# Scores are converted to numbers before they are combined. This avoids a type
# clash when one export was read as text and another as numeric, and a
# non-numeric entry in an earlier term no longer hides a valid score from a
# later term.
addBiochemStudents <- function(df1, df2, df3, df4) {
  biochem_terms <- list(chem5_f21 = df2, chem5_f22 = df3, chem5_f23 = df4)

  for (term in names(biochem_terms)) {
    term_scores <- biochem_terms[[term]] %>%
      select(SIS.Login.ID, Final.Score) %>%
      mutate(Final.Score = as.numeric(as.character(Final.Score)))
    names(term_scores)[names(term_scores) == "Final.Score"] <- term
    df1 <- left_join(df1, term_scores, by = c("student" = "SIS.Login.ID"))
  }

  df1 %>%
    mutate(chem5 = coalesce(chem5_f21, chem5_f22, chem5_f23)) %>%
    select(-chem5_f21, -chem5_f22, -chem5_f23)  # Remove intermediate columns
}
# Add each student's Biochemistry final-exam score to a cohort table.
#
# df1:           cohort table with a `student` column holding the login ID.
# df2, df3, df4: Biochemistry tables (Fall 2021, 2022, 2023) with columns
#                SIS.Login.ID and Final.Exam.Final.Score.
# Returns df1 with one extra numeric column `chem5_final`: the exam score from
# the first of the three terms in which the student has a usable value, NA if
# none.
#
# Scores are converted to numbers before they are combined. This avoids a type
# clash when one export was read as text and another as numeric, and a
# non-numeric entry in an earlier term no longer hides a valid score from a
# later term.
addFinalExamBiochemStudents <- function(df1, df2, df3, df4) {
  biochem_terms <- list(
    chem5_f21_final = df2,
    chem5_f22_final = df3,
    chem5_f23_final = df4
  )

  for (term in names(biochem_terms)) {
    term_scores <- biochem_terms[[term]] %>%
      select(SIS.Login.ID, Final.Exam.Final.Score) %>%
      mutate(
        Final.Exam.Final.Score = as.numeric(as.character(Final.Exam.Final.Score))
      )
    names(term_scores)[names(term_scores) == "Final.Exam.Final.Score"] <- term
    df1 <- left_join(df1, term_scores, by = c("student" = "SIS.Login.ID"))
  }

  df1 %>%
    mutate(
      chem5_final = coalesce(chem5_f21_final, chem5_f22_final, chem5_f23_final)
    ) %>%
    select(-chem5_f21_final, -chem5_f22_final, -chem5_f23_final)  # Remove intermediate columns
}

# Attach the Biochemistry course score (chem5) and final-exam score
# (chem5_final) to the Fall 2019, 2020 and 2021 cohort tables.
f19cohort <- addBiochemStudents(f19cohort, chem5_f21, chem5_f22, chem5_f23)
f20cohort <- addBiochemStudents(f20cohort, chem5_f21, chem5_f22, chem5_f23)
f21cohort <- addBiochemStudents(f21cohort, chem5_f21, chem5_f22, chem5_f23)

f19cohort <- addFinalExamBiochemStudents(f19cohort, chem5_f21, chem5_f22, chem5_f23)
f20cohort <- addFinalExamBiochemStudents(f20cohort, chem5_f21, chem5_f22, chem5_f23)
f21cohort <- addFinalExamBiochemStudents(f21cohort, chem5_f21, chem5_f22, chem5_f23)

# Keep only students with a value in every column (all four courses plus both
# Biochemistry scores).
f19complete <- f19cohort[complete.cases(f19cohort), ]
f20complete <- f20cohort[complete.cases(f20cohort), ]
f21complete <- f21cohort[complete.cases(f21cohort), ]

# Shorten the login to the part before the mail domain. fixed = TRUE matches
# the text literally; as a regular expression the "." would match any character.
f19complete$student <- sub("@umn.edu", "", f19complete$student, fixed = TRUE)
f20complete$student <- sub("@umn.edu", "", f20complete$student, fixed = TRUE)
f21complete$student <- sub("@umn.edu", "", f21complete$student, fixed = TRUE)

3.1 Scatter plot matrix

#install.packages("GGally")

# Scatter-plot matrix per cohort. Given how the cohort tables are built earlier
# in this file, columns 2:5 are chem1..chem4 and column 7 is chem5_final
# (column 6, chem5, is left out). `p` is overwritten by each cohort in turn.
p <- ggpairs(f19complete[, c(2:5,7)], title = "Fall19 cohort vs Biochem final exam")
print(p)

p <- ggpairs(f20complete[, c(2:5,7)], title = "Fall20 cohort vs Biochem final exam")
print(p)

p <- ggpairs(f21complete[, c(2:5,7)], title = "Fall21 cohort vs Biochem final exam")
print(p)

3.2 Multiple linear regressions

library(plotly)

# Fit chem5_final ~ chem1 + chem2 + chem3 + chem4 and plot predicted against
# actual Biochemistry final-exam scores.
#
# f19complete: cohort table with columns student, chem1, chem2, chem3, chem4
#              and chem5_final (any cohort, despite the parameter name).
# mytitle:     plot title.
# Returns an interactive plotly version of the plot; hovering over a point
# shows the `student` value. The fitted equation and R² are drawn on the plot.
getMultipleLinear <- function(f19complete, mytitle) {
  # Fit the model
  model <- lm(chem5_final ~ chem1 + chem2 + chem3 + chem4, data = f19complete)

  # Extract coefficients and R2
  coeffs <- coef(model)
  r2 <- summary(model)$r.squared

  # Create the equation as a string
  equation <- paste("Biochem final = ", round(coeffs[1], 2), " + ",
                    round(coeffs[2], 2), "*Chem1 + ",
                    round(coeffs[3], 2), "*Chem2 + ",
                    round(coeffs[4], 2), "*Chem3 + ",
                    round(coeffs[5], 2), "*Chem4")

  # Predict for every row of the input. predict(model) alone returns only the
  # rows lm() kept, which no longer lines up with the table when rows with
  # missing values were dropped from the fit.
  f19complete$predicted <- predict(model, newdata = f19complete)

  # Anchor points for the two text annotations; missing values are ignored.
  lowest_predicted <- min(f19complete$predicted, na.rm = TRUE)
  highest_actual <- max(f19complete$chem5_final, na.rm = TRUE)

  # Predicted vs Actual plot; the red line marks a perfect prediction.
  p <- ggplot(f19complete, aes(x = predicted, y = chem5_final, text = student)) +
    geom_point() +
    geom_abline(intercept = 0, slope = 1, color = 'red') +
    labs(x = 'Predicted Biochem final score', y = 'Actual Biochem final score', title = mytitle) +
    theme_minimal() +
    theme(plot.margin = unit(c(1, 1, 1, 4), "lines")) +
    annotate("text", x = lowest_predicted + 15, y = highest_actual,
             label = equation, hjust = 0, vjust = 1, size = 2, angle = 0) +
    annotate("text", x = lowest_predicted + 5, y = highest_actual - 5,
             label = paste("R² = ", round(r2, 3)), hjust = 0, vjust = 1, size = 3)

  # Convert the ggplot object to a plotly object for interactivity
  ggplotly(p, tooltip = "text")
}

# One regression plot per cohort; each call fits its own model on the table
# passed in.
getMultipleLinear(f19complete,"Fall19 cohort. Multiple regression")

getMultipleLinear(f20complete,"Fall20 cohort. Multiple regression")

getMultipleLinear(f21complete,"Fall21 cohort. Multiple regression")

3.3 Correlation Heat map

#install.packages("ggcorrplot")
library(ggcorrplot)

# Calculate correlation matrix between the selected columns of the Fall 2019
# cohort (positions 2:5 and 7: chem1..chem4 and chem5_final, given how the
# table is built earlier in this file).
corr_matrix <- cor(f19complete[, c(2:5,7)])

# Plot heatmap with the correlation values printed in the cells (lab = TRUE)
ggcorrplot(corr_matrix, lab = TRUE, title = "Fall 19 cohort - Biochem final")

3.4 Parallel coordinates

# Static parallel-coordinates plot of the Fall 2019 cohort: one line per
# student (groupColumn = 1 is the student column), legend hidden.
ggparcoord(f19complete, columns = c(2:5,7), groupColumn = 1, scale = "globalminmax") +
  theme_minimal()+
  ggtitle("Fall 19 cohort")+
  theme(legend.position = "none")

# Create the parallel coordinates plot
# (same data and columns as above, stored in `p` for the interactive version)
p <- ggparcoord(f19complete, columns = c(2:5,7), groupColumn = 1, scale = "globalminmax") +
  ggtitle("Interactive Fall 19 cohort") +
  theme_minimal() +
  theme(legend.position = "none") # Remove the legend

#install.packages("plotly")
library(plotly)
# Convert the ggplot object to a plotly object for interactivity
p_interactive <- ggplotly(p)

# Display the interactive plot
p_interactive

3.5 3D scatter plots

# 3D scatter of the Fall 2019 cohort: chem2, chem3 and chem4 on the axes,
# points coloured by the Biochemistry final-exam score.
plot_ly(f19complete, x = ~chem2, y = ~chem3, z = ~chem4, color = ~chem5_final, type = 'scatter3d', mode = 'markers') %>%
        layout(title = 'Fall 19 cohort')

3.6 Heatmap of coefficients

#install.packages("coefplot")
library(coefplot)

# Fit the model
# (same formula as in getMultipleLinear, on the Fall 2019 cohort; `model` is
# stored as a top-level object)
model <- lm(chem5_final ~ chem1 + chem2 + chem3 + chem4, data = f19complete)

# Plot coefficients
coefplot(model)

3.7 Line plot with facets

library(reshape2)

# Melt the dataframe
# Long format: one row per student and score column. Columns 1, 2:5 and 7 are
# student, chem1..chem4 and chem5_final, given how the table is built earlier
# in this file.
df_melt <- melt(f19complete[,c(1,2:5,7)], id.vars = 'student', variable.name = 'chem', value.name = 'Score')

# Line plot with facets
# One panel per student, each with its own y-axis range (scales = 'free_y').
ggplot(df_melt, aes(x = chem, y = Score, group = student)) +
  geom_line() +
  facet_wrap(~ student, scales = 'free_y') +
  theme_minimal()

Comparing Cohorts in Curriculum

Xavier Prat-Resina

2025-04-24