##READING FILES
chem1_f18 = read.csv("~/Teaching/Grades_and_SRT/Fall2018/chem1331_f18.csv",header = TRUE)
chem1_f19 = read.csv("~/Teaching/Grades_and_SRT/Fall2019/chem1331_f19_grades_canvas.csv",header = TRUE)
chem1_f20 = read.csv("~/Teaching/Grades_and_SRT/Fall2020/chem1331_f20_grades_canvas.csv",header = TRUE)
chem1_f21 = read.csv("~/Teaching/Grades_and_SRT/Fall2021/chem1331_f21_grades_canvas.csv",header = TRUE)
chem1_f22 = read.csv("~/Teaching/Grades_and_SRT/Fall2022/chem1331_f22_grades_canvas.csv",header = TRUE)
chem1_f23 = read.csv("~/Teaching/Grades_and_SRT/Fall2023/chem1331_f23_grades_canvas.csv",header = TRUE)
chem1_f24 = read.csv("~/Teaching/Grades_and_SRT/Fall2024/chem1331_f24_grades_canvas.csv",header = TRUE)
chem2_s19 = read.csv("~/Teaching/Grades_and_SRT/Spring2019/chem1333_s19.csv", header = TRUE)
chem2_s20 = read.csv("~/Teaching/Grades_and_SRT/Spring2020/chem1333_s20_grades_canvas.csv", header = TRUE)
chem2_s21 = read.csv("~/Teaching/Grades_and_SRT/Spring2021/chem1333_s21_grades_canvas.csv", header = TRUE)
chem2_s22 = read.csv("~/Teaching/Grades_and_SRT/Spring2022/chem1333_s22_grades_canvas.csv", header = TRUE)
chem2_s23 = read.csv("~/Teaching/Grades_and_SRT/Spring2023/chem1333_s23_grades_canvas.csv", header = TRUE)
chem2_s24 = read.csv("~/Teaching/Grades_and_SRT/Spring2024/chem1333_s24_grades_canvas.csv", header = TRUE)
chem3_f19 = read.csv("~/Teaching/Grades_and_SRT/Fall2019/chem2231_f19_grades_canvas.csv",header = TRUE)
chem3_f20 = read.csv("~/Teaching/Grades_and_SRT/Fall2020/chem2131_f20_grades_canvas.csv",header = TRUE)
chem3_f21 = read.csv("~/Teaching/Grades_and_SRT/Fall2021/chem2131_f21_grades_canvas.csv",header = TRUE)
chem3_f22 = read.csv("~/Teaching/Grades_and_SRT/Fall2022/chem2131_f22_grades_canvas.csv",header = TRUE)
chem3_f23 = read.csv("~/Teaching/Grades_and_SRT/Fall2023/chem2131_f23_grades_canvas.csv",header = TRUE)
chem3_f24 = read.csv("~/Teaching/Grades_and_SRT/Fall2024/chem2131_f24_grades_canvas.csv",header = TRUE)
chem4_s20 = read.csv("~/Teaching/Grades_and_SRT/Spring2020/chem2333_s20_grades_canvas.csv", header = TRUE)
chem4_s21 = read.csv("~/Teaching/Grades_and_SRT/Spring2021/chem2335_s21_grades_canvas.csv", header = TRUE)
chem4_s22 = read.csv("~/Teaching/Grades_and_SRT/Spring2022/chem2335_s22_grades_canvas.csv", header = TRUE)
chem4_s23 = read.csv("~/Teaching/Grades_and_SRT/Spring2023/chem2335_s23_grades_canvas.csv", header = TRUE)
chem4_s24 = read.csv("~/Teaching/Grades_and_SRT/Spring2024/chem2335_s24_grades_canvas.csv", header = TRUE)
chem4_s25 = read.csv("~/Teaching/Grades_and_SRT/Spring2025/chem2335_s25_grades_tempo_canvas.csv", header = TRUE)
#we need to merge the summer courses, ngm-f23 and chem1330 into the corresponding cohorts. Just add the id columns and the last 4 columns for grades
chem0_f23 = read.csv("~/Teaching/Grades_and_SRT/Fall2023/chem1330_f23_grades_canvas.csv",header = TRUE)
chem0_f24 = read.csv("~/Teaching/Grades_and_SRT/Fall2024/chem1330_f24_grades_canvas.csv",header = TRUE)
chem1_ngm_f23 = read.csv("~/Teaching/Grades_and_SRT/Fall2023/chem1331_f23_ngm_grades_canvas.csv",header = TRUE)
dfs = list(
chem1_f18 = chem1_f18,
chem2_s19 = chem2_s19,
chem3_f19 = chem3_f19,
chem4_s20 = chem4_s20,
chem1_f19 = chem1_f19,
chem2_s20 = chem2_s20,
chem3_f20 = chem3_f20,
chem4_s21 = chem4_s21,
chem1_f20 = chem1_f20,
chem2_s21 = chem2_s21,
chem3_f21 = chem3_f21,
chem4_s22 = chem4_s22,
chem1_f21 = chem1_f21,
chem2_s22 = chem2_s22,
chem3_f22 = chem3_f22,
chem4_s23 = chem4_s23,
chem1_f22 = chem1_f22,
chem2_s23 = chem2_s23,
chem3_f23 = chem3_f23,
chem4_s24 = chem4_s24,
chem1_f23 = chem1_f23,
chem2_s24 = chem2_s24,
chem3_f24 = chem3_f24,
chem4_s25 = chem4_s25
)
getFinalScore = function(dfs){
#For some reason, some Final.Score is lower than Current.Score. Using the highest
for (i in seq_along(dfs)) {
df <- dfs[[i]]
didItChange=FALSE
times=0
for (j in 1:nrow(df)) {
if (df[j, "Current.Score"] > df[j, "Final.Score"]) {
df[j, "Final.Score"] <- df[j, "Current.Score"]
didItChange=TRUE
times=times+1
}
}
#if (didItChange){ print(paste("The dataframe",names(dfs)[i]," Current and Final score are not the same in",times,"students"))}
dfs[[i]] <- df
}
return(dfs)
}
dfs = getFinalScore(dfs)
We started teaching the new curriculum in Fall 2018. The first cohort (f18) finished in Spring 2020
Let’s consider how students flew through the curriculum and how they did. Some students postpone taking Chem3 or Chem4, we are taking their grade into account even if they didn’t take the course during their sophomore year.
###PREPARING FILES
combined_df <- dfs %>%
map(~ select(.x, SIS.Login.ID, Final.Score)) %>%
reduce(full_join, by = "SIS.Login.ID")
whos_that_student <- dfs %>%
map(~ select(.x,Student, SIS.Login.ID, Final.Score)) %>%
reduce(full_join, by = "SIS.Login.ID")
# Rename the Final.Score columns with the names of the original dataframes
names(combined_df)[-1] <- names(dfs)
names(whos_that_student)[-1] <- names(dfs)
# Rename the SIS.Login.ID column to student
names(combined_df)[1] <- "student"
f18cohort <- combined_df %>%
filter(!is.na(chem1_f18))
f19cohort <- combined_df %>%
filter(!is.na(chem1_f19))
f20cohort <- combined_df %>%
filter(!is.na(chem1_f20))
f21cohort <- combined_df %>%
filter(!is.na(chem1_f21))
f22cohort <- combined_df %>%
filter(!is.na(chem1_f22))
f23cohort <- combined_df %>%
filter(!is.na(chem1_f23))
merge_columns <- function(df, prefix) {
columns <- grep(paste0("^", prefix), names(df), value = TRUE)
merged_column <- ifelse(rowSums(!is.na(df[columns])) == 0, NA,
do.call(pmax, c(df[columns], na.rm = TRUE)))
return(merged_column)
}
returnMergedColumns = function(df){
# Apply the function to merge chem1, chem2, chem3, and chem4 columns
df <- df %>%
mutate(
chem1 = merge_columns(., "chem1"),
chem2 = merge_columns(., "chem2"),
chem3 = merge_columns(., "chem3"),
chem4 = merge_columns(., "chem4")
) %>%
select(student, chem1, chem2, chem3, chem4)
return(df)
}
prepareAlluvial = function(df){
new_df <- df %>%
mutate_at(vars(chem1:chem4), ~case_when(
is.na(.) ~ "OUT",
. > 90 ~ "A",
. >= 80 & . <= 90 ~ "B",
. >= 70 & . < 80 ~ "C",
. >= 60 & . < 70 ~ "D",
TRUE ~ "F"
))
long_df <- pivot_longer(new_df, cols = starts_with("chem"), names_to = "course", values_to = "letterGrade")
# Rename columns
colnames(long_df) <- c("student", "course", "letterGrade")
return(long_df)
}
f18cohort = returnMergedColumns(f18cohort)
f18letter = prepareAlluvial(f18cohort)
f19cohort = returnMergedColumns(f19cohort)
f19letter = prepareAlluvial(f19cohort)
f20cohort = returnMergedColumns(f20cohort)
f20letter = prepareAlluvial(f20cohort)
f21cohort = returnMergedColumns(f21cohort)
f21letter = prepareAlluvial(f21cohort)
f22cohort = returnMergedColumns(f22cohort)
f22letter = prepareAlluvial(f22cohort)
f23cohort = returnMergedColumns(f23cohort)
f23letter = prepareAlluvial(f23cohort)
#f18cohort %>% select(-1) %>% arrange(chem2) %>% write.csv("f18cohort.csv", row.names = FALSE)
#f19cohort %>% select(-1) %>% arrange(chem2) %>% write.csv("f19cohort.csv", row.names = FALSE)
#f20cohort %>% select(-1) %>% arrange(chem2) %>% write.csv("f20cohort.csv", row.names = FALSE)
#f21cohort %>% select(-1) %>% arrange(chem2) %>% write.csv("f21cohort.csv", row.names = FALSE)
#write.csv(f22cohort[order(f22cohort$chem2_s23),],"f22cohort.csv")
## REPRESENT
makeAlluvial_Cohort = function(df,thisTitle){
ggplot(df,
aes(x = course, stratum = letterGrade, alluvium = student, fill = letterGrade)) +
scale_x_discrete(expand = c(.1, .1)) +
geom_flow() +
geom_stratum(alpha = .5) +
theme_minimal() +
geom_text(stat = "stratum",
aes(label = percent(after_stat(prop), accuracy = .1)))+
labs(title = thisTitle, x = "", y = "Number of students")
}
makeAlluvial_Cohort2 = function(df,thisTitle){
ggplot(df,
aes(x = course, stratum = letterGrade, alluvium = student, fill = letterGrade)) +
scale_x_discrete(expand = c(.1, .1)) +
geom_flow() +
geom_stratum(alpha = .5) +
theme_minimal() +
geom_text(stat = "stratum",
aes(label = after_stat(count) ))+
labs(title = thisTitle, x = "", y = "Number of students")
}
create_summary_table <- function(df) {
summary_table <- df %>%
group_by(course, letterGrade, .drop = TRUE) %>%
summarize(count = n(), .groups = 'drop') %>%
ungroup() %>%
complete(course, letterGrade, fill = list(count = 0))
# Pivot the data to have courses as columns
summary_table <- summary_table %>%
pivot_wider(names_from = course, values_from = count)
df_sum <- summary_table %>%
bind_rows(
data.frame(
letterGrade = "Total",
chem1 = sum(.$chem1),
chem2 = sum(.$chem2),
chem3 = sum(.$chem3),
chem4 = sum(.$chem4)
)
)
# Calculate percentages and add "%" character
df_percent <- df_sum %>%
mutate(across(starts_with("chem"), ~ scales::percent(./sum(.)*2 , accuracy = 0.1), .names = "{col}%"))
return(df_percent)
}
allCohorts = rbind(f18letter,f19letter)
allCohorts = rbind(allCohorts,f20letter)
allCohorts = rbind(allCohorts,f21letter)
allCohorts = rbind(allCohorts,f22letter)
allCohorts = rbind(allCohorts,f23letter)
allCohorts = allCohorts[!duplicated(allCohorts, fromLast = TRUE),]
makeAlluvial_Cohort(allCohorts,"All cohorts")
summary_table <- create_summary_table(allCohorts)
kable(summary_table,caption = "All cohort numbers")
letterGrade | chem1 | chem2 | chem3 | chem4 | chem1% | chem2% | chem3% | chem4% |
---|---|---|---|---|---|---|---|---|
A | 387 | 309 | 158 | 139 | 34.7% | 27.7% | 14.2% | 12.5% |
B | 465 | 326 | 184 | 200 | 41.7% | 29.2% | 16.5% | 17.9% |
C | 187 | 171 | 121 | 70 | 16.8% | 15.3% | 10.8% | 6.3% |
D | 46 | 27 | 30 | 12 | 4.1% | 2.4% | 2.7% | 1.1% |
F | 31 | 6 | 6 | 5 | 2.8% | 0.5% | 0.5% | 0.4% |
OUT | 0 | 277 | 617 | 690 | 0.0% | 24.8% | 55.3% | 61.8% |
Total | 1116 | 1116 | 1116 | 1116 | 100.0% | 100.0% | 100.0% | 100.0% |
calculate_percentage <- function(df) {
df %>%
mutate(not_out = ifelse(letterGrade != "OUT", 1, 0)) %>%
group_by(course) %>%
summarise(
raw_count = sum(not_out),
percentage = mean(not_out) * 100
)
}
# Apply the function to each dataframe
f18_percentage <- calculate_percentage(f18letter)
f19_percentage <- calculate_percentage(f19letter)
f20_percentage <- calculate_percentage(f20letter)
f21_percentage <- calculate_percentage(f21letter)
f22_percentage <- calculate_percentage(f22letter)
f23_percentage <- calculate_percentage(f23letter)
allPercentage <- bind_rows(f18_percentage,
f19_percentage,
f20_percentage,
f21_percentage,
f22_percentage,
f23_percentage,
.id = "df_id"
)%>%
mutate(df_id = case_when(
df_id == "1" ~ "Fall18",
df_id == "2" ~ "Fall19",
df_id == "3" ~ "Fall20",
df_id == "4" ~ "Fall21",
df_id == "5" ~ "Fall22",
df_id == "6" ~ "Fall23",
TRUE ~ df_id # Keep other values unchanged
))
ggplot(allPercentage, aes(x = course, y = percentage, group = df_id, color = df_id)) +
geom_line() +
geom_point() +
labs(x = "Course", y = "% students",
title = "Retention across the Chemistry Curriculum") +
scale_color_discrete(name = "Cohort first semester") +
theme_minimal()
allPercentage_wide <- allPercentage %>%
mutate(raw_pct = paste0(raw_count, " (", round(percentage, 1), "%)")) %>%
select(course, df_id, raw_pct) %>%
pivot_wider(
id_cols = course,
names_from = df_id,
values_from = raw_pct
)
# Print the resulting table
kable(allPercentage_wide, format = "markdown",digits = 1, caption = "Raw and percent numbers of students throughout the curriculum since 2018")
course | Fall18 | Fall19 | Fall20 | Fall21 | Fall22 | Fall23 |
---|---|---|---|---|---|---|
chem1 | 181 (100%) | 182 (100%) | 223 (100%) | 195 (100%) | 208 (100%) | 151 (100%) |
chem2 | 129 (71.3%) | 134 (73.6%) | 162 (72.6%) | 146 (74.9%) | 155 (74.5%) | 123 (81.5%) |
chem3 | 78 (43.1%) | 91 (50%) | 93 (41.7%) | 78 (40%) | 95 (45.7%) | 70 (46.4%) |
chem4 | 66 (36.5%) | 80 (44%) | 73 (32.7%) | 64 (32.8%) | 81 (38.9%) | 64 (42.4%) |
all_letters <- bind_rows(
f18letter, f19letter, f20letter,
f21letter, f22letter, f23letter,
.id = "semester"
)
# Step 2: Reshape wide so each course is its own column
wide_all_letters <- all_letters %>%
pivot_wider(names_from = course, values_from = letterGrade)
make_transition_table <- function(data, from_course, to_course) {
transition <- data %>%
count(.data[[from_course]], .data[[to_course]]) %>%
group_by(.data[[from_course]]) %>%
mutate(percentage = 100 * n / sum(n)) %>%
ungroup() %>%
select(-n) %>%
pivot_wider(
names_from = !!sym(to_course),
values_from = percentage,
values_fill = 0
) %>%
rename(!!from_course := 1) %>%
mutate(across(-1, ~ paste0(sprintf("%.1f", .x), "%")))
return(transition)
}
kable(make_transition_table(wide_all_letters, "chem1", "chem2"),format = "markdown",caption = "Percent chance to transition from CHEM1 to CHEM2 - All cohorts")
chem1 | A | B | C | D | OUT | F |
---|---|---|---|---|---|---|
A | 59.2% | 24.1% | 3.1% | 0.3% | 13.3% | 0.0% |
B | 15.8% | 41.8% | 19.4% | 2.3% | 20.5% | 0.2% |
C | 2.1% | 18.8% | 32.3% | 6.8% | 37.5% | 2.6% |
D | 0.0% | 5.9% | 15.7% | 3.9% | 72.5% | 2.0% |
F | 0.0% | 0.0% | 0.0% | 0.0% | 100.0% | 0.0% |
A way to read the above table is that 59.2% of students who got an A in chem1 will get an A in chem2. Notice that the lower the grade in CHEM1 the higher the chance to be OUT.
kable(make_transition_table(wide_all_letters, "chem2", "chem3"),format = "markdown",caption = "Percent chance to transition from CHEM2 to CHEM3 - All cohorts")
chem2 | A | B | C | D | OUT | F |
---|---|---|---|---|---|---|
A | 45.2% | 24.8% | 1.6% | 0.3% | 28.1% | 0.0% |
B | 5.4% | 30.2% | 23.6% | 3.6% | 36.6% | 0.6% |
C | 0.6% | 4.6% | 21.8% | 10.3% | 60.9% | 1.7% |
D | 0.0% | 3.7% | 7.4% | 0.0% | 85.2% | 3.7% |
F | 0.0% | 0.0% | 0.0% | 0.0% | 100.0% | 0.0% |
OUT | 0.0% | 0.0% | 0.0% | 0.0% | 100.0% | 0.0% |
kable(make_transition_table(wide_all_letters, "chem3", "chem4"),format = "markdown",caption = "Percent chance to transition from CHEM3 to CHEM4 - All cohorts")
chem3 | A | B | C | OUT | D | F |
---|---|---|---|---|---|---|
A | 60.4% | 25.8% | 1.3% | 12.6% | 0.0% | 0.0% |
B | 19.4% | 57.0% | 11.8% | 10.8% | 0.5% | 0.5% |
C | 1.6% | 39.0% | 29.3% | 24.4% | 4.1% | 1.6% |
D | 3.2% | 3.2% | 22.6% | 51.6% | 12.9% | 6.5% |
F | 0.0% | 0.0% | 0.0% | 100.0% | 0.0% | 0.0% |
OUT | 0.6% | 0.9% | 0.5% | 97.6% | 0.3% | 0.0% |
Took: * CHEM1 in F18 * CHEM2 in S19 * CHEM3 in F19 or later * CHEM4 in S20 or later
letterGrade | chem1 | chem2 | chem3 | chem4 | chem1% | chem2% | chem3% | chem4% |
---|---|---|---|---|---|---|---|---|
A | 90 | 38 | 18 | 16 | 49.7% | 21.0% | 9.9% | 8.8% |
B | 71 | 50 | 36 | 34 | 39.2% | 27.6% | 19.9% | 18.8% |
C | 13 | 34 | 19 | 14 | 7.2% | 18.8% | 10.5% | 7.7% |
D | 2 | 5 | 4 | 2 | 1.1% | 2.8% | 2.2% | 1.1% |
F | 5 | 2 | 1 | 0 | 2.8% | 1.1% | 0.6% | 0.0% |
OUT | 0 | 52 | 103 | 115 | 0.0% | 28.7% | 56.9% | 63.5% |
Total | 181 | 181 | 181 | 181 | 100.0% | 100.0% | 100.0% | 100.0% |
Took: * CHEM1 in F19 * CHEM2 in S20 * CHEM3 in F20 or later * CHEM4 in S21 or later
letterGrade | chem1 | chem2 | chem3 | chem4 | chem1% | chem2% | chem3% | chem4% |
---|---|---|---|---|---|---|---|---|
A | 52 | 29 | 23 | 13 | 28.6% | 15.9% | 12.6% | 7.1% |
B | 95 | 59 | 31 | 46 | 52.2% | 32.4% | 17.0% | 25.3% |
C | 25 | 39 | 25 | 19 | 13.7% | 21.4% | 13.7% | 10.4% |
D | 7 | 6 | 10 | 2 | 3.8% | 3.3% | 5.5% | 1.1% |
F | 3 | 1 | 2 | 0 | 1.6% | 0.5% | 1.1% | 0.0% |
OUT | 0 | 48 | 91 | 102 | 0.0% | 26.4% | 50.0% | 56.0% |
Total | 182 | 182 | 182 | 182 | 100.0% | 100.0% | 100.0% | 100.0% |
Took: * CHEM1 in F20 * CHEM2 in S21 * CHEM3 in F21 or later * CHEM4 in S22 or later
letterGrade | chem1 | chem2 | chem3 | chem4 | chem1% | chem2% | chem3% | chem4% |
---|---|---|---|---|---|---|---|---|
A | 48 | 59 | 30 | 27 | 21.5% | 26.5% | 13.5% | 12.1% |
B | 80 | 62 | 34 | 30 | 35.9% | 27.8% | 15.2% | 13.5% |
C | 65 | 35 | 23 | 12 | 29.1% | 15.7% | 10.3% | 5.4% |
D | 22 | 5 | 6 | 3 | 9.9% | 2.2% | 2.7% | 1.3% |
F | 8 | 1 | 0 | 1 | 3.6% | 0.4% | 0.0% | 0.4% |
OUT | 0 | 61 | 130 | 150 | 0.0% | 27.4% | 58.3% | 67.3% |
Total | 223 | 223 | 223 | 223 | 100.0% | 100.0% | 100.0% | 100.0% |
Took:
letterGrade | chem1 | chem2 | chem3 | chem4 | chem1% | chem2% | chem3% | chem4% |
---|---|---|---|---|---|---|---|---|
A | 71 | 58 | 31 | 31 | 36.4% | 29.7% | 15.9% | 15.9% |
B | 76 | 52 | 22 | 24 | 39.0% | 26.7% | 11.3% | 12.3% |
C | 28 | 29 | 21 | 5 | 14.4% | 14.9% | 10.8% | 2.6% |
D | 14 | 5 | 2 | 2 | 7.2% | 2.6% | 1.0% | 1.0% |
F | 6 | 2 | 2 | 2 | 3.1% | 1.0% | 1.0% | 1.0% |
OUT | 0 | 49 | 117 | 131 | 0.0% | 25.1% | 60.0% | 67.2% |
Total | 195 | 195 | 195 | 195 | 100.0% | 100.0% | 100.0% | 100.0% |
Took:
letterGrade | chem1 | chem2 | chem3 | chem4 | chem1% | chem2% | chem3% | chem4% |
---|---|---|---|---|---|---|---|---|
A | 60 | 54 | 24 | 28 | 28.8% | 26.0% | 11.5% | 13.5% |
B | 102 | 69 | 39 | 40 | 49.0% | 33.2% | 18.8% | 19.2% |
C | 37 | 26 | 24 | 9 | 17.8% | 12.5% | 11.5% | 4.3% |
D | 4 | 5 | 8 | 2 | 1.9% | 2.4% | 3.8% | 1.0% |
F | 5 | 1 | 0 | 2 | 2.4% | 0.5% | 0.0% | 1.0% |
OUT | 0 | 53 | 113 | 127 | 0.0% | 25.5% | 54.3% | 61.1% |
Total | 208 | 208 | 208 | 208 | 100.0% | 100.0% | 100.0% | 100.0% |
Took:
At the moment of this analysis some students dropped or have not taken ochem2 or chem4, so they may take it in the next year and the results may change
letterGrade | chem1 | chem2 | chem3 | chem4 | chem1% | chem2% | chem3% | chem4% |
---|---|---|---|---|---|---|---|---|
A | 69 | 72 | 33 | 24 | 45.7% | 47.7% | 21.9% | 15.9% |
B | 50 | 39 | 24 | 28 | 33.1% | 25.8% | 15.9% | 18.5% |
C | 24 | 11 | 11 | 11 | 15.9% | 7.3% | 7.3% | 7.3% |
D | 2 | 1 | 1 | 1 | 1.3% | 0.7% | 0.7% | 0.7% |
F | 6 | 0 | 1 | 0 | 4.0% | 0.0% | 0.7% | 0.0% |
OUT | 0 | 28 | 81 | 87 | 0.0% | 18.5% | 53.6% | 57.6% |
Total | 151 | 151 | 151 | 151 | 100.0% | 100.0% | 100.0% | 100.0% |
If a student gets less than 74%, what advise should we give them?
Cstud_s23 = chem2_s23[which(chem2_s23$Unposted.Final.Score < 74),]
Cstud_s23 = Cstud_s23
summary_table <- f18cohort %>%
filter(chem2 < 74) %>%
summarize(
students_below_74 = n(),
avg_chem3_score = mean(chem3, na.rm = TRUE),
avg_chem4_score = mean(chem4, na.rm = TRUE)
)
# Print summary table
print(summary_table)
## students_below_74 avg_chem3_score avg_chem4_score
## 1 18 75.0475 74.6
Now let’s just consider only the students who took the four semesters.
There are some students who transferred in GenChem2, so there are some premed-like students that continued to Biochemistry even if they look to be “OUT” of the curriculum.
TODO: Check Biochem lists.
## PREPARE FILES
select_and_add_name <- function(df, name) {
if (any(is.na(df$Final.Score))) {
print(paste("Data frame", name, "contains NA values in the 'grade' column"))
}
selected_df <- df %>%
#select(ID, Final.Score) %>%
select(SIS.Login.ID, Final.Score) %>%
mutate(Final.Score = as.numeric(Final.Score)) # Convert "grade" to numeric type
selected_df$name <- name
return(selected_df)
}
# Apply the function to each dataframe in the list
dfs_with_name <- Map(select_and_add_name, dfs, names(dfs))
# Merge all data frames into one
merged_df <- bind_rows(dfs_with_name)
names(merged_df)[names(merged_df)=="SIS.Login.ID"] = "student"
allCohort <- separate(merged_df, name, into = c("course", "semester"), sep = "_")
allCohort$letterGrade = ifelse(allCohort$Final.Score > 90, "A",
ifelse(allCohort$Final.Score >= 80, "B",
ifelse(allCohort$Final.Score >= 70, "C",
ifelse(allCohort$Final.Score >= 60, "D", "F"))))
# filter just in case they took it twice, just take the highest score
allCohort <- allCohort %>%
group_by(student, course) %>%
filter(Final.Score == max(Final.Score))
# What students took the whole sequence
fullFour <- allCohort %>%
group_by(student) %>%
filter(all(c("chem1", "chem2", "chem3", "chem4") %in% course))
# filter just in case they took it twice, just take the highest score
fullFour <- fullFour %>%
group_by(student, course) %>%
filter(Final.Score == max(Final.Score))
#Writing just in case
## REPRESENTING
selectSemester = function(df,thisSem){
selected_students <- df %>%
filter(course == "chem1" & semester == thisSem) %>%
pull(student) %>%
unique()
# Filter the dataframe to include all rows for selected students
thisSem_df <- df %>%
filter(student %in% selected_students)
return(thisSem_df)
}
plotAlluvialCohort = function(df,thisSem,thisTitle){
thisSem_df = selectSemester(df,thisSem)
ggplot(thisSem_df,
aes(x = course, stratum = letterGrade, alluvium = student, fill = letterGrade)) +
scale_x_discrete(expand = c(.1, .1)) +
geom_flow() +
geom_stratum(alpha = .5) +
theme_minimal() +
geom_text(stat = "stratum",
aes(label = percent(after_stat(prop), accuracy = .1)))+
labs(title = thisTitle, x = "", y = "Number of students")
}
makeAlluvial_Cohort(fullFour,"All cohorts. Students who took the four semesters")
summary_table <- create_summary_table(fullFour)
kable(summary_table,caption = "All cohort numbers. Students who took the four semesters")
letterGrade | chem1 | chem2 | chem3 | chem4 | chem1% | chem2% | chem3% | chem4% |
---|---|---|---|---|---|---|---|---|
A | 207 | 196 | 139 | 135 | 50.4% | 47.7% | 33.8% | 32.8% |
B | 177 | 170 | 165 | 194 | 43.1% | 41.4% | 40.1% | 47.2% |
C | 25 | 42 | 92 | 67 | 6.1% | 10.2% | 22.4% | 16.3% |
D | 2 | 3 | 15 | 10 | 0.5% | 0.7% | 3.6% | 2.4% |
F | 0 | 0 | 0 | 5 | 0.0% | 0.0% | 0.0% | 1.2% |
Total | 411 | 411 | 411 | 411 | 100.0% | 100.0% | 100.0% | 100.0% |
summary_table <- create_summary_table(selectSemester(fullFour,"f18"))
kable(summary_table,caption = "All4 F18 cohort numbers")
letterGrade | chem1 | chem2 | chem3 | chem4 | chem1% | chem2% | chem3% | chem4% |
---|---|---|---|---|---|---|---|---|
A | 46 | 23 | 16 | 15 | 71.9% | 35.9% | 25.0% | 23.4% |
B | 18 | 32 | 32 | 33 | 28.1% | 50.0% | 50.0% | 51.6% |
C | 0 | 8 | 14 | 14 | 0.0% | 12.5% | 21.9% | 21.9% |
D | 0 | 1 | 2 | 2 | 0.0% | 1.6% | 3.1% | 3.1% |
Total | 64 | 64 | 64 | 64 | 100.0% | 100.0% | 100.0% | 100.0% |
summary_table <- create_summary_table(selectSemester(fullFour,"f19"))
kable(summary_table,caption = "All4 F18 cohort numbers")
letterGrade | chem1 | chem2 | chem3 | chem4 | chem1% | chem2% | chem3% | chem4% |
---|---|---|---|---|---|---|---|---|
A | 30 | 21 | 23 | 12 | 38.5% | 26.9% | 29.5% | 15.4% |
B | 45 | 40 | 29 | 46 | 57.7% | 51.3% | 37.2% | 59.0% |
C | 3 | 16 | 20 | 18 | 3.8% | 20.5% | 25.6% | 23.1% |
D | 0 | 1 | 6 | 2 | 0.0% | 1.3% | 7.7% | 2.6% |
Total | 78 | 78 | 78 | 78 | 100.0% | 100.0% | 100.0% | 100.0% |
summary_table <- create_summary_table(selectSemester(fullFour,"f20"))
kable(summary_table,caption = "All4 F20 cohort numbers")
letterGrade | chem1 | chem2 | chem3 | chem4 | chem1% | chem2% | chem3% | chem4% |
---|---|---|---|---|---|---|---|---|
A | 27 | 40 | 24 | 27 | 37.5% | 55.6% | 33.3% | 37.5% |
B | 34 | 25 | 28 | 29 | 47.2% | 34.7% | 38.9% | 40.3% |
C | 10 | 7 | 17 | 12 | 13.9% | 9.7% | 23.6% | 16.7% |
D | 1 | 0 | 3 | 3 | 1.4% | 0.0% | 4.2% | 4.2% |
F | 0 | 0 | 0 | 1 | 0.0% | 0.0% | 0.0% | 1.4% |
Total | 72 | 72 | 72 | 72 | 100.0% | 100.0% | 100.0% | 100.0% |
summary_table <- create_summary_table(selectSemester(fullFour,"f21"))
kable(summary_table,caption = "All4 F21 cohort numbers")
letterGrade | chem1 | chem2 | chem3 | chem4 | chem1% | chem2% | chem3% | chem4% |
---|---|---|---|---|---|---|---|---|
A | 33 | 29 | 24 | 29 | 57.9% | 50.9% | 42.1% | 50.9% |
B | 20 | 23 | 18 | 20 | 35.1% | 40.4% | 31.6% | 35.1% |
C | 3 | 5 | 13 | 5 | 5.3% | 8.8% | 22.8% | 8.8% |
D | 1 | 0 | 2 | 1 | 1.8% | 0.0% | 3.5% | 1.8% |
F | 0 | 0 | 0 | 2 | 0.0% | 0.0% | 0.0% | 3.5% |
Total | 57 | 57 | 57 | 57 | 100.0% | 100.0% | 100.0% | 100.0% |
summary_table <- create_summary_table(selectSemester(fullFour,"f22"))
kable(summary_table,caption = "All4 F22 cohort numbers")
letterGrade | chem1 | chem2 | chem3 | chem4 | chem1% | chem2% | chem3% | chem4% |
---|---|---|---|---|---|---|---|---|
A | 32 | 38 | 21 | 28 | 40.5% | 48.1% | 26.6% | 35.4% |
B | 41 | 34 | 37 | 40 | 51.9% | 43.0% | 46.8% | 50.6% |
C | 6 | 6 | 19 | 8 | 7.6% | 7.6% | 24.1% | 10.1% |
D | 0 | 1 | 2 | 1 | 0.0% | 1.3% | 2.5% | 1.3% |
F | 0 | 0 | 0 | 2 | 0.0% | 0.0% | 0.0% | 2.5% |
Total | 79 | 79 | 79 | 79 | 100.0% | 100.0% | 100.0% | 100.0% |
summary_table <- create_summary_table(selectSemester(fullFour,"f23"))
kable(summary_table,caption = "All4 F23 cohort numbers")
letterGrade | chem1 | chem2 | chem3 | chem4 | chem1% | chem2% | chem3% | chem4% |
---|---|---|---|---|---|---|---|---|
A | 39 | 45 | 31 | 24 | 63.9% | 73.8% | 50.8% | 39.3% |
B | 19 | 16 | 21 | 26 | 31.1% | 26.2% | 34.4% | 42.6% |
C | 3 | 0 | 9 | 10 | 4.9% | 0.0% | 14.8% | 16.4% |
D | 0 | 0 | 0 | 1 | 0.0% | 0.0% | 0.0% | 1.6% |
Total | 61 | 61 | 61 | 61 | 100.0% | 100.0% | 100.0% | 100.0% |
#discover demographics
discover = read.csv("~/Teaching/Grades_and_SRT/discover_chem1331_f18-f23_.csv",header = TRUE)
discf18 = discover[which(discover$Semester == "Fall 2018"),]
discf19 = discover[which(discover$Semester == "Fall 2019"),]
discf20 = discover[which(discover$Semester == "Fall 2020"),]
discf21 = discover[which(discover$Semester == "Fall 2021"),]
discf22 = discover[which(discover$Semester == "Fall 2022"),]
discf23 = discover[which(discover$Semester == "Fall 2023"),]
library(dplyr)
# Filter out blank or NA values in DEM_Sex
cleaned <- discover %>%
filter(!is.na(DEM_Sex), DEM_Sex %in% c("F", "M"))
ggplot(cleaned, aes(x = Semester, fill = DEM_Sex)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(x = "Semester", y = "Proportion", fill = "Sex") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Create contingency table
semester_table <- table(cleaned$Semester, cleaned$DEM_Sex)
# Run chi-square test
chisq.test(semester_table)
##
## Pearson's Chi-squared test
##
## data: semester_table
## X-squared = 3.996, df = 5, p-value = 0.55
cleaned <- discover %>%
filter(!is.na(DEM_Student.of.Color), DEM_Student.of.Color %in% c("Y", "N"))
ggplot(cleaned, aes(x = Semester, fill = DEM_Student.of.Color)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(x = "Semester", y = "Proportion", fill = "Student.of.Color") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Create contingency table
semester_table <- table(cleaned$Semester, cleaned$DEM_Student.of.Color)
# Run chi-square test
chisq.test(semester_table)
##
## Pearson's Chi-squared test
##
## data: semester_table
## X-squared = 7.8073, df = 5, p-value = 0.1672
# Replace empty or NA values with "Missing"
plot_data <- discover %>%
filter(DEM_Sex %in% c("F", "M")) %>%
mutate(
DEM_First.Generation = ifelse(
is.na(DEM_First.Generation) | DEM_First.Generation == "",
"Missing",
DEM_First.Generation
)
)
# Plot
ggplot(plot_data, aes(x = Semester, fill = DEM_First.Generation)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(x = "Semester", y = "Proportion", fill = "First Gen") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
chi_data <- discover %>%
filter(DEM_First.Generation %in% c("Y", "N"))
# Contingency table
firstgen_table <- table(chi_data$Semester, chi_data$DEM_First.Generation)
# Chi-square test
chi_square_result = chisq.test(firstgen_table)
chi_square_result
##
## Pearson's Chi-squared test
##
## data: firstgen_table
## X-squared = 21.616, df = 5, p-value = 0.0006192
library(corrplot)
residuals <- chi_square_result$residuals
# Normalize the residuals to make them suitable for plotting
normalized_residuals <- residuals / sqrt(chi_square_result$expected)
corrplot(normalized_residuals, is.corr = FALSE, method = "circle", tl.col = "black", tl.srt = 45,mar = c(3, 3, 3, 3))
# Filter for non-missing GPA values
gpa_data <- discover %>%
filter(!is.na(DEM_HS_GPA_scale_unknown))
# Boxplot
ggplot(gpa_data, aes(x = Semester, y = DEM_HS_GPA_scale_unknown)) +
geom_boxplot() +
labs(x = "Semester", y = "High School GPA") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Run ANOVA
anova_result <- aov(DEM_HS_GPA_scale_unknown ~ Semester, data = gpa_data)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## Semester 5 1.49 0.2980 2.53 0.028 *
## Residuals 602 70.92 0.1178
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
gpa_summary <- discover %>%
group_by(Semester) %>%
summarise(
total = n(),
non_missing = sum(!is.na(DEM_HS_GPA_scale_unknown)),
percent_available = round(100 * non_missing / total, 1),
mean_gpa = mean(DEM_HS_GPA_scale_unknown, na.rm = TRUE),
median_gpa = median(DEM_HS_GPA_scale_unknown, na.rm = TRUE),
sd_gpa = sd(DEM_HS_GPA_scale_unknown, na.rm = TRUE)
)
library(kableExtra)
gpa_summary %>%
arrange(Semester) %>%
kable(digits = 2, caption = "GPA Summary by Semester") %>%
kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover"))
Semester | total | non_missing | percent_available | mean_gpa | median_gpa | sd_gpa |
---|---|---|---|---|---|---|
Fall 2018 | 181 | 10 | 5.5 | 3.40 | 3.50 | 0.25 |
Fall 2019 | 182 | 153 | 84.1 | 3.65 | 3.74 | 0.34 |
Fall 2020 | 229 | 199 | 86.9 | 3.63 | 3.72 | 0.34 |
Fall 2021 | 195 | 162 | 83.1 | 3.58 | 3.69 | 0.38 |
Fall 2022 | 208 | 11 | 5.3 | 3.53 | 3.59 | 0.30 |
Fall 2023 | 161 | 73 | 45.3 | 3.70 | 3.70 | 0.29 |
gpa_fg <- discover %>%
filter(DEM_First.Generation %in% c("Y", "N"),
!is.na(DEM_HS_GPA_scale_unknown))
ttest_result <- t.test(DEM_HS_GPA_scale_unknown ~ DEM_First.Generation, data = gpa_fg)
p_value <- ttest_result$p.value
ggplot(gpa_fg, aes(x = DEM_First.Generation, y = DEM_HS_GPA_scale_unknown, fill = DEM_First.Generation)) +
geom_violin(trim = FALSE, alpha = 0.5) +
geom_boxplot(width = 0.1, outlier.shape = NA, alpha = 0.8) +
labs(x = "First Generation (Y/N)", y = "High School GPA") +
theme_minimal() +
theme(legend.position = "none")
gpa_fg %>%
group_by(DEM_First.Generation) %>%
summarise(
count = n(),
mean_gpa = mean(DEM_HS_GPA_scale_unknown),
sd_gpa = sd(DEM_HS_GPA_scale_unknown),
median_gpa = median(DEM_HS_GPA_scale_unknown)
) %>%
mutate(`t-test p-value` = ifelse(row_number() == 1, round(p_value, 4), "")) %>%
kable(digits = 2, caption = "GPA by First Generation Status with t-test") %>%
kable_styling(full_width = FALSE)
DEM_First.Generation | count | mean_gpa | sd_gpa | median_gpa | t-test p-value |
---|---|---|---|---|---|
N | 179 | 3.61 | 0.37 | 3.70 | 0.476 |
Y | 242 | 3.58 | 0.35 | 3.65 |
For each semester
library(dplyr)
library(tidyr)
library(broom)
library(kableExtra)
# Filter usable data
gpa_by_semester <- discover %>%
filter(DEM_First.Generation %in% c("Y", "N"),
!is.na(DEM_HS_GPA_scale_unknown))
# Run only on semesters with >=2 values in both Y and N groups
gpa_comparison <- gpa_by_semester %>%
group_by(Semester) %>%
filter(sum(DEM_First.Generation == "Y") >= 2,
sum(DEM_First.Generation == "N") >= 2) %>%
group_modify(~ {
data = .
test = t.test(DEM_HS_GPA_scale_unknown ~ DEM_First.Generation, data = data)
tidy_test = broom::tidy(test)
summary_stats <- data %>%
group_by(DEM_First.Generation) %>%
summarise(
count = n(),
mean = mean(DEM_HS_GPA_scale_unknown),
sd = sd(DEM_HS_GPA_scale_unknown),
.groups = "drop"
) %>%
pivot_wider(
names_from = DEM_First.Generation,
values_from = c(count, mean, sd),
names_glue = "{.value}_{DEM_First.Generation}"
)
bind_cols(summary_stats, tibble(
p_value = tidy_test$p.value
))
}) %>%
ungroup()
# Print table
gpa_comparison %>%
mutate(p_value = round(p_value, 4)) %>%
kable(
digits = 2,
caption = "GPA by First Generation Status for Each Semester (≥2 students in each group)"
) %>%
kable_styling(full_width = FALSE)
Semester | count_N | count_Y | mean_N | mean_Y | sd_N | sd_Y | p_value |
---|---|---|---|---|---|---|---|
Fall 2019 | 28 | 86 | 3.58 | 3.66 | 0.38 | 0.35 | 0.38 |
Fall 2020 | 54 | 54 | 3.57 | 3.54 | 0.37 | 0.32 | 0.64 |
Fall 2021 | 64 | 76 | 3.58 | 3.55 | 0.38 | 0.39 | 0.69 |
Fall 2022 | 3 | 5 | 3.62 | 3.52 | 0.07 | 0.42 | 0.64 |
Fall 2023 | 25 | 20 | 3.84 | 3.54 | 0.28 | 0.26 | 0.00 |
gpa_fg <- discover %>%
filter(DEM_Student.of.Color %in% c("Y", "N"),
!is.na(DEM_HS_GPA_scale_unknown))
ttest_result <- t.test(DEM_HS_GPA_scale_unknown ~ DEM_Student.of.Color, data = gpa_fg)
p_value <- ttest_result$p.value
ggplot(gpa_fg, aes(x = DEM_Student.of.Color, y = DEM_HS_GPA_scale_unknown, fill = DEM_Student.of.Color)) +
geom_violin(trim = FALSE, alpha = 0.5) +
geom_boxplot(width = 0.1, outlier.shape = NA, alpha = 0.8) +
labs(x = "Student of Color (Y/N)", y = "High School GPA") +
theme_minimal() +
theme(legend.position = "none")
gpa_fg %>%
group_by(DEM_Student.of.Color) %>%
summarise(
count = n(),
mean_gpa = mean(DEM_HS_GPA_scale_unknown),
sd_gpa = sd(DEM_HS_GPA_scale_unknown),
median_gpa = median(DEM_HS_GPA_scale_unknown)
) %>%
mutate(`t-test p-value` = ifelse(row_number() == 1, round(p_value, 4), "")) %>%
kable(digits = 2, caption = "GPA by Student of Color Status with t-test") %>%
kable_styling(full_width = FALSE)
DEM_Student.of.Color | count | mean_gpa | sd_gpa | median_gpa | t-test p-value |
---|---|---|---|---|---|
N | 376 | 3.67 | 0.32 | 3.74 | 1e-04 |
Y | 232 | 3.56 | 0.37 | 3.59 |
# Filter usable data
gpa_by_semester <- discover %>%
filter(DEM_Student.of.Color %in% c("Y", "N"),
!is.na(DEM_HS_GPA_scale_unknown))
# Run only on semesters with >=2 values in both Y and N groups
gpa_comparison <- gpa_by_semester %>%
group_by(Semester) %>%
filter(sum(DEM_Student.of.Color == "Y") >= 2,
sum(DEM_Student.of.Color == "N") >= 2) %>%
group_modify(~ {
data = .
test = t.test(DEM_HS_GPA_scale_unknown ~ DEM_Student.of.Color, data = data)
tidy_test = broom::tidy(test)
summary_stats <- data %>%
group_by(DEM_Student.of.Color) %>%
summarise(
count = n(),
mean = mean(DEM_HS_GPA_scale_unknown),
sd = sd(DEM_HS_GPA_scale_unknown),
.groups = "drop"
) %>%
pivot_wider(
names_from = DEM_Student.of.Color,
values_from = c(count, mean, sd),
names_glue = "{.value}_{DEM_Student.of.Color}"
)
bind_cols(summary_stats, tibble(
p_value = tidy_test$p.value
))
}) %>%
ungroup()
# Print table
gpa_comparison %>%
mutate(p_value = round(p_value, 4)) %>%
kable(
digits = 2,
caption = "GPA by Student of Color Status for Each Semester (≥2 students in each group)"
) %>%
kable_styling(full_width = FALSE)
Semester | count_N | count_Y | mean_N | mean_Y | sd_N | sd_Y | p_value |
---|---|---|---|---|---|---|---|
Fall 2018 | 3 | 7 | 3.40 | 3.40 | 0.19 | 0.29 | 0.98 |
Fall 2019 | 103 | 50 | 3.70 | 3.55 | 0.30 | 0.39 | 0.02 |
Fall 2020 | 124 | 75 | 3.67 | 3.57 | 0.33 | 0.35 | 0.04 |
Fall 2021 | 95 | 67 | 3.61 | 3.54 | 0.36 | 0.40 | 0.23 |
Fall 2022 | 5 | 6 | 3.48 | 3.57 | 0.31 | 0.31 | 0.62 |
Fall 2023 | 46 | 27 | 3.75 | 3.62 | 0.25 | 0.34 | 0.11 |
gpa_fg <- discover %>%
filter(DEM_Sex %in% c("M", "F"),
!is.na(DEM_HS_GPA_scale_unknown))
ttest_result <- t.test(DEM_HS_GPA_scale_unknown ~ DEM_Sex, data = gpa_fg)
p_value <- ttest_result$p.value
ggplot(gpa_fg, aes(x = DEM_Sex, y = DEM_HS_GPA_scale_unknown, fill = DEM_Sex)) +
geom_violin(trim = FALSE, alpha = 0.5) +
geom_boxplot(width = 0.1, outlier.shape = NA, alpha = 0.8) +
labs(x = "Student of Color (Y/N)", y = "High School GPA") +
theme_minimal() +
theme(legend.position = "none")
gpa_fg %>%
group_by(DEM_Sex) %>%
summarise(
count = n(),
mean_gpa = mean(DEM_HS_GPA_scale_unknown),
sd_gpa = sd(DEM_HS_GPA_scale_unknown),
median_gpa = median(DEM_HS_GPA_scale_unknown)
) %>%
mutate(`t-test p-value` = ifelse(row_number() == 1, round(p_value, 4), "")) %>%
kable(digits = 2, caption = "GPA by Sex with t-test") %>%
kable_styling(full_width = FALSE)
DEM_Sex | count | mean_gpa | sd_gpa | median_gpa | t-test p-value |
---|---|---|---|---|---|
F | 489 | 3.65 | 0.34 | 3.72 | 2e-04 |
M | 119 | 3.51 | 0.37 | 3.58 |
# Filter usable data
gpa_by_semester <- discover %>%
filter(DEM_Sex %in% c("M", "F"),
!is.na(DEM_HS_GPA_scale_unknown))
# Run only on semesters with >=2 values in both Y and N groups
gpa_comparison <- gpa_by_semester %>%
group_by(Semester) %>%
filter(sum(DEM_Sex == "M") >= 2,
sum(DEM_Sex == "F") >= 2) %>%
group_modify(~ {
data = .
test = t.test(DEM_HS_GPA_scale_unknown ~ DEM_Sex, data = data)
tidy_test = broom::tidy(test)
summary_stats <- data %>%
group_by(DEM_Sex) %>%
summarise(
count = n(),
mean = mean(DEM_HS_GPA_scale_unknown),
sd = sd(DEM_HS_GPA_scale_unknown),
.groups = "drop"
) %>%
pivot_wider(
names_from = DEM_Sex,
values_from = c(count, mean, sd),
names_glue = "{.value}_{DEM_Sex}"
)
bind_cols(summary_stats, tibble(
p_value = tidy_test$p.value
))
}) %>%
ungroup()
# Print table
gpa_comparison %>%
mutate(p_value = round(p_value, 4)) %>%
kable(
digits = 2,
caption = "GPA by Sex for Each Semester (≥2 students in each group)"
) %>%
kable_styling(full_width = FALSE)
Semester | count_F | count_M | mean_F | mean_M | sd_F | sd_M | p_value |
---|---|---|---|---|---|---|---|
Fall 2018 | 5 | 5 | 3.48 | 3.32 | 0.19 | 0.31 | 0.37 |
Fall 2019 | 116 | 37 | 3.69 | 3.54 | 0.31 | 0.38 | 0.03 |
Fall 2020 | 167 | 32 | 3.65 | 3.57 | 0.33 | 0.40 | 0.30 |
Fall 2021 | 136 | 26 | 3.61 | 3.41 | 0.37 | 0.38 | 0.02 |
Fall 2022 | 8 | 3 | 3.57 | 3.41 | 0.33 | 0.20 | 0.37 |
Fall 2023 | 57 | 16 | 3.73 | 3.60 | 0.31 | 0.21 | 0.06 |
# ACT Math
discover %>%
filter(!is.na(DEM_ACT.MATH)) %>%
ggplot(aes(x = Semester, y = DEM_ACT.MATH)) +
geom_boxplot(fill = "#2c7fb8", alpha = 0.7) +
labs(title = "ACT Math Score by Semester", y = "ACT Math", x = "Semester") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# ACT Composite
discover %>%
filter(!is.na(DEM_ACT.Composite)) %>%
ggplot(aes(x = Semester, y = DEM_ACT.Composite)) +
geom_boxplot(fill = "#7fcdbb", alpha = 0.7) +
labs(title = "ACT Composite Score by Semester", y = "ACT Composite", x = "Semester") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Function to summarize and test
summarize_act <- function(data, variable) {
total_n <- nrow(data)
data_valid <- data %>%
filter(!is.na(.data[[variable]]))
# Summary by semester
summary_tbl <- data_valid %>%
group_by(Semester) %>%
summarise(
count = n(),
mean = mean(.data[[variable]], na.rm = TRUE),
sd = sd(.data[[variable]], na.rm = TRUE),
.groups = "drop"
)
# ANOVA
aov_result <- aov(as.formula(paste(variable, "~ Semester")), data = data_valid)
p_val <- tidy(aov_result)$p.value[1]
# Add cohort % and p-value
summary_tbl %>%
mutate(
percent_present = round(100 * count / total_n, 1),
p_value = round(p_val, 4)
)
}
# Run for both ACT variables
act_math_tbl <- summarize_act(discover, "DEM_ACT.MATH")
act_comp_tbl <- summarize_act(discover, "DEM_ACT.Composite")
# ACT Math Table
act_math_tbl %>%
kable(digits = 2, caption = "ACT Math by Semester with ANOVA and Coverage %") %>%
kable_styling(full_width = FALSE)
Semester | count | mean | sd | percent_present | p_value |
---|---|---|---|---|---|
Fall 2018 | 165 | 24.31 | 3.47 | 14.3 | 0.02 |
Fall 2019 | 167 | 23.96 | 3.64 | 14.4 | 0.02 |
Fall 2020 | 211 | 23.44 | 4.05 | 18.3 | 0.02 |
Fall 2021 | 107 | 23.19 | 3.98 | 9.3 | 0.02 |
Fall 2022 | 95 | 23.04 | 4.20 | 8.2 | 0.02 |
Fall 2023 | 64 | 22.66 | 4.73 | 5.5 | 0.02 |
# ACT Composite Table
act_comp_tbl %>%
kable(digits = 2, caption = "ACT Composite by Semester with ANOVA and Coverage %") %>%
kable_styling(full_width = FALSE)
Semester | count | mean | sd | percent_present | p_value |
---|---|---|---|---|---|
Fall 2018 | 167 | 24.11 | 3.22 | 14.4 | 0.51 |
Fall 2019 | 167 | 23.98 | 3.58 | 14.4 | 0.51 |
Fall 2020 | 211 | 23.56 | 4.02 | 18.3 | 0.51 |
Fall 2021 | 107 | 23.50 | 4.11 | 9.3 | 0.51 |
Fall 2022 | 95 | 23.41 | 4.47 | 8.2 | 0.51 |
Fall 2023 | 65 | 23.38 | 4.23 | 5.6 | 0.51 |
chem5_f21 = read.csv("~/Teaching/Grades_and_SRT/BIOC3321\ Grade\ Spreadsheet\ F21\ F22\ F23/Grades-BIOC_3321_(001)_Fall_2021.csv")
chem5_f22 = read.csv("~/Teaching/Grades_and_SRT/BIOC3321\ Grade\ Spreadsheet\ F21\ F22\ F23/Grades-BIOC_3321_(001)_Fall_2022.csv")
chem5_f23 = read.csv("~/Teaching/Grades_and_SRT/BIOC3321\ Grade\ Spreadsheet\ F21\ F22\ F23/Grades-BIOC_3321_(001)_Fall_2023.csv")
chem5_f21 = head(tail(chem5_f21, -1), -1)
chem5_f22 = head(tail(chem5_f22, -1), -1)
chem5_f23 = tail(chem5_f23, -1)
dfs_bio = list(
chem5_f21 = chem5_f21,
chem5_f22 = chem5_f22,
chem5_f23 = chem5_f23
)
dfs_bio = getFinalScore(dfs_bio)
#merged_df = merge(f19cohort,chem5_f21, by.x = "student", by.y = "SIS.Login.ID", all.x = TRUE)
#f19cohort$chem5 = merged_df$Final.Score
addBiochemStudents = function(df1,df2,df3,df4){
#df1 is the original cohort, df2,3, and 4 are the biochem
df1 <- df1 %>%
left_join(select(df2, SIS.Login.ID, Final.Score), by = c("student" = "SIS.Login.ID")) %>%
rename(chem5_f21 = Final.Score)
df1 <- df1 %>%
left_join(select(df3, SIS.Login.ID, Final.Score), by = c("student" = "SIS.Login.ID")) %>%
rename(chem5_f22 = Final.Score)
df1 <- df1 %>%
left_join(select(df4, SIS.Login.ID, Final.Score), by = c("student" = "SIS.Login.ID")) %>%
rename(chem5_f23 = Final.Score)
df1 <- df1 %>%
mutate(chem5 = coalesce(chem5_f21, chem5_f22, chem5_f23)) %>%
select(-chem5_f21, -chem5_f22, -chem5_f23) # Remove intermediate columns
df1$chem5 = as.numeric(df1$chem5)
return(df1)
}
addFinalExamBiochemStudents = function(df1,df2,df3,df4){
#df1 is the original cohort, df2,3, and 4 are the biochem
df1 <- df1 %>%
left_join(select(df2, SIS.Login.ID, Final.Exam.Final.Score), by = c("student" = "SIS.Login.ID")) %>%
rename(chem5_f21_final = Final.Exam.Final.Score)
df1 <- df1 %>%
left_join(select(df3, SIS.Login.ID, Final.Exam.Final.Score), by = c("student" = "SIS.Login.ID")) %>%
rename(chem5_f22_final = Final.Exam.Final.Score)
df1 <- df1 %>%
left_join(select(df4, SIS.Login.ID, Final.Exam.Final.Score), by = c("student" = "SIS.Login.ID")) %>%
rename(chem5_f23_final = Final.Exam.Final.Score)
df1 <- df1 %>%
mutate(chem5_final = coalesce(chem5_f21_final, chem5_f22_final, chem5_f23_final)) %>%
select(-chem5_f21_final, -chem5_f22_final, -chem5_f23_final) # Remove intermediate columns
df1$chem5_final = as.numeric(df1$chem5_final)
return(df1)
}
f19cohort = addBiochemStudents(f19cohort,chem5_f21,chem5_f22,chem5_f23)
f20cohort = addBiochemStudents(f20cohort,chem5_f21,chem5_f22,chem5_f23)
f21cohort = addBiochemStudents(f21cohort,chem5_f21,chem5_f22,chem5_f23)
f19cohort = addFinalExamBiochemStudents(f19cohort,chem5_f21,chem5_f22,chem5_f23)
f20cohort = addFinalExamBiochemStudents(f20cohort,chem5_f21,chem5_f22,chem5_f23)
f21cohort = addFinalExamBiochemStudents(f21cohort,chem5_f21,chem5_f22,chem5_f23)
f19complete = f19cohort[complete.cases(f19cohort),]
f20complete = f20cohort[complete.cases(f20cohort),]
f21complete = f21cohort[complete.cases(f21cohort),]
f19complete$student = sub("@umn.edu","",f19complete$student)
f20complete$student = sub("@umn.edu","",f20complete$student)
f21complete$student = sub("@umn.edu","",f21complete$student)