# Load the student (UMR) and expert survey results and split each by survey.
# Both CSV file names below are relative to this working directory.
setwd("~/Research/02b Neural Network Research UMR/Data + Analysis/Clustering_Xavier")
umr = read.csv("UMR_all_for_R_with_courses.csv",header = TRUE)

# Relabel 'Biochem 1' according to the term it was collected in, so that the
# Fall 2022 and Fall 2021 offerings become separate courses (BiocF22 / BiocF21).
umr[which(umr$Term_collected == "Fall2022"),]$Course_collected = gsub('Biochem 1', 'BiocF22', umr[which(umr$Term_collected == "Fall2022"),]$Course_collected)
umr[which(umr$Term_collected == "Fall2021"),]$Course_collected = gsub('Biochem 1', 'BiocF21', umr[which(umr$Term_collected == "Fall2021"),]$Course_collected)

# One data frame of student rows per survey.
umrs1 = umr[which(umr$Survey=="ES_Chemical_Reaction"),]
umrs2 = umr[which(umr$Survey=="ES_Glucosidase"),]
umrs3 = umr[which(umr$Survey=="Nucleic_Acids"),]
umrs4 = umr[which(umr$Survey=="Oxygen_Binding"),]
umrs5 = umr[which(umr$Survey=="Protein_Structure"),]

# One data frame of expert rows per survey.
# NOTE(review): two expert labels differ from the student labels above:
# "ES_Chemical_Equation" (students: "ES_Chemical_Reaction") and
# "Protein_Strcuture" (students: "Protein_Structure"). A label that does not
# occur in expert$Survey silently yields a 0-row data frame -- confirm both
# spellings against the Survey column of Experts_all_for_R.csv.
expert = read.csv("Experts_all_for_R.csv",header = TRUE)
exs1 = expert[which(expert$Survey=="ES_Chemical_Equation"),]
exs2 = expert[which(expert$Survey=="ES_Glucosidase"),]
exs3 = expert[which(expert$Survey=="Nucleic_Acids"),]
exs4 = expert[which(expert$Survey=="Oxygen_Binding"),]
exs5 = expert[which(expert$Survey=="Protein_Strcuture"),]

library(psych)

# Prepare the student rows of one survey and assign every student to a
# performance cluster based on the (PLC, NS) pair.
#
# Args:
#   umrs1: data frame with (at least) the columns Institution,
#          Course_collected, Deidentifier, Sex_birth, Race_ethnicity,
#          Coherency, NS, actual_year and PLC.
#
# Returns:
#   A data frame with those nine columns (Coherency, NS and PLC coerced to
#   numeric; Course_collected turned into a factor with a fixed level order)
#   plus, in this order:
#     race_binary   - "White" when Race_ethnicity is "White/Caucasian",
#                     otherwise "Non-white"
#     cluster       - k-means cluster id (1..3)
#     clusterLetter - "HP" for the cluster with the highest mean PLC, "LP" for
#                     the lowest, "IP" for the remaining one
analyzeUMRCourses = function(umrs1){
  keepCols = c("Institution", "Course_collected", "Deidentifier", "Sex_birth",
               "Race_ethnicity", "Coherency", "NS", "actual_year", "PLC")
  allBiochem = umrs1[, keepCols]
  allBiochem$Coherency = as.numeric(allBiochem$Coherency)
  allBiochem$NS = as.numeric(allBiochem$NS)
  allBiochem$PLC = as.numeric(allBiochem$PLC)
  allBiochem$race_binary <- ifelse(allBiochem$Race_ethnicity == "White/Caucasian" , 'White', "Non-white")

  # k-means starts from random centres; the fixed seed makes the assignment
  # reproducible from run to run.
  set.seed(42)
  df <- matrix(data = c(allBiochem$PLC, allBiochem$NS), ncol = 2)
  allBiochem$cluster = kmeans(scale(df), 3)$cluster

  # k-means numbers its clusters arbitrarily, so identify them by mean PLC:
  # highest mean -> HP, lowest mean -> LP, whatever is left -> IP.
  # The ids are read from the tapply() names (the actual cluster labels), and
  # the intermediate cluster is obtained by set difference rather than by
  # arithmetic on the two other ids.
  meanPLCbyCluster = tapply(allBiochem$PLC, allBiochem$cluster, mean, na.rm = TRUE)
  clusterIds = as.integer(names(meanPLCbyCluster))
  HPgroup = clusterIds[which.max(meanPLCbyCluster)]
  LPgroup = clusterIds[which.min(meanPLCbyCluster)]
  IPgroup = setdiff(clusterIds, c(HPgroup, LPgroup))
  allBiochem$clusterLetter = ifelse(allBiochem$cluster == HPgroup, "HP",
                                    ifelse(allBiochem$cluster == LPgroup, "LP",
                                           ifelse(allBiochem$cluster %in% IPgroup, "IP", "Oops")))

  # Fixed course order for plots and tables; a course name that is not listed
  # here becomes NA.
  allBiochem$Course_collected = factor(allBiochem$Course_collected,levels = c(
    "Gen + Organic 1","O Chem 1","O Chem 2","Gen Chem 2","BiocF21","BiocF22","Biochem 2"))
  return(allBiochem)
}

# Print six summary tables of PLC, one per grouping variable: cluster letter,
# institution, actual year, course, sex and binary race.
#
# Args:
#   allBiochem: data frame with a PLC column and the grouping columns
#               clusterLetter, Institution, actual_year, Course_collected,
#               Sex_birth and race_binary.
buildTables = function(allBiochem){
  # Columns of the describeBy() matrix output that are displayed.
  shownCols = c(2, 4:12)

  # Describe PLC within each level of one grouping column and print the table.
  printPLCTable = function(groupValues, tableCaption){
    stats = describeBy(allBiochem$PLC, groupValues, mat = TRUE, digits = 2)
    print(knitr::kable(stats[, shownCols], caption = tableCaption))
  }

  printPLCTable(allBiochem$clusterLetter, "PLC by cluster group")
  printPLCTable(allBiochem$Institution, "PLC by institution")
  printPLCTable(allBiochem$actual_year, "PLC by Actual Year")
  printPLCTable(allBiochem$Course_collected, "PLC by course")
  printPLCTable(allBiochem$Sex_birth, "PLC by Sex")
  printPLCTable(allBiochem$race_binary, "PLC by Race")
}
# Emit, for every level of one categorical column, an HTML table with the
# share of High / Intermediate / Low performers: overall, by sex at birth and
# by binary race.
#
# Args:
#   allBiochem: data frame with the columns clusterLetter ("HP"/"IP"/"LP"),
#               Sex_birth ("Male"/"Female"/...), race_binary
#               ("White"/"Non-white") and the column named by mycategory.
#   mycategory: name of the column whose levels are reported. This argument
#               used to be ignored (the column was hard-coded); it is now
#               honoured and defaults to the previously hard-coded column.
#
# Output is written with cat() as raw HTML. The level "Expert" and NA levels
# are skipped. A percentage whose denominator is zero prints as NaN.
calcStats = function(allBiochem, mycategory = "Course_collected"){
  categoryValues = allBiochem[[mycategory]]
  if (is.null(categoryValues)) {
    stop("calcStats: column '", mycategory, "' not found in the data", call. = FALSE)
  }

  # Row masks that do not depend on the level being reported.
  isHP = allBiochem$clusterLetter == "HP"
  isIP = allBiochem$clusterLetter == "IP"
  isLP = allBiochem$clusterLetter == "LP"
  isMale = allBiochem$Sex_birth == "Male"
  isFemale = allBiochem$Sex_birth == "Female"
  isWhite = allBiochem$race_binary == "White"
  isNonwhite = allBiochem$race_binary == "Non-white"

  #using the term course as a generic category
  for (course in unique(categoryValues)){
    if (is.na(course) || course == "Expert") next
    header = paste("<b>Results for category: ",course,"</b></br></br>")
    cat(header)

    inCategory = categoryValues == course
    # Number of rows of this level that also satisfy 'mask'; rows with a
    # missing value in any of the tested columns are not counted.
    countWhere = function(mask = TRUE) sum(inCategory & mask, na.rm = TRUE)

    umrTot = countWhere()
    umrHP = countWhere(isHP)
    umrIP = countWhere(isIP)
    umrLP = countWhere(isLP)

    umrMale = countWhere(isMale)
    umrHPmale = countWhere(isMale & isHP)
    umrIPmale = countWhere(isMale & isIP)
    umrLPmale = countWhere(isMale & isLP)

    umrFemale = countWhere(isFemale)
    umrHPfemale = countWhere(isFemale & isHP)
    umrIPfemale = countWhere(isFemale & isIP)
    umrLPfemale = countWhere(isFemale & isLP)

    umrWhite = countWhere(isWhite)
    umrHPWhite = countWhere(isWhite & isHP)
    umrIPWhite = countWhere(isWhite & isIP)
    umrLPWhite = countWhere(isWhite & isLP)

    umrNonwhite = countWhere(isNonwhite)
    umrHPNonwhite = countWhere(isNonwhite & isHP)
    umrIPNonwhite = countWhere(isNonwhite & isIP)
    umrLPNonwhite = countWhere(isNonwhite & isLP)

    # paste() joins the pieces with single spaces, which is why every number
    # in the rendered table is surrounded by blanks.
    output = paste("<table >
<thead>
<tr>
  <th colspan='2'></th>
  <th colspan='2'>High Performers</th>
  <th colspan='2'>Intermediate Performers</th>
  <th colspan='2'>Low Performers</th>
  
</tr>
</thead>
<tbody>
  <tr>
    <td rowspan='5'>",course," </td>
    <td>Total N=", umrTot,"</td>
    <td colspan='2'>", signif(umrHP/umrTot*100,digits=2),"% </td>
    <td colspan='2'>", signif(umrIP/umrTot*100,digits=2),"%</td>
    <td colspan='2'>", signif(umrLP/umrTot*100,digits=2),"% </td>
  </tr>
  <tr>
    <td rowspan='2'>Sex: males N=",umrMale,"; females N=",umrFemale,"</td>
    <td>male</td>
    <td>female</td>
    <td>male</td>
    <td>female</td>
    <td>male</td>
    <td>female</td>
  </tr>
  <tr>
    <td>", signif(umrHPmale/umrMale*100,digits=2),"%</td>
    <td>", signif(umrHPfemale/umrFemale*100,digits=2),"%</td>
    <td>", signif(umrIPmale/umrMale*100,digits=2),"%</td>
    <td>", signif(umrIPfemale/umrFemale*100,digits=2),"%</td>
    <td>", signif(umrLPmale/umrMale*100,digits=2),"%</td>
    <td>", signif(umrLPfemale/umrFemale*100,digits=2),"%</td>
  </tr>
  <tr>
    <td rowspan='2'>Race: White N=",umrWhite,"; Non-white N=",umrNonwhite,"</td>
    <td>white</td>
    <td>non-white</td>
    <td>white</td>
    <td>non-white</td>
    <td>white</td>
    <td>non-white</td>
  </tr>
  <tr>
    <td>", signif(umrHPWhite/umrWhite*100,digits=2),"%</td>
    <td>", signif(umrHPNonwhite/umrNonwhite*100,digits=2),"%</td>
    <td>", signif(umrIPWhite/umrWhite*100,digits=2),"%</td>
    <td>", signif(umrIPNonwhite/umrNonwhite*100,digits=2),"%</td>
    <td>", signif(umrLPWhite/umrWhite*100,digits=2),"%</td>
    <td>", signif(umrLPNonwhite/umrNonwhite*100,digits=2),"%</td>
  </tr>
</tbody>
</table> ")
    cat(output)
  }
  invisible(NULL)
}


# Same report as calcStats(), grouped by default on the student year
# (column actual_year). Kept as a separate entry point for existing callers.
#
# Args:
#   allBiochem: see calcStats().
#   mycategory: name of the grouping column; defaults to "actual_year".
calcStats2 = function(allBiochem, mycategory = "actual_year"){
  calcStats(allBiochem, mycategory)
  invisible(NULL)
}



library(ggplot2)
library(ggpubr)
library(psych)

# Box plot (with jittered points) of one numeric column split by one
# categorical column, annotated with a global ANOVA p-value and, per group,
# the p-value of a t-test against all the data.
#
# Args:
#   df:      data frame holding both columns.
#   myx:     name of the categorical column (x axis and colour).
#   myy:     name of the numeric column (y axis); rows where it is missing
#            are dropped first.
#   mytitle: plot title.
#   myylab:  y-axis label.
#
# Returns: the plot object (the caller is expected to print it).
plotGGbox = function(df,myx,myy,mytitle,myylab){
  hasValue = complete.cases(df[[myy]])
  df = df[hasValue,]
  yValues = df[[myy]]
  yTop = max(yValues)

  boxes = ggboxplot(df, x = myx, y = myy,
                    title = mytitle,
                    color = myx, add = "jitter", legend="none",ylab = myylab)
  boxes = boxes + rotate_x_text(angle = 45)
  # dashed horizontal line at the overall mean
  boxes = boxes + geom_hline( yintercept = mean(yValues), linetype = 2)
  # global ANOVA label placed above the data
  boxes = boxes + stat_compare_means(method = "anova", label.y = yTop*1.10)
  boxes = boxes + coord_cartesian(ylim = c(0, yTop*1.2))
  # per-group t-test against the pooled data, placed just above the data
  boxes + stat_compare_means(label = "p.format", size=2.5, method = "t.test", ref.group = ".all.",label.y = yTop*1.05)
}
# Print a table of Tukey HSD adjusted p-values for all pairwise comparisons
# of a numeric column between the levels of a categorical column.
#
# Args:
#   df:      data frame holding both columns.
#   myx:     name of the categorical (grouping) column.
#   myy:     name of the numeric (response) column.
#   mytitle: text appended to the table caption.
#   myylab:  unused; kept so the signature matches the other plot helpers.
getAnova = function(df,myx,myy,mytitle,myylab){
  response = df[[myy]]
  group = as.factor(df[[myx]])
  tukey = TukeyHSD( aov(response ~ group))
  # Column 4 of the Tukey matrix is the adjusted p-value. drop = FALSE keeps
  # the matrix shape, and with it the "B-A" row label, when there is a single
  # comparison (two groups); without it the label was lost in the table.
  pValues = as.data.frame(tukey[["group"]][, 4, drop = FALSE])
  colnames(pValues) = c("Testing statistical significance: p-values")
  print(knitr::kable(pValues, caption = paste("Anova: ",mytitle)))
}
# For one numeric column split by one categorical column: print the box plot,
# a table of descriptive statistics, and the Tukey p-value table.
#
# Args:
#   df:      data frame holding both columns (plus Course_collected and
#            Sex_birth, which are used for filtering).
#   myx:     name of the categorical column.
#   myy:     name of the numeric column.
#   mytitle: plot title / caption suffix.
#   myylab:  y-axis label, also used in the statistics caption.
plotAndTable = function(df,myx,myy,mytitle,myylab){
  # For the two demographic groupings, experts and students who preferred not
  # to state their sex are left out (for both groupings, not only for sex).
  demographicSplit = myx=="Sex_birth" | myx=="race_binary"
  if (demographicSplit){
    expertRows = grepl("(?i)Expert", df$Course_collected)
    df = df[!expertRows,]
    undisclosedRows = grepl("(?)Prefer not to answer",df$Sex_birth)
    df = df[!undisclosedRows,]
  }

  print(plotGGbox(df,myx,myy,mytitle,myylab))

  groupStats = describeBy(df[[myy]],df[[myx]],mat=TRUE,digits = 2)
  statsCaption = paste("Statistics of ",myylab," based on the category",myx)
  print(knitr::kable(groupStats[,c(2,4,5,6,7,10,11,12)],caption=statsCaption))

  getAnova(df,myx,myy,mytitle,myylab)
}
# Append the expert rows to the student data frame as an extra "Expert"
# category.
#
# Args:
#   alldf:   student data frame; must contain the columns PLC, NS and
#            Coherency.
#   experts: expert data frame with the columns PLC, NS and Coherency.
#
# Returns:
#   alldf followed by one row per expert. In the expert rows the first 12
#   columns (fewer if alldf is narrower) hold the string "Expert"; PLC, NS and
#   Coherency are then overwritten with the experts' scores.
#
# The function works on the alldf argument it is given; it previously
# replaced it with the global variable allBiochem.
addExperts = function(alldf, experts){
  if (nrow(experts) == 0){
    # e.g. a survey label that matched no expert row
    warning("addExperts: 'experts' has no rows; returning 'alldf' unchanged", call. = FALSE)
    return(alldf)
  }
  ex_new = as.data.frame( matrix( ncol=ncol(alldf),nrow = nrow(experts)) )
  colnames(ex_new) =  colnames(alldf)
  # 12 is the width of the frame produced by analyzeUMRCourses(); never index
  # past the columns that actually exist.
  labelCols = seq_len(min(12, ncol(alldf)))
  ex_new[,labelCols] = "Expert"
  ex_new$PLC = experts$PLC
  ex_new$NS = experts$NS
  ex_new$Coherency = experts$Coherency
  alldf=rbind(alldf,ex_new)
  return(alldf)
}

library(dplyr)
library(corrplot)
# Chi-square test of independence between two categorical columns, reported
# as HTML text plus a plot of the Pearson residuals.
#
# Args:
#   a: data frame with exactly the two categorical columns to cross-tabulate.
plotChi = function(a){
  # droplevels() removes factor levels with no rows left (e.g. "Expert" after
  # the experts were filtered out); otherwise they appear as all-zero
  # categories in the contingency table.
  b=chisq.test(table(droplevels(a)))
  # A p-value is never exactly zero: below the displayed precision report a
  # bound instead of the misleading rounded value "0".
  pShown = if (isTRUE(b$p.value < 1e-5)) "< 0.00001" else round(b$p.value,5)
  cat(paste("<p><b>The Chi-square analysis gives a p=",pShown,"</b></p>"))
  cat(paste("<p><b>Residuals analysis:</b></p>"))
  cat("A negative residual implies that the measured value is lower than expected and a positive value higher than expected</br>")
  corrplot(b$residuals, is.cor = FALSE)
}
# Stacked bar chart of one categorical column against another, followed by
# the chi-square analysis of the same two columns.
#
# Args:
#   df:        data frame; rows whose first column contains "Expert" are
#              excluded because experts are not part of the chi-square
#              analysis.
#   myx:       name of the independent variable (course or demographic).
#   myy:       name of the dependent variable (typically clusterLetter).
#   myxlabel:  x-axis label.
#   myylabel:  y-axis label.
#   mytitle:   plot title.
plotBarAndCorr = function(df,myx,myy,myxlabel,myylabel,mytitle){
  isExpertRow = grepl("Expert",df[,1])
  students = df[!isExpertRow,]
  # When splitting by sex, also leave out students who did not state it.
  if (myx=="Sex_birth"){
    undisclosed = grepl("(?)Prefer not to answer",students$Sex_birth)
    students = students[!undisclosed,]
  }
  # Keep only the two categorical columns, dependent variable first.
  twoColumns = students[,c(myy,myx)]
  print(plotBarCategories(twoColumns,myx,myy,myxlabel,myylabel,mytitle))
  plotChi(twoColumns)
}
# Stacked bar chart: one bar per level of myx, split (and coloured) by myy,
# with each segment labelled by its percentage within the bar.
#
# Args:
#   a:         data frame containing the two categorical columns.
#   myx:       name of the column shown on the x axis.
#   myy:       name of the column used for the fill colour.
#   myxlabel:  x-axis label.
#   myylabel:  y-axis label.
#   mytitle:   plot title.
#
# Returns: the plot object.
plotBarCategories = function(a,myx,myy,myxlabel,myylabel,mytitle){
  # The column names arrive as strings, so turn them into symbols once and
  # unquote them wherever a column is expected.
  xColumn = sym(myx)
  fillColumn = sym(myy)

  # Rows per (fill, x) combination.
  tallies = count(a, !!fillColumn, !!xColumn)
  # Percentage of each combination within its x level.
  tallies = group_by(tallies, !!xColumn)
  tallies = mutate(tallies, lab = paste0(round(prop.table(n) * 100, 2), '%'))

  ggplot(tallies, aes(!!xColumn,n, fill=!!fillColumn)) +
    geom_col() +
    geom_text(aes(label=lab),position='stack',vjust=1.5) +
    labs(x=myxlabel,y=myylabel,title=mytitle)
}

1 Introduction

What was learned from “clustering_indeces_v2” analysis about indicators

PLC seems to clearly distinguish experts from students (although with significant overlap)
NS does not distinguish them as well as PLC does, but the highest score and the median are still significantly different between experts and students
Coherency does not seem to distinguish well and we will discard it in the subsequent analysis

Meaning of PLC and NS:

* PLC: path length correlation. It describes how connected two nodes are; the closer the value is to one, the more connected they are. It is computed by comparison with the experts.
* NS: neighborhood similarity. It is also computed by comparison with the experts.

First we will compare how the PLC score is distributed among demographics and courses. Then we will combine students’ PLC and NS indexes as a measurement of students’ performance. We then cluster the (PLC, NS) pairs into three groups: low performers, intermediate performers, and high performers. Finally, we analyze the composition of those three groups by demographics, year and course. The year is not determined by the number of credits; rather, students in CHEM1 or CHEM2 are labeled first_year, students in CHEM3 or CHEM4 are labeled second_year, and students in Bioc1 or Bioc2 are labeled third_year.

2 UMR courses: ES Chemical Equation

2.1 PLC only: Anova

We test whether the PLC score differs significantly among the levels of each of the categories “Course collected”, “Student year”, “White/Non-white”, and “Sex at birth”

# Cluster the student rows of the first survey (umrs1), then append the
# matching expert rows (exs1) as an extra "Expert" category.
allBiochem = analyzeUMRCourses(umrs1)
allBiochem = addExperts(allBiochem,exs1)
# Optional PLC summary tables per grouping variable (disabled).
#buildTables(allBiochem)
plotAndTable(allBiochem,"Course_collected","PLC","PLC: Course","PLC")

Statistics of PLC based on the category Course_collected
	group1	n	mean	sd	median	min	max	range
X11	Gen + Organic 1	109	0.23	0.17	0.26	-0.20	0.51	0.71
X12	O Chem 1	100	0.26	0.16	0.28	-0.16	0.55	0.71
X13	O Chem 2	76	0.29	0.15	0.30	-0.40	0.54	0.94
X14	Gen Chem 2	57	0.25	0.15	0.24	-0.05	0.53	0.58
X15	BiocF21	58	0.40	0.18	0.45	-0.18	0.67	0.85
X16	BiocF22	43	0.46	0.11	0.47	0.23	0.72	0.49
X17	Biochem 2	22	0.40	0.20	0.43	-0.06	0.67	0.73
X18	Expert	6	0.67	0.12	0.69	0.49	0.82	0.33

Anova: PLC: Course
	Testing statistical significance: p-values
O Chem 1-Gen + Organic 1	0.9652073
O Chem 2-Gen + Organic 1	0.3643457
Gen Chem 2-Gen + Organic 1	0.9997580
BiocF21-Gen + Organic 1	0.0000000
BiocF22-Gen + Organic 1	0.0000000
Biochem 2-Gen + Organic 1	0.0005200
Expert-Gen + Organic 1	0.0000000
O Chem 2-O Chem 1	0.9368395
Gen Chem 2-O Chem 1	0.9999042
BiocF21-O Chem 1	0.0000038
BiocF22-O Chem 1	0.0000000
Biochem 2-O Chem 1	0.0068022
Expert-O Chem 1	0.0000001
Gen Chem 2-O Chem 2	0.8486153
BiocF21-O Chem 2	0.0016387
BiocF22-O Chem 2	0.0000004
Biochem 2-O Chem 2	0.0934819
Expert-O Chem 2	0.0000009
BiocF21-Gen Chem 2	0.0000134
BiocF22-Gen Chem 2	0.0000000
Biochem 2-Gen Chem 2	0.0057281
Expert-Gen Chem 2	0.0000001
BiocF22-BiocF21	0.5102566
Biochem 2-BiocF21	1.0000000
Expert-BiocF21	0.0025478
Biochem 2-BiocF22	0.7580255
Expert-BiocF22	0.0645927
Expert-Biochem 2	0.0058951

plotAndTable(allBiochem,"actual_year","PLC","PLC: Year","PLC")

Statistics of PLC based on the category actual_year
	group1	n	mean	sd	median	min	max	range
X11	Expert	6	0.67	0.12	0.69	0.49	0.82	0.33
X12	first_year	209	0.24	0.17	0.27	-0.20	0.55	0.75
X13	second_year	133	0.27	0.15	0.28	-0.40	0.54	0.94
X14	third_year	123	0.42	0.16	0.46	-0.18	0.72	0.90

Anova: PLC: Year
	Testing statistical significance: p-values
first_year-Expert	0.0000000
second_year-Expert	0.0000000
third_year-Expert	0.0014380
second_year-first_year	0.5354066
third_year-first_year	0.0000000
third_year-second_year	0.0000000

plotAndTable(allBiochem,"race_binary","PLC","PLC: White/Non-white","PLC")

Statistics of PLC based on the category race_binary
	group1	n	mean	sd	median	min	max	range
X11	Non-white	189	0.26	0.18	0.26	-0.40	0.72	1.12
X12	White	272	0.32	0.17	0.35	-0.16	0.67	0.83

Anova: PLC: White/Non-white
Testing statistical significance: p-values
0.0001431

plotAndTable(allBiochem,"Sex_birth","PLC","PLC: Sex","PLC")

Statistics of PLC based on the category Sex_birth
	group1	n	mean	sd	median	min	max	range
X11	Female	369	0.30	0.18	0.31	-0.40	0.67	1.07
X12	Male	92	0.31	0.19	0.33	-0.18	0.72	0.90

Anova: PLC: Sex
Testing statistical significance: p-values
0.547014

2.2 NS only: Anova

plotAndTable(allBiochem,"Course_collected","NS","NS: Course","NS")

Statistics of NS based on the category Course_collected
	group1	n	mean	sd	median	min	max	range
X11	Gen + Organic 1	109	0.22	0.08	0.22	0.04	0.41	0.37
X12	O Chem 1	100	0.22	0.09	0.21	0.04	0.47	0.43
X13	O Chem 2	76	0.25	0.08	0.24	0.08	0.41	0.33
X14	Gen Chem 2	57	0.21	0.06	0.22	0.08	0.37	0.29
X15	BiocF21	58	0.22	0.07	0.23	0.09	0.41	0.32
X16	BiocF22	43	0.24	0.07	0.25	0.13	0.42	0.29
X17	Biochem 2	22	0.23	0.09	0.21	0.07	0.42	0.35
X18	Expert	6	0.37	0.11	0.34	0.28	0.57	0.29

Anova: NS: Course
	Testing statistical significance: p-values
O Chem 1-Gen + Organic 1	1.0000000
O Chem 2-Gen + Organic 1	0.1841246
Gen Chem 2-Gen + Organic 1	0.9985103
BiocF21-Gen + Organic 1	0.9999784
BiocF22-Gen + Organic 1	0.6710577
Biochem 2-Gen + Organic 1	0.9999959
Expert-Gen + Organic 1	0.0001402
O Chem 2-O Chem 1	0.1649445
Gen Chem 2-O Chem 1	0.9994987
BiocF21-O Chem 1	0.9999028
BiocF22-O Chem 1	0.6324514
Biochem 2-O Chem 1	0.9999833
Expert-O Chem 1	0.0001260
Gen Chem 2-O Chem 2	0.1120421
BiocF21-O Chem 2	0.5781042
BiocF22-O Chem 2	0.9999652
Biochem 2-O Chem 2	0.8941579
Expert-O Chem 2	0.0066311
BiocF21-Gen Chem 2	0.9908048
BiocF22-Gen Chem 2	0.4529292
Biochem 2-Gen Chem 2	0.9980973
Expert-Gen Chem 2	0.0000762
BiocF22-BiocF21	0.9076963
Biochem 2-BiocF21	1.0000000
Expert-BiocF21	0.0003856
Biochem 2-BiocF22	0.9805426
Expert-BiocF22	0.0053866
Expert-Biochem 2	0.0014475

plotAndTable(allBiochem,"actual_year","NS","NS: Year","NS")

Statistics of NS based on the category actual_year
	group1	n	mean	sd	median	min	max	range
X11	Expert	6	0.37	0.11	0.34	0.28	0.57	0.29
X12	first_year	209	0.22	0.08	0.21	0.04	0.47	0.43
X13	second_year	133	0.23	0.08	0.23	0.08	0.41	0.33
X14	third_year	123	0.23	0.07	0.23	0.07	0.42	0.35

Anova: NS: Year
	Testing statistical significance: p-values
first_year-Expert	0.0000246
second_year-Expert	0.0001911
third_year-Expert	0.0001494
second_year-first_year	0.3752155
third_year-first_year	0.5451535
third_year-second_year	0.9963083

plotAndTable(allBiochem,"race_binary","NS","NS: White/Non-white","NS")

Statistics of NS based on the category race_binary
	group1	n	mean	sd	median	min	max	range
X11	Non-white	189	0.22	0.08	0.21	0.07	0.41	0.34
X12	White	272	0.24	0.08	0.23	0.04	0.47	0.43

Anova: NS: White/Non-white
Testing statistical significance: p-values
0.0104875

plotAndTable(allBiochem,"Sex_birth","NS","NS: Sex","NS")

Statistics of NS based on the category Sex_birth
	group1	n	mean	sd	median	min	max	range
X11	Female	369	0.23	0.08	0.22	0.04	0.47	0.43
X12	Male	92	0.22	0.07	0.22	0.04	0.40	0.36

Anova: NS: Sex
Testing statistical significance: p-values
0.5026976

2.3 PLC/NS clustering

The problem with clustering is that it is an iterative method, and different “initial seeds” will yield different results. It is only reproducible because the k-means step is preceded by “set.seed(42)”

plotAndTable(allBiochem,"clusterLetter","PLC","PLC: Cluster letter","PLC")

Statistics of PLC based on the category clusterLetter
	group1	n	mean	sd	median	min	max	range
X11	Expert	6	0.67	0.12	0.69	0.49	0.82	0.33
X12	HP	183	0.41	0.10	0.41	0.23	0.72	0.49
X13	IP	133	0.36	0.12	0.37	0.07	0.63	0.56
X14	LP	149	0.10	0.12	0.14	-0.40	0.29	0.69

Anova: PLC: Cluster letter
	Testing statistical significance: p-values
HP-Expert	0.0000005
IP-Expert	0.0000000
LP-Expert	0.0000000
IP-HP	0.0007267
LP-HP	0.0000000
LP-IP	0.0000000

Are cluster groups unevenly distributed among these categories? A chi-square analysis will give us the probability that all three cluster groups (HP,IP,LP) contain statistically similar proportions of this category (course, year, sex, race…)

2.3.1 Analysis by course

plotBarAndCorr(allBiochem,"Course_collected","clusterLetter","Course","N of students","High, Intermediate, Low Performance cluster")

The Chi-square analysis gives a p= 0

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Scatter plot of NS against PLC for every row of allBiochem.
# The course is encoded as an integer (factor code) and used as the point colour.
markerIntegers = as.integer(as.factor(allBiochem$Course_collected))
# The cluster label is passed as the plotting character; per the title, points
# show up as H, I or L.
plot(allBiochem$PLC,allBiochem$NS,pch=allBiochem$clusterLetter,main = "ES_Chemical_Reaction - High(H), Intermediate(I), Low(L) performers",ylab="NS",xlab="PLC",col=markerIntegers)
# unique() is applied to the labels and to the colour codes in the same row
# order, so the i-th legend entry gets the colour of the i-th course encountered.
legend("topleft", legend=unique(allBiochem$Course_collected), col=unique(markerIntegers), lty=1:1, cex=0.8)

calcStats(allBiochem,"Course_collected")

Results for category: Gen + Organic 1

		High Performers		Intermediate Performers		Low Performers
Gen + Organic 1	Total N= 109	27 %		29 %		44 %
	Sex: males N= 8 ; females N= 99	male	female	male	female	male	female
	Sex: males N= 8 ; females N= 99	50 %	25 %	0 %	31 %	50 %	43 %
	Race: White N= 62 ; Non-white N= 47	white	non-white	white	non-white	white	non-white
	Race: White N= 62 ; Non-white N= 47	31 %	21 %	32 %	26 %	37 %	53 %

Results for category: O Chem 2

		High Performers		Intermediate Performers		Low Performers
O Chem 2	Total N= 76	33 %		39 %		28 %
	Sex: males N= 15 ; females N= 60	male	female	male	female	male	female
	Sex: males N= 15 ; females N= 60	60 %	27 %	27 %	43 %	13 %	30 %
	Race: White N= 48 ; Non-white N= 28	white	non-white	white	non-white	white	non-white
	Race: White N= 48 ; Non-white N= 28	38 %	25 %	44 %	32 %	19 %	43 %

Results for category: BiocF21

		High Performers		Intermediate Performers		Low Performers
BiocF21	Total N= 58	62 %		21 %		17 %
	Sex: males N= 18 ; females N= 40	male	female	male	female	male	female
	Sex: males N= 18 ; females N= 40	61 %	62 %	28 %	18 %	11 %	20 %
	Race: White N= 32 ; Non-white N= 26	white	non-white	white	non-white	white	non-white
	Race: White N= 32 ; Non-white N= 26	59 %	65 %	25 %	15 %	16 %	19 %

Results for category: Gen Chem 2

		High Performers		Intermediate Performers		Low Performers
Gen Chem 2	Total N= 57	37 %		21 %		42 %
	Sex: males N= 18 ; females N= 39	male	female	male	female	male	female
	Sex: males N= 18 ; females N= 39	44 %	33 %	11 %	26 %	44 %	41 %
	Race: White N= 33 ; Non-white N= 24	white	non-white	white	non-white	white	non-white
	Race: White N= 33 ; Non-white N= 24	30 %	46 %	27 %	12 %	42 %	42 %

Results for category: O Chem 1

		High Performers		Intermediate Performers		Low Performers
O Chem 1	Total N= 100	31 %		30 %		39 %
	Sex: males N= 19 ; females N= 80	male	female	male	female	male	female
	Sex: males N= 19 ; females N= 80	32 %	31 %	26 %	31 %	42 %	38 %
	Race: White N= 56 ; Non-white N= 44	white	non-white	white	non-white	white	non-white
	Race: White N= 56 ; Non-white N= 44	36 %	25 %	32 %	27 %	32 %	48 %

Results for category: Biochem 2

		High Performers		Intermediate Performers		Low Performers
Biochem 2	Total N= 22	59 %		23 %		18 %
	Sex: males N= 9 ; females N= 13	male	female	male	female	male	female
	Sex: males N= 9 ; females N= 13	56 %	62 %	11 %	31 %	33 %	7.7 %
	Race: White N= 14 ; Non-white N= 8	white	non-white	white	non-white	white	non-white
	Race: White N= 14 ; Non-white N= 8	64 %	50 %	29 %	12 %	7.1 %	38 %

Results for category: BiocF22

		High Performers		Intermediate Performers		Low Performers
BiocF22	Total N= 43	65 %		28 %		7 %
	Sex: males N= 5 ; females N= 38	male	female	male	female	male	female
	Sex: males N= 5 ; females N= 38	60 %	66 %	20 %	29 %	20 %	5.3 %
	Race: White N= 28 ; Non-white N= 15	white	non-white	white	non-white	white	non-white
	Race: White N= 28 ; Non-white N= 15	61 %	73 %	39 %	6.7 %	0 %	20 %

2.3.2 Analysis by year

plotBarAndCorr(allBiochem,"actual_year","clusterLetter","Year","N of students","High, Intermediate, Low Performance cluster")

The Chi-square analysis gives a p= 0

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Scatter plot of NS against PLC for every row of allBiochem.
# The student year is encoded as an integer (factor code) and used as the point colour.
markerIntegers = as.integer(as.factor(allBiochem$actual_year))
# The cluster label is passed as the plotting character; per the title, points
# show up as H, I or L.
plot(allBiochem$PLC,allBiochem$NS,pch=allBiochem$clusterLetter,main = "ES_Chemical_Reaction - High(H), Intermediate(I), Low(L) performers",ylab="NS",xlab="PLC",col=markerIntegers)
# unique() is applied to the labels and to the colour codes in the same row
# order, so the i-th legend entry gets the colour of the i-th year encountered.
legend("topleft", legend=unique(allBiochem$actual_year), col=unique(markerIntegers), lty=1:1, cex=0.8)

calcStats2(allBiochem,"actual_year")

Results for category: first_year

		High Performers		Intermediate Performers		Low Performers
first_year	Total N= 209	29 %		30 %		42 %
	Sex: males N= 27 ; females N= 179	male	female	male	female	male	female
	Sex: males N= 27 ; females N= 179	37 %	28 %	19 %	31 %	44 %	41 %
	Race: White N= 118 ; Non-white N= 91	white	non-white	white	non-white	white	non-white
	Race: White N= 118 ; Non-white N= 91	33 %	23 %	32 %	26 %	35 %	51 %

Results for category: second_year

		High Performers		Intermediate Performers		Low Performers
second_year	Total N= 133	35 %		32 %		34 %
	Sex: males N= 33 ; females N= 99	male	female	male	female	male	female
	Sex: males N= 33 ; females N= 99	52 %	29 %	18 %	36 %	30 %	34 %
	Race: White N= 81 ; Non-white N= 52	white	non-white	white	non-white	white	non-white
	Race: White N= 81 ; Non-white N= 52	35 %	35 %	37 %	23 %	28 %	42 %

Results for category: third_year

		High Performers		Intermediate Performers		Low Performers
third_year	Total N= 123	63 %		24 %		14 %
	Sex: males N= 32 ; females N= 91	male	female	male	female	male	female
	Sex: males N= 32 ; females N= 91	59 %	64 %	22 %	24 %	19 %	12 %
	Race: White N= 74 ; Non-white N= 49	white	non-white	white	non-white	white	non-white
	Race: White N= 74 ; Non-white N= 49	61 %	65 %	31 %	12 %	8.1 %	22 %

cat("<b>Chi-square analysis of Performance by Sex and Race considering different years</b></br>")

Chi-square analysis of Performance by Sex and Race considering different years

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="first_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 1st year")

The Chi-square analysis gives a p= 0.36146

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="second_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 2nd year")

The Chi-square analysis gives a p= 0.04505

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="third_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 3rd year")

The Chi-square analysis gives a p= 0.64232

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="first_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 1st year")

The Chi-square analysis gives a p= 0.06549

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="second_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 2nd year")

The Chi-square analysis gives a p= 0.15212

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="third_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 3rd year")

The Chi-square analysis gives a p= 0.01157

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

3 UMR courses: ES Glucosidase

3.1 PLC only: Anova

We test whether the PLC score differs significantly among the levels of each of the categories “Course collected”, “Student year”, “White/Non-white”, and “Sex at birth”

# Cluster the student rows of the ES_Glucosidase survey (umrs2), then append
# the matching expert rows (exs2) as an extra "Expert" category.
allBiochem = analyzeUMRCourses(umrs2)
allBiochem = addExperts(allBiochem,exs2)
#buildTables(allBiochem)
plotAndTable(allBiochem,"Course_collected","PLC","PLC: Course","PLC")

Statistics of PLC based on the category Course_collected
	group1	n	mean	sd	median	min	max	range
X11	Gen + Organic 1	109	0.24	0.15	0.26	-0.15	0.50	0.65
X12	O Chem 1	100	0.28	0.17	0.27	-0.19	0.60	0.78
X13	O Chem 2	76	0.29	0.14	0.28	-0.25	0.56	0.81
X14	Gen Chem 2	57	0.24	0.18	0.25	-0.09	0.61	0.70
X15	BiocF21	58	0.44	0.17	0.46	-0.16	0.68	0.84
X16	BiocF22	43	0.47	0.10	0.48	0.24	0.65	0.41
X17	Biochem 2	22	0.41	0.19	0.45	0.03	0.72	0.69
X18	Expert	8	0.72	0.09	0.70	0.59	0.82	0.23

Anova: PLC: Course
	Testing statistical significance: p-values
O Chem 1-Gen + Organic 1	0.5497471
O Chem 2-Gen + Organic 1	0.3654800
Gen Chem 2-Gen + Organic 1	0.9999957
BiocF21-Gen + Organic 1	0.0000000
BiocF22-Gen + Organic 1	0.0000000
Biochem 2-Gen + Organic 1	0.0001120
Expert-Gen + Organic 1	0.0000000
O Chem 2-O Chem 1	0.9999098
Gen Chem 2-O Chem 1	0.8876521
BiocF21-O Chem 1	0.0000001
BiocF22-O Chem 1	0.0000000
Biochem 2-O Chem 1	0.0116580
Expert-O Chem 1	0.0000000
Gen Chem 2-O Chem 2	0.7414446
BiocF21-O Chem 2	0.0000027
BiocF22-O Chem 2	0.0000001
Biochem 2-O Chem 2	0.0364355
Expert-O Chem 2	0.0000000
BiocF21-Gen Chem 2	0.0000000
BiocF22-Gen Chem 2	0.0000000
Biochem 2-Gen Chem 2	0.0009537
Expert-Gen Chem 2	0.0000000
BiocF22-BiocF21	0.9665517
Biochem 2-BiocF21	0.9963039
Expert-BiocF21	0.0000540
Biochem 2-BiocF22	0.8097147
Expert-BiocF22	0.0009480
Expert-Biochem 2	0.0000488

# PLC grouped by student year (experts form their own group). plotAndTable is
# defined elsewhere in this file; the output rendered below shows per-group
# descriptive statistics and pairwise p-values.
plotAndTable(allBiochem, "actual_year", "PLC", "PLC: Year", "PLC")

Statistics of PLC based on the category actual_year
	group1	n	mean	sd	median	min	max	range
X11	Expert	8	0.72	0.09	0.70	0.59	0.82	0.23
X12	first_year	209	0.26	0.16	0.27	-0.19	0.60	0.78
X13	second_year	133	0.27	0.16	0.27	-0.25	0.61	0.86
X14	third_year	123	0.44	0.15	0.47	-0.16	0.72	0.88

Anova: PLC: Year
	Testing statistical significance: p-values
first_year-Expert	0.0000000
second_year-Expert	0.0000000
third_year-Expert	0.0000105
second_year-first_year	0.8952110
third_year-first_year	0.0000000
third_year-second_year	0.0000000

# PLC grouped by binary race (White / Non-white). plotAndTable is defined
# elsewhere in this file; the output rendered below shows per-group descriptive
# statistics and a single p-value for the two-group comparison.
plotAndTable(allBiochem, "race_binary", "PLC", "PLC: White/Non-white", "PLC")

Statistics of PLC based on the category race_binary
	group1	n	mean	sd	median	min	max	range
X11	Non-white	189	0.28	0.18	0.28	-0.25	0.65	0.90
X12	White	272	0.34	0.17	0.36	-0.15	0.72	0.87

Anova: PLC: White/Non-white
Testing statistical significance: p-values
0.0002627

# PLC grouped by sex at birth. plotAndTable is defined elsewhere in this file;
# the output rendered below shows per-group descriptive statistics and a single
# p-value for the two-group comparison.
plotAndTable(allBiochem, "Sex_birth", "PLC", "PLC: Sex", "PLC")

Statistics of PLC based on the category Sex_birth
	group1	n	mean	sd	median	min	max	range
X11	Female	369	0.31	0.17	0.32	-0.25	0.72	0.97
X12	Male	92	0.31	0.19	0.34	-0.16	0.65	0.81

Anova: PLC: Sex
Testing statistical significance: p-values
0.9661744

3.2 NS only: Anova

# NS grouped by course. plotAndTable is defined elsewhere in this file; the
# output rendered below shows per-group descriptive statistics and pairwise
# p-values.
plotAndTable(allBiochem, "Course_collected", "NS", "NS: Course", "NS")

Statistics of NS based on the category Course_collected
	group1	n	mean	sd	median	min	max	range
X11	Gen + Organic 1	109	0.31	0.24	0.27	-0.36	0.83	1.19
X12	O Chem 1	100	0.23	0.08	0.24	0.04	0.45	0.41
X13	O Chem 2	76	0.24	0.08	0.25	0.04	0.41	0.37
X14	Gen Chem 2	57	0.21	0.07	0.21	0.09	0.47	0.38
X15	BiocF21	58	0.25	0.09	0.26	0.04	0.45	0.41
X16	BiocF22	43	0.25	0.07	0.23	0.10	0.41	0.31
X17	Biochem 2	22	0.25	0.07	0.26	0.13	0.39	0.26
X18	Expert	8	0.40	0.06	0.42	0.29	0.47	0.17

Anova: NS: Course
	Testing statistical significance: p-values
O Chem 1-Gen + Organic 1	0.0015567
O Chem 2-Gen + Organic 1	0.0187765
Gen Chem 2-Gen + Organic 1	0.0004946
BiocF21-Gen + Organic 1	0.1457790
BiocF22-Gen + Organic 1	0.2016682
Biochem 2-Gen + Organic 1	0.5667910
Expert-Gen + Organic 1	0.5832677
O Chem 2-O Chem 1	0.9999463
Gen Chem 2-O Chem 1	0.9875516
BiocF21-O Chem 1	0.9931104
BiocF22-O Chem 1	0.9987084
Biochem 2-O Chem 1	0.9995817
Expert-O Chem 1	0.0181461
Gen Chem 2-O Chem 2	0.9412761
BiocF21-O Chem 2	0.9998844
BiocF22-O Chem 2	0.9999929
Biochem 2-O Chem 2	0.9999957
Expert-O Chem 2	0.0335475
BiocF21-Gen Chem 2	0.8113878
BiocF22-Gen Chem 2	0.9067041
Biochem 2-Gen Chem 2	0.9632280
Expert-Gen Chem 2	0.0065152
BiocF22-BiocF21	1.0000000
Biochem 2-BiocF21	1.0000000
Expert-BiocF21	0.0677658
Biochem 2-BiocF22	1.0000000
Expert-BiocF22	0.0692078
Expert-Biochem 2	0.1226520

# NS grouped by student year (experts form their own group). plotAndTable is
# defined elsewhere in this file; the output rendered below shows per-group
# descriptive statistics and pairwise p-values.
plotAndTable(allBiochem, "actual_year", "NS", "NS: Year", "NS")

Statistics of NS based on the category actual_year
	group1	n	mean	sd	median	min	max	range
X11	Expert	8	0.40	0.06	0.42	0.29	0.47	0.17
X12	first_year	209	0.27	0.19	0.25	-0.36	0.83	1.19
X13	second_year	133	0.23	0.07	0.24	0.04	0.47	0.43
X14	third_year	123	0.25	0.08	0.25	0.04	0.45	0.41

Anova: NS: Year
	Testing statistical significance: p-values
first_year-Expert	0.0482549
second_year-Expert	0.0035754
third_year-Expert	0.0143046
second_year-first_year	0.0228005
third_year-first_year	0.4577446
third_year-second_year	0.6250317

# NS grouped by binary race (White / Non-white). plotAndTable is defined
# elsewhere in this file; the output rendered below shows per-group descriptive
# statistics and a single p-value for the two-group comparison.
plotAndTable(allBiochem, "race_binary", "NS", "NS: White/Non-white", "NS")

Statistics of NS based on the category race_binary
	group1	n	mean	sd	median	min	max	range
X11	Non-white	189	0.23	0.14	0.22	-0.36	0.71	1.07
X12	White	272	0.27	0.13	0.25	-0.13	0.83	0.96

Anova: NS: White/Non-white
Testing statistical significance: p-values
0.0004332

# NS grouped by sex at birth. plotAndTable is defined elsewhere in this file;
# the output rendered below shows per-group descriptive statistics and a single
# p-value for the two-group comparison.
plotAndTable(allBiochem, "Sex_birth", "NS", "NS: Sex", "NS")

Statistics of NS based on the category Sex_birth
	group1	n	mean	sd	median	min	max	range
X11	Female	369	0.26	0.14	0.24	-0.36	0.83	1.19
X12	Male	92	0.24	0.11	0.24	-0.21	0.64	0.85

Anova: NS: Sex
Testing statistical significance: p-values
0.2258991

3.3 PLC/NS clustering

# PLC grouped by performance cluster (Expert, HP, IP, LP in the output below).
# plotAndTable is defined elsewhere in this file; the rendered output shows
# per-group descriptive statistics and pairwise p-values.
plotAndTable(allBiochem, "clusterLetter", "PLC", "PLC: Cluster letter", "PLC")

Statistics of PLC based on the category clusterLetter
	group1	n	mean	sd	median	min	max	range
X11	Expert	8	0.72	0.09	0.70	0.59	0.82	0.23
X12	HP	259	0.43	0.10	0.42	0.23	0.72	0.49
X13	IP	38	0.29	0.11	0.29	0.07	0.46	0.40
X14	LP	168	0.13	0.12	0.16	-0.25	0.39	0.64

Anova: PLC: Cluster letter
	Testing statistical significance: p-values
HP-Expert	0
IP-Expert	0
LP-Expert	0
IP-HP	0
LP-HP	0
LP-IP	0

3.3.1 Analysis by course

# Course against performance cluster for the whole table. plotBarAndCorr is
# defined elsewhere in this file; the output rendered below reports a
# chi-square p-value and a residuals analysis.
plotBarAndCorr(
  allBiochem,
  "Course_collected", "clusterLetter", "Course", "N of students",
  "High, Intermediate, Low Performance cluster"
)

The Chi-square analysis gives a p= 0

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Scatter of NS against PLC, coloured by course: each course is mapped to the
# integer code of its factor level and that code is used as the colour index.
markerIntegers <- as.integer(as.factor(allBiochem$Course_collected))
# The plotting symbol comes from clusterLetter (assumes it is a character
# column, so the first letter of each value is drawn - TODO confirm).
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  xlab = "PLC",
  ylab = "NS",
  main = "ES Glucosidase - High(H), Intermediate(I), Low(L) performers"
)
# unique() keeps first-appearance order for both the labels and their colour
# codes, so each legend entry lines up with the colour used in the plot.
legend(
  "topleft",
  legend = unique(allBiochem$Course_collected),
  col = unique(markerIntegers),
  lty = 1L,
  cex = 0.8
)

# Per-course breakdown of the performance clusters. calcStats is defined
# elsewhere in this file; the output rendered below lists, for each course, the
# share of high/intermediate/low performers overall and split by sex and race.
calcStats(allBiochem, "Course_collected")

Results for category: Gen + Organic 1

		High Performers		Intermediate Performers		Low Performers
Gen + Organic 1	Total N= 109	28 %		32 %		40 %
	Sex: males N= 8 ; females N= 99	male	female	male	female	male	female
	Sex: males N= 8 ; females N= 99	0 %	30 %	38 %	30 %	62 %	39 %
	Race: White N= 62 ; Non-white N= 47	white	non-white	white	non-white	white	non-white
	Race: White N= 62 ; Non-white N= 47	29 %	26 %	35 %	28 %	35 %	47 %

Results for category: O Chem 2

		High Performers		Intermediate Performers		Low Performers
O Chem 2	Total N= 76	61 %		1.3 %		38 %
	Sex: males N= 15 ; females N= 60	male	female	male	female	male	female
	Sex: males N= 15 ; females N= 60	67 %	58 %	0 %	1.7 %	33 %	40 %
	Race: White N= 48 ; Non-white N= 28	white	non-white	white	non-white	white	non-white
	Race: White N= 48 ; Non-white N= 28	71 %	43 %	2.1 %	0 %	27 %	57 %

Results for category: BiocF21

		High Performers		Intermediate Performers		Low Performers
BiocF21	Total N= 58	83 %		0 %		17 %
	Sex: males N= 18 ; females N= 40	male	female	male	female	male	female
	Sex: males N= 18 ; females N= 40	83 %	82 %	0 %	0 %	17 %	18 %
	Race: White N= 32 ; Non-white N= 26	white	non-white	white	non-white	white	non-white
	Race: White N= 32 ; Non-white N= 26	88 %	77 %	0 %	0 %	12 %	23 %

Results for category: Gen Chem 2

		High Performers		Intermediate Performers		Low Performers
Gen Chem 2	Total N= 57	46 %		1.8 %		53 %
	Sex: males N= 18 ; females N= 39	male	female	male	female	male	female
	Sex: males N= 18 ; females N= 39	33 %	51 %	0 %	2.6 %	67 %	46 %
	Race: White N= 33 ; Non-white N= 24	white	non-white	white	non-white	white	non-white
	Race: White N= 33 ; Non-white N= 24	45 %	46 %	3 %	0 %	52 %	54 %

Results for category: O Chem 1

		High Performers		Intermediate Performers		Low Performers
O Chem 1	Total N= 100	51 %		1 %		48 %
	Sex: males N= 19 ; females N= 80	male	female	male	female	male	female
	Sex: males N= 19 ; females N= 80	42 %	54 %	0 %	1.2 %	58 %	45 %
	Race: White N= 56 ; Non-white N= 44	white	non-white	white	non-white	white	non-white
	Race: White N= 56 ; Non-white N= 44	55 %	45 %	0 %	2.3 %	45 %	52 %

Results for category: Biochem 2

		High Performers		Intermediate Performers		Low Performers
Biochem 2	Total N= 22	77 %		0 %		23 %
	Sex: males N= 9 ; females N= 13	male	female	male	female	male	female
	Sex: males N= 9 ; females N= 13	78 %	77 %	0 %	0 %	22 %	23 %
	Race: White N= 14 ; Non-white N= 8	white	non-white	white	non-white	white	non-white
	Race: White N= 14 ; Non-white N= 8	86 %	62 %	0 %	0 %	14 %	38 %

Results for category: BiocF22

		High Performers		Intermediate Performers		Low Performers
BiocF22	Total N= 43	95 %		0 %		4.7 %
	Sex: males N= 5 ; females N= 38	male	female	male	female	male	female
	Sex: males N= 5 ; females N= 38	100 %	95 %	0 %	0 %	0 %	5.3 %
	Race: White N= 28 ; Non-white N= 15	white	non-white	white	non-white	white	non-white
	Race: White N= 28 ; Non-white N= 15	100 %	87 %	0 %	0 %	0 %	13 %

3.3.2 Analysis by year

# Student year against performance cluster for the whole table. plotBarAndCorr
# is defined elsewhere in this file; the output rendered below reports a
# chi-square p-value and a residuals analysis.
plotBarAndCorr(
  allBiochem,
  "actual_year", "clusterLetter", "Year", "N of students",
  "High, Intermediate, Low Performance cluster"
)

The Chi-square analysis gives a p= 0

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Scatter of NS against PLC, coloured by student year: each year is mapped to
# the integer code of its factor level and that code is used as the colour.
markerIntegers <- as.integer(as.factor(allBiochem$actual_year))
# The plotting symbol comes from clusterLetter (assumes it is a character
# column, so the first letter of each value is drawn - TODO confirm).
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  xlab = "PLC",
  ylab = "NS",
  main = "ES Glucosidase - High(H), Intermediate(I), Low(L) performers"
)
# unique() keeps first-appearance order for both the labels and their colour
# codes, so each legend entry lines up with the colour used in the plot.
legend(
  "topleft",
  legend = unique(allBiochem$actual_year),
  col = unique(markerIntegers),
  lty = 1L,
  cex = 0.8
)

# Per-year breakdown of the performance clusters. calcStats2 is defined
# elsewhere in this file; the output rendered below lists, for each year, the
# share of high/intermediate/low performers overall and split by sex and race.
# NOTE(review): the per-course breakdown calls calcStats instead - how the two
# helpers differ is not visible here.
calcStats2(allBiochem, "actual_year")

Results for category: first_year

		High Performers		Intermediate Performers		Low Performers
first_year	Total N= 209	39 %		17 %		44 %
	Sex: males N= 27 ; females N= 179	male	female	male	female	male	female
	Sex: males N= 27 ; females N= 179	30 %	41 %	11 %	17 %	59 %	42 %
	Race: White N= 118 ; Non-white N= 91	white	non-white	white	non-white	white	non-white
	Race: White N= 118 ; Non-white N= 91	42 %	35 %	19 %	15 %	40 %	49 %

Results for category: second_year

		High Performers		Intermediate Performers		Low Performers
second_year	Total N= 133	54 %		1.5 %		44 %
	Sex: males N= 33 ; females N= 99	male	female	male	female	male	female
	Sex: males N= 33 ; females N= 99	48 %	56 %	0 %	2 %	52 %	42 %
	Race: White N= 81 ; Non-white N= 52	white	non-white	white	non-white	white	non-white
	Race: White N= 81 ; Non-white N= 52	60 %	44 %	2.5 %	0 %	37 %	56 %

Results for category: third_year

		High Performers		Intermediate Performers		Low Performers
third_year	Total N= 123	86 %		0 %		14 %
	Sex: males N= 32 ; females N= 91	male	female	male	female	male	female
	Sex: males N= 32 ; females N= 91	84 %	87 %	0 %	0 %	16 %	13 %
	Race: White N= 74 ; Non-white N= 49	white	non-white	white	non-white	white	non-white
	Race: White N= 74 ; Non-white N= 49	92 %	78 %	0 %	0 %	8.1 %	22 %

# Emit a bold HTML heading for the per-year chi-square analyses that follow.
# The line break is written as <br>: HTML defines no closing </br> tag, and
# parsers only accept that spelling through error recovery.
cat("<b>Chi-square analysis of Performance by Sex and Race considering different years</b><br>")

Chi-square analysis of Performance by Sex and Race considering different years

# First-year students only: sex at birth against performance cluster.
# plotBarAndCorr is defined elsewhere in this file; the output rendered below
# reports a chi-square p-value and a residuals analysis for this subset.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "first_year", ],
  "Sex_birth", "clusterLetter", "Sex", "N of students",
  "Performance by Sex 1st year"
)

The Chi-square analysis gives a p= 0.2357

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Second-year students only: sex at birth against performance cluster.
# plotBarAndCorr is defined elsewhere in this file; the output rendered below
# reports a chi-square p-value and a residuals analysis for this subset.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "second_year", ],
  "Sex_birth", "clusterLetter", "Sex", "N of students",
  "Performance by Sex 2nd year"
)

The Chi-square analysis gives a p= 0.50805

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Third-year students only: sex at birth against performance cluster.
# plotBarAndCorr is defined elsewhere in this file; the output rendered below
# reports a chi-square p-value and a residuals analysis for this subset.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "third_year", ],
  "Sex_birth", "clusterLetter", "Sex", "N of students",
  "Performance by Sex 3rd year"
)

The Chi-square analysis gives a p= 0.96331

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# First-year students only: binary race (White / Non-white) against
# performance cluster. plotBarAndCorr is defined elsewhere in this file; the
# output rendered below reports a chi-square p-value and a residuals analysis.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "first_year", ],
  "race_binary", "clusterLetter", "Race", "N of students",
  "Performance by Race 1st year"
)

The Chi-square analysis gives a p= 0.38034

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Second-year students only: binary race (White / Non-white) against
# performance cluster. plotBarAndCorr is defined elsewhere in this file; the
# output rendered below reports a chi-square p-value and a residuals analysis.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "second_year", ],
  "race_binary", "clusterLetter", "Race", "N of students",
  "Performance by Race 2nd year"
)

The Chi-square analysis gives a p= 0.06938

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Third-year students only: binary race (White / Non-white) against
# performance cluster. plotBarAndCorr is defined elsewhere in this file; the
# output rendered below reports a chi-square p-value and a residuals analysis.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "third_year", ],
  "race_binary", "clusterLetter", "Race", "N of students",
  "Performance by Race 3rd year"
)

The Chi-square analysis gives a p= 0.04667

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

4 UMR courses: Nucleic Acids

4.1 PLC only: Anova

We test whether the PLC score differs significantly among the groups within each of the categories “Course collected”, “Student year”, “White/Non-white”, and “Sex at birth”.

# Build the analysis table for the Nucleic Acids survey: umrs3 holds the UMR
# student rows and exs3 the expert rows for that survey (both are subset at the
# top of the file). addExperts is defined elsewhere; presumably it appends the
# expert rows, since an "Expert" group appears in the tables rendered below.
allBiochem <- analyzeUMRCourses(umrs3)
allBiochem <- addExperts(allBiochem, exs3)
# buildTables(allBiochem)  # disabled call, kept for reference
# PLC by course: per the output rendered below, plotAndTable reports per-group
# descriptive statistics followed by pairwise p-values.
plotAndTable(allBiochem, "Course_collected", "PLC", "PLC: Course", "PLC")

Statistics of PLC based on the category Course_collected
	group1	n	mean	sd	median	min	max	range
X11	Gen + Organic 1	82	0.06	0.14	0.07	-0.79	0.43	1.23
X12	O Chem 1	107	0.11	0.12	0.11	-0.21	0.37	0.58
X13	O Chem 2	61	0.12	0.12	0.14	-0.33	0.40	0.73
X14	Gen Chem 2	58	0.10	0.14	0.09	-0.27	0.42	0.69
X15	BiocF21	53	0.14	0.15	0.14	-0.17	0.56	0.72
X16	BiocF22	36	0.16	0.12	0.15	-0.08	0.39	0.47
X17	Biochem 2	24	0.19	0.19	0.22	-0.18	0.56	0.74
X18	Expert	7	0.71	0.08	0.69	0.60	0.82	0.22

Anova: PLC: Course
	Testing statistical significance: p-values
O Chem 1-Gen + Organic 1	0.3117001
O Chem 2-Gen + Organic 1	0.1952197
Gen Chem 2-Gen + Organic 1	0.5777922
BiocF21-Gen + Organic 1	0.0128166
BiocF22-Gen + Organic 1	0.0107056
Biochem 2-Gen + Organic 1	0.0006625
Expert-Gen + Organic 1	0.0000000
O Chem 2-O Chem 1	0.9992456
Gen Chem 2-O Chem 1	1.0000000
BiocF21-O Chem 1	0.7089232
BiocF22-O Chem 1	0.5273523
Biochem 2-O Chem 1	0.0750461
Expert-O Chem 1	0.0000000
Gen Chem 2-O Chem 2	0.9991966
BiocF21-O Chem 2	0.9729913
BiocF22-O Chem 2	0.8814935
Biochem 2-O Chem 2	0.2727323
Expert-O Chem 2	0.0000000
BiocF21-Gen Chem 2	0.7815819
BiocF22-Gen Chem 2	0.6046273
Biochem 2-Gen Chem 2	0.1090752
Expert-Gen Chem 2	0.0000000
BiocF22-BiocF21	0.9998746
Biochem 2-BiocF21	0.7937217
Expert-BiocF21	0.0000000
Biochem 2-BiocF22	0.9624613
Expert-BiocF22	0.0000000
Expert-Biochem 2	0.0000000

# PLC grouped by student year (experts form their own group). plotAndTable is
# defined elsewhere in this file; the output rendered below shows per-group
# descriptive statistics and pairwise p-values.
plotAndTable(allBiochem, "actual_year", "PLC", "PLC: Year", "PLC")

Statistics of PLC based on the category actual_year
	group1	n	mean	sd	median	min	max	range
X11	Expert	7	0.71	0.08	0.69	0.60	0.82	0.22
X12	first_year	189	0.09	0.13	0.09	-0.79	0.43	1.23
X13	second_year	119	0.11	0.13	0.12	-0.33	0.42	0.76
X14	third_year	113	0.16	0.15	0.16	-0.18	0.56	0.74

Anova: PLC: Year
	Testing statistical significance: p-values
first_year-Expert	0.0000000
second_year-Expert	0.0000000
third_year-Expert	0.0000000
second_year-first_year	0.3984704
third_year-first_year	0.0000594
third_year-second_year	0.0414213

# PLC grouped by binary race (White / Non-white). plotAndTable is defined
# elsewhere in this file; the output rendered below shows per-group descriptive
# statistics and a single p-value for the two-group comparison.
plotAndTable(allBiochem, "race_binary", "PLC", "PLC: White/Non-white", "PLC")

Statistics of PLC based on the category race_binary
	group1	n	mean	sd	median	min	max	range
X11	Non-white	173	0.10	0.15	0.09	-0.79	0.56	1.35
X12	White	244	0.12	0.13	0.12	-0.27	0.52	0.79

Anova: PLC: White/Non-white
Testing statistical significance: p-values
0.1649182

# PLC grouped by sex at birth. plotAndTable is defined elsewhere in this file;
# the output rendered below shows per-group descriptive statistics and a single
# p-value for the two-group comparison.
plotAndTable(allBiochem, "Sex_birth", "PLC", "PLC: Sex", "PLC")

Statistics of PLC based on the category Sex_birth
	group1	n	mean	sd	median	min	max	range
X11	Female	338	0.11	0.14	0.11	-0.79	0.56	1.35
X12	Male	79	0.14	0.13	0.14	-0.17	0.43	0.59

Anova: PLC: Sex
Testing statistical significance: p-values
0.0802224

4.2 NS only: Anova

# NS grouped by course. plotAndTable is defined elsewhere in this file; the
# output rendered below shows per-group descriptive statistics and pairwise
# p-values.
plotAndTable(allBiochem, "Course_collected", "NS", "NS: Course", "NS")

Statistics of NS based on the category Course_collected
	group1	n	mean	sd	median	min	max	range
X11	Gen + Organic 1	82	0.13	0.06	0.13	0.04	0.29	0.25
X12	O Chem 1	107	0.16	0.07	0.15	0.04	0.40	0.36
X13	O Chem 2	61	0.16	0.06	0.15	0.04	0.27	0.23
X14	Gen Chem 2	58	0.16	0.07	0.15	0.06	0.32	0.25
X15	BiocF21	53	0.17	0.08	0.15	0.00	0.35	0.35
X16	BiocF22	36	0.17	0.07	0.18	0.03	0.30	0.27
X17	Biochem 2	24	0.18	0.09	0.17	0.04	0.41	0.37
X18	Expert	7	0.43	0.08	0.44	0.33	0.53	0.20

Anova: NS: Course
	Testing statistical significance: p-values
O Chem 1-Gen + Organic 1	0.1222132
O Chem 2-Gen + Organic 1	0.4688702
Gen Chem 2-Gen + Organic 1	0.2024925
BiocF21-Gen + Organic 1	0.0570943
BiocF22-Gen + Organic 1	0.1233538
Biochem 2-Gen + Organic 1	0.0502438
Expert-Gen + Organic 1	0.0000000
O Chem 2-O Chem 1	0.9999693
Gen Chem 2-O Chem 1	0.9999997
BiocF21-O Chem 1	0.9941082
BiocF22-O Chem 1	0.9955670
Biochem 2-O Chem 1	0.8716951
Expert-O Chem 1	0.0000000
Gen Chem 2-O Chem 2	0.9997855
BiocF21-O Chem 2	0.9749227
BiocF22-O Chem 2	0.9809149
Biochem 2-O Chem 2	0.7995603
Expert-O Chem 2	0.0000000
BiocF21-Gen Chem 2	0.9994726
BiocF22-Gen Chem 2	0.9994520
Biochem 2-Gen Chem 2	0.9448838
Expert-Gen Chem 2	0.0000000
BiocF22-BiocF21	1.0000000
Biochem 2-BiocF21	0.9962882
Expert-BiocF21	0.0000000
Biochem 2-BiocF22	0.9984874
Expert-BiocF22	0.0000000
Expert-Biochem 2	0.0000000

# NS grouped by student year (experts form their own group). plotAndTable is
# defined elsewhere in this file; the output rendered below shows per-group
# descriptive statistics and pairwise p-values.
plotAndTable(allBiochem, "actual_year", "NS", "NS: Year", "NS")

Statistics of NS based on the category actual_year
	group1	n	mean	sd	median	min	max	range
X11	Expert	7	0.43	0.08	0.44	0.33	0.53	0.20
X12	first_year	189	0.15	0.06	0.14	0.04	0.40	0.36
X13	second_year	119	0.16	0.06	0.15	0.04	0.32	0.28
X14	third_year	113	0.17	0.08	0.16	0.00	0.41	0.41

Anova: NS: Year
	Testing statistical significance: p-values
first_year-Expert	0.0000000
second_year-Expert	0.0000000
third_year-Expert	0.0000000
second_year-first_year	0.5347359
third_year-first_year	0.0210884
third_year-second_year	0.4892358

# NS grouped by binary race (White / Non-white). plotAndTable is defined
# elsewhere in this file; the output rendered below shows per-group descriptive
# statistics and a single p-value for the two-group comparison.
plotAndTable(allBiochem, "race_binary", "NS", "NS: White/Non-white", "NS")

Statistics of NS based on the category race_binary
	group1	n	mean	sd	median	min	max	range
X11	Non-white	173	0.15	0.07	0.14	0.03	0.40	0.37
X12	White	244	0.16	0.07	0.15	0.00	0.41	0.41

Anova: NS: White/Non-white
Testing statistical significance: p-values
0.3201503

# NS grouped by sex at birth. plotAndTable is defined elsewhere in this file;
# the output rendered below shows per-group descriptive statistics and a single
# p-value for the two-group comparison.
plotAndTable(allBiochem, "Sex_birth", "NS", "NS: Sex", "NS")

Statistics of NS based on the category Sex_birth
	group1	n	mean	sd	median	min	max	range
X11	Female	338	0.15	0.07	0.14	0.00	0.41	0.41
X12	Male	79	0.17	0.07	0.16	0.04	0.40	0.36

Anova: NS: Sex
Testing statistical significance: p-values
0.1498567

4.3 PLC/NS clustering

# PLC grouped by performance cluster (Expert, HP, IP, LP in the output below).
# plotAndTable is defined elsewhere in this file; the rendered output shows
# per-group descriptive statistics and pairwise p-values.
plotAndTable(allBiochem, "clusterLetter", "PLC", "PLC: Cluster letter", "PLC")

Statistics of PLC based on the category clusterLetter
	group1	n	mean	sd	median	min	max	range
X11	Expert	7	0.71	0.08	0.69	0.60	0.82	0.22
X12	HP	75	0.29	0.09	0.28	0.09	0.56	0.47
X13	IP	201	0.14	0.07	0.14	-0.08	0.38	0.46
X14	LP	145	-0.01	0.11	0.01	-0.79	0.14	0.94

Anova: PLC: Cluster letter
	Testing statistical significance: p-values
HP-Expert	0
IP-Expert	0
LP-Expert	0
IP-HP	0
LP-HP	0
LP-IP	0

4.3.1 Analysis by course

# Course against performance cluster for the whole table. plotBarAndCorr is
# defined elsewhere in this file; the output rendered below reports a
# chi-square p-value and a residuals analysis.
plotBarAndCorr(
  allBiochem,
  "Course_collected", "clusterLetter", "Course", "N of students",
  "High, Intermediate, Low Performance cluster"
)

The Chi-square analysis gives a p= 1e-04

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Scatter of NS against PLC, coloured by course: each course is mapped to the
# integer code of its factor level and that code is used as the colour index.
markerIntegers <- as.integer(as.factor(allBiochem$Course_collected))
# The plotting symbol comes from clusterLetter (assumes it is a character
# column, so the first letter of each value is drawn - TODO confirm).
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  xlab = "PLC",
  ylab = "NS",
  main = "Nucleic Acids - High(H), Intermediate(I), Low(L) performers"
)
# unique() keeps first-appearance order for both the labels and their colour
# codes, so each legend entry lines up with the colour used in the plot.
legend(
  "topleft",
  legend = unique(allBiochem$Course_collected),
  col = unique(markerIntegers),
  lty = 1L,
  cex = 0.8
)

# Per-course breakdown of the performance clusters. calcStats is defined
# elsewhere in this file; the output rendered below lists, for each course, the
# share of high/intermediate/low performers overall and split by sex and race.
calcStats(allBiochem, "Course_collected")

Results for category: Gen + Organic 1

		High Performers		Intermediate Performers		Low Performers
Gen + Organic 1	Total N= 82	6.1 %		41 %		52 %
	Sex: males N= 3 ; females N= 77	male	female	male	female	male	female
	Sex: males N= 3 ; females N= 77	0 %	6.5 %	33 %	42 %	67 %	52 %
	Race: White N= 46 ; Non-white N= 36	white	non-white	white	non-white	white	non-white
	Race: White N= 46 ; Non-white N= 36	4.3 %	8.3 %	41 %	42 %	54 %	50 %

Results for category: O Chem 2

		High Performers		Intermediate Performers		Low Performers
O Chem 2	Total N= 61	11 %		67 %		21 %
	Sex: males N= 11 ; females N= 49	male	female	male	female	male	female
	Sex: males N= 11 ; females N= 49	9.1 %	12 %	73 %	67 %	18 %	20 %
	Race: White N= 36 ; Non-white N= 25	white	non-white	white	non-white	white	non-white
	Race: White N= 36 ; Non-white N= 25	14 %	8 %	75 %	56 %	11 %	36 %

Results for category: BiocF21

		High Performers		Intermediate Performers		Low Performers
BiocF21	Total N= 53	25 %		47 %		28 %
	Sex: males N= 15 ; females N= 38	male	female	male	female	male	female
	Sex: males N= 15 ; females N= 38	27 %	24 %	53 %	45 %	20 %	32 %
	Race: White N= 29 ; Non-white N= 24	white	non-white	white	non-white	white	non-white
	Race: White N= 29 ; Non-white N= 24	28 %	21 %	55 %	38 %	17 %	42 %

Results for category: O Chem 1

		High Performers		Intermediate Performers		Low Performers
O Chem 1	Total N= 107	17 %		48 %		36 %
	Sex: males N= 19 ; females N= 87	male	female	male	female	male	female
	Sex: males N= 19 ; females N= 87	21 %	16 %	47 %	47 %	32 %	37 %
	Race: White N= 61 ; Non-white N= 46	white	non-white	white	non-white	white	non-white
	Race: White N= 61 ; Non-white N= 46	18 %	15 %	52 %	41 %	30 %	43 %

Results for category: Gen Chem 2

		High Performers		Intermediate Performers		Low Performers
Gen Chem 2	Total N= 58	19 %		45 %		36 %
	Sex: males N= 18 ; females N= 40	male	female	male	female	male	female
	Sex: males N= 18 ; females N= 40	28 %	15 %	44 %	45 %	28 %	40 %
	Race: White N= 32 ; Non-white N= 26	white	non-white	white	non-white	white	non-white
	Race: White N= 32 ; Non-white N= 26	22 %	15 %	41 %	50 %	38 %	35 %

Results for category: Biochem 2

		High Performers		Intermediate Performers		Low Performers
Biochem 2	Total N= 24	38 %		33 %		29 %
	Sex: males N= 9 ; females N= 15	male	female	male	female	male	female
	Sex: males N= 9 ; females N= 15	22 %	47 %	56 %	20 %	22 %	33 %
	Race: White N= 17 ; Non-white N= 7	white	non-white	white	non-white	white	non-white
	Race: White N= 17 ; Non-white N= 7	41 %	29 %	24 %	57 %	35 %	14 %

Results for category: BiocF22

		High Performers		Intermediate Performers		Low Performers
BiocF22	Total N= 36	33 %		44 %		22 %
	Sex: males N= 4 ; females N= 32	male	female	male	female	male	female
	Sex: males N= 4 ; females N= 32	25 %	34 %	75 %	41 %	0 %	25 %
	Race: White N= 24 ; Non-white N= 12	white	non-white	white	non-white	white	non-white
	Race: White N= 24 ; Non-white N= 12	33 %	33 %	50 %	33 %	17 %	33 %

4.3.2 Analysis by year

# Student year against performance cluster for the whole table. plotBarAndCorr
# is defined elsewhere in this file; the output rendered below reports a
# chi-square p-value and a residuals analysis.
plotBarAndCorr(
  allBiochem,
  "actual_year", "clusterLetter", "Year", "N of students",
  "High, Intermediate, Low Performance cluster"
)

The Chi-square analysis gives a p= 0.00012

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Scatter of NS against PLC, coloured by student year: each year is mapped to
# the integer code of its factor level and that code is used as the colour.
markerIntegers <- as.integer(as.factor(allBiochem$actual_year))
# The plotting symbol comes from clusterLetter (assumes it is a character
# column, so the first letter of each value is drawn - TODO confirm).
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  xlab = "PLC",
  ylab = "NS",
  main = "Nucleic Acids - High(H), Intermediate(I), Low(L) performers"
)
# unique() keeps first-appearance order for both the labels and their colour
# codes, so each legend entry lines up with the colour used in the plot.
legend(
  "topleft",
  legend = unique(allBiochem$actual_year),
  col = unique(markerIntegers),
  lty = 1L,
  cex = 0.8
)

# Per-year breakdown of the performance clusters. calcStats2 is defined
# elsewhere in this file; the output rendered below lists, for each year, the
# share of high/intermediate/low performers overall and split by sex and race.
# NOTE(review): the per-course breakdown calls calcStats instead - how the two
# helpers differ is not visible here.
calcStats2(allBiochem, "actual_year")

Results for category: first_year

		High Performers		Intermediate Performers		Low Performers
first_year	Total N= 189	12 %		45 %		43 %
	Sex: males N= 22 ; females N= 164	male	female	male	female	male	female
	Sex: males N= 22 ; females N= 164	18 %	12 %	45 %	45 %	36 %	44 %
	Race: White N= 107 ; Non-white N= 82	white	non-white	white	non-white	white	non-white
	Race: White N= 107 ; Non-white N= 82	12 %	12 %	48 %	41 %	40 %	46 %

Results for category: second_year

		High Performers		Intermediate Performers		Low Performers
second_year	Total N= 119	15 %		56 %		29 %
	Sex: males N= 29 ; females N= 89	male	female	male	female	male	female
	Sex: males N= 29 ; females N= 89	21 %	13 %	55 %	57 %	24 %	29 %
	Race: White N= 68 ; Non-white N= 51	white	non-white	white	non-white	white	non-white
	Race: White N= 68 ; Non-white N= 51	18 %	12 %	59 %	53 %	24 %	35 %

Results for category: third_year

		High Performers		Intermediate Performers		Low Performers
third_year	Total N= 113	30 %		43 %		27 %
	Sex: males N= 28 ; females N= 85	male	female	male	female	male	female
	Sex: males N= 28 ; females N= 85	25 %	32 %	57 %	39 %	18 %	29 %
	Race: White N= 70 ; Non-white N= 43	white	non-white	white	non-white	white	non-white
	Race: White N= 70 ; Non-white N= 43	33 %	26 %	46 %	40 %	21 %	35 %

# Emit a bold HTML heading for the per-year chi-square analyses that follow.
# The line break is written as <br>: HTML defines no closing </br> tag, and
# parsers only accept that spelling through error recovery.
cat("<b>Chi-square analysis of Performance by Sex and Race considering different years</b><br>")

Chi-square analysis of Performance by Sex and Race considering different years

# First-year students only: sex at birth against performance cluster.
# plotBarAndCorr is defined elsewhere in this file; the output rendered below
# reports a chi-square p-value and a residuals analysis for this subset.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "first_year", ],
  "Sex_birth", "clusterLetter", "Sex", "N of students",
  "Performance by Sex 1st year"
)

The Chi-square analysis gives a p= 0.62414

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Second-year students only: sex at birth against performance cluster.
# plotBarAndCorr is defined elsewhere in this file; the output rendered below
# reports a chi-square p-value and a residuals analysis for this subset.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "second_year", ],
  "Sex_birth", "clusterLetter", "Sex", "N of students",
  "Performance by Sex 2nd year"
)

The Chi-square analysis gives a p= 0.61766

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Third-year students only: sex at birth against performance cluster.
# plotBarAndCorr is defined elsewhere in this file; the output rendered below
# reports a chi-square p-value and a residuals analysis for this subset.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "third_year", ],
  "Sex_birth", "clusterLetter", "Sex", "N of students",
  "Performance by Sex 3rd year"
)

The Chi-square analysis gives a p= 0.22207

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# First-year students only: binary race (White / Non-white) against
# performance cluster. plotBarAndCorr is defined elsewhere in this file; the
# output rendered below reports a chi-square p-value and a residuals analysis.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "first_year", ],
  "race_binary", "clusterLetter", "Race", "N of students",
  "Performance by Race 1st year"
)

The Chi-square analysis gives a p= 0.66791

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Second-year students only: binary race (White / Non-white) against
# performance cluster. plotBarAndCorr is defined elsewhere in this file; the
# output rendered below reports a chi-square p-value and a residuals analysis.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "second_year", ],
  "race_binary", "clusterLetter", "Race", "N of students",
  "Performance by Race 2nd year"
)

The Chi-square analysis gives a p= 0.32343

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Third-year students only: binary race (White / Non-white) against
# performance cluster. plotBarAndCorr is defined elsewhere in this file; the
# output rendered below reports a chi-square p-value and a residuals analysis.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "third_year", ],
  "race_binary", "clusterLetter", "Race", "N of students",
  "Performance by Race 3rd year"
)

The Chi-square analysis gives a p= 0.2837

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

5 UMR courses: Oxygen Binding

5.1 PLC only: Anova

We test whether the PLC score differs significantly among the groups within each of the categories “Course collected”, “Student year”, “White/Non-white”, and “Sex at birth”.

# Build the analysis table for the Oxygen Binding survey: umrs4 holds the UMR
# student rows and exs4 the expert rows for that survey (both are subset at the
# top of the file). addExperts is defined elsewhere; presumably it appends the
# expert rows, since an "Expert" group appears in the tables rendered below.
allBiochem <- analyzeUMRCourses(umrs4)
allBiochem <- addExperts(allBiochem, exs4)
# buildTables(allBiochem)  # disabled call, kept for reference
# PLC by course: per the output rendered below, plotAndTable reports per-group
# descriptive statistics followed by pairwise p-values.
plotAndTable(allBiochem, "Course_collected", "PLC", "PLC: Course", "PLC")

Statistics of PLC based on the category Course_collected
	group1	n	mean	sd	median	min	max	range
X11	Gen + Organic 1	94	0.17	0.13	0.19	-0.17	0.51	0.68
X12	O Chem 1	126	0.18	0.12	0.18	-0.19	0.41	0.60
X13	O Chem 2	65	0.18	0.14	0.19	-0.20	0.48	0.68
X14	Gen Chem 2	67	0.21	0.13	0.24	-0.21	0.42	0.63
X15	BiocF21	53	0.22	0.26	0.18	-0.25	0.86	1.11
X16	BiocF22	37	0.20	0.13	0.20	-0.04	0.42	0.46
X17	Biochem 2	26	0.32	0.23	0.30	-0.09	0.70	0.80
X18	Expert	15	0.69	0.13	0.66	0.52	0.89	0.38

Anova: PLC: Course
	Testing statistical significance: p-values
O Chem 1-Gen + Organic 1	0.9995797
O Chem 2-Gen + Organic 1	0.9997234
Gen Chem 2-Gen + Organic 1	0.7022747
BiocF21-Gen + Organic 1	0.5671070
BiocF22-Gen + Organic 1	0.9934641
Biochem 2-Gen + Organic 1	0.0005910
Expert-Gen + Organic 1	0.0000000
O Chem 2-O Chem 1	1.0000000
Gen Chem 2-O Chem 1	0.8956474
BiocF21-O Chem 1	0.7851551
BiocF22-O Chem 1	0.9998398
Biochem 2-O Chem 1	0.0013717
Expert-O Chem 1	0.0000000
Gen Chem 2-O Chem 2	0.9585310
BiocF21-O Chem 2	0.8909605
BiocF22-O Chem 2	0.9999571
Biochem 2-O Chem 2	0.0048459
Expert-O Chem 2	0.0000000
BiocF21-Gen Chem 2	0.9999902
BiocF22-Gen Chem 2	0.9992966
Biochem 2-Gen Chem 2	0.0661446
Expert-Gen Chem 2	0.0000000
BiocF22-BiocF21	0.9937191
Biochem 2-BiocF21	0.1530198
Expert-BiocF21	0.0000000
Biochem 2-BiocF22	0.0417856
Expert-BiocF22	0.0000000
Expert-Biochem 2	0.0000000

# PLC by actual_year: the report prints per-group statistics and pairwise p-values.
plotAndTable(
  allBiochem,
  "actual_year", "PLC",
  "PLC: Year", "PLC"
)

Statistics of PLC based on the category actual_year
	group1	n	mean	sd	median	min	max	range
X11	Expert	15	0.69	0.13	0.66	0.52	0.89	0.38
X12	first_year	220	0.18	0.12	0.18	-0.19	0.51	0.71
X13	second_year	132	0.20	0.13	0.22	-0.21	0.48	0.69
X14	third_year	116	0.23	0.22	0.20	-0.25	0.86	1.11

Anova: PLC: Year
	Testing statistical significance: p-values
first_year-Expert	0.0000000
second_year-Expert	0.0000000
third_year-Expert	0.0000000
second_year-first_year	0.6208045
third_year-first_year	0.0082881
third_year-second_year	0.2616615

# PLC by race_binary: the report prints per-group statistics and a single p-value.
plotAndTable(
  allBiochem,
  "race_binary", "PLC",
  "PLC: White/Non-white", "PLC"
)

Statistics of PLC based on the category race_binary
	group1	n	mean	sd	median	min	max	range
X11	Non-white	195	0.17	0.16	0.17	-0.25	0.86	1.11
X12	White	269	0.22	0.15	0.22	-0.21	0.70	0.92

Anova: PLC: White/Non-white
Testing statistical significance: p-values
0.001165

# PLC by Sex_birth: the report prints per-group statistics and a single p-value.
plotAndTable(
  allBiochem,
  "Sex_birth", "PLC",
  "PLC: Sex", "PLC"
)

Statistics of PLC based on the category Sex_birth
	group1	n	mean	sd	median	min	max	range
X11	Female	372	0.19	0.16	0.2	-0.25	0.80	1.05
X12	Male	92	0.21	0.16	0.2	-0.09	0.86	0.95

Anova: PLC: Sex
Testing statistical significance: p-values
0.4297313

5.2 NS only: Anova

# NS by Course_collected: the report prints per-course statistics and pairwise p-values.
plotAndTable(
  allBiochem,
  "Course_collected", "NS",
  "NS: Course", "NS"
)

Statistics of NS based on the category Course_collected
	group1	n	mean	sd	median	min	max	range
X11	Gen + Organic 1	94	0.18	0.06	0.17	0.04	0.32	0.28
X12	O Chem 1	126	0.18	0.06	0.17	0.04	0.37	0.33
X13	O Chem 2	65	0.18	0.08	0.17	0.00	0.41	0.41
X14	Gen Chem 2	67	0.19	0.07	0.19	0.04	0.44	0.39
X15	BiocF21	53	0.21	0.09	0.21	0.06	0.44	0.38
X16	BiocF22	37	0.17	0.06	0.17	0.04	0.30	0.26
X17	Biochem 2	26	0.22	0.08	0.21	0.05	0.41	0.36
X18	Expert	15	0.35	0.09	0.35	0.25	0.53	0.28

Anova: NS: Course
	Testing statistical significance: p-values
O Chem 1-Gen + Organic 1	0.9999998
O Chem 2-Gen + Organic 1	1.0000000
Gen Chem 2-Gen + Organic 1	0.9996865
BiocF21-Gen + Organic 1	0.2186001
BiocF22-Gen + Organic 1	0.9949659
Biochem 2-Gen + Organic 1	0.3220053
Expert-Gen + Organic 1	0.0000000
O Chem 2-O Chem 1	1.0000000
Gen Chem 2-O Chem 1	0.9975965
BiocF21-O Chem 1	0.1205696
BiocF22-O Chem 1	0.9978219
Biochem 2-O Chem 1	0.2314305
Expert-O Chem 1	0.0000000
Gen Chem 2-O Chem 2	0.9996240
BiocF21-O Chem 2	0.2807652
BiocF22-O Chem 2	0.9977051
Biochem 2-O Chem 2	0.3582391
Expert-O Chem 2	0.0000000
BiocF21-Gen Chem 2	0.5773117
BiocF22-Gen Chem 2	0.9555810
Biochem 2-Gen Chem 2	0.6071783
Expert-Gen Chem 2	0.0000000
BiocF22-BiocF21	0.1393354
Biochem 2-BiocF21	0.9999812
Expert-BiocF21	0.0000000
Biochem 2-BiocF22	0.1865267
Expert-BiocF22	0.0000000
Expert-Biochem 2	0.0000001

# NS by actual_year: the report prints per-group statistics and pairwise p-values.
plotAndTable(
  allBiochem,
  "actual_year", "NS",
  "NS: Year", "NS"
)

Statistics of NS based on the category actual_year
	group1	n	mean	sd	median	min	max	range
X11	Expert	15	0.35	0.09	0.35	0.25	0.53	0.28
X12	first_year	220	0.18	0.06	0.17	0.04	0.37	0.33
X13	second_year	132	0.18	0.07	0.18	0.00	0.44	0.44
X14	third_year	116	0.20	0.08	0.20	0.04	0.44	0.40

Anova: NS: Year
	Testing statistical significance: p-values
first_year-Expert	0.0000000
second_year-Expert	0.0000000
third_year-Expert	0.0000000
second_year-first_year	0.9715375
third_year-first_year	0.0892992
third_year-second_year	0.3068437

# NS by race_binary: the report prints per-group statistics and a single p-value.
plotAndTable(
  allBiochem,
  "race_binary", "NS",
  "NS: White/Non-white", "NS"
)

Statistics of NS based on the category race_binary
	group1	n	mean	sd	median	min	max	range
X11	Non-white	195	0.18	0.07	0.17	0.04	0.44	0.40
X12	White	269	0.19	0.07	0.18	0.00	0.44	0.44

Anova: NS: White/Non-white
Testing statistical significance: p-values
0.1920829

# NS by Sex_birth: the report prints per-group statistics and a single p-value.
plotAndTable(
  allBiochem,
  "Sex_birth", "NS",
  "NS: Sex", "NS"
)

Statistics of NS based on the category Sex_birth
	group1	n	mean	sd	median	min	max	range
X11	Female	372	0.18	0.07	0.18	0.00	0.44	0.44
X12	Male	92	0.19	0.08	0.18	0.05	0.44	0.39

Anova: NS: Sex
Testing statistical significance: p-values
0.4814695

5.3 PLC/NS clustering

# PLC by clusterLetter: the report prints per-cluster statistics and pairwise p-values.
plotAndTable(
  allBiochem,
  "clusterLetter", "PLC",
  "PLC: Cluster letter", "PLC"
)

Statistics of PLC based on the category clusterLetter
	group1	n	mean	sd	median	min	max	range
X11	Expert	15	0.69	0.13	0.66	0.52	0.89	0.38
X12	HP	78	0.40	0.15	0.36	0.13	0.86	0.72
X13	IP	243	0.22	0.09	0.22	-0.01	0.43	0.44
X14	LP	147	0.05	0.11	0.06	-0.25	0.32	0.57

Anova: PLC: Cluster letter
	Testing statistical significance: p-values
HP-Expert	0
IP-Expert	0
LP-Expert	0
IP-HP	0
LP-HP	0
LP-IP	0

5.3.1 Analysis by course

# All rows: Course_collected crossed with clusterLetter.
# The report prints the chi-square p-value right after this call.
plotBarAndCorr(
  allBiochem,
  "Course_collected", "clusterLetter",
  "Course", "N of students",
  "High, Intermediate, Low Performance cluster"
)

The Chi-square analysis gives a p= 1e-05

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# One integer colour code per distinct Course_collected value.
markerIntegers <- as.integer(as.factor(allBiochem$Course_collected))
# NS against PLC; pch is given the clusterLetter column, col the course codes.
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  main = "Oxygen Binding - High(H), Intermediate(I), Low(L) performers",
  xlab = "PLC",
  ylab = "NS"
)
# unique() keeps first-appearance order for both the labels and their codes,
# so the i-th legend label is paired with the i-th colour.
legend(
  "topleft",
  legend = unique(allBiochem$Course_collected),
  col = unique(markerIntegers),
  lty = 1,
  cex = 0.8
)

# Per-course breakdown; the report prints, for each Course_collected value, the
# share of High/Intermediate/Low performers split by sex and by race.
calcStats(
  allBiochem,
  "Course_collected"
)

Results for category: Gen + Organic 1

		High Performers		Intermediate Performers		Low Performers
Gen + Organic 1	Total N= 94	9.6 %		55 %		35 %
	Sex: males N= 7 ; females N= 85	male	female	male	female	male	female
	Sex: males N= 7 ; females N= 85	0 %	9.4 %	29 %	58 %	71 %	33 %
	Race: White N= 50 ; Non-white N= 44	white	non-white	white	non-white	white	non-white
	Race: White N= 50 ; Non-white N= 44	6 %	14 %	62 %	48 %	32 %	39 %

Results for category: O Chem 2

		High Performers		Intermediate Performers		Low Performers
O Chem 2	Total N= 65	17 %		55 %		28 %
	Sex: males N= 12 ; females N= 52	male	female	male	female	male	female
	Sex: males N= 12 ; females N= 52	8.3 %	19 %	83 %	50 %	8.3 %	31 %
	Race: White N= 40 ; Non-white N= 25	white	non-white	white	non-white	white	non-white
	Race: White N= 40 ; Non-white N= 25	25 %	4 %	52 %	60 %	22 %	36 %

Results for category: BiocF21

		High Performers		Intermediate Performers		Low Performers
BiocF21	Total N= 53	34 %		25 %		42 %
	Sex: males N= 16 ; females N= 37	male	female	male	female	male	female
	Sex: males N= 16 ; females N= 37	44 %	30 %	19 %	27 %	38 %	43 %
	Race: White N= 29 ; Non-white N= 24	white	non-white	white	non-white	white	non-white
	Race: White N= 29 ; Non-white N= 24	34 %	33 %	31 %	17 %	34 %	50 %

Results for category: O Chem 1

		High Performers		Intermediate Performers		Low Performers
O Chem 1	Total N= 126	10 %		58 %		32 %
	Sex: males N= 22 ; females N= 103	male	female	male	female	male	female
	Sex: males N= 22 ; females N= 103	9.1 %	11 %	59 %	57 %	32 %	32 %
	Race: White N= 71 ; Non-white N= 55	white	non-white	white	non-white	white	non-white
	Race: White N= 71 ; Non-white N= 55	11 %	9.1 %	59 %	56 %	30 %	35 %

Results for category: Biochem 2

		High Performers		Intermediate Performers		Low Performers
Biochem 2	Total N= 26	46 %		38 %		15 %
	Sex: males N= 11 ; females N= 15	male	female	male	female	male	female
	Sex: males N= 11 ; females N= 15	18 %	67 %	64 %	20 %	18 %	13 %
	Race: White N= 19 ; Non-white N= 7	white	non-white	white	non-white	white	non-white
	Race: White N= 19 ; Non-white N= 7	42 %	57 %	42 %	29 %	16 %	14 %

Results for category: Gen Chem 2

		High Performers		Intermediate Performers		Low Performers
Gen Chem 2	Total N= 67	18 %		55 %		27 %
	Sex: males N= 20 ; females N= 47	male	female	male	female	male	female
	Sex: males N= 20 ; females N= 47	20 %	17 %	60 %	53 %	20 %	30 %
	Race: White N= 37 ; Non-white N= 30	white	non-white	white	non-white	white	non-white
	Race: White N= 37 ; Non-white N= 30	14 %	23 %	65 %	43 %	22 %	33 %

Results for category: BiocF22

		High Performers		Intermediate Performers		Low Performers
BiocF22	Total N= 37	8.1 %		59 %		32 %
	Sex: males N= 4 ; females N= 33	male	female	male	female	male	female
	Sex: males N= 4 ; females N= 33	0 %	9.1 %	75 %	58 %	25 %	33 %
	Race: White N= 24 ; Non-white N= 13	white	non-white	white	non-white	white	non-white
	Race: White N= 24 ; Non-white N= 13	8.3 %	7.7 %	67 %	46 %	25 %	46 %

5.3.2 Analysis by year

# All rows: actual_year crossed with clusterLetter.
# The report prints the chi-square p-value right after this call.
plotBarAndCorr(
  allBiochem,
  "actual_year", "clusterLetter",
  "Year", "N of students",
  "High, Intermediate, Low Performance cluster"
)

The Chi-square analysis gives a p= 0.00023

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# One integer colour code per distinct actual_year value.
markerIntegers <- as.integer(as.factor(allBiochem$actual_year))
# NS against PLC; pch is given the clusterLetter column, col the year codes.
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  main = "Oxygen Binding - High(H), Intermediate(I), Low(L) performers",
  xlab = "PLC",
  ylab = "NS"
)
# unique() keeps first-appearance order for both the labels and their codes,
# so the i-th legend label is paired with the i-th colour.
legend(
  "topleft",
  legend = unique(allBiochem$actual_year),
  col = unique(markerIntegers),
  lty = 1,
  cex = 0.8
)

# Per-year breakdown; the report prints, for each actual_year value, the share
# of High/Intermediate/Low performers split by sex and by race.
calcStats2(
  allBiochem,
  "actual_year"
)

Results for category: first_year

		High Performers		Intermediate Performers		Low Performers
first_year	Total N= 220	10 %		57 %		33 %
	Sex: males N= 29 ; females N= 188	male	female	male	female	male	female
	Sex: males N= 29 ; females N= 188	6.9 %	10 %	52 %	57 %	41 %	32 %
	Race: White N= 121 ; Non-white N= 99	white	non-white	white	non-white	white	non-white
	Race: White N= 121 ; Non-white N= 99	9.1 %	11 %	60 %	53 %	31 %	36 %

Results for category: second_year

		High Performers		Intermediate Performers		Low Performers
second_year	Total N= 132	17 %		55 %		27 %
	Sex: males N= 32 ; females N= 99	male	female	male	female	male	female
	Sex: males N= 32 ; females N= 99	16 %	18 %	69 %	52 %	16 %	30 %
	Race: White N= 77 ; Non-white N= 55	white	non-white	white	non-white	white	non-white
	Race: White N= 77 ; Non-white N= 55	19 %	15 %	58 %	51 %	22 %	35 %

Results for category: third_year

		High Performers		Intermediate Performers		Low Performers
third_year	Total N= 116	28 %		39 %		33 %
	Sex: males N= 31 ; females N= 85	male	female	male	female	male	female
	Sex: males N= 31 ; females N= 85	29 %	28 %	42 %	38 %	29 %	34 %
	Race: White N= 72 ; Non-white N= 44	white	non-white	white	non-white	white	non-white
	Race: White N= 72 ; Non-white N= 44	28 %	30 %	46 %	27 %	26 %	43 %

# Emit a bold sub-heading into the rendered report.
# The original ended with "</br>", which is not a valid tag: br is a void
# element and has no closing form, so the line break is written as "<br>".
cat("<b>Chi-square analysis of Performance by Sex and Race considering different years</b><br>")

Chi-square analysis of Performance by Sex and Race considering different years

# First-year rows only: Sex_birth crossed with clusterLetter.
# The report prints the chi-square p-value right after this call.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "first_year", ],
  "Sex_birth", "clusterLetter",
  "Sex", "N of students",
  "Performance by Sex 1st year"
)

The Chi-square analysis gives a p= 0.60394

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Second-year rows only: Sex_birth crossed with clusterLetter.
# The report prints the chi-square p-value right after this call.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "second_year", ],
  "Sex_birth", "clusterLetter",
  "Sex", "N of students",
  "Performance by Sex 2nd year"
)

The Chi-square analysis gives a p= 0.18927

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Third-year rows only: Sex_birth crossed with clusterLetter.
# The report prints the chi-square p-value right after this call.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "third_year", ],
  "Sex_birth", "clusterLetter",
  "Sex", "N of students",
  "Performance by Sex 3rd year"
)

The Chi-square analysis gives a p= 0.86412

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# First-year rows only: race_binary crossed with clusterLetter.
# The report prints the chi-square p-value right after this call.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "first_year", ],
  "race_binary", "clusterLetter",
  "Race", "N of students",
  "Performance by Race 1st year"
)

The Chi-square analysis gives a p= 0.50782

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Second-year rows only: race_binary crossed with clusterLetter.
# The report prints the chi-square p-value right after this call.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "second_year", ],
  "race_binary", "clusterLetter",
  "Race", "N of students",
  "Performance by Race 2nd year"
)

The Chi-square analysis gives a p= 0.27169

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Third-year rows only: race_binary crossed with clusterLetter.
# The report prints the chi-square p-value right after this call.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "third_year", ],
  "race_binary", "clusterLetter",
  "Race", "N of students",
  "Performance by Race 3rd year"
)

The Chi-square analysis gives a p= 0.09043

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

6 UMR courses: Protein Structure

6.1 PLC only: Anova

We test whether the PLC score differs significantly across the categories “Course collected”, “Student year”, “White/Non-white”, and “Sex at birth”.

# Protein Structure survey: build the analysis table from the UMR responses
# (umrs5), then pass it through addExperts() with the expert responses (exs5);
# the tables below show an "Expert" group after this step.
allBiochem <- analyzeUMRCourses(umrs5)
allBiochem <- addExperts(allBiochem, exs5)
# buildTables(allBiochem)  # call left disabled, as before
# PLC by course: the report prints per-course statistics and pairwise p-values.
plotAndTable(
  allBiochem,
  "Course_collected", "PLC",
  "PLC: Course", "PLC"
)

Statistics of PLC based on the category Course_collected
	group1	n	mean	sd	median	min	max	range
X11	Gen + Organic 1	89	0.16	0.06	0.15	0.00	0.35	0.35
X12	O Chem 1	109	0.14	0.15	0.13	-0.18	0.46	0.64
X13	O Chem 2	59	0.18	0.17	0.18	-0.18	0.62	0.80
X14	Gen Chem 2	61	0.16	0.15	0.17	-0.13	0.47	0.60
X15	BiocF21	51	0.26	0.15	0.29	-0.13	0.54	0.67
X16	BiocF22	35	0.27	0.13	0.27	0.01	0.51	0.51
X17	Biochem 2	23	0.26	0.19	0.22	-0.18	0.59	0.77
X18	Expert	7	0.76	0.10	0.79	0.59	0.89	0.30

Anova: PLC: Course
	Testing statistical significance: p-values
O Chem 1-Gen + Organic 1	0.9933172
O Chem 2-Gen + Organic 1	0.9601136
Gen Chem 2-Gen + Organic 1	1.0000000
BiocF21-Gen + Organic 1	0.0011686
BiocF22-Gen + Organic 1	0.0023723
Biochem 2-Gen + Organic 1	0.0547524
Expert-Gen + Organic 1	0.0000000
O Chem 2-O Chem 1	0.6013307
Gen Chem 2-O Chem 1	0.9898798
BiocF21-O Chem 1	0.0000313
BiocF22-O Chem 1	0.0001285
Biochem 2-O Chem 1	0.0094669
Expert-O Chem 1	0.0000000
Gen Chem 2-O Chem 2	0.9883586
BiocF21-O Chem 2	0.0893150
BiocF22-O Chem 2	0.0891965
Biochem 2-O Chem 2	0.3981718
Expert-O Chem 2	0.0000000
BiocF21-Gen Chem 2	0.0060084
BiocF22-Gen Chem 2	0.0083529
Biochem 2-Gen Chem 2	0.1004083
Expert-Gen Chem 2	0.0000000
BiocF22-BiocF21	0.9999921
Biochem 2-BiocF21	1.0000000
Expert-BiocF21	0.0000000
Biochem 2-BiocF22	0.9999882
Expert-BiocF22	0.0000000
Expert-Biochem 2	0.0000000

# PLC by actual_year: the report prints per-group statistics and pairwise p-values.
plotAndTable(
  allBiochem,
  "actual_year", "PLC",
  "PLC: Year", "PLC"
)

Statistics of PLC based on the category actual_year
	group1	n	mean	sd	median	min	max	range
X11	Expert	7	0.76	0.10	0.79	0.59	0.89	0.30
X12	first_year	198	0.15	0.12	0.14	-0.18	0.46	0.64
X13	second_year	120	0.17	0.16	0.18	-0.18	0.62	0.80
X14	third_year	109	0.26	0.15	0.27	-0.18	0.59	0.77

Anova: PLC: Year
	Testing statistical significance: p-values
first_year-Expert	0.0000000
second_year-Expert	0.0000000
third_year-Expert	0.0000000
second_year-first_year	0.4895237
third_year-first_year	0.0000000
third_year-second_year	0.0000105

# PLC by race_binary: the report prints per-group statistics and a single p-value.
plotAndTable(
  allBiochem,
  "race_binary", "PLC",
  "PLC: White/Non-white", "PLC"
)

Statistics of PLC based on the category race_binary
	group1	n	mean	sd	median	min	max	range
X11	Non-white	173	0.17	0.15	0.17	-0.18	0.46	0.64
X12	White	250	0.19	0.15	0.18	-0.17	0.62	0.79

Anova: PLC: White/Non-white
Testing statistical significance: p-values
0.057055

# PLC by Sex_birth: the report prints per-group statistics and a single p-value.
plotAndTable(
  allBiochem,
  "Sex_birth", "PLC",
  "PLC: Sex", "PLC"
)

Statistics of PLC based on the category Sex_birth
	group1	n	mean	sd	median	min	max	range
X11	Female	343	0.18	0.14	0.17	-0.18	0.62	0.80
X12	Male	80	0.18	0.17	0.16	-0.18	0.48	0.66

Anova: PLC: Sex
Testing statistical significance: p-values
0.7431887

6.2 NS only: Anova

# NS by Course_collected: the report prints per-course statistics and pairwise p-values.
plotAndTable(
  allBiochem,
  "Course_collected", "NS",
  "NS: Course", "NS"
)

Statistics of NS based on the category Course_collected
	group1	n	mean	sd	median	min	max	range
X11	Gen + Organic 1	89	0.18	0.16	0.20	-0.22	0.50	0.72
X12	O Chem 1	109	0.16	0.07	0.16	0.03	0.35	0.32
X13	O Chem 2	59	0.17	0.06	0.17	0.04	0.35	0.31
X14	Gen Chem 2	61	0.17	0.07	0.17	0.04	0.41	0.37
X15	BiocF21	51	0.17	0.07	0.17	0.04	0.33	0.29
X16	BiocF22	35	0.18	0.06	0.17	0.08	0.42	0.34
X17	Biochem 2	23	0.19	0.10	0.19	0.04	0.39	0.35
X18	Expert	7	0.35	0.08	0.35	0.24	0.44	0.21

Anova: NS: Course
	Testing statistical significance: p-values
O Chem 1-Gen + Organic 1	0.9097223
O Chem 2-Gen + Organic 1	0.9856908
Gen Chem 2-Gen + Organic 1	0.9989850
BiocF21-Gen + Organic 1	0.9995746
BiocF22-Gen + Organic 1	1.0000000
Biochem 2-Gen + Organic 1	0.9988883
Expert-Gen + Organic 1	0.0003529
O Chem 2-O Chem 1	0.9999996
Gen Chem 2-O Chem 1	0.9995544
BiocF21-O Chem 1	0.9994740
BiocF22-O Chem 1	0.9724458
Biochem 2-O Chem 1	0.8569256
Expert-O Chem 1	0.0000370
Gen Chem 2-O Chem 2	0.9999908
BiocF21-O Chem 2	0.9999839
BiocF22-O Chem 2	0.9939313
Biochem 2-O Chem 2	0.9355910
Expert-O Chem 2	0.0000924
BiocF21-Gen Chem 2	1.0000000
BiocF22-Gen Chem 2	0.9994642
Biochem 2-Gen Chem 2	0.9779878
Expert-Gen Chem 2	0.0001638
BiocF22-BiocF21	0.9997260
Biochem 2-BiocF21	0.9841815
Expert-BiocF21	0.0002177
Biochem 2-BiocF22	0.9997919
Expert-BiocF22	0.0010806
Expert-Biochem 2	0.0062939

# NS by actual_year: the report prints per-group statistics and pairwise p-values.
plotAndTable(
  allBiochem,
  "actual_year", "NS",
  "NS: Year", "NS"
)

Statistics of NS based on the category actual_year
	group1	n	mean	sd	median	min	max	range
X11	Expert	7	0.35	0.08	0.35	0.24	0.44	0.21
X12	first_year	198	0.17	0.12	0.17	-0.22	0.50	0.72
X13	second_year	120	0.17	0.07	0.17	0.04	0.41	0.37
X14	third_year	109	0.18	0.07	0.17	0.04	0.42	0.38

Anova: NS: Year
	Testing statistical significance: p-values
first_year-Expert	0.0000153
second_year-Expert	0.0000145
third_year-Expert	0.0000606
second_year-first_year	0.9965851
third_year-first_year	0.8674005
third_year-second_year	0.8145313

# NS by race_binary: the report prints per-group statistics and a single p-value.
plotAndTable(
  allBiochem,
  "race_binary", "NS",
  "NS: White/Non-white", "NS"
)

Statistics of NS based on the category race_binary
	group1	n	mean	sd	median	min	max	range
X11	Non-white	173	0.16	0.09	0.17	-0.11	0.50	0.61
X12	White	250	0.18	0.09	0.17	-0.22	0.46	0.68

Anova: NS: White/Non-white
Testing statistical significance: p-values
0.0386413

# NS by Sex_birth: the report prints per-group statistics and a single p-value.
plotAndTable(
  allBiochem,
  "Sex_birth", "NS",
  "NS: Sex", "NS"
)

Statistics of NS based on the category Sex_birth
	group1	n	mean	sd	median	min	max	range
X11	Female	343	0.17	0.10	0.17	-0.22	0.50	0.72
X12	Male	80	0.17	0.07	0.17	0.03	0.33	0.30

Anova: NS: Sex
Testing statistical significance: p-values
0.4421795

6.3 PLC/NS clustering

# PLC by clusterLetter: the report prints per-cluster statistics and pairwise p-values.
plotAndTable(
  allBiochem,
  "clusterLetter", "PLC",
  "PLC: Cluster letter", "PLC"
)

Statistics of PLC based on the category clusterLetter
	group1	n	mean	sd	median	min	max	range
X11	Expert	7	0.76	0.10	0.79	0.59	0.89	0.30
X12	HP	174	0.28	0.09	0.27	0.14	0.57	0.43
X13	IP	91	0.24	0.12	0.21	0.04	0.62	0.58
X14	LP	162	0.04	0.09	0.07	-0.18	0.21	0.40

Anova: PLC: Cluster letter
	Testing statistical significance: p-values
HP-Expert	0.0000000
IP-Expert	0.0000000
LP-Expert	0.0000000
IP-HP	0.0012838
LP-HP	0.0000000
LP-IP	0.0000000

6.3.1 Analysis by course

# All rows: Course_collected crossed with clusterLetter.
# The report prints the chi-square p-value right after this call.
plotBarAndCorr(
  allBiochem,
  "Course_collected", "clusterLetter",
  "Course", "N of students",
  "High, Intermediate, Low Performance cluster"
)

The Chi-square analysis gives a p < 0.00001 (printed as 0 after rounding to five decimals)

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# One integer colour code per distinct Course_collected value.
markerIntegers <- as.integer(as.factor(allBiochem$Course_collected))
# NS against PLC; pch is given the clusterLetter column, col the course codes.
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  main = "Protein Structure - High(H), Intermediate(I), Low(L) performers",
  xlab = "PLC",
  ylab = "NS"
)
# unique() keeps first-appearance order for both the labels and their codes,
# so the i-th legend label is paired with the i-th colour.
legend(
  "topleft",
  legend = unique(allBiochem$Course_collected),
  col = unique(markerIntegers),
  lty = 1,
  cex = 0.8
)

# Per-course breakdown; the report prints, for each Course_collected value, the
# share of High/Intermediate/Low performers split by sex and by race.
calcStats(
  allBiochem,
  "Course_collected"
)

Results for category: Gen + Organic 1

		High Performers		Intermediate Performers		Low Performers
Gen + Organic 1	Total N= 89	18 %		44 %		38 %
	Sex: males N= 2 ; females N= 85	male	female	male	female	male	female
	Sex: males N= 2 ; females N= 85	0 %	19 %	50 %	44 %	50 %	38 %
	Race: White N= 51 ; Non-white N= 38	white	non-white	white	non-white	white	non-white
	Race: White N= 51 ; Non-white N= 38	16 %	21 %	55 %	29 %	29 %	50 %

Results for category: O Chem 2

		High Performers		Intermediate Performers		Low Performers
O Chem 2	Total N= 59	47 %		10 %		42 %
	Sex: males N= 11 ; females N= 47	male	female	male	female	male	female
	Sex: males N= 11 ; females N= 47	45 %	47 %	9.1 %	11 %	45 %	43 %
	Race: White N= 35 ; Non-white N= 24	white	non-white	white	non-white	white	non-white
	Race: White N= 35 ; Non-white N= 24	54 %	38 %	8.6 %	12 %	37 %	50 %

Results for category: BiocF21

		High Performers		Intermediate Performers		Low Performers
BiocF21	Total N= 51	63 %		14 %		24 %
	Sex: males N= 15 ; females N= 36	male	female	male	female	male	female
	Sex: males N= 15 ; females N= 36	60 %	64 %	6.7 %	17 %	33 %	19 %
	Race: White N= 28 ; Non-white N= 23	white	non-white	white	non-white	white	non-white
	Race: White N= 28 ; Non-white N= 23	64 %	61 %	11 %	17 %	25 %	22 %

Results for category: O Chem 1

		High Performers		Intermediate Performers		Low Performers
O Chem 1	Total N= 109	36 %		15 %		50 %
	Sex: males N= 19 ; females N= 89	male	female	male	female	male	female
	Sex: males N= 19 ; females N= 89	26 %	38 %	16 %	15 %	58 %	47 %
	Race: White N= 63 ; Non-white N= 46	white	non-white	white	non-white	white	non-white
	Race: White N= 63 ; Non-white N= 46	37 %	35 %	16 %	13 %	48 %	52 %

Results for category: Gen Chem 2

		High Performers		Intermediate Performers		Low Performers
Gen Chem 2	Total N= 61	38 %		20 %		43 %
	Sex: males N= 19 ; females N= 42	male	female	male	female	male	female
	Sex: males N= 19 ; females N= 42	53 %	31 %	26 %	17 %	21 %	52 %
	Race: White N= 35 ; Non-white N= 26	white	non-white	white	non-white	white	non-white
	Race: White N= 35 ; Non-white N= 26	49 %	23 %	8.6 %	35 %	43 %	42 %

Results for category: Biochem 2

		High Performers		Intermediate Performers		Low Performers
Biochem 2	Total N= 23	43 %		35 %		22 %
	Sex: males N= 10 ; females N= 13	male	female	male	female	male	female
	Sex: males N= 10 ; females N= 13	30 %	54 %	30 %	38 %	40 %	7.7 %
	Race: White N= 16 ; Non-white N= 7	white	non-white	white	non-white	white	non-white
	Race: White N= 16 ; Non-white N= 7	44 %	43 %	44 %	14 %	12 %	43 %

Results for category: BiocF22

		High Performers		Intermediate Performers		Low Performers
BiocF22	Total N= 35	74 %		8.6 %		17 %
	Sex: males N= 4 ; females N= 31	male	female	male	female	male	female
	Sex: males N= 4 ; females N= 31	75 %	74 %	0 %	9.7 %	25 %	16 %
	Race: White N= 23 ; Non-white N= 12	white	non-white	white	non-white	white	non-white
	Race: White N= 23 ; Non-white N= 12	70 %	83 %	13 %	0 %	17 %	17 %

6.3.2 Analysis by year

# All rows: actual_year crossed with clusterLetter.
# The report prints the chi-square p-value right after this call.
plotBarAndCorr(
  allBiochem,
  "actual_year", "clusterLetter",
  "Year", "N of students",
  "High, Intermediate, Low Performance cluster"
)

The Chi-square analysis gives a p < 0.00001 (printed as 0 after rounding to five decimals)

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# One integer colour code per distinct actual_year value.
markerIntegers <- as.integer(as.factor(allBiochem$actual_year))
# NS against PLC; pch is given the clusterLetter column, col the year codes.
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  main = "Protein Structure - High(H), Intermediate(I), Low(L) performers",
  xlab = "PLC",
  ylab = "NS"
)
# unique() keeps first-appearance order for both the labels and their codes,
# so the i-th legend label is paired with the i-th colour.
legend(
  "topleft",
  legend = unique(allBiochem$actual_year),
  col = unique(markerIntegers),
  lty = 1,
  cex = 0.8
)

# Per-year breakdown; the report prints, for each actual_year value, the share
# of High/Intermediate/Low performers split by sex and by race.
calcStats2(
  allBiochem,
  "actual_year"
)

Results for category: first_year

		High Performers		Intermediate Performers		Low Performers
first_year	Total N= 198	28 %		28 %		44 %
	Sex: males N= 21 ; females N= 174	male	female	male	female	male	female
	Sex: males N= 21 ; females N= 174	24 %	29 %	19 %	29 %	57 %	43 %
	Race: White N= 114 ; Non-white N= 84	white	non-white	white	non-white	white	non-white
	Race: White N= 114 ; Non-white N= 84	27 %	29 %	33 %	20 %	39 %	51 %

Results for category: second_year

		High Performers		Intermediate Performers		Low Performers
second_year	Total N= 120	42 %		15 %		42 %
	Sex: males N= 30 ; females N= 89	male	female	male	female	male	female
	Sex: males N= 30 ; females N= 89	50 %	39 %	20 %	13 %	30 %	47 %
	Race: White N= 70 ; Non-white N= 50	white	non-white	white	non-white	white	non-white
	Race: White N= 70 ; Non-white N= 50	51 %	30 %	8.6 %	24 %	40 %	46 %

Results for category: third_year

		High Performers		Intermediate Performers		Low Performers
third_year	Total N= 109	62 %		17 %		21 %
	Sex: males N= 29 ; females N= 80	male	female	male	female	male	female
	Sex: males N= 29 ; females N= 80	52 %	66 %	14 %	18 %	34 %	16 %
	Race: White N= 67 ; Non-white N= 42	white	non-white	white	non-white	white	non-white
	Race: White N= 67 ; Non-white N= 42	61 %	64 %	19 %	12 %	19 %	24 %

# Emit a bold sub-heading into the rendered report.
# The original ended with "</br>", which is not a valid tag: br is a void
# element and has no closing form, so the line break is written as "<br>".
cat("<b>Chi-square analysis of Performance by Sex and Race considering different years</b><br>")

Chi-square analysis of Performance by Sex and Race considering different years

# First-year rows only: Sex_birth crossed with clusterLetter.
# The report prints the chi-square p-value right after this call.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "first_year", ],
  "Sex_birth", "clusterLetter",
  "Sex", "N of students",
  "Performance by Sex 1st year"
)

The Chi-square analysis gives a p= 0.42661

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Second-year rows only: Sex_birth crossed with clusterLetter.
# The report prints the chi-square p-value right after this call.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "second_year", ],
  "Sex_birth", "clusterLetter",
  "Sex", "N of students",
  "Performance by Sex 2nd year"
)

The Chi-square analysis gives a p= 0.24838

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Third-year rows only: Sex_birth crossed with clusterLetter.
# The report prints the chi-square p-value right after this call.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "third_year", ],
  "Sex_birth", "clusterLetter",
  "Sex", "N of students",
  "Performance by Sex 3rd year"
)

The Chi-square analysis gives a p= 0.11942

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# First-year rows only: race_binary crossed with clusterLetter.
# The report prints the chi-square p-value right after this call.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "first_year", ],
  "race_binary", "clusterLetter",
  "Race", "N of students",
  "Performance by Race 1st year"
)

The Chi-square analysis gives a p= 0.10473

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Second-year rows only: race_binary crossed with clusterLetter.
# The report prints the chi-square p-value right after this call.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "second_year", ],
  "race_binary", "clusterLetter",
  "Race", "N of students",
  "Performance by Race 2nd year"
)

The Chi-square analysis gives a p= 0.01807

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Third-year rows only: race_binary crossed with clusterLetter.
# The report prints the chi-square p-value right after this call.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "third_year", ],
  "race_binary", "clusterLetter",
  "Race", "N of students",
  "Performance by Race 3rd year"
)

The Chi-square analysis gives a p= 0.561

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

Clustering All UMR Courses 2021 - 2022

Xavier Prat-Resina

2023-02-20

1 Introduction

2 UMR courses: ES Chemical Equation

2.1 PLC only: Anova

2.2 NS only: Anova

2.3 PLC/NS clustering

2.3.1 Analysis by course

2.3.2 Analysis by year

3 UMR courses: ES Glucosidase

3.1 PLC only: Anova

3.2 NS only: Anova

3.3 PLC/NS clustering

3.3.1 Analysis by course

3.3.2 Analysis by year

4 UMR courses: Nucleic Acids

4.1 PLC only: Anova

4.2 NS only: Anova

4.3 PLC/NS clustering

4.3.1 Analysis by course

4.3.2 Analysis by year

5 UMR courses: Oxygen Binding

5.1 PLC only: Anova

5.2 NS only: Anova

5.3 PLC/NS clustering

5.3.1 Analysis by course

5.3.2 Analysis by year

6 UMR courses: Protein Structure

6.1 PLC only: Anova

6.2 NS only: Anova

6.3 PLC/NS clustering

6.3.1 Analysis by course

6.3.2 Analysis by year