# ---- Load the student survey responses ----
# NOTE(review): setwd() ties the script to one machine's folder layout; every
# relative CSV path below depends on it.
setwd("~/Research/02b Neural Network Research UMR/Data + Analysis/Clustering_Xavier")
umr <- read.csv("UMR_all_for_R_with_courses.csv", header = TRUE)

# "Biochem 1" was collected in two different fall terms; relabel it per term so
# the two cohorts can be told apart downstream.
umr$Course_collected[which(umr$Term_collected == "Fall2022")] <- gsub(
  "Biochem 1", "BiocF22",
  umr$Course_collected[which(umr$Term_collected == "Fall2022")]
)
umr$Course_collected[which(umr$Term_collected == "Fall2021")] <- gsub(
  "Biochem 1", "BiocF21",
  umr$Course_collected[which(umr$Term_collected == "Fall2021")]
)

# ---- One data frame per survey ----
umrs1 <- subset(umr, Survey == "ES_Chemical_Reaction")
umrs2 <- subset(umr, Survey == "ES_Glucosidase")
umrs3 <- subset(umr, Survey == "Nucleic_Acids")
umrs4 <- subset(umr, Survey == "Oxygen_Binding")
umrs5 <- subset(umr, Survey == "Protein_Structure")

# ---- Keep only students listed in the de-identifier table ----
# The table has one ID column per term (s21, f21, f22). Some of the filtered
# data frames end up with a row count that is not a multiple of 3, so the table
# and the main response file do not match perfectly. The known offender
# UMRBIOC3321F22ES045 is removed from the table by hand.
deid <- read.csv("deids_f22_f21_s21.csv", header = TRUE)
deid <- deid[!(deid$f22 == "UMRBIOC3321F22ES045"), ]

# A response is kept when its Deidentifier appears in any of the three columns.
umrs1 <- umrs1[umrs1$Deidentifier %in% c(as.character(deid$s21), as.character(deid$f21), as.character(deid$f22)), ]
umrs2 <- umrs2[umrs2$Deidentifier %in% c(as.character(deid$s21), as.character(deid$f21), as.character(deid$f22)), ]
umrs3 <- umrs3[umrs3$Deidentifier %in% c(as.character(deid$s21), as.character(deid$f21), as.character(deid$f22)), ]
umrs4 <- umrs4[umrs4$Deidentifier %in% c(as.character(deid$s21), as.character(deid$f21), as.character(deid$f22)), ]
umrs5 <- umrs5[umrs5$Deidentifier %in% c(as.character(deid$s21), as.character(deid$f21), as.character(deid$f22)), ]

# Quick sanity checks on the per-term counts (kept for interactive use):
# sum(umrs5$Term_collected == "Fall2021")
# sum(umrs5$Term_collected == "Fall2022")
# sum(umrs5$Term_collected == "Spring2021")

# ---- Expert responses, split by survey ----
# NOTE(review): the expert file labels survey 1 "ES_Chemical_Equation" while the
# student file uses "ES_Chemical_Reaction", and "Protein_Strcuture" looks
# misspelled. Both are kept as-is; confirm they match Experts_all_for_R.csv.
expert <- read.csv("Experts_all_for_R.csv", header = TRUE)
exs1 <- subset(expert, Survey == "ES_Chemical_Equation")
exs2 <- subset(expert, Survey == "ES_Glucosidase")
exs3 <- subset(expert, Survey == "Nucleic_Acids")
exs4 <- subset(expert, Survey == "Oxygen_Binding")
exs5 <- subset(expert, Survey == "Protein_Strcuture")

library(psych)

# Prepare one survey's student responses and assign performance clusters.
#
# Args:
#   umrs1: data frame of student responses for a single survey. Must contain
#     the columns Institution, Course_collected, Deidentifier, Sex_birth,
#     Race_ethnicity, Coherency, NS, actual_year and PLC.
#
# Returns:
#   A data frame with those nine columns (scores coerced to numeric) plus
#     race_binary   - "White" for "White/Caucasian", otherwise "Non-white"
#     cluster       - k-means cluster number (k = 3) on scaled PLC and NS
#     clusterLetter - "HP" for the cluster with the highest mean PLC, "LP" for
#                     the lowest, "IP" for the remaining one
#   Course_collected is returned as a factor with a fixed level order.
#
# Side effect: calls set.seed(42), which resets the global RNG state so the
# clustering is reproducible.
analyzeUMRCourses <- function(umrs1) {
  keep_cols <- c("Institution", "Course_collected", "Deidentifier", "Sex_birth",
                 "Race_ethnicity", "Coherency", "NS", "actual_year", "PLC")
  allBiochem <- umrs1[, keep_cols]
  allBiochem$Coherency <- as.numeric(allBiochem$Coherency)
  allBiochem$NS <- as.numeric(allBiochem$NS)
  allBiochem$PLC <- as.numeric(allBiochem$PLC)
  allBiochem$race_binary <- ifelse(allBiochem$Race_ethnicity == "White/Caucasian",
                                   "White", "Non-white")

  # kmeans() cannot handle missing values; fail with a readable message instead
  # of the cryptic error it raises on NA input.
  if (anyNA(allBiochem$PLC) || anyNA(allBiochem$NS)) {
    stop("PLC and NS must be non-missing numbers for k-means clustering",
         call. = FALSE)
  }

  # Cluster on the standardized (PLC, NS) pairs with a fixed seed.
  set.seed(42)
  df <- matrix(data = c(allBiochem$PLC, allBiochem$NS), ncol = 2)
  allBiochem$cluster <- kmeans(scale(df[, 1:2]), 3)$cluster

  # k-means numbers its clusters arbitrarily, so rank them by mean PLC to find
  # which number is the high, low and intermediate performer group.
  mean_plc <- tapply(allBiochem$PLC, allBiochem$cluster, mean)
  HPgroup <- names(mean_plc)[which.max(mean_plc)]
  LPgroup <- names(mean_plc)[which.min(mean_plc)]

  # Every cluster starts as "IP"; LP is written before HP so that HP wins in the
  # degenerate case where both point at the same cluster.
  letter_for_cluster <- rep("IP", length(mean_plc))
  names(letter_for_cluster) <- names(mean_plc)
  letter_for_cluster[LPgroup] <- "LP"
  letter_for_cluster[HPgroup] <- "HP"
  allBiochem$clusterLetter <- unname(
    letter_for_cluster[as.character(allBiochem$cluster)]
  )

  # Fixed level order for plots and tables; a course not listed here becomes NA.
  allBiochem$Course_collected <- factor(allBiochem$Course_collected, levels = c(
    "Gen + Organic 1", "O Chem 1", "O Chem 2", "Gen Chem 2",
    "BiocF21", "BiocF22", "Biochem 2"))
  allBiochem
}

# Print one descriptive-statistics table of PLC per grouping variable.
#
# Args:
#   allBiochem: data frame with a PLC column and the grouping columns
#     clusterLetter, Institution, actual_year, Course_collected, Sex_birth and
#     race_binary.
#
# Each table is produced by describeBy() and printed through knitr::kable();
# only columns 2 and 4 to 12 of the describeBy() matrix are shown.
buildTables <- function(allBiochem) {
  shown_cols <- c(2, 4, 5, 6, 7, 8, 9, 10, 11, 12)
  groupings <- list(
    list(column = "clusterLetter",    caption = "PLC by cluster group"),
    list(column = "Institution",      caption = "PLC by institution"),
    list(column = "actual_year",      caption = "PLC by Actual Year"),
    list(column = "Course_collected", caption = "PLC by course"),
    list(column = "Sex_birth",        caption = "PLC by Sex"),
    list(column = "race_binary",      caption = "PLC by Race")
  )
  for (grouping in groupings) {
    mata <- describeBy(allBiochem$PLC, allBiochem[[grouping$column]],
                       mat = TRUE, digits = 2)
    print(knitr::kable(mata[, shown_cols], caption = grouping$caption))
  }
}
# Shared implementation behind calcStats() and calcStats2().
#
# For every distinct value of the column named by `category_col` (skipping
# missing values and the "Expert" pseudo-category) this prints an HTML header
# and an HTML table with:
#   - the share of rows in each performance cluster (HP / IP / LP),
#   - the same shares within males and within females,
#   - the same shares within White and within Non-white.
# Percentages are rounded to two significant digits. A subgroup with zero rows
# yields NaN, exactly as a plain 0/0 division does.
#
# Args:
#   allBiochem:   data frame with the columns clusterLetter, Sex_birth and
#                 race_binary, plus the column named by `category_col`.
#   category_col: name of the column to split by.
.calcStatsFor <- function(allBiochem, category_col) {
  if (!category_col %in% names(allBiochem)) {
    stop("Column '", category_col, "' not found in the data", call. = FALSE)
  }
  category <- allBiochem[[category_col]]
  cluster <- allBiochem$clusterLetter
  sex <- allBiochem$Sex_birth
  race <- allBiochem$race_binary

  # Count TRUE entries; NA comparisons (missing demographics) count as "no".
  n_where <- function(mask) sum(mask, na.rm = TRUE)
  pct <- function(part, whole) signif(part / whole * 100, digits = 2)

  # "course" is used as a generic name for the current category value.
  for (course in unique(category)) {
    if (is.na(course) || course == "Expert") next
    header <- paste("<b>Results for category: ", course, "</b></br></br>")
    cat(header)

    in_course <- category == course
    umrTot <- n_where(in_course)
    umrHP <- n_where(in_course & cluster == "HP")
    umrIP <- n_where(in_course & cluster == "IP")
    umrLP <- n_where(in_course & cluster == "LP")

    is_male <- in_course & sex == "Male"
    umrMale <- n_where(is_male)
    umrHPmale <- n_where(is_male & cluster == "HP")
    umrIPmale <- n_where(is_male & cluster == "IP")
    umrLPmale <- n_where(is_male & cluster == "LP")

    is_female <- in_course & sex == "Female"
    umrFemale <- n_where(is_female)
    umrHPfemale <- n_where(is_female & cluster == "HP")
    umrIPfemale <- n_where(is_female & cluster == "IP")
    umrLPfemale <- n_where(is_female & cluster == "LP")

    is_white <- in_course & race == "White"
    umrWhite <- n_where(is_white)
    umrHPWhite <- n_where(is_white & cluster == "HP")
    umrIPWhite <- n_where(is_white & cluster == "IP")
    umrLPWhite <- n_where(is_white & cluster == "LP")

    is_nonwhite <- in_course & race == "Non-white"
    umrNonwhite <- n_where(is_nonwhite)
    umrHPNonwhite <- n_where(is_nonwhite & cluster == "HP")
    umrIPNonwhite <- n_where(is_nonwhite & cluster == "IP")
    umrLPNonwhite <- n_where(is_nonwhite & cluster == "LP")

    output <- paste("<table >
<thead>
<tr>
  <th colspan='2'></th>
  <th colspan='2'>High Performers</th>
  <th colspan='2'>Intermediate Performers</th>
  <th colspan='2'>Low Performers</th>
  
</tr>
</thead>
<tbody>
  <tr>
    <td rowspan='5'>",course," </td>
    <td>Total N=", umrTot,"</td>
    <td colspan='2'>", pct(umrHP, umrTot),"% </td>
    <td colspan='2'>", pct(umrIP, umrTot),"%</td>
    <td colspan='2'>", pct(umrLP, umrTot),"% </td>
  </tr>
  <tr>
    <td rowspan='2'>Sex: males N=",umrMale,"; females N=",umrFemale,"</td>
    <td>male</td>
    <td>female</td>
    <td>male</td>
    <td>female</td>
    <td>male</td>
    <td>female</td>
  </tr>
  <tr>
    <td>", pct(umrHPmale, umrMale),"%</td>
    <td>", pct(umrHPfemale, umrFemale),"%</td>
    <td>", pct(umrIPmale, umrMale),"%</td>
    <td>", pct(umrIPfemale, umrFemale),"%</td>
    <td>", pct(umrLPmale, umrMale),"%</td>
    <td>", pct(umrLPfemale, umrFemale),"%</td>
  </tr>
  <tr>
    <td rowspan='2'>Race: White N=",umrWhite,"; Non-white N=",umrNonwhite,"</td>
    <td>white</td>
    <td>non-white</td>
    <td>white</td>
    <td>non-white</td>
    <td>white</td>
    <td>non-white</td>
  </tr>
  <tr>
    <td>", pct(umrHPWhite, umrWhite),"%</td>
    <td>", pct(umrHPNonwhite, umrNonwhite),"%</td>
    <td>", pct(umrIPWhite, umrWhite),"%</td>
    <td>", pct(umrIPNonwhite, umrNonwhite),"%</td>
    <td>", pct(umrLPWhite, umrWhite),"%</td>
    <td>", pct(umrLPNonwhite, umrNonwhite),"%</td>
  </tr>
</tbody>
</table> ")
    cat(output)
  }
}

# Print per-category HTML tables of cluster membership by sex and race.
#
# Args:
#   allBiochem: data frame as returned by analyzeUMRCourses()/addExperts().
#   mycategory: name of the column to split by. Defaults to "Course_collected",
#     the column this function used before the argument was honoured.
calcStats <- function(allBiochem, mycategory = "Course_collected") {
  .calcStatsFor(allBiochem, mycategory)
}

# Same report as calcStats(), kept as a separate entry point for existing
# callers.
#
# Args:
#   allBiochem: data frame as returned by analyzeUMRCourses()/addExperts().
#   mycategory: name of the column to split by. Defaults to "actual_year",
#     the column this function used before the argument was honoured.
calcStats2 <- function(allBiochem, mycategory = "actual_year") {
  .calcStatsFor(allBiochem, mycategory)
}



library(ggplot2)
library(ggpubr)
library(psych)

# Box plot (with jittered points) of one numeric column split by one grouping
# column, annotated with significance tests.
#
# Args:
#   df:      data frame holding both columns.
#   myx:     name of the grouping column (x axis and colour).
#   myy:     name of the numeric column (y axis).
#   mytitle: plot title.
#   myylab:  y-axis label.
#
# Returns the plot object. Rows with a missing `myy` value are dropped first.
# The dashed horizontal line marks the overall mean; the global ANOVA p-value
# and the per-group t-test p-values (each group against all data) are placed
# just above the highest observation.
plotGGbox <- function(df, myx, myy, mytitle, myylab) {
  has_value <- complete.cases(df[[myy]])
  df <- df[has_value, ]
  values <- df[[myy]]
  maxy <- max(values)
  overall_mean <- mean(values)

  boxes <- ggboxplot(
    df,
    x = myx, y = myy,
    title = mytitle,
    color = myx, add = "jitter", legend = "none", ylab = myylab
  )

  boxes +
    rotate_x_text(angle = 45) +
    geom_hline(yintercept = overall_mean, linetype = 2) +
    stat_compare_means(method = "anova", label.y = maxy * 1.10) +
    coord_cartesian(ylim = c(0, maxy * 1.2)) +
    stat_compare_means(
      label = "p.format", size = 2.5, method = "t.test",
      ref.group = ".all.", label.y = maxy * 1.05
    )
}
# Print a table of Tukey HSD adjusted p-values for all pairwise group
# comparisons of one numeric column.
#
# Args:
#   df:      data frame holding both columns.
#   myx:     name of the grouping column.
#   myy:     name of the numeric response column.
#   mytitle: text appended to the table caption.
#   myylab:  unused; kept so the signature matches plotGGbox()/plotAndTable().
getAnova <- function(df, myx, myy, mytitle, myylab) {
  # Fit on a small frame with fixed column names so the TukeyHSD() result can
  # be addressed as `$group` instead of by a deparsed expression.
  model_data <- data.frame(response = df[[myy]], group = df[[myx]])
  tukey <- TukeyHSD(aov(response ~ group, data = model_data))

  # drop = FALSE keeps the comparison label as a row name even when there are
  # only two groups (a single comparison).
  pvals <- as.data.frame(tukey$group[, "p adj", drop = FALSE])
  colnames(pvals) <- c("Testing statistical significance: p-values")
  print(knitr::kable(pvals, caption = paste("Anova: ", mytitle)))
}
# Print the box plot, the descriptive-statistics table and the Tukey p-value
# table for one numeric column split by one grouping column.
#
# Args:
#   df:      data frame as returned by analyzeUMRCourses()/addExperts().
#   myx:     name of the grouping column.
#   myy:     name of the numeric column.
#   mytitle: title for the plot and the ANOVA caption.
#   myylab:  y-axis label, also used in the statistics caption.
plotAndTable <- function(df, myx, myy, mytitle, myylab) {
  # Demographic comparisons only make sense for students: drop the expert rows
  # and anyone who declined to state their sex.
  # NOTE(review): the sex filter is also applied when splitting by race_binary;
  # confirm that is intended.
  if (myx == "Sex_birth" || myx == "race_binary") {
    df <- df[!grepl("Expert", df$Course_collected, ignore.case = TRUE), ]
    df <- df[!grepl("Prefer not to answer", df$Sex_birth, ignore.case = TRUE), ]
  }
  print(plotGGbox(df, myx, myy, mytitle, myylab))

  # Drop empty factor levels, otherwise describeBy() reports NaN rows for them.
  groups <- df[[myx]]
  if (is.factor(groups)) {
    groups <- droplevels(groups)
  }
  stats_table <- describeBy(df[[myy]], groups, mat = TRUE, digits = 2)
  print(knitr::kable(
    stats_table[, c(2, 4, 5, 6, 7, 10, 11, 12)],
    caption = paste("Statistics of ", myylab, " based on the category", myx)
  ))
  getAnova(df, myx, myy, mytitle, myylab)
}
# Append the expert responses to a student data frame.
#
# Args:
#   alldf:   student data frame; must contain PLC, NS and Coherency columns.
#   experts: expert data frame with PLC, NS and Coherency columns.
#
# Returns:
#   `alldf` followed by one row per expert. In the expert rows every column
#   holds the string "Expert" except PLC, NS and Coherency, which carry the
#   expert's scores. `alldf` is returned unchanged when `experts` has no rows.
addExperts <- function(alldf, experts) {
  if (nrow(experts) == 0) {
    return(alldf)
  }
  # Build an all-"Expert" frame with the same columns as the student data ...
  ex_new <- as.data.frame(matrix(ncol = ncol(alldf), nrow = nrow(experts)))
  colnames(ex_new) <- colnames(alldf)
  ex_new[, seq_len(ncol(alldf))] <- "Expert"
  # ... then overwrite the three score columns with the real expert values.
  ex_new$PLC <- experts$PLC
  ex_new$NS <- experts$NS
  ex_new$Coherency <- experts$Coherency
  rbind(alldf, ex_new)
}

library(dplyr)
library(corrplot)
# Run a chi-square test on two categorical columns and plot its residuals.
#
# Args:
#   a: data frame holding exactly the two categorical columns to cross-tabulate.
#
# Prints the p-value (rounded to 5 decimals) and a short reading guide as HTML,
# then draws the residuals with corrplot().
plotChi <- function(a) {
  # droplevels() removes unused factor levels (e.g. "Expert" after the expert
  # rows were filtered out) so they do not show up as all-zero categories.
  contingency <- table(droplevels(a))
  chi_result <- chisq.test(contingency)

  cat(paste("<p><b>The Chi-square analysis gives a p=", round(chi_result$p.value, 5), "</b></p>"))
  cat("<p><b>Residuals analysis:</b></p>")
  cat("A negative residual implies that the measured value is lower than expected and a positive value higher than expected</br>")

  # Alternative kept for reference: plot each cell's percentage contribution
  #   contrib <- 100 * chi_result$residuals^2 / chi_result$statistic
  #   corrplot(contrib, is.cor = FALSE)
  corrplot(chi_result$residuals, is.cor = FALSE)
}
# Print a stacked bar chart of two categorical columns and run the chi-square
# analysis on them.
#
# Args:
#   df:       data frame as returned by addExperts(); its first column must
#             contain "Expert" in the expert rows.
#   myx:      name of the independent variable (course or demographic column).
#   myy:      name of the dependent variable (typically "clusterLetter").
#   myxlabel: x-axis label.
#   myylabel: y-axis label.
#   mytitle:  plot title.
plotBarAndCorr <- function(df, myx, myy, myxlabel, myylabel, mytitle) {
  # Experts are not part of the chi-square analysis; they are recognised by the
  # "Expert" marker in the first column.
  a <- df[!grepl("Expert", df[[1]]), ]
  if (myx == "Sex_birth") {
    a <- a[!grepl("Prefer not to answer", a$Sex_birth, ignore.case = TRUE), ]
  }
  # Keep only the two categorical variables being compared.
  a <- a[, c(myy, myx)]
  print(plotBarCategories(a, myx, myy, myxlabel, myylabel, mytitle))
  plotChi(a)
}
# Stacked bar chart of counts for two categorical columns, with each segment
# labelled by its percentage within its bar.
#
# Args:
#   a:        data frame holding both columns.
#   myx:      name of the column shown on the x axis (one bar per value).
#   myy:      name of the column used to fill/stack the bars.
#   myxlabel: x-axis label.
#   myylabel: y-axis label.
#   mytitle:  plot title.
#
# Returns the plot object.
plotBarCategories <- function(a, myx, myy, myxlabel, myylabel, mytitle) {
  # Column names arrive as strings, so turn them into symbols for tidy eval.
  x_col <- sym(myx)
  fill_col <- sym(myy)

  # One row per (fill, x) combination with its count n ...
  tallies <- count(a, !!fill_col, !!x_col)
  # ... and, within each x value, the share of that bar as a text label.
  tallies <- group_by(tallies, !!x_col)
  tallies <- mutate(tallies, lab = paste0(round(prop.table(n) * 100, 2), '%'))

  ggplot(tallies, aes(!!x_col, n, fill = !!fill_col)) +
    geom_col() +
    geom_text(aes(label = lab), position = 'stack', vjust = 1.5) +
    labs(x = myxlabel, y = myylabel, title = mytitle)
}

1 Introduction

What was learned from “clustering_indeces_v2” analysis about indicators

PLC seems to clearly distinguish experts from students (although with significant overlap)
NS does not distinguish as well as PLC, but the highest score and the median are still significantly different between experts and students
Coherency does not seem to distinguish well and we will discard it in the subsequent analysis

Meaning of PLC and NS:

* PLC: path length correlation. How connected two nodes are; the closer to one, the more connected. Compared to the experts.
* NS: neighborhood similarity. Compared to the experts.

Here we analyze only the subgroup of students who took the same survey during OChem1, OChem2, and Biochem. The sample is much smaller. Each survey is treated separately: the fact that “student1” took survey1 in those three semesters does not mean that student1 also took survey2. This is why the sample size is not exactly the same for each survey. For some surveys the analysis by sex is omitted because there are no males. Because the sample size is so small, be careful about drawing conclusions on some demographics for some courses/surveys.

The analysis by year is kept even though it should be identical to the analysis by course because OChem1 is taken by first years, OChem2 by second, and Biochem by third years.

2 UMR courses: ES Chemical Equation

2.1 PLC only: Anova

We are comparing how the PLC score is significantly different among the different categories “Course collected”, “Student year”, “White/Non-white”, and “Sex at birth”

#
# Survey 1: cluster the student responses (umrs1), then append the expert rows
# (exs1) so experts appear as their own "Expert" category in the plots below.
allBiochem = analyzeUMRCourses(umrs1)
allBiochem = addExperts(allBiochem,exs1)
# Optional descriptive tables of PLC per grouping (disabled):
#buildTables(allBiochem)
# Box plot, statistics table and Tukey p-values of PLC by course.
plotAndTable(allBiochem,"Course_collected","PLC","PLC: Course","PLC")

Statistics of PLC based on the category Course_collected
	group1	n	mean	sd	median	min	max	range
X11	O Chem 1	24	0.27	0.17	0.30	-0.16	0.55	0.71
X12	O Chem 2	24	0.26	0.18	0.26	-0.40	0.52	0.92
X13	BiocF22	24	0.44	0.13	0.43	0.23	0.72	0.49
X14	Expert	6	0.67	0.12	0.69	0.49	0.82	0.33

Anova: PLC: Course
	Testing statistical significance: p-values
O Chem 2-O Chem 1	0.9954125
BiocF22-O Chem 1	0.0021329
Expert-O Chem 1	0.0000029
BiocF22-O Chem 2	0.0009780
Expert-O Chem 2	0.0000016
Expert-BiocF22	0.0120205

#df=allBiochem
#myx="Course_collected"
#myy="PLC"
#mytitle="PLC:Whatever"
#myylab="PLC"
#plotAndTable = function(df,myx,myy,mytitle,myylab){
#  if (myx=="Sex_birth" | myx=="race_binary"){
#    df = df[!grepl("(?i)Expert", df$Course_collected),]
#    df = df[!grepl("(?)Prefer not to answer",df$Sex_birth),]
#  }
#  print(plotGGbox(df,myx,myy,mytitle,myylab))
#  table = describeBy(df[[myy]],df[[myx]],mat=TRUE,digits = 2)
#  table = describeBy(df[[myy]],df[[myx]],digits = 2)
#  print(knitr::kable(table2[,c(2,4,5,6,7,10,11,12)],caption=paste("Statistics of ",myylab," based on the category",myx)))
#  getAnova(df,myx,myy,mytitle,myylab)
#}
plotAndTable(allBiochem,"actual_year","PLC","PLC: Year","PLC")

Statistics of PLC based on the category actual_year
	group1	n	mean	sd	median	min	max	range
X11	Expert	6	0.67	0.12	0.69	0.49	0.82	0.33
X12	first_year	24	0.27	0.17	0.30	-0.16	0.55	0.71
X13	second_year	24	0.26	0.18	0.26	-0.40	0.52	0.92
X14	third_year	24	0.44	0.13	0.43	0.23	0.72	0.49

Anova: PLC: Year
	Testing statistical significance: p-values
first_year-Expert	0.0000029
second_year-Expert	0.0000016
third_year-Expert	0.0120205
second_year-first_year	0.9954125
third_year-first_year	0.0021329
third_year-second_year	0.0009780

plotAndTable(allBiochem,"race_binary","PLC","PLC: White/Non-white","PLC")

Statistics of PLC based on the category race_binary
	group1	n	mean	sd	median	min	max	range
X11	Non-white	25	0.30	0.22	0.30	-0.40	0.72	1.12
X12	White	46	0.34	0.15	0.36	-0.16	0.62	0.78

Anova: PLC: White/Non-white
Testing statistical significance: p-values
0.387264

plotAndTable(allBiochem,"Sex_birth","PLC","PLC: Sex","PLC")

Statistics of PLC based on the category Sex_birth
	group1	n	mean	sd	median	min	max	range
X11	Female	64	0.33	0.17	0.36	-0.40	0.62	1.02
X12	Male	7	0.26	0.26	0.27	-0.16	0.72	0.88

Anova: PLC: Sex
Testing statistical significance: p-values
0.3200775

2.2 NS only: Anova

plotAndTable(allBiochem,"Course_collected","NS","NS: Course","NS")

Statistics of NS based on the category Course_collected
	group1	n	mean	sd	median	min	max	range
X11	O Chem 1	24	0.22	0.10	0.20	0.04	0.44	0.41
X12	O Chem 2	24	0.23	0.09	0.22	0.08	0.41	0.33
X13	BiocF22	24	0.24	0.06	0.24	0.15	0.35	0.20
X14	Expert	6	0.37	0.11	0.34	0.28	0.57	0.29

Anova: NS: Course
	Testing statistical significance: p-values
O Chem 2-O Chem 1	0.9562525
BiocF22-O Chem 1	0.9084114
Expert-O Chem 1	0.0009971
BiocF22-O Chem 2	0.9985778
Expert-O Chem 2	0.0028907
Expert-BiocF22	0.0039688

plotAndTable(allBiochem,"actual_year","NS","NS: Year","NS")

Statistics of NS based on the category actual_year
	group1	n	mean	sd	median	min	max	range
X11	Expert	6	0.37	0.11	0.34	0.28	0.57	0.29
X12	first_year	24	0.22	0.10	0.20	0.04	0.44	0.41
X13	second_year	24	0.23	0.09	0.22	0.08	0.41	0.33
X14	third_year	24	0.24	0.06	0.24	0.15	0.35	0.20

Anova: NS: Year
	Testing statistical significance: p-values
first_year-Expert	0.0009971
second_year-Expert	0.0028907
third_year-Expert	0.0039688
second_year-first_year	0.9562525
third_year-first_year	0.9084114
third_year-second_year	0.9985778

plotAndTable(allBiochem,"race_binary","NS","NS: White/Non-white","NS")

Statistics of NS based on the category race_binary
	group1	n	mean	sd	median	min	max	range
X11	Non-white	25	0.21	0.08	0.19	0.08	0.39	0.31
X12	White	46	0.24	0.09	0.22	0.04	0.44	0.41

Anova: NS: White/Non-white
Testing statistical significance: p-values
0.2074017

plotAndTable(allBiochem,"Sex_birth","NS","NS: Sex","NS")

Statistics of NS based on the category Sex_birth
	group1	n	mean	sd	median	min	max	range
X11	Female	64	0.24	0.08	0.22	0.08	0.44	0.37
X12	Male	7	0.18	0.07	0.18	0.04	0.26	0.23

Anova: NS: Sex
Testing statistical significance: p-values
0.0713704

2.3 PLC/NS clustering

The problem with clustering is that it is an iterative method, and different “initial seeds” will yield different results. It is only reproducible because the k-means step uses “set.seed(42)”.

plotAndTable(allBiochem,"clusterLetter","PLC","PLC: Cluster letter","PLC")

Statistics of PLC based on the category clusterLetter
	group1	n	mean	sd	median	min	max	range
X11	Expert	6	0.67	0.12	0.69	0.49	0.82	0.33
X12	HP	23	0.45	0.11	0.44	0.19	0.62	0.43
X13	IP	28	0.38	0.09	0.36	0.27	0.72	0.45
X14	LP	21	0.12	0.15	0.19	-0.40	0.24	0.64

Anova: PLC: Cluster letter
	Testing statistical significance: p-values
HP-Expert	0.0006232
IP-Expert	0.0000041
LP-Expert	0.0000000
IP-HP	0.1913762
LP-HP	0.0000000
LP-IP	0.0000000

Are cluster groups unevenly distributed among these categories? A chi-square analysis will give us the probability that all three cluster groups (HP,IP,LP) contain statistically similar proportions of this category (course, year, sex, race…)

2.3.1 Analysis by course

plotBarAndCorr(allBiochem,"Course_collected","clusterLetter","Course","N of students","High, Intermediate, Low Performance cluster")

The Chi-square analysis gives a p= 0.0879

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Scatter of NS against PLC. Each point is drawn as the first letter of its
# cluster label (plot() uses only the first character of a string given as
# pch) and coloured by the integer code of its course.
markerIntegers <- as.integer(as.factor(allBiochem$Course_collected))
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  main = "ES_Chemical_Reaction - High(H), Intermediate(I), Low(L) performers",
  xlab = "PLC",
  ylab = "NS"
)
# unique() returns first-appearance order for both vectors, so each course name
# lines up with its own colour code.
legend(
  "topleft",
  legend = unique(allBiochem$Course_collected),
  col = unique(markerIntegers),
  lty = 1:1,
  cex = 0.8
)

calcStats(allBiochem,"Course_collected")

Results for category: O Chem 2

		High Performers		Intermediate Performers		Low Performers
O Chem 2	Total N= 24	25 %		33 %		42 %
	Sex: males N= 2 ; females N= 21	male	female	male	female	male	female
	Sex: males N= 2 ; females N= 21	0 %	29 %	50 %	33 %	50 %	38 %
	Race: White N= 16 ; Non-white N= 8	white	non-white	white	non-white	white	non-white
	Race: White N= 16 ; Non-white N= 8	25 %	25 %	38 %	25 %	38 %	50 %

Results for category: O Chem 1

		High Performers		Intermediate Performers		Low Performers
O Chem 1	Total N= 24	25 %		38 %		38 %
	Sex: males N= 2 ; females N= 22	male	female	male	female	male	female
	Sex: males N= 2 ; females N= 22	0 %	27 %	0 %	41 %	100 %	32 %
	Race: White N= 16 ; Non-white N= 8	white	non-white	white	non-white	white	non-white
	Race: White N= 16 ; Non-white N= 8	31 %	12 %	38 %	38 %	31 %	50 %

Results for category: BiocF22

		High Performers		Intermediate Performers		Low Performers
BiocF22	Total N= 24	46 %		46 %		8.3 %
	Sex: males N= 3 ; females N= 21	male	female	male	female	male	female
	Sex: males N= 3 ; females N= 21	0 %	52 %	100 %	38 %	0 %	9.5 %
	Race: White N= 14 ; Non-white N= 10	white	non-white	white	non-white	white	non-white
	Race: White N= 14 ; Non-white N= 10	57 %	30 %	43 %	50 %	0 %	20 %

2.3.2 Analysis by year

plotBarAndCorr(allBiochem,"actual_year","clusterLetter","Year","N of students","High, Intermediate, Low Performance cluster")

The Chi-square analysis gives a p= 0.0879

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Scatter of NS against PLC. Each point is drawn as the first letter of its
# cluster label (plot() uses only the first character of a string given as
# pch) and coloured by the integer code of its student year.
markerIntegers <- as.integer(as.factor(allBiochem$actual_year))
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  main = "ES_Chemical_Reaction - High(H), Intermediate(I), Low(L) performers",
  xlab = "PLC",
  ylab = "NS"
)
# unique() returns first-appearance order for both vectors, so each year label
# lines up with its own colour code.
legend(
  "topleft",
  legend = unique(allBiochem$actual_year),
  col = unique(markerIntegers),
  lty = 1:1,
  cex = 0.8
)

calcStats2(allBiochem,"actual_year")

Results for category: second_year

		High Performers		Intermediate Performers		Low Performers
second_year	Total N= 24	25 %		33 %		42 %
	Sex: males N= 2 ; females N= 21	male	female	male	female	male	female
	Sex: males N= 2 ; females N= 21	0 %	29 %	50 %	33 %	50 %	38 %
	Race: White N= 16 ; Non-white N= 8	white	non-white	white	non-white	white	non-white
	Race: White N= 16 ; Non-white N= 8	25 %	25 %	38 %	25 %	38 %	50 %

Results for category: first_year

		High Performers		Intermediate Performers		Low Performers
first_year	Total N= 24	25 %		38 %		38 %
	Sex: males N= 2 ; females N= 22	male	female	male	female	male	female
	Sex: males N= 2 ; females N= 22	0 %	27 %	0 %	41 %	100 %	32 %
	Race: White N= 16 ; Non-white N= 8	white	non-white	white	non-white	white	non-white
	Race: White N= 16 ; Non-white N= 8	31 %	12 %	38 %	38 %	31 %	50 %

Results for category: third_year

		High Performers		Intermediate Performers		Low Performers
third_year	Total N= 24	46 %		46 %		8.3 %
	Sex: males N= 3 ; females N= 21	male	female	male	female	male	female
	Sex: males N= 3 ; females N= 21	0 %	52 %	100 %	38 %	0 %	9.5 %
	Race: White N= 14 ; Non-white N= 10	white	non-white	white	non-white	white	non-white
	Race: White N= 14 ; Non-white N= 10	57 %	30 %	43 %	50 %	0 %	20 %

cat("<b>Chi-square analysis of Performance by Sex and Race considering different years</b></br>")

Chi-square analysis of Performance by Sex and Race considering different years

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="first_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 1st year")

The Chi-square analysis gives a p= 0.16232

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="second_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 2nd year")

The Chi-square analysis gives a p= 0.67591

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="third_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 3rd year")

The Chi-square analysis gives a p= 0.13187

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="first_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 1st year")

The Chi-square analysis gives a p= 0.53526

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="second_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 2nd year")

The Chi-square analysis gives a p= 0.79852

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="third_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 3rd year")

The Chi-square analysis gives a p= 0.14937

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

3 UMR courses: ES Glucosidase

3.1 PLC only: Anova

We are comparing how the PLC score is significantly different among the different categories “Course collected”, “Student year”, “White/Non-white”, and “Sex at birth”

#
# Survey 2: cluster the student responses (umrs2), then append the expert rows
# (exs2) so experts appear as their own "Expert" category in the plots below.
allBiochem = analyzeUMRCourses(umrs2)
allBiochem = addExperts(allBiochem,exs2)
# Optional descriptive tables of PLC per grouping (disabled):
#buildTables(allBiochem)
# Box plot, statistics table and Tukey p-values of PLC by course.
plotAndTable(allBiochem,"Course_collected","PLC","PLC: Course","PLC")

Statistics of PLC based on the category Course_collected
	group1	n	mean	sd	median	min	max	range
X11	O Chem 1	24	0.31	0.14	0.30	0.06	0.52	0.46
X12	O Chem 2	24	0.29	0.17	0.33	-0.25	0.52	0.78
X13	BiocF22	24	0.45	0.11	0.46	0.24	0.65	0.41
X14	Expert	8	0.72	0.09	0.70	0.59	0.82	0.23

Anova: PLC: Course
	Testing statistical significance: p-values
O Chem 2-O Chem 1	0.9571573
BiocF22-O Chem 1	0.0043942
Expert-O Chem 1	0.0000000
BiocF22-O Chem 2	0.0008321
Expert-O Chem 2	0.0000000
Expert-BiocF22	0.0000461

plotAndTable(allBiochem,"actual_year","PLC","PLC: Year","PLC")

Statistics of PLC based on the category actual_year
	group1	n	mean	sd	median	min	max	range
X11	Expert	8	0.72	0.09	0.70	0.59	0.82	0.23
X12	first_year	24	0.31	0.14	0.30	0.06	0.52	0.46
X13	second_year	24	0.29	0.17	0.33	-0.25	0.52	0.78
X14	third_year	24	0.45	0.11	0.46	0.24	0.65	0.41

Anova: PLC: Year
	Testing statistical significance: p-values
first_year-Expert	0.0000000
second_year-Expert	0.0000000
third_year-Expert	0.0000461
second_year-first_year	0.9571573
third_year-first_year	0.0043942
third_year-second_year	0.0008321

plotAndTable(allBiochem,"race_binary","PLC","PLC: White/Non-white","PLC")

Statistics of PLC based on the category race_binary
	group1	n	mean	sd	median	min	max	range
X11	Non-white	25	0.32	0.20	0.32	-0.25	0.65	0.90
X12	White	46	0.36	0.14	0.38	0.04	0.57	0.53

Anova: PLC: White/Non-white
Testing statistical significance: p-values
0.3480736

plotAndTable(allBiochem,"Sex_birth","PLC","PLC: Sex","PLC")

Statistics of PLC based on the category Sex_birth
	group1	n	mean	sd	median	min	max	range
X11	Female	64	0.35	0.16	0.38	-0.25	0.60	0.85
X12	Male	7	0.30	0.18	0.30	0.09	0.65	0.56

Anova: PLC: Sex
Testing statistical significance: p-values
0.400078

3.2 NS only: Anova

plotAndTable(allBiochem,"Course_collected","NS","NS: Course","NS")

Statistics of NS based on the category Course_collected
	group1	n	mean	sd	median	min	max	range
X11	O Chem 1	24	0.25	0.08	0.26	0.04	0.39	0.35
X12	O Chem 2	24	0.24	0.08	0.24	0.11	0.39	0.28
X13	BiocF22	24	0.26	0.07	0.25	0.10	0.41	0.31
X14	Expert	8	0.40	0.06	0.42	0.29	0.47	0.17

Anova: NS: Course
	Testing statistical significance: p-values
O Chem 2-O Chem 1	0.9667165
BiocF22-O Chem 1	0.9745936
Expert-O Chem 1	0.0000346
BiocF22-O Chem 2	0.8123887
Expert-O Chem 2	0.0000097
Expert-BiocF22	0.0001065

plotAndTable(allBiochem,"actual_year","NS","NS: Year","NS")

Statistics of NS based on the category actual_year
	group1	n	mean	sd	median	min	max	range
X11	Expert	8	0.40	0.06	0.42	0.29	0.47	0.17
X12	first_year	24	0.25	0.08	0.26	0.04	0.39	0.35
X13	second_year	24	0.24	0.08	0.24	0.11	0.39	0.28
X14	third_year	24	0.26	0.07	0.25	0.10	0.41	0.31

Anova: NS: Year
	Testing statistical significance: p-values
first_year-Expert	0.0000346
second_year-Expert	0.0000097
third_year-Expert	0.0001065
second_year-first_year	0.9667165
third_year-first_year	0.9745936
third_year-second_year	0.8123887

plotAndTable(allBiochem,"race_binary","NS","NS: White/Non-white","NS")

Statistics of NS based on the category race_binary
	group1	n	mean	sd	median	min	max	range
X11	Non-white	25	0.24	0.08	0.25	0.10	0.39	0.29
X12	White	46	0.25	0.08	0.25	0.04	0.41	0.37

Anova: NS: White/Non-white
Testing statistical significance: p-values
0.6249095

plotAndTable(allBiochem,"Sex_birth","NS","NS: Sex","NS")

Statistics of NS based on the category Sex_birth
	group1	n	mean	sd	median	min	max	range
X11	Female	64	0.25	0.08	0.25	0.10	0.41	0.31
X12	Male	7	0.21	0.08	0.24	0.04	0.28	0.23

Anova: NS: Sex
Testing statistical significance: p-values
0.1949647

3.3 PLC/NS clustering

plotAndTable(allBiochem,"clusterLetter","PLC","PLC: Cluster letter","PLC")

Statistics of PLC based on the category clusterLetter
	group1	n	mean	sd	median	min	max	range
X11	Expert	8	0.72	0.09	0.70	0.59	0.82	0.23
X12	HP	28	0.47	0.09	0.48	0.26	0.65	0.39
X13	IP	34	0.32	0.10	0.34	0.06	0.50	0.43
X14	LP	10	0.11	0.15	0.14	-0.25	0.27	0.52

Anova: PLC: Cluster letter
	Testing statistical significance: p-values
HP-Expert	5.0e-07
IP-Expert	0.0e+00
LP-Expert	0.0e+00
IP-HP	2.4e-06
LP-HP	0.0e+00
LP-IP	4.8e-06

3.3.1 Analysis by course

plotBarAndCorr(allBiochem,"Course_collected","clusterLetter","Course","N of students","High, Intermediate, Low Performance cluster")

The Chi-square analysis gives a p= 0.28225

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# One integer colour code per row, taken from the factor level codes of
# Course_collected.
markerIntegers = as.integer(as.factor(allBiochem$Course_collected))

# NS against PLC, one point per row, drawn with its cluster label as the
# plotting symbol and coloured by course.
# NOTE(review): the title's H/I/L key presumes clusterLetter is a character
# column whose first letter is what gets drawn - confirm.
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  xlab = "PLC",
  ylab = "NS",
  main = "ES Glucosidase - High(H), Intermediate(I), Low(L) performers"
)

# Labels and colours are both reduced with unique() in row order, so the i-th
# label is paired with the i-th colour.
legend(
  x = "topleft",
  legend = unique(allBiochem$Course_collected),
  col = unique(markerIntegers),
  lty = 1,
  cex = 0.8
)

calcStats(allBiochem,"Course_collected")

Results for category: O Chem 2

		High Performers		Intermediate Performers		Low Performers
O Chem 2	Total N= 24	29 %		50 %		21 %
	Sex: males N= 2 ; females N= 21	male	female	male	female	male	female
	Sex: males N= 2 ; females N= 21	0 %	33 %	100 %	43 %	0 %	24 %
	Race: White N= 16 ; Non-white N= 8	white	non-white	white	non-white	white	non-white
	Race: White N= 16 ; Non-white N= 8	31 %	25 %	56 %	38 %	12 %	38 %

Results for category: O Chem 1

		High Performers		Intermediate Performers		Low Performers
O Chem 1	Total N= 24	33 %		50 %		17 %
	Sex: males N= 2 ; females N= 22	male	female	male	female	male	female
	Sex: males N= 2 ; females N= 22	0 %	36 %	50 %	50 %	50 %	14 %
	Race: White N= 16 ; Non-white N= 8	white	non-white	white	non-white	white	non-white
	Race: White N= 16 ; Non-white N= 8	38 %	25 %	38 %	75 %	25 %	0 %

Results for category: BiocF22

		High Performers		Intermediate Performers		Low Performers
BiocF22	Total N= 24	54 %		42 %		4.2 %
	Sex: males N= 3 ; females N= 21	male	female	male	female	male	female
	Sex: males N= 3 ; females N= 21	33 %	57 %	67 %	38 %	0 %	4.8 %
	Race: White N= 14 ; Non-white N= 10	white	non-white	white	non-white	white	non-white
	Race: White N= 14 ; Non-white N= 10	64 %	40 %	36 %	50 %	0 %	10 %

3.3.2 Analysis by year

plotBarAndCorr(allBiochem,"actual_year","clusterLetter","Year","N of students","High, Intermediate, Low Performance cluster")

The Chi-square analysis gives a p= 0.28225

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# One integer colour code per row, taken from the factor level codes of
# actual_year.
markerIntegers = as.integer(as.factor(allBiochem$actual_year))

# NS against PLC, one point per row, drawn with its cluster label as the
# plotting symbol and coloured by year.
# NOTE(review): the title's H/I/L key presumes clusterLetter is a character
# column whose first letter is what gets drawn - confirm.
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  xlab = "PLC",
  ylab = "NS",
  main = "ES Glucosidase - High(H), Intermediate(I), Low(L) performers"
)

# Labels and colours are both reduced with unique() in row order, so the i-th
# label is paired with the i-th colour.
legend(
  x = "topleft",
  legend = unique(allBiochem$actual_year),
  col = unique(markerIntegers),
  lty = 1,
  cex = 0.8
)

calcStats2(allBiochem,"actual_year")

Results for category: second_year

		High Performers		Intermediate Performers		Low Performers
second_year	Total N= 24	29 %		50 %		21 %
	Sex: males N= 2 ; females N= 21	male	female	male	female	male	female
	Sex: males N= 2 ; females N= 21	0 %	33 %	100 %	43 %	0 %	24 %
	Race: White N= 16 ; Non-white N= 8	white	non-white	white	non-white	white	non-white
	Race: White N= 16 ; Non-white N= 8	31 %	25 %	56 %	38 %	12 %	38 %

Results for category: first_year

		High Performers		Intermediate Performers		Low Performers
first_year	Total N= 24	33 %		50 %		17 %
	Sex: males N= 2 ; females N= 22	male	female	male	female	male	female
	Sex: males N= 2 ; females N= 22	0 %	36 %	50 %	50 %	50 %	14 %
	Race: White N= 16 ; Non-white N= 8	white	non-white	white	non-white	white	non-white
	Race: White N= 16 ; Non-white N= 8	38 %	25 %	38 %	75 %	25 %	0 %

Results for category: third_year

		High Performers		Intermediate Performers		Low Performers
third_year	Total N= 24	54 %		42 %		4.2 %
	Sex: males N= 3 ; females N= 21	male	female	male	female	male	female
	Sex: males N= 3 ; females N= 21	33 %	57 %	67 %	38 %	0 %	4.8 %
	Race: White N= 14 ; Non-white N= 10	white	non-white	white	non-white	white	non-white
	Race: White N= 14 ; Non-white N= 10	64 %	40 %	36 %	50 %	0 %	10 %

# Emit the sub-heading as raw HTML. The line break is written as <br/>:
# br is a void element, so the original "</br>" closing tag was invalid markup.
cat("<b>Chi-square analysis of Performance by Sex and Race considering different years</b><br/>")

Chi-square analysis of Performance by Sex and Race considering different years

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="first_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 1st year")

The Chi-square analysis gives a p= 0.33591

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="second_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 2nd year")

The Chi-square analysis gives a p= 0.30276

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="third_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 3rd year")

The Chi-square analysis gives a p= 0.62755

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="first_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 1st year")

The Chi-square analysis gives a p= 0.15335

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="second_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 2nd year")

The Chi-square analysis gives a p= 0.35944

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="third_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 3rd year")

The Chi-square analysis gives a p= 0.31335

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

4 UMR courses: Nucleic Acids

4.1 PLC only: Anova

We test whether the PLC score differs significantly across the categories “Course collected”, “Student year”, “White/Non-white”, and “Sex at birth”.

# Section 4 setup: rebuild allBiochem for the Nucleic Acids survey.
# umrs3 holds the rows of umr with Survey == "Nucleic_Acids", restricted to
# the students listed in deid (see the top of the file).
allBiochem = analyzeUMRCourses(umrs3)
# Append the expert responses so they show up as the "Expert" group in the
# tables below. NOTE(review): exs3 is built outside this section - presumably
# the experts' answers to the same survey; confirm.
allBiochem = addExperts(allBiochem,exs3)
#buildTables(allBiochem)
# PLC grouped by course: produces the summary statistics and the pairwise
# p-values reported right after this chunk.
plotAndTable(allBiochem,"Course_collected","PLC","PLC: Course","PLC")

Statistics of PLC based on the category Course_collected
	group1	n	mean	sd	median	min	max	range
X11	O Chem 1	21	0.12	0.14	0.12	-0.21	0.29	0.50
X12	O Chem 2	21	0.12	0.10	0.13	-0.09	0.28	0.36
X13	BiocF22	21	0.18	0.12	0.17	-0.07	0.36	0.43
X14	Expert	7	0.71	0.08	0.69	0.60	0.82	0.22

Anova: PLC: Course
	Testing statistical significance: p-values
O Chem 2-O Chem 1	0.9964900
BiocF22-O Chem 1	0.3491960
Expert-O Chem 1	0.0000000
BiocF22-O Chem 2	0.4716465
Expert-O Chem 2	0.0000000
Expert-BiocF22	0.0000000

plotAndTable(allBiochem,"actual_year","PLC","PLC: Year","PLC")

Statistics of PLC based on the category actual_year
	group1	n	mean	sd	median	min	max	range
X11	Expert	7	0.71	0.08	0.69	0.60	0.82	0.22
X12	first_year	21	0.12	0.14	0.12	-0.21	0.29	0.50
X13	second_year	21	0.12	0.10	0.13	-0.09	0.28	0.36
X14	third_year	21	0.18	0.12	0.17	-0.07	0.36	0.43

Anova: PLC: Year
	Testing statistical significance: p-values
first_year-Expert	0.0000000
second_year-Expert	0.0000000
third_year-Expert	0.0000000
second_year-first_year	0.9964900
third_year-first_year	0.3491960
third_year-second_year	0.4716465

plotAndTable(allBiochem,"race_binary","PLC","PLC: White/Non-white","PLC")

Statistics of PLC based on the category race_binary
	group1	n	mean	sd	median	min	max	range
X11	Non-white	20	0.11	0.12	0.13	-0.21	0.31	0.52
X12	White	42	0.16	0.11	0.17	-0.08	0.36	0.44

Anova: PLC: White/Non-white
Testing statistical significance: p-values
0.1909842

#plotAndTable(allBiochem,"Sex_birth","PLC","PLC: Sex","PLC")

Only female students consistently took the Nucleic Acids survey, so no “Sex_birth” analysis is provided.

4.2 NS only: Anova

plotAndTable(allBiochem,"Course_collected","NS","NS: Course","NS")

Statistics of NS based on the category Course_collected
	group1	n	mean	sd	median	min	max	range
X11	O Chem 1	21	0.16	0.08	0.14	0.04	0.39	0.35
X12	O Chem 2	21	0.16	0.05	0.17	0.04	0.27	0.23
X13	BiocF22	21	0.19	0.08	0.20	0.04	0.30	0.26
X14	Expert	7	0.43	0.08	0.44	0.33	0.53	0.20

Anova: NS: Course
	Testing statistical significance: p-values
O Chem 2-O Chem 1	0.9947528
BiocF22-O Chem 1	0.7725937
Expert-O Chem 1	0.0000000
BiocF22-O Chem 2	0.6257497
Expert-O Chem 2	0.0000000
Expert-BiocF22	0.0000000

plotAndTable(allBiochem,"actual_year","NS","NS: Year","NS")

Statistics of NS based on the category actual_year
	group1	n	mean	sd	median	min	max	range
X11	Expert	7	0.43	0.08	0.44	0.33	0.53	0.20
X12	first_year	21	0.16	0.08	0.14	0.04	0.39	0.35
X13	second_year	21	0.16	0.05	0.17	0.04	0.27	0.23
X14	third_year	21	0.19	0.08	0.20	0.04	0.30	0.26

Anova: NS: Year
	Testing statistical significance: p-values
first_year-Expert	0.0000000
second_year-Expert	0.0000000
third_year-Expert	0.0000000
second_year-first_year	0.9947528
third_year-first_year	0.7725937
third_year-second_year	0.6257497

plotAndTable(allBiochem,"race_binary","NS","NS: White/Non-white","NS")

Statistics of NS based on the category race_binary
	group1	n	mean	sd	median	min	max	range
X11	Non-white	20	0.18	0.09	0.19	0.04	0.39	0.35
X12	White	42	0.16	0.06	0.15	0.04	0.30	0.26

Anova: NS: White/Non-white
Testing statistical significance: p-values
0.3247307

#plotAndTable(allBiochem,"Sex_birth","NS","NS: Sex","NS")

Only female students consistently took the Nucleic Acids survey, so no “Sex_birth” analysis is provided.

4.3 PLC/NS clustering

plotAndTable(allBiochem,"clusterLetter","PLC","PLC: Cluster letter","PLC")

Statistics of PLC based on the category clusterLetter
	group1	n	mean	sd	median	min	max	range
X11	Expert	7	0.71	0.08	0.69	0.60	0.82	0.22
X12	HP	22	0.25	0.06	0.23	0.14	0.36	0.22
X13	IP	30	0.12	0.07	0.12	0.00	0.31	0.31
X14	LP	11	-0.04	0.08	-0.06	-0.21	0.05	0.26

Anova: PLC: Cluster letter
	Testing statistical significance: p-values
HP-Expert	0e+00
IP-Expert	0e+00
LP-Expert	0e+00
IP-HP	2e-07
LP-HP	0e+00
LP-IP	0e+00

4.3.1 Analysis by course

plotBarAndCorr(allBiochem,"Course_collected","clusterLetter","Course","N of students","High, Intermediate, Low Performance cluster")

The Chi-square analysis gives a p= 0.11131

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# One integer colour code per row, taken from the factor level codes of
# Course_collected.
markerIntegers = as.integer(as.factor(allBiochem$Course_collected))

# NS against PLC, one point per row, drawn with its cluster label as the
# plotting symbol and coloured by course.
# NOTE(review): the title's H/I/L key presumes clusterLetter is a character
# column whose first letter is what gets drawn - confirm.
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  xlab = "PLC",
  ylab = "NS",
  main = "Nucleic Acids - High(H), Intermediate(I), Low(L) performers"
)

# Labels and colours are both reduced with unique() in row order, so the i-th
# label is paired with the i-th colour.
legend(
  x = "topleft",
  legend = unique(allBiochem$Course_collected),
  col = unique(markerIntegers),
  lty = 1,
  cex = 0.8
)

calcStats(allBiochem,"Course_collected")

Results for category: O Chem 2

		High Performers		Intermediate Performers		Low Performers
O Chem 2	Total N= 21	19 %		67 %		14 %
	Sex: males N= 0 ; females N= 20	male	female	male	female	male	female
	Sex: males N= 0 ; females N= 20	NaN %	20 %	NaN %	70 %	NaN %	10 %
	Race: White N= 14 ; Non-white N= 7	white	non-white	white	non-white	white	non-white
	Race: White N= 14 ; Non-white N= 7	21 %	14 %	79 %	43 %	0 %	43 %

Results for category: O Chem 1

		High Performers		Intermediate Performers		Low Performers
O Chem 1	Total N= 21	38 %		33 %		29 %
	Sex: males N= 0 ; females N= 21	male	female	male	female	male	female
	Sex: males N= 0 ; females N= 21	NaN %	38 %	NaN %	33 %	NaN %	29 %
	Race: White N= 14 ; Non-white N= 7	white	non-white	white	non-white	white	non-white
	Race: White N= 14 ; Non-white N= 7	29 %	57 %	36 %	29 %	36 %	14 %

Results for category: BiocF22

		High Performers		Intermediate Performers		Low Performers
BiocF22	Total N= 21	48 %		43 %		9.5 %
	Sex: males N= 0 ; females N= 21	male	female	male	female	male	female
	Sex: males N= 0 ; females N= 21	NaN %	48 %	NaN %	43 %	NaN %	9.5 %
	Race: White N= 14 ; Non-white N= 7	white	non-white	white	non-white	white	non-white
	Race: White N= 14 ; Non-white N= 7	50 %	43 %	43 %	43 %	7.1 %	14 %

4.3.2 Analysis by year

plotBarAndCorr(allBiochem,"actual_year","clusterLetter","Year","N of students","High, Intermediate, Low Performance cluster")

The Chi-square analysis gives a p= 0.11131

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# One integer colour code per row, taken from the factor level codes of
# actual_year.
markerIntegers = as.integer(as.factor(allBiochem$actual_year))

# NS against PLC, one point per row, drawn with its cluster label as the
# plotting symbol and coloured by year.
# NOTE(review): the title's H/I/L key presumes clusterLetter is a character
# column whose first letter is what gets drawn - confirm.
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  xlab = "PLC",
  ylab = "NS",
  main = "Nucleic Acids - High(H), Intermediate(I), Low(L) performers"
)

# Labels and colours are both reduced with unique() in row order, so the i-th
# label is paired with the i-th colour.
legend(
  x = "topleft",
  legend = unique(allBiochem$actual_year),
  col = unique(markerIntegers),
  lty = 1,
  cex = 0.8
)

calcStats2(allBiochem,"actual_year")

Results for category: second_year

		High Performers		Intermediate Performers		Low Performers
second_year	Total N= 21	19 %		67 %		14 %
	Sex: males N= 0 ; females N= 20	male	female	male	female	male	female
	Sex: males N= 0 ; females N= 20	NaN %	20 %	NaN %	70 %	NaN %	10 %
	Race: White N= 14 ; Non-white N= 7	white	non-white	white	non-white	white	non-white
	Race: White N= 14 ; Non-white N= 7	21 %	14 %	79 %	43 %	0 %	43 %

Results for category: first_year

		High Performers		Intermediate Performers		Low Performers
first_year	Total N= 21	38 %		33 %		29 %
	Sex: males N= 0 ; females N= 21	male	female	male	female	male	female
	Sex: males N= 0 ; females N= 21	NaN %	38 %	NaN %	33 %	NaN %	29 %
	Race: White N= 14 ; Non-white N= 7	white	non-white	white	non-white	white	non-white
	Race: White N= 14 ; Non-white N= 7	29 %	57 %	36 %	29 %	36 %	14 %

Results for category: third_year

		High Performers		Intermediate Performers		Low Performers
third_year	Total N= 21	48 %		43 %		9.5 %
	Sex: males N= 0 ; females N= 21	male	female	male	female	male	female
	Sex: males N= 0 ; females N= 21	NaN %	48 %	NaN %	43 %	NaN %	9.5 %
	Race: White N= 14 ; Non-white N= 7	white	non-white	white	non-white	white	non-white
	Race: White N= 14 ; Non-white N= 7	50 %	43 %	43 %	43 %	7.1 %	14 %

# Emit the sub-heading as raw HTML. The line break is written as <br/>:
# br is a void element, so the original "</br>" closing tag was invalid markup.
cat("<b>Chi-square analysis of Performance by Sex and Race considering different years</b><br/>")

Chi-square analysis of Performance by Sex and Race considering different years

#plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="first_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 1st year")
#plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="second_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 2nd year")
#plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="third_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 3rd year")

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="first_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 1st year")

The Chi-square analysis gives a p= 0.40224

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="second_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 2nd year")

The Chi-square analysis gives a p= 0.02993

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="third_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 3rd year")

The Chi-square analysis gives a p= 0.86071

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

5 UMR courses: Oxygen Binding

5.1 PLC only: Anova

We test whether the PLC score differs significantly across the categories “Course collected”, “Student year”, “White/Non-white”, and “Sex at birth”.

# Section 5 setup: rebuild allBiochem for the Oxygen Binding survey.
# umrs4 holds the rows of umr with Survey == "Oxygen_Binding", restricted to
# the students listed in deid (see the top of the file).
allBiochem = analyzeUMRCourses(umrs4)
# Append the expert responses so they show up as the "Expert" group in the
# tables below. NOTE(review): exs4 is built outside this section - presumably
# the experts' answers to the same survey; confirm.
allBiochem = addExperts(allBiochem,exs4)
#buildTables(allBiochem)
# PLC grouped by course: produces the summary statistics and the pairwise
# p-values reported right after this chunk.
plotAndTable(allBiochem,"Course_collected","PLC","PLC: Course","PLC")

Statistics of PLC based on the category Course_collected
	group1	n	mean	sd	median	min	max	range
X11	O Chem 1	25	0.16	0.13	0.18	-0.19	0.34	0.53
X12	O Chem 2	25	0.16	0.15	0.16	-0.12	0.48	0.60
X13	BiocF22	25	0.19	0.13	0.20	-0.04	0.41	0.45
X14	Expert	15	0.69	0.13	0.66	0.52	0.89	0.38

Anova: PLC: Course
	Testing statistical significance: p-values
O Chem 2-O Chem 1	1.0000000
BiocF22-O Chem 1	0.8602106
Expert-O Chem 1	0.0000000
BiocF22-O Chem 2	0.8582771
Expert-O Chem 2	0.0000000
Expert-BiocF22	0.0000000

plotAndTable(allBiochem,"actual_year","PLC","PLC: Year","PLC")

Statistics of PLC based on the category actual_year
	group1	n	mean	sd	median	min	max	range
X11	Expert	15	0.69	0.13	0.66	0.52	0.89	0.38
X12	first_year	25	0.16	0.13	0.18	-0.19	0.34	0.53
X13	second_year	25	0.16	0.15	0.16	-0.12	0.48	0.60
X14	third_year	25	0.19	0.13	0.20	-0.04	0.41	0.45

Anova: PLC: Year
	Testing statistical significance: p-values
first_year-Expert	0.0000000
second_year-Expert	0.0000000
third_year-Expert	0.0000000
second_year-first_year	1.0000000
third_year-first_year	0.8602106
third_year-second_year	0.8582771

plotAndTable(allBiochem,"race_binary","PLC","PLC: White/Non-white","PLC")

Statistics of PLC based on the category race_binary
	group1	n	mean	sd	median	min	max	range
X11	Non-white	29	0.14	0.13	0.14	-0.12	0.40	0.52
X12	White	45	0.20	0.13	0.20	-0.19	0.48	0.68

Anova: PLC: White/Non-white
Testing statistical significance: p-values
0.053386

plotAndTable(allBiochem,"Sex_birth","PLC","PLC: Sex","PLC")

Statistics of PLC based on the category Sex_birth
	group1	n	mean	sd	median	min	max	range
X11	Female	71	0.17	0.13	0.18	-0.19	0.48	0.68
X12	Male	3	0.19	0.14	0.16	0.06	0.35	0.28

Anova: PLC: Sex
Testing statistical significance: p-values
0.8472443

5.2 NS only: Anova

plotAndTable(allBiochem,"Course_collected","NS","NS: Course","NS")

Statistics of NS based on the category Course_collected
	group1	n	mean	sd	median	min	max	range
X11	O Chem 1	25	0.16	0.06	0.17	0.04	0.30	0.26
X12	O Chem 2	25	0.15	0.07	0.15	0.00	0.30	0.30
X13	BiocF22	25	0.17	0.06	0.17	0.04	0.30	0.26
X14	Expert	15	0.35	0.09	0.35	0.25	0.53	0.28

Anova: NS: Course
	Testing statistical significance: p-values
O Chem 2-O Chem 1	0.9166441
BiocF22-O Chem 1	0.9843835
Expert-O Chem 1	0.0000000
BiocF22-O Chem 2	0.7483636
Expert-O Chem 2	0.0000000
Expert-BiocF22	0.0000000

plotAndTable(allBiochem,"actual_year","NS","NS: Year","NS")

Statistics of NS based on the category actual_year
	group1	n	mean	sd	median	min	max	range
X11	Expert	15	0.35	0.09	0.35	0.25	0.53	0.28
X12	first_year	25	0.16	0.06	0.17	0.04	0.30	0.26
X13	second_year	25	0.15	0.07	0.15	0.00	0.30	0.30
X14	third_year	25	0.17	0.06	0.17	0.04	0.30	0.26

Anova: NS: Year
	Testing statistical significance: p-values
first_year-Expert	0.0000000
second_year-Expert	0.0000000
third_year-Expert	0.0000000
second_year-first_year	0.9166441
third_year-first_year	0.9843835
third_year-second_year	0.7483636

plotAndTable(allBiochem,"race_binary","NS","NS: White/Non-white","NS")

Statistics of NS based on the category race_binary
	group1	n	mean	sd	median	min	max	range
X11	Non-white	29	0.15	0.06	0.15	0.04	0.3	0.26
X12	White	45	0.17	0.07	0.17	0.00	0.3	0.30

Anova: NS: White/Non-white
Testing statistical significance: p-values
0.3355991

plotAndTable(allBiochem,"Sex_birth","NS","NS: Sex","NS")

Statistics of NS based on the category Sex_birth
	group1	n	mean	sd	median	min	max	range
X11	Female	71	0.16	0.07	0.17	0.00	0.30	0.30
X12	Male	3	0.13	0.07	0.15	0.05	0.18	0.13

Anova: NS: Sex
Testing statistical significance: p-values
0.3499788

5.3 PLC/NS clustering

plotAndTable(allBiochem,"clusterLetter","PLC","PLC: Cluster letter","PLC")

Statistics of PLC based on the category clusterLetter
	group1	n	mean	sd	median	min	max	range
X11	Expert	15	0.69	0.13	0.66	0.52	0.89	0.38
X12	HP	25	0.27	0.10	0.26	0.10	0.48	0.38
X13	IP	36	0.18	0.07	0.18	0.03	0.37	0.34
X14	LP	14	-0.02	0.09	-0.01	-0.19	0.08	0.28

Anova: PLC: Cluster letter
	Testing statistical significance: p-values
HP-Expert	0.0000000
IP-Expert	0.0000000
LP-Expert	0.0000000
IP-HP	0.0054258
LP-HP	0.0000000
LP-IP	0.0000000

5.3.1 Analysis by course

plotBarAndCorr(allBiochem,"Course_collected","clusterLetter","Course","N of students","High, Intermediate, Low Performance cluster")

The Chi-square analysis gives a p= 0.48122

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# One integer colour code per row, taken from the factor level codes of
# Course_collected.
markerIntegers = as.integer(as.factor(allBiochem$Course_collected))

# NS against PLC, one point per row, drawn with its cluster label as the
# plotting symbol and coloured by course.
# NOTE(review): the title's H/I/L key presumes clusterLetter is a character
# column whose first letter is what gets drawn - confirm.
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  xlab = "PLC",
  ylab = "NS",
  main = "Oxygen Binding - High(H), Intermediate(I), Low(L) performers"
)

# Labels and colours are both reduced with unique() in row order, so the i-th
# label is paired with the i-th colour.
legend(
  x = "topleft",
  legend = unique(allBiochem$Course_collected),
  col = unique(markerIntegers),
  lty = 1,
  cex = 0.8
)

calcStats(allBiochem,"Course_collected")

Results for category: O Chem 2

		High Performers		Intermediate Performers		Low Performers
O Chem 2	Total N= 25	20 %		56 %		24 %
	Sex: males N= 1 ; females N= 23	male	female	male	female	male	female
	Sex: males N= 1 ; females N= 23	0 %	22 %	0 %	57 %	100 %	22 %
	Race: White N= 15 ; Non-white N= 10	white	non-white	white	non-white	white	non-white
	Race: White N= 15 ; Non-white N= 10	33 %	0 %	47 %	70 %	20 %	30 %

Results for category: O Chem 1

		High Performers		Intermediate Performers		Low Performers
O Chem 1	Total N= 25	36 %		48 %		16 %
	Sex: males N= 1 ; females N= 24	male	female	male	female	male	female
	Sex: males N= 1 ; females N= 24	0 %	38 %	100 %	46 %	0 %	17 %
	Race: White N= 15 ; Non-white N= 10	white	non-white	white	non-white	white	non-white
	Race: White N= 15 ; Non-white N= 10	47 %	20 %	40 %	60 %	13 %	20 %

Results for category: BiocF22

		High Performers		Intermediate Performers		Low Performers
BiocF22	Total N= 25	44 %		40 %		16 %
	Sex: males N= 1 ; females N= 24	male	female	male	female	male	female
	Sex: males N= 1 ; females N= 24	100 %	42 %	0 %	42 %	0 %	17 %
	Race: White N= 15 ; Non-white N= 10	white	non-white	white	non-white	white	non-white
	Race: White N= 15 ; Non-white N= 10	47 %	40 %	40 %	40 %	13 %	20 %

5.3.2 Analysis by year

plotBarAndCorr(allBiochem,"actual_year","clusterLetter","Year","N of students","High, Intermediate, Low Performance cluster")

The Chi-square analysis gives a p= 0.48122

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# One integer colour code per row, taken from the factor level codes of
# actual_year.
markerIntegers = as.integer(as.factor(allBiochem$actual_year))

# NS against PLC, one point per row, drawn with its cluster label as the
# plotting symbol and coloured by year.
# NOTE(review): the title's H/I/L key presumes clusterLetter is a character
# column whose first letter is what gets drawn - confirm.
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  xlab = "PLC",
  ylab = "NS",
  main = "Oxygen Binding - High(H), Intermediate(I), Low(L) performers"
)

# Labels and colours are both reduced with unique() in row order, so the i-th
# label is paired with the i-th colour.
legend(
  x = "topleft",
  legend = unique(allBiochem$actual_year),
  col = unique(markerIntegers),
  lty = 1,
  cex = 0.8
)

calcStats2(allBiochem,"actual_year")

Results for category: second_year

		High Performers		Intermediate Performers		Low Performers
second_year	Total N= 25	20 %		56 %		24 %
	Sex: males N= 1 ; females N= 23	male	female	male	female	male	female
	Sex: males N= 1 ; females N= 23	0 %	22 %	0 %	57 %	100 %	22 %
	Race: White N= 15 ; Non-white N= 10	white	non-white	white	non-white	white	non-white
	Race: White N= 15 ; Non-white N= 10	33 %	0 %	47 %	70 %	20 %	30 %

Results for category: first_year

		High Performers		Intermediate Performers		Low Performers
first_year	Total N= 25	36 %		48 %		16 %
	Sex: males N= 1 ; females N= 24	male	female	male	female	male	female
	Sex: males N= 1 ; females N= 24	0 %	38 %	100 %	46 %	0 %	17 %
	Race: White N= 15 ; Non-white N= 10	white	non-white	white	non-white	white	non-white
	Race: White N= 15 ; Non-white N= 10	47 %	20 %	40 %	60 %	13 %	20 %

Results for category: third_year

		High Performers		Intermediate Performers		Low Performers
third_year	Total N= 25	44 %		40 %		16 %
	Sex: males N= 1 ; females N= 24	male	female	male	female	male	female
	Sex: males N= 1 ; females N= 24	100 %	42 %	0 %	42 %	0 %	17 %
	Race: White N= 15 ; Non-white N= 10	white	non-white	white	non-white	white	non-white
	Race: White N= 15 ; Non-white N= 10	47 %	40 %	40 %	40 %	13 %	20 %

# Emit the sub-heading as raw HTML. The line break is written as <br/>:
# br is a void element, so the original "</br>" closing tag was invalid markup.
cat("<b>Chi-square analysis of Performance by Sex and Race considering different years</b><br/>")

Chi-square analysis of Performance by Sex and Race considering different years

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="first_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 1st year")

The Chi-square analysis gives a p= 0.56879

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="second_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 2nd year")

The Chi-square analysis gives a p= 0.20904

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="third_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 3rd year")

The Chi-square analysis gives a p= 0.51537

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="first_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 1st year")

The Chi-square analysis gives a p= 0.39616

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="second_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 2nd year")

The Chi-square analysis gives a p= 0.12451

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="third_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 3rd year")

The Chi-square analysis gives a p= 0.89258

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

6 UMR courses: Protein Structure

6.1 PLC only: Anova

We test whether the PLC score differs significantly across the categories “Course collected”, “Student year”, “White/Non-white”, and “Sex at birth”.

# Section 6 setup: rebuild allBiochem for the Protein Structure survey.
# umrs5 holds the rows of umr with Survey == "Protein_Structure", restricted to
# the students listed in deid (see the top of the file).
allBiochem = analyzeUMRCourses(umrs5)
# Append the expert responses so they show up as the "Expert" group in the
# tables below. NOTE(review): exs5 is built outside this section - presumably
# the experts' answers to the same survey; confirm.
allBiochem = addExperts(allBiochem,exs5)
#buildTables(allBiochem)
# PLC grouped by course: produces the summary statistics and the pairwise
# p-values reported right after this chunk.
plotAndTable(allBiochem,"Course_collected","PLC","PLC: Course","PLC")

Statistics of PLC based on the category Course_collected
	group1	n	mean	sd	median	min	max	range
X11	O Chem 1	21	0.14	0.15	0.13	-0.11	0.46	0.57
X12	O Chem 2	21	0.20	0.20	0.22	-0.18	0.62	0.80
X13	BiocF22	21	0.25	0.11	0.27	0.03	0.51	0.48
X14	Expert	7	0.76	0.10	0.79	0.59	0.89	0.30

Anova: PLC: Course
	Testing statistical significance: p-values
O Chem 2-O Chem 1	0.5529700
BiocF22-O Chem 1	0.0890982
Expert-O Chem 1	0.0000000
BiocF22-O Chem 2	0.7099252
Expert-O Chem 2	0.0000000
Expert-BiocF22	0.0000000

# PLC summary statistics and pairwise p-values, grouped by student year.
plotAndTable(
  allBiochem, "actual_year", "PLC",
  "PLC: Year", "PLC"
)

Statistics of PLC based on the category actual_year
	group1	n	mean	sd	median	min	max	range
X11	Expert	7	0.76	0.10	0.79	0.59	0.89	0.30
X12	first_year	21	0.14	0.15	0.13	-0.11	0.46	0.57
X13	second_year	21	0.20	0.20	0.22	-0.18	0.62	0.80
X14	third_year	21	0.25	0.11	0.27	0.03	0.51	0.48

Anova: PLC: Year
	Testing statistical significance: p-values
first_year-Expert	0.0000000
second_year-Expert	0.0000000
third_year-Expert	0.0000000
second_year-first_year	0.5529700
third_year-first_year	0.0890982
third_year-second_year	0.7099252

# PLC summary statistics and p-value for the two race_binary groups
# (White / Non-white).
plotAndTable(
  allBiochem, "race_binary", "PLC",
  "PLC: White/Non-white", "PLC"
)

Statistics of PLC based on the category race_binary
	group1	n	mean	sd	median	min	max	range
X11	Non-white	20	0.14	0.17	0.19	-0.18	0.41	0.59
X12	White	42	0.22	0.16	0.22	-0.07	0.62	0.69

Anova: PLC: White/Non-white
Testing statistical significance: p-values
0.0748154

# PLC summary statistics and p-value for the two Sex_birth groups.
# NOTE(review): the table printed below lists 59 females and only 3 males,
# so the reported p-value rests on a very small group -- interpret with care.
plotAndTable(
  allBiochem, "Sex_birth", "PLC",
  "PLC: Sex", "PLC"
)

Statistics of PLC based on the category Sex_birth
	group1	n	mean	sd	median	min	max	range
X11	Female	59	0.20	0.16	0.22	-0.18	0.62	0.80
X12	Male	3	-0.01	0.06	-0.04	-0.05	0.07	0.11

Anova: PLC: Sex
Testing statistical significance: p-values
0.0288722

6.2 NS only: Anova

# NS summary statistics and pairwise p-values, grouped by course.
plotAndTable(
  allBiochem, "Course_collected", "NS",
  "NS: Course", "NS"
)

Statistics of NS based on the category Course_collected
	group1	n	mean	sd	median	min	max	range
X11	O Chem 1	21	0.16	0.05	0.16	0.03	0.25	0.22
X12	O Chem 2	21	0.17	0.07	0.15	0.04	0.35	0.31
X13	BiocF22	21	0.18	0.04	0.18	0.08	0.29	0.21
X14	Expert	7	0.35	0.08	0.35	0.24	0.44	0.21

Anova: NS: Course
	Testing statistical significance: p-values
O Chem 2-O Chem 1	0.9395887
BiocF22-O Chem 1	0.6126707
Expert-O Chem 1	0.0000000
BiocF22-O Chem 2	0.9143450
Expert-O Chem 2	0.0000000
Expert-BiocF22	0.0000001

# NS summary statistics and pairwise p-values, grouped by student year.
plotAndTable(
  allBiochem, "actual_year", "NS",
  "NS: Year", "NS"
)

Statistics of NS based on the category actual_year
	group1	n	mean	sd	median	min	max	range
X11	Expert	7	0.35	0.08	0.35	0.24	0.44	0.21
X12	first_year	21	0.16	0.05	0.16	0.03	0.25	0.22
X13	second_year	21	0.17	0.07	0.15	0.04	0.35	0.31
X14	third_year	21	0.18	0.04	0.18	0.08	0.29	0.21

Anova: NS: Year
	Testing statistical significance: p-values
first_year-Expert	0.0000000
second_year-Expert	0.0000000
third_year-Expert	0.0000001
second_year-first_year	0.9395887
third_year-first_year	0.6126707
third_year-second_year	0.9143450

# NS summary statistics and p-value for the two race_binary groups
# (White / Non-white).
plotAndTable(
  allBiochem, "race_binary", "NS",
  "NS: White/Non-white", "NS"
)

Statistics of NS based on the category race_binary
	group1	n	mean	sd	median	min	max	range
X11	Non-white	20	0.17	0.06	0.17	0.04	0.28	0.24
X12	White	42	0.17	0.06	0.17	0.03	0.35	0.31

Anova: NS: White/Non-white
Testing statistical significance: p-values
0.8978509

# NS summary statistics and p-value for the two Sex_birth groups.
# NOTE(review): the table printed below lists 59 females and only 3 males,
# so the reported p-value rests on a very small group -- interpret with care.
plotAndTable(
  allBiochem, "Sex_birth", "NS",
  "NS: Sex", "NS"
)

Statistics of NS based on the category Sex_birth
	group1	n	mean	sd	median	min	max	range
X11	Female	59	0.17	0.06	0.17	0.03	0.35	0.31
X12	Male	3	0.12	0.03	0.12	0.09	0.14	0.05

Anova: NS: Sex
Testing statistical significance: p-values
0.1400967

6.3 PLC/NS clustering

# PLC summary statistics and pairwise p-values, grouped by cluster label
# (Expert, HP, IP, LP in the table printed below).
plotAndTable(
  allBiochem, "clusterLetter", "PLC",
  "PLC: Cluster letter", "PLC"
)

Statistics of PLC based on the category clusterLetter
	group1	n	mean	sd	median	min	max	range
X11	Expert	7	0.76	0.10	0.79	0.59	0.89	0.30
X12	HP	14	0.35	0.13	0.32	0.11	0.62	0.51
X13	IP	30	0.25	0.08	0.24	0.13	0.46	0.33
X14	LP	19	0.00	0.09	0.02	-0.18	0.11	0.29

Anova: PLC: Cluster letter
	Testing statistical significance: p-values
HP-Expert	0.0000000
IP-Expert	0.0000000
LP-Expert	0.0000000
IP-HP	0.0108399
LP-HP	0.0000000
LP-IP	0.0000000

6.3.1 Analysis by course

# Cluster membership vs. course for the whole table. The output that follows
# reports a chi-square p-value and a residuals analysis.
# NOTE(review): the p-value printed here equals the one in the by-year
# analysis; the summary tables suggest course and year split the students
# into the same groups for this data set -- confirm.
plotBarAndCorr(
  allBiochem, "Course_collected", "clusterLetter",
  "Course", "N of students",
  "High, Intermediate, Low Performance cluster"
)

The Chi-square analysis gives a p= 0.11599

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Scatter plot of NS (y) against PLC (x), coloured by course.
# Courses are turned into integer codes so they can serve as colour indices.
markerIntegers = as.integer(as.factor(allBiochem$Course_collected))
# NOTE(review): pch is given clusterLetter directly; for a character vector
# the plotting symbol is the first character of each value. If the column is
# a factor, integer symbol codes would be used instead -- confirm its type.
plot(
  allBiochem$PLC, allBiochem$NS,
  pch = allBiochem$clusterLetter,
  main = "Protein Structure - High(H), Intermediate(I), Low(L) performers",
  ylab = "NS",
  xlab = "PLC",
  col = markerIntegers
)
# unique() is applied to both the labels and the colour codes, so both are
# listed in order of first appearance and stay paired.
legend(
  "topleft",
  legend = unique(allBiochem$Course_collected),
  col = unique(markerIntegers),
  lty = 1:1,
  cex = 0.8
)

# Per-course breakdown of High / Intermediate / Low performer percentages,
# further split by sex and race (see the tables printed below).
# NOTE(review): for "O Chem 2" the output shows males N= 1 and females N= 19
# against Total N= 21; presumably one record has a missing or other
# Sex_birth value -- verify.
calcStats(allBiochem, "Course_collected")

Results for category: O Chem 2

		High Performers		Intermediate Performers		Low Performers
O Chem 2	Total N= 21	33 %		33 %		33 %
	Sex: males N= 1 ; females N= 19	male	female	male	female	male	female
	Sex: males N= 1 ; females N= 19	0 %	37 %	0 %	32 %	100 %	32 %
	Race: White N= 14 ; Non-white N= 7	white	non-white	white	non-white	white	non-white
	Race: White N= 14 ; Non-white N= 7	29 %	43 %	43 %	14 %	29 %	43 %

Results for category: O Chem 1

		High Performers		Intermediate Performers		Low Performers
O Chem 1	Total N= 21	14 %		43 %		43 %
	Sex: males N= 1 ; females N= 20	male	female	male	female	male	female
	Sex: males N= 1 ; females N= 20	0 %	15 %	0 %	45 %	100 %	40 %
	Race: White N= 14 ; Non-white N= 7	white	non-white	white	non-white	white	non-white
	Race: White N= 14 ; Non-white N= 7	14 %	14 %	50 %	29 %	36 %	57 %

Results for category: BiocF22

		High Performers		Intermediate Performers		Low Performers
BiocF22	Total N= 21	19 %		67 %		14 %
	Sex: males N= 1 ; females N= 20	male	female	male	female	male	female
	Sex: males N= 1 ; females N= 20	0 %	20 %	0 %	70 %	100 %	10 %
	Race: White N= 14 ; Non-white N= 7	white	non-white	white	non-white	white	non-white
	Race: White N= 14 ; Non-white N= 7	21 %	14 %	71 %	57 %	7.1 %	29 %

6.3.2 Analysis by year

# Cluster membership vs. student year for the whole table. The output that
# follows reports a chi-square p-value and a residuals analysis.
# NOTE(review): the p-value printed here equals the one in the by-course
# analysis; the summary tables suggest course and year split the students
# into the same groups for this data set -- confirm.
plotBarAndCorr(
  allBiochem, "actual_year", "clusterLetter",
  "Year", "N of students",
  "High, Intermediate, Low Performance cluster"
)

The Chi-square analysis gives a p= 0.11599

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Scatter plot of NS (y) against PLC (x), coloured by student year.
# Years are turned into integer codes so they can serve as colour indices.
markerIntegers = as.integer(as.factor(allBiochem$actual_year))
# NOTE(review): pch is given clusterLetter directly; for a character vector
# the plotting symbol is the first character of each value. If the column is
# a factor, integer symbol codes would be used instead -- confirm its type.
plot(
  allBiochem$PLC, allBiochem$NS,
  pch = allBiochem$clusterLetter,
  main = "Protein Structure - High(H), Intermediate(I), Low(L) performers",
  ylab = "NS",
  xlab = "PLC",
  col = markerIntegers
)
# unique() is applied to both the labels and the colour codes, so both are
# listed in order of first appearance and stay paired.
legend(
  "topleft",
  legend = unique(allBiochem$actual_year),
  col = unique(markerIntegers),
  lty = 1:1,
  cex = 0.8
)

# Per-year breakdown of High / Intermediate / Low performer percentages,
# further split by sex and race (see the tables printed below).
# NOTE(review): the by-course analysis calls calcStats while this one calls
# calcStats2; the printed tables have the same layout, and the difference
# between the two helpers is not visible from here -- confirm it is intended.
# NOTE(review): for "second_year" the output shows males N= 1 and
# females N= 19 against Total N= 21 -- verify the missing record.
calcStats2(allBiochem, "actual_year")

Results for category: second_year

		High Performers		Intermediate Performers		Low Performers
second_year	Total N= 21	33 %		33 %		33 %
	Sex: males N= 1 ; females N= 19	male	female	male	female	male	female
	Sex: males N= 1 ; females N= 19	0 %	37 %	0 %	32 %	100 %	32 %
	Race: White N= 14 ; Non-white N= 7	white	non-white	white	non-white	white	non-white
	Race: White N= 14 ; Non-white N= 7	29 %	43 %	43 %	14 %	29 %	43 %

Results for category: first_year

		High Performers		Intermediate Performers		Low Performers
first_year	Total N= 21	14 %		43 %		43 %
	Sex: males N= 1 ; females N= 20	male	female	male	female	male	female
	Sex: males N= 1 ; females N= 20	0 %	15 %	0 %	45 %	100 %	40 %
	Race: White N= 14 ; Non-white N= 7	white	non-white	white	non-white	white	non-white
	Race: White N= 14 ; Non-white N= 7	14 %	14 %	50 %	29 %	36 %	57 %

Results for category: third_year

		High Performers		Intermediate Performers		Low Performers
third_year	Total N= 21	19 %		67 %		14 %
	Sex: males N= 1 ; females N= 20	male	female	male	female	male	female
	Sex: males N= 1 ; females N= 20	0 %	20 %	0 %	70 %	100 %	10 %
	Race: White N= 14 ; Non-white N= 7	white	non-white	white	non-white	white	non-white
	Race: White N= 14 ; Non-white N= 7	21 %	14 %	71 %	57 %	7.1 %	29 %

# Emit a bold HTML heading for the per-year chi-square analyses that follow.
# The original string ended with "</br>", which is not valid HTML: br is a
# void element and has no closing tag. "<br>" produces the intended line break.
cat("<b>Chi-square analysis of Performance by Sex and Race considering different years</b><br>")

Chi-square analysis of Performance by Sex and Race considering different years

# Cluster membership vs. sex at birth, restricted to first-year students.
# The output that follows reports a chi-square p-value and a residuals analysis.
# NOTE(review): the per-year summary above reports a single male in this
# group, so expected cell counts are very small and the chi-square
# approximation may be unreliable -- consider an exact test.
plotBarAndCorr(
  allBiochem[which(allBiochem$actual_year == "first_year"), ],
  "Sex_birth", "clusterLetter",
  "Sex", "N of students", "Performance by Sex 1st year"
)

The Chi-square analysis gives a p= 0.49659

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Cluster membership vs. sex at birth, restricted to second-year students.
# The output that follows reports a chi-square p-value and a residuals analysis.
# NOTE(review): the per-year summary above reports a single male in this
# group, so expected cell counts are very small and the chi-square
# approximation may be unreliable -- consider an exact test.
plotBarAndCorr(
  allBiochem[which(allBiochem$actual_year == "second_year"), ],
  "Sex_birth", "clusterLetter",
  "Sex", "N of students", "Performance by Sex 2nd year"
)

The Chi-square analysis gives a p= 0.37627

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Cluster membership vs. sex at birth, restricted to third-year students.
# The output that follows reports a chi-square p-value and a residuals analysis.
# NOTE(review): the per-year summary above reports a single male in this
# group, so the p-value below 0.05 printed for this call hinges on one
# student; the chi-square approximation may be unreliable -- consider an
# exact test before drawing conclusions.
plotBarAndCorr(
  allBiochem[which(allBiochem$actual_year == "third_year"), ],
  "Sex_birth", "clusterLetter",
  "Sex", "N of students", "Performance by Sex 3rd year"
)

The Chi-square analysis gives a p= 0.04285

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Cluster membership vs. race (binary column), restricted to first-year
# students. The output that follows reports a chi-square p-value and a
# residuals analysis.
plotBarAndCorr(
  allBiochem[which(allBiochem$actual_year == "first_year"), ],
  "race_binary", "clusterLetter",
  "Race", "N of students", "Performance by Race 1st year"
)

The Chi-square analysis gives a p= 0.60653

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Cluster membership vs. race (binary column), restricted to second-year
# students. The output that follows reports a chi-square p-value and a
# residuals analysis.
plotBarAndCorr(
  allBiochem[which(allBiochem$actual_year == "second_year"), ],
  "race_binary", "clusterLetter",
  "Race", "N of students", "Performance by Race 2nd year"
)

The Chi-square analysis gives a p= 0.42437

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Cluster membership vs. race (binary column), restricted to third-year
# students. The output that follows reports a chi-square p-value and a
# residuals analysis.
plotBarAndCorr(
  allBiochem[which(allBiochem$actual_year == "third_year"), ],
  "race_binary", "clusterLetter",
  "Race", "N of students", "Performance by Race 3rd year"
)

The Chi-square analysis gives a p= 0.41316

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

Clustering only students who were in all semesters

Xavier Prat-Resina

2023-03-15

1 Introduction

2 UMR courses: ES Chemical Equation

2.1 PLC only: Anova

2.2 NS only: Anova

2.3 PLC/NS clustering

2.3.1 Analysis by course

2.3.2 Analysis by year

3 UMR courses: ES Glucosidase

3.1 PLC only: Anova

3.2 NS only: Anova

3.3 PLC/NS clustering

3.3.1 Analysis by course

3.3.2 Analysis by year

4 UMR courses: Nucleic Acids

4.1 PLC only: Anova

4.2 NS only: Anova

4.3 PLC/NS clustering

4.3.1 Analysis by course

4.3.2 Analysis by year

5 UMR courses: Oxygen Binding

5.1 PLC only: Anova

5.2 NS only: Anova

5.3 PLC/NS clustering

5.3.1 Analysis by course

5.3.2 Analysis by year

6 UMR courses: Protein Structure

6.1 PLC only: Anova

6.2 NS only: Anova

6.3 PLC/NS clustering

6.3.1 Analysis by course

6.3.2 Analysis by year