# ---- Load student responses -------------------------------------------------
# NOTE(review): setwd() ties the script to one machine's directory layout; all
# read.csv() calls below are resolved relative to this directory.
setwd("~/Research/02b Neural Network Research UMR/Data + Analysis/Clustering_Xavier")
umr = read.csv("UMR_all_for_R_with_courses.csv",header = TRUE)

# Relabel 'Biochem 1' per collection term so that the Fall 2022 and Fall 2021
# offerings appear as separate courses ('BiocF22' / 'BiocF21').
umr[which(umr$Term_collected == "Fall2022"),]$Course_collected = gsub('Biochem 1', 'BiocF22', umr[which(umr$Term_collected == "Fall2022"),]$Course_collected)
umr[which(umr$Term_collected == "Fall2021"),]$Course_collected = gsub('Biochem 1', 'BiocF21', umr[which(umr$Term_collected == "Fall2021"),]$Course_collected)

# One data frame per survey (student responses).
umrs1 = umr[which(umr$Survey=="ES_Chemical_Reaction"),]
umrs2 = umr[which(umr$Survey=="ES_Glucosidase"),]
umrs3 = umr[which(umr$Survey=="Nucleic_Acids"),]
umrs4 = umr[which(umr$Survey=="Oxygen_Binding"),]
umrs5 = umr[which(umr$Survey=="Protein_Structure"),]

# ---- Keep only the longitudinal cohort --------------------------------------
# Filter out the students who did not continue through OChem 1, OChem 2 and
# Biochem. The de-identifier file has one column per term (s21, f21, f22);
# presumably each row is one student -- TODO confirm against the CSV.
deid = read.csv("deids_f22_f21_s21.csv",header = TRUE)
# Some of the data frames below do not have a row count that is a multiple of 3,
# which means the de-identifier list does not fully match the main answer file
# (has UMR_all_for_R_with_courses.csv dropped some students?).
# The clean solution would be to drop every student who is not present in all
# three terms; for now the one known mismatch, UMRBIOC3321F22ES045, is removed
# from deid by hand.
deid = deid[!deid$f22=="UMRBIOC3321F22ES045",]
# Keep a response when its Deidentifier appears in any of the three term columns.
umrs1 = umrs1[umrs1$Deidentifier %in% deid$s21 |  umrs1$Deidentifier %in% deid$f21 | umrs1$Deidentifier %in% deid$f22 ,]
umrs2 = umrs2[umrs2$Deidentifier %in% deid$s21 |  umrs2$Deidentifier %in% deid$f21 | umrs2$Deidentifier %in% deid$f22 ,]
umrs3 = umrs3[umrs3$Deidentifier %in% deid$s21 |  umrs3$Deidentifier %in% deid$f21 | umrs3$Deidentifier %in% deid$f22 ,]
umrs4 = umrs4[umrs4$Deidentifier %in% deid$s21 |  umrs4$Deidentifier %in% deid$f21 | umrs4$Deidentifier %in% deid$f22 ,]
umrs5 = umrs5[umrs5$Deidentifier %in% deid$s21 |  umrs5$Deidentifier %in% deid$f21 | umrs5$Deidentifier %in% deid$f22 ,]

# Quick sanity checks on per-term counts (kept for interactive use):
#sum(umrs5$Term_collected=='Fall2021')
#sum(umrs5$Term_collected=='Fall2022')
#sum(umrs5$Term_collected=='Spring2021')

# ---- Load expert responses --------------------------------------------------
# One data frame per survey (expert responses).
# NOTE(review): the expert label "ES_Chemical_Equation" differs from the student
# label "ES_Chemical_Reaction" used above -- confirm this matches the expert CSV.
# NOTE(review): "Protein_Strcuture" looks misspelled; if the expert CSV spells it
# "Protein_Structure", exs5 will silently be empty -- confirm against the CSV.
expert = read.csv("Experts_all_for_R.csv",header = TRUE)
exs1 = expert[which(expert$Survey=="ES_Chemical_Equation"),]
exs2 = expert[which(expert$Survey=="ES_Glucosidase"),]
exs3 = expert[which(expert$Survey=="Nucleic_Acids"),]
exs4 = expert[which(expert$Survey=="Oxygen_Binding"),]
exs5 = expert[which(expert$Survey=="Protein_Strcuture"),]
library(psych)

# Prepare one survey's student responses and assign each response to a
# performance cluster.
#
# Args:
#   umrs1: data frame of student responses for a single survey. Must contain the
#          columns Institution, Course_collected, Deidentifier, Sex_birth,
#          Race_ethnicity, Coherency, NS, actual_year and PLC.
#
# Returns:
#   A data frame with those nine columns (Coherency, NS and PLC coerced to
#   numeric) followed by:
#     race_binary   - "White" when Race_ethnicity is "White/Caucasian",
#                     otherwise "Non-white"
#     cluster       - k-means cluster id (1..3) computed on scaled PLC and NS
#     clusterLetter - "HP" for the cluster with the highest mean PLC, "LP" for
#                     the lowest, "IP" for the remaining one
#   Course_collected is returned as a factor with a fixed level order.
analyzeUMRCourses = function(umrs1){
  allBiochem = umrs1[,c("Institution", "Course_collected", "Deidentifier","Sex_birth","Race_ethnicity","Coherency","NS","actual_year","PLC")]
  allBiochem$Coherency = as.numeric(allBiochem$Coherency)
  allBiochem$NS = as.numeric(allBiochem$NS)
  allBiochem$PLC = as.numeric(allBiochem$PLC)
  # NOTE(review): every value other than "White/Caucasian" (including any
  # non-response label) is classified as "Non-white" -- confirm this is intended.
  allBiochem$race_binary <- ifelse(allBiochem$Race_ethnicity == "White/Caucasian" , 'White', "Non-white")

  # k-means starts from random centres; the fixed seed makes the partition
  # reproducible from run to run.
  set.seed(42)
  df <- matrix(data=c(allBiochem$PLC,allBiochem$NS),ncol=2)
  allBiochem$cluster = kmeans(scale(df),3)$cluster

  # k-means numbers its clusters arbitrarily, so work out which id is the high,
  # low and intermediate performer group from the mean PLC of each cluster.
  meanPLCbyCluster = tapply(allBiochem$PLC, allBiochem$cluster, mean, na.rm = TRUE)
  clusterIds = as.integer(names(meanPLCbyCluster))
  HPgroup = clusterIds[which.max(meanPLCbyCluster)]
  LPgroup = clusterIds[which.min(meanPLCbyCluster)]
  # Whatever is neither the highest nor the lowest is the intermediate group.
  IPgroup = setdiff(clusterIds, c(HPgroup, LPgroup))
  allBiochem$clusterLetter = ifelse(allBiochem$cluster == HPgroup, "HP",
                                    ifelse(allBiochem$cluster == LPgroup,"LP",
                                           ifelse(allBiochem$cluster %in% IPgroup,"IP","Oops")))
  # Fixed level order so that courses are displayed in a consistent sequence.
  allBiochem$Course_collected = factor(allBiochem$Course_collected,levels = c(
    "Gen + Organic 1","O Chem 1","O Chem 2","Gen Chem 2","BiocF21","BiocF22","Biochem 2"))
  return(allBiochem)
}

# Print descriptive-statistics tables of PLC, one table per grouping variable
# (cluster letter, institution, year, course, sex, race).
#
# Args:
#   allBiochem: data frame with a numeric PLC column and the grouping columns
#               clusterLetter, Institution, actual_year, Course_collected,
#               Sex_birth and race_binary.
buildTables = function(allBiochem){
  # Columns of the describeBy() matrix that are shown in every table.
  shownCols = c(2, 4:12)
  groupings = list(
    list(by = "clusterLetter",    caption = "PLC by cluster group"),
    list(by = "Institution",      caption = "PLC by institution"),
    list(by = "actual_year",      caption = "PLC by Actual Year"),
    list(by = "Course_collected", caption = "PLC by course"),
    list(by = "Sex_birth",        caption = "PLC by Sex"),
    list(by = "race_binary",      caption = "PLC by Race")
  )
  lastPrinted = NULL
  for (grouping in groupings){
    stats = describeBy(allBiochem$PLC, allBiochem[[grouping$by]], mat=TRUE, digits = 2)
    lastPrinted = print(knitr::kable(stats[, shownCols], caption = grouping$caption))
  }
  invisible(lastPrinted)
}
# Emit (via cat) one HTML table per category value showing what percentage of
# responses fall in each performance cluster (HP / IP / LP), overall and broken
# down by sex and by white/non-white.
#
# Args:
#   allBiochem: data frame with the columns clusterLetter, Sex_birth and
#               race_binary, plus the column named by `mycategory`.
#   mycategory: name of the column whose distinct values define the tables.
#               Defaults to "Course_collected".
#
# Rows whose category is "Expert" (or NA) are skipped. Missing values in the
# demographic columns are simply not counted. A subgroup with N = 0 yields
# "NaN %" for its percentages.
calcStats = function(allBiochem,mycategory = "Course_collected"){
  categoryValues = allBiochem[[mycategory]]
  if (is.null(categoryValues)) {
    stop("calcStats: column '", mycategory, "' not found in allBiochem", call. = FALSE)
  }
  # Count TRUE entries; NA comparisons (missing data) are not counted.
  nWhere = function(mask) sum(mask, na.rm = TRUE)
  # Percentage rounded to two significant digits.
  pct = function(num, den) signif(num/den*100,digits=2)

  # Masks that do not depend on the category value are computed once.
  isHP = allBiochem$clusterLetter == "HP"
  isIP = allBiochem$clusterLetter == "IP"
  isLP = allBiochem$clusterLetter == "LP"
  isMale = allBiochem$Sex_birth == "Male"
  isFemale = allBiochem$Sex_birth == "Female"
  isWhite = allBiochem$race_binary == "White"
  isNonwhite = allBiochem$race_binary == "Non-white"

  # "course" is used as a generic name for the current category value.
  for (course in unique(categoryValues)){
    if (is.na(course) || course == "Expert") next
    inCat = categoryValues == course
    header = paste("<b>Results for category: ",course,"</b></br></br>")
    cat(header)

    umrTot = nWhere(inCat)
    umrHP = nWhere(inCat & isHP)
    umrIP = nWhere(inCat & isIP)
    umrLP = nWhere(inCat & isLP)

    umrMale = nWhere(inCat & isMale)
    umrHPmale = nWhere(inCat & isMale & isHP)
    umrIPmale = nWhere(inCat & isMale & isIP)
    umrLPmale = nWhere(inCat & isMale & isLP)

    umrFemale = nWhere(inCat & isFemale)
    umrHPfemale = nWhere(inCat & isFemale & isHP)
    umrIPfemale = nWhere(inCat & isFemale & isIP)
    umrLPfemale = nWhere(inCat & isFemale & isLP)

    umrWhite = nWhere(inCat & isWhite)
    umrHPWhite = nWhere(inCat & isWhite & isHP)
    umrIPWhite = nWhere(inCat & isWhite & isIP)
    umrLPWhite = nWhere(inCat & isWhite & isLP)

    umrNonwhite = nWhere(inCat & isNonwhite)
    umrHPNonwhite = nWhere(inCat & isNonwhite & isHP)
    umrIPNonwhite = nWhere(inCat & isNonwhite & isIP)
    umrLPNonwhite = nWhere(inCat & isNonwhite & isLP)

    output = paste("<table >
<thead>
<tr>
  <th colspan='2'></th>
  <th colspan='2'>High Performers</th>
  <th colspan='2'>Intermediate Performers</th>
  <th colspan='2'>Low Performers</th>
  
</tr>
</thead>
<tbody>
  <tr>
    <td rowspan='5'>",course," </td>
    <td>Total N=", umrTot,"</td>
    <td colspan='2'>", pct(umrHP,umrTot),"% </td>
    <td colspan='2'>", pct(umrIP,umrTot),"%</td>
    <td colspan='2'>", pct(umrLP,umrTot),"% </td>
  </tr>
  <tr>
    <td rowspan='2'>Sex: males N=",umrMale,"; females N=",umrFemale,"</td>
    <td>male</td>
    <td>female</td>
    <td>male</td>
    <td>female</td>
    <td>male</td>
    <td>female</td>
  </tr>
  <tr>
    <td>", pct(umrHPmale,umrMale),"%</td>
    <td>", pct(umrHPfemale,umrFemale),"%</td>
    <td>", pct(umrIPmale,umrMale),"%</td>
    <td>", pct(umrIPfemale,umrFemale),"%</td>
    <td>", pct(umrLPmale,umrMale),"%</td>
    <td>", pct(umrLPfemale,umrFemale),"%</td>
  </tr>
  <tr>
    <td rowspan='2'>Race: White N=",umrWhite,"; Non-white N=",umrNonwhite,"</td>
    <td>white</td>
    <td>non-white</td>
    <td>white</td>
    <td>non-white</td>
    <td>white</td>
    <td>non-white</td>
  </tr>
  <tr>
    <td>", pct(umrHPWhite,umrWhite),"%</td>
    <td>", pct(umrHPNonwhite,umrNonwhite),"%</td>
    <td>", pct(umrIPWhite,umrWhite),"%</td>
    <td>", pct(umrIPNonwhite,umrNonwhite),"%</td>
    <td>", pct(umrLPWhite,umrWhite),"%</td>
    <td>", pct(umrLPNonwhite,umrNonwhite),"%</td>
  </tr>
</tbody>
</table> ")
    cat(output)
  }

}


# Emit (via cat) one HTML table per category value showing what percentage of
# responses fall in each performance cluster (HP / IP / LP), overall and broken
# down by sex and by white/non-white. Same layout as calcStats, but grouped by
# student year by default.
#
# Args:
#   allBiochem: data frame with the columns clusterLetter, Sex_birth and
#               race_binary, plus the column named by `mycategory`.
#   mycategory: name of the column whose distinct values define the tables.
#               Defaults to "actual_year".
#
# Rows whose category is "Expert" (or NA) are skipped. Missing values in the
# demographic columns are simply not counted. A subgroup with N = 0 yields
# "NaN %" for its percentages.
calcStats2 = function(allBiochem,mycategory = "actual_year"){
  categoryValues = allBiochem[[mycategory]]
  if (is.null(categoryValues)) {
    stop("calcStats2: column '", mycategory, "' not found in allBiochem", call. = FALSE)
  }
  # Count TRUE entries; NA comparisons (missing data) are not counted.
  nWhere = function(mask) sum(mask, na.rm = TRUE)
  # Percentage rounded to two significant digits.
  pct = function(num, den) signif(num/den*100,digits=2)

  # Masks that do not depend on the category value are computed once.
  isHP = allBiochem$clusterLetter == "HP"
  isIP = allBiochem$clusterLetter == "IP"
  isLP = allBiochem$clusterLetter == "LP"
  isMale = allBiochem$Sex_birth == "Male"
  isFemale = allBiochem$Sex_birth == "Female"
  isWhite = allBiochem$race_binary == "White"
  isNonwhite = allBiochem$race_binary == "Non-white"

  # "course" is used as a generic name for the current category value.
  for (course in unique(categoryValues)){
    if (is.na(course) || course == "Expert") next
    inCat = categoryValues == course
    header = paste("<b>Results for category: ",course,"</b></br></br>")
    cat(header)

    umrTot = nWhere(inCat)
    umrHP = nWhere(inCat & isHP)
    umrIP = nWhere(inCat & isIP)
    umrLP = nWhere(inCat & isLP)

    umrMale = nWhere(inCat & isMale)
    umrHPmale = nWhere(inCat & isMale & isHP)
    umrIPmale = nWhere(inCat & isMale & isIP)
    umrLPmale = nWhere(inCat & isMale & isLP)

    umrFemale = nWhere(inCat & isFemale)
    umrHPfemale = nWhere(inCat & isFemale & isHP)
    umrIPfemale = nWhere(inCat & isFemale & isIP)
    umrLPfemale = nWhere(inCat & isFemale & isLP)

    umrWhite = nWhere(inCat & isWhite)
    umrHPWhite = nWhere(inCat & isWhite & isHP)
    umrIPWhite = nWhere(inCat & isWhite & isIP)
    umrLPWhite = nWhere(inCat & isWhite & isLP)

    umrNonwhite = nWhere(inCat & isNonwhite)
    umrHPNonwhite = nWhere(inCat & isNonwhite & isHP)
    umrIPNonwhite = nWhere(inCat & isNonwhite & isIP)
    umrLPNonwhite = nWhere(inCat & isNonwhite & isLP)

    output = paste("<table >
<thead>
<tr>
  <th colspan='2'></th>
  <th colspan='2'>High Performers</th>
  <th colspan='2'>Intermediate Performers</th>
  <th colspan='2'>Low Performers</th>
  
</tr>
</thead>
<tbody>
  <tr>
    <td rowspan='5'>",course," </td>
    <td>Total N=", umrTot,"</td>
    <td colspan='2'>", pct(umrHP,umrTot),"% </td>
    <td colspan='2'>", pct(umrIP,umrTot),"%</td>
    <td colspan='2'>", pct(umrLP,umrTot),"% </td>
  </tr>
  <tr>
    <td rowspan='2'>Sex: males N=",umrMale,"; females N=",umrFemale,"</td>
    <td>male</td>
    <td>female</td>
    <td>male</td>
    <td>female</td>
    <td>male</td>
    <td>female</td>
  </tr>
  <tr>
    <td>", pct(umrHPmale,umrMale),"%</td>
    <td>", pct(umrHPfemale,umrFemale),"%</td>
    <td>", pct(umrIPmale,umrMale),"%</td>
    <td>", pct(umrIPfemale,umrFemale),"%</td>
    <td>", pct(umrLPmale,umrMale),"%</td>
    <td>", pct(umrLPfemale,umrFemale),"%</td>
  </tr>
  <tr>
    <td rowspan='2'>Race: White N=",umrWhite,"; Non-white N=",umrNonwhite,"</td>
    <td>white</td>
    <td>non-white</td>
    <td>white</td>
    <td>non-white</td>
    <td>white</td>
    <td>non-white</td>
  </tr>
  <tr>
    <td>", pct(umrHPWhite,umrWhite),"%</td>
    <td>", pct(umrHPNonwhite,umrNonwhite),"%</td>
    <td>", pct(umrIPWhite,umrWhite),"%</td>
    <td>", pct(umrIPNonwhite,umrNonwhite),"%</td>
    <td>", pct(umrLPWhite,umrWhite),"%</td>
    <td>", pct(umrLPNonwhite,umrNonwhite),"%</td>
  </tr>
</tbody>
</table> ")
    cat(output)
  }

}



library(ggplot2)
library(ggpubr)
library(psych)

# Box plot (with jittered points) of one numeric column grouped by one
# categorical column, annotated with a global ANOVA p-value and per-group
# t-tests against the pooled data.
#
# Args:
#   df:      data frame holding the data.
#   myx:     name of the grouping column (x axis, also used for colour).
#   myy:     name of the numeric column (y axis); rows where it is NA are dropped.
#   mytitle: plot title.
#   myylab:  y-axis label.
#
# Returns:
#   The ggplot object (the caller is responsible for printing it).
plotGGbox = function(df,myx,myy,mytitle,myylab){
  df = df[complete.cases(df[[myy]]),]
  yvals = df[[myy]]
  maxy = max(yvals)
  # The scores can be negative; extend the axis below zero when needed so that
  # negative observations are not clipped out of the panel.
  lowy = min(0, min(yvals)*1.1)
  ggboxplot(df, x = myx, y = myy,
            title = mytitle,
            color = myx, add = "jitter", legend="none",ylab = myylab) + rotate_x_text(angle = 45) +
    # dashed line at the overall mean
    geom_hline( yintercept = mean(yvals), linetype = 2) +
    # global ANOVA p-value, placed above the data
    stat_compare_means(method = "anova", label.y = maxy*1.10) +
    coord_cartesian(ylim = c(lowy, maxy*1.2)) +
    # each group compared against all observations pooled together
    stat_compare_means(label = "p.format", size=2.5, method = "t.test", ref.group = ".all.",label.y = maxy*1.05)
}
# Run a one-way ANOVA of df[[myy]] by df[[myx]], follow it with Tukey's HSD and
# print the adjusted p-value of every pairwise comparison as a kable table.
#
# Args:
#   df:      data frame holding the data.
#   myx:     name of the grouping column.
#   myy:     name of the numeric response column.
#   mytitle: text appended to the table caption.
#   myylab:  unused; kept so the signature matches plotGGbox/plotAndTable.
getAnova = function(df,myx,myy,mytitle,myylab){
  tukey = TukeyHSD( aov(df[[myy]] ~ df[[myx]]))
  # The model has a single term, so the first (only) element holds the
  # comparison matrix. drop = FALSE keeps the row label (e.g. "White-Non-white")
  # when there are just two groups and therefore a single comparison.
  pvalues = as.data.frame(tukey[[1]][, "p adj", drop = FALSE])
  colnames(pvalues) = c("Testing statistical significance: p-values")
  print(knitr::kable(pvalues, caption = paste("Anova: ",mytitle)))
}
# Print, for one numeric indicator grouped by one categorical variable:
#   1. the box plot produced by plotGGbox,
#   2. a table of descriptive statistics per group,
#   3. the pairwise p-value table produced by getAnova.
#
# Args:
#   df:      data frame holding the data (students and, optionally, experts).
#   myx:     name of the grouping column.
#   myy:     name of the numeric indicator column.
#   mytitle: plot title / ANOVA caption text.
#   myylab:  y-axis label, also used in the statistics caption.
plotAndTable = function(df,myx,myy,mytitle,myylab){
  if (myx=="Sex_birth" || myx=="race_binary"){
    # Experts carry no demographic information, so leave them out.
    df = df[!grepl("(?i)Expert", df$Course_collected),]
    # NOTE(review): this also removes sex non-responders from the race_binary
    # analysis -- confirm that is intended.
    df = df[!grepl("(?i)Prefer not to answer",df$Sex_birth),]
  }
  print(plotGGbox(df,myx,myy,mytitle,myylab))
  # Drop unused levels whenever the grouping variable is a factor, otherwise
  # describeBy reports NaN rows for the empty categories.
  groups = df[[myx]]
  if (is.factor(groups)){
    groups = droplevels(groups)
  }
  stats = describeBy(df[[myy]],groups,mat=TRUE,digits = 2)
  print(knitr::kable(stats[,c(2,4,5,6,7,10,11,12)],caption=paste("Statistics of ",myylab," based on the category",myx)))
  getAnova(df,myx,myy,mytitle,myylab)
}
# Append the expert responses to a student data frame so that both can be
# plotted and compared together.
#
# Args:
#   alldf:   student data frame (as returned by analyzeUMRCourses); must contain
#            the columns PLC, NS and Coherency.
#   experts: expert data frame with the columns PLC, NS and Coherency.
#
# Returns:
#   alldf with one extra row per expert. In those rows PLC, NS and Coherency hold
#   the expert scores and every other column holds the label "Expert".
addExperts = function(alldf, experts){
  # Build an all-"Expert" frame with the same columns as alldf ...
  ex_new = as.data.frame( matrix( ncol=ncol(alldf),nrow = nrow(experts)) )
  colnames(ex_new) =  colnames(alldf)
  ex_new[,seq_len(ncol(ex_new))] = "Expert"
  # ... then overwrite the three indicator columns with the real expert scores.
  ex_new$PLC = experts$PLC
  ex_new$NS = experts$NS
  ex_new$Coherency = experts$Coherency
  alldf=rbind(alldf,ex_new)
  return(alldf)
}

library(dplyr)
library(corrplot)
# Chi-square test of independence between the two categorical columns of `a`:
# writes the p-value as HTML and draws the residuals as a corrplot.
#
# Args:
#   a: two-column data frame of categorical variables.
plotChi = function(a){
  # droplevels() is needed first, otherwise unused factor levels (e.g. "Expert")
  # show up in the contingency table as all-zero ghost categories.
  contingency = table(droplevels(a))
  chi = chisq.test(contingency)
  pvalueLine = paste("<p><b>The Chi-square analysis gives a p=",round(chi$p.value,5),"</b></p>")
  cat(pvalueLine)
  cat("<p><b>Residuals analysis:</b></p>")
  cat("A negative residual implies that the measured value is lower than expected and a positive value higher than expected</br>")
  # Alternative view (not used): percentage contribution of each cell,
  # 100 * residuals^2 / statistic, plotted with corrplot(..., is.cor = FALSE).
  corrplot(chi$residuals, is.cor = FALSE)
}
# Stacked bar chart of two categorical variables followed by a chi-square
# analysis of their association.
#
# Args:
#   df:       data frame holding the data; expert rows are recognised by the
#             text "Expert" in the first column.
#   myx:      the course or demographic variable (independent variable).
#   myy:      typically "clusterLetter" (dependent variable).
#   myxlabel: x-axis label.
#   myylabel: y-axis label.
#   mytitle:  plot title.
plotBarAndCorr = function(df,myx,myy,myxlabel,myylabel,mytitle){
  # Remove experts, they are not useful for the chi-square analysis.
  a = df[!grepl("Expert",df[,1]),]
  if (myx=="Sex_birth"){
    # Case-insensitive match, consistent with the filter in plotAndTable.
    a = a[!grepl("(?i)Prefer not to answer",a$Sex_birth),]
  }
  # Select the two categorical variables.
  a = a[,c(myy,myx)]
  print(plotBarCategories(a,myx,myy,myxlabel,myylabel,mytitle))
  plotChi(a)
}
# Stacked bar chart: one bar per value of `myx`, split by `myy`, with every
# segment labelled by its percentage share within the bar.
#
# Args:
#   a:        data frame containing the columns named by myx and myy.
#   myx:      name of the column shown on the x axis.
#   myy:      name of the column used for the fill colour.
#   myxlabel: x-axis label.
#   myylabel: y-axis label.
#   mytitle:  plot title.
#
# Returns:
#   The ggplot object (the caller is responsible for printing it).
plotBarCategories = function(a,myx,myy,myxlabel,myylabel,mytitle){
  # Column names arrive as strings, so turn them into symbols for tidy evaluation.
  xSym = sym(myx)
  fillSym = sym(myy)

  # Count every (fill, x) combination, then express each count as a share of
  # its x group.
  tallied = count(a, !!fillSym, !!xSym)
  tallied = group_by(tallied, !!xSym)
  tallied = mutate(tallied, lab = paste0(round(prop.table(n) * 100, 2), '%'))

  ggplot(tallied, aes(!!xSym,n, fill=!!fillSym)) +
    geom_col() +
    geom_text(aes(label=lab),position='stack',vjust=1.5) +
    labs(x=myxlabel,y=myylabel,title=mytitle)
}

1 Introduction

What was learned from “clustering_indeces_v2” analysis about indicators

  • PLC seems to clearly distinguish experts from students (although with significant overlap)
  • NS does not distinguish as well as PLC, but the highest score and the median are still significantly different between experts and students
  • Coherency does not seem to distinguish well and we will discard it in the subsequent analysis

Meaning of PLC and NS:

  • PLC: path length correlation. It describes how connected two nodes are; the closer the value is to one, the more connected. It is measured in comparison to the experts.
  • NS: neighborhood similarity. It is also measured in comparison to the experts.

Here we analyze only the subgroup of students who took the same survey during OChem1, OChem2, and Biochem, so the sample is much smaller. Each survey is treated separately: the fact that “student1” took survey1 in those three semesters does not mean that student1 also took survey2 in all of them. This is why the sample size is not exactly the same for each survey. For some surveys the analysis by sex is omitted because there are no males. Because the sample size is so small, be careful about drawing conclusions on some demographics for some courses/surveys.

The analysis by year is kept even though it should be identical to the analysis by course because OChem1 is taken by first years, OChem2 by second, and Biochem by third years.

2 UMR courses: ES Chemical Equation

2.1 PLC only: Anova

We test whether the PLC score differs significantly among the groups of each category: “Course collected”, “Student year”, “White/Non-white”, and “Sex at birth”

#
allBiochem = analyzeUMRCourses(umrs1)
allBiochem = addExperts(allBiochem,exs1)
#adding experts
#buildTables(allBiochem)
plotAndTable(allBiochem,"Course_collected","PLC","PLC: Course","PLC")

Statistics of PLC based on the category Course_collected
group1 n mean sd median min max range
X11 O Chem 1 24 0.27 0.17 0.30 -0.16 0.55 0.71
X12 O Chem 2 24 0.26 0.18 0.26 -0.40 0.52 0.92
X13 BiocF22 24 0.44 0.13 0.43 0.23 0.72 0.49
X14 Expert 6 0.67 0.12 0.69 0.49 0.82 0.33
Anova: PLC: Course
Testing statistical significance: p-values
O Chem 2-O Chem 1 0.9954125
BiocF22-O Chem 1 0.0021329
Expert-O Chem 1 0.0000029
BiocF22-O Chem 2 0.0009780
Expert-O Chem 2 0.0000016
Expert-BiocF22 0.0120205
#df=allBiochem
#myx="Course_collected"
#myy="PLC"
#mytitle="PLC:Whatever"
#myylab="PLC"
#plotAndTable = function(df,myx,myy,mytitle,myylab){
#  if (myx=="Sex_birth" | myx=="race_binary"){
#    df = df[!grepl("(?i)Expert", df$Course_collected),]
#    df = df[!grepl("(?)Prefer not to answer",df$Sex_birth),]
#  }
#  print(plotGGbox(df,myx,myy,mytitle,myylab))
#  table = describeBy(df[[myy]],df[[myx]],mat=TRUE,digits = 2)
#  table = describeBy(df[[myy]],df[[myx]],digits = 2)
#  print(knitr::kable(table2[,c(2,4,5,6,7,10,11,12)],caption=paste("Statistics of ",myylab," based on the category",myx)))
#  getAnova(df,myx,myy,mytitle,myylab)
#}
plotAndTable(allBiochem,"actual_year","PLC","PLC: Year","PLC")

Statistics of PLC based on the category actual_year
group1 n mean sd median min max range
X11 Expert 6 0.67 0.12 0.69 0.49 0.82 0.33
X12 first_year 24 0.27 0.17 0.30 -0.16 0.55 0.71
X13 second_year 24 0.26 0.18 0.26 -0.40 0.52 0.92
X14 third_year 24 0.44 0.13 0.43 0.23 0.72 0.49
Anova: PLC: Year
Testing statistical significance: p-values
first_year-Expert 0.0000029
second_year-Expert 0.0000016
third_year-Expert 0.0120205
second_year-first_year 0.9954125
third_year-first_year 0.0021329
third_year-second_year 0.0009780
plotAndTable(allBiochem,"race_binary","PLC","PLC: White/Non-white","PLC")

Statistics of PLC based on the category race_binary
group1 n mean sd median min max range
X11 Non-white 25 0.30 0.22 0.30 -0.40 0.72 1.12
X12 White 46 0.34 0.15 0.36 -0.16 0.62 0.78
Anova: PLC: White/Non-white
Testing statistical significance: p-values
0.387264
plotAndTable(allBiochem,"Sex_birth","PLC","PLC: Sex","PLC")

Statistics of PLC based on the category Sex_birth
group1 n mean sd median min max range
X11 Female 64 0.33 0.17 0.36 -0.40 0.62 1.02
X12 Male 7 0.26 0.26 0.27 -0.16 0.72 0.88
Anova: PLC: Sex
Testing statistical significance: p-values
0.3200775

2.2 NS only: Anova

plotAndTable(allBiochem,"Course_collected","NS","NS: Course","NS")

Statistics of NS based on the category Course_collected
group1 n mean sd median min max range
X11 O Chem 1 24 0.22 0.10 0.20 0.04 0.44 0.41
X12 O Chem 2 24 0.23 0.09 0.22 0.08 0.41 0.33
X13 BiocF22 24 0.24 0.06 0.24 0.15 0.35 0.20
X14 Expert 6 0.37 0.11 0.34 0.28 0.57 0.29
Anova: NS: Course
Testing statistical significance: p-values
O Chem 2-O Chem 1 0.9562525
BiocF22-O Chem 1 0.9084114
Expert-O Chem 1 0.0009971
BiocF22-O Chem 2 0.9985778
Expert-O Chem 2 0.0028907
Expert-BiocF22 0.0039688
plotAndTable(allBiochem,"actual_year","NS","NS: Year","NS")

Statistics of NS based on the category actual_year
group1 n mean sd median min max range
X11 Expert 6 0.37 0.11 0.34 0.28 0.57 0.29
X12 first_year 24 0.22 0.10 0.20 0.04 0.44 0.41
X13 second_year 24 0.23 0.09 0.22 0.08 0.41 0.33
X14 third_year 24 0.24 0.06 0.24 0.15 0.35 0.20
Anova: NS: Year
Testing statistical significance: p-values
first_year-Expert 0.0009971
second_year-Expert 0.0028907
third_year-Expert 0.0039688
second_year-first_year 0.9562525
third_year-first_year 0.9084114
third_year-second_year 0.9985778
plotAndTable(allBiochem,"race_binary","NS","NS: White/Non-white","NS")

Statistics of NS based on the category race_binary
group1 n mean sd median min max range
X11 Non-white 25 0.21 0.08 0.19 0.08 0.39 0.31
X12 White 46 0.24 0.09 0.22 0.04 0.44 0.41
Anova: NS: White/Non-white
Testing statistical significance: p-values
0.2074017
plotAndTable(allBiochem,"Sex_birth","NS","NS: Sex","NS")

Statistics of NS based on the category Sex_birth
group1 n mean sd median min max range
X11 Female 64 0.24 0.08 0.22 0.08 0.44 0.37
X12 Male 7 0.18 0.07 0.18 0.04 0.26 0.23
Anova: NS: Sex
Testing statistical significance: p-values
0.0713704

2.3 PLC/NS clustering

The problem with clustering is that it is an iterative method, and different “initial seeds” will yield different results. It is only reproducible when a fixed seed is set before running k-means; here we use “set.seed(42)”

plotAndTable(allBiochem,"clusterLetter","PLC","PLC: Cluster letter","PLC")

Statistics of PLC based on the category clusterLetter
group1 n mean sd median min max range
X11 Expert 6 0.67 0.12 0.69 0.49 0.82 0.33
X12 HP 23 0.45 0.11 0.44 0.19 0.62 0.43
X13 IP 28 0.38 0.09 0.36 0.27 0.72 0.45
X14 LP 21 0.12 0.15 0.19 -0.40 0.24 0.64
Anova: PLC: Cluster letter
Testing statistical significance: p-values
HP-Expert 0.0006232
IP-Expert 0.0000041
LP-Expert 0.0000000
IP-HP 0.1913762
LP-HP 0.0000000
LP-IP 0.0000000

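Are cluster groups unevenly distributed among these categories? A chi-square analysis gives a p-value: the probability of observing a distribution at least this uneven if the three cluster groups (HP, IP, LP) actually contained the same proportions of each category (course, year, sex, race…). A small p-value therefore suggests that the cluster groups are unevenly distributed.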

2.3.1 Analysis by course

plotBarAndCorr(allBiochem,"Course_collected","clusterLetter","Course","N of students","High, Intermediate, Low Performance cluster")

The Chi-square analysis gives a p= 0.0879

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

markerIntegers = as.integer(as.factor(allBiochem$Course_collected))
plot(allBiochem$PLC,allBiochem$NS,pch=allBiochem$clusterLetter,main = "ES_Chemical_Reaction - High(H), Intermediate(I), Low(L) performers",ylab="NS",xlab="PLC",col=markerIntegers)
legend("topleft", legend=unique(allBiochem$Course_collected), col=unique(markerIntegers), lty=1:1, cex=0.8)

calcStats(allBiochem,"Course_collected")
Results for category: O Chem 2

High Performers Intermediate Performers Low Performers
O Chem 2 Total N= 24 25 % 33 % 42 %
Sex: males N= 2 ; females N= 21 male female male female male female
0 % 29 % 50 % 33 % 50 % 38 %
Race: White N= 16 ; Non-white N= 8 white non-white white non-white white non-white
25 % 25 % 38 % 25 % 38 % 50 %
Results for category: O Chem 1

High Performers Intermediate Performers Low Performers
O Chem 1 Total N= 24 25 % 38 % 38 %
Sex: males N= 2 ; females N= 22 male female male female male female
0 % 27 % 0 % 41 % 100 % 32 %
Race: White N= 16 ; Non-white N= 8 white non-white white non-white white non-white
31 % 12 % 38 % 38 % 31 % 50 %
Results for category: BiocF22

High Performers Intermediate Performers Low Performers
BiocF22 Total N= 24 46 % 46 % 8.3 %
Sex: males N= 3 ; females N= 21 male female male female male female
0 % 52 % 100 % 38 % 0 % 9.5 %
Race: White N= 14 ; Non-white N= 10 white non-white white non-white white non-white
57 % 30 % 43 % 50 % 0 % 20 %

2.3.2 Analysis by year

plotBarAndCorr(allBiochem,"actual_year","clusterLetter","Year","N of students","High, Intermediate, Low Performance cluster")

The Chi-square analysis gives a p= 0.0879

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

markerIntegers = as.integer(as.factor(allBiochem$actual_year))
plot(allBiochem$PLC,allBiochem$NS,pch=allBiochem$clusterLetter,main = "ES_Chemical_Reaction - High(H), Intermediate(I), Low(L) performers",ylab="NS",xlab="PLC",col=markerIntegers)
legend("topleft", legend=unique(allBiochem$actual_year), col=unique(markerIntegers), lty=1:1, cex=0.8)

calcStats2(allBiochem,"actual_year")
Results for category: second_year

High Performers Intermediate Performers Low Performers
second_year Total N= 24 25 % 33 % 42 %
Sex: males N= 2 ; females N= 21 male female male female male female
0 % 29 % 50 % 33 % 50 % 38 %
Race: White N= 16 ; Non-white N= 8 white non-white white non-white white non-white
25 % 25 % 38 % 25 % 38 % 50 %
Results for category: first_year

High Performers Intermediate Performers Low Performers
first_year Total N= 24 25 % 38 % 38 %
Sex: males N= 2 ; females N= 22 male female male female male female
0 % 27 % 0 % 41 % 100 % 32 %
Race: White N= 16 ; Non-white N= 8 white non-white white non-white white non-white
31 % 12 % 38 % 38 % 31 % 50 %
Results for category: third_year

High Performers Intermediate Performers Low Performers
third_year Total N= 24 46 % 46 % 8.3 %
Sex: males N= 3 ; females N= 21 male female male female male female
0 % 52 % 100 % 38 % 0 % 9.5 %
Race: White N= 14 ; Non-white N= 10 white non-white white non-white white non-white
57 % 30 % 43 % 50 % 0 % 20 %
cat("<b>Chi-square analysis of Performance by Sex and Race considering different years</b></br>")

Chi-square analysis of Performance by Sex and Race considering different years

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="first_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 1st year")

The Chi-square analysis gives a p= 0.16232

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="second_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 2nd year")

The Chi-square analysis gives a p= 0.67591

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="third_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 3rd year")

The Chi-square analysis gives a p= 0.13187

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="first_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 1st year")

The Chi-square analysis gives a p= 0.53526

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="second_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 2nd year")

The Chi-square analysis gives a p= 0.79852

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="third_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 3rd year")

The Chi-square analysis gives a p= 0.14937

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

3 UMR courses: ES Glucosidase

3.1 PLC only: Anova

We test whether the PLC score differs significantly among the groups of each category: “Course collected”, “Student year”, “White/Non-white”, and “Sex at birth”

# ES Glucosidase section: build the analysis table from the ES Glucosidase
# survey rows (umrs2), then add the expert rows (exs2) so that an "Expert"
# group is available in the comparisons that follow.
allBiochem <- analyzeUMRCourses(umrs2)
allBiochem <- addExperts(allBiochem, exs2)
# buildTables(allBiochem)  # currently disabled
# PLC summary statistics and pairwise p-values, grouped by course.
plotAndTable(allBiochem, "Course_collected", "PLC", "PLC: Course", "PLC")

Statistics of PLC based on the category Course_collected
group1 n mean sd median min max range
X11 O Chem 1 24 0.31 0.14 0.30 0.06 0.52 0.46
X12 O Chem 2 24 0.29 0.17 0.33 -0.25 0.52 0.78
X13 BiocF22 24 0.45 0.11 0.46 0.24 0.65 0.41
X14 Expert 8 0.72 0.09 0.70 0.59 0.82 0.23
Anova: PLC: Course
Testing statistical significance: p-values
O Chem 2-O Chem 1 0.9571573
BiocF22-O Chem 1 0.0043942
Expert-O Chem 1 0.0000000
BiocF22-O Chem 2 0.0008321
Expert-O Chem 2 0.0000000
Expert-BiocF22 0.0000461
plotAndTable(allBiochem,"actual_year","PLC","PLC: Year","PLC")

Statistics of PLC based on the category actual_year
group1 n mean sd median min max range
X11 Expert 8 0.72 0.09 0.70 0.59 0.82 0.23
X12 first_year 24 0.31 0.14 0.30 0.06 0.52 0.46
X13 second_year 24 0.29 0.17 0.33 -0.25 0.52 0.78
X14 third_year 24 0.45 0.11 0.46 0.24 0.65 0.41
Anova: PLC: Year
Testing statistical significance: p-values
first_year-Expert 0.0000000
second_year-Expert 0.0000000
third_year-Expert 0.0000461
second_year-first_year 0.9571573
third_year-first_year 0.0043942
third_year-second_year 0.0008321
plotAndTable(allBiochem,"race_binary","PLC","PLC: White/Non-white","PLC")

Statistics of PLC based on the category race_binary
group1 n mean sd median min max range
X11 Non-white 25 0.32 0.20 0.32 -0.25 0.65 0.90
X12 White 46 0.36 0.14 0.38 0.04 0.57 0.53
Anova: PLC: White/Non-white
Testing statistical significance: p-values
0.3480736
plotAndTable(allBiochem,"Sex_birth","PLC","PLC: Sex","PLC")

Statistics of PLC based on the category Sex_birth
group1 n mean sd median min max range
X11 Female 64 0.35 0.16 0.38 -0.25 0.60 0.85
X12 Male 7 0.30 0.18 0.30 0.09 0.65 0.56
Anova: PLC: Sex
Testing statistical significance: p-values
0.400078

3.2 NS only: Anova

plotAndTable(allBiochem,"Course_collected","NS","NS: Course","NS")

Statistics of NS based on the category Course_collected
group1 n mean sd median min max range
X11 O Chem 1 24 0.25 0.08 0.26 0.04 0.39 0.35
X12 O Chem 2 24 0.24 0.08 0.24 0.11 0.39 0.28
X13 BiocF22 24 0.26 0.07 0.25 0.10 0.41 0.31
X14 Expert 8 0.40 0.06 0.42 0.29 0.47 0.17
Anova: NS: Course
Testing statistical significance: p-values
O Chem 2-O Chem 1 0.9667165
BiocF22-O Chem 1 0.9745936
Expert-O Chem 1 0.0000346
BiocF22-O Chem 2 0.8123887
Expert-O Chem 2 0.0000097
Expert-BiocF22 0.0001065
plotAndTable(allBiochem,"actual_year","NS","NS: Year","NS")

Statistics of NS based on the category actual_year
group1 n mean sd median min max range
X11 Expert 8 0.40 0.06 0.42 0.29 0.47 0.17
X12 first_year 24 0.25 0.08 0.26 0.04 0.39 0.35
X13 second_year 24 0.24 0.08 0.24 0.11 0.39 0.28
X14 third_year 24 0.26 0.07 0.25 0.10 0.41 0.31
Anova: NS: Year
Testing statistical significance: p-values
first_year-Expert 0.0000346
second_year-Expert 0.0000097
third_year-Expert 0.0001065
second_year-first_year 0.9667165
third_year-first_year 0.9745936
third_year-second_year 0.8123887
plotAndTable(allBiochem,"race_binary","NS","NS: White/Non-white","NS")

Statistics of NS based on the category race_binary
group1 n mean sd median min max range
X11 Non-white 25 0.24 0.08 0.25 0.10 0.39 0.29
X12 White 46 0.25 0.08 0.25 0.04 0.41 0.37
Anova: NS: White/Non-white
Testing statistical significance: p-values
0.6249095
plotAndTable(allBiochem,"Sex_birth","NS","NS: Sex","NS")

Statistics of NS based on the category Sex_birth
group1 n mean sd median min max range
X11 Female 64 0.25 0.08 0.25 0.10 0.41 0.31
X12 Male 7 0.21 0.08 0.24 0.04 0.28 0.23
Anova: NS: Sex
Testing statistical significance: p-values
0.1949647

3.3 PLC/NS clustering

A caveat of clustering is that k-means is an iterative method, so different “initial seeds” can yield different results. The results are reproducible only when the seed is fixed with “set.seed(42)” before running k-means.

plotAndTable(allBiochem,"clusterLetter","PLC","PLC: Cluster letter","PLC")

Statistics of PLC based on the category clusterLetter
group1 n mean sd median min max range
X11 Expert 8 0.72 0.09 0.70 0.59 0.82 0.23
X12 HP 28 0.47 0.09 0.48 0.26 0.65 0.39
X13 IP 34 0.32 0.10 0.34 0.06 0.50 0.43
X14 LP 10 0.11 0.15 0.14 -0.25 0.27 0.52
Anova: PLC: Cluster letter
Testing statistical significance: p-values
HP-Expert 5.0e-07
IP-Expert 0.0e+00
LP-Expert 0.0e+00
IP-HP 2.4e-06
LP-HP 0.0e+00
LP-IP 4.8e-06

3.3.1 Analysis by course

plotBarAndCorr(allBiochem,"Course_collected","clusterLetter","Course","N of students","High, Intermediate, Low Performance cluster")

The Chi-square analysis gives a p= 0.28225

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# One integer colour code per course (factor level order).
markerIntegers <- as.integer(as.factor(allBiochem$Course_collected))
# NS against PLC; plotting symbols come from clusterLetter, colours from course.
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  main = "ES Glucosidase - High(H), Intermediate(I), Low(L) performers",
  xlab = "PLC",
  ylab = "NS"
)
# unique() keeps first-appearance order for both vectors, so each label stays
# paired with the colour used for that course in the plot.
legend(
  "topleft",
  legend = unique(allBiochem$Course_collected),
  col = unique(markerIntegers),
  lty = 1L,
  cex = 0.8
)

# Per-course breakdown of the performance clusters by sex and race.
calcStats(allBiochem, "Course_collected")
Results for category: O Chem 2

High Performers Intermediate Performers Low Performers
O Chem 2 Total N= 24 29 % 50 % 21 %
Sex: males N= 2 ; females N= 21 male female male female male female
0 % 33 % 100 % 43 % 0 % 24 %
Race: White N= 16 ; Non-white N= 8 white non-white white non-white white non-white
31 % 25 % 56 % 38 % 12 % 38 %
Results for category: O Chem 1

High Performers Intermediate Performers Low Performers
O Chem 1 Total N= 24 33 % 50 % 17 %
Sex: males N= 2 ; females N= 22 male female male female male female
0 % 36 % 50 % 50 % 50 % 14 %
Race: White N= 16 ; Non-white N= 8 white non-white white non-white white non-white
38 % 25 % 38 % 75 % 25 % 0 %
Results for category: BiocF22

High Performers Intermediate Performers Low Performers
BiocF22 Total N= 24 54 % 42 % 4.2 %
Sex: males N= 3 ; females N= 21 male female male female male female
33 % 57 % 67 % 38 % 0 % 4.8 %
Race: White N= 14 ; Non-white N= 10 white non-white white non-white white non-white
64 % 40 % 36 % 50 % 0 % 10 %

3.3.2 Analysis by year

plotBarAndCorr(allBiochem,"actual_year","clusterLetter","Year","N of students","High, Intermediate, Low Performance cluster")

The Chi-square analysis gives a p= 0.28225

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# One integer colour code per student year (factor level order).
markerIntegers <- as.integer(as.factor(allBiochem$actual_year))
# NS against PLC; plotting symbols come from clusterLetter, colours from year.
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  main = "ES Glucosidase - High(H), Intermediate(I), Low(L) performers",
  xlab = "PLC",
  ylab = "NS"
)
# unique() keeps first-appearance order for both vectors, so each label stays
# paired with the colour used for that year in the plot.
legend(
  "topleft",
  legend = unique(allBiochem$actual_year),
  col = unique(markerIntegers),
  lty = 1L,
  cex = 0.8
)

# Per-year breakdown of the performance clusters by sex and race.
calcStats2(allBiochem, "actual_year")
Results for category: second_year

High Performers Intermediate Performers Low Performers
second_year Total N= 24 29 % 50 % 21 %
Sex: males N= 2 ; females N= 21 male female male female male female
0 % 33 % 100 % 43 % 0 % 24 %
Race: White N= 16 ; Non-white N= 8 white non-white white non-white white non-white
31 % 25 % 56 % 38 % 12 % 38 %
Results for category: first_year

High Performers Intermediate Performers Low Performers
first_year Total N= 24 33 % 50 % 17 %
Sex: males N= 2 ; females N= 22 male female male female male female
0 % 36 % 50 % 50 % 50 % 14 %
Race: White N= 16 ; Non-white N= 8 white non-white white non-white white non-white
38 % 25 % 38 % 75 % 25 % 0 %
Results for category: third_year

High Performers Intermediate Performers Low Performers
third_year Total N= 24 54 % 42 % 4.2 %
Sex: males N= 3 ; females N= 21 male female male female male female
33 % 57 % 67 % 38 % 0 % 4.8 %
Race: White N= 14 ; Non-white N= 10 white non-white white non-white white non-white
64 % 40 % 36 % 50 % 0 % 10 %
# Emit the sub-heading as inline HTML. `<br>` is the valid line-break tag;
# the previous `</br>` is a stray end tag and not valid HTML.
cat("<b>Chi-square analysis of Performance by Sex and Race considering different years</b><br>")

Chi-square analysis of Performance by Sex and Race considering different years

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="first_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 1st year")

The Chi-square analysis gives a p= 0.33591

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="second_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 2nd year")

The Chi-square analysis gives a p= 0.30276

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="third_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 3rd year")

The Chi-square analysis gives a p= 0.62755

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="first_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 1st year")

The Chi-square analysis gives a p= 0.15335

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="second_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 2nd year")

The Chi-square analysis gives a p= 0.35944

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="third_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 3rd year")

The Chi-square analysis gives a p= 0.31335

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

4 UMR courses: Nucleic Acids

4.1 PLC only: Anova

We test whether the PLC score differs significantly across the categories “Course collected”, “Student year”, and “White/Non-white”. The “Sex at birth” comparison is omitted for this survey (see the note below).

# Nucleic Acids section: build the analysis table from the Nucleic Acids
# survey rows (umrs3), then add the expert rows (exs3) so that an "Expert"
# group is available in the comparisons that follow.
allBiochem <- analyzeUMRCourses(umrs3)
allBiochem <- addExperts(allBiochem, exs3)
# buildTables(allBiochem)  # currently disabled
# PLC summary statistics and pairwise p-values, grouped by course.
plotAndTable(allBiochem, "Course_collected", "PLC", "PLC: Course", "PLC")

Statistics of PLC based on the category Course_collected
group1 n mean sd median min max range
X11 O Chem 1 21 0.12 0.14 0.12 -0.21 0.29 0.50
X12 O Chem 2 21 0.12 0.10 0.13 -0.09 0.28 0.36
X13 BiocF22 21 0.18 0.12 0.17 -0.07 0.36 0.43
X14 Expert 7 0.71 0.08 0.69 0.60 0.82 0.22
Anova: PLC: Course
Testing statistical significance: p-values
O Chem 2-O Chem 1 0.9964900
BiocF22-O Chem 1 0.3491960
Expert-O Chem 1 0.0000000
BiocF22-O Chem 2 0.4716465
Expert-O Chem 2 0.0000000
Expert-BiocF22 0.0000000
plotAndTable(allBiochem,"actual_year","PLC","PLC: Year","PLC")

Statistics of PLC based on the category actual_year
group1 n mean sd median min max range
X11 Expert 7 0.71 0.08 0.69 0.60 0.82 0.22
X12 first_year 21 0.12 0.14 0.12 -0.21 0.29 0.50
X13 second_year 21 0.12 0.10 0.13 -0.09 0.28 0.36
X14 third_year 21 0.18 0.12 0.17 -0.07 0.36 0.43
Anova: PLC: Year
Testing statistical significance: p-values
first_year-Expert 0.0000000
second_year-Expert 0.0000000
third_year-Expert 0.0000000
second_year-first_year 0.9964900
third_year-first_year 0.3491960
third_year-second_year 0.4716465
plotAndTable(allBiochem,"race_binary","PLC","PLC: White/Non-white","PLC")

Statistics of PLC based on the category race_binary
group1 n mean sd median min max range
X11 Non-white 20 0.11 0.12 0.13 -0.21 0.31 0.52
X12 White 42 0.16 0.11 0.17 -0.08 0.36 0.44
Anova: PLC: White/Non-white
Testing statistical significance: p-values
0.1909842
#plotAndTable(allBiochem,"Sex_birth","PLC","PLC: Sex","PLC")

Only female students consistently took the Nucleic Acids survey, so no “Sex_birth” analysis is provided.

4.2 NS only: Anova

plotAndTable(allBiochem,"Course_collected","NS","NS: Course","NS")

Statistics of NS based on the category Course_collected
group1 n mean sd median min max range
X11 O Chem 1 21 0.16 0.08 0.14 0.04 0.39 0.35
X12 O Chem 2 21 0.16 0.05 0.17 0.04 0.27 0.23
X13 BiocF22 21 0.19 0.08 0.20 0.04 0.30 0.26
X14 Expert 7 0.43 0.08 0.44 0.33 0.53 0.20
Anova: NS: Course
Testing statistical significance: p-values
O Chem 2-O Chem 1 0.9947528
BiocF22-O Chem 1 0.7725937
Expert-O Chem 1 0.0000000
BiocF22-O Chem 2 0.6257497
Expert-O Chem 2 0.0000000
Expert-BiocF22 0.0000000
plotAndTable(allBiochem,"actual_year","NS","NS: Year","NS")

Statistics of NS based on the category actual_year
group1 n mean sd median min max range
X11 Expert 7 0.43 0.08 0.44 0.33 0.53 0.20
X12 first_year 21 0.16 0.08 0.14 0.04 0.39 0.35
X13 second_year 21 0.16 0.05 0.17 0.04 0.27 0.23
X14 third_year 21 0.19 0.08 0.20 0.04 0.30 0.26
Anova: NS: Year
Testing statistical significance: p-values
first_year-Expert 0.0000000
second_year-Expert 0.0000000
third_year-Expert 0.0000000
second_year-first_year 0.9947528
third_year-first_year 0.7725937
third_year-second_year 0.6257497
plotAndTable(allBiochem,"race_binary","NS","NS: White/Non-white","NS")

Statistics of NS based on the category race_binary
group1 n mean sd median min max range
X11 Non-white 20 0.18 0.09 0.19 0.04 0.39 0.35
X12 White 42 0.16 0.06 0.15 0.04 0.30 0.26
Anova: NS: White/Non-white
Testing statistical significance: p-values
0.3247307
#plotAndTable(allBiochem,"Sex_birth","NS","NS: Sex","NS")

Only female students consistently took the Nucleic Acids survey, so no “Sex_birth” analysis is provided.

4.3 PLC/NS clustering

A caveat of clustering is that k-means is an iterative method, so different “initial seeds” can yield different results. The results are reproducible only when the seed is fixed with “set.seed(42)” before running k-means.

plotAndTable(allBiochem,"clusterLetter","PLC","PLC: Cluster letter","PLC")

Statistics of PLC based on the category clusterLetter
group1 n mean sd median min max range
X11 Expert 7 0.71 0.08 0.69 0.60 0.82 0.22
X12 HP 22 0.25 0.06 0.23 0.14 0.36 0.22
X13 IP 30 0.12 0.07 0.12 0.00 0.31 0.31
X14 LP 11 -0.04 0.08 -0.06 -0.21 0.05 0.26
Anova: PLC: Cluster letter
Testing statistical significance: p-values
HP-Expert 0e+00
IP-Expert 0e+00
LP-Expert 0e+00
IP-HP 2e-07
LP-HP 0e+00
LP-IP 0e+00

4.3.1 Analysis by course

plotBarAndCorr(allBiochem,"Course_collected","clusterLetter","Course","N of students","High, Intermediate, Low Performance cluster")

The Chi-square analysis gives a p= 0.11131

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# One integer colour code per course (factor level order).
markerIntegers <- as.integer(as.factor(allBiochem$Course_collected))
# NS against PLC; plotting symbols come from clusterLetter, colours from course.
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  main = "Nucleic Acids - High(H), Intermediate(I), Low(L) performers",
  xlab = "PLC",
  ylab = "NS"
)
# unique() keeps first-appearance order for both vectors, so each label stays
# paired with the colour used for that course in the plot.
legend(
  "topleft",
  legend = unique(allBiochem$Course_collected),
  col = unique(markerIntegers),
  lty = 1L,
  cex = 0.8
)

# Per-course breakdown of the performance clusters by sex and race.
calcStats(allBiochem, "Course_collected")
Results for category: O Chem 2

High Performers Intermediate Performers Low Performers
O Chem 2 Total N= 21 19 % 67 % 14 %
Sex: males N= 0 ; females N= 20 male female male female male female
NaN % 20 % NaN % 70 % NaN % 10 %
Race: White N= 14 ; Non-white N= 7 white non-white white non-white white non-white
21 % 14 % 79 % 43 % 0 % 43 %
Results for category: O Chem 1

High Performers Intermediate Performers Low Performers
O Chem 1 Total N= 21 38 % 33 % 29 %
Sex: males N= 0 ; females N= 21 male female male female male female
NaN % 38 % NaN % 33 % NaN % 29 %
Race: White N= 14 ; Non-white N= 7 white non-white white non-white white non-white
29 % 57 % 36 % 29 % 36 % 14 %
Results for category: BiocF22

High Performers Intermediate Performers Low Performers
BiocF22 Total N= 21 48 % 43 % 9.5 %
Sex: males N= 0 ; females N= 21 male female male female male female
NaN % 48 % NaN % 43 % NaN % 9.5 %
Race: White N= 14 ; Non-white N= 7 white non-white white non-white white non-white
50 % 43 % 43 % 43 % 7.1 % 14 %

4.3.2 Analysis by year

plotBarAndCorr(allBiochem,"actual_year","clusterLetter","Year","N of students","High, Intermediate, Low Performance cluster")

The Chi-square analysis gives a p= 0.11131

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# One integer colour code per student year (factor level order).
markerIntegers <- as.integer(as.factor(allBiochem$actual_year))
# NS against PLC; plotting symbols come from clusterLetter, colours from year.
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  main = "Nucleic Acids - High(H), Intermediate(I), Low(L) performers",
  xlab = "PLC",
  ylab = "NS"
)
# unique() keeps first-appearance order for both vectors, so each label stays
# paired with the colour used for that year in the plot.
legend(
  "topleft",
  legend = unique(allBiochem$actual_year),
  col = unique(markerIntegers),
  lty = 1L,
  cex = 0.8
)

# Per-year breakdown of the performance clusters by sex and race.
calcStats2(allBiochem, "actual_year")
Results for category: second_year

High Performers Intermediate Performers Low Performers
second_year Total N= 21 19 % 67 % 14 %
Sex: males N= 0 ; females N= 20 male female male female male female
NaN % 20 % NaN % 70 % NaN % 10 %
Race: White N= 14 ; Non-white N= 7 white non-white white non-white white non-white
21 % 14 % 79 % 43 % 0 % 43 %
Results for category: first_year

High Performers Intermediate Performers Low Performers
first_year Total N= 21 38 % 33 % 29 %
Sex: males N= 0 ; females N= 21 male female male female male female
NaN % 38 % NaN % 33 % NaN % 29 %
Race: White N= 14 ; Non-white N= 7 white non-white white non-white white non-white
29 % 57 % 36 % 29 % 36 % 14 %
Results for category: third_year

High Performers Intermediate Performers Low Performers
third_year Total N= 21 48 % 43 % 9.5 %
Sex: males N= 0 ; females N= 21 male female male female male female
NaN % 48 % NaN % 43 % NaN % 9.5 %
Race: White N= 14 ; Non-white N= 7 white non-white white non-white white non-white
50 % 43 % 43 % 43 % 7.1 % 14 %
# Emit the sub-heading as inline HTML. `<br>` is the valid line-break tag;
# the previous `</br>` is a stray end tag and not valid HTML.
cat("<b>Chi-square analysis of Performance by Sex and Race considering different years</b><br>")

Chi-square analysis of Performance by Sex and Race considering different years

#plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="first_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 1st year")
#plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="second_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 2nd year")
#plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="third_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 3rd year")

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="first_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 1st year")

The Chi-square analysis gives a p= 0.40224

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="second_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 2nd year")

The Chi-square analysis gives a p= 0.02993

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="third_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 3rd year")

The Chi-square analysis gives a p= 0.86071

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

5 UMR courses: Oxygen Binding

5.1 PLC only: Anova

We test whether the PLC score differs significantly across the categories “Course collected”, “Student year”, “White/Non-white”, and “Sex at birth”.

# Oxygen Binding section: build the analysis table from the Oxygen Binding
# survey rows (umrs4), then add the expert rows (exs4) so that an "Expert"
# group is available in the comparisons that follow.
allBiochem <- analyzeUMRCourses(umrs4)
allBiochem <- addExperts(allBiochem, exs4)
# buildTables(allBiochem)  # currently disabled
# PLC summary statistics and pairwise p-values, grouped by course.
plotAndTable(allBiochem, "Course_collected", "PLC", "PLC: Course", "PLC")

Statistics of PLC based on the category Course_collected
group1 n mean sd median min max range
X11 O Chem 1 25 0.16 0.13 0.18 -0.19 0.34 0.53
X12 O Chem 2 25 0.16 0.15 0.16 -0.12 0.48 0.60
X13 BiocF22 25 0.19 0.13 0.20 -0.04 0.41 0.45
X14 Expert 15 0.69 0.13 0.66 0.52 0.89 0.38
Anova: PLC: Course
Testing statistical significance: p-values
O Chem 2-O Chem 1 1.0000000
BiocF22-O Chem 1 0.8602106
Expert-O Chem 1 0.0000000
BiocF22-O Chem 2 0.8582771
Expert-O Chem 2 0.0000000
Expert-BiocF22 0.0000000
plotAndTable(allBiochem,"actual_year","PLC","PLC: Year","PLC")

Statistics of PLC based on the category actual_year
group1 n mean sd median min max range
X11 Expert 15 0.69 0.13 0.66 0.52 0.89 0.38
X12 first_year 25 0.16 0.13 0.18 -0.19 0.34 0.53
X13 second_year 25 0.16 0.15 0.16 -0.12 0.48 0.60
X14 third_year 25 0.19 0.13 0.20 -0.04 0.41 0.45
Anova: PLC: Year
Testing statistical significance: p-values
first_year-Expert 0.0000000
second_year-Expert 0.0000000
third_year-Expert 0.0000000
second_year-first_year 1.0000000
third_year-first_year 0.8602106
third_year-second_year 0.8582771
plotAndTable(allBiochem,"race_binary","PLC","PLC: White/Non-white","PLC")

Statistics of PLC based on the category race_binary
group1 n mean sd median min max range
X11 Non-white 29 0.14 0.13 0.14 -0.12 0.40 0.52
X12 White 45 0.20 0.13 0.20 -0.19 0.48 0.68
Anova: PLC: White/Non-white
Testing statistical significance: p-values
0.053386
plotAndTable(allBiochem,"Sex_birth","PLC","PLC: Sex","PLC")

Statistics of PLC based on the category Sex_birth
group1 n mean sd median min max range
X11 Female 71 0.17 0.13 0.18 -0.19 0.48 0.68
X12 Male 3 0.19 0.14 0.16 0.06 0.35 0.28
Anova: PLC: Sex
Testing statistical significance: p-values
0.8472443

5.2 NS only: Anova

plotAndTable(allBiochem,"Course_collected","NS","NS: Course","NS")

Statistics of NS based on the category Course_collected
group1 n mean sd median min max range
X11 O Chem 1 25 0.16 0.06 0.17 0.04 0.30 0.26
X12 O Chem 2 25 0.15 0.07 0.15 0.00 0.30 0.30
X13 BiocF22 25 0.17 0.06 0.17 0.04 0.30 0.26
X14 Expert 15 0.35 0.09 0.35 0.25 0.53 0.28
Anova: NS: Course
Testing statistical significance: p-values
O Chem 2-O Chem 1 0.9166441
BiocF22-O Chem 1 0.9843835
Expert-O Chem 1 0.0000000
BiocF22-O Chem 2 0.7483636
Expert-O Chem 2 0.0000000
Expert-BiocF22 0.0000000
plotAndTable(allBiochem,"actual_year","NS","NS: Year","NS")

Statistics of NS based on the category actual_year
group1 n mean sd median min max range
X11 Expert 15 0.35 0.09 0.35 0.25 0.53 0.28
X12 first_year 25 0.16 0.06 0.17 0.04 0.30 0.26
X13 second_year 25 0.15 0.07 0.15 0.00 0.30 0.30
X14 third_year 25 0.17 0.06 0.17 0.04 0.30 0.26
Anova: NS: Year
Testing statistical significance: p-values
first_year-Expert 0.0000000
second_year-Expert 0.0000000
third_year-Expert 0.0000000
second_year-first_year 0.9166441
third_year-first_year 0.9843835
third_year-second_year 0.7483636
plotAndTable(allBiochem,"race_binary","NS","NS: White/Non-white","NS")

Statistics of NS based on the category race_binary
group1 n mean sd median min max range
X11 Non-white 29 0.15 0.06 0.15 0.04 0.3 0.26
X12 White 45 0.17 0.07 0.17 0.00 0.3 0.30
Anova: NS: White/Non-white
Testing statistical significance: p-values
0.3355991
plotAndTable(allBiochem,"Sex_birth","NS","NS: Sex","NS")

Statistics of NS based on the category Sex_birth
group1 n mean sd median min max range
X11 Female 71 0.16 0.07 0.17 0.00 0.30 0.30
X12 Male 3 0.13 0.07 0.15 0.05 0.18 0.13
Anova: NS: Sex
Testing statistical significance: p-values
0.3499788

5.3 PLC/NS clustering

A caveat of clustering is that k-means is an iterative method, so different “initial seeds” can yield different results. The results are reproducible only when the seed is fixed with “set.seed(42)” before running k-means.

plotAndTable(allBiochem,"clusterLetter","PLC","PLC: Cluster letter","PLC")

Statistics of PLC based on the category clusterLetter
group1 n mean sd median min max range
X11 Expert 15 0.69 0.13 0.66 0.52 0.89 0.38
X12 HP 25 0.27 0.10 0.26 0.10 0.48 0.38
X13 IP 36 0.18 0.07 0.18 0.03 0.37 0.34
X14 LP 14 -0.02 0.09 -0.01 -0.19 0.08 0.28
Anova: PLC: Cluster letter
Testing statistical significance: p-values
HP-Expert 0.0000000
IP-Expert 0.0000000
LP-Expert 0.0000000
IP-HP 0.0054258
LP-HP 0.0000000
LP-IP 0.0000000

5.3.1 Analysis by course

plotBarAndCorr(allBiochem,"Course_collected","clusterLetter","Course","N of students","High, Intermediate, Low Performance cluster")

The Chi-square analysis gives a p= 0.48122

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# One integer colour code per course (factor level order).
markerIntegers <- as.integer(as.factor(allBiochem$Course_collected))
# NS against PLC; plotting symbols come from clusterLetter, colours from course.
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  main = "Oxygen Binding - High(H), Intermediate(I), Low(L) performers",
  xlab = "PLC",
  ylab = "NS"
)
# unique() keeps first-appearance order for both vectors, so each label stays
# paired with the colour used for that course in the plot.
legend(
  "topleft",
  legend = unique(allBiochem$Course_collected),
  col = unique(markerIntegers),
  lty = 1L,
  cex = 0.8
)

# Per-course breakdown of the performance clusters by sex and race.
calcStats(allBiochem, "Course_collected")
Results for category: O Chem 2

High Performers Intermediate Performers Low Performers
O Chem 2 Total N= 25 20 % 56 % 24 %
Sex: males N= 1 ; females N= 23 male female male female male female
0 % 22 % 0 % 57 % 100 % 22 %
Race: White N= 15 ; Non-white N= 10 white non-white white non-white white non-white
33 % 0 % 47 % 70 % 20 % 30 %
Results for category: O Chem 1

High Performers Intermediate Performers Low Performers
O Chem 1 Total N= 25 36 % 48 % 16 %
Sex: males N= 1 ; females N= 24 male female male female male female
0 % 38 % 100 % 46 % 0 % 17 %
Race: White N= 15 ; Non-white N= 10 white non-white white non-white white non-white
47 % 20 % 40 % 60 % 13 % 20 %
Results for category: BiocF22

High Performers Intermediate Performers Low Performers
BiocF22 Total N= 25 44 % 40 % 16 %
Sex: males N= 1 ; females N= 24 male female male female male female
100 % 42 % 0 % 42 % 0 % 17 %
Race: White N= 15 ; Non-white N= 10 white non-white white non-white white non-white
47 % 40 % 40 % 40 % 13 % 20 %

5.3.2 Analysis by year

plotBarAndCorr(allBiochem,"actual_year","clusterLetter","Year","N of students","High, Intermediate, Low Performance cluster")

The Chi-square analysis gives a p= 0.48122

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# One integer colour code per student year (factor level order).
markerIntegers <- as.integer(as.factor(allBiochem$actual_year))
# NS against PLC; plotting symbols come from clusterLetter, colours from year.
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  main = "Oxygen Binding - High(H), Intermediate(I), Low(L) performers",
  xlab = "PLC",
  ylab = "NS"
)
# unique() keeps first-appearance order for both vectors, so each label stays
# paired with the colour used for that year in the plot.
legend(
  "topleft",
  legend = unique(allBiochem$actual_year),
  col = unique(markerIntegers),
  lty = 1L,
  cex = 0.8
)

# Per-year breakdown of the performance clusters by sex and race.
calcStats2(allBiochem, "actual_year")
Results for category: second_year

High Performers Intermediate Performers Low Performers
second_year Total N= 25 20 % 56 % 24 %
Sex: males N= 1 ; females N= 23 male female male female male female
0 % 22 % 0 % 57 % 100 % 22 %
Race: White N= 15 ; Non-white N= 10 white non-white white non-white white non-white
33 % 0 % 47 % 70 % 20 % 30 %
Results for category: first_year

High Performers Intermediate Performers Low Performers
first_year Total N= 25 36 % 48 % 16 %
Sex: males N= 1 ; females N= 24 male female male female male female
0 % 38 % 100 % 46 % 0 % 17 %
Race: White N= 15 ; Non-white N= 10 white non-white white non-white white non-white
47 % 20 % 40 % 60 % 13 % 20 %
Results for category: third_year

High Performers Intermediate Performers Low Performers
third_year Total N= 25 44 % 40 % 16 %
Sex: males N= 1 ; females N= 24 male female male female male female
100 % 42 % 0 % 42 % 0 % 17 %
Race: White N= 15 ; Non-white N= 10 white non-white white non-white white non-white
47 % 40 % 40 % 40 % 13 % 20 %
# Emit the sub-heading as inline HTML. `<br>` is the valid line-break tag;
# the previous `</br>` is a stray end tag and not valid HTML.
cat("<b>Chi-square analysis of Performance by Sex and Race considering different years</b><br>")

Chi-square analysis of Performance by Sex and Race considering different years

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="first_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 1st year")

The Chi-square analysis gives a p= 0.56879

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="second_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 2nd year")

The Chi-square analysis gives a p= 0.20904

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="third_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 3rd year")

The Chi-square analysis gives a p= 0.51537

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="first_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 1st year")

The Chi-square analysis gives a p= 0.39616

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="second_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 2nd year")

The Chi-square analysis gives a p= 0.12451

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="third_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 3rd year")

The Chi-square analysis gives a p= 0.89258

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

6 UMR courses: Protein Structure

6.1 PLC only: Anova

We test whether the PLC score differs significantly across the categories “Course collected”, “Student year”, “White/Non-white”, and “Sex at birth”.

# Protein Structure section: build the analysis table from the Protein
# Structure survey rows (umrs5), then add the expert rows (exs5) so that an
# "Expert" group is available in the comparisons that follow.
allBiochem <- analyzeUMRCourses(umrs5)
allBiochem <- addExperts(allBiochem, exs5)
# buildTables(allBiochem)  # currently disabled
# PLC summary statistics and pairwise p-values, grouped by course.
plotAndTable(allBiochem, "Course_collected", "PLC", "PLC: Course", "PLC")

Statistics of PLC based on the category Course_collected
group1 n mean sd median min max range
X11 O Chem 1 21 0.14 0.15 0.13 -0.11 0.46 0.57
X12 O Chem 2 21 0.20 0.20 0.22 -0.18 0.62 0.80
X13 BiocF22 21 0.25 0.11 0.27 0.03 0.51 0.48
X14 Expert 7 0.76 0.10 0.79 0.59 0.89 0.30
Anova: PLC: Course
Testing statistical significance: p-values
O Chem 2-O Chem 1 0.5529700
BiocF22-O Chem 1 0.0890982
Expert-O Chem 1 0.0000000
BiocF22-O Chem 2 0.7099252
Expert-O Chem 2 0.0000000
Expert-BiocF22 0.0000000
# Summary statistics and ANOVA of PLC grouped by actual_year.
plotAndTable(
  allBiochem,
  "actual_year", "PLC",
  "PLC: Year", "PLC"
)

Statistics of PLC based on the category actual_year
group1 n mean sd median min max range
X11 Expert 7 0.76 0.10 0.79 0.59 0.89 0.30
X12 first_year 21 0.14 0.15 0.13 -0.11 0.46 0.57
X13 second_year 21 0.20 0.20 0.22 -0.18 0.62 0.80
X14 third_year 21 0.25 0.11 0.27 0.03 0.51 0.48
Anova: PLC: Year
Testing statistical significance: p-values
first_year-Expert 0.0000000
second_year-Expert 0.0000000
third_year-Expert 0.0000000
second_year-first_year 0.5529700
third_year-first_year 0.0890982
third_year-second_year 0.7099252
# Summary statistics and ANOVA of PLC grouped by race_binary.
plotAndTable(
  allBiochem,
  "race_binary", "PLC",
  "PLC: White/Non-white", "PLC"
)

Statistics of PLC based on the category race_binary
group1 n mean sd median min max range
X11 Non-white 20 0.14 0.17 0.19 -0.18 0.41 0.59
X12 White 42 0.22 0.16 0.22 -0.07 0.62 0.69
Anova: PLC: White/Non-white
Testing statistical significance: p-values
0.0748154
# Summary statistics and ANOVA of PLC grouped by Sex_birth.
plotAndTable(
  allBiochem,
  "Sex_birth", "PLC",
  "PLC: Sex", "PLC"
)

Statistics of PLC based on the category Sex_birth
group1 n mean sd median min max range
X11 Female 59 0.20 0.16 0.22 -0.18 0.62 0.80
X12 Male 3 -0.01 0.06 -0.04 -0.05 0.07 0.11
Anova: PLC: Sex
Testing statistical significance: p-values
0.0288722

6.2 NS only: Anova

# Summary statistics and ANOVA of NS grouped by Course_collected.
plotAndTable(
  allBiochem,
  "Course_collected", "NS",
  "NS: Course", "NS"
)

Statistics of NS based on the category Course_collected
group1 n mean sd median min max range
X11 O Chem 1 21 0.16 0.05 0.16 0.03 0.25 0.22
X12 O Chem 2 21 0.17 0.07 0.15 0.04 0.35 0.31
X13 BiocF22 21 0.18 0.04 0.18 0.08 0.29 0.21
X14 Expert 7 0.35 0.08 0.35 0.24 0.44 0.21
Anova: NS: Course
Testing statistical significance: p-values
O Chem 2-O Chem 1 0.9395887
BiocF22-O Chem 1 0.6126707
Expert-O Chem 1 0.0000000
BiocF22-O Chem 2 0.9143450
Expert-O Chem 2 0.0000000
Expert-BiocF22 0.0000001
# Summary statistics and ANOVA of NS grouped by actual_year.
plotAndTable(
  allBiochem,
  "actual_year", "NS",
  "NS: Year", "NS"
)

Statistics of NS based on the category actual_year
group1 n mean sd median min max range
X11 Expert 7 0.35 0.08 0.35 0.24 0.44 0.21
X12 first_year 21 0.16 0.05 0.16 0.03 0.25 0.22
X13 second_year 21 0.17 0.07 0.15 0.04 0.35 0.31
X14 third_year 21 0.18 0.04 0.18 0.08 0.29 0.21
Anova: NS: Year
Testing statistical significance: p-values
first_year-Expert 0.0000000
second_year-Expert 0.0000000
third_year-Expert 0.0000001
second_year-first_year 0.9395887
third_year-first_year 0.6126707
third_year-second_year 0.9143450
# Summary statistics and ANOVA of NS grouped by race_binary.
plotAndTable(
  allBiochem,
  "race_binary", "NS",
  "NS: White/Non-white", "NS"
)

Statistics of NS based on the category race_binary
group1 n mean sd median min max range
X11 Non-white 20 0.17 0.06 0.17 0.04 0.28 0.24
X12 White 42 0.17 0.06 0.17 0.03 0.35 0.31
Anova: NS: White/Non-white
Testing statistical significance: p-values
0.8978509
# Summary statistics and ANOVA of NS grouped by Sex_birth.
plotAndTable(
  allBiochem,
  "Sex_birth", "NS",
  "NS: Sex", "NS"
)

Statistics of NS based on the category Sex_birth
group1 n mean sd median min max range
X11 Female 59 0.17 0.06 0.17 0.03 0.35 0.31
X12 Male 3 0.12 0.03 0.12 0.09 0.14 0.05
Anova: NS: Sex
Testing statistical significance: p-values
0.1400967

6.3 PLC/NS clustering

The problem with clustering is that it is an iterative method, and different “initial seeds” will yield different results. Results are reproducible only when the k-means step is run with a fixed seed, “set.seed(42)”.

# Summary statistics and ANOVA of PLC grouped by the assigned cluster letter.
plotAndTable(
  allBiochem,
  "clusterLetter", "PLC",
  "PLC: Cluster letter", "PLC"
)

Statistics of PLC based on the category clusterLetter
group1 n mean sd median min max range
X11 Expert 7 0.76 0.10 0.79 0.59 0.89 0.30
X12 HP 14 0.35 0.13 0.32 0.11 0.62 0.51
X13 IP 30 0.25 0.08 0.24 0.13 0.46 0.33
X14 LP 19 0.00 0.09 0.02 -0.18 0.11 0.29
Anova: PLC: Cluster letter
Testing statistical significance: p-values
HP-Expert 0.0000000
IP-Expert 0.0000000
LP-Expert 0.0000000
IP-HP 0.0108399
LP-HP 0.0000000
LP-IP 0.0000000

6.3.1 Analysis by course

# Cluster letter vs. course, over the whole table.
plotBarAndCorr(
  allBiochem,
  "Course_collected", "clusterLetter",
  "Course", "N of students",
  "High, Intermediate, Low Performance cluster"
)

The Chi-square analysis gives a p= 0.11599

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# One integer colour code per row, derived from the Course_collected factor.
markerIntegers <- as.integer(as.factor(allBiochem$Course_collected))

# NS against PLC; clusterLetter is passed as the plotting symbol (pch) and the
# course code as the colour.
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  xlab = "PLC",
  ylab = "NS",
  main = "Protein Structure - High(H), Intermediate(I), Low(L) performers"
)

# unique() walks the labels and their codes in the same row order, so each
# label is paired with its own colour.
legend(
  "topleft",
  legend = unique(allBiochem$Course_collected),
  col = unique(markerIntegers),
  lty = 1,
  cex = 0.8
)

# Per-course breakdown of the performance clusters by sex and race.
calcStats(allBiochem, "Course_collected")
Results for category: O Chem 2

High Performers Intermediate Performers Low Performers
O Chem 2 Total N= 21 33 % 33 % 33 %
Sex: males N= 1 ; females N= 19 male female male female male female
0 % 37 % 0 % 32 % 100 % 32 %
Race: White N= 14 ; Non-white N= 7 white non-white white non-white white non-white
29 % 43 % 43 % 14 % 29 % 43 %
Results for category: O Chem 1

High Performers Intermediate Performers Low Performers
O Chem 1 Total N= 21 14 % 43 % 43 %
Sex: males N= 1 ; females N= 20 male female male female male female
0 % 15 % 0 % 45 % 100 % 40 %
Race: White N= 14 ; Non-white N= 7 white non-white white non-white white non-white
14 % 14 % 50 % 29 % 36 % 57 %
Results for category: BiocF22

High Performers Intermediate Performers Low Performers
BiocF22 Total N= 21 19 % 67 % 14 %
Sex: males N= 1 ; females N= 20 male female male female male female
0 % 20 % 0 % 70 % 100 % 10 %
Race: White N= 14 ; Non-white N= 7 white non-white white non-white white non-white
21 % 14 % 71 % 57 % 7.1 % 29 %

6.3.2 Analysis by year

# Cluster letter vs. student year, over the whole table.
plotBarAndCorr(
  allBiochem,
  "actual_year", "clusterLetter",
  "Year", "N of students",
  "High, Intermediate, Low Performance cluster"
)

The Chi-square analysis gives a p= 0.11599

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# One integer colour code per row, derived from the actual_year factor.
markerIntegers <- as.integer(as.factor(allBiochem$actual_year))

# NS against PLC; clusterLetter is passed as the plotting symbol (pch) and the
# year code as the colour.
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  pch = allBiochem$clusterLetter,
  col = markerIntegers,
  xlab = "PLC",
  ylab = "NS",
  main = "Protein Structure - High(H), Intermediate(I), Low(L) performers"
)

# unique() walks the labels and their codes in the same row order, so each
# label is paired with its own colour.
legend(
  "topleft",
  legend = unique(allBiochem$actual_year),
  col = unique(markerIntegers),
  lty = 1,
  cex = 0.8
)

# Per-year breakdown of the performance clusters by sex and race.
calcStats2(allBiochem, "actual_year")
Results for category: second_year

High Performers Intermediate Performers Low Performers
second_year Total N= 21 33 % 33 % 33 %
Sex: males N= 1 ; females N= 19 male female male female male female
0 % 37 % 0 % 32 % 100 % 32 %
Race: White N= 14 ; Non-white N= 7 white non-white white non-white white non-white
29 % 43 % 43 % 14 % 29 % 43 %
Results for category: first_year

High Performers Intermediate Performers Low Performers
first_year Total N= 21 14 % 43 % 43 %
Sex: males N= 1 ; females N= 20 male female male female male female
0 % 15 % 0 % 45 % 100 % 40 %
Race: White N= 14 ; Non-white N= 7 white non-white white non-white white non-white
14 % 14 % 50 % 29 % 36 % 57 %
Results for category: third_year

High Performers Intermediate Performers Low Performers
third_year Total N= 21 19 % 67 % 14 %
Sex: males N= 1 ; females N= 20 male female male female male female
0 % 20 % 0 % 70 % 100 % 10 %
Race: White N= 14 ; Non-white N= 7 white non-white white non-white white non-white
21 % 14 % 71 % 57 % 7.1 % 29 %
# Emit a bold HTML heading for the per-year chi-square section.
# The line break is written as <br>: br is a void element, so the closing
# form </br> used previously is not valid HTML.
cat("<b>Chi-square analysis of Performance by Sex and Race considering different years</b><br>")

Chi-square analysis of Performance by Sex and Race considering different years

# Cluster letter vs. sex at birth, restricted to first-year students.
# %in% never yields NA, so it selects the same rows as which(... == ...).
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "first_year", ],
  "Sex_birth", "clusterLetter",
  "Sex", "N of students", "Performance by Sex 1st year"
)

The Chi-square analysis gives a p= 0.49659

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Cluster letter vs. sex at birth, restricted to second-year students.
# %in% never yields NA, so it selects the same rows as which(... == ...).
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "second_year", ],
  "Sex_birth", "clusterLetter",
  "Sex", "N of students", "Performance by Sex 2nd year"
)

The Chi-square analysis gives a p= 0.37627

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Cluster letter vs. sex at birth, restricted to third-year students.
# %in% never yields NA, so it selects the same rows as which(... == ...).
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "third_year", ],
  "Sex_birth", "clusterLetter",
  "Sex", "N of students", "Performance by Sex 3rd year"
)

The Chi-square analysis gives a p= 0.04285

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Cluster letter vs. binary race category, restricted to first-year students.
# %in% never yields NA, so it selects the same rows as which(... == ...).
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "first_year", ],
  "race_binary", "clusterLetter",
  "Race", "N of students", "Performance by Race 1st year"
)

The Chi-square analysis gives a p= 0.60653

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Cluster letter vs. binary race category, restricted to second-year students.
# %in% never yields NA, so it selects the same rows as which(... == ...).
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "second_year", ],
  "race_binary", "clusterLetter",
  "Race", "N of students", "Performance by Race 2nd year"
)

The Chi-square analysis gives a p= 0.42437

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected

# Cluster letter vs. binary race category, restricted to third-year students.
# %in% never yields NA, so it selects the same rows as which(... == ...).
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "third_year", ],
  "race_binary", "clusterLetter",
  "Race", "N of students", "Performance by Race 3rd year"
)

The Chi-square analysis gives a p= 0.41316

Residuals analysis:

A negative residual implies that the measured value is lower than expected and a positive value higher than expected