# ---- Load and prepare the input data ----------------------------------------
# Every CSV below is read relative to this project folder.
setwd("~/Research/02b Neural Network Research UMR/Data + Analysis/Clustering_Xavier")

# Student answers for all surveys.
umr <- read.csv("UMR_all_for_R_with_courses.csv", header = TRUE)

# 'Biochem 1' was collected in two different fall terms; relabel it per term
# so the two offerings can be told apart later on.
umr$Course_collected[umr$Term_collected %in% "Fall2022"] <- gsub(
  "Biochem 1", "BiocF22",
  umr$Course_collected[umr$Term_collected %in% "Fall2022"]
)
umr$Course_collected[umr$Term_collected %in% "Fall2021"] <- gsub(
  "Biochem 1", "BiocF21",
  umr$Course_collected[umr$Term_collected %in% "Fall2021"]
)

# One table per survey.
umrs1 <- umr[umr$Survey %in% "ES_Chemical_Reaction", ]
umrs2 <- umr[umr$Survey %in% "ES_Glucosidase", ]
umrs3 <- umr[umr$Survey %in% "Nucleic_Acids", ]
umrs4 <- umr[umr$Survey %in% "Oxygen_Binding", ]
umrs5 <- umr[umr$Survey %in% "Protein_Structure", ]

# Keep only the students who continued through OChem 1, OChem 2 and Biochem.
# deid lists their identifiers, one column per term (s21, f21, f22).
deid <- read.csv("deids_f22_f21_s21.csv", header = TRUE)

# Some of the filtered tables below do not have a row count that is a multiple
# of 3, which suggests that some identifiers in deid have no match in
# UMR_all_for_R_with_courses.csv (students apparently removed from that file?).
# Instead of dropping unmatched students programmatically, the one known
# offender, UMRBIOC3321F22ES045, is removed from deid by hand.
deid <- deid[deid$f22 != "UMRBIOC3321F22ES045", ]

umrs1 <- umrs1[
  umrs1$Deidentifier %in% deid$s21 |
    umrs1$Deidentifier %in% deid$f21 |
    umrs1$Deidentifier %in% deid$f22,
]
umrs2 <- umrs2[
  umrs2$Deidentifier %in% deid$s21 |
    umrs2$Deidentifier %in% deid$f21 |
    umrs2$Deidentifier %in% deid$f22,
]
umrs3 <- umrs3[
  umrs3$Deidentifier %in% deid$s21 |
    umrs3$Deidentifier %in% deid$f21 |
    umrs3$Deidentifier %in% deid$f22,
]
umrs4 <- umrs4[
  umrs4$Deidentifier %in% deid$s21 |
    umrs4$Deidentifier %in% deid$f21 |
    umrs4$Deidentifier %in% deid$f22,
]
umrs5 <- umrs5[
  umrs5$Deidentifier %in% deid$s21 |
    umrs5$Deidentifier %in% deid$f21 |
    umrs5$Deidentifier %in% deid$f22,
]
# Quick sanity checks on the per-term counts (kept disabled):
#sum(umrs5$Term_collected=='Fall2021')
#sum(umrs5$Term_collected=='Fall2022')
#sum(umrs5$Term_collected=='Spring2021')

# Expert answers, split by survey like the student data.
# NOTE(review): the expert file is filtered on "ES_Chemical_Equation" while the
# student file uses "ES_Chemical_Reaction", and "Protein_Strcuture" is spelled
# differently from the student label "Protein_Structure". Presumably these
# match the labels stored in Experts_all_for_R.csv -- confirm, otherwise exs1
# and exs5 are empty.
expert <- read.csv("Experts_all_for_R.csv", header = TRUE)
exs1 <- expert[expert$Survey %in% "ES_Chemical_Equation", ]
exs2 <- expert[expert$Survey %in% "ES_Glucosidase", ]
exs3 <- expert[expert$Survey %in% "Nucleic_Acids", ]
exs4 <- expert[expert$Survey %in% "Oxygen_Binding", ]
exs5 <- expert[expert$Survey %in% "Protein_Strcuture", ]
library(psych)
# Build the per-student analysis table for one survey and label every student
# as a high (HP), intermediate (IP) or low (LP) performer.
#
# Args:
#   umrs1: data frame with the student answers of a single survey. It must
#          contain the columns Institution, Course_collected, Deidentifier,
#          Sex_birth, Race_ethnicity, Coherency, NS, actual_year and PLC.
#
# Returns:
#   A data frame with those columns (Coherency, NS and PLC coerced to numeric,
#   Course_collected turned into a factor with a fixed level order) plus:
#     race_binary   - "White" when Race_ethnicity is "White/Caucasian",
#                     otherwise "Non-white" (NA stays NA)
#     cluster       - k-means cluster id (3 clusters on scaled PLC and NS)
#     clusterLetter - "HP" for the cluster with the highest mean PLC, "LP" for
#                     the lowest, "IP" for the remaining one
#
# Side effect: calls set.seed(42) so the clustering is reproducible; this
# resets the global random number generator.
analyzeUMRCourses <- function(umrs1) {
  # Legacy filtering code, kept disabled:
  #allBiochem = data.frame(matrix(ncol = 8,nrow=0))
  #myCols = c("Institution", "Course_collected", "Deidentifier","Sex_birth","Race_ethnicity","Coherency","PLC","NS")
  #colnames(allBiochem) = myCols
  #allBiochem = rbind(allBiochem,otherss1[myCols])
  ##remove UT/BIO206 and all Dennison and non-Bioc3321 at UMR
  #allBiochem = allBiochem[! grepl("Dennison",allBiochem$Institution ),]
  #allBiochem = allBiochem[! grepl("BIO206",allBiochem$Course_collected),]
  #tempo = umrs1[grep("BIOC3321",umrs1$Course_collected),]
  #allBiochem = rbind(allBiochem,tempo[myCols])
  keepCols <- c(
    "Institution", "Course_collected", "Deidentifier", "Sex_birth",
    "Race_ethnicity", "Coherency", "NS", "actual_year", "PLC"
  )
  missingCols <- setdiff(keepCols, colnames(umrs1))
  if (length(missingCols) > 0) {
    stop(
      "analyzeUMRCourses: missing column(s): ",
      paste(missingCols, collapse = ", "),
      call. = FALSE
    )
  }

  allBiochem <- umrs1[, keepCols]
  allBiochem$Coherency <- as.numeric(allBiochem$Coherency)
  allBiochem$NS <- as.numeric(allBiochem$NS)
  allBiochem$PLC <- as.numeric(allBiochem$PLC)
  allBiochem$race_binary <- ifelse(
    allBiochem$Race_ethnicity == "White/Caucasian", "White", "Non-white"
  )

  # k-means starts from random centres, so the seed is fixed to make the
  # cluster assignment reproducible between runs.
  set.seed(42)
  df <- matrix(data = c(allBiochem$PLC, allBiochem$NS), ncol = 2)
  allBiochem$cluster <- kmeans(scale(df[, 1:2]), 3)$cluster

  # k-means numbers its clusters arbitrarily, so map them to HP / IP / LP by
  # their mean PLC. which.max()/which.min() avoid comparing floating point
  # means with `==`, and setdiff() finds the remaining cluster without
  # hard-coded arithmetic on the cluster ids.
  meanPLCbyCluster <- tapply(allBiochem$PLC, allBiochem$cluster, mean)
  clusterIds <- as.integer(names(meanPLCbyCluster))
  HPgroup <- clusterIds[which.max(meanPLCbyCluster)]
  LPgroup <- clusterIds[which.min(meanPLCbyCluster)]
  IPgroup <- setdiff(clusterIds, c(HPgroup, LPgroup))

  allBiochem$clusterLetter <- ifelse(
    allBiochem$cluster == HPgroup, "HP",
    ifelse(
      allBiochem$cluster == LPgroup, "LP",
      ifelse(allBiochem$cluster %in% IPgroup, "IP", "Oops")
    )
  )

  # Fixed level order so plots and tables list the courses consistently;
  # course labels that are not listed here become NA.
  allBiochem$Course_collected <- factor(
    allBiochem$Course_collected,
    levels = c(
      "Gen + Organic 1", "O Chem 1", "O Chem 2", "Gen Chem 2",
      "BiocF21", "BiocF22", "Biochem 2"
    )
  )
  return(allBiochem)
}
# Print one descriptive-statistics table of PLC for each grouping variable
# (cluster group, institution, year, course, sex and race).
#
# Args:
#   allBiochem: analysis table with the columns PLC, clusterLetter,
#               Institution, actual_year, Course_collected, Sex_birth and
#               race_binary.
#
# Each table is rendered with knitr::kable() and printed in the order listed
# below. The value of the last print() call is returned invisibly.
buildTables <- function(allBiochem) {
  # grouping column -> table caption
  captions <- c(
    clusterLetter = "PLC by cluster group",
    Institution = "PLC by institution",
    actual_year = "PLC by Actual Year",
    Course_collected = "PLC by course",
    Sex_birth = "PLC by Sex",
    race_binary = "PLC by Race"
  )
  # Columns of the describeBy() matrix that are shown in every table.
  shownCols <- c(2, 4:12)

  lastPrinted <- NULL
  for (groupCol in names(captions)) {
    stats <- describeBy(
      allBiochem$PLC, allBiochem[[groupCol]],
      mat = TRUE, digits = 2
    )
    lastPrinted <- print(
      knitr::kable(stats[, shownCols], caption = captions[[groupCol]])
    )
  }
  invisible(lastPrinted)
}
# Shared implementation behind calcStats() and calcStats2().
#
# For every distinct value of allBiochem[[groupColumn]] (except NA and
# "Expert") this writes, with cat(), an HTML header followed by an HTML table
# giving the share of high / intermediate / low performers overall, by sex and
# by race.
#
# Args:
#   allBiochem:  analysis table with the columns clusterLetter, Sex_birth and
#                race_binary in addition to the grouping column.
#   groupColumn: name of the column that defines the categories.
#
# Counting is NA-safe: a row whose group, sex, race or cluster is NA is simply
# not counted, instead of turning the whole count into NA. Percentages whose
# denominator is 0 are printed as NaN, as before.
#
# Returns NULL invisibly; the function is used for its printed output.
.calcStatsBy <- function(allBiochem, groupColumn) {
  groups <- allBiochem[[groupColumn]]
  cluster <- allBiochem$clusterLetter
  sex <- allBiochem$Sex_birth
  race <- allBiochem$race_binary

  # Number of rows for which every condition is TRUE (NA counts as "no").
  nWhere <- function(...) sum(Reduce(`&`, list(...)), na.rm = TRUE)
  # Percentage with two significant digits, as shown in the tables.
  pct <- function(part, whole) signif(part / whole * 100, digits = 2)

  # "course" is used as a generic name for the category being reported.
  for (course in unique(groups)) {
    # An NA category would make the comparison below fail, and experts are
    # not part of this breakdown.
    if (is.na(course) || course == "Expert") next
    header <- paste("<b>Results for category: ",course,"</b></br></br>")
    cat(header)

    here <- !is.na(groups) & groups == course
    umrTot <- nWhere(here)
    umrHP <- nWhere(here, cluster == "HP")
    umrIP <- nWhere(here, cluster == "IP")
    umrLP <- nWhere(here, cluster == "LP")

    umrMale <- nWhere(here, sex == "Male")
    umrHPmale <- nWhere(here, sex == "Male", cluster == "HP")
    umrIPmale <- nWhere(here, sex == "Male", cluster == "IP")
    umrLPmale <- nWhere(here, sex == "Male", cluster == "LP")
    umrFemale <- nWhere(here, sex == "Female")
    umrHPfemale <- nWhere(here, sex == "Female", cluster == "HP")
    umrIPfemale <- nWhere(here, sex == "Female", cluster == "IP")
    umrLPfemale <- nWhere(here, sex == "Female", cluster == "LP")

    umrWhite <- nWhere(here, race == "White")
    umrHPWhite <- nWhere(here, race == "White", cluster == "HP")
    umrIPWhite <- nWhere(here, race == "White", cluster == "IP")
    umrLPWhite <- nWhere(here, race == "White", cluster == "LP")
    umrNonwhite <- nWhere(here, race == "Non-white")
    umrHPNonwhite <- nWhere(here, race == "Non-white", cluster == "HP")
    umrIPNonwhite <- nWhere(here, race == "Non-white", cluster == "IP")
    umrLPNonwhite <- nWhere(here, race == "Non-white", cluster == "LP")

    # The template lines start at column 0 on purpose: they are part of a
    # multi-line string literal, so any indentation would end up in the HTML.
    output <- paste("<table >
<thead>
<tr>
<th colspan='2'></th>
<th colspan='2'>High Performers</th>
<th colspan='2'>Intermediate Performers</th>
<th colspan='2'>Low Performers</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan='5'>",course," </td>
<td>Total N=", umrTot,"</td>
<td colspan='2'>", pct(umrHP, umrTot),"% </td>
<td colspan='2'>", pct(umrIP, umrTot),"%</td>
<td colspan='2'>", pct(umrLP, umrTot),"% </td>
</tr>
<tr>
<td rowspan='2'>Sex: males N=",umrMale,"; females N=",umrFemale,"</td>
<td>male</td>
<td>female</td>
<td>male</td>
<td>female</td>
<td>male</td>
<td>female</td>
</tr>
<tr>
<td>", pct(umrHPmale, umrMale),"%</td>
<td>", pct(umrHPfemale, umrFemale),"%</td>
<td>", pct(umrIPmale, umrMale),"%</td>
<td>", pct(umrIPfemale, umrFemale),"%</td>
<td>", pct(umrLPmale, umrMale),"%</td>
<td>", pct(umrLPfemale, umrFemale),"%</td>
</tr>
<tr>
<td rowspan='2'>Race: White N=",umrWhite,"; Non-white N=",umrNonwhite,"</td>
<td>white</td>
<td>non-white</td>
<td>white</td>
<td>non-white</td>
<td>white</td>
<td>non-white</td>
</tr>
<tr>
<td>", pct(umrHPWhite, umrWhite),"%</td>
<td>", pct(umrHPNonwhite, umrNonwhite),"%</td>
<td>", pct(umrIPWhite, umrWhite),"%</td>
<td>", pct(umrIPNonwhite, umrNonwhite),"%</td>
<td>", pct(umrLPWhite, umrWhite),"%</td>
<td>", pct(umrLPNonwhite, umrNonwhite),"%</td>
</tr>
</tbody>
</table> ")
    cat(output)
  }
  invisible(NULL)
}

# Print the HP / IP / LP breakdown (overall, by sex, by race) as one HTML
# table per course in allBiochem$Course_collected.
#
# Args:
#   allBiochem: analysis table (see .calcStatsBy()).
#   mycategory: unused; kept so existing calls keep working.
calcStats <- function(allBiochem, mycategory) {
  .calcStatsBy(allBiochem, "Course_collected")
}

# Same report as calcStats(), but with one HTML table per student year in
# allBiochem$actual_year.
#
# Args:
#   allBiochem: analysis table (see .calcStatsBy()).
#   mycategory: unused; kept so existing calls keep working.
calcStats2 <- function(allBiochem, mycategory) {
  .calcStatsBy(allBiochem, "actual_year")
}
library(ggplot2)
library(ggpubr)
library(psych)
# Box plot (with jittered points) of a score by category, annotated with the
# overall mean, a global ANOVA p-value and a per-group t-test against all
# groups pooled.
#
# Args:
#   df:      data frame holding the data.
#   myx:     name of the categorical column shown on the x axis.
#   myy:     name of the numeric column shown on the y axis.
#   mytitle: plot title.
#   myylab:  y axis label.
#
# Rows whose myy value is missing are dropped before plotting.
# Returns the ggplot object; the caller has to print() it.
plotGGbox <- function(df, myx, myy, mytitle, myylab) {
  df <- df[complete.cases(df[[myy]]), ]
  yvals <- df[[myy]]
  maxy <- max(yvals)
  # The axis starts at 0 unless there are negative scores; in that case it is
  # extended downwards so those points are not cut off by the zoom.
  lowery <- min(0, min(yvals) * 1.1)
  ggboxplot(df,
    x = myx, y = myy,
    title = mytitle,
    color = myx, add = "jitter", legend = "none", ylab = myylab
  ) + rotate_x_text(angle = 45) +
    # dashed line at the overall mean
    geom_hline(yintercept = mean(yvals), linetype = 2) +
    # global ANOVA p-value, placed above the per-group labels
    stat_compare_means(method = "anova", label.y = maxy * 1.10) +
    coord_cartesian(ylim = c(lowery, maxy * 1.2)) +
    # each group against all groups pooled
    stat_compare_means(
      label = "p.format", size = 2.5, method = "t.test",
      ref.group = ".all.", label.y = maxy * 1.05
    )
}
# Print the Tukey HSD adjusted p-values for all pairwise comparisons of a
# score between the levels of a category.
#
# Args:
#   df:      data frame holding the data.
#   myx:     name of the categorical column.
#   myy:     name of the numeric column.
#   mytitle: text appended to the table caption.
#   myylab:  unused; kept so the signature matches plotGGbox()/plotAndTable().
#
# The table is rendered with knitr::kable() and printed.
getAnova <- function(df, myx, myy, mytitle, myylab) {
  tukey <- TukeyHSD(aov(df[[myy]] ~ df[[myx]]))
  # The model has a single term, so its comparisons are the first element.
  # drop = FALSE keeps the "A-B" row label even when there is only one
  # comparison (two groups); otherwise the p-value is printed without saying
  # which groups were compared.
  pvalues <- as.data.frame(tukey[[1]][, "p adj", drop = FALSE])
  colnames(pvalues) <- c("Testing statistical significance: p-values")
  print(knitr::kable(pvalues, caption = paste("Anova: ", mytitle)))
}
# Full report of one score by one category: box plot, descriptive statistics
# table and Tukey HSD p-values.
#
# Args:
#   df:      analysis table (students plus experts).
#   myx:     name of the categorical column.
#   myy:     name of the numeric column.
#   mytitle: plot title, also used in the p-value table caption.
#   myylab:  y axis label, also used in the statistics table caption.
#
# For the demographic categories (Sex_birth, race_binary) the expert rows and
# the "Prefer not to answer" rows are removed first.
plotAndTable <- function(df, myx, myy, mytitle, myylab) {
  if (myx == "Sex_birth" || myx == "race_binary") {
    # ignore.case replaces the inline "(?i)" / "(?)" regex prefixes, which
    # depend on how the regex engine treats them.
    df <- df[!grepl("Expert", df$Course_collected, ignore.case = TRUE), ]
    df <- df[!grepl("Prefer not to answer", df$Sex_birth, ignore.case = TRUE), ]
  }
  print(plotGGbox(df, myx, myy, mytitle, myylab))

  # Unused factor levels must be dropped, otherwise describeBy() reports NaN
  # rows for the empty categories.
  groups <- df[[myx]]
  if (is.factor(groups)) {
    groups <- droplevels(groups)
  }
  stats <- describeBy(df[[myy]], groups, mat = TRUE, digits = 2)
  print(knitr::kable(
    stats[, c(2, 4, 5, 6, 7, 10, 11, 12)],
    caption = paste("Statistics of ", myylab, " based on the category", myx)
  ))
  getAnova(df, myx, myy, mytitle, myylab)
}
# Append the expert answers to the student table so that both can be analysed
# together.
#
# Args:
#   alldf:   student analysis table; must contain PLC, NS and Coherency.
#   experts: expert table; its PLC, NS and Coherency columns are copied.
#
# Returns:
#   alldf with one extra row per expert. In those rows PLC, NS and Coherency
#   come from `experts` and every other column holds the literal "Expert".
#   Because of that, non-character columns of alldf other than the three
#   metrics (e.g. a numeric cluster id) become character in the result.
addExperts <- function(alldf, experts) {
  metricCols <- c("PLC", "NS", "Coherency")
  missingCols <- setdiff(
    metricCols, intersect(colnames(alldf), colnames(experts))
  )
  if (length(missingCols) > 0) {
    stop(
      "addExperts: column(s) missing from alldf or experts: ",
      paste(missingCols, collapse = ", "),
      call. = FALSE
    )
  }
  # Nothing to append.
  if (nrow(experts) == 0) {
    return(alldf)
  }

  # Expert rows with the same columns as alldf, all pre-filled with "Expert"
  # whatever the number of columns is.
  ex_new <- as.data.frame(
    matrix("Expert", nrow = nrow(experts), ncol = ncol(alldf)),
    stringsAsFactors = FALSE
  )
  colnames(ex_new) <- colnames(alldf)
  for (metric in metricCols) {
    ex_new[[metric]] <- experts[[metric]]
  }
  rbind(alldf, ex_new)
}
library(dplyr)
library(corrplot)
# Chi-square test of independence between the two categorical columns of `a`,
# reported as HTML text plus a plot of the residuals.
#
# Args:
#   a: data frame with exactly the two categorical columns to cross-tabulate.
#
# Writes the p-value (5 decimals) and a short reading guide with cat(), then
# draws the test residuals with corrplot().
plotChi <- function(a) {
  # Unused factor levels are dropped first; otherwise a level such as
  # "Expert" shows up in the table as a ghost category full of zeros.
  contingency <- table(droplevels(a))
  chi <- chisq.test(contingency)

  pvalueLine <- paste(
    "<p><b>The Chi-square analysis gives a p=", round(chi$p.value, 5), "</b></p>"
  )
  cat(pvalueLine)
  cat("<p><b>Residuals analysis:</b></p>")
  cat("A negative residual implies that the measured value is lower than expected and a positive value higher than expected</br>")

  # An alternative that was tried and left out: plotting each cell's
  # contribution, 100 * residuals^2 / statistic, instead of the residuals.
  corrplot(chi$residuals, is.cor = FALSE)
}
# Stacked bar chart of one categorical variable against another, followed by
# the chi-square analysis of the same two variables.
#
# Args:
#   df:       analysis table (students plus experts).
#   myx:      name of the independent variable (course or demographic column).
#   myy:      name of the dependent variable, typically "clusterLetter".
#   myxlabel: x axis label.
#   myylabel: y axis label.
#   mytitle:  plot title.
plotBarAndCorr <- function(df, myx, myy, myxlabel, myylabel, mytitle) {
  # Experts are not useful for the chi-square analysis. They are recognised
  # by the "Expert" marker in the first column of df.
  a <- df[!grepl("Expert", df[, 1]), ]
  if (myx == "Sex_birth") {
    # ignore.case replaces the "(?)" regex prefix, which is not a valid
    # inline flag and only worked by leniency of the regex engine.
    a <- a[!grepl("Prefer not to answer", a$Sex_birth, ignore.case = TRUE), ]
  }
  # Keep only the two categorical variables being compared.
  a <- a[, c(myy, myx)]
  print(plotBarCategories(a, myx, myy, myxlabel, myylabel, mytitle))
  plotChi(a)
}
# Stacked bar chart of the number of rows per myx category, split by myy, with
# each segment labelled by its percentage within the myx category.
#
# Args:
#   a:        data frame containing the two categorical columns.
#   myx:      name of the column on the x axis.
#   myy:      name of the column used as fill colour.
#   myxlabel: x axis label.
#   myylabel: y axis label.
#   mytitle:  plot title.
#
# Returns the ggplot object; the caller has to print() it.
plotBarCategories <- function(a, myx, myy, myxlabel, myylabel, mytitle) {
  # The column names arrive as strings, so they are turned into symbols and
  # unquoted with !! wherever a column is expected.
  xcol <- sym(myx)
  fillcol <- sym(myy)

  counts <- count(a, !!fillcol, !!xcol)
  counts <- group_by(counts, !!xcol)
  # Percentages are computed within each x category (the grouping above).
  counts <- mutate(counts, lab = paste0(round(prop.table(n) * 100, 2), "%"))

  ggplot(counts, aes(!!xcol, n, fill = !!fillcol)) +
    geom_col() +
    geom_text(aes(label = lab), position = "stack", vjust = 1.5) +
    labs(x = myxlabel, y = myylabel, title = mytitle)
}
What was learned from “clustering_indeces_v2” analysis about indicators
Meaning of PLC and NS:
* PLC: path length correlation. It measures how connected two nodes are; the closer to one, the more connected. Compared to the experts.
* NS: neighborhood similarity. Compared to the experts.
Here we analyze only the subgroup of students who took the same survey during OChem1, OChem2, and Biochem. The sample is much smaller. Each survey is treated separately: the fact that “student1” took survey1 in those three semesters does not mean that student1 also took survey2. This is why the sample size is not exactly the same for each survey. For some surveys the analysis by sex is omitted because there are no males. Because the sample size is so small, be careful about drawing conclusions on some demographics for some courses/surveys.
The analysis by year is kept even though it should be identical to the analysis by course because OChem1 is taken by first years, OChem2 by second, and Biochem by third years.
We test whether the PLC score differs significantly among the categories “Course collected”, “Student year”, “White/Non-white”, and “Sex at birth”.
#
# Survey 1 (umrs1, the "ES_Chemical_Reaction" student answers): cluster the
# students, then append the expert rows from exs1.
allBiochem = analyzeUMRCourses(umrs1)
allBiochem = addExperts(allBiochem,exs1)
# The descriptive tables from buildTables() are left disabled.
#buildTables(allBiochem)
# PLC by course: box plot, descriptive statistics table and Tukey p-values.
plotAndTable(allBiochem,"Course_collected","PLC","PLC: Course","PLC")
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | O Chem 1 | 24 | 0.27 | 0.17 | 0.30 | -0.16 | 0.55 | 0.71 |
X12 | O Chem 2 | 24 | 0.26 | 0.18 | 0.26 | -0.40 | 0.52 | 0.92 |
X13 | BiocF22 | 24 | 0.44 | 0.13 | 0.43 | 0.23 | 0.72 | 0.49 |
X14 | Expert | 6 | 0.67 | 0.12 | 0.69 | 0.49 | 0.82 | 0.33 |
Testing statistical significance: p-values | |
---|---|
O Chem 2-O Chem 1 | 0.9954125 |
BiocF22-O Chem 1 | 0.0021329 |
Expert-O Chem 1 | 0.0000029 |
BiocF22-O Chem 2 | 0.0009780 |
Expert-O Chem 2 | 0.0000016 |
Expert-BiocF22 | 0.0120205 |
#df=allBiochem
#myx="Course_collected"
#myy="PLC"
#mytitle="PLC:Whatever"
#myylab="PLC"
#plotAndTable = function(df,myx,myy,mytitle,myylab){
# if (myx=="Sex_birth" | myx=="race_binary"){
# df = df[!grepl("(?i)Expert", df$Course_collected),]
# df = df[!grepl("(?)Prefer not to answer",df$Sex_birth),]
# }
# print(plotGGbox(df,myx,myy,mytitle,myylab))
# table = describeBy(df[[myy]],df[[myx]],mat=TRUE,digits = 2)
# table = describeBy(df[[myy]],df[[myx]],digits = 2)
# print(knitr::kable(table2[,c(2,4,5,6,7,10,11,12)],caption=paste("Statistics of ",myylab," based on the category",myx)))
# getAnova(df,myx,myy,mytitle,myylab)
#}
# PLC by student year: box plot, descriptive statistics table and Tukey p-values.
plotAndTable(allBiochem,"actual_year","PLC","PLC: Year","PLC")
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Expert | 6 | 0.67 | 0.12 | 0.69 | 0.49 | 0.82 | 0.33 |
X12 | first_year | 24 | 0.27 | 0.17 | 0.30 | -0.16 | 0.55 | 0.71 |
X13 | second_year | 24 | 0.26 | 0.18 | 0.26 | -0.40 | 0.52 | 0.92 |
X14 | third_year | 24 | 0.44 | 0.13 | 0.43 | 0.23 | 0.72 | 0.49 |
Testing statistical significance: p-values | |
---|---|
first_year-Expert | 0.0000029 |
second_year-Expert | 0.0000016 |
third_year-Expert | 0.0120205 |
second_year-first_year | 0.9954125 |
third_year-first_year | 0.0021329 |
third_year-second_year | 0.0009780 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Non-white | 25 | 0.30 | 0.22 | 0.30 | -0.40 | 0.72 | 1.12 |
X12 | White | 46 | 0.34 | 0.15 | 0.36 | -0.16 | 0.62 | 0.78 |
Testing statistical significance: p-values |
---|
0.387264 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Female | 64 | 0.33 | 0.17 | 0.36 | -0.40 | 0.62 | 1.02 |
X12 | Male | 7 | 0.26 | 0.26 | 0.27 | -0.16 | 0.72 | 0.88 |
Testing statistical significance: p-values |
---|
0.3200775 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | O Chem 1 | 24 | 0.22 | 0.10 | 0.20 | 0.04 | 0.44 | 0.41 |
X12 | O Chem 2 | 24 | 0.23 | 0.09 | 0.22 | 0.08 | 0.41 | 0.33 |
X13 | BiocF22 | 24 | 0.24 | 0.06 | 0.24 | 0.15 | 0.35 | 0.20 |
X14 | Expert | 6 | 0.37 | 0.11 | 0.34 | 0.28 | 0.57 | 0.29 |
Testing statistical significance: p-values | |
---|---|
O Chem 2-O Chem 1 | 0.9562525 |
BiocF22-O Chem 1 | 0.9084114 |
Expert-O Chem 1 | 0.0009971 |
BiocF22-O Chem 2 | 0.9985778 |
Expert-O Chem 2 | 0.0028907 |
Expert-BiocF22 | 0.0039688 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Expert | 6 | 0.37 | 0.11 | 0.34 | 0.28 | 0.57 | 0.29 |
X12 | first_year | 24 | 0.22 | 0.10 | 0.20 | 0.04 | 0.44 | 0.41 |
X13 | second_year | 24 | 0.23 | 0.09 | 0.22 | 0.08 | 0.41 | 0.33 |
X14 | third_year | 24 | 0.24 | 0.06 | 0.24 | 0.15 | 0.35 | 0.20 |
Testing statistical significance: p-values | |
---|---|
first_year-Expert | 0.0009971 |
second_year-Expert | 0.0028907 |
third_year-Expert | 0.0039688 |
second_year-first_year | 0.9562525 |
third_year-first_year | 0.9084114 |
third_year-second_year | 0.9985778 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Non-white | 25 | 0.21 | 0.08 | 0.19 | 0.08 | 0.39 | 0.31 |
X12 | White | 46 | 0.24 | 0.09 | 0.22 | 0.04 | 0.44 | 0.41 |
Testing statistical significance: p-values |
---|
0.2074017 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Female | 64 | 0.24 | 0.08 | 0.22 | 0.08 | 0.44 | 0.37 |
X12 | Male | 7 | 0.18 | 0.07 | 0.18 | 0.04 | 0.26 | 0.23 |
Testing statistical significance: p-values |
---|
0.0713704 |
The problem with clustering is that it is an iterative method, and different “initial seeds” can yield different results. It is only reproducible because the k-means step uses “set.seed(42)”.
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Expert | 6 | 0.67 | 0.12 | 0.69 | 0.49 | 0.82 | 0.33 |
X12 | HP | 23 | 0.45 | 0.11 | 0.44 | 0.19 | 0.62 | 0.43 |
X13 | IP | 28 | 0.38 | 0.09 | 0.36 | 0.27 | 0.72 | 0.45 |
X14 | LP | 21 | 0.12 | 0.15 | 0.19 | -0.40 | 0.24 | 0.64 |
Testing statistical significance: p-values | |
---|---|
HP-Expert | 0.0006232 |
IP-Expert | 0.0000041 |
LP-Expert | 0.0000000 |
IP-HP | 0.1913762 |
LP-HP | 0.0000000 |
LP-IP | 0.0000000 |
Are cluster groups unevenly distributed among these categories? A chi-square analysis tests whether the three cluster groups (HP, IP, LP) contain statistically similar proportions of each category (course, year, sex, race…); a small p-value indicates that they do not.
# Cluster membership per course: stacked bar chart, then the chi-square test
# and its residual plot (expert rows are removed inside plotBarAndCorr()).
plotBarAndCorr(allBiochem,"Course_collected","clusterLetter","Course","N of students","High, Intermediate, Low Performance cluster")
The Chi-square analysis gives a p= 0.0879
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Scatter plot of NS against PLC, coloured by course.
# The colour of each point is the integer code of its course.
markerIntegers = as.integer(as.factor(allBiochem$Course_collected))
# pch is given the cluster labels; presumably base graphics draws the first
# character of each label, giving the H / I / L markers named in the title
# (confirm). Expert rows are still part of allBiochem here.
plot(allBiochem$PLC,allBiochem$NS,pch=allBiochem$clusterLetter,main = "ES_Chemical_Reaction - High(H), Intermediate(I), Low(L) performers",ylab="NS",xlab="PLC",col=markerIntegers)
# unique() is applied to the labels and to the colour codes in the same row
# order, so legend entry i is paired with colour i.
legend("topleft", legend=unique(allBiochem$Course_collected), col=unique(markerIntegers), lty=1:1, cex=0.8)
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
O Chem 2 | Total N= 24 | 25 % | 33 % | 42 % | |||
Sex: males N= 2 ; females N= 21 | male | female | male | female | male | female | |
0 % | 29 % | 50 % | 33 % | 50 % | 38 % | ||
Race: White N= 16 ; Non-white N= 8 | white | non-white | white | non-white | white | non-white | |
25 % | 25 % | 38 % | 25 % | 38 % | 50 % |
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
O Chem 1 | Total N= 24 | 25 % | 38 % | 38 % | |||
Sex: males N= 2 ; females N= 22 | male | female | male | female | male | female | |
0 % | 27 % | 0 % | 41 % | 100 % | 32 % | ||
Race: White N= 16 ; Non-white N= 8 | white | non-white | white | non-white | white | non-white | |
31 % | 12 % | 38 % | 38 % | 31 % | 50 % |
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
BiocF22 | Total N= 24 | 46 % | 46 % | 8.3 % | |||
Sex: males N= 3 ; females N= 21 | male | female | male | female | male | female | |
0 % | 52 % | 100 % | 38 % | 0 % | 9.5 % | ||
Race: White N= 14 ; Non-white N= 10 | white | non-white | white | non-white | white | non-white | |
57 % | 30 % | 43 % | 50 % | 0 % | 20 % |
# Cluster membership per student year: stacked bar chart, then the chi-square
# test and its residual plot.
plotBarAndCorr(allBiochem,"actual_year","clusterLetter","Year","N of students","High, Intermediate, Low Performance cluster")
The Chi-square analysis gives a p= 0.0879
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Scatter plot of NS against PLC, coloured by student year.
# The colour of each point is the integer code of its year.
markerIntegers = as.integer(as.factor(allBiochem$actual_year))
# pch is given the cluster labels; presumably base graphics draws the first
# character of each label, giving the H / I / L markers named in the title
# (confirm). Expert rows are still part of allBiochem here.
plot(allBiochem$PLC,allBiochem$NS,pch=allBiochem$clusterLetter,main = "ES_Chemical_Reaction - High(H), Intermediate(I), Low(L) performers",ylab="NS",xlab="PLC",col=markerIntegers)
# unique() is applied to the labels and to the colour codes in the same row
# order, so legend entry i is paired with colour i.
legend("topleft", legend=unique(allBiochem$actual_year), col=unique(markerIntegers), lty=1:1, cex=0.8)
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
second_year | Total N= 24 | 25 % | 33 % | 42 % | |||
Sex: males N= 2 ; females N= 21 | male | female | male | female | male | female | |
0 % | 29 % | 50 % | 33 % | 50 % | 38 % | ||
Race: White N= 16 ; Non-white N= 8 | white | non-white | white | non-white | white | non-white | |
25 % | 25 % | 38 % | 25 % | 38 % | 50 % |
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
first_year | Total N= 24 | 25 % | 38 % | 38 % | |||
Sex: males N= 2 ; females N= 22 | male | female | male | female | male | female | |
0 % | 27 % | 0 % | 41 % | 100 % | 32 % | ||
Race: White N= 16 ; Non-white N= 8 | white | non-white | white | non-white | white | non-white | |
31 % | 12 % | 38 % | 38 % | 31 % | 50 % |
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
third_year | Total N= 24 | 46 % | 46 % | 8.3 % | |||
Sex: males N= 3 ; females N= 21 | male | female | male | female | male | female | |
0 % | 52 % | 100 % | 38 % | 0 % | 9.5 % | ||
Race: White N= 14 ; Non-white N= 10 | white | non-white | white | non-white | white | non-white | |
57 % | 30 % | 43 % | 50 % | 0 % | 20 % |
Chi-square analysis of Performance by Sex and Race considering different years
# First-year rows only: cluster membership by sex (bar chart + chi-square).
plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="first_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 1st year")
The Chi-square analysis gives a p= 0.16232
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Second-year rows only: cluster membership by sex (bar chart + chi-square).
plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="second_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 2nd year")
The Chi-square analysis gives a p= 0.67591
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Third-year rows only: cluster membership by sex (bar chart + chi-square).
plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="third_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 3rd year")
The Chi-square analysis gives a p= 0.13187
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# First-year rows only: cluster membership by race (bar chart + chi-square).
plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="first_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 1st year")
The Chi-square analysis gives a p= 0.53526
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Second-year rows only: cluster membership by race (bar chart + chi-square).
plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="second_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 2nd year")
The Chi-square analysis gives a p= 0.79852
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Third-year rows only: cluster membership by race (bar chart + chi-square).
plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="third_year"),],"race_binary","clusterLetter","Race","N of students","Performance by Race 3rd year")
The Chi-square analysis gives a p= 0.14937
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
We test whether the PLC score differs significantly among the categories “Course collected”, “Student year”, “White/Non-white”, and “Sex at birth”.
#
# Survey 2 (umrs2, the "ES_Glucosidase" student answers): cluster the
# students, then append the expert rows from exs2.
allBiochem = analyzeUMRCourses(umrs2)
allBiochem = addExperts(allBiochem,exs2)
# The descriptive tables from buildTables() are left disabled.
#buildTables(allBiochem)
# PLC by course: box plot, descriptive statistics table and Tukey p-values.
plotAndTable(allBiochem,"Course_collected","PLC","PLC: Course","PLC")
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | O Chem 1 | 24 | 0.31 | 0.14 | 0.30 | 0.06 | 0.52 | 0.46 |
X12 | O Chem 2 | 24 | 0.29 | 0.17 | 0.33 | -0.25 | 0.52 | 0.78 |
X13 | BiocF22 | 24 | 0.45 | 0.11 | 0.46 | 0.24 | 0.65 | 0.41 |
X14 | Expert | 8 | 0.72 | 0.09 | 0.70 | 0.59 | 0.82 | 0.23 |
Testing statistical significance: p-values | |
---|---|
O Chem 2-O Chem 1 | 0.9571573 |
BiocF22-O Chem 1 | 0.0043942 |
Expert-O Chem 1 | 0.0000000 |
BiocF22-O Chem 2 | 0.0008321 |
Expert-O Chem 2 | 0.0000000 |
Expert-BiocF22 | 0.0000461 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Expert | 8 | 0.72 | 0.09 | 0.70 | 0.59 | 0.82 | 0.23 |
X12 | first_year | 24 | 0.31 | 0.14 | 0.30 | 0.06 | 0.52 | 0.46 |
X13 | second_year | 24 | 0.29 | 0.17 | 0.33 | -0.25 | 0.52 | 0.78 |
X14 | third_year | 24 | 0.45 | 0.11 | 0.46 | 0.24 | 0.65 | 0.41 |
Testing statistical significance: p-values | |
---|---|
first_year-Expert | 0.0000000 |
second_year-Expert | 0.0000000 |
third_year-Expert | 0.0000461 |
second_year-first_year | 0.9571573 |
third_year-first_year | 0.0043942 |
third_year-second_year | 0.0008321 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Non-white | 25 | 0.32 | 0.20 | 0.32 | -0.25 | 0.65 | 0.90 |
X12 | White | 46 | 0.36 | 0.14 | 0.38 | 0.04 | 0.57 | 0.53 |
Testing statistical significance: p-values |
---|
0.3480736 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Female | 64 | 0.35 | 0.16 | 0.38 | -0.25 | 0.60 | 0.85 |
X12 | Male | 7 | 0.30 | 0.18 | 0.30 | 0.09 | 0.65 | 0.56 |
Testing statistical significance: p-values |
---|
0.400078 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | O Chem 1 | 24 | 0.25 | 0.08 | 0.26 | 0.04 | 0.39 | 0.35 |
X12 | O Chem 2 | 24 | 0.24 | 0.08 | 0.24 | 0.11 | 0.39 | 0.28 |
X13 | BiocF22 | 24 | 0.26 | 0.07 | 0.25 | 0.10 | 0.41 | 0.31 |
X14 | Expert | 8 | 0.40 | 0.06 | 0.42 | 0.29 | 0.47 | 0.17 |
Testing statistical significance: p-values | |
---|---|
O Chem 2-O Chem 1 | 0.9667165 |
BiocF22-O Chem 1 | 0.9745936 |
Expert-O Chem 1 | 0.0000346 |
BiocF22-O Chem 2 | 0.8123887 |
Expert-O Chem 2 | 0.0000097 |
Expert-BiocF22 | 0.0001065 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Expert | 8 | 0.40 | 0.06 | 0.42 | 0.29 | 0.47 | 0.17 |
X12 | first_year | 24 | 0.25 | 0.08 | 0.26 | 0.04 | 0.39 | 0.35 |
X13 | second_year | 24 | 0.24 | 0.08 | 0.24 | 0.11 | 0.39 | 0.28 |
X14 | third_year | 24 | 0.26 | 0.07 | 0.25 | 0.10 | 0.41 | 0.31 |
Testing statistical significance: p-values | |
---|---|
first_year-Expert | 0.0000346 |
second_year-Expert | 0.0000097 |
third_year-Expert | 0.0001065 |
second_year-first_year | 0.9667165 |
third_year-first_year | 0.9745936 |
third_year-second_year | 0.8123887 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Non-white | 25 | 0.24 | 0.08 | 0.25 | 0.10 | 0.39 | 0.29 |
X12 | White | 46 | 0.25 | 0.08 | 0.25 | 0.04 | 0.41 | 0.37 |
Testing statistical significance: p-values |
---|
0.6249095 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Female | 64 | 0.25 | 0.08 | 0.25 | 0.10 | 0.41 | 0.31 |
X12 | Male | 7 | 0.21 | 0.08 | 0.24 | 0.04 | 0.28 | 0.23 |
Testing statistical significance: p-values |
---|
0.1949647 |
The problem with clustering is that k-means is an iterative method, and different “initial seeds” will yield different results. The clusters are reproducible only when the random seed is fixed before running k-means, here with “set.seed(42)”
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Expert | 8 | 0.72 | 0.09 | 0.70 | 0.59 | 0.82 | 0.23 |
X12 | HP | 28 | 0.47 | 0.09 | 0.48 | 0.26 | 0.65 | 0.39 |
X13 | IP | 34 | 0.32 | 0.10 | 0.34 | 0.06 | 0.50 | 0.43 |
X14 | LP | 10 | 0.11 | 0.15 | 0.14 | -0.25 | 0.27 | 0.52 |
Testing statistical significance: p-values | |
---|---|
HP-Expert | 5.0e-07 |
IP-Expert | 0.0e+00 |
LP-Expert | 0.0e+00 |
IP-HP | 2.4e-06 |
LP-HP | 0.0e+00 |
LP-IP | 4.8e-06 |
# Cluster membership (clusterLetter) by course for the whole sample; the
# rendered output reports a chi-square p-value and residuals.
plotBarAndCorr(
  allBiochem,
  "Course_collected", "clusterLetter",
  "Course", "N of students",
  "High, Intermediate, Low Performance cluster"
)
The Chi-square analysis gives a p= 0.28225
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Scatter of NS against PLC: each point is drawn with its clusterLetter as the
# symbol and coloured by the integer code of its course.
markerIntegers = as.integer(as.factor(allBiochem$Course_collected))
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  xlab = "PLC",
  ylab = "NS",
  main = "ES Glucosidase - High(H), Intermediate(I), Low(L) performers",
  pch = allBiochem$clusterLetter,
  col = markerIntegers
)
# unique() keeps first-appearance order for both vectors, so the i-th label is
# paired with the i-th colour.
legend(
  "topleft",
  legend = unique(allBiochem$Course_collected),
  col = unique(markerIntegers),
  lty = 1L,
  cex = 0.8
)
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
O Chem 2 | Total N= 24 | 29 % | 50 % | 21 % | |||
Sex: males N= 2 ; females N= 21 | male | female | male | female | male | female | |
0 % | 33 % | 100 % | 43 % | 0 % | 24 % | ||
Race: White N= 16 ; Non-white N= 8 | white | non-white | white | non-white | white | non-white | |
31 % | 25 % | 56 % | 38 % | 12 % | 38 % |
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
O Chem 1 | Total N= 24 | 33 % | 50 % | 17 % | |||
Sex: males N= 2 ; females N= 22 | male | female | male | female | male | female | |
0 % | 36 % | 50 % | 50 % | 50 % | 14 % | ||
Race: White N= 16 ; Non-white N= 8 | white | non-white | white | non-white | white | non-white | |
38 % | 25 % | 38 % | 75 % | 25 % | 0 % |
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
BiocF22 | Total N= 24 | 54 % | 42 % | 4.2 % | |||
Sex: males N= 3 ; females N= 21 | male | female | male | female | male | female | |
33 % | 57 % | 67 % | 38 % | 0 % | 4.8 % | ||
Race: White N= 14 ; Non-white N= 10 | white | non-white | white | non-white | white | non-white | |
64 % | 40 % | 36 % | 50 % | 0 % | 10 % |
# Cluster membership (clusterLetter) by student year for the whole sample.
# NOTE(review): in this sample actual_year appears to map one-to-one onto
# Course_collected (same group sizes, same p-value in the rendered output),
# so this looks like a repeat of the per-course test. Confirm.
plotBarAndCorr(
  allBiochem,
  "actual_year", "clusterLetter",
  "Year", "N of students",
  "High, Intermediate, Low Performance cluster"
)
The Chi-square analysis gives a p= 0.28225
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Scatter of NS against PLC: each point is drawn with its clusterLetter as the
# symbol and coloured by the integer code of its student year.
markerIntegers = as.integer(as.factor(allBiochem$actual_year))
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  xlab = "PLC",
  ylab = "NS",
  main = "ES Glucosidase - High(H), Intermediate(I), Low(L) performers",
  pch = allBiochem$clusterLetter,
  col = markerIntegers
)
# unique() keeps first-appearance order for both vectors, so the i-th label is
# paired with the i-th colour.
legend(
  "topleft",
  legend = unique(allBiochem$actual_year),
  col = unique(markerIntegers),
  lty = 1L,
  cex = 0.8
)
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
second_year | Total N= 24 | 29 % | 50 % | 21 % | |||
Sex: males N= 2 ; females N= 21 | male | female | male | female | male | female | |
0 % | 33 % | 100 % | 43 % | 0 % | 24 % | ||
Race: White N= 16 ; Non-white N= 8 | white | non-white | white | non-white | white | non-white | |
31 % | 25 % | 56 % | 38 % | 12 % | 38 % |
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
first_year | Total N= 24 | 33 % | 50 % | 17 % | |||
Sex: males N= 2 ; females N= 22 | male | female | male | female | male | female | |
0 % | 36 % | 50 % | 50 % | 50 % | 14 % | ||
Race: White N= 16 ; Non-white N= 8 | white | non-white | white | non-white | white | non-white | |
38 % | 25 % | 38 % | 75 % | 25 % | 0 % |
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
third_year | Total N= 24 | 54 % | 42 % | 4.2 % | |||
Sex: males N= 3 ; females N= 21 | male | female | male | female | male | female | |
33 % | 57 % | 67 % | 38 % | 0 % | 4.8 % | ||
Race: White N= 14 ; Non-white N= 10 | white | non-white | white | non-white | white | non-white | |
64 % | 40 % | 36 % | 50 % | 0 % | 10 % |
Chi-square analysis of Performance by Sex and Race considering different years
# Performance cluster vs. sex at birth, first-year students only.
# `%in%` never yields NA, so rows with a missing actual_year are excluded
# exactly as which() excluded them.
# NOTE(review): the summary table lists only 2 males in this year, so the
# chi-square approximation is likely unreliable; consider an exact test.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "first_year", ],
  "Sex_birth", "clusterLetter",
  "Sex", "N of students", "Performance by Sex 1st year"
)
The Chi-square analysis gives a p= 0.33591
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Performance cluster vs. sex at birth, second-year students only.
# `%in%` never yields NA, so rows with a missing actual_year are excluded
# exactly as which() excluded them.
# NOTE(review): the summary table lists only 2 males in this year, so the
# chi-square approximation is likely unreliable; consider an exact test.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "second_year", ],
  "Sex_birth", "clusterLetter",
  "Sex", "N of students", "Performance by Sex 2nd year"
)
The Chi-square analysis gives a p= 0.30276
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Performance cluster vs. sex at birth, third-year students only.
# `%in%` never yields NA, so rows with a missing actual_year are excluded
# exactly as which() excluded them.
# NOTE(review): the summary table lists only 3 males in this year, so the
# chi-square approximation is likely unreliable; consider an exact test.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "third_year", ],
  "Sex_birth", "clusterLetter",
  "Sex", "N of students", "Performance by Sex 3rd year"
)
The Chi-square analysis gives a p= 0.62755
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Performance cluster vs. binary race category, first-year students only.
# `%in%` never yields NA, so rows with a missing actual_year are excluded
# exactly as which() excluded them.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "first_year", ],
  "race_binary", "clusterLetter",
  "Race", "N of students", "Performance by Race 1st year"
)
The Chi-square analysis gives a p= 0.15335
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Performance cluster vs. binary race category, second-year students only.
# `%in%` never yields NA, so rows with a missing actual_year are excluded
# exactly as which() excluded them.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "second_year", ],
  "race_binary", "clusterLetter",
  "Race", "N of students", "Performance by Race 2nd year"
)
The Chi-square analysis gives a p= 0.35944
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Performance cluster vs. binary race category, third-year students only.
# `%in%` never yields NA, so rows with a missing actual_year are excluded
# exactly as which() excluded them.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "third_year", ],
  "race_binary", "clusterLetter",
  "Race", "N of students", "Performance by Race 3rd year"
)
The Chi-square analysis gives a p= 0.31335
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
We test whether the PLC score differs significantly among the groups defined by “Course collected”, “Student year”, “White/Non-white”, and “Sex at birth”
# ---- Nucleic Acids survey (umrs3) ----
# Rebuild allBiochem from the Nucleic_Acids student responses, then pass it
# through addExperts() with exs3 (presumably the matching expert responses;
# confirm where exs3 is defined).
allBiochem = analyzeUMRCourses(umrs3)
allBiochem = addExperts(allBiochem,exs3)
# buildTables() call left commented out.
#buildTables(allBiochem)
# PLC by course; "PLC: Course" and "PLC" look like a plot title and an axis
# label (TODO confirm against plotAndTable's definition).
plotAndTable(allBiochem,"Course_collected","PLC","PLC: Course","PLC")
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | O Chem 1 | 21 | 0.12 | 0.14 | 0.12 | -0.21 | 0.29 | 0.50 |
X12 | O Chem 2 | 21 | 0.12 | 0.10 | 0.13 | -0.09 | 0.28 | 0.36 |
X13 | BiocF22 | 21 | 0.18 | 0.12 | 0.17 | -0.07 | 0.36 | 0.43 |
X14 | Expert | 7 | 0.71 | 0.08 | 0.69 | 0.60 | 0.82 | 0.22 |
Testing statistical significance: p-values | |
---|---|
O Chem 2-O Chem 1 | 0.9964900 |
BiocF22-O Chem 1 | 0.3491960 |
Expert-O Chem 1 | 0.0000000 |
BiocF22-O Chem 2 | 0.4716465 |
Expert-O Chem 2 | 0.0000000 |
Expert-BiocF22 | 0.0000000 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Expert | 7 | 0.71 | 0.08 | 0.69 | 0.60 | 0.82 | 0.22 |
X12 | first_year | 21 | 0.12 | 0.14 | 0.12 | -0.21 | 0.29 | 0.50 |
X13 | second_year | 21 | 0.12 | 0.10 | 0.13 | -0.09 | 0.28 | 0.36 |
X14 | third_year | 21 | 0.18 | 0.12 | 0.17 | -0.07 | 0.36 | 0.43 |
Testing statistical significance: p-values | |
---|---|
first_year-Expert | 0.0000000 |
second_year-Expert | 0.0000000 |
third_year-Expert | 0.0000000 |
second_year-first_year | 0.9964900 |
third_year-first_year | 0.3491960 |
third_year-second_year | 0.4716465 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Non-white | 20 | 0.11 | 0.12 | 0.13 | -0.21 | 0.31 | 0.52 |
X12 | White | 42 | 0.16 | 0.11 | 0.17 | -0.08 | 0.36 | 0.44 |
Testing statistical significance: p-values |
---|
0.1909842 |
Only female students consistently took the Nucleic Acids survey, so no “Sex_birth” analysis is provided
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | O Chem 1 | 21 | 0.16 | 0.08 | 0.14 | 0.04 | 0.39 | 0.35 |
X12 | O Chem 2 | 21 | 0.16 | 0.05 | 0.17 | 0.04 | 0.27 | 0.23 |
X13 | BiocF22 | 21 | 0.19 | 0.08 | 0.20 | 0.04 | 0.30 | 0.26 |
X14 | Expert | 7 | 0.43 | 0.08 | 0.44 | 0.33 | 0.53 | 0.20 |
Testing statistical significance: p-values | |
---|---|
O Chem 2-O Chem 1 | 0.9947528 |
BiocF22-O Chem 1 | 0.7725937 |
Expert-O Chem 1 | 0.0000000 |
BiocF22-O Chem 2 | 0.6257497 |
Expert-O Chem 2 | 0.0000000 |
Expert-BiocF22 | 0.0000000 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Expert | 7 | 0.43 | 0.08 | 0.44 | 0.33 | 0.53 | 0.20 |
X12 | first_year | 21 | 0.16 | 0.08 | 0.14 | 0.04 | 0.39 | 0.35 |
X13 | second_year | 21 | 0.16 | 0.05 | 0.17 | 0.04 | 0.27 | 0.23 |
X14 | third_year | 21 | 0.19 | 0.08 | 0.20 | 0.04 | 0.30 | 0.26 |
Testing statistical significance: p-values | |
---|---|
first_year-Expert | 0.0000000 |
second_year-Expert | 0.0000000 |
third_year-Expert | 0.0000000 |
second_year-first_year | 0.9947528 |
third_year-first_year | 0.7725937 |
third_year-second_year | 0.6257497 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Non-white | 20 | 0.18 | 0.09 | 0.19 | 0.04 | 0.39 | 0.35 |
X12 | White | 42 | 0.16 | 0.06 | 0.15 | 0.04 | 0.30 | 0.26 |
Testing statistical significance: p-values |
---|
0.3247307 |
Only female students consistently took the Nucleic Acids survey, so no “Sex_birth” analysis is provided
The problem with clustering is that k-means is an iterative method, and different “initial seeds” will yield different results. The clusters are reproducible only when the random seed is fixed before running k-means, here with “set.seed(42)”
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Expert | 7 | 0.71 | 0.08 | 0.69 | 0.60 | 0.82 | 0.22 |
X12 | HP | 22 | 0.25 | 0.06 | 0.23 | 0.14 | 0.36 | 0.22 |
X13 | IP | 30 | 0.12 | 0.07 | 0.12 | 0.00 | 0.31 | 0.31 |
X14 | LP | 11 | -0.04 | 0.08 | -0.06 | -0.21 | 0.05 | 0.26 |
Testing statistical significance: p-values | |
---|---|
HP-Expert | 0e+00 |
IP-Expert | 0e+00 |
LP-Expert | 0e+00 |
IP-HP | 2e-07 |
LP-HP | 0e+00 |
LP-IP | 0e+00 |
# Cluster membership (clusterLetter) by course for the whole sample; the
# rendered output reports a chi-square p-value and residuals.
plotBarAndCorr(
  allBiochem,
  "Course_collected", "clusterLetter",
  "Course", "N of students",
  "High, Intermediate, Low Performance cluster"
)
The Chi-square analysis gives a p= 0.11131
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Scatter of NS against PLC: each point is drawn with its clusterLetter as the
# symbol and coloured by the integer code of its course.
markerIntegers = as.integer(as.factor(allBiochem$Course_collected))
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  xlab = "PLC",
  ylab = "NS",
  main = "Nucleic Acids - High(H), Intermediate(I), Low(L) performers",
  pch = allBiochem$clusterLetter,
  col = markerIntegers
)
# unique() keeps first-appearance order for both vectors, so the i-th label is
# paired with the i-th colour.
legend(
  "topleft",
  legend = unique(allBiochem$Course_collected),
  col = unique(markerIntegers),
  lty = 1L,
  cex = 0.8
)
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
O Chem 2 | Total N= 21 | 19 % | 67 % | 14 % | |||
Sex: males N= 0 ; females N= 20 | male | female | male | female | male | female | |
NaN % | 20 % | NaN % | 70 % | NaN % | 10 % | ||
Race: White N= 14 ; Non-white N= 7 | white | non-white | white | non-white | white | non-white | |
21 % | 14 % | 79 % | 43 % | 0 % | 43 % |
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
O Chem 1 | Total N= 21 | 38 % | 33 % | 29 % | |||
Sex: males N= 0 ; females N= 21 | male | female | male | female | male | female | |
NaN % | 38 % | NaN % | 33 % | NaN % | 29 % | ||
Race: White N= 14 ; Non-white N= 7 | white | non-white | white | non-white | white | non-white | |
29 % | 57 % | 36 % | 29 % | 36 % | 14 % |
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
BiocF22 | Total N= 21 | 48 % | 43 % | 9.5 % | |||
Sex: males N= 0 ; females N= 21 | male | female | male | female | male | female | |
NaN % | 48 % | NaN % | 43 % | NaN % | 9.5 % | ||
Race: White N= 14 ; Non-white N= 7 | white | non-white | white | non-white | white | non-white | |
50 % | 43 % | 43 % | 43 % | 7.1 % | 14 % |
# Cluster membership (clusterLetter) by student year for the whole sample.
# NOTE(review): in this sample actual_year appears to map one-to-one onto
# Course_collected (same group sizes, same p-value in the rendered output),
# so this looks like a repeat of the per-course test. Confirm.
plotBarAndCorr(
  allBiochem,
  "actual_year", "clusterLetter",
  "Year", "N of students",
  "High, Intermediate, Low Performance cluster"
)
The Chi-square analysis gives a p= 0.11131
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Scatter of NS against PLC: each point is drawn with its clusterLetter as the
# symbol and coloured by the integer code of its student year.
markerIntegers = as.integer(as.factor(allBiochem$actual_year))
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  xlab = "PLC",
  ylab = "NS",
  main = "Nucleic Acids - High(H), Intermediate(I), Low(L) performers",
  pch = allBiochem$clusterLetter,
  col = markerIntegers
)
# unique() keeps first-appearance order for both vectors, so the i-th label is
# paired with the i-th colour.
legend(
  "topleft",
  legend = unique(allBiochem$actual_year),
  col = unique(markerIntegers),
  lty = 1L,
  cex = 0.8
)
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
second_year | Total N= 21 | 19 % | 67 % | 14 % | |||
Sex: males N= 0 ; females N= 20 | male | female | male | female | male | female | |
NaN % | 20 % | NaN % | 70 % | NaN % | 10 % | ||
Race: White N= 14 ; Non-white N= 7 | white | non-white | white | non-white | white | non-white | |
21 % | 14 % | 79 % | 43 % | 0 % | 43 % |
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
first_year | Total N= 21 | 38 % | 33 % | 29 % | |||
Sex: males N= 0 ; females N= 21 | male | female | male | female | male | female | |
NaN % | 38 % | NaN % | 33 % | NaN % | 29 % | ||
Race: White N= 14 ; Non-white N= 7 | white | non-white | white | non-white | white | non-white | |
29 % | 57 % | 36 % | 29 % | 36 % | 14 % |
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
third_year | Total N= 21 | 48 % | 43 % | 9.5 % | |||
Sex: males N= 0 ; females N= 21 | male | female | male | female | male | female | |
NaN % | 48 % | NaN % | 43 % | NaN % | 9.5 % | ||
Race: White N= 14 ; Non-white N= 7 | white | non-white | white | non-white | white | non-white | |
50 % | 43 % | 43 % | 43 % | 7.1 % | 14 % |
Chi-square analysis of Performance by Sex and Race considering different years
# The per-year Sex_birth comparisons stay disabled for this survey: the summary
# tables list males N= 0 in every year, so there is nothing to compare.
#plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="first_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 1st year")
#plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="second_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 2nd year")
#plotBarAndCorr(allBiochem[which(allBiochem$actual_year=="third_year"),],"Sex_birth","clusterLetter","Sex","N of students","Performance by Sex 3rd year")
# Performance cluster vs. binary race category, first-year students only.
# `%in%` never yields NA, so rows with a missing actual_year are excluded
# exactly as which() excluded them.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "first_year", ],
  "race_binary", "clusterLetter",
  "Race", "N of students", "Performance by Race 1st year"
)
The Chi-square analysis gives a p= 0.40224
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Performance cluster vs. binary race category, second-year students only.
# `%in%` never yields NA, so rows with a missing actual_year are excluded
# exactly as which() excluded them.
# NOTE(review): the summary table lists only 7 non-white students in this
# year, so the chi-square p-value reported here rests on very small cells;
# consider confirming it with an exact test.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "second_year", ],
  "race_binary", "clusterLetter",
  "Race", "N of students", "Performance by Race 2nd year"
)
The Chi-square analysis gives a p= 0.02993
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Performance cluster vs. binary race category, third-year students only.
# `%in%` never yields NA, so rows with a missing actual_year are excluded
# exactly as which() excluded them.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "third_year", ],
  "race_binary", "clusterLetter",
  "Race", "N of students", "Performance by Race 3rd year"
)
The Chi-square analysis gives a p= 0.86071
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
We test whether the PLC score differs significantly among the groups defined by “Course collected”, “Student year”, “White/Non-white”, and “Sex at birth”
# ---- Oxygen Binding survey (umrs4) ----
# Rebuild allBiochem from the Oxygen_Binding student responses, then pass it
# through addExperts() with exs4 (presumably the matching expert responses;
# confirm where exs4 is defined).
allBiochem = analyzeUMRCourses(umrs4)
allBiochem = addExperts(allBiochem,exs4)
# buildTables() call left commented out.
#buildTables(allBiochem)
# PLC by course; "PLC: Course" and "PLC" look like a plot title and an axis
# label (TODO confirm against plotAndTable's definition).
plotAndTable(allBiochem,"Course_collected","PLC","PLC: Course","PLC")
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | O Chem 1 | 25 | 0.16 | 0.13 | 0.18 | -0.19 | 0.34 | 0.53 |
X12 | O Chem 2 | 25 | 0.16 | 0.15 | 0.16 | -0.12 | 0.48 | 0.60 |
X13 | BiocF22 | 25 | 0.19 | 0.13 | 0.20 | -0.04 | 0.41 | 0.45 |
X14 | Expert | 15 | 0.69 | 0.13 | 0.66 | 0.52 | 0.89 | 0.38 |
Testing statistical significance: p-values | |
---|---|
O Chem 2-O Chem 1 | 1.0000000 |
BiocF22-O Chem 1 | 0.8602106 |
Expert-O Chem 1 | 0.0000000 |
BiocF22-O Chem 2 | 0.8582771 |
Expert-O Chem 2 | 0.0000000 |
Expert-BiocF22 | 0.0000000 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Expert | 15 | 0.69 | 0.13 | 0.66 | 0.52 | 0.89 | 0.38 |
X12 | first_year | 25 | 0.16 | 0.13 | 0.18 | -0.19 | 0.34 | 0.53 |
X13 | second_year | 25 | 0.16 | 0.15 | 0.16 | -0.12 | 0.48 | 0.60 |
X14 | third_year | 25 | 0.19 | 0.13 | 0.20 | -0.04 | 0.41 | 0.45 |
Testing statistical significance: p-values | |
---|---|
first_year-Expert | 0.0000000 |
second_year-Expert | 0.0000000 |
third_year-Expert | 0.0000000 |
second_year-first_year | 1.0000000 |
third_year-first_year | 0.8602106 |
third_year-second_year | 0.8582771 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Non-white | 29 | 0.14 | 0.13 | 0.14 | -0.12 | 0.40 | 0.52 |
X12 | White | 45 | 0.20 | 0.13 | 0.20 | -0.19 | 0.48 | 0.68 |
Testing statistical significance: p-values |
---|
0.053386 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Female | 71 | 0.17 | 0.13 | 0.18 | -0.19 | 0.48 | 0.68 |
X12 | Male | 3 | 0.19 | 0.14 | 0.16 | 0.06 | 0.35 | 0.28 |
Testing statistical significance: p-values |
---|
0.8472443 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | O Chem 1 | 25 | 0.16 | 0.06 | 0.17 | 0.04 | 0.30 | 0.26 |
X12 | O Chem 2 | 25 | 0.15 | 0.07 | 0.15 | 0.00 | 0.30 | 0.30 |
X13 | BiocF22 | 25 | 0.17 | 0.06 | 0.17 | 0.04 | 0.30 | 0.26 |
X14 | Expert | 15 | 0.35 | 0.09 | 0.35 | 0.25 | 0.53 | 0.28 |
Testing statistical significance: p-values | |
---|---|
O Chem 2-O Chem 1 | 0.9166441 |
BiocF22-O Chem 1 | 0.9843835 |
Expert-O Chem 1 | 0.0000000 |
BiocF22-O Chem 2 | 0.7483636 |
Expert-O Chem 2 | 0.0000000 |
Expert-BiocF22 | 0.0000000 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Expert | 15 | 0.35 | 0.09 | 0.35 | 0.25 | 0.53 | 0.28 |
X12 | first_year | 25 | 0.16 | 0.06 | 0.17 | 0.04 | 0.30 | 0.26 |
X13 | second_year | 25 | 0.15 | 0.07 | 0.15 | 0.00 | 0.30 | 0.30 |
X14 | third_year | 25 | 0.17 | 0.06 | 0.17 | 0.04 | 0.30 | 0.26 |
Testing statistical significance: p-values | |
---|---|
first_year-Expert | 0.0000000 |
second_year-Expert | 0.0000000 |
third_year-Expert | 0.0000000 |
second_year-first_year | 0.9166441 |
third_year-first_year | 0.9843835 |
third_year-second_year | 0.7483636 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Non-white | 29 | 0.15 | 0.06 | 0.15 | 0.04 | 0.3 | 0.26 |
X12 | White | 45 | 0.17 | 0.07 | 0.17 | 0.00 | 0.3 | 0.30 |
Testing statistical significance: p-values |
---|
0.3355991 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Female | 71 | 0.16 | 0.07 | 0.17 | 0.00 | 0.30 | 0.30 |
X12 | Male | 3 | 0.13 | 0.07 | 0.15 | 0.05 | 0.18 | 0.13 |
Testing statistical significance: p-values |
---|
0.3499788 |
The problem with clustering is that k-means is an iterative method, and different “initial seeds” will yield different results. The clusters are reproducible only when the random seed is fixed before running k-means, here with “set.seed(42)”
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Expert | 15 | 0.69 | 0.13 | 0.66 | 0.52 | 0.89 | 0.38 |
X12 | HP | 25 | 0.27 | 0.10 | 0.26 | 0.10 | 0.48 | 0.38 |
X13 | IP | 36 | 0.18 | 0.07 | 0.18 | 0.03 | 0.37 | 0.34 |
X14 | LP | 14 | -0.02 | 0.09 | -0.01 | -0.19 | 0.08 | 0.28 |
Testing statistical significance: p-values | |
---|---|
HP-Expert | 0.0000000 |
IP-Expert | 0.0000000 |
LP-Expert | 0.0000000 |
IP-HP | 0.0054258 |
LP-HP | 0.0000000 |
LP-IP | 0.0000000 |
# Cluster membership (clusterLetter) by course for the whole sample; the
# rendered output reports a chi-square p-value and residuals.
plotBarAndCorr(
  allBiochem,
  "Course_collected", "clusterLetter",
  "Course", "N of students",
  "High, Intermediate, Low Performance cluster"
)
The Chi-square analysis gives a p= 0.48122
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Scatter of NS against PLC: each point is drawn with its clusterLetter as the
# symbol and coloured by the integer code of its course.
markerIntegers = as.integer(as.factor(allBiochem$Course_collected))
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  xlab = "PLC",
  ylab = "NS",
  main = "Oxygen Binding - High(H), Intermediate(I), Low(L) performers",
  pch = allBiochem$clusterLetter,
  col = markerIntegers
)
# unique() keeps first-appearance order for both vectors, so the i-th label is
# paired with the i-th colour.
legend(
  "topleft",
  legend = unique(allBiochem$Course_collected),
  col = unique(markerIntegers),
  lty = 1L,
  cex = 0.8
)
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
O Chem 2 | Total N= 25 | 20 % | 56 % | 24 % | |||
Sex: males N= 1 ; females N= 23 | male | female | male | female | male | female | |
0 % | 22 % | 0 % | 57 % | 100 % | 22 % | ||
Race: White N= 15 ; Non-white N= 10 | white | non-white | white | non-white | white | non-white | |
33 % | 0 % | 47 % | 70 % | 20 % | 30 % |
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
O Chem 1 | Total N= 25 | 36 % | 48 % | 16 % | |||
Sex: males N= 1 ; females N= 24 | male | female | male | female | male | female | |
0 % | 38 % | 100 % | 46 % | 0 % | 17 % | ||
Race: White N= 15 ; Non-white N= 10 | white | non-white | white | non-white | white | non-white | |
47 % | 20 % | 40 % | 60 % | 13 % | 20 % |
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
BiocF22 | Total N= 25 | 44 % | 40 % | 16 % | |||
Sex: males N= 1 ; females N= 24 | male | female | male | female | male | female | |
100 % | 42 % | 0 % | 42 % | 0 % | 17 % | ||
Race: White N= 15 ; Non-white N= 10 | white | non-white | white | non-white | white | non-white | |
47 % | 40 % | 40 % | 40 % | 13 % | 20 % |
# Cluster membership (clusterLetter) by student year for the whole sample.
# NOTE(review): in this sample actual_year appears to map one-to-one onto
# Course_collected (same group sizes, same p-value in the rendered output),
# so this looks like a repeat of the per-course test. Confirm.
plotBarAndCorr(
  allBiochem,
  "actual_year", "clusterLetter",
  "Year", "N of students",
  "High, Intermediate, Low Performance cluster"
)
The Chi-square analysis gives a p= 0.48122
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Scatter of NS against PLC: each point is drawn with its clusterLetter as the
# symbol and coloured by the integer code of its student year.
markerIntegers = as.integer(as.factor(allBiochem$actual_year))
plot(
  x = allBiochem$PLC,
  y = allBiochem$NS,
  xlab = "PLC",
  ylab = "NS",
  main = "Oxygen Binding - High(H), Intermediate(I), Low(L) performers",
  pch = allBiochem$clusterLetter,
  col = markerIntegers
)
# unique() keeps first-appearance order for both vectors, so the i-th label is
# paired with the i-th colour.
legend(
  "topleft",
  legend = unique(allBiochem$actual_year),
  col = unique(markerIntegers),
  lty = 1L,
  cex = 0.8
)
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
second_year | Total N= 25 | 20 % | 56 % | 24 % | |||
Sex: males N= 1 ; females N= 23 | male | female | male | female | male | female | |
0 % | 22 % | 0 % | 57 % | 100 % | 22 % | ||
Race: White N= 15 ; Non-white N= 10 | white | non-white | white | non-white | white | non-white | |
33 % | 0 % | 47 % | 70 % | 20 % | 30 % |
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
first_year | Total N= 25 | 36 % | 48 % | 16 % | |||
Sex: males N= 1 ; females N= 24 | male | female | male | female | male | female | |
0 % | 38 % | 100 % | 46 % | 0 % | 17 % | ||
Race: White N= 15 ; Non-white N= 10 | white | non-white | white | non-white | white | non-white | |
47 % | 20 % | 40 % | 60 % | 13 % | 20 % |
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
third_year | Total N= 25 | 44 % | 40 % | 16 % | |||
Sex: males N= 1 ; females N= 24 | male | female | male | female | male | female | |
100 % | 42 % | 0 % | 42 % | 0 % | 17 % | ||
Race: White N= 15 ; Non-white N= 10 | white | non-white | white | non-white | white | non-white | |
47 % | 40 % | 40 % | 40 % | 13 % | 20 % |
Chi-square analysis of Performance by Sex and Race considering different years
# Performance cluster vs. sex at birth, first-year students only.
# `%in%` never yields NA, so rows with a missing actual_year are excluded
# exactly as which() excluded them.
# NOTE(review): the summary table lists a single male in this year, so the
# chi-square approximation is likely unreliable; consider an exact test.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "first_year", ],
  "Sex_birth", "clusterLetter",
  "Sex", "N of students", "Performance by Sex 1st year"
)
The Chi-square analysis gives a p= 0.56879
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Performance cluster vs. sex at birth, second-year students only.
# `%in%` never yields NA, so rows with a missing actual_year are excluded
# exactly as which() excluded them.
# NOTE(review): the summary table lists a single male in this year, so the
# chi-square approximation is likely unreliable; consider an exact test.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "second_year", ],
  "Sex_birth", "clusterLetter",
  "Sex", "N of students", "Performance by Sex 2nd year"
)
The Chi-square analysis gives a p= 0.20904
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Performance cluster vs. sex at birth, third-year students only.
# `%in%` never yields NA, so rows with a missing actual_year are excluded
# exactly as which() excluded them.
# NOTE(review): the summary table lists a single male in this year, so the
# chi-square approximation is likely unreliable; consider an exact test.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "third_year", ],
  "Sex_birth", "clusterLetter",
  "Sex", "N of students", "Performance by Sex 3rd year"
)
The Chi-square analysis gives a p= 0.51537
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Performance cluster vs. binary race category, first-year students only.
# `%in%` never yields NA, so rows with a missing actual_year are excluded
# exactly as which() excluded them.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "first_year", ],
  "race_binary", "clusterLetter",
  "Race", "N of students", "Performance by Race 1st year"
)
The Chi-square analysis gives a p= 0.39616
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Performance cluster vs. binary race category, second-year students only.
# `%in%` never yields NA, so rows with a missing actual_year are excluded
# exactly as which() excluded them.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "second_year", ],
  "race_binary", "clusterLetter",
  "Race", "N of students", "Performance by Race 2nd year"
)
The Chi-square analysis gives a p= 0.12451
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Performance cluster vs. binary race category, third-year students only.
# `%in%` never yields NA, so rows with a missing actual_year are excluded
# exactly as which() excluded them.
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "third_year", ],
  "race_binary", "clusterLetter",
  "Race", "N of students", "Performance by Race 3rd year"
)
The Chi-square analysis gives a p= 0.89258
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
We test whether the PLC score differs significantly among the groups defined by “Course collected”, “Student year”, “White/Non-white”, and “Sex at birth”
# ---- Protein Structure survey (umrs5) ----
# Rebuild allBiochem from the Protein_Structure student responses, then pass it
# through addExperts() with exs5 (presumably the matching expert responses;
# confirm where exs5 is defined).
allBiochem = analyzeUMRCourses(umrs5)
allBiochem = addExperts(allBiochem,exs5)
# buildTables() call left commented out.
#buildTables(allBiochem)
# PLC by course; "PLC: Course" and "PLC" look like a plot title and an axis
# label (TODO confirm against plotAndTable's definition).
plotAndTable(allBiochem,"Course_collected","PLC","PLC: Course","PLC")
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | O Chem 1 | 21 | 0.14 | 0.15 | 0.13 | -0.11 | 0.46 | 0.57 |
X12 | O Chem 2 | 21 | 0.20 | 0.20 | 0.22 | -0.18 | 0.62 | 0.80 |
X13 | BiocF22 | 21 | 0.25 | 0.11 | 0.27 | 0.03 | 0.51 | 0.48 |
X14 | Expert | 7 | 0.76 | 0.10 | 0.79 | 0.59 | 0.89 | 0.30 |
Testing statistical significance: p-values | |
---|---|
O Chem 2-O Chem 1 | 0.5529700 |
BiocF22-O Chem 1 | 0.0890982 |
Expert-O Chem 1 | 0.0000000 |
BiocF22-O Chem 2 | 0.7099252 |
Expert-O Chem 2 | 0.0000000 |
Expert-BiocF22 | 0.0000000 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Expert | 7 | 0.76 | 0.10 | 0.79 | 0.59 | 0.89 | 0.30 |
X12 | first_year | 21 | 0.14 | 0.15 | 0.13 | -0.11 | 0.46 | 0.57 |
X13 | second_year | 21 | 0.20 | 0.20 | 0.22 | -0.18 | 0.62 | 0.80 |
X14 | third_year | 21 | 0.25 | 0.11 | 0.27 | 0.03 | 0.51 | 0.48 |
Testing statistical significance: p-values | |
---|---|
first_year-Expert | 0.0000000 |
second_year-Expert | 0.0000000 |
third_year-Expert | 0.0000000 |
second_year-first_year | 0.5529700 |
third_year-first_year | 0.0890982 |
third_year-second_year | 0.7099252 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Non-white | 20 | 0.14 | 0.17 | 0.19 | -0.18 | 0.41 | 0.59 |
X12 | White | 42 | 0.22 | 0.16 | 0.22 | -0.07 | 0.62 | 0.69 |
Testing statistical significance: p-values |
---|
0.0748154 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Female | 59 | 0.20 | 0.16 | 0.22 | -0.18 | 0.62 | 0.80 |
X12 | Male | 3 | -0.01 | 0.06 | -0.04 | -0.05 | 0.07 | 0.11 |
Testing statistical significance: p-values |
---|
0.0288722 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | O Chem 1 | 21 | 0.16 | 0.05 | 0.16 | 0.03 | 0.25 | 0.22 |
X12 | O Chem 2 | 21 | 0.17 | 0.07 | 0.15 | 0.04 | 0.35 | 0.31 |
X13 | BiocF22 | 21 | 0.18 | 0.04 | 0.18 | 0.08 | 0.29 | 0.21 |
X14 | Expert | 7 | 0.35 | 0.08 | 0.35 | 0.24 | 0.44 | 0.21 |
Testing statistical significance: p-values | |
---|---|
O Chem 2-O Chem 1 | 0.9395887 |
BiocF22-O Chem 1 | 0.6126707 |
Expert-O Chem 1 | 0.0000000 |
BiocF22-O Chem 2 | 0.9143450 |
Expert-O Chem 2 | 0.0000000 |
Expert-BiocF22 | 0.0000001 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Expert | 7 | 0.35 | 0.08 | 0.35 | 0.24 | 0.44 | 0.21 |
X12 | first_year | 21 | 0.16 | 0.05 | 0.16 | 0.03 | 0.25 | 0.22 |
X13 | second_year | 21 | 0.17 | 0.07 | 0.15 | 0.04 | 0.35 | 0.31 |
X14 | third_year | 21 | 0.18 | 0.04 | 0.18 | 0.08 | 0.29 | 0.21 |
Testing statistical significance: p-values | |
---|---|
first_year-Expert | 0.0000000 |
second_year-Expert | 0.0000000 |
third_year-Expert | 0.0000001 |
second_year-first_year | 0.9395887 |
third_year-first_year | 0.6126707 |
third_year-second_year | 0.9143450 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Non-white | 20 | 0.17 | 0.06 | 0.17 | 0.04 | 0.28 | 0.24 |
X12 | White | 42 | 0.17 | 0.06 | 0.17 | 0.03 | 0.35 | 0.31 |
Testing statistical significance: p-values |
---|
0.8978509 |
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Female | 59 | 0.17 | 0.06 | 0.17 | 0.03 | 0.35 | 0.31 |
X12 | Male | 3 | 0.12 | 0.03 | 0.12 | 0.09 | 0.14 | 0.05 |
Testing statistical significance: p-values |
---|
0.1400967 |
The problem with clustering is that it is an iterative method, and different “initial seeds” will yield different results. The results are reproducible only when a fixed seed is set before running k-means, here “set.seed(42)”.
group1 | n | mean | sd | median | min | max | range | |
---|---|---|---|---|---|---|---|---|
X11 | Expert | 7 | 0.76 | 0.10 | 0.79 | 0.59 | 0.89 | 0.30 |
X12 | HP | 14 | 0.35 | 0.13 | 0.32 | 0.11 | 0.62 | 0.51 |
X13 | IP | 30 | 0.25 | 0.08 | 0.24 | 0.13 | 0.46 | 0.33 |
X14 | LP | 19 | 0.00 | 0.09 | 0.02 | -0.18 | 0.11 | 0.29 |
Testing statistical significance: p-values | |
---|---|
HP-Expert | 0.0000000 |
IP-Expert | 0.0000000 |
LP-Expert | 0.0000000 |
IP-HP | 0.0108399 |
LP-HP | 0.0000000 |
LP-IP | 0.0000000 |
# clusterLetter against Course_collected over every row of allBiochem.
plotBarAndCorr(
  allBiochem,
  "Course_collected", "clusterLetter",
  "Course", "N of students",
  "High, Intermediate, Low Performance cluster"
)
The Chi-square analysis gives a p= 0.11599
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Scatter of NS against PLC, one marker per row of allBiochem: the marker is
# taken from clusterLetter and the colour index encodes Course_collected.
courseFactor = as.factor(allBiochem$Course_collected)
markerIntegers = as.integer(courseFactor)
plot(allBiochem$PLC,allBiochem$NS,pch=allBiochem$clusterLetter,main = "Protein Structure - High(H), Intermediate(I), Low(L) performers",ylab="NS",xlab="PLC",col=markerIntegers)
# Legend labels and colours both come from the factor levels, so level i is
# always shown with colour i (no reliance on two unique() calls agreeing).
# A filled point swatch (pch=15) is drawn instead of a line sample because the
# plot contains markers only, no lines.
legend("topleft", legend=levels(courseFactor), col=seq_along(levels(courseFactor)), pch=15, cex=0.8)
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
O Chem 2 | Total N= 21 | 33 % | 33 % | 33 % | |||
Sex: males N= 1 ; females N= 19 | male | female | male | female | male | female | |
0 % | 37 % | 0 % | 32 % | 100 % | 32 % | ||
Race: White N= 14 ; Non-white N= 7 | white | non-white | white | non-white | white | non-white | |
29 % | 43 % | 43 % | 14 % | 29 % | 43 % |
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
O Chem 1 | Total N= 21 | 14 % | 43 % | 43 % | |||
Sex: males N= 1 ; females N= 20 | male | female | male | female | male | female | |
0 % | 15 % | 0 % | 45 % | 100 % | 40 % | ||
Race: White N= 14 ; Non-white N= 7 | white | non-white | white | non-white | white | non-white | |
14 % | 14 % | 50 % | 29 % | 36 % | 57 % |
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
BiocF22 | Total N= 21 | 19 % | 67 % | 14 % | |||
Sex: males N= 1 ; females N= 20 | male | female | male | female | male | female | |
0 % | 20 % | 0 % | 70 % | 100 % | 10 % | ||
Race: White N= 14 ; Non-white N= 7 | white | non-white | white | non-white | white | non-white | |
21 % | 14 % | 71 % | 57 % | 7.1 % | 29 % |
# clusterLetter against actual_year over every row of allBiochem.
plotBarAndCorr(
  allBiochem,
  "actual_year", "clusterLetter",
  "Year", "N of students",
  "High, Intermediate, Low Performance cluster"
)
The Chi-square analysis gives a p= 0.11599
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# Scatter of NS against PLC, one marker per row of allBiochem: the marker is
# taken from clusterLetter and the colour index encodes actual_year.
yearFactor = as.factor(allBiochem$actual_year)
markerIntegers = as.integer(yearFactor)
plot(allBiochem$PLC,allBiochem$NS,pch=allBiochem$clusterLetter,main = "Protein Structure - High(H), Intermediate(I), Low(L) performers",ylab="NS",xlab="PLC",col=markerIntegers)
# Legend labels and colours both come from the factor levels, so level i is
# always shown with colour i (no reliance on two unique() calls agreeing).
# A filled point swatch (pch=15) is drawn instead of a line sample because the
# plot contains markers only, no lines.
legend("topleft", legend=levels(yearFactor), col=seq_along(levels(yearFactor)), pch=15, cex=0.8)
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
second_year | Total N= 21 | 33 % | 33 % | 33 % | |||
Sex: males N= 1 ; females N= 19 | male | female | male | female | male | female | |
0 % | 37 % | 0 % | 32 % | 100 % | 32 % | ||
Race: White N= 14 ; Non-white N= 7 | white | non-white | white | non-white | white | non-white | |
29 % | 43 % | 43 % | 14 % | 29 % | 43 % |
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
first_year | Total N= 21 | 14 % | 43 % | 43 % | |||
Sex: males N= 1 ; females N= 20 | male | female | male | female | male | female | |
0 % | 15 % | 0 % | 45 % | 100 % | 40 % | ||
Race: White N= 14 ; Non-white N= 7 | white | non-white | white | non-white | white | non-white | |
14 % | 14 % | 50 % | 29 % | 36 % | 57 % |
High Performers | Intermediate Performers | Low Performers | |||||
---|---|---|---|---|---|---|---|
third_year | Total N= 21 | 19 % | 67 % | 14 % | |||
Sex: males N= 1 ; females N= 20 | male | female | male | female | male | female | |
0 % | 20 % | 0 % | 70 % | 100 % | 10 % | ||
Race: White N= 14 ; Non-white N= 7 | white | non-white | white | non-white | white | non-white | |
21 % | 14 % | 71 % | 57 % | 7.1 % | 29 % |
Chi-square analysis of Performance by Sex and Race considering different years
# clusterLetter against Sex_birth, restricted to rows whose actual_year is
# "first_year" (%in% never yields NA, so NA years are dropped as before).
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "first_year", ],
  "Sex_birth", "clusterLetter",
  "Sex", "N of students", "Performance by Sex 1st year"
)
The Chi-square analysis gives a p= 0.49659
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# clusterLetter against Sex_birth, restricted to rows whose actual_year is
# "second_year" (%in% never yields NA, so NA years are dropped as before).
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "second_year", ],
  "Sex_birth", "clusterLetter",
  "Sex", "N of students", "Performance by Sex 2nd year"
)
The Chi-square analysis gives a p= 0.37627
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# clusterLetter against Sex_birth, restricted to rows whose actual_year is
# "third_year" (%in% never yields NA, so NA years are dropped as before).
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "third_year", ],
  "Sex_birth", "clusterLetter",
  "Sex", "N of students", "Performance by Sex 3rd year"
)
The Chi-square analysis gives a p= 0.04285
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# clusterLetter against race_binary, restricted to rows whose actual_year is
# "first_year" (%in% never yields NA, so NA years are dropped as before).
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "first_year", ],
  "race_binary", "clusterLetter",
  "Race", "N of students", "Performance by Race 1st year"
)
The Chi-square analysis gives a p= 0.60653
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# clusterLetter against race_binary, restricted to rows whose actual_year is
# "second_year" (%in% never yields NA, so NA years are dropped as before).
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "second_year", ],
  "race_binary", "clusterLetter",
  "Race", "N of students", "Performance by Race 2nd year"
)
The Chi-square analysis gives a p= 0.42437
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected
# clusterLetter against race_binary, restricted to rows whose actual_year is
# "third_year" (%in% never yields NA, so NA years are dropped as before).
plotBarAndCorr(
  allBiochem[allBiochem$actual_year %in% "third_year", ],
  "race_binary", "clusterLetter",
  "Race", "N of students", "Performance by Race 3rd year"
)
The Chi-square analysis gives a p= 0.41316
Residuals analysis:
A negative residual implies that the measured value is lower than expected and a positive value higher than expected