data source:
load("./in/kidney/sc_kidney.Rdata")
Idents(sc_kidney) %>% table()
We run searchMarker 5 times.
t.Star.kidney.new #1.707269 1.713925 1.333312 1.607241 1.359588 1.593838 1.288894 1.553189 1.451442 1.462615
t.Star.kidney.all.new #2.804183 2.418100 2.543570 2.402248 2.409476 2.529940 2.426745 2.481272 2.574774 2.425738
we use dotplot to show the marker genes found by starTracer.
out.Start.kidney <- starTracer::searchMarker(sc_kidney,thresh.1 = 0.5,thresh.2 = 0.3,method = "del_MI",num = 3,gene.use = "HVG")
(p <- use_dotplot(sc_kidney,features = out.Start.kidney$genes.markers, colors.gradient = c("#018daa","#ffffff","#ed463a")))
We test the run time 10 times. The 11th time’s result will be stored.
t.Seurat <- c()
for(i in 1:11){
if(i <= 10){
print(paste0("calculating time:",i))
t1 <- Sys.time()
FindAllMarkers(sc_kidney,logfc.threshold = 0.5,only.pos = T)
t2 <- Sys.time()
t <- t2-t1 #10.99338/10.1306 secs 12.34614/12.43914/11.35643 for HVG
t.Seurat <- append(t.Seurat,t)
} else {
out.kidney.Seurat <- FindAllMarkers(sc_kidney,logfc.threshold = 0.5,only.pos = T)
}
}
load
load("./out/f3_kidney/t.Seurat.kidney.Rdata") #name: t.Seurat.kidney
load("./out/f3_kidney/markers.Seurat.Rdata") #name: out.kidney.Seurat
The calculation time:
t.Seurat.kidney
t.Star.kidney
plot of Seurat Result
time.1 <- as.numeric(t.Seurat.kidney)
time.2 <- as.numeric(t.Star.kidney.new)
time.3 <- as.numeric(t.Star.kidney.all.new)
frame.time <- data.frame(starTracer.all = time.3, starTracer = time.2, Seurat = time.1)
frame.time$RunTime <- 1:10 %>% as.vector()
mat.use <- reshape2::melt(frame.time,id = "RunTime")
comparisons <- list(c("starTracer.all","starTracer"), c("starTracer.all", "Seurat"),
c("starTracer", "Seurat"))
(p <- ggbarplot(mat.use, x = "variable", y = "value", add = c("mean_se"),color = "variable",fill = "white",palette = c("#e3b802","#fcdc57","blue"),stat = "identity",
position = position_dodge(0.8)) + stat_compare_means(method = "t.test",comparisons = comparisons) + theme(axis.text.x = element_text(angle = 45,hjust = 1)))
show the results:
(fc1 <- mean(time.1)/mean(time.2)) #38.2036
(fc2 <- mean(time.1)/mean(time.3)) #17.91526
t.test(time.1,time.2)$p.value #2.192067e-23
t.test(time.1,time.3)$p.value #6.118015e-22
t.test(time.2,time.3)$p.value #4.3955e-16
Same as previous chapters, we will find the top 5 marker genes for specificity test. Specificity is defined as:
\[ T_i=\frac{\mathrm{card}\ \left(\mathrm{G}_\mathrm{i}\mathrm{\cap}\ \mathrm{G}_{\mathrm{clust}}\right)}{\mathrm{card}\left(\mathrm{G}_\mathrm{i}\right)} \]
Where:
Gi represents the set of the cells expressing gene i in-silico.
Gclust represents the set of cells where gene i serves as the in-silico marker gene.
Please refer to our original research article to see more details.
calculating specificity level defined by Ti might be time consuming
especially for large data set. Ti is calculated by function
cal_spe
. Please refer to “function.R
” for more
details.
frame.star <- subset(out.Start.kidney.5$para_frame,out.Start.kidney.5$para_frame$gene %in% out.Start.kidney.5$genes.markers)[,c("max.X","gene")]
colnames(frame.star)[1] <- "cluster"
#Idents(snRNA.hpfc.sct)
frame.star <- cal_spe(sc_kidney,ident.use = "cell_type__ontology_label",frame.star)
frame.star %>% head()
Generating the frame of Seurat is the same as before:
gene.seurat <- subset(out.kidney.Seurat,out.kidney.Seurat$p_val_adj < 0.05)
gene.seurat <- arrange(gene.seurat,cluster,desc(avg_log2FC))
gene.seurat <- with(gene.seurat,by(gene.seurat[,"gene"],INDICES = cluster,FUN = function(x)head(x,10))) %>% unlist() %>% as.vector() %>% rev()
gene.seurat %>% head()
We then re-arrange the marker gene data.frame. As for the genes that occurs in more than one cluster, we will only keep it in the cluster that has larger avg_log2FC.
frame.seurat <- out.kidney.Seurat[,c("cluster","gene","avg_log2FC")] %>% subset(out.kidney.Seurat$gene %in% gene.seurat)
frame.seurat <- arrange(frame.seurat,gene,desc(avg_log2FC))
frame.seurat <- frame.seurat[!duplicated(frame.seurat$gene),] %>% arrange(cluster,desc(avg_log2FC))
frame.seurat <- frame.seurat %>%
group_by(cluster) %>%
top_n(5, wt = avg_log2FC) %>%
ungroup()
frame.seurat$cluster %>% table()
#this is for plotting
frame.seurat.p <- frame.seurat %>%
group_by(cluster) %>%
top_n(3, wt = avg_log2FC) %>%
ungroup()
frame.seurat.p$cluster %>% table() %>% head()
plot
(p <- use_dotplot(sc_kidney,features = rev(frame.seurat.p$gene), colors.gradient = c("#13c4cd","#ffffff","#b91192"),dot.scale = 3))
frame.seurat <- frame.seurat[,c("cluster","gene")]
frame.seurat <- cal_spe(sc_kidney,ident.use = "cell_type__ontology_label",as.data.frame(frame.seurat))
frame.seurat %>% head()
frame.star <- subset(frame.star,frame.star$cluster %in% frame.seurat$cluster)
frame.star %>% head()
identical(frame.star$cluster,frame.seurat$cluster)
frame.pct.pos <- data.frame(cluster = frame.star$cluster,pct.star = frame.star$pct.pos,pct.seurat = frame.seurat$pct.pos)
#cal seurat.mean
frame.pct.pos$pct.seurat.mean <- rep(aggregate(pct.seurat ~ cluster, data = frame.pct.pos, FUN = mean)$pct.seurat,each = 5)
#divide seurat.mean to get star.adk and seurat.adj
frame.pct.pos$pct.seurat.adj <- frame.pct.pos$pct.seurat/frame.pct.pos$pct.seurat.mean
frame.pct.pos$pct.star.adj <- frame.pct.pos$pct.star/frame.pct.pos$pct.seurat.mean
#cal star.adj.mean to arrange the plot
frame.pct.pos$pct.star.adj.mean <- rep(aggregate(pct.star.adj ~ cluster, data = frame.pct.pos, FUN = mean)$pct.star.adj,each = 5)
#arrange
frame.pct.pos <- arrange(frame.pct.pos,desc(pct.star.adj.mean))
frame.pct.pos$cluster <- factor(frame.pct.pos$cluster,levels = unique(frame.pct.pos$cluster))
p <- c()
foldchange <- c()
for (i in unique(frame.pct.pos$cluster)){
mat <- subset(frame.pct.pos,cluster == i)
t1 <- mat$pct.star
t2 <- mat$pct.seurat
p.i <- t.test(t1,t2)$p.value
foldchange.i <- mean(mat$pct.star)/mean(mat$pct.seurat)
foldchange <- append(foldchange,foldchange.i)
p <- append(p,p.i)
}
frame.pct.pos.stat <- data.frame(cluster = unique(frame.pct.pos$cluster),p = p, foldchange = foldchange)
visulize
mat.use <- reshape2::melt(frame.pct.pos[,c(1,5,6)])
(p <- ggbarplot(mat.use, x = "cluster", y = "value", add = c("mean_se"),color = "variable",fill = "white",palette = c("#fcdc57","blue"),
position = position_dodge(0.8)) + stat_compare_means(aes(group = variable), label = "p.signif") + theme(axis.text.x = element_text(angle = 45,hjust = 1)))