1 Load data

data source:

Kidney single cell data

load("./in/kidney/sc_kidney.Rdata")
Idents(sc_kidney) %>% table()

2 Run Time Test

2.1 starTracer Running Time Test

run time test

We run searchMarker 5 times.

t.Star.kidney.new #1.707269 1.713925 1.333312 1.607241 1.359588 1.593838 1.288894 1.553189 1.451442 1.462615
t.Star.kidney.all.new #2.804183 2.418100 2.543570 2.402248 2.409476 2.529940 2.426745 2.481272 2.574774 2.425738

visualization

we use dotplot to show the marker genes found by starTracer.

out.Start.kidney <- starTracer::searchMarker(sc_kidney,thresh.1 = 0.5,thresh.2 = 0.3,method = "del_MI",num = 3,gene.use = "HVG")
(p <- use_dotplot(sc_kidney,features = out.Start.kidney$genes.markers, colors.gradient = c("#018daa","#ffffff","#ed463a")))

2.2 Seurat Running Time Test

We test the run time 10 times. The 11th time’s result will be stored.

t.Seurat <- c()
for(i in 1:11){
  if(i <= 10){
    print(paste0("calculating time:",i))
    t1 <- Sys.time()
    FindAllMarkers(sc_kidney,logfc.threshold = 0.5,only.pos = T)
    t2 <- Sys.time()
    t <- t2-t1 #10.99338/10.1306 secs 12.34614/12.43914/11.35643 for HVG
    t.Seurat <- append(t.Seurat,t)
  } else {
    out.kidney.Seurat <- FindAllMarkers(sc_kidney,logfc.threshold = 0.5,only.pos = T)
  }
}

load

load("./out/f3_kidney/t.Seurat.kidney.Rdata") #name: t.Seurat.kidney
load("./out/f3_kidney/markers.Seurat.Rdata") #name: out.kidney.Seurat

The calculation time:

t.Seurat.kidney
t.Star.kidney

plot of Seurat Result

2.3 Difference of calculation time of Seurat and starTracer

time.1 <- as.numeric(t.Seurat.kidney)
time.2 <- as.numeric(t.Star.kidney.new)
time.3 <- as.numeric(t.Star.kidney.all.new)
frame.time <- data.frame(starTracer.all = time.3, starTracer = time.2, Seurat = time.1)
frame.time$RunTime <- 1:10 %>% as.vector()
mat.use <- reshape2::melt(frame.time,id = "RunTime")

comparisons <- list(c("starTracer.all","starTracer"), c("starTracer.all", "Seurat"),
                       c("starTracer", "Seurat"))

(p <- ggbarplot(mat.use, x = "variable", y = "value", add = c("mean_se"),color = "variable",fill = "white",palette = c("#e3b802","#fcdc57","blue"),stat = "identity",
          position = position_dodge(0.8)) + stat_compare_means(method = "t.test",comparisons = comparisons) + theme(axis.text.x = element_text(angle = 45,hjust = 1)))

show the results:

(fc1 <- mean(time.1)/mean(time.2)) #38.2036
(fc2 <- mean(time.1)/mean(time.3)) #17.91526

t.test(time.1,time.2)$p.value #2.192067e-23

t.test(time.1,time.3)$p.value #6.118015e-22

t.test(time.2,time.3)$p.value #4.3955e-16

3 Specificity

3.1 Specificity of the Top5 marker genes of starTracer

Calculate the top5 marker genes for Ti calculation

Same as previous chapters, we will find the top 5 marker genes for specificity test. Specificity is defined as:

\[ T_i=\frac{\mathrm{card}\ \left(\mathrm{G}_\mathrm{i}\mathrm{\cap}\ \mathrm{G}_{\mathrm{clust}}\right)}{\mathrm{card}\left(\mathrm{G}_\mathrm{i}\right)} \]

Where:

Gi represents the set of the cells expressing gene i in-silico.

Gclust represents the set of cells where gene i serves as the in-silico marker gene.

Please refer to our original research article to see more details.

Calculate specificity measured by Ti

calculating specificity level defined by Ti might be time consuming especially for large data set. Ti is calculated by function cal_spe. Please refer to “function.R” for more details.

frame.star <- subset(out.Start.kidney.5$para_frame,out.Start.kidney.5$para_frame$gene %in% out.Start.kidney.5$genes.markers)[,c("max.X","gene")]
colnames(frame.star)[1] <- "cluster"

#Idents(snRNA.hpfc.sct)

frame.star <- cal_spe(sc_kidney,ident.use = "cell_type__ontology_label",frame.star)
frame.star %>% head()

3.2 Frame of Seurat

Generating the frame of Seurat is the same as before:

  1. Find marker genes using FindAllMarkers function.
  2. Subset the matrix by setting p-value < 0.05
  3. re-arrange the data.frame by cluster and descending average log2(Fold-Change)
  4. Keep the top 10 marker gene in each cluster
gene.seurat <- subset(out.kidney.Seurat,out.kidney.Seurat$p_val_adj < 0.05)
gene.seurat <- arrange(gene.seurat,cluster,desc(avg_log2FC))
gene.seurat <- with(gene.seurat,by(gene.seurat[,"gene"],INDICES = cluster,FUN = function(x)head(x,10))) %>% unlist() %>% as.vector() %>% rev()

gene.seurat %>% head()

We then re-arrange the marker gene data.frame. As for the genes that occurs in more than one cluster, we will only keep it in the cluster that has larger avg_log2FC.

frame.seurat <- out.kidney.Seurat[,c("cluster","gene","avg_log2FC")] %>% subset(out.kidney.Seurat$gene %in% gene.seurat)
frame.seurat <- arrange(frame.seurat,gene,desc(avg_log2FC))
frame.seurat <- frame.seurat[!duplicated(frame.seurat$gene),] %>% arrange(cluster,desc(avg_log2FC))

frame.seurat <- frame.seurat %>%
  group_by(cluster) %>% 
  top_n(5, wt = avg_log2FC) %>% 
  ungroup()
frame.seurat$cluster %>% table() 

#this is for plotting
frame.seurat.p <- frame.seurat %>%
  group_by(cluster) %>% 
  top_n(3, wt = avg_log2FC) %>% 
  ungroup()
frame.seurat.p$cluster %>% table() %>% head()

plot

(p <- use_dotplot(sc_kidney,features = rev(frame.seurat.p$gene), colors.gradient = c("#13c4cd","#ffffff","#b91192"),dot.scale = 3))

3.3 The Specificity Level Measured by Ti Among Two Softwares

frame.seurat <- frame.seurat[,c("cluster","gene")]
frame.seurat <- cal_spe(sc_kidney,ident.use = "cell_type__ontology_label",as.data.frame(frame.seurat))
frame.seurat %>% head() 
frame.star <- subset(frame.star,frame.star$cluster %in% frame.seurat$cluster)
frame.star %>% head()
identical(frame.star$cluster,frame.seurat$cluster)
frame.pct.pos <- data.frame(cluster = frame.star$cluster,pct.star = frame.star$pct.pos,pct.seurat = frame.seurat$pct.pos)

#cal seurat.mean
frame.pct.pos$pct.seurat.mean <- rep(aggregate(pct.seurat ~ cluster, data = frame.pct.pos, FUN = mean)$pct.seurat,each = 5)

#divide seurat.mean to get star.adk and seurat.adj
frame.pct.pos$pct.seurat.adj <- frame.pct.pos$pct.seurat/frame.pct.pos$pct.seurat.mean
frame.pct.pos$pct.star.adj <- frame.pct.pos$pct.star/frame.pct.pos$pct.seurat.mean

#cal star.adj.mean to arrange the plot
frame.pct.pos$pct.star.adj.mean <- rep(aggregate(pct.star.adj ~ cluster, data = frame.pct.pos, FUN = mean)$pct.star.adj,each = 5)

#arrange
frame.pct.pos <- arrange(frame.pct.pos,desc(pct.star.adj.mean))
frame.pct.pos$cluster <- factor(frame.pct.pos$cluster,levels = unique(frame.pct.pos$cluster))

p <- c()
foldchange <- c()
for (i in unique(frame.pct.pos$cluster)){
  mat <- subset(frame.pct.pos,cluster == i)
  t1 <- mat$pct.star
  t2 <- mat$pct.seurat
  p.i <- t.test(t1,t2)$p.value
  foldchange.i <- mean(mat$pct.star)/mean(mat$pct.seurat)

  foldchange <- append(foldchange,foldchange.i)
  p <- append(p,p.i)
}
frame.pct.pos.stat <- data.frame(cluster = unique(frame.pct.pos$cluster),p = p, foldchange = foldchange)

visulize

mat.use <- reshape2::melt(frame.pct.pos[,c(1,5,6)])
(p <- ggbarplot(mat.use, x = "cluster", y = "value", add = c("mean_se"),color = "variable",fill = "white",palette = c("#fcdc57","blue"),
          position = position_dodge(0.8)) + stat_compare_means(aes(group = variable), label = "p.signif") + theme(axis.text.x = element_text(angle = 45,hjust = 1)))