1 Load Seurat Object

Here we run the same test again on the human heart sample. A single-cell sequencing data containing aroung 600,000 cells. We will test the performance of starTracer on the big sample at the same time.

data source:

heart_single_cell_data

Check the number of cells:

dim(heart_big)

2 Run Time Test

2.1 starTracer Running Time Test

We run searchMarker 5 times.

t.Star.heart.new <- c()
for(i in 1:5){
  print(paste0("calculating time:",i))
  t1 <- Sys.time()
  starTracer::searchMarker(heart_big,thresh.1 = 0.5,thresh.2 = 0.3,method = "del_MI",num = 3,gene.use = "HVG")
  t2 <- Sys.time()
  t <- t2-t1
  t.Star.heart.new <- append(t.Star.heart.new,t)
}

t.Star.heart.all.new <- c()
for(i in 1:5){
  print(paste0("calculating time:",i))
  t1 <- Sys.time()
  starTracer::searchMarker(heart_big,thresh.1 = 0.5,thresh.2 = 0.3,method = "del_MI",num = 3)
  t2 <- Sys.time()
  t <- t2-t1
  t.Star.heart.all.new <- append(t.Star.heart.all.new,t)
}

Let’s see the results.

t.Star.heart.new #42.43307 44.13184 43.95437 39.74770 40.41655
## Time differences in secs
## [1] 35.68560 36.31657 32.68245 35.20165 32.84306
t.Star.heart.all.new
## Time differences in secs
## [1] 40.09263 36.30915 38.73241 37.02653 38.12459

visualization

We set thresh.1 to 0.5, thresh.2 to 0.3 and num to 3 to plot the top 3 marker genes.

Idents(heart_big) <- heart_big$cell_type__ontology_label
test.heart.big <- starTracer::searchMarker(heart_big,thresh.1 = 0.5,thresh.2 = 0.3,num = 3,gene.use = "HVG")

Dotplot:

(p <- use_dotplot(heart_big,features = test.heart.big$genes.markers, colors.gradient = c("#018daa","#ffffff","#ed463a")))

2.2 Seurat Running Time Test

This scripts takes around 30 hours for 5 runs.

print("loading data")
load("./tmp/heart_big.Rdata")
print("loading data complete!")

print("setting...")
DefaultAssay(heart_big) <- "RNA"
Idents(heart_big) <- "cell_type__ontology_label"

print("running")
t.Seurat.heart <- c()
for(i in 1:5){
  t1 <- Sys.time()
  FindAllMarkers(heart_big,logfc.threshold = 0.5,only.pos = T)
  t2 <- Sys.time()
  t <- t2-t1 #10.99338/10.1306 secs 12.34614/12.43914/11.35643 for HVG
  t.Seurat <- append(t.Seurat,t)
}
save(t.Seurat.heart,file = "./out/f2_bigdata/t.Seurat.heart.Rdata")
print(t.Seurat.heart)

load the file. Each calculation takes around 6 hours.

load("./out/f2_bigdata/t.Seurat.heart.Rdata")
t.Seurat.heart

#Time differences in hours
#[1] 6.424730 6.346148 6.346987 6.353355 6.302240

2.3 Find Difference

We use bar plot to show the difference of calculation time between starTracer and Seurat.

time.1 <- as.numeric(t.Seurat.heart) * 60
time.2 <- as.numeric(t.Star.heart.new)/60
time.3 <- as.numeric(t.Star.heart.all.new)/60
frame.time <- data.frame(starTracer.all = time.3, starTracer = time.2, Seurat = time.1)
frame.time$RunTime <- 1:5 %>% as.vector()
mat.use <- reshape2::melt(frame.time,id = "RunTime")

comparisons <- list(c("starTracer.all","starTracer"), c("starTracer.all", "Seurat"),
                       c("starTracer", "Seurat"))

(p <- ggbarplot(mat.use, x = "variable", y = "value", add = c("mean_se"),color = "variable",fill = "white",palette = c("#e3b802","#fcdc57","blue"),stat = "identity",
          position = position_dodge(0.8)) + stat_compare_means(method = "t.test",comparisons = comparisons) + theme(axis.text.x = element_text(angle = 45,hjust = 1)))

Calculate the fold-change:

(fc1 <- mean(time.1)/mean(time.2)) #582.5597
(fc2 <- mean(time.1)/mean(time.3)) #578.5956

t.test(time.1,time.2)$p.value # 5.619788e-10

t.test(time.1,time.3)$p.value #5.617958e-10

t.test(time.2,time.3)$p.value # 0.4741188

3 Calculation Time Using Samples with Different Sample Size

We generate a sequence of Seurat Object containing 100%, 90%, 80%…10% of total cells to run the test.

3.1 Test on starTracer

Samples are generated by using function sample

test.size <- function(){
  t.sample.size <- c()
  n.sample.size <- c()
  for(i in seq(0.1,1,by = 0.1)){
  
  message(i)
  
  n.total <- ncol(heart_big) 
  n.subset <- round(n.total*i)
  
  set.seed(1222)
  cells.subset <- sample(1:n.total, size = n.subset, replace = FALSE) %>% sort()
  
  seurat.subset <- heart_big[,cells.subset]
  table(seurat.subset$cell_type__ontology_label)
  
  t1 <- Sys.time()
  starTracer::searchMarker(seurat.subset,thresh.1 = 0.5,thresh.2 = 0.5,method = "del_MI",num = 3,gene.use = "HVG")
  t2 <- Sys.time()
  t <- t2 - t1
  
  t.sample.size <- append(t.sample.size,t)
  n.sample.size <- append(n.sample.size,n.subset)
  
  rm(seurat.subset);gc()
  }
  sample.size <- data.frame(number = n.sample.size, time = t.sample.size) 
  return(sample.size)
}
sample.size <- test.size()
colnames(sample.size)[2] <- "time.1"
sample.size.1 <- sample.size

sample.size <- test.size()
colnames(sample.size)[2] <- "time.2"
sample.size.2 <- sample.size

sample.size <- test.size()
colnames(sample.size)[2] <- "time.3"
sample.size.3 <- sample.size

sample.size <- inner_join(sample.size.1, sample.size.2, by = "number") %>% inner_join(sample.size.3, by = "number")

sample.size$cell.pct <- seq(0.1,1,by = 0.1)
sample.size

3.2 Test on Seurat

The script of calculation of Seurat is not included in this script. Separated R scripts are stored in ./heart10_2_100, denoted as heart_*.R .

#nohup Rscript ./heart10_2_100/heart_10.R > pct.10.log 2>&1 &
#nohup Rscript ./heart10_2_100/heart_10.R > pct.20.log 2>&1 &
#nohup Rscript ./heart10_2_100/heart_10.R > pct.30.log 2>&1 &
#nohup Rscript ./heart10_2_100/heart_10.R > pct.40.log 2>&1 &
#nohup Rscript ./heart10_2_100/heart_10.R > pct.50.log 2>&1 &
#nohup Rscript ./heart10_2_100/heart_10.R > pct.60.log 2>&1 &
#nohup Rscript ./heart10_2_100/heart_10.R > pct.70.log 2>&1 &
#nohup Rscript ./heart10_2_100/heart_10.R > pct.80.log 2>&1 &
#nohup Rscript ./heart10_2_100/heart_10.R > pct.90.log 2>&1 &
#nohup Rscript ./heart10_2_100/heart_10.R > pct.100.log 2>&1 &

All the results are saved as an Rdata. The name of the variable will be sample.size.seurat

Here are all the calculation time of Seurat. The column number represent the cells in the subset Seurat object. time.1, time.2 and time.3 represents the time Seurat spends calculating the marker genes from the subset object.

sample.size.seurat

3.3 Load the Running Time Data from Seurat and starTracer

We load the results from 3.1 and 3.2, to make a comparison.

The results shows a significant difference in the calculation time between starTracer and Seurat.

p <- c()
foldchange <- c()
for (i in unique(time.seurat.star.size$cell.pct)){
  mat.i <- subset(time.seurat.star.size,cell.pct == i)
  t1 <- mat.i$time.star
  t2 <- mat.i$time.seurat
  p.i <- t.test(t1,t2)$p.value
  foldchange.i <- mean(mat.i$time.seurat)/mean(mat.i$time.star)

  foldchange <- append(foldchange,foldchange.i)
  p <- append(p,p.i)
}
frame.time.size <- data.frame(cell.pct = unique(time.seurat.star.size$cell.pct),p = p, foldchange = foldchange)
frame.time.size

Plot the graph of calculation time.

mat <- time.seurat.star.size
mat <- mat[,c(2,3,4,5)] %>% reshape2::melt(id.var = c("cell.pct","variable"))
mat$value <- mat$value/60
colnames(mat)[2:3] <- c("time","method")

mat.use <- Rmisc::summarySE(mat, measurevar = "value", groupvar = c("cell.pct","method"))

(p <- ggplot(data = mat.use,aes(x = cell.pct, y = value, color = method, group = method)) + 
  #facet_grid(method~.) +
  geom_errorbar(aes(ymin = value - se, ymax = value + se), width = 0.02) + 
  geom_line() + geom_point() + 
  scale_color_manual(values = c("blue","#fcdc57")) + theme_bw() + ylab("time(minutes)"))

4 Calulate the Specificity of Marker Genes

Same as in brain.Rmd, we will run the seachMarker function again. This time we choose to get 5 marker genes in each group. This result will be used to calculate Ti in each cluster.

Ti is defined as

\[ T_i=\frac{\mathrm{card}\ \left(\mathrm{G}_\mathrm{i}\mathrm{\cap}\ \mathrm{G}_{\mathrm{clust}}\right)}{\mathrm{card}\left(\mathrm{G}_\mathrm{i}\right)} \]

Where:

Gi represents the set of the cells expressing gene i in-silico.

Gclust represents the set of cells where gene i serves as the in-silico marker gene.

4.1 Specificity of the Marker Genes Found by starTracer

frame.star <- starTracer::searchMarker(heart_big,thresh.1 = 0.5,thresh.2 = 0.3,num = 5,gene.use = "HVG")
frame.star <- subset(frame.star$para_frame,frame.star$para_frame$gene %in% frame.star$genes.markers)[,c("max.X","gene")]
colnames(frame.star)[1] <- "cluster"
frame.star <- cal_spe(heart_big,ident.use = "cell_type__ontology_label",frame.star)

Let’s see frame.star . A new column named pct.pos is added into the data.frame

frame.star %>% head()

4.2 Specificity of the Marker Genes Found by Seurat

We run heart_bigdata_findAllMArkers in the command line and load the results into R.

load("./out/f2_bigdata/heart_big.Seurat.Rdata")

Generating the frame of Seurat:

  1. Find marker genes using FindAllMarkers function.
  2. Subset the matrix by setting p-value < 0.05
  3. re-arrange the data.frame by cluster and descending average log2(Fold-Change)
  4. Keep the top 10 marker gene in each cluster
gene.seurat <- subset(heart_big.Seurat,heart_big.Seurat$p_val_adj < 0.05)
gene.seurat <- arrange(gene.seurat,cluster,desc(avg_log2FC))
gene.seurat <- with(gene.seurat,by(gene.seurat[,"gene"],INDICES = cluster,FUN = function(x)head(x,10))) %>% unlist() %>% as.vector() %>% rev()
frame.seurat <- heart_big.Seurat[,c("cluster","gene","avg_log2FC")] %>% subset(heart_big.Seurat$gene %in% gene.seurat)
frame.seurat <- arrange(frame.seurat,gene,desc(avg_log2FC))
frame.seurat <- frame.seurat[!duplicated(frame.seurat$gene),] %>% arrange(cluster,desc(avg_log2FC))

frame.seurat <- frame.seurat %>%
  group_by(cluster) %>% #按照cluster列进行分组
  top_n(5, wt = avg_log2FC) %>% #按照FC列从大到小排序,选择每组前2行
  ungroup()
frame.seurat$cluster %>% table()

#this is used to draw graphs
frame.seurat.p <- frame.seurat %>%
  group_by(cluster) %>% 
  top_n(3, wt = avg_log2FC) %>% 
  ungroup()
frame.seurat.p$cluster %>% table() %>% head()
(p <- use_dotplot(heart_big,features = rev(frame.seurat.p$gene), colors.gradient = c("#13c4cd","#ffffff","#b91192"),dot.scale = 3))

Every cluster has 5 marker genes. Calculating the Ti will cost a lot of time, especially for samples with large number of cells.

frame.seurat <- frame.seurat[,c("cluster","gene")]
frame.seurat <- cal_spe(heart_big,ident.use = "cell_type__ontology_label",as.data.frame(frame.seurat))

Show the result of frame.seurat . A new column pct.pos is added

frame.seurat %>% head()

4.3 Find the Difference of the Specificity of Marker genes found by Seurat and starTracer

Same as described previously, we will not include the t.test results in the final results.

frame.pct.pos <- data.frame(cluster = frame.star$cluster,pct.star = frame.star$pct.pos,pct.seurat = frame.seurat$pct.pos)

#cal seurat.mean
frame.pct.pos$pct.seurat.mean <- rep(aggregate(pct.seurat ~ cluster, data = frame.pct.pos, FUN = mean)$pct.seurat,each = 5)

#divide seurat.mean to get star.adk and seurat.adj
frame.pct.pos$pct.seurat.adj <- frame.pct.pos$pct.seurat/frame.pct.pos$pct.seurat.mean
frame.pct.pos$pct.star.adj <- frame.pct.pos$pct.star/frame.pct.pos$pct.seurat.mean

#cal star.adj.mean to arrange the plot
frame.pct.pos$pct.star.adj.mean <- rep(aggregate(pct.star.adj ~ cluster, data = frame.pct.pos, FUN = mean)$pct.star.adj,each = 5)

#arrange
frame.pct.pos <- arrange(frame.pct.pos,desc(pct.star.adj.mean))
frame.pct.pos$cluster <- factor(frame.pct.pos$cluster,levels = unique(frame.pct.pos$cluster))

p <- c()
foldchange <- c()
for (i in unique(frame.pct.pos$cluster)){
  mat <- subset(frame.pct.pos,cluster == i)
  t1 <- mat$pct.star
  t2 <- mat$pct.seurat
  p.i <- t.test(t1,t2)$p.value
  foldchange.i <- mean(mat$pct.star)/mean(mat$pct.seurat)

  foldchange <- append(foldchange,foldchange.i)
  p <- append(p,p.i)
}
frame.pct.pos.stat <- data.frame(cluster = unique(frame.pct.pos$cluster),p = p, foldchange = foldchange)

Visualize

mat.use <- reshape2::melt(frame.pct.pos[,c(1,5,6)])
(p <- ggbarplot(mat.use, x = "cluster", y = "value", add = c("mean_se"),color = "variable",fill = "white",palette = c("#fcdc57","blue"),
          position = position_dodge(0.8)) + stat_compare_means(aes(group = variable), label = "p.signif",method = "wilcox") + theme(axis.text.x = element_text(angle = 45,hjust = 1)))