Here we run the same test again on the human heart sample. A single-cell sequencing data containing aroung 600,000 cells. We will test the performance of starTracer on the big sample at the same time.
data source:
Check the number of cells:
dim(heart_big)
We run searchMarker 5 times.
t.Star.heart.new <- c()
for(i in 1:5){
print(paste0("calculating time:",i))
t1 <- Sys.time()
starTracer::searchMarker(heart_big,thresh.1 = 0.5,thresh.2 = 0.3,method = "del_MI",num = 3,gene.use = "HVG")
t2 <- Sys.time()
t <- t2-t1
t.Star.heart.new <- append(t.Star.heart.new,t)
}
t.Star.heart.all.new <- c()
for(i in 1:5){
print(paste0("calculating time:",i))
t1 <- Sys.time()
starTracer::searchMarker(heart_big,thresh.1 = 0.5,thresh.2 = 0.3,method = "del_MI",num = 3)
t2 <- Sys.time()
t <- t2-t1
t.Star.heart.all.new <- append(t.Star.heart.all.new,t)
}
Let’s see the results.
t.Star.heart.new #42.43307 44.13184 43.95437 39.74770 40.41655
## Time differences in secs
## [1] 35.68560 36.31657 32.68245 35.20165 32.84306
t.Star.heart.all.new
## Time differences in secs
## [1] 40.09263 36.30915 38.73241 37.02653 38.12459
We set thresh.1 to 0.5, thresh.2 to 0.3 and num to 3 to plot the top 3 marker genes.
Idents(heart_big) <- heart_big$cell_type__ontology_label
test.heart.big <- starTracer::searchMarker(heart_big,thresh.1 = 0.5,thresh.2 = 0.3,num = 3,gene.use = "HVG")
Dotplot:
(p <- use_dotplot(heart_big,features = test.heart.big$genes.markers, colors.gradient = c("#018daa","#ffffff","#ed463a")))
This scripts takes around 30 hours for 5 runs.
print("loading data")
load("./tmp/heart_big.Rdata")
print("loading data complete!")
print("setting...")
DefaultAssay(heart_big) <- "RNA"
Idents(heart_big) <- "cell_type__ontology_label"
print("running")
t.Seurat.heart <- c()
for(i in 1:5){
t1 <- Sys.time()
FindAllMarkers(heart_big,logfc.threshold = 0.5,only.pos = T)
t2 <- Sys.time()
t <- t2-t1 #10.99338/10.1306 secs 12.34614/12.43914/11.35643 for HVG
t.Seurat <- append(t.Seurat,t)
}
save(t.Seurat.heart,file = "./out/f2_bigdata/t.Seurat.heart.Rdata")
print(t.Seurat.heart)
load the file. Each calculation takes around 6 hours.
load("./out/f2_bigdata/t.Seurat.heart.Rdata")
t.Seurat.heart
#Time differences in hours
#[1] 6.424730 6.346148 6.346987 6.353355 6.302240
We use bar plot to show the difference of calculation time between
starTracer
and Seurat
.
time.1 <- as.numeric(t.Seurat.heart) * 60
time.2 <- as.numeric(t.Star.heart.new)/60
time.3 <- as.numeric(t.Star.heart.all.new)/60
frame.time <- data.frame(starTracer.all = time.3, starTracer = time.2, Seurat = time.1)
frame.time$RunTime <- 1:5 %>% as.vector()
mat.use <- reshape2::melt(frame.time,id = "RunTime")
comparisons <- list(c("starTracer.all","starTracer"), c("starTracer.all", "Seurat"),
c("starTracer", "Seurat"))
(p <- ggbarplot(mat.use, x = "variable", y = "value", add = c("mean_se"),color = "variable",fill = "white",palette = c("#e3b802","#fcdc57","blue"),stat = "identity",
position = position_dodge(0.8)) + stat_compare_means(method = "t.test",comparisons = comparisons) + theme(axis.text.x = element_text(angle = 45,hjust = 1)))
Calculate the fold-change:
(fc1 <- mean(time.1)/mean(time.2)) #582.5597
(fc2 <- mean(time.1)/mean(time.3)) #578.5956
t.test(time.1,time.2)$p.value # 5.619788e-10
t.test(time.1,time.3)$p.value #5.617958e-10
t.test(time.2,time.3)$p.value # 0.4741188
We generate a sequence of Seurat Object containing 100%, 90%, 80%…10% of total cells to run the test.
Samples are generated by using function sample
test.size <- function(){
t.sample.size <- c()
n.sample.size <- c()
for(i in seq(0.1,1,by = 0.1)){
message(i)
n.total <- ncol(heart_big)
n.subset <- round(n.total*i)
set.seed(1222)
cells.subset <- sample(1:n.total, size = n.subset, replace = FALSE) %>% sort()
seurat.subset <- heart_big[,cells.subset]
table(seurat.subset$cell_type__ontology_label)
t1 <- Sys.time()
starTracer::searchMarker(seurat.subset,thresh.1 = 0.5,thresh.2 = 0.5,method = "del_MI",num = 3,gene.use = "HVG")
t2 <- Sys.time()
t <- t2 - t1
t.sample.size <- append(t.sample.size,t)
n.sample.size <- append(n.sample.size,n.subset)
rm(seurat.subset);gc()
}
sample.size <- data.frame(number = n.sample.size, time = t.sample.size)
return(sample.size)
}
sample.size <- test.size()
colnames(sample.size)[2] <- "time.1"
sample.size.1 <- sample.size
sample.size <- test.size()
colnames(sample.size)[2] <- "time.2"
sample.size.2 <- sample.size
sample.size <- test.size()
colnames(sample.size)[2] <- "time.3"
sample.size.3 <- sample.size
sample.size <- inner_join(sample.size.1, sample.size.2, by = "number") %>% inner_join(sample.size.3, by = "number")
sample.size$cell.pct <- seq(0.1,1,by = 0.1)
sample.size
The script of calculation of Seurat is not included in this script.
Separated R scripts are stored in ./heart10_2_100
, denoted
as heart_*.R
.
#nohup Rscript ./heart10_2_100/heart_10.R > pct.10.log 2>&1 &
#nohup Rscript ./heart10_2_100/heart_10.R > pct.20.log 2>&1 &
#nohup Rscript ./heart10_2_100/heart_10.R > pct.30.log 2>&1 &
#nohup Rscript ./heart10_2_100/heart_10.R > pct.40.log 2>&1 &
#nohup Rscript ./heart10_2_100/heart_10.R > pct.50.log 2>&1 &
#nohup Rscript ./heart10_2_100/heart_10.R > pct.60.log 2>&1 &
#nohup Rscript ./heart10_2_100/heart_10.R > pct.70.log 2>&1 &
#nohup Rscript ./heart10_2_100/heart_10.R > pct.80.log 2>&1 &
#nohup Rscript ./heart10_2_100/heart_10.R > pct.90.log 2>&1 &
#nohup Rscript ./heart10_2_100/heart_10.R > pct.100.log 2>&1 &
All the results are saved as an Rdata
. The name of the
variable will be sample.size.seurat
Here are all the calculation time of Seurat. The column
number
represent the cells in the subset Seurat object.
time.1, time.2 and time.3 represents the time Seurat spends calculating
the marker genes from the subset object.
sample.size.seurat
We load the results from 3.1 and 3.2, to make a comparison.
The results shows a significant difference in the calculation time between starTracer and Seurat.
p <- c()
foldchange <- c()
for (i in unique(time.seurat.star.size$cell.pct)){
mat.i <- subset(time.seurat.star.size,cell.pct == i)
t1 <- mat.i$time.star
t2 <- mat.i$time.seurat
p.i <- t.test(t1,t2)$p.value
foldchange.i <- mean(mat.i$time.seurat)/mean(mat.i$time.star)
foldchange <- append(foldchange,foldchange.i)
p <- append(p,p.i)
}
frame.time.size <- data.frame(cell.pct = unique(time.seurat.star.size$cell.pct),p = p, foldchange = foldchange)
frame.time.size
Plot the graph of calculation time.
mat <- time.seurat.star.size
mat <- mat[,c(2,3,4,5)] %>% reshape2::melt(id.var = c("cell.pct","variable"))
mat$value <- mat$value/60
colnames(mat)[2:3] <- c("time","method")
mat.use <- Rmisc::summarySE(mat, measurevar = "value", groupvar = c("cell.pct","method"))
(p <- ggplot(data = mat.use,aes(x = cell.pct, y = value, color = method, group = method)) +
#facet_grid(method~.) +
geom_errorbar(aes(ymin = value - se, ymax = value + se), width = 0.02) +
geom_line() + geom_point() +
scale_color_manual(values = c("blue","#fcdc57")) + theme_bw() + ylab("time(minutes)"))
Same as in brain.Rmd
, we will run the
seachMarker
function again. This time we choose to get 5
marker genes in each group. This result will be used to calculate Ti in
each cluster.
Ti is defined as
\[ T_i=\frac{\mathrm{card}\ \left(\mathrm{G}_\mathrm{i}\mathrm{\cap}\ \mathrm{G}_{\mathrm{clust}}\right)}{\mathrm{card}\left(\mathrm{G}_\mathrm{i}\right)} \]
Where:
Gi represents the set of the cells expressing gene i in-silico.
Gclust represents the set of cells where gene i serves as the in-silico marker gene.
frame.star <- starTracer::searchMarker(heart_big,thresh.1 = 0.5,thresh.2 = 0.3,num = 5,gene.use = "HVG")
frame.star <- subset(frame.star$para_frame,frame.star$para_frame$gene %in% frame.star$genes.markers)[,c("max.X","gene")]
colnames(frame.star)[1] <- "cluster"
frame.star <- cal_spe(heart_big,ident.use = "cell_type__ontology_label",frame.star)
Let’s see frame.star
. A new column named
pct.pos
is added into the data.frame
frame.star %>% head()
We run heart_bigdata_findAllMArkers
in the command line
and load the results into R.
load("./out/f2_bigdata/heart_big.Seurat.Rdata")
Generating the frame of Seurat:
gene.seurat <- subset(heart_big.Seurat,heart_big.Seurat$p_val_adj < 0.05)
gene.seurat <- arrange(gene.seurat,cluster,desc(avg_log2FC))
gene.seurat <- with(gene.seurat,by(gene.seurat[,"gene"],INDICES = cluster,FUN = function(x)head(x,10))) %>% unlist() %>% as.vector() %>% rev()
frame.seurat <- heart_big.Seurat[,c("cluster","gene","avg_log2FC")] %>% subset(heart_big.Seurat$gene %in% gene.seurat)
frame.seurat <- arrange(frame.seurat,gene,desc(avg_log2FC))
frame.seurat <- frame.seurat[!duplicated(frame.seurat$gene),] %>% arrange(cluster,desc(avg_log2FC))
frame.seurat <- frame.seurat %>%
group_by(cluster) %>% #按照cluster列进行分组
top_n(5, wt = avg_log2FC) %>% #按照FC列从大到小排序,选择每组前2行
ungroup()
frame.seurat$cluster %>% table()
#this is used to draw graphs
frame.seurat.p <- frame.seurat %>%
group_by(cluster) %>%
top_n(3, wt = avg_log2FC) %>%
ungroup()
frame.seurat.p$cluster %>% table() %>% head()
(p <- use_dotplot(heart_big,features = rev(frame.seurat.p$gene), colors.gradient = c("#13c4cd","#ffffff","#b91192"),dot.scale = 3))
Every cluster has 5 marker genes. Calculating the Ti will cost a lot of time, especially for samples with large number of cells.
frame.seurat <- frame.seurat[,c("cluster","gene")]
frame.seurat <- cal_spe(heart_big,ident.use = "cell_type__ontology_label",as.data.frame(frame.seurat))
Show the result of frame.seurat
. A new column
pct.pos
is added
frame.seurat %>% head()
Same as described previously, we will not include the t.test results in the final results.
frame.pct.pos <- data.frame(cluster = frame.star$cluster,pct.star = frame.star$pct.pos,pct.seurat = frame.seurat$pct.pos)
#cal seurat.mean
frame.pct.pos$pct.seurat.mean <- rep(aggregate(pct.seurat ~ cluster, data = frame.pct.pos, FUN = mean)$pct.seurat,each = 5)
#divide seurat.mean to get star.adk and seurat.adj
frame.pct.pos$pct.seurat.adj <- frame.pct.pos$pct.seurat/frame.pct.pos$pct.seurat.mean
frame.pct.pos$pct.star.adj <- frame.pct.pos$pct.star/frame.pct.pos$pct.seurat.mean
#cal star.adj.mean to arrange the plot
frame.pct.pos$pct.star.adj.mean <- rep(aggregate(pct.star.adj ~ cluster, data = frame.pct.pos, FUN = mean)$pct.star.adj,each = 5)
#arrange
frame.pct.pos <- arrange(frame.pct.pos,desc(pct.star.adj.mean))
frame.pct.pos$cluster <- factor(frame.pct.pos$cluster,levels = unique(frame.pct.pos$cluster))
p <- c()
foldchange <- c()
for (i in unique(frame.pct.pos$cluster)){
mat <- subset(frame.pct.pos,cluster == i)
t1 <- mat$pct.star
t2 <- mat$pct.seurat
p.i <- t.test(t1,t2)$p.value
foldchange.i <- mean(mat$pct.star)/mean(mat$pct.seurat)
foldchange <- append(foldchange,foldchange.i)
p <- append(p,p.i)
}
frame.pct.pos.stat <- data.frame(cluster = unique(frame.pct.pos$cluster),p = p, foldchange = foldchange)
Visualize
mat.use <- reshape2::melt(frame.pct.pos[,c(1,5,6)])
(p <- ggbarplot(mat.use, x = "cluster", y = "value", add = c("mean_se"),color = "variable",fill = "white",palette = c("#fcdc57","blue"),
position = position_dodge(0.8)) + stat_compare_means(aes(group = variable), label = "p.signif",method = "wilcox") + theme(axis.text.x = element_text(angle = 45,hjust = 1)))