1 Load Data

We will use major_clust as the Ident.

When inputting a Seurat object, please make sure that your object have run the following method:

  1. FindVariableFeatures
  2. NormalizeData
snRNA.hpfc.sct$major_clust %>% table()
#snRNA.hpfc.sct <- FindVariableFeatures(snRNA.hpfc.sct,nfeatures = 3000)

 
(p <- DimPlot(snRNA.hpfc.sct,group.by = "major_clust") + scale_color_manual(values = colors.brain) + theme_void())

2 Run Time Test

2.1 starTracer Running Time Test

Run time test

We set thresh.1 = 0.5 (by default),thresh.2 = 0.3, num = 2.

Explanations of parameters:

  1. thresh.1: The threshold to decide a cluster goes to the group where it might become a positive marker or group where it might be a negative marker, for each gene included.
  2. thresh.2: The threshold to filter out the lowest thresh.2 (by percentage) in each group after assigning genes into clusters.
  3. num: The number of marker genes in the output matrix for each cluster
#using HVG
Idents(snRNA.hpfc.sct) <- snRNA.hpfc.sct$major_clust
t.Star <- c()
for(i in 1:10){
  print(paste0("calculating time:",i))
  t1 <- Sys.time()
  starTracer::searchMarker(snRNA.hpfc.sct,thresh.1 = 0.5,thresh.2 = 0.3,method = "del_MI",num = 2,gene.use = "HVG")
  t2 <- Sys.time()
  t <- t2-t1 #10.99338/10.1306 secs 12.34614/12.43914/11.35643 for HVG
  t.Star <- append(t.Star,t)
}
t.Star.brain <- t.Star
#save(t.Star.brain,file = "./out/f1_brain/t.Star.brain.new.Rdata")

#using ALL genes
t.Star <- c()
for(i in 1:10){
  print(paste0("calculating time:",i))
  t1 <- Sys.time()
  starTracer::searchMarker(snRNA.hpfc.sct,thresh.1 = 0.5,thresh.2 = 0.3,method = "del_MI",num = 2)
  t2 <- Sys.time()
  t <- t2-t1 #10.99338/10.1306 secs 12.34614/12.43914/11.35643 for HVG
  t.Star <- append(t.Star,t)
}
t.Star.brain.all <- t.Star
#save(t.Star.brain,file = "./out/f1_brain/t.Star.brain.all.new.Rdata")

Let’s see the results of executing time

t.Star.brain 

Visualization of top 3 marker genes

We choose the top 3 marker genes when performing visualization

Note that: the value of num will NOT influence the calculation speed significantly. Because searchMarker calculates all the potential marker genes and only output the topN marker for each cluster specified by the user by num.

We run searchMarker again by setting num to 3 for visualization.

out.brain.Star <- starTracer::searchMarker(snRNA.hpfc.sct,thresh.1 = 0.5,thresh.2 = 0.3,method = "del_MI",num = 3,gene.use = "HVG")
(p <- use_dotplot(snRNA.hpfc.sct,features = out.brain.Star$genes.markers, colors.gradient = c("#018daa","#ffffff","#ed463a")))

Here we will find markers again from all genes detected in the during the sequencing:

out.brain.Star.all <- starTracer::searchMarker(snRNA.hpfc.sct,thresh.1 = 0.5,thresh.2 = 0.8,method = "del_MI",num = 3)
(p <- use_dotplot(snRNA.hpfc.sct,features = out.brain.Star.all$genes.markers, colors.gradient = c("#018daa","#ffffff","#ed463a")))
(p <- use_dotplot(snRNA.hpfc.sct,features = out.brain.Star.all$genes.markers, colors.gradient = c("#018daa","#ffffff","#ed463a")))

2.2 The Calculation Time of Seurat

t.Seurat.brain <- c()
for(i in 1:11){
  if(i <= 10){
    print(paste0("calculating time:",i))
    t1 <- Sys.time()
    FindAllMarkers(snRNA.hpfc.sct,logfc.threshold = 0.5,only.pos = T)
    t2 <- Sys.time()
    t <- t2-t1 #10.99338/10.1306 secs 12.34614/12.43914/11.35643 for HVG
    t.Seurat.brain <- append(t.Seurat.brain,t)
  } else {
    out.brain.Seurat <- FindAllMarkers(snRNA.hpfc.sct,logfc.threshold = 0.5,only.pos = T)
  }
}

Let’s see the calculation time of Seurat and starTracer .

t.Seurat.brain # in minutes
t.Star.brain # in seconds
t.Star.brain.all

2.3 Difference of Calculation Time of Seurat and starTracer

time.1 <- as.numeric(t.Seurat.brain) * 60
time.2 <- as.numeric(t.Star.brain)
time.3 <- as.numeric(t.Star.brain.all)
frame.time <- data.frame(starTracer.all = time.3, starTracer = time.2, Seurat = time.1)
frame.time$RunTime <- 1:10 %>% as.vector()
mat.use <- reshape2::melt(frame.time,id = "RunTime")

comparisons <- list(c("starTracer.all","starTracer"), c("starTracer.all", "Seurat"),
                       c("starTracer", "Seurat"))

(p <- ggbarplot(mat.use, x = "variable", y = "value", add = c("mean_se"),color = "variable",fill = "white",palette = c("#e3b802","#fcdc57","blue"),stat = "identity",
          position = position_dodge(0.8)) + stat_compare_means(method = "t.test",comparisons = comparisons) + theme(axis.text.x = element_text(angle = 45,hjust = 1)))
(fc1 <- mean(time.1)/mean(time.2)) #185.2817
(fc2 <- mean(time.1)/mean(time.3)) #168.9358

t.test(time.1,time.2)$p.value #7.845196e-19

t.test(time.1,time.3)$p.value #8.640955e-19

t.test(time.2,time.3)$p.value #0.02561478

mean time of starTracer is 7.85s

while with all genes instead of just HVGs, the time is 3.86s

3 Specificity

3.1 Specificity of the Top5 Marker Genes of starTracer

Calculate the top5 marker genes for Ti calculation

We will run the seachMarker function again. This time we choose to get 5 marker genes in each group. This result will be used to calculate Ti in each cluster.

Ti is defined as

\[ T_i=\frac{\mathrm{card}\ \left(\mathrm{G}_\mathrm{i}\mathrm{\cap}\ \mathrm{G}_{\mathrm{clust}}\right)}{\mathrm{card}\left(\mathrm{G}_\mathrm{i}\right)} \]

Where:

Gi represents the set of the cells expressing gene i in-silico.

Gclust represents the set of cells where gene i serves as the in-silico marker gene.

Please refer to our original research article to see more details.

out.brain.Star.5 <- starTracer::searchMarker(snRNA.hpfc.sct,thresh.1 = 0.5,thresh.2 = 0.3,method = "del_MI",num = 5,gene.use = "HVG")

Calculate specificity measured by Ti

Calculating specificity level defined by Ti might be time consuming especially for large data set. Ti is calculated by function cal_spe. Please refer to “function.R” for more details.

frame.star <- subset(out.brain.Star.5$para_frame,out.brain.Star.5$para_frame$gene %in% out.brain.Star.5$genes.markers)[,c("max.X","gene")]
colnames(frame.star)[1] <- "cluster"

frame.star <- cal_spe(snRNA.hpfc.sct,ident.use = "major_clust",frame.star)

check the results of frame.star

frame.star %>% head()

3.2 Frame of Seurat

Generating the frame of Seurat:

  1. Find marker genes using FindAllMarkers function.
  2. Subset the matrix by setting p-value < 0.05
  3. re-arrange the data.frame by cluster and descending average log2(Fold-Change)
  4. Keep the top 10 marker gene in each cluster

The reason we choose top 10 marker genes: The marker genes in the result of Seurat might be redundant, which means that a gene might be identified as a marker in more than one cluster. Some “marker” genes identified in one cluster might be in fact a better marker in another cluster. Thus, we will keep 10 marker genes to make sure each cluster has more than 5 marker genes. The number 10 is not compulsory.

The results of Seurat has already been calculated in Chap #2

We then re-arrange the marker genes’ data.frame. As for the genes that occur in more than one cluster, we will only keep it in the cluster that has the largest avg_log2FC.

frame.seurat <- out.brain.Seurat[,c("cluster","gene","avg_log2FC","p_val_adj")] %>% subset(p_val_adj < 0.05)
frame.seurat <- arrange(frame.seurat,gene,desc(avg_log2FC))
frame.seurat <- frame.seurat[!duplicated(frame.seurat$gene),] %>% arrange(cluster,desc(avg_log2FC))

frame.seurat <- frame.seurat %>%
  group_by(cluster) %>% #group by cluster
  top_n(5, wt = avg_log2FC) %>% #arrange by FC
  ungroup()
frame.seurat$cluster %>% table()

#We will choose the top 3 marker gene to visualize:
frame.seurat.p <- frame.seurat %>%
  group_by(cluster) %>% 
  top_n(3, wt = avg_log2FC) %>% #choose the top 3
  ungroup()
frame.seurat.p$cluster %>% table() 

Let’s check the results.

(p <- use_dotplot(snRNA.hpfc.sct,features = rev(frame.seurat.p$gene), colors.gradient = c("#13c4cd","#ffffff","#b91192"),dot.scale = 3))

3.3 The Specificity Level Measured by Ti Among Two Softwares

Calculate the specificity level measured by Ti. Please refer to the self-defined functions for details.

frame.seurat <- frame.seurat[,c("cluster","gene")]
frame.seurat <- cal_spe(snRNA.hpfc.sct,ident.use = "major_clust",as.data.frame(frame.seurat))
save(frame.seurat,file = "./out/f1_brain/frame.seurat.Rdata")
load("./out/f1_brain/frame.seurat.Rdata")

Calculate the difference

frame.star <- subset(frame.star,frame.star$cluster %in% frame.seurat$cluster)
identical(frame.star$cluster,frame.seurat$cluster)

We use frame.pct.pos to store the top 5 marker genes specificity level from 2 algorithms and store the results in frame.pct.pos.stat .

frame.pct.pos <- data.frame(cluster = frame.star$cluster,pct.star = frame.star$pct.pos,pct.seurat = frame.seurat$pct.pos)

During the calculation, we will normalize the specificity level to Seurat, which means the mean specificity level will be 1. Meanwhile, only the top 5 marker gene in Seurat will be retained. More specifically:

  1. Calculate the mean value of marker gene’s Ti in each group of Seurat (denoted as mean.Seurat).
  2. In each cluster, normalize the Ti of marker genes from both starTracer and Seurat to the mean.Seurat (denoted as star.adj and seurat.adj). Notice that the mean value of seurat.adj will be close to 1.
  3. Arrange the order of the matrix by descending star.adj.
  4. Fold-change will be calculated using the original Ti from starTracer and Seurat. P-value will be calculated using t-test.

We didn’t include the t-test result in our research even though most of the data shows a significant difference: The marker genes found by starTracer and Seurat are different genes identified by 2 different algorithm, which are not samples from some population, so, the H0 may not be defined.

frame.pct.pos <- data.frame(cluster = frame.star$cluster,pct.star = frame.star$pct.pos,pct.seurat = frame.seurat$pct.pos)

#cal seurat.mean
frame.pct.pos$pct.seurat.mean <- rep(aggregate(pct.seurat ~ cluster, data = frame.pct.pos, FUN = mean)$pct.seurat,each = 5)

#divide seurat.mean to get star.adj and seurat.adj
frame.pct.pos$pct.seurat.adj <- frame.pct.pos$pct.seurat/frame.pct.pos$pct.seurat.mean
frame.pct.pos$pct.star.adj <- frame.pct.pos$pct.star/frame.pct.pos$pct.seurat.mean

#cal star.adj.mean to arrange the plot
frame.pct.pos$pct.star.adj.mean <- rep(aggregate(pct.star.adj ~ cluster, data = frame.pct.pos, FUN = mean)$pct.star.adj,each = 5)

#arrange
frame.pct.pos <- arrange(frame.pct.pos,desc(pct.star.adj.mean))
frame.pct.pos$cluster <- factor(frame.pct.pos$cluster,levels = unique(frame.pct.pos$cluster))

p <- c()
foldchange <- c()
for (i in unique(frame.pct.pos$cluster)){
  mat <- subset(frame.pct.pos,cluster == i)
  t1 <- mat$pct.star
  t2 <- mat$pct.seurat
  p.i <- t.test(t1,t2)$p.value
  foldchange.i <- mean(mat$pct.star)/mean(mat$pct.seurat)

  foldchange <- append(foldchange,foldchange.i)
  p <- append(p,p.i)
}
frame.pct.pos.stat <- data.frame(cluster = unique(frame.pct.pos$cluster),p = p, foldchange = foldchange)

the specificity level measured by Ti

mat.use <- reshape2::melt(frame.pct.pos[,c(1,5,6)])

(p <- ggbarplot(mat.use, x = "cluster", y = "value", add = c("mean_se"),color = "variable",fill = "white",palette = c("#fcdc57","blue"),
          position = position_dodge(0.8)) + stat_compare_means(aes(group = variable), label = "p.signif",methods = "wilcox") + theme(axis.text.x = element_text(angle = 45,hjust = 1)))