說(shuō)明:僅根據(jù)官網(wǎng)指南加個(gè)人理解,相應(yīng)圖片參考官網(wǎng)(目前官網(wǎng)上最新的Tutorial已經(jīng)更新成Seurat3.0版本,下面的流程是2.4版本,有些許出入。新版本將會(huì)在2019年4月16日通過(guò)CRAN下載)
以下是Seurat(v2.4版本)標(biāo)準(zhǔn)的數(shù)據(jù)處理過(guò)程,包括構(gòu)建Seurat對(duì)象、QC、數(shù)據(jù)標(biāo)準(zhǔn)化、檢測(cè)高變異基因等
1.構(gòu)建Seurat對(duì)象
library(Seurat)
library(dplyr)
#讀取10X數(shù)據(jù)
pbmc.data <- Read10X(data.dir = "~/Downloads/filtered_gene_bc_matrices/hg19/")
#構(gòu)建Seurat對(duì)象,這里會(huì)有個(gè)初篩,保證所有基因在至少3個(gè)細(xì)胞中表達(dá)(0.1%細(xì)胞數(shù)),保證每個(gè)細(xì)胞至少能檢測(cè)到200個(gè)基因。
pbmc <- CreateSeuratObject(raw.data = pbmc.data, min.cells = 3, min.genes = 200, project = "10X_PBMC")
2. QC選擇細(xì)胞進(jìn)行后續(xù)分析
#UMI (Unique Molecular Identifier)是一段固定長(zhǎng)度的隨機(jī)小片段,用以區(qū)別不同的PCR擴(kuò)增模板
#barcode,標(biāo)記不同的樣品或者細(xì)胞
#根據(jù)細(xì)胞內(nèi)基因數(shù)以及線粒體基因所占比例進(jìn)行過(guò)濾細(xì)胞
#提取線粒體DNA并計(jì)算其比例
mito.genes <- grep(pattern = "^MT-", x = rownames(x = pbmc@data), value = TRUE)
percent.mito <- Matrix::colSums(pbmc@raw.data[mito.genes, ])/Matrix::colSums(pbmc@raw.data)
#使用AddMetaData函數(shù),將上面合并到pbmc對(duì)象中,方便后續(xù)的QC
pbmc <- AddMetaData(object = pbmc, metadata = percent.mito, col.name = "percent.mito")
VlnPlot(object = pbmc, features.plot = c("nGene", "nUMI", "percent.mito"), nCol = 3)
#過(guò)濾細(xì)胞,細(xì)胞數(shù)控制在200~2500之間,如果不希望設(shè)定閾值,用-Inf表示,線粒體DNA所占比例控制在0.05以內(nèi)
pbmc <- FilterCells(object = pbmc, subset.names = c("nGene", "percent.mito"), low.thresholds = c(200, -Inf), high.thresholds = c(2500, 0.05))
3.對(duì)數(shù)據(jù)進(jìn)行標(biāo)準(zhǔn)化
pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
4.在單細(xì)胞水平上檢測(cè)變異基因
#這里參數(shù)的設(shè)置根據(jù)數(shù)據(jù)的類型、樣本的異質(zhì)性以及標(biāo)準(zhǔn)化的方法,存在差異
pbmc <- FindVariableGenes(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)
#查看篩選出來(lái)的變異基因的數(shù)量,一般是2000多
length(x = pbmc@var.genes)
5.除去不想要的變異來(lái)源
#這里就包括除去技術(shù)水平的噪音、批次效應(yīng)、細(xì)胞狀態(tài)等
pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nUMI", "percent.mito"))
6.PCA降維處理
pbmc <- RunPCA(object = pbmc, pc.genes = pbmc@var.genes, do.print = TRUE, pcs.print = 1:5, genes.print = 5)
#通過(guò)下面的幾種function可以查看定義特定PC的細(xì)胞和基因
PrintPCA(object = pbmc, pcs.print = 1:5, genes.print = 5, use.full = FALSE)
VizPCA(object = pbmc, pcs.use = 1:2)
PCAPlot(object = pbmc, dim.1 = 1, dim.2 = 2)
#PCHeatmap這個(gè)功能可以很清楚的展示一個(gè)數(shù)據(jù)集的異質(zhì)性,同時(shí)對(duì)于確定下游分析中用哪一個(gè)PC有著很大的幫助。對(duì)于探索相關(guān)的基因集合有著很大的幫助
PCHeatmap(object = pbmc, pc.use = 1, cells.use = 500, do.balanced = TRUE, label.columns = FALSE)#用PC1作圖
PCHeatmap(object = pbmc, pc.use = 1:12, cells.use = 500, do.balanced = TRUE, label.columns = FALSE, use.full = FALSE)#用PC1~12作圖


7.確定在統(tǒng)計(jì)學(xué)上顯著的PC
#這一步耗時(shí)很長(zhǎng)
pbmc <- JackStraw(object = pbmc, num.replicate = 100, display.progress = FALSE)
#這一步可以看一下每一個(gè)PC的相關(guān)性和顯著性,實(shí)線在虛線上方
JackStrawPlot(object = pbmc, PCs = 1:12)
#確定需要選取的PC,通過(guò)曲線的走勢(shì)
PCElbowPlot(object = pbmc)

8.對(duì)細(xì)胞進(jìn)行聚類
#resolution這一參數(shù)一般設(shè)定在0.6~1.2之間(細(xì)胞數(shù)3K左右),聚類結(jié)果放在object@ident中
pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10, resolution = 0.6, print.output = 0, save.SNN = TRUE)
9.進(jìn)行非線性降維(tSNE)
#tSNE的輸入,建議和之前聚類所用的PC一樣
pbmc <- RunTSNE(object = pbmc, dims.use = 1:10, do.fast = TRUE)
#這一步,你可以添加參數(shù)do.label=T,顯示每一類群的label
TSNEPlot(object = pbmc)
#保存這一對(duì)象
saveRDS(pbmc, file = "~/Projects/datasets/pbmc3k/pbmc_tutorial.rds")

10. 找差異表達(dá)的基因(聚類marker)
#找到cluster1所對(duì)應(yīng)的marker(positive+negative)
#ident.1 用來(lái)指定cluster
#min.pct 用來(lái)指定特定基因需要在%的細(xì)胞中被檢測(cè)到
cluster1.markers <- FindMarkers(object = pbmc, ident.1 = 1, min.pct = 0.25)
print(x = head(x = cluster1.markers, n = 5))
#找到區(qū)分cluster5和cluster0,3的marker
cluster5.markers <- FindMarkers(object = pbmc, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
print(x = head(x = cluster5.markers, n = 5))
#找到區(qū)分所有cluster的marker
#只展示positive
pbmc.markers <- FindAllMarkers(object = pbmc, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)
pbmc.markers %>% group_by(cluster) %>% top_n(2, avg_logFC)
#可視化marker的表達(dá)
VlnPlot(object = pbmc, features.plot = c("MS4A1", "CD79A"))
#可以選擇用raw UMI counts
VlnPlot(object = pbmc, features.plot = c("NKG7", "PF4"), use.raw = TRUE, y.log = TRUE)
#用PCA或者tSNE圖可視化基因的表達(dá)
FeaturePlot(object = pbmc, features.plot = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A"), cols.use = c("grey", "blue"), reduction.use = "tsne")
#針對(duì)給定的細(xì)胞和基因,用DoHeatmap函數(shù)畫(huà)出表達(dá)熱圖
#這里選定的是每個(gè)cluster中表達(dá)量前十的基因
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(10, avg_logFC)
#slim.col.label= TRUE表示,只展示cluster的ID而不是名字
DoHeatmap(object = pbmc, genes.use = top10$gene, slim.col.label = TRUE, remove.key = TRUE)

VlnPlot

FeaturePlot

DoHeatmap
11.更改cluster的名字(確定細(xì)胞類型的前提下)
current.cluster.ids <- c(0, 1, 2, 3, 4, 5, 6, 7)
new.cluster.ids <- c("CD4 T cells", "CD14+ Monocytes", "B cells", "CD8 T cells", "FCGR3A+ Monocytes", "NK cells", "Dendritic cells", "Megakaryocytes")
pbmc@ident <- plyr::mapvalues(x = pbmc@ident, from = current.cluster.ids, to = new.cluster.ids)
TSNEPlot(object = pbmc, do.label = TRUE, pt.size = 0.5)

12.進(jìn)一步細(xì)分細(xì)胞類型
#如果你修改分辨率參數(shù)resolution或者改變PC的個(gè)數(shù),有可能會(huì)使得原來(lái)的細(xì)胞類群進(jìn)一步的細(xì)分。這樣你可以進(jìn)一步探究在這個(gè)細(xì)分的cluster中,兩者之間的marker有什么區(qū)別
#在此之前,需要對(duì)新的聚類群進(jìn)行重命名,不然會(huì)導(dǎo)致之前的結(jié)果被覆蓋
#分辨率為0.6
pbmc <- StashIdent(object = pbmc, save.name = "ClusterNames_0.6")
#分辨率為0.8
pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10, resolution = 0.8, print.output = FALSE)
#出圖
plot1 <- TSNEPlot(object = pbmc, do.return = TRUE, no.legend = TRUE, do.label = TRUE)
plot2 <- TSNEPlot(object = pbmc, do.return = TRUE, group.by = "ClusterNames_0.6", no.legend = TRUE, do.label = TRUE)
plot_grid(plot1, plot2)
