用clusterProfile進(jìn)行非模式物種且沒(méi)有參考基因組的GO和KEGG富集分析

新組裝的基因組并沒(méi)有上傳和生成RefSeq基因組信息。而且一系列需要or.db或者構(gòu)建or.db的步驟嘗試構(gòu)建數(shù)據(jù)庫(kù)都不成功,要么缺乏物種信息,要么缺乏gene symbol。因此,我們利用自己的數(shù)據(jù)在eggNOG-mapper上進(jìn)行功能注釋的信息,同樣可以進(jìn)行富集分析。參考Y叔clusterProfile的手冊(cè),以及一系列網(wǎng)上搜索的結(jié)果:https://zhuanlan.zhihu.com/p/35510434https://cloud.tencent.com/developer/article/1607669?from=14588;
http://m.itdecent.cn/p/d484003dced5;
https://www.omicsclass.com/article/1513

直接上整理出來(lái)的腳本吧:

##universal enrichment analysis
library(magrittr)
library(clusterProfiler)
library(stringr)
library(dplyr)
library(ggplot2)
library(enrichplot)

#下載安裝注釋數(shù)據(jù)庫(kù), 有參考基因組的,有注釋結(jié)果的make a database
#source("https://bioconductor.org/biocLite.R")
library("AnnotationHub")
BiocManager::install("biomaRt")
library("biomaRt")
hub <- AnnotationHub::AnnotationHub()  #snapshotDate(): 2020-04-27
query(hub, "Erisoma") #搜索有參考基因組的KEGG注釋信息
#非模式物種以及沒(méi)有參考基因組的物種,不適合用eggGO和eggKEGG

gene_list <- read.table("Sl.expand.gene.txt", header = F)
gene <- as.character(gene_list[,1])


##1、GO 富集分析
#先處理emapper的數(shù)據(jù),#注意去除title的注釋符號(hào)“#”
egg <- read.table("Sl.out.emapper.annotations", sep="\t", header=T) 

#提取GO數(shù)據(jù)
gene_ids <- egg$query
eggnog_annoations_go <- str_split(egg$GOs, ",")
gene_to_go <- data.frame(gene = rep(gene_ids,
                         times = sapply(eggnog_annoations_go, length)),
                         term = unlist(eggnog_annoations_go))
gene2go <- filter(gene_to_go, term != "-")

#term2gene <- gene_to_go[,c(2,1)]
term2gene <- gene2go[,c(2,1)]
colnames(term2gene)[1] <- "ID"
df1 <- go2term(term2gene$ID) ##根據(jù)ID生成GO注釋信息
df2 <- go2ont(term2gene$ID) ##根據(jù)ID進(jìn)行分類:BP, CC, MF
colnames(df1)[1] <- "ID"
colnames(df2)[1] <- "ID"
df <- left_join(term2gene, df1, by = "ID")
df3 <- left_join(df, df2, by = "ID")

#df3_BP <- filter(df3, Ontology == "BP")
#df_three <- split(df3, with(df3, Ontology))
colnames(df3)[4] <- "Class"
gid2gene <- df3[, c("ID", "gene", "Class")]
gid2name <- df3[, c("ID", "Term", "Class")]
ego <- enricher(gene, TERM2GENE = gid2gene, TERM2NAME = gid2name, 
                pvalueCutoff = 0.05, qvalueCutoff = 0.2, minGSSize = 10)
dotplot(ego)
result <- as.data.frame(ego) #轉(zhuǎn)化為數(shù)據(jù)框方便看富集的結(jié)果

gid2gene <- split(gid2gene, with(gid2gene, Class)) #拆分成BP,MF,CC三個(gè)數(shù)據(jù)框
gid2name <- split(gid2name, with(gid2name, Class)) #拆分成BP,MF,CC三個(gè)數(shù)據(jù)框
#gid2gene <- df3_BP[, c("ID", "gene")]
#gid2name <- df3_BP[, c("ID", "Term")]

#如果只是想關(guān)注某一GO分類的富集情況可運(yùn)行如下:
ego_BP <- enricher(gene, TERM2GENE = gid2gene[['BP']][c(1,2)], 
                   TERM2NAME = gid2name[['BP']][c(1,2)], 
                pvalueCutoff = 0.05, qvalueCutoff = 0.2, minGSSize = 10)
dotplot(ego_BP, title = "BP")
ego_CC <- enricher(gene, TERM2GENE = gid2gene[['CC']][c(1,2)], 
                   TERM2NAME = gid2name[['CC']][c(1,2)], 
                   pvalueCutoff = 0.05, qvalueCutoff = 0.2, minGSSize = 10)
ego_MF <- enricher(gene, TERM2GENE = gid2gene[['MF']][c(1,2)], 
                   TERM2NAME = gid2name[['MF']][c(1,2)], 
                   pvalueCutoff = 0.05, qvalueCutoff = 0.2, minGSSize = 10)
#以下可以將三個(gè)分類都整合到一個(gè)圖內(nèi)
library(ggpubr)
BP <- dotplot(ego_BP, title = "BP")
CC <- dotplot(ego_CC, title = "CC")
MF <- dotplot(ego_MF, title = "MF")
ggarrange(BP, CC, MF, ncol = 1, nrow = 3, align = "hv")

#下面的數(shù)據(jù)是TBtools整理出來(lái)的
#dat <- read.table("out.emapper.annotations.GO.txt", header = F, sep = "\t")
#colnames(dat)[1] <- "gene"
#colnames(dat)[2] <- "Gid"
#term2gene <- dat[,c(2,1)]

#這里是先富集分析再給id注釋和分類,然后用ggplot2畫(huà),因?yàn)闆](méi)法用dotplot畫(huà)
df<-enricher(gene=gene,
             pvalueCutoff = 0.05,
             pAdjustMethod = "BH",
             TERM2GENE = term2gene)

df <- as.data.frame(df) #一定要轉(zhuǎn)成數(shù)據(jù)框
df1 <- go2term(df$ID)  ##根據(jù)ID生成注釋信息
colnames(df1)[1] <- "ID"
ego <- left_join(df, df1, by = "ID")
df2 <- go2ont(df$ID)   ##根據(jù)ID進(jìn)行分類:BP, CC, MF
colnames(df2)[1] <- "ID"
ego1 <- left_join(ego, df2, by = "ID")
df3 <- ego1 %>% select(c("Term", "Ontology", "p.adjust", "Count"))
df3 <- na.omit(df3)
df_m <- df3[c(1:20),]    #提取前20個(gè)term
df_m <- df_m[order(df_m$Count, decreasing = F),]  #按照count排序
df_m$Term <- factor(df_m$Term, levels = df_m$Term)  #給排好序的term固定變量,作圖時(shí)就能按照這個(gè)順序
ggplot(df_m,aes(x=Term,y=Count))+ 
  geom_point(aes(size = Count, color=-log10(p.adjust))) +  
  facet_grid(Ontology~., scales = "free", space = "free") +
  coord_flip()+labs(x="")+
  theme_bw() + scale_color_steps(low = "blue", high = "red")
#color越紅,p值越小 
#接上面GO的分析,此處不需要count排序只要pvalue的柱形圖,等同于TBtools里的圖
df3 <- ego1 %>% select(c("Term", "Ontology", "pvalue"))
df3 <- na.omit(df3)
ggplot(df3[c(1:20),],aes(x=Term,y=-log10(pvalue)))+
  geom_bar(aes(fill=Ontology))+
  coord_flip()+labs(x="")+
  theme_bw()

接著提取KEGG注釋信息并富集分析。。。。

# 2. KEGG pathway
#提取KEGG數(shù)據(jù),要注意KO pathway與k number的區(qū)別,前者才可轉(zhuǎn)化為代謝功能
gene_ko <- egg %>%
  dplyr::select(GID = query, Ko = KEGG_ko)  #這里的KEGG_ko是K number
gene_ko[,2]<-gsub("ko:","",gene_ko[,2]) #替換去除ko:
gene_ko_list <- str_split(gene_ko$Ko, ",")
gene2ko <- data.frame(gene = rep(gene_ids,
                                 times = sapply(gene_ko_list, length)),
                      term = unlist(gene_ko_list))
gene2ko <- filter(gene2ko, term != "-")

gene_pathway <- egg %>%
  dplyr::select(GID = query, Pathway = KEGG_Pathway)  #這里的才是Pathway
gene_pathway_list <- str_split(gene_pathway$Pathway, ",")
gene2pathway <- data.frame(gene = rep(gene_ids,
                                      times = sapply(gene_pathway_list, length)),
                           term = unlist(gene_pathway_list))
gene2pathway <- filter(gene2pathway, term != "-")

term2gene <- gene2pathway[,c(2,1)]
colnames(term2gene)[1] <- "ID"
pathway2name <- ko2name(term2gene$ID) #根據(jù)pathway的id輸出注釋信息
pathway2name <- na.omit(pathway2name)
pathway2name <- unique.data.frame(pathway2name)
colnames(pathway2name)[1] <- "ID"
ko2gene <- term2gene[grep(pattern = "^ko", term2gene$ID),]
#這里的id有的是map開(kāi)頭的需要去除,因?yàn)閗o2name只能注釋ko開(kāi)頭的

kegg <- enricher(gene, TERM2GENE = ko2gene, TERM2NAME = pathway2name,
                 pvalueCutoff = 0.05, qvalueCutoff = 0.2, minGSSize = 10)
dotplot(kegg)
#browseKEGG(kegg, 'ko05034') #在pathway通路圖上標(biāo)記富集到的基因,會(huì)鏈接到KEGG官網(wǎng)


最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請(qǐng)聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時(shí)請(qǐng)結(jié)合常識(shí)與多方信息審慎甄別。
平臺(tái)聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡(jiǎn)書(shū)系信息發(fā)布平臺(tái),僅提供信息存儲(chǔ)服務(wù)。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容