不要還守著GO分析不放手了。來看看新的工具。
eggnog-mapper官網(wǎng)
徐州更的教程
主要是數(shù)據(jù)庫功能比較全,問題是數(shù)據(jù)庫比較大,需要下載時(shí)間比較長。而且部分內(nèi)容不是最新版的。
主要使用環(huán)境是:用于注釋新的基因組、轉(zhuǎn)錄組、微生物(宏基因組)
最新版本的是V2版本 github
- 新增加了GO、KEGG的數(shù)據(jù)庫
- 優(yōu)點(diǎn)是:使用的是直系同源的序列進(jìn)行功能注釋
- 使用conda安裝emapper
conda install -c bioconda eggnog-mapper
2.下載數(shù)據(jù)庫
從eggnog5 下載數(shù)據(jù)庫
需要下載的文件有:
eggnog.db.gz
eggnog_proteins.dmnd.gz
選擇當(dāng)前最新的版本下載
準(zhǔn)備的輸入文件
- 物種的蛋白文件(注意:蛋白序列的名稱一定是以后你要做富集分析的基因名稱的格式,如果不一樣,后續(xù)你可能需要重新制作orgdb)
- 從https://www.ncbi.nlm.nih.gov/taxonomy 輸入物種拉丁名稱,查詢你的物種的tax_id。例如:Ctenopharyngodon idella 對(duì)應(yīng)的id是7959
- 從https://www.genome.jp/kegg-bin/download_htext?htext=ko00001&format=json&filedir= 下載ko00001.json文件,主要是用來解析kegg的文件結(jié)構(gòu),并不是作為數(shù)據(jù)庫,因此只下載這一個(gè)即可
3.開始分析
abbr="C.idella"
sed -i 's/.$//g' ${abbr}.pep.fa #刪除序列行尾多余的.
emapper.py -i ${abbr}.pep.fa -o ${abbr} -m diamond --cpu 24 #指定24個(gè)cpu來進(jìn)行分析
sed '/^##/d' ${abbr}.emapper.annotations|sed 's/#//' > emapper.annotations.tsv #刪除以##開頭的行,同時(shí)刪除表頭行的#
4.構(gòu)建自己的orgdb數(shù)據(jù)庫(多線程,速度非???
library(tidyverse)
library(KEGGREST)
library(AnnotationForge)
library(clusterProfiler)
library(jsonlite)
library(purrr)
library(RCurl)
if(! require("parallel"))install.packages("parallel")
library(parallel)#用于后面的多線程
emapper <- rio::import('emapper.annotations.tsv')
#將空值替換為NA,方便后續(xù)使用na.omit()函數(shù)提出沒有注釋到的行
emapper[emapper==""]<-NA
###########################提取GO信息#############
#提取query列,Preferred_named列,GO列
gene_info <- emapper %>% dplyr::select(GID = query, GENENAME = Preferred_name) %>% na.omit()
gos <- emapper %>% dplyr::select(query, GOs) %>% na.omit()
#構(gòu)建一個(gè)空的數(shù)據(jù)框?yàn)楹竺嫣畛鋽?shù)據(jù)
gene2go = data.frame(GID = character(),
GO = character(),
EVIDENCE = character())
#填充gene2go數(shù)據(jù)框(這一步時(shí)間比較長,所以開啟多線程)
# for (row in 1:nrow(gos)) {
# the_gid <- gos[row, "query"][[1]]
# the_gos <- str_split(gos[row,"GOs"], ",", simplify = FALSE)[[1]]
# df_temp <- data_frame(GID = rep(the_gid, length(the_gos)),
# GO = the_gos,
# EVIDENCE = rep("IEA", length(the_gos)))
# gene2go <- rbind(gene2go, df_temp)}
#####################多線程開始
#檢測當(dāng)前可用的cpu核數(shù)
cl.cores <- detectCores()
#使用指定核數(shù)8,開啟集群任務(wù)
cl <- makeCluster(8)
###并行任務(wù)
#定義并行的函數(shù)
getgene2go <- function(row){
the_gid <- gos[row, "query"][[1]]
the_gos <- str_split(gos[row,"GOs"], ",", simplify = FALSE)[[1]]
df_temp <- data_frame(GID = rep(the_gid, length(the_gos)),
GO = the_gos,
EVIDENCE = rep("IEA", length(the_gos)))
return(df_temp)
}
#開始并行分析
list_rows <- 1:nrow(gos)
clusterExport(cl,"gos")#導(dǎo)入外部變量gos
clusterEvalQ(cl, library(tidyverse))#加載外部環(huán)境的包tidyverse
gene2go <- parLapply(cl,list_rows,getgene2go) # lapply的并行版本
gene2go <- do.call('rbind',gene2go) # 整合結(jié)果
# 關(guān)閉集群任務(wù)
stopCluster(cl)
#####################多線程結(jié)束
#將“-”替換為NA,然后使用na.omit()刪除含有NA的行
gene2go$GO[gene2go$GO=="-"]<-NA
gene2go<-na.omit(gene2go)
save(gene2go,file="gene2go.RData")
############提取KEGG信息##################
#將emapper中query列,KEGG_ko列提取出來
gene2ko <- emapper %>% dplyr::select(GID = query, Ko = KEGG_ko) %>% na.omit()
#將gene2ko的Ko列中"ko:"刪除,不然后面找pathway會(huì)報(bào)錯(cuò)
gene2ko$Ko <- gsub("ko:","",gene2ko$Ko)
#從https://www.genome.jp/kegg-bin/download_htext?htext=ko00001&format=json&filedir= 下載ko00001.json文件,主要是用來解析kegg的文件結(jié)構(gòu),并不是作為數(shù)據(jù)庫,因此只下載這一個(gè)即可
update_kegg <- function(json = "ko00001.json") {
pathway2name <- tibble(Pathway = character(), Name = character())
ko2pathway <- tibble(Ko = character(), Pathway = character())
kegg <- fromJSON(json)
for (a in seq_along(kegg[["children"]][["children"]])) {
A <- kegg[["children"]][["name"]][[a]]
for (b in seq_along(kegg[["children"]][["children"]][[a]][["children"]])) {
B <- kegg[["children"]][["children"]][[a]][["name"]][[b]]
for (c in seq_along(kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]])) {
pathway_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["name"]][[c]]
pathway_id <- str_match(pathway_info, "ko[0-9]{5}")[1]
pathway_name <- str_replace(pathway_info, " \\[PATH:ko[0-9]{5}\\]", "") %>% str_replace("[0-9]{5} ", "")
pathway2name <- rbind(pathway2name, tibble(Pathway = pathway_id, Name = pathway_name))
kos_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]][[c]][["name"]]
kos <- str_match(kos_info, "K[0-9]*")[,1]
ko2pathway <- rbind(ko2pathway, tibble(Ko = kos, Pathway = rep(pathway_id, length(kos))))}}}
save(pathway2name, ko2pathway, file = "kegg_info.RData")}
# 調(diào)用函數(shù)后在本地創(chuàng)建kegg_info.RData文件
update_kegg()
# 載入kegg_info.RData文件
load(file = "kegg_info.RData")
#根據(jù)gene2ko中的ko信息將gene2ko中的Pathway列提取出來
gene2pathway <- gene2ko %>% left_join(ko2pathway, by = "Ko") %>% dplyr::select(GID, Pathway) %>% na.omit()
####去除重復(fù)
gene2go <- unique(gene2go)
gene2go <- gene2go[!duplicated(gene2go),]
gene2ko <- gene2ko[!duplicated(gene2ko),]
gene2pathway <- gene2pathway[!duplicated(gene2pathway),]
save(pathway2name, ko2pathway,gene2pathway,file="KEGG.RData")
#save.image(file="All.Rdata") #保存所有的數(shù)據(jù)到ALL.Rdata
############構(gòu)建OrgDb(注意:郵箱(作者和維護(hù)者)中不能包括“_”)
tax_id = "7959"
genus = "Ctenopharyngodon"
species = "idella"
makeOrgPackage(gene_info=gene_info,
go=gene2go,
ko=gene2ko,
maintainer='chaimol <chaimol@163.com>',#你自己的郵箱
author='chaimol <chaimol@163.com>',#你自己的郵箱
pathway=gene2pathway,
version="0.0.1",
outputDir =".",#輸出路徑
tax_id=tax_id,#你的物種在NCBI的id
genus=genus,#屬名
species=species,#種名
goTable="go")
## 安裝剛剛創(chuàng)建完成的orgdb數(shù)據(jù)庫
## then you can call install.packages based on the return value
install.packages("org.Cidella.eg.db", repos=NULL,type="source")
使用創(chuàng)建的orgdb進(jìn)行GO富集分析
library(org.Cidella.eg.db)
library(tidyverse)
library(clusterProfiler)
#顯示所有的列名
columns(org.Cidella.eg.db)
##[1] "EVIDENCE" "EVIDENCEALL" "GENENAME" "GID" "GO" "GOALL" "Ko" "ONTOLOGY" "ONTOLOGYALL" "Pathway"
#顯示所有的基因
keys(org.Cidella.eg.db)
#查看數(shù)據(jù)庫總共有多少基因
length(keys(org.Cidella.eg.db))
genel <- read.csv("test.csv",header=FALSE) #讀取一個(gè)基因列表
ego <- enrichGO(gene = genel,
OrgDb = org.Cidella.eg.db,
keyType = "GID",
ont = "ALL",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05)
head(ego)
dotplot(ego)
進(jìn)行KEGG富集分析 沒法使用Orgdb,所以只能使用enricher函數(shù)來實(shí)現(xiàn)。其實(shí)GO也可以同樣完成
需要提供TERM2GENE和TERM2NAME參數(shù)。
- TERM2GENE格式如下
ko04012 Cli01G000030
- TERM2NAME格式如下
ko00010 Glycolysis / Gluconeogenesis
使用上面輸出的KEGG.Rdata來創(chuàng)建上述2個(gè)文件。
load("KEGG.RData")
pathway2name$Name <- gsub(" \\[BR:ko[0-9]{5}\\]","",pathway2name$Name) #刪除BR:ko等字符
pathway2name <- na.omit(pathway2name)
term2gene <- gene2pathway[,c("Pathway","GID")]
term2name <- pathway2name
ego_kegg <- enricher(gene = genel,
TERM2GENE = term2gene,
TERM2NAME = term2name)
head(ego_kegg)
dotplot(ego_kegg)
cnetplot(ego_kegg,layout = "circle")
cnetplot(ego_kegg)
參考1:http://m.itdecent.cn/p/f2e4dbaae719
參考作者:三點(diǎn)水的番薯 鏈接:http://m.itdecent.cn/p/0bcc3db0d7e8
在使用parLapply的時(shí)候需要開啟多線程,同時(shí)使用下面的命令分別導(dǎo)入環(huán)境中的變量和包。否則多線程里無法使用環(huán)境中的變量和包。
clusterExport(cl,"gos")#導(dǎo)入外部變量gos
clusterEvalQ(cl, library(tidyverse))#加載外部環(huán)境的包tidyverse