編寫:王采荷
感覺已經(jīng)很久沒有在簡書上分享自己的東西了,今天在集合其他大佬的力量終于把“如何用R語言做差異代謝物的kegg富集分析”這個問題給解決了。我決定將這個過程分享到簡書上,一來可以給在這方面有需求的其他朋友一個參考;二來如果有錯誤的地方也能得到大佬的指點。這篇短文主要分為以下三個部分組成;
一、將代謝物及對應的KEGG數(shù)據(jù)庫信息進行下載并整理
- 數(shù)據(jù)下載
# 加載所需要的R包
rm(list = ls())
library(XML)
library(RCurl)
library(tidyverse)
library(ggplot2)
library(magrittr)
library(clusterProfiler)
#' 第一步下載KEGG數(shù)據(jù)庫信息
extractCompounds <- function(pathwayId) {
compoundUrl <- paste0("https://www.genome.jp/dbget-bin/get_linkdb?-t+compound+path:", pathwayId)
compoundDoc <- htmlParse(getURL(compoundUrl), encoding = "utf-8")
compoundLinks <- getNodeSet(compoundDoc, "/html/body/pre/a")
compoundIds <- sapply(compoundLinks, function(node) xmlGetAttr(node, "href"))
compoundNames <- sapply(getNodeSet(compoundDoc, "/html/body/pre/text()"), xmlValue)[-1]
data.frame(compoundId = paste(compoundIds, collapse = ";"), compoundName = paste(compoundNames, collapse = ";"))
}
# Main process
keggUrl <- "https://www.genome.jp/kegg/pathway.html#global"
keggDoc <- htmlParse(getURL(keggUrl), encoding = "UTF-8")
pathwayLinks <- getNodeSet(keggDoc, "http://a[@href]")
pathwayIds <- sapply(pathwayLinks[65:276], function(node) gsub("/pathway/", "", xmlGetAttr(node, "href")))
pathwayNames <- sapply(pathwayLinks[65:276], xmlValue)
# Applying extractCompounds function to each pathwayId
compoundDataList <- Map(extractCompounds, pathwayIds)
pathwayData <- do.call(rbind, compoundDataList)
# Combine all data into a single data frame
finalData <- data.frame(pathwayId = pathwayIds, pathwayName = pathwayNames, pathwayData)
finalData %>% write_csv(paste0("KeggAllcompounds-",Sys.Date(),".csv"))
finalData2 <- finalData[finalData$compoundId != "", ]
==注意事項==
finalData2 <- finalData[finalData$compoundId != "", ]這一步的目的是為了將finalDate$compoundId中有空值的的行進行刪除,否則后面數(shù)據(jù)整理的時候會報錯。
- 數(shù)據(jù)整理
#' 分別采用for和map的方式將結(jié)果進行整理
#' 整理成為result_data和result_data2
#' 采用for循環(huán)的方式
result_data <- data.frame()
nrow(finalData2)
for (i in 1:nrow(finalData2)) {
cid <- finalData2$compoundId[i]
extracted_cid <- str_extract_all(cid, "C\\d+")
CID <- unlist(extracted_cid)
CName <- finalData2$compoundName[i]
split_CName <- strsplit(CName, "\n;")
CompoundName <- lapply(split_CName[[1]], trimws) %>% unlist()
pathway <- cbind(CID, CompoundName)
pathwayId <- rep(finalData2$pathwayId[i], nrow(pathway))
pathwayName <- rep(finalData2$pathwayName[i], nrow(pathway))
dat <- cbind(pathway, pathwayId, pathwayName)
result_data <- rbind(result_data, dat)
}
#' 采用map的方式
# Define a function to process each row
process_row <- function(row) {
cid <- row$compoundId
extracted_cid <- str_extract_all(cid, "C\\d+")
CID <- unlist(extracted_cid)
CName <- row$compoundName
split_CName <- strsplit(CName, "\n;")
CompoundName <- lapply(split_CName[[1]], trimws) %>% unlist()
# Ensure the result is a data frame
if (length(CID) > 0 && length(CompoundName) > 0) {
pathway <- data.frame(CID, CompoundName, stringsAsFactors = FALSE)
pathwayId <- rep(row$pathwayId, nrow(pathway))
pathwayName <- rep(row$pathwayName, nrow(pathway))
return(data.frame(pathway, pathwayId, pathwayName, stringsAsFactors = FALSE))
} else {
return(data.frame(CID = character(), CompoundName = character(), pathwayId = row$pathwayId, pathwayName = row$pathwayName, stringsAsFactors = FALSE))
}
}
# Apply the process_row function to each row of finalData using map_df
result_data2 <- map_df(seq_len(nrow(finalData2)), ~process_row(finalData2[.x, ]))
result_data %>% write_csv(paste0("keggAllCompoundReshapedData2-",Sys.Date(),".csv")) #將整理好的結(jié)果進行儲存,后面可以直接讀取進來用,不用重復跑前面的代碼,畢竟也需要時間的
result_data2 %>% write_csv(paste0("keggAllCompoundReshapedData22-",Sys.Date(),".csv")

表1 初步整理后的代謝物kegg數(shù)據(jù)庫表
- 對kegg代謝物數(shù)據(jù)庫進一步整理
# 后期富集分析只需要保留表1的第1和第4列
keggannotation <- result_data %>%
select(c(-2,-3)) %>% set_colnames(c("ID", "pathway"))
二、進行kegg富集分析
- 先將自己測定的正負離子模式代謝物鑒定的表(注意要包含kegg id這一列,如果沒有,想方設法將代謝物的kegg id找到)讀取后進行整理
uploadfile1 <- "D:/其他人員2/王書/公司結(jié)果/非靶向/Result-X101SC23124721-Z01-J002-B1-42/Result-X101SC23124721-Z01-J002-B1-42/2.MetAnnotation/HMDB_KEGG_Lipidmaps"
uploadfile2 <- "D:/其他人員2/王書/公司結(jié)果/非靶向/Result-X101SC23124721-Z01-J002-B1-42/Result-X101SC23124721-Z01-J002-B1-42/3.MetDiffScreening/MPP.vs.MPN"
allkeggid <- read.csv(paste0(uploadfile1,"/meta_intensity_neg_hmdb_kegg_lipidmaps.csv")) %>% select(Kegg_ID) %>%
bind_rows(.,read.csv(paste0(uploadfile1,"/meta_intensity_pos_hmdb_kegg_lipidmaps.csv")) %>% select(Kegg_ID)) %>%
filter(Kegg_ID != "--") %>%
set_colnames("ID") %>%
mutate(across("ID",str_replace,"cpd:", ""))
其實就是把正負離子代謝物所對應的KEGG ID合并,如表2:

表2 正負離子代謝物表讀取后只保留kegg id這一列
- 讀取自己的差異代謝物并保留kegg id
#'差異代謝物
diffkeggid <- read.csv(paste0(uploadfile2,"/MPP.vs.MPN_neg_diff.anno.csv")) %>% select(Kegg_ID) %>%
bind_rows(.,read.csv(paste0(uploadfile2,"/MPP.vs.MPN_pos_diff.anno.csv")) %>% select(Kegg_ID)) %>%
filter(Kegg_ID != "--") %>%
set_colnames("ID") %>%
mutate(across("ID",str_replace,"cpd:", ""))
- 數(shù)據(jù)整合
這里需要大家理解的是,我們每個人做的課題不一樣,所測定到的代謝物也不一樣,所以需要根據(jù)自己實際測定到的代謝物具體多少,構(gòu)建自己的代謝物kegg背景庫,而不是直接采用表1來進行富集分析。如表3所示:
total <- right_join(keggannotation,allkeggid,by="ID") %>% select(2,1)

表3 基于自己項目測出的代謝物構(gòu)建kegg庫
- kegg富集分析并導出結(jié)果
# 富集分析
x <- clusterProfiler::enricher(gene = diffkeggid$ID,TERM2GENE = total,minGSSize = 1,pvalueCutoff = 1,qvalueCutoff = 1)
# 結(jié)果導出
write.csv(as.data.frame(x@result) %>% select(-1,-2),
file = paste0(uploadfile2,"/MP_KEGG_enrichment_result.csv"))
我們來看看富集分析的結(jié)果怎么樣,如表4所示:

表4 代謝物kegg富集分析結(jié)果
三、對富集的結(jié)果進行整理和可視化
#' 1.數(shù)據(jù)清洗
df <- read_csv(paste0(uploadfile2,"/MP_KEGG_enrichment_result.csv")) %>%
dplyr::rename("Description"="...1") %>%
arrange(desc(Count)) %>%
select(1,2,3,4,8) %>%
separate(`GeneRatio`,into=c("A","B"),sep="/") %>%
mutate(A=as.numeric(A),B=as.numeric(B)) %>%
mutate(count=A/B) %>% head(10) %>% arrange(Count)
#' 2.定義因子
df$Description <- factor(df$Description,levels = c(df$Description %>% as.data.frame() %>% pull()))
#' 3.數(shù)據(jù)可視化
p <- df %>% ggplot(aes(count,Description))+
geom_point(aes(size=Count,color=pvalue,fill=pvalue),pch=21)+
scale_color_gradientn(colours = (rev(RColorBrewer::brewer.pal(11,"RdBu"))))+
scale_fill_gradientn(colours =(rev(RColorBrewer::brewer.pal(11,"RdBu"))))+
guides(size=guide_legend(title="Count"))+
labs(x=NULL,y=NULL)+
theme(axis.title = element_blank(),
axis.text.x=element_text(color="black",angle =0,hjust=0.5,vjust=0.5, margin = margin(b =5)),
axis.text.y=element_text(color="black",angle =0,hjust=1,vjust=0.5),
panel.background = element_rect(fill = NA,color = NA),
panel.grid.minor= element_line(size=0.2,color="#e5e5e5"),
panel.grid.major = element_line(size=0.2,color="#e5e5e5"),
panel.border = element_rect(fill=NA,color="black",size=1,linetype="solid"),
legend.key=element_blank(),
legend.title = element_text(color="black",size=9),
legend.text = element_text(color="black",size=8),
legend.spacing.x=unit(0.1,'cm'),
legend.key.width=unit(0.5,'cm'),
legend.key.height=unit(0.5,'cm'),
legend.background=element_blank(),
legend.box="horizontal",
legend.box.background = element_rect(color="black"),
legend.position = c(1,0),legend.justification = c(1,0))+
scale_y_discrete(labels = function(y) str_wrap(y,width=30))
ggsave(paste0(uploadfile2, "/MP_Kegg_enrichment.png"), plot = p, width = 6, height = 6, dpi = 300)
-
結(jié)果如圖所示
圖1 kegg富集分析通路圖
