隨機森林分析在宏基因組數(shù)據(jù)分析中還是蠻常見的,所以這里整理一個常規(guī)的分析案例
1 | 數(shù)據(jù)準備
1.1 菌群數(shù)據(jù)

圖一 菌群數(shù)據(jù)-OTU
1.2 樣品信息數(shù)據(jù)

圖二 樣品信息數(shù)據(jù)
2 | 直接貼代碼
#For windows system#
##ramdomforest.crossvalidation.r##
rm(list=ls())
# loading packages if necessary
library(randomForest)
library(ggplot2)
library(caret)
library(tidyverse)
# setting your working directory
setwd("E:\\F6_castrate_analysis\\total\\Duroc 16S\\fsm\\OTU\\稀疏后的OTU表")
set.seed(123)
## group male before and after castration
# loading Your datasets
dat1 <- read.table("fsm_otu_profile.csv",header=T,sep=',',row.names = 1)
conf <- read.table("fsm_metadata.csv",header=T,sep=',',row.names = 1)
#calculate the relative abundance of dat1
#dat1 <- sweep(dat1,2,colSums(dat1),'/')*100
dat1 <- log10(dat1 + 1)/log10(max(dat1))
cN.prof <- colnames(dat1)
#cN.prof <- sub("X","",cN.prof)
rN.conf <- rownames(conf)
gid <- intersect(cN.prof ,rN.conf) #求A和B的交集,根據(jù)樣品名
dat1 <- dat1[,pmatch(gid,cN.prof)] # pmatch部分匹配
conf <- conf[pmatch(gid,rN.conf),]
dat2<-dat1[,conf$sex=="Gilts" | conf$sex=="Entire boars"]
conf2<-conf[conf$sex=="Gilts" | conf$sex=="Entire boars",]
conf2$sex=as.factor(as.character(conf2$sex))
outcome=conf2$sex
outcome<-sub("Gilts","0",outcome)
outcome<-sub("Entire boars","1",outcome)
outcome<-as.factor(outcome)
dat<-dat2
X <- as.data.frame(t(dat))
X$outcome = outcome
# 隨機1-2有放回抽樣293次,概率分別為0.67和0.33,用于獲取訓練集
ind = sample(2,nrow(X),replace=TRUE, prob=c(0.67,0.33))
# 2/3作預測建模
ind.train <- X[ind == 1,]
ind.test <- X[ind == 2,]
set.seed(123)
# 1.在隨機森林算法的函數(shù)randomForest()中有兩個非常重要的參數(shù),
# 而這兩個參數(shù)又將影響模型的準確性
# 2.它們分別是mtry和ntree。一般對mtry的選擇是逐一嘗試,直到找
# 到比較理想的值,ntree的選擇可通過圖形大致判斷模型內(nèi)誤差穩(wěn)定時的值。
# 選取randomforest函數(shù)中的mtry節(jié)點值,一般可默認,mtry指定節(jié)點中用于二叉樹的變量個數(shù)
# 默認情況下為數(shù)據(jù)集變量個數(shù)的二次方根(分類模型)或三分之一(預測模型)
n <- length(names(ind.train))
set.seed(100)
errRate <- NULL
for(i in 1:(n-1)){
mtry_fit <- randomForest(outcome ~ ., data = ind.train, mtry = i)
err <- mean(mtry_fit$err.rate)
errRate[i] <- err
}
# 選擇平均誤差最小的mtry
m <- which.min(errRate)
# m為221
# 之后選擇ntree值,ntree指定隨機森林所包含的決策樹數(shù)目,默認為500,
set.seed(100)
ntree_fit <- randomForest(outcome ~ ., data = ind.train, mtry = 221, ntree = 1000)
plot(ntree_fit) # ntree到600以后就基本不變
# 根據(jù)以上結(jié)果,默認情況下的mtry效果更好,所以以mtry=18,ntree=600為參數(shù)構(gòu)建隨機森林模型。
rf.train <- randomForest(outcome ~ ., data = ind.train, mtry = 18,
importance = TRUE,proximity=TRUE,ntree = 600)
print(rf.train)
#summary(rf)
#' 交叉驗證選擇features
# set.seed(123)
#' rfcv是隨機森林交叉驗證函數(shù):Random Forest Cross Validation
# result <- rfcv(X[,-ncol(X)],X$outcome,cv.fold = 10)
# result$error.cv #' 查看錯誤率表,21時錯誤率最低,為最佳模型
#' 繪制驗證結(jié)果
# with(result,plot(n.var,error.cv,log="x",type = "o",lwd=2))
# 使用replicate進行多次交叉驗證,可選
result <- replicate(5, rfcv(ind.train[,-ncol(ind.train)],ind.train$outcome,cv.fold = 20), simplify=FALSE)
error.cv <- sapply(result, "[[", "error.cv")
error.cv <- cbind(rowMeans(error.cv),error.cv)
n.var = rownames(error.cv) %>% as.numeric()
error.cv = error.cv[,2:6]
colnames(error.cv) = paste('err',1:5,sep='.')
err.mean = apply(error.cv,1,mean)
allerr = data.frame(num=n.var,err.mean=err.mean,error.cv)
# number of features selected
optimal = 164
# 表格輸出
write.table(allerr, file = "family_rfcv.txt", sep = "\t", quote = F, row.names = T, col.names = T)
# the pre-setted parameters used for ploting afterwards
main_theme = theme(panel.background=element_blank(),
panel.grid=element_blank(),
axis.line.x=element_line(size=.5, colour="black"),
axis.line.y=element_line(size=.5, colour="black"),
axis.ticks=element_line(color="black"),
axis.text=element_text(color="black", size=7),
legend.position="right",
legend.background=element_blank(),
legend.key=element_blank(),
legend.text= element_text(size=7),
text=element_text(family="sans", size=7))
p = ggplot() +
geom_line(aes(x = allerr$num, y = allerr$err.1), colour = 'grey') +
geom_line(aes(x = allerr$num, y = allerr$err.2), colour = 'grey') +
geom_line(aes(x = allerr$num, y = allerr$err.3), colour = 'grey') +
geom_line(aes(x = allerr$num, y = allerr$err.4), colour = 'grey') +
geom_line(aes(x = allerr$num, y = allerr$err.5), colour = 'grey') +
geom_line(aes(x = allerr$num, y = allerr$err.mean), colour = 'black') +
geom_vline(xintercept = optimal, colour='black', lwd=0.36, linetype="dashed") +
# geom_hline(yintercept = min(allerr$err.mean), colour='black', lwd=0.36, linetype="dashed") +
coord_trans(x = "log2") +
scale_x_continuous(breaks = c(1, 3, 5, 10, 20, 40, 60, 100, 200, 300)) + # , max(allerr$num)
labs(title=paste('Training set (n = ', dim(dat)[2],')', sep = ''),
x='Number of OTUs ', y='Cross-validation error rate') +
annotate("text", x = optimal, y = max(allerr$err.mean), label=paste("optimal = ", optimal, sep="")) +
main_theme
ggsave(p, file = "duroc_otu_rfcv.pdf", width = 89, height = 50, unit = 'mm')
p
###--------------------------------------######
###---- 將最佳的那164個OTUs ######
imp <- as.data.frame(rf.train$importance)
# 隨機森林中的特征重要性表示在該特征上拆分的所有節(jié)點的基尼不純度減少的總和
# 減少的越多,說明該特征在分類過程中起的作用越大。
imp <- imp[order(imp[,1],decreasing = T),]
#head(imp)
#' 輸出重要性排序的表格
#write.table(imp,file = "importance_class_20180719.txt",quote = F,sep = '\t',row.names = T,col.names = T)
#' 簡單可視化
varImpPlot(rf,main = "Top 164-feature OTU importance",n.var = 164,bg=par("bg"),
color = par("fg"),gcolor=par("fg"),lcolor="gray")
#' 美化feature貢獻度柱狀圖
#' 分析選擇top5效果最好
imp = head(imp, n=164)
write.table(imp,file = "importance_class_selected.txt",quote = F,sep = '\t',row.names = T,col.names = T)
#' 反向排序X軸,讓柱狀圖從上往下畫
imp = imp[order(imp[,3],decreasing = F),]
imp$name <- rownames(imp)
imp$name <- factor(imp$name,levels = imp$name)
OTU_attribution <- read.table("OTU_phylum.csv",header = T,sep = ",",check.names = F)
rownames(OTU_attribution) <- OTU_attribution$OTUID
OTU_attribution <- OTU_attribution[rownames(imp),]
imp$phylum <- OTU_attribution$phylum
imp2 <- imp
rownames(imp2) <- paste(rownames(imp2),OTU_attribution$class,sep = " ")
imp2$name <- rownames(imp2)
imp2$name <- factor(imp2$name,levels = imp2$name)
#' load ggplot2 package
library(ggplot2)
p <- ggplot(imp2,aes(x=name,y=MeanDecreaseAccuracy,fill=phylum)) +
geom_bar(stat = "identity") + coord_flip()
p <- p+theme_bw()
p <- p+theme(legend.position = c(0.75,0.15))
p <- p + theme(plot.background=element_blank(),panel.background=element_blank())
p <- p + theme(axis.text.x=element_text(face="bold",color="black",size=14),
axis.text.y=element_text(face = "bold",color="black",size = 14),
axis.title.y=element_text(size = 16,face = "bold"),axis.title.x=element_text(size = 16))
p <- p + scale_fill_manual(values=c("red2", "blue4","yellow2"))
p + theme(legend.background = element_blank())
p <- p + theme(
legend.key.height=unit(0.8,"cm"),
legend.key.width=unit(0.8,"cm"),
legend.text=element_text(lineheight=0.8,face="bold",size=16),
legend.title=element_text(size=18,face = "bold"))
p <- p+labs(x="OTU rank")
ggsave(paste("rf_imp_feature" , ".tiff", sep=""), p, width = 10, height = 6)
###---------------------------------###
###--- 熱圖展示其豐度 -----###
# ' 篩選82個feature 展示
sub_abu = X[,rownames(imp)]
#' transposition the data
sub_abu <- as.data.frame(t(sub_abu))
#' table(X$outcome)
#' 0 1
#' 108 185
#sub_abu$class <- X$outcome
#sub_abu$class <- sub("0","Gilts",sub_abu$class)
#sub_abu$class <- sub("1","Entire_boars",sub_abu$class)
sub_abu_matrix <- as.matrix(sub_abu)
library(pheatmap)
annotation_col <- data.frame(Group=factor(rep(c("Gilts","Entire_boars"),c(108,185))))
rownames(annotation_col) <- colnames(sub_abu_matrix)
ann_colors <- list(Group=c(Gilts="#4DAF4A",Entire_boars="#984EA3"))
#plot.new()
tiff("OTU_rf_pheatmap.tiff",width = 6000,height = 1200,res = 300)
pheatmap(sub_abu_matrix,treeheight_col = 20,annotation_colors = ann_colors,cluster_cols = FALSE,
cluster_rows = FALSE,show_colnames = FALSE,annotation_col=annotation_col,
gaps_col = c(108,108),border_color = "NA",cellwidth = 4.3,cellheight = 20)
dev.off()
#' 用選擇好的164個OTU重新構(gòu)建randomforest模型
ind.train2 <- cbind(ind.train[,rownames(imp)], ind.train$outcome)
names(ind.train2)[ncol(ind.train2)] <- "class"
rf.train2 <- randomForest(class ~ ., data = ind.train2, importance = TRUE,
proximity = TRUE, ntree = )
# 預測響應變量
ind.train2$predicted.response <- predict(rf.train2, ind.train2)
# confusionMatrix function from caret package can be used for
# creating confusion matrix based on actual response
# variable and predicted value(構(gòu)建混淆矩陣建立真實響應變量和預測響應變量值)
confusionMatrix(data=ind.train2$predicted.response, reference=ind.train2$class)
# 測試訓練集
ind.test <- cbind(ind.test[,rownames(imp)], ind.test$outcome)
names(ind.test)[ncol(ind.test)] <- "class"
# 預測訓練集的響應變量
ind.test$predicted.reponse <- predict(rf.train2, ind.test)
confusionMatrix(data=ind.test$predicted.reponse, reference=ind.test$class)
#rf.test <- randomForest(class ~ ., data = ind.test,
# importance = TRUE,proximity=TRUE,ntree = 1000)
#rf.test.pre <- predict(rf.test,type="prob")
#p.train<-rf.test.pre[,2]
########ROC in train######
library(pROC)
predicted.reponse <- predict(rf.train2, ind.test, type = "prob")
roc(ind.test$class,predicted.reponse[,2])
###################
roc1 <- roc(ind.test$class,
predicted.reponse[,2],
partial.auc.correct=TRUE,
ci=TRUE, boot.n=100, ci.alpha=0.9, stratified=FALSE,
plot=F, auc.polygon=TRUE, max.auc.polygon=TRUE, grid=TRUE)
#####################
tiff("fsm_variable_pROC.tiff",width=2200,height=1800,res=300)
roc1 <- roc(ind.test$class, predicted.reponse[,2],
ci=TRUE, boot.n=100, ci.alpha=0.9, stratified=FALSE,font=2, font.lab=2,
plot=TRUE, percent=roc1$percent,col="#F781BF",cex.lab=1.2, cex.axis=1.2,cex.main=1.5)
plot(roc1,col="black",add=T)
legend("bottomright",cex=1.4,c(paste("AUC=",round(roc1$ci[2],2),"%"),
paste("95% CI:",round(roc1$ci[1],2),"%-",round(roc1$ci[3],2),"%")))
dev.off()
3 | 結(jié)果展示

圖三 選擇了82個OTU,使得模型的錯誤率最低