簡介
這是復(fù)現(xiàn)一篇結(jié)腸癌的文獻圖例,GSE4107,文獻地址https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4399548/pdf/ott-8-745.pdf,其中由于作者細(xì)節(jié)未公布,可能差異基因個數(shù)有稍許差別
數(shù)據(jù)提取
利用getGEO函數(shù)下載GSE4107,用exprs函數(shù)獲的exp表達矩陣,pData函數(shù)獲取臨床信息pd臨床數(shù)據(jù),@annotation獲得gpl平臺數(shù)據(jù)。
#數(shù)據(jù)下載
rm(list = ls())
options(stringsAsFactors = F)
library(GEOquery)
gse = "GSE4107"
eSet <- getGEO(gse,
destdir = '.',
getGPL = F)
#(1)提取表達矩陣exp
exp <- exprs(eSet[[1]])
exp[1:4,1:4]
exp = log2(exp+1)
#(2)提取臨床信息
pd <- pData(eSet[[1]])
#(3)調(diào)整pd的行名順序與exp列名完全一致
p = identical(rownames(pd),colnames(exp));p
if(!p) exp = exp[,match(rownames(pd),colnames(exp))]
#(4)提取芯片平臺編號
gpl <- eSet[[1]]@annotation
save(gse,pd,exp,gpl,file = "step1output.Rdata")
獲取GPL平臺信息
首先利用pd中的title信息獲取臨床信息并factor化,再利用GPL獲取探針信息。
rm(list = ls())
load(file = "step1output.Rdata")
library(stringr)
pd$title
group_list=ifelse(str_detect(pd$title,"healthy control"),"control","treat")
#設(shè)置參考水平,對照在前,處理在后
group_list = factor(group_list,
levels = c("control","treat"))
#2.ids-----------------
#方法1 BioconductorR包
gpl
#http://www.bio-info-trainee.com/1399.html
if(!require(hgu133plus2.db))BiocManager::install("hgu133plus2.db")
library(hgu133plus2.db)
ls("package:hgu133plus2.db")
ids <- toTable(hgu133plus2SYMBOL)
head(ids)
# 方法2 讀取gpl頁面的soft文件,按列取子集
# 方法3 官網(wǎng)下載
# 方法4 自主注釋
save(exp,group_list,ids,file = "step2output.Rdata")
制作PCA圖和熱圖
這一步只要提供exp和grou_list其他不用變
rm(list = ls())
load(file = "step1output.Rdata")
load(file = "step2output.Rdata")
boxplot(exp,outline=FALSE,notch=T,las=2)
#輸入數(shù)據(jù):exp和group_list
#Principal Component Analysis
#http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/112-pca-principal-component-analysis-essentials
dat=as.data.frame(t(exp))
library(FactoMineR)#畫主成分分析圖需要加載這兩個包
library(factoextra)
# pca的統(tǒng)一操作走起
dat.pca <- PCA(dat, graph = FALSE)
pca_plot <- fviz_pca_ind(dat.pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = group_list, # color by groups
#palette = c("#00AFBB", "#E7B800"),
addEllipses = TRUE, # Concentration ellipses
legend.title = "Groups"
)
pca_plot
ggsave(plot = pca_plot,filename = paste0(gse,"PCA.png"))
save(pca_plot,file = "pca_plot.Rdata")
#熱圖
cg=names(tail(sort(apply(exp,1,sd)),1000))
n=exp[cg,]
#繪制熱圖
annotation_col=data.frame(group=group_list)
rownames(annotation_col)=colnames(n)
library(pheatmap)
pheatmap1 <- pheatmap(n,
show_colnames =F,
show_rownames = F,
annotation_col=annotation_col,
scale = "row")
ggsave(plot = pheatmap1,filename = paste0(gse,"pheatmap1.png"))
做差異分析
利用貝葉斯獲取表達差異矩陣,用inner_join獲得探針信息,獲得deg表達。
rm(list = ls())
load(file = "step2output.Rdata")
#差異分析,用limma包來做
#需要表達矩陣和group_list,不需要改
library(limma)
design=model.matrix(~group_list)
fit=lmFit(exp,design)
fit=eBayes(fit)
deg=topTable(fit,coef=2,number = Inf)
#為deg數(shù)據(jù)框添加幾列
#1.加probe_id列,把行名變成一列
library(dplyr)
deg <- mutate(deg,probe_id=rownames(deg))
head(deg)
#2.加symbol列,火山圖要用
deg <- inner_join(deg,ids,by="probe_id")
head(deg)
deg <- deg[!duplicated(deg$symbol),]
#3.加change列,標(biāo)記上下調(diào)基因
logFC_t=1
P.Value_t = 0.05
k1 = (deg$adj.P.Val < P.Value_t)&(deg$logFC < -logFC_t)
k2 = (deg$adj.P.Val < P.Value_t)&(deg$logFC > logFC_t)
table(k1)
table(k2)
change = ifelse(k1,"down",ifelse(k2,"up","stable"))
deg <- mutate(deg,change)
火山圖
提供了差異前幾基因或者某一單獨基因的實現(xiàn)方法
load(file = "step1output.Rdata")
load(file = "step4output.Rdata")
#1.火山圖----
library(dplyr)
library(ggplot2)
dat = deg
if(T) {
x1 = dat %>%
filter(change == "up") %>%
arrange(desc(logFC))%>%
head(3)
x2 = dat %>%
filter(change == "down") %>%
arrange(logFC)%>%
head(3)
for_label = rbind(x1,x2)
}
#%>%
if(F){
for_label <- dat%>%
filter(symbol %in% c("VIP","SFRP1"))
}
if(F){
for_label <- dat %>% head(10)
}
if(F) {
x1 = dat %>%
filter(change == "up") %>%
head(3)
x2 = dat %>%
filter(change == "down") %>%
head(3)
for_label = rbind(x1,x2)
}
# head(10)
p <- ggplot(data = dat,
aes(x = logFC,
y = -log10(adj.P.Val))) +
geom_point(alpha=0.4, size=3.5,
aes(color=change)) +
ylab("-log10(Pvalue)")+
scale_color_manual(values=c("blue", "grey","red"))+
geom_vline(xintercept=c(-logFC_t,logFC_t),lty=4,col="black",lwd=0.8) +
geom_hline(yintercept = -log10(P.Value_t),lty=4,col="black",lwd=0.8) +
theme_bw()+
theme(panel.border = element_blank(),panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),axis.line = element_line(colour = "black"))
p
volcano_plot <- p +
geom_point(size = 3, shape = 1, data = for_label) +
ggrepel::geom_label_repel(
aes(label = symbol),
data = for_label,
color="black")
ggsave(plot = volcano_plot,filename = paste0(gse,"volcano.png"))

image.png
熱圖和拼圖
熱圖提供前多少基因或者所有差異基因的熱圖,if函數(shù)提供了標(biāo)記列名的函數(shù),最后用patchwork進行拼圖
load(file = 'step2output.Rdata')
dd1=deg
x=dd1$logFC
names(x)=dd1$probe_id
cg=c(names(head(sort(x),30)),names(tail(sort(x),30)))
cg
#cg = deg$probe_id[deg$change !="stable"]
rownames(dd1) <- dd1$probe_id
dd2 <- dd1[c(names(head(sort(x),30)),names(tail(sort(x),30))),"symbol"]
dd2
if(F) {
rownames(dd1) <- dd1$probe_id
dd2 <- dd1[c(names(head(sort(x),30)),names(tail(sort(x),30))),"symbol"]
dd2
}
n=exp[cg,]
#rownames(n) <- dd2
dim(n)
#作熱圖
library(pheatmap)
annotation_col=data.frame(group=group_list)
rownames(annotation_col)=colnames(n)
library(ggplotify)
heatmap_plot <- as.ggplot(pheatmap(n,show_colnames =F,
show_rownames = T,
scale = "row",
#cluster_cols = F,
annotation_col=annotation_col
))
heatmap_plot
ggsave(heatmap_plot,filename = paste0(gse,"heatmap.png"))
load("pca_plot.Rdata")
library(patchwork)
(pca_plot | volcano_plot )/heatmap_plot+ plot_annotation(tag_levels = "A")
Cggsave(filename = "test.png",width = 15,height = 5)

image.png