第一步:簡單的GEO數(shù)據(jù)提取及數(shù)據(jù)分析

簡介

這是復(fù)現(xiàn)一篇結(jié)腸癌的文獻圖例,GSE4107,文獻地址https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4399548/pdf/ott-8-745.pdf,其中由于作者細(xì)節(jié)未公布,可能差異基因個數(shù)有稍許差別

數(shù)據(jù)提取

利用getGEO函數(shù)下載GSE4107,用exprs函數(shù)獲的exp表達矩陣,pData函數(shù)獲取臨床信息pd臨床數(shù)據(jù),@annotation獲得gpl平臺數(shù)據(jù)。

#數(shù)據(jù)下載
rm(list = ls())
options(stringsAsFactors = F)
library(GEOquery)
gse = "GSE4107"
eSet <- getGEO(gse, 
               destdir = '.', 
               getGPL = F)
#(1)提取表達矩陣exp
exp <- exprs(eSet[[1]])
exp[1:4,1:4]
exp = log2(exp+1)
#(2)提取臨床信息
pd <- pData(eSet[[1]])
#(3)調(diào)整pd的行名順序與exp列名完全一致
p = identical(rownames(pd),colnames(exp));p
if(!p) exp = exp[,match(rownames(pd),colnames(exp))]
#(4)提取芯片平臺編號
gpl <- eSet[[1]]@annotation
save(gse,pd,exp,gpl,file = "step1output.Rdata")

獲取GPL平臺信息

首先利用pd中的title信息獲取臨床信息并factor化,再利用GPL獲取探針信息。

rm(list = ls())  
load(file = "step1output.Rdata")
library(stringr)
pd$title
group_list=ifelse(str_detect(pd$title,"healthy control"),"control","treat")
#設(shè)置參考水平,對照在前,處理在后
group_list = factor(group_list,
                    levels = c("control","treat"))
#2.ids-----------------
#方法1 BioconductorR包
gpl 
#http://www.bio-info-trainee.com/1399.html
if(!require(hgu133plus2.db))BiocManager::install("hgu133plus2.db")
library(hgu133plus2.db)
ls("package:hgu133plus2.db")
ids <- toTable(hgu133plus2SYMBOL)
head(ids)
# 方法2 讀取gpl頁面的soft文件,按列取子集
# 方法3 官網(wǎng)下載
# 方法4 自主注釋 
save(exp,group_list,ids,file = "step2output.Rdata")

制作PCA圖和熱圖

這一步只要提供expgrou_list其他不用變

rm(list = ls())  
load(file = "step1output.Rdata")
load(file = "step2output.Rdata")
boxplot(exp,outline=FALSE,notch=T,las=2)
#輸入數(shù)據(jù):exp和group_list
#Principal Component Analysis
#http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/112-pca-principal-component-analysis-essentials

dat=as.data.frame(t(exp))
library(FactoMineR)#畫主成分分析圖需要加載這兩個包
library(factoextra) 
# pca的統(tǒng)一操作走起
dat.pca <- PCA(dat, graph = FALSE)
pca_plot <- fviz_pca_ind(dat.pca,
                         geom.ind = "point", # show points only (nbut not "text")
                         col.ind = group_list, # color by groups
                         #palette = c("#00AFBB", "#E7B800"),
                         addEllipses = TRUE, # Concentration ellipses
                         legend.title = "Groups"
)
pca_plot
ggsave(plot = pca_plot,filename = paste0(gse,"PCA.png"))
save(pca_plot,file = "pca_plot.Rdata")

#熱圖 
cg=names(tail(sort(apply(exp,1,sd)),1000))
n=exp[cg,]

#繪制熱圖
annotation_col=data.frame(group=group_list)
rownames(annotation_col)=colnames(n) 
library(pheatmap)
pheatmap1 <- pheatmap(n,
         show_colnames =F,
         show_rownames = F,
         annotation_col=annotation_col,
         scale = "row")
ggsave(plot = pheatmap1,filename = paste0(gse,"pheatmap1.png"))

做差異分析

利用貝葉斯獲取表達差異矩陣,用inner_join獲得探針信息,獲得deg表達。

rm(list = ls()) 
load(file = "step2output.Rdata")
#差異分析,用limma包來做
#需要表達矩陣和group_list,不需要改
library(limma)
design=model.matrix(~group_list)
fit=lmFit(exp,design)
fit=eBayes(fit)
deg=topTable(fit,coef=2,number = Inf)

#為deg數(shù)據(jù)框添加幾列
#1.加probe_id列,把行名變成一列
library(dplyr)
deg <- mutate(deg,probe_id=rownames(deg))
head(deg)
#2.加symbol列,火山圖要用
deg <- inner_join(deg,ids,by="probe_id")
head(deg)
deg <- deg[!duplicated(deg$symbol),]
#3.加change列,標(biāo)記上下調(diào)基因
logFC_t=1
P.Value_t = 0.05
k1 = (deg$adj.P.Val < P.Value_t)&(deg$logFC < -logFC_t)
k2 = (deg$adj.P.Val < P.Value_t)&(deg$logFC > logFC_t)
table(k1)
table(k2)
change = ifelse(k1,"down",ifelse(k2,"up","stable"))
deg <- mutate(deg,change)

火山圖

提供了差異前幾基因或者某一單獨基因的實現(xiàn)方法

load(file = "step1output.Rdata")
load(file = "step4output.Rdata")
#1.火山圖----
library(dplyr)
library(ggplot2)

dat  = deg
if(T) {
  x1 = dat %>% 
    filter(change == "up") %>% 
    arrange(desc(logFC))%>%
    head(3)
  x2 = dat %>% 
    filter(change == "down") %>% 
    arrange(logFC)%>%
    head(3)
  for_label = rbind(x1,x2)
}

#%>%
if(F){
  for_label <- dat%>% 
    filter(symbol %in% c("VIP","SFRP1")) 
}
if(F){
  for_label <- dat %>% head(10)
}
if(F) {
  x1 = dat %>% 
    filter(change == "up") %>% 
    head(3)
  x2 = dat %>% 
    filter(change == "down") %>% 
    head(3)
  for_label = rbind(x1,x2)
}

#  head(10)
p <- ggplot(data = dat, 
            aes(x = logFC, 
                y = -log10(adj.P.Val))) +
  geom_point(alpha=0.4, size=3.5, 
             aes(color=change)) +
  ylab("-log10(Pvalue)")+
  scale_color_manual(values=c("blue", "grey","red"))+
  geom_vline(xintercept=c(-logFC_t,logFC_t),lty=4,col="black",lwd=0.8) +
  geom_hline(yintercept = -log10(P.Value_t),lty=4,col="black",lwd=0.8) +
  theme_bw()+
  theme(panel.border = element_blank(),panel.grid.major = element_blank(),
        
         panel.grid.minor = element_blank(),axis.line = element_line(colour = "black"))
p
volcano_plot <- p +
  geom_point(size = 3, shape = 1, data = for_label) +
  ggrepel::geom_label_repel(
    aes(label = symbol),
    data = for_label,
    color="black")
ggsave(plot = volcano_plot,filename = paste0(gse,"volcano.png"))
image.png

熱圖和拼圖

熱圖提供前多少基因或者所有差異基因的熱圖,if函數(shù)提供了標(biāo)記列名的函數(shù),最后用patchwork進行拼圖

load(file = 'step2output.Rdata')
dd1=deg
x=dd1$logFC 
names(x)=dd1$probe_id
cg=c(names(head(sort(x),30)),names(tail(sort(x),30)))
cg
#cg = deg$probe_id[deg$change !="stable"]
rownames(dd1) <- dd1$probe_id
dd2 <- dd1[c(names(head(sort(x),30)),names(tail(sort(x),30))),"symbol"]
dd2
if(F) {
  rownames(dd1) <- dd1$probe_id
  dd2 <- dd1[c(names(head(sort(x),30)),names(tail(sort(x),30))),"symbol"]
  dd2
}
n=exp[cg,]
#rownames(n) <- dd2
dim(n)
#作熱圖
library(pheatmap)
annotation_col=data.frame(group=group_list)
rownames(annotation_col)=colnames(n) 
library(ggplotify)
heatmap_plot <- as.ggplot(pheatmap(n,show_colnames =F,
                                   show_rownames = T,
                                   scale = "row",
                                   #cluster_cols = F, 
                                   annotation_col=annotation_col
                              )) 
heatmap_plot
ggsave(heatmap_plot,filename = paste0(gse,"heatmap.png"))
load("pca_plot.Rdata")
library(patchwork)
(pca_plot | volcano_plot )/heatmap_plot+ plot_annotation(tag_levels = "A")
Cggsave(filename = "test.png",width = 15,height = 5)
image.png
最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時請結(jié)合常識與多方信息審慎甄別。
平臺聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點,簡書系信息發(fā)布平臺,僅提供信息存儲服務(wù)。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容