2019-06-29 R語言作業(yè)(中級)

R語言作業(yè)(中級)題目鏈接:http://www.bio-info-trainee.com/3750.html

作業(yè)1 根據(jù)R包org.Hs.eg.db找到下面ensembl 基因ID 對應(yīng)的基因名(symbol)

#1.加載這個注釋包org.Hs.eg.db
library(org.Hs.eg.db) 
#了解一下這個包 ?org.Hs.eg.db 發(fā)現(xiàn)里面含有一個函數(shù)可以查看org.Hs.eg.db包的信息 columns()
columns(org.Hs.eg.db)
#了解一個函數(shù)toTable(),
#toTable(x) turns Bimap object x into a data frame
g2s=toTable(org.Hs.egSYMBOL)
g2e=toTable(org.Hs.egENSEMBL)
head(g2s)
head(g2e)

作業(yè)2 根據(jù)R包hgu133a.db找到下面探針對應(yīng)的基因名(symbol)

options(BioC_mirror='https://mirrors.ustc.edu.cn/bioc')
if(!require('hgu133a.db')) BiocManager::install('hgu133a.db',ask = F,update = F)
library(hgu133a.db)
columns(hgu133a.db)
ids=toTable(hgu133aSYMBOL)
head(ids)
#找到下面探針對應(yīng)的基因名(symbol)
prb <- read.table('names.txt')
head(prb)
colnames(prb)='probe_id'
head(prb)
work2 <- merge(prb,ids,by='probe_id')

作業(yè)3 找到R包CLL內(nèi)置的數(shù)據(jù)集的表達(dá)矩陣?yán)锩娴腡P53基因的表達(dá)量,并且繪制在 progres.-stable分組的boxplot圖

library(CLL)
data("sCLLex")
#使用CLL包時,需要了解這個包 ?CLL  發(fā)現(xiàn)里面有一個data(sCLLex)
dim(exprSet)#表達(dá)矩陣的維度
pData(sCLLex)#sCLLex的分組情況
sampleNames(sCLLex)#sCLLex的樣本名稱
varMetadata(sCLLex)#sCLLex的標(biāo)簽描述
varLabels(sCLLex)#sCLLex的觀測變量
#了解一下?sCLLex 發(fā)現(xiàn)里面有一個exprs()函數(shù)可以提取sCLLex的表達(dá)矩陣
library(CLL)
data("sCLLex")
exprSet=exprs(sCLLex)
#尋找TP53基因的表達(dá)量,并且繪制在 progres.-stable分組的boxplot圖
head(exprSet)
#head一下發(fā)現(xiàn)exprSet里面是probe_id
library(hgu95av2.db)
columns(hgu95av2.db)
g3s=toTable(hgu95av2SYMBOL)
head(g3s)
which(g3s[,2]=='TP53')
g3s[c(966,997,1420),c(1,2)]
head(exprSet)
tmp=exprSet
rownames(exprSet)
class(rownames(exprSet))
tmp1 <- data.frame(rownames(exprSet),exprSet)
tmp1[1:4,1:4]
colnames(tmp1)[1] <-  'probe_id'
TP53_prob <- data.frame(g3s[c(966,997,1420),c(1,2)][,1])
names(TP53_prob) <- 'probe_id'
tmp2 <- merge(TP53_prob,tmp1,by='probe_id')
pData(sCLLex)
pData1 <- pData(sCLLex)
pData2 <- data.frame(rownames(pData1),pData1)
colnames(pData2)[1] <- 'sample'
rownames(tmp2)=tmp2[,1]
tmp2=tmp2[,-1]
tmp2_t=t(tmp2)
tmp2_t=data.frame(rownames(tmp2_t),tmp2_t)
colnames(tmp2_t)[1] <- 'sample'
#hebing
TP53_exprSet <- merge(tmp2_t,pData2,by='sample')
#使用apply求TP53三個探針的平均值
apply(TP53_exprSet[,c(2,3,4)], 1, mean)
TP53_exprSet$mean <- apply(TP53_exprSet[,c(2,3,4)], 1, mean)
#繪制在 progres.-stable分組的boxplot圖
boxplot(TP53_exprSet$mean~TP53_exprSet$Disease)
t.test(TP53_exprSet$mean~TP53_exprSet$Disease)
#ggpubr美化圖形
library(ggpubr)
p <- ggboxplot(TP53_exprSet, x = "Disease", y = "mean",
               color = "Disease", palette =c("#00AFBB", "#E7B800"),
               add = "jitter", shape = "Disease")
my_comparisons <- list( c("progres.", "stable"))
p <- p + stat_compare_means(comparisons = my_comparisons)+stat_compare_means(label.y = 50)+ylim(2.4,4)
p

作業(yè)4 作業(yè)4 找到BRCA1基因在TCGA數(shù)據(jù)庫的乳腺癌數(shù)據(jù)集(Breast Invasive Carcinoma (TCGA, PanCancer Atlas))的表達(dá)情況

作業(yè)5 找到TP53基因在TCGA數(shù)據(jù)庫的乳腺癌數(shù)據(jù)集的表達(dá)量分組看其是否影響生存

作業(yè)6 下載數(shù)據(jù)集GSE17215的表達(dá)矩陣并且提取下面的基因畫熱圖

exprSet <- read.table('GSE17215_series_matrix.txt',header = T,comment.char = '!',stringsAsFactors = F)

g6id <- c('ACTR3B','ANLN','BAG1','BCL2' ,'BIRC5' ,'BLVRA', 'CCNB1', 'CCNE1' ,'CDC20' ,'CDC6', 'CDCA1', 'CDH3', 'CENPF', 'CEP55', 'CXXC5', 'EGFR', 'ERBB2', 'ESR1', 'EXO1', 'FGFR4', 'FOXA1', 'FOXC1', 'GPR160', 'GRB7', 'KIF2C', 'KNTC2', 'KRT14', 'KRT17', 'KRT5', 'MAPT', 'MDM2', 'MELK', 'MIA', 'MKI67', 'MLPH', 'MMP11', 'MYBL2', 'MYC', 'NAT1','ORC6L' ,'PGR', 'PHGDH', 'PTTG1', 'RRM2', 'SFRP1', 'SLC39A6', 'TMEM45B', 'TYMS', 'UBE2C', 'UBE2T')
g6id2 <- data.frame(g6id)
colnames(g6id2) <- c('symbol')
GSE17215_annotation <- data.table::fread('GPL3921.annot',header = T)
g6s <- GSE17215_annotation[,c(1,3)]
colnames(g6s) <- c('ID','symbol')
g6id_merge <- merge(g6id2,g6s,by='symbol')
colnames(exprSet)[1] <- 'REF_ID'
colnames(g6id_merge)[2] <- 'REF_ID'
g6_expression <- merge(g6id_merge,exprSet,by='REF_ID')
g6_expression <- g6_expression[,-1]
#繪制熱圖
library(pheatmap)
rownames(g6_expression) <- g6_expression[,1]
table(g6_expression[,1])
length(unique(g6_expression[,1]))
nrow(g6_expression)
#發(fā)現(xiàn)探針I(yè)D對應(yīng)了多個基因ID,因此要對基因ID進(jìn)行刪除重復(fù)
library(dplyr)
tmp <- g6_expression %>% select(symbol,1:(length(g6_expression))) %>% mutate(rowMean=rowMeans(.[grep('GSM',names(.))])) %>% arrange(desc(rowMean)) %>% distinct(symbol,.keep_all = T) %>% select(-rowMean)
g6_exp <- g6_expression %>% select(symbol,1:(length(g6_expression))) %>% mutate(rowMean=rowMeans(.[grep('GSM',names(.))])) %>% arrange(desc(rowMean)) %>% distinct(symbol,.keep_all = T) %>% select(-rowMean)
rownames(g6_exp) <- g6_exp[,1]
g6_exp <- g6_exp[,-1]
pheatmap(g6_exp,scale = 'row')

作業(yè)7 下載數(shù)據(jù)集GSE24673的表達(dá)矩陣計算樣本的相關(guān)性并且繪制熱圖,需要標(biāo)記上樣本分組信息

exprSet <- read.table('GSE24673_series_matrix.txt',header = T,comment.char = '!',row.names = 1)
cor(exprSet)
library(pheatmap)
pheatmap(cor(exprSet))
colnames(exprSet)
col <- colnames(exprSet)
paste0('RB_139_09_TR',c(1,2,3))
trea <- paste0(RB_139_09_TR,c(1,2,3))
#添加注釋信息
group_list=c(rep('RB_139_09_TR',3),rep('RB_535_09_TR',3),rep('RB_984_09_TR',3),rep('Human_Healthy_Retina',2))
group <- data.frame(col,group_list)
pheatmap(cor(exprSet),annotation_col = group,annotation_legend = T)
rownames(group) <- group[,1]
group <- group[,-1]
group <- as.character(group)
group <- as.data.frame(group)
rownames(group) <- colnames(exprSet)
pheatmap(cor(exprSet),annotation_col = group,annotation_legend = T)

作業(yè)8 找到 GPL6244 platform of Affymetrix Human Gene 1.0 ST Array 對應(yīng)的R的bioconductor注釋包,并且安裝它

options()$repos
options()$BioC_mirror 
if(length(getOption('BioC_mirror'))==0) options(BioC_mirror='https://mirror.ustc.edu.cn/bioc/')
if(!require('hugene10sttranscriptcluster.db')) BiocManager::install('hugene10sttranscriptcluster.db',ask = F,update = F)

作業(yè)9 下載數(shù)據(jù)集GSE42872的表達(dá)矩陣,并且分別挑選出 所有樣本的(平均表達(dá)量/sd/mad/)最大的探針,并且找到它們對應(yīng)的基因。

rm(list = ls())
options(stringsAsFactors = F)
exprSet <- read.table('GSE42872_series_matrix.txt',header = T,comment.char = '!',row.names = 1)
tmp_mean <- apply(exprSet, 1, mean)
tmp_mean_1 <- sort(tmp_mean,decreasing = T)[1]
tmp_mean_1
tmp_sd <- apply(exprSet, 1, sd)
tmp_sd_1 <- sort(tmp_sd,decreasing = T)[1]
tmp_sd_1
tmp_mad <- apply(exprSet, 1, mad)
tmp_mad_1 <- sort(tmp_mad,decreasing = T)[1]
tmp_mad_1
#尋找到探針對應(yīng)的基因SYMBOL
library(hugene10sttranscriptcluster.db)
g9s <- toTable(hugene10sttranscriptclusterSYMBOL)
which(g9s$probe_id=='7978905')
which(g9s$probe_id=='8133876')
g9s[16473,]

作業(yè)10 下載數(shù)據(jù)集GSE42872的表達(dá)矩陣,并且根據(jù)分組使用limma做差異分析,得到差異結(jié)果矩陣

rm(list=ls())
exprSet <- read.table('GSE42872_series_matrix.txt',comment.char = '!',header = T)
library(hugene10sttranscriptcluster.db)
g10s <- toTable(hugene10sttranscriptclusterSYMBOL)
colnames(exprSet)[1] <- 'probe_id'
exprSet_merge <- merge(exprSet,g10s,by='probe_id')
nrow(exprSet_merge)
length(unique(exprSet_merge$symbol))
table(exprSet_merge$symbol)
##探針轉(zhuǎn)化以及去重,得到最終的表達(dá)矩陣
library(dplyr)
exprSet_expression <- exprSet_merge %>% select(-probe_id) %>%
  select(symbol,1:(length(exprSet)-1)) %>% mutate(rowMean=rowMeans(.[grep('GSM',names(.))]))%>% arrange(desc(rowMean)) %>% distinct(symbol,.keep_all = T) %>% select(-rowMean)
rownames(exprSet_expression) <- exprSet_expression[,1]
exprSet_expression <- exprSet_expression[,-1]
#加載limma包進(jìn)行差異表達(dá)分析
library(limma)
group <- c (rep('con',3),rep('treatment',3))
design <- model.matrix(~factor(group))
colnames(design) <- c('con','treatment')
design
fit <- lmFit(exprSet_expression,design)
fit2 <- eBayes(fit)
allDiff=topTable(fit2,adjust='fdr',coef=2,number = Inf)
diffLab <- subset(allDiff,abs(logFC)>1 & adj.P.Val<0.05)
save(diffLab,group,allDiff,file = 'diff.Rdata')

最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時請結(jié)合常識與多方信息審慎甄別。
平臺聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡書系信息發(fā)布平臺,僅提供信息存儲服務(wù)。

友情鏈接更多精彩內(nèi)容