geo讀取表達(dá)矩陣 RNA-seq R語言
方法一:1.從geo頁面直接下載表達(dá)矩陣,然后通過r讀取表達(dá)矩陣
2.利用getgeo函數(shù)讀取表達(dá)矩陣
3.利用geo自帶的geo2r,調(diào)整p值為1,獲取探針和基因名的對(duì)應(yīng)關(guān)系
1
#http://zouyawen.top/2020/10/09/%E8%BD%AC%E5%BD%95%E7%BB%844/
getwd()
setwd("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE150425_a549_kras_organoid/")
expr.df=read.csv("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE150425_a549_kras_organoid/GSE150425_Cufflinks_Gene_Counts/GSE150425_Cufflinks_Gene_Counts.csv",
header=T,
sep=",", #Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec
comment.char="!")
head(expr.df)
head(expr.df)
expr.df[expr.df$Gene_ID %in% 'RETN',]
library(dplyr)
colnames(expr.df)[1]='Gene_Symbol'
expr.df[expr.df$Gene_Symbol=='Retn'|
expr.df$Gene_Symbol=='Retnlb'|
expr.df$Gene_Symbol=='Jchain'|
expr.df$Gene_Symbol=='Igha1'|
expr.df$Gene_Symbol=='Pdk4'|
expr.df$Gene_Symbol=='Actb',]
#http://zouyawen.top/2020/10/09/%E8%BD%AC%E5%BD%95%E7%BB%844/
getwd()
setwd("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE133037_a549_transfection 0f runx__tgfb/")
expr.df=read.table("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE133037_a549_transfection 0f runx__tgfb/GSE133037_VK-Rx3.TGFb_FPKM/GSE133037_VK.Rx3.TGFb_FPKM.txt",
header=T,
sep="\t", #Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec
comment.char="!")
head(expr.df)
#colnames(expr.df)=expr.df[1,]
input=expr.df
head(input)
library(dplyr)
input=input[,!(grepl(colnames(input),pattern = 'Chr')|
grepl(colnames(input),pattern = 'Start')|
grepl(colnames(input),pattern = 'End')|
grepl(colnames(input),pattern = 'Strand'))]
head(input)
colnames(input)[1]='Gene.ID'
colnames(input)[2]='Gene.Name'
head(input)
mydata=input %>% filter(Gene.Name=='RETNLB'|Gene.Name=='ACTB'|Gene.ID=='1053'|
Gene.Name=='56729'|Gene.ID=='3493'|
Gene.ID=='3512'|Gene.ID=='3934')
mydata
dim(input)
多個(gè)組別 合并 id轉(zhuǎn)化
#http://zouyawen.top/2020/10/09/%E8%BD%AC%E5%BD%95%E7%BB%844/
getwd()
setwd("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE135402_A549_TD_tgfb/GSE135402_RAW/GSM4007697_A549-2/")
expr_SRR957678=read.table("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE135402_A549_TD_tgfb/GSE135402_RAW/GSM4007697_A549-2/GSM4007696_A549-1.htseq.count.txt",
header=FALSE,
sep="\t", #Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec
comment.char="!")
expr_SRR957677=read.table("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE135402_A549_TD_tgfb/GSE135402_RAW/GSM4007697_A549-2/GSM4007697_A549-2.htseq.count.txt",
header=FALSE,
sep="\t", #Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec
comment.char="!")
expr_SRR957679 =read.table("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE135402_A549_TD_tgfb/GSE135402_RAW/GSM4007699_TD-2/GSM4007698_TD-1.htseq.count.txt",
header=FALSE,
sep="\t", #Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec
comment.char="!")
expr_SRR957680 =read.table("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE135402_A549_TD_tgfb/GSE135402_RAW/GSM4007699_TD-2/GSM4007699_TD-2.htseq.count.txt",
header=FALSE,
sep="\t", #Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec
comment.char="!")
head(expr_SRR957677)
head(expr_SRR957678)
#先對(duì)照和實(shí)驗(yàn)組分別合并到一個(gè)data.frame再合并為一個(gè)總的數(shù)據(jù)框
control <- merge(expr_SRR957677,expr_SRR957678,by="V1")#by="gene_id"按照gene_id這列合并
head(control)
treat <- merge(expr_SRR957679,expr_SRR957680,by="V1")
head(treat)
raw_count <- merge(control,treat,by="V1")
head(raw_count)
colnames(raw_count)=c('gene_id','a5491','a5492','td1','td2')
raw_count <-raw_count[-1:-5,]#刪掉前五行,由上一步結(jié)果可以看出前五行g(shù)ene_id有問題
head(raw_count)
2#.去除gene_id名字中的小數(shù)點(diǎn)
#EBI數(shù)據(jù)庫中無法搜到ENSG00000000003.10這樣的基因,只能是整數(shù)ENSG00000000003才能進(jìn)行id轉(zhuǎn)換
library(stringr)
ENSEMBL <- raw_count$gene_id
head(ENSEMBL)
ENSEMBL <- str_split_fixed(ENSEMBL,'[.]',2)[,1]#將小數(shù)點(diǎn)及后面的數(shù)字去掉,ensembl_gene_id是整數(shù)
head(ENSEMBL)
rownames(raw_count) <- ENSEMBL#將去除小數(shù)點(diǎn)后的ensembl_gene_id作為行名后期方便繪圖
raw_count$ensembl_gene_id <- ENSEMBL#新建ensembl_gene_id列便于合并
head(raw_count)
3.#對(duì)基因進(jìn)行注釋,獲取gene_symbol
#bioMart包是一個(gè)連接bioMart數(shù)據(jù)庫的R語言接口,能通過這個(gè)軟件包自由連接到bioMart數(shù)據(jù)庫
# BiocManager::install('biomaRt')
library(biomaRt)
#顯示一下能連接的數(shù)據(jù)庫
listMarts()
listMarts()
#用useMart函數(shù)選定數(shù)據(jù)庫
plant<-useMart("ensembl")
#用listDatasets()函數(shù)顯示當(dāng)前數(shù)據(jù)庫所含的基因組注釋,我們要獲取的基因注釋的基因是人類基因,所以選擇hsapiens_gene_ensembl
listDatasets(plant)
grep(listDatasets(plant)[,1],pattern = 'hsa')
#選定ensembl數(shù)據(jù)庫中的hsapiens_gene_ensembl基因組
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
#選定我們需要獲得的注釋類型
#用lsitFilters()函數(shù)查看可選擇的類型,選定要獲取的注釋類型,以及已知注釋的類型用lsitFilters()函數(shù)查看可選擇的類型,選定要獲取的注釋類型,以及已知注釋的類型
listFilters(mart)
#這里我們選擇這些要獲得數(shù)值的類型ensembl_gene_id ,hgnc_symbol chromosome_name start_position end_position band我們已知的類型是ensembl_gene_id選擇好數(shù)據(jù)庫,基因組,要獲得的注釋類型,和已知的注釋類型,就可以開始獲取注釋了
#用getBM()函數(shù)獲取注釋
hg_symbols<- getBM(attributes=c('ensembl_gene_id','hgnc_symbol',"chromosome_name", "start_position","end_position", "band"), filters= 'ensembl_gene_id', values = ENSEMBL, mart = mart)
#attributers()里面的值為我們要獲取的注釋類型ensembl_gene_id ,hgnc_symbol chromosome_name start_position end_position band
#filters()里面的值為我們已知的注釋類型ensembl_gene_id
#values= 這個(gè)值就是我們已知的注釋類型的數(shù)據(jù),把上面我們通過數(shù)據(jù)處理得到的ensembl基因序號(hào)作為ensembl_gene_id 的值
#mart= 這個(gè)值是我們所選定的數(shù)據(jù)庫的基因組
head(hg_symbols)
getwd()
setwd("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE135402_A549_TD_tgfb")
save(mart, file = "mart.RData")#將mart變量保存為mart.RData
save(hg_symbols, file = "hg_symbols.RData")#將hg_symbols變量保存為hg_symbols,biomart包不太穩(wěn)定,有時(shí)連不上
4.#將合并后的表達(dá)數(shù)據(jù)框raw_count與注釋得到的hg_symbols整合
read_count <- merge(raw_count,hg_symbols,by="ensembl_gene_id")#將raw_count與注釋后得到的hg_symbols合并
head(read_count)
openxlsx::write.xlsx(read_count,file = "readcount_all.xlsx")
input =openxlsx::read.xlsx("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE135402_A549_TD_tgfb/readcount_all.xlsx")
library(dplyr)
input=input[,!(grepl(colnames(input),pattern = 'Chr')|
grepl(colnames(input),pattern = 'Start')|
grepl(colnames(input),pattern = 'End')|
grepl(colnames(input),pattern = 'Strand'))]
head(input)
colnames(input)[1]='Gene.ID'
colnames(input)[7]='Gene.Name'
head(input)
mydata=input %>% filter(Gene.Name=='RETNLB'|Gene.Name=='ACTB'|Gene.ID=='ENSG00000104918'|
Gene.Name=='RETN'|Gene.ID=='ENSG00000132465'|
Gene.ID=='ENSG00000211895'|Gene.ID=='ENSG00000092067')
mydata
dim(input)
下載表達(dá)矩陣和getgeo函數(shù)聯(lián)合使用文章來源:http://www.zghlxwxcb.cn/news/detail-615344.html
#http://zouyawen.top/2020/10/09/%E8%BD%AC%E5%BD%95%E7%BB%844/
getwd()
setwd("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE114761_various_A549_TGFB/")
expr.df=read.table("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE114761_various_A549_TGFB/geo2r.tsv",
header=T,
sep="\t", #Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec
comment.char="!")
head(expr.df)
expr.df[1:10,]
#colnames(expr.df)=expr.df[1,]
input=expr.df
head(input)
library(dplyr)
input=input[,!(grepl(colnames(input),pattern = 'Chr')|
grepl(colnames(input),pattern = 'Start')|
grepl(colnames(input),pattern = 'End')|
grepl(colnames(input),pattern = 'Strand'))]
head(input)
grep(input$Gene.title,pattern = 'RETN')
input[grep(input$Gene.symbol,pattern = 'RETN'),]
input[grep(input$Gene.symbol,pattern = 'IGHA1'),]
input[grep(input$Gene.title,pattern = 'immunoglobulin heavy'),]
library(GEOquery)
f='GSE114761_eSet.Rdata'
if(!file.exists(f)){
gset <- getGEO('GSE114761', destdir=".",
AnnotGPL = F, ## 注釋文件
getGPL = F) ## 平臺(tái)文件
save(gset,file=f) ## 保存到本地
}
getwd()
load('GSE114761_eSet.Rdata') ## 載入數(shù)據(jù)
class(gset) #查看數(shù)據(jù)類型
length(gset) #
class(gset[[1]])
gset
exprs(gset[[1]])
gset[[1]]
a=exprs(gset[[1]])
head(a)
a['220570_at',]
a[rownames(a)=='220570_at',]
input[grep(input$Gene.symbol,pattern = 'IGHA1'),][,1]
a[rownames(a) %in% input[grep(input$Gene.symbol,pattern = 'IGHA1'),][,1],]
a[rownames(a) %in% input[grep(input$Gene.title,pattern = 'IGCJ'),][,1],]
myexpression=as.data.frame(a)
head(myexpression)
myexpression$probe_id=rownames(myexpression)
head(myexpression)
head(input)
input$probe_id=input$ID
head(input)
mydata=merge(myexpression,input,by='probe_id')
head(mydata)
colnames(a)
boxplot(mydata[,2:42])
getwd()
save(mydata,file = "G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE114761_various_A549_TGFB/myexpression_matrix.rds")
load("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE114761_various_A549_TGFB/myexpression_matrix.rds")
colnames(mydata)
a=read.table(file = 'clipboard')
head(a)
a=a[-1,]
a
colnames(mydata)[2:43]=a$V2
head(mydata)
mydata[mydata$Gene.symbol=='RETN'|
mydata$Gene.symbol=='IGHA1'|
mydata$Gene.symbol=='RETNLB',]
讀取excel表達(dá)矩陣文章來源地址http://www.zghlxwxcb.cn/news/detail-615344.html
getwd()
setwd("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE164160_CMT64_TGFB_Alveogenic lung carcinoma/")
syndecan_fibrosis=openxlsx::read.xlsx("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE164160_CMT64_TGFB_Alveogenic lung carcinoma/GSE164160_RAW/GSM4998575_TGFb_1.clean_trimmed_GE_.xlsx",
colNames = T)
head(syndecan_fibrosis)
input=syndecan_fibrosis
library(dplyr)
input=input[,!(grepl(colnames(input),pattern = 'Region')|
grepl(colnames(input),pattern = 'Start')|
grepl(colnames(input),pattern = 'End')|
grepl(colnames(input),pattern = 'Strand'))]
head(input)
mydata=input %>% filter(Name=='Regnla'|Name=='Actb'|ENSEMBL=='ENSMUSG00000061100'|ENSEMBL=='ENSMUSG00000015839'|
Name=='Retnlg'|ENSEMBL=='ENSMUSG00000052435'|
ENSEMBL=='ENSMUSG00000095079'|ENSEMBL=='ENSMUSG00000067149')
mydata
dim(input)
#################################33
syndecan_fibrosis=openxlsx::read.xlsx("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE164160_CMT64_TGFB_Alveogenic lung carcinoma/GSE164160_RAW/GSM4998576_TNFa_1.clean_trimmed_GE_.xlsx",
colNames = T)
head(syndecan_fibrosis)
input=syndecan_fibrosis
library(dplyr)
input=input[,!(grepl(colnames(input),pattern = 'Region')|
grepl(colnames(input),pattern = 'Start')|
grepl(colnames(input),pattern = 'End')|
grepl(colnames(input),pattern = 'Strand'))]
head(input)
mydata=input %>% filter(Name=='Regnla'|Name=='Actb'|ENSEMBL=='ENSMUSG00000061100'|ENSEMBL=='ENSMUSG00000015839'|
Name=='Retnlg'|ENSEMBL=='ENSMUSG00000052435'|
ENSEMBL=='ENSMUSG00000095079'|ENSEMBL=='ENSMUSG00000067149')
mydata
dim(input)
到了這里,關(guān)于geo讀取表達(dá)矩陣 RNA-seq R語言部分(表達(dá)矩陣合并及id轉(zhuǎn)換)的文章就介紹完了。如果您還想了解更多內(nèi)容,請(qǐng)?jiān)谟疑辖撬阉鱐OY模板網(wǎng)以前的文章或繼續(xù)瀏覽下面的相關(guān)文章,希望大家以后多多支持TOY模板網(wǎng)!