ATAC-seq pre-practice

下载数据包作为材料:https://gdc.cancer.gov/about-data/publications/ATACseq-AWG,选择下载 All cancer type-specific peak sets. [ZIP] 和 Pan-cancer peak set. [TXT]两个文件。

安装需要用到的包:

# to read txt files——library(readr)   

# to transform data into GenomicRanges——library(GenomicRanges)

# other ones used to prepare the data——library(tidyr),library(dplyr) ,library(SummarizedExperiment)

# For the t.test loop ——library(plyr)

# For easy volcano plot ——library(TCGAbiolinks)

# For heatmap plot ——library(ComplexHeatmap) ,library(circlize)

# For the bigwig plot ——library(karyoploteR),library(TxDb.Hsapiens.UCSC.hg38.knownGene)

如果install.pachages的时候报错。可以使用biocondunctor上的代码进行加载。

处理数据:

case1<-readr::read_tsv("kl/ESCA_peakCalls.txt")
head(case1)
message("Range Score:",paste0(range(case1$score),collapse = "-"))
case2<-readr::read_tsv("kl/TCGA-ATAC_PanCancer_PeakSet.txt")
head(case2)
plyr::count(stringr::str_split(case2$name,"_",simplify=T)[,1])
message("Range Score:",paste0(range(case2$score),collapse="-"))
library(GenomicRanges)
case1.gr<-makeGRangesFromDataFrame(case1,keep.extra.columns = T)
case2.gr<-makeGRangesFromDataFrame(case2,keep.extra.columns = T)
length(subsetByOverlaps(case1.gr,case2.gr))
"ESCA_17603"%in%case2.gr$name
subsetByOverlaps(case1.gr,case2.gr[case2.gr$name=="ACC_10008"])
subsetByOverlaps(case2.gr[case2.gr$name=="ACC_10008"],case1.gr)
unique(width(case1.gr))
unique(width(case2.gr))

case3<-readr::read_tsv("Data/ESCA_log2norm.txt")
case3[1:4,1:8]
w.file<-"https://api.gdc.cancer.gov/data/7a3d7067-09d6-4acf-82c8-a1a81febf72c"
sam<-readr::read_tsv(w.file)
sam<-readr::read_tsv(w.file,col_types = readr::cols())
sam$sample<-substr(sam$Case_ID,1,16)
colnames(case3)[-c(1:5)] <- sam$Case_ID[match(gsub("_","-",colnames(case3)[-c(1:5)]),sam$bam_prefix)]
case3[1:4,1:8]
case.all<-case3
num<-c(1:5)
sam.info <- TCGAbiolinks:::colDataPrepare(unique(colnames(case.all)[-c(num)]))
head(sam.info)[,c("sample","primary_diagnosis")]
sam.map <- gsub(",| |NOS","",gsub("Adenocarcinoma","ESAD",gsub("Squamous cell carcinoma","ESCC",paste0(sam.info$primary_diagnosis,"-",sam.info$sample))))
#======
samples.map <- gsub(",| |NOS","",gsub("Adenocarcinoma","ESAD",gsub("Squamous cell carcinoma","ESCC",paste0(samples.info$primary_diagnosis,"-",samples.info$sample))))
colnames(atac)[-c(non.cts.idx)] <- samples.map[match(substr(colnames(atac)[-c(non.cts.idx)],1,16),substr(samples.map,6,21))]
#=====
colnames(case.all)[-c(num)]<-sam.map[match(substr(colnames(case.all)[-c(num)],1,16),substr(sam.map,6,21))]
counts<-case.all[,-c(1:5)]
head(counts)
rowRanges <- makeGRangesFromDataFrame(case.all)
rowRanges$score <- case.all$score
rowRanges$name <- case.all$name
names(rowRanges) <- paste(case.all$name,case.all$seqnames,case.all$start,case.all$end, sep = "_")
rowRanges
sam$sample <- substr(sam$Case_ID,1,16)
?left_join
library(dplyr)
colData <- unique(left_join(sam.info,sam))
head(colData)
library(SummarizedExperiment)
esca.rse <- SummarizedExperiment(assays=SimpleList(log2norm=as.matrix(counts)),
rowRanges = rowRanges,
colData = DataFrame(colData))

esca.rse

至此,分析之前所需要的数据读取,名称规范(UUIDs to TCGA barcodes)及数据结构查看就完成啦~

 

posted @ 2019-10-23 19:56  一曲春山几多恨  阅读(252)  评论(0编辑  收藏  举报