1. RTCGA包(了解)
数据库式的R包
优点:数据库式,将所有数据打包下载下来,可以在电脑上直接提取数据。
缺点:不是最新的数据。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 | library (RTCGA.rnaseq) #library(RTCGA.miRNASeq) library (RTCGA.clinical) ls ( "package:RTCGA.rnaseq" ) #> [1] "ACC.rnaseq" "BLCA.rnaseq" "BRCA.rnaseq" "CESC.rnaseq" #> [5] "CHOL.rnaseq" "COAD.rnaseq" "COADREAD.rnaseq" "DLBC.rnaseq" #> [9] "ESCA.rnaseq" "GBM.rnaseq" "GBMLGG.rnaseq" "HNSC.rnaseq" #> [13] "KICH.rnaseq" "KIPAN.rnaseq" "KIRC.rnaseq" "KIRP.rnaseq" #> [17] "LAML.rnaseq" "LGG.rnaseq" "LIHC.rnaseq" "LUAD.rnaseq" #> [21] "LUSC.rnaseq" "OV.rnaseq" "PAAD.rnaseq" "PCPG.rnaseq" #> [25] "PRAD.rnaseq" "READ.rnaseq" "SARC.rnaseq" "SKCM.rnaseq" #> [29] "STAD.rnaseq" "STES.rnaseq" "TGCT.rnaseq" "THCA.rnaseq" #> [33] "THYM.rnaseq" "UCEC.rnaseq" "UCS.rnaseq" "UVM.rnaseq" ls ( "package:RTCGA.clinical" ) #> [1] "ACC.clinical" "BLCA.clinical" "BRCA.clinical" #> [4] "CESC.clinical" "CHOL.clinical" "COAD.clinical" #> [7] "COADREAD.clinical" "DLBC.clinical" "ESCA.clinical" #> [10] "FPPP.clinical" "GBM.clinical" "GBMLGG.clinical" #> [13] "HNSC.clinical" "KICH.clinical" "KIPAN.clinical" #> [16] "KIRC.clinical" "KIRP.clinical" "LAML.clinical" #> [19] "LGG.clinical" "LIHC.clinical" "LUAD.clinical" #> [22] "LUSC.clinical" "MESO.clinical" "OV.clinical" #> [25] "PAAD.clinical" "PCPG.clinical" "PRAD.clinical" #> [28] "READ.clinical" "SARC.clinical" "SKCM.clinical" #> [31] "STAD.clinical" "STES.clinical" "TGCT.clinical" #> [34] "THCA.clinical" "THYM.clinical" "UCEC.clinical" #> [37] "UCS.clinical" "UVM.clinical" #表达矩阵 exp2 = t (CHOL.rnaseq) exp2[1:4,1:4] #> [,1] #> bcr_patient_barcode "TCGA-3X-AAV9-01A-72R-A41I-07" #> ?|100130426 "0.0000" #> ?|100133144 " 2.2265" #> ?|100134869 "21.7256" #> [,2] #> bcr_patient_barcode "TCGA-3X-AAVA-01A-11R-A41I-07" #> ?|100130426 "0.0000" #> ?|100133144 " 4.0766" #> ?|100134869 "22.2241" #> [,3] #> bcr_patient_barcode "TCGA-3X-AAVB-01A-31R-A41I-07" #> ?|100130426 "0.0000" #> ?|100133144 " 1.4149" #> ?|100134869 " 4.4196" #> [,4] #> bcr_patient_barcode "TCGA-3X-AAVC-01A-21R-A41I-07" #> ?|100130426 "0.0000" #> ?|100133144 " 0.0000" #> ?|100134869 " 3.8152" dim (exp2) #> [1] 20532 45 clinical2 = CHOL.clinical #去除临床信息里全是NA的列 tmp = apply (clinical2, 2, function (x){ all ( is.na (x)|x== "" )}) clinical2 = clinical2[,!tmp] |
2. TCGAbiolinks(了解)
基于网页选择+GDC下载,自动化整理
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 | cancer_type= "TCGA-CHOL" library (TCGAbiolinks) #> Warning: package 'TCGAbiolinks' was built under R version 4.0.2 clinical <- GDCquery_clinic (project = cancer_type, type = "clinical" ) clinical[1:4,1:4] #> submitter_id NA. tumor_grade ajcc_pathologic_m #> 1 TCGA-4G-AAZT NA not reported M0 #> 2 TCGA-ZH-A8Y2 NA not reported M0 #> 3 TCGA-W5-AA2J NA not reported M0 #> 4 TCGA-W5-AA31 NA not reported M0 dim (clinical) #> [1] 51 158 #有些列全是NA,去掉。 tmp = apply (clinical, 2, function (x){ all ( is.na (x)|x== "" )}) clinical = clinical[,!tmp] query <- GDCquery (project = cancer_type, data.category = "Transcriptome Profiling" , data.type = "Gene Expression Quantification" , workflow.type = "HTSeq - Counts" ) if (! dir.exists ( "GDCdata" )) GDCdownload (query, method = "api" , files.per.chunk = 50) expdat <- GDCprepare (query = query) #> | | 0% |= |2.222222% ~6 s remaining |== |4.444444% ~5 s remaining |=== |6.666667% ~5 s remaining |==== |8.888889% ~4 s remaining |===== |11.11111% ~4 s remaining |====== |13.33333% ~4 s remaining |======== |15.55556% ~4 s remaining |========= |17.77778% ~4 s remaining |========== | 20% ~4 s remaining |=========== |22.22222% ~4 s remaining |============ |24.44444% ~3 s remaining |============= |26.66667% ~3 s remaining |=============== |28.88889% ~3 s remaining |================ |31.11111% ~3 s remaining |================= |33.33333% ~3 s remaining |================== |35.55556% ~4 s remaining |=================== |37.77778% ~3 s remaining |==================== | 40% ~3 s remaining |===================== |42.22222% ~3 s remaining |======================= |44.44444% ~3 s remaining |======================== |46.66667% ~3 s remaining |========================= |48.88889% ~3 s remaining |========================== |51.11111% ~2 s remaining |=========================== |53.33333% ~2 s remaining |============================ |55.55556% ~2 s remaining |============================== |57.77778% ~2 s remaining |=============================== | 60% ~2 s remaining |================================ |62.22222% ~2 s remaining |================================= |64.44444% ~2 s remaining |================================== |66.66667% ~2 s remaining |=================================== |68.88889% ~1 s remaining |==================================== |71.11111% ~1 s remaining |====================================== |73.33333% ~1 s remaining |======================================= |75.55556% ~1 s remaining |======================================== |77.77778% ~1 s remaining |========================================= | 80% ~1 s remaining |========================================== |82.22222% ~1 s remaining |=========================================== |84.44444% ~1 s remaining |============================================= |86.66667% ~1 s remaining |============================================== |88.88889% ~1 s remaining |=============================================== |91.11111% ~0 s remaining |================================================ |93.33333% ~0 s remaining |================================================= |95.55556% ~0 s remaining |================================================== |97.77778% ~0 s remaining |====================================================|100% ~0 s remaining |====================================================|100% Completed after 5 s library (SummarizedExperiment) #> Warning: package 'SummarizedExperiment' was built under R version 4.0.2 #> Warning: package 'DelayedArray' was built under R version 4.0.2 exp = assay (expdat) exp[1:3,1:3] #> TCGA-4G-AAZT-01A-11R-A41I-07 TCGA-W5-AA2R-01A-11R-A41I-07 #> ENSG00000000003 7542 4802 #> ENSG00000000005 0 1 #> ENSG00000000419 1121 1198 #> TCGA-W5-AA34-01A-11R-A41I-07 #> ENSG00000000003 8150 #> ENSG00000000005 0 #> ENSG00000000419 1770 #过滤一下 dim (exp) #> [1] 56493 45 exp = exp[ apply (exp, 1, function (x) sum (x > 1) > 9), ] dim (exp) #> [1] 29729 45 exp[1:4,1:4] #> TCGA-4G-AAZT-01A-11R-A41I-07 TCGA-W5-AA2R-01A-11R-A41I-07 #> ENSG00000000003 7542 4802 #> ENSG00000000419 1121 1198 #> ENSG00000000457 403 1099 #> ENSG00000000460 127 290 #> TCGA-W5-AA34-01A-11R-A41I-07 TCGA-W5-AA2G-01A-11R-A41I-07 #> ENSG00000000003 8150 1777 #> ENSG00000000419 1770 1299 #> ENSG00000000457 1202 653 #> ENSG00000000460 378 204 |
3. UCSC Xena(重点)
浏览器网站下载: https://xenabrowser.net/datapages/
优点:分类打包,直接下载
缺点:更新较慢,不是最新数据
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 | exp3 = read.table ( "HiSeqV2" ,header = T,row.names = 1,check.names = F) exp3 = as.matrix (exp3) clinical3 = data.table:: fread ( "CHOL_clinicalMatrix" ) dim (exp3) #> [1] 20530 45 exp3 = exp3[ apply (exp3, 1, function (x) sum (x > 1) > 9), ] dim (exp3) #> [1] 17710 45 exp3[1:4,1:4] #> TCGA-4G-AAZT-01 TCGA-W5-AA2R-11 TCGA-W5-AA36-01 TCGA-W5-AA2U-01 #> ARHGEF10L 10.1826 11.2568 10.6879 10.4950 #> HIF3A 5.5601 4.6809 1.4479 4.5270 #> RNF10 11.6858 11.9161 12.1831 10.8378 #> RNF11 10.6041 11.1185 9.3851 10.6367 clinical3[1:4,1:4] #> sampleID _INTEGRATION _PATIENT _cohort #> 1: TCGA-3X-AAV9-01 TCGA-3X-AAV9-01 TCGA-3X-AAV9 TCGA Bile Duct Cancer (CHOL) #> 2: TCGA-3X-AAVA-01 TCGA-3X-AAVA-01 TCGA-3X-AAVA TCGA Bile Duct Cancer (CHOL) #> 3: TCGA-3X-AAVB-01 TCGA-3X-AAVB-01 TCGA-3X-AAVB TCGA Bile Duct Cancer (CHOL) #> 4: TCGA-3X-AAVC-01 TCGA-3X-AAVC-01 TCGA-3X-AAVC TCGA Bile Duct Cancer (CHOL) |
4. GDCRNATools(了解)
基于gdc-client下载并简化整理,完全用R语言完成
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 | rm (list = ls ()) library (GDCRNATools) project <- "TCGA-CHOL" rnadir <- paste (project, 'RNAseq' , sep= '/' ) mirdir <- paste (project, 'miRNAs' , sep= '/' ) clinicaldir <- paste (project, 'Clinical' , sep= '/' ) gdcRNADownload (project.id = project, data.type = 'RNAseq' , write.manifest = FALSE , method = 'gdc-client' , directory = rnadir) gdcRNADownload (project.id = project, data.type = 'miRNAs' , write.manifest = FALSE , method = 'gdc-client' , directory = mirdir) gdcClinicalDownload (project.id = project, write.manifest = FALSE , method = 'gdc-client' , directory = clinicaldir) #2.临床信息解析---- metaMatrix.RNA <- gdcParseMetadata (project.id = project, data.type = 'RNAseq' , write.meta = FALSE ) metaMatrix.RNA <- gdcFilterDuplicate (metaMatrix.RNA) metaMatrix.RNA <- gdcFilterSampleType (metaMatrix.RNA) dim (metaMatrix.RNA) #> [1] 45 15 metaMatrix.MIR <- gdcParseMetadata (project.id = project, data.type = 'miRNAs' , write.meta = FALSE ) metaMatrix.MIR <- gdcFilterDuplicate (metaMatrix.MIR) metaMatrix.MIR <- gdcFilterSampleType (metaMatrix.MIR) dim (metaMatrix.MIR) #> [1] 45 15 clinicalDa <- gdcClinicalMerge (path = clinicaldir, key.info = TRUE ) clinicalDa[1:2,1:5] #> age_at_initial_pathologic_diagnosis ethnicity gender #> TCGA-W5-AA2Q 68 NOT HISPANIC OR LATINO MALE #> TCGA-W5-AA2O 57 NOT HISPANIC OR LATINO MALE #> race clinical_stage #> TCGA-W5-AA2Q WHITE NA #> TCGA-W5-AA2O WHITE NA #3.count矩阵---- rnaCounts <- gdcRNAMerge (metadata = metaMatrix.RNA, path = rnadir, organized = FALSE , data.type = 'RNAseq' ) mirCounts <- gdcRNAMerge (metadata = metaMatrix.MIR, path = mirdir, organized = FALSE , data.type = 'miRNAs' ) dim (rnaCounts) #> [1] 60483 45 dim (mirCounts) #> [1] 2588 45 |
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· 记一次.NET内存居高不下排查解决与启示
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!