教小高改bug

  博客园 :: 首页 :: 博问 :: 闪存 :: :: 联系 :: :: 管理 ::
  49 随笔 :: 28 文章 :: 20 评论 :: 78333 阅读

1. RTCGA包(了解)

数据库式的R包

优点:数据库式,将所有数据打包下载下来,可以在电脑上直接提取数据。

缺点:不是最新的数据。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
library(RTCGA.rnaseq)
#library(RTCGA.miRNASeq)
library(RTCGA.clinical)
ls("package:RTCGA.rnaseq")
#>  [1] "ACC.rnaseq"      "BLCA.rnaseq"     "BRCA.rnaseq"     "CESC.rnaseq"   
#>  [5] "CHOL.rnaseq"     "COAD.rnaseq"     "COADREAD.rnaseq" "DLBC.rnaseq"   
#>  [9] "ESCA.rnaseq"     "GBM.rnaseq"      "GBMLGG.rnaseq"   "HNSC.rnaseq"   
#> [13] "KICH.rnaseq"     "KIPAN.rnaseq"    "KIRC.rnaseq"     "KIRP.rnaseq"   
#> [17] "LAML.rnaseq"     "LGG.rnaseq"      "LIHC.rnaseq"     "LUAD.rnaseq"   
#> [21] "LUSC.rnaseq"     "OV.rnaseq"       "PAAD.rnaseq"     "PCPG.rnaseq"   
#> [25] "PRAD.rnaseq"     "READ.rnaseq"     "SARC.rnaseq"     "SKCM.rnaseq"   
#> [29] "STAD.rnaseq"     "STES.rnaseq"     "TGCT.rnaseq"     "THCA.rnaseq"   
#> [33] "THYM.rnaseq"     "UCEC.rnaseq"     "UCS.rnaseq"      "UVM.rnaseq"
ls("package:RTCGA.clinical")
#>  [1] "ACC.clinical"      "BLCA.clinical"     "BRCA.clinical"   
#>  [4] "CESC.clinical"     "CHOL.clinical"     "COAD.clinical"   
#>  [7] "COADREAD.clinical" "DLBC.clinical"     "ESCA.clinical"   
#> [10] "FPPP.clinical"     "GBM.clinical"      "GBMLGG.clinical" 
#> [13] "HNSC.clinical"     "KICH.clinical"     "KIPAN.clinical"  
#> [16] "KIRC.clinical"     "KIRP.clinical"     "LAML.clinical"   
#> [19] "LGG.clinical"      "LIHC.clinical"     "LUAD.clinical"   
#> [22] "LUSC.clinical"     "MESO.clinical"     "OV.clinical"     
#> [25] "PAAD.clinical"     "PCPG.clinical"     "PRAD.clinical"   
#> [28] "READ.clinical"     "SARC.clinical"     "SKCM.clinical"   
#> [31] "STAD.clinical"     "STES.clinical"     "TGCT.clinical"   
#> [34] "THCA.clinical"     "THYM.clinical"     "UCEC.clinical"   
#> [37] "UCS.clinical"      "UVM.clinical"
#表达矩阵
exp2 = t(CHOL.rnaseq)
exp2[1:4,1:4]
#>                     [,1]                         
#> bcr_patient_barcode "TCGA-3X-AAV9-01A-72R-A41I-07"
#> ?|100130426         "0.0000"                     
#> ?|100133144         " 2.2265"                    
#> ?|100134869         "21.7256"                    
#>                     [,2]                         
#> bcr_patient_barcode "TCGA-3X-AAVA-01A-11R-A41I-07"
#> ?|100130426         "0.0000"                     
#> ?|100133144         " 4.0766"                    
#> ?|100134869         "22.2241"                    
#>                     [,3]                         
#> bcr_patient_barcode "TCGA-3X-AAVB-01A-31R-A41I-07"
#> ?|100130426         "0.0000"                     
#> ?|100133144         " 1.4149"                    
#> ?|100134869         " 4.4196"                    
#>                     [,4]                         
#> bcr_patient_barcode "TCGA-3X-AAVC-01A-21R-A41I-07"
#> ?|100130426         "0.0000"                     
#> ?|100133144         " 0.0000"                    
#> ?|100134869         " 3.8152"
dim(exp2)
#> [1] 20532    45
clinical2 = CHOL.clinical
#去除临床信息里全是NA的列
tmp = apply(clinical2, 2, function(x){all(is.na(x)|x=="")})
clinical2 = clinical2[,!tmp]

2. TCGAbiolinks(了解)

基于网页选择+GDC下载,自动化整理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
cancer_type="TCGA-CHOL"
library(TCGAbiolinks)
#> Warning: package 'TCGAbiolinks' was built under R version 4.0.2
clinical <- GDCquery_clinic(project = cancer_type,
                            type = "clinical")
clinical[1:4,1:4]
#>   submitter_id NA.  tumor_grade ajcc_pathologic_m
#> 1 TCGA-4G-AAZT  NA not reported                M0
#> 2 TCGA-ZH-A8Y2  NA not reported                M0
#> 3 TCGA-W5-AA2J  NA not reported                M0
#> 4 TCGA-W5-AA31  NA not reported                M0
 
dim(clinical)
#> [1]  51 158
#有些列全是NA,去掉。
tmp = apply(clinical, 2, function(x){all(is.na(x)|x=="")})
clinical = clinical[,!tmp]
query <- GDCquery(project = cancer_type,
                  data.category = "Transcriptome Profiling",
                  data.type = "Gene Expression Quantification",
                  workflow.type = "HTSeq - Counts")
if(!dir.exists("GDCdata")) GDCdownload(query, method = "api", files.per.chunk = 50)
 
expdat <- GDCprepare(query = query)
#>
|                                                    |  0%                     
|=                                                   |2.222222% ~6 s remaining 
|==                                                  |4.444444% ~5 s remaining 
|===                                                 |6.666667% ~5 s remaining 
|====                                                |8.888889% ~4 s remaining 
|=====                                               |11.11111% ~4 s remaining 
|======                                              |13.33333% ~4 s remaining 
|========                                            |15.55556% ~4 s remaining 
|=========                                           |17.77778% ~4 s remaining 
|==========                                          | 20% ~4 s remaining      
|===========                                         |22.22222% ~4 s remaining 
|============                                        |24.44444% ~3 s remaining 
|=============                                       |26.66667% ~3 s remaining 
|===============                                     |28.88889% ~3 s remaining 
|================                                    |31.11111% ~3 s remaining 
|=================                                   |33.33333% ~3 s remaining 
|==================                                  |35.55556% ~4 s remaining 
|===================                                 |37.77778% ~3 s remaining 
|====================                                | 40% ~3 s remaining      
|=====================                               |42.22222% ~3 s remaining 
|=======================                             |44.44444% ~3 s remaining 
|========================                            |46.66667% ~3 s remaining 
|=========================                           |48.88889% ~3 s remaining 
|==========================                          |51.11111% ~2 s remaining 
|===========================                         |53.33333% ~2 s remaining 
|============================                        |55.55556% ~2 s remaining 
|==============================                      |57.77778% ~2 s remaining 
|===============================                     | 60% ~2 s remaining      
|================================                    |62.22222% ~2 s remaining 
|=================================                   |64.44444% ~2 s remaining 
|==================================                  |66.66667% ~2 s remaining 
|===================================                 |68.88889% ~1 s remaining 
|====================================                |71.11111% ~1 s remaining 
|======================================              |73.33333% ~1 s remaining 
|=======================================             |75.55556% ~1 s remaining 
|========================================            |77.77778% ~1 s remaining 
|=========================================           | 80% ~1 s remaining      
|==========================================          |82.22222% ~1 s remaining 
|===========================================         |84.44444% ~1 s remaining 
|=============================================       |86.66667% ~1 s remaining 
|==============================================      |88.88889% ~1 s remaining 
|===============================================     |91.11111% ~0 s remaining 
|================================================    |93.33333% ~0 s remaining 
|=================================================   |95.55556% ~0 s remaining 
|==================================================  |97.77778% ~0 s remaining 
|====================================================|100% ~0 s remaining      
|====================================================|100%                      Completed after 5 s
library(SummarizedExperiment)
#> Warning: package 'SummarizedExperiment' was built under R version 4.0.2
#> Warning: package 'DelayedArray' was built under R version 4.0.2
exp = assay(expdat)
exp[1:3,1:3]
#>                 TCGA-4G-AAZT-01A-11R-A41I-07 TCGA-W5-AA2R-01A-11R-A41I-07
#> ENSG00000000003                         7542                         4802
#> ENSG00000000005                            0                            1
#> ENSG00000000419                         1121                         1198
#>                 TCGA-W5-AA34-01A-11R-A41I-07
#> ENSG00000000003                         8150
#> ENSG00000000005                            0
#> ENSG00000000419                         1770
#过滤一下
dim(exp)
#> [1] 56493    45
exp = exp[apply(exp, 1, function(x) sum(x > 1) > 9), ]
dim(exp)
#> [1] 29729    45
exp[1:4,1:4]
#>                 TCGA-4G-AAZT-01A-11R-A41I-07 TCGA-W5-AA2R-01A-11R-A41I-07
#> ENSG00000000003                         7542                         4802
#> ENSG00000000419                         1121                         1198
#> ENSG00000000457                          403                         1099
#> ENSG00000000460                          127                          290
#>                 TCGA-W5-AA34-01A-11R-A41I-07 TCGA-W5-AA2G-01A-11R-A41I-07
#> ENSG00000000003                         8150                         1777
#> ENSG00000000419                         1770                         1299
#> ENSG00000000457                         1202                          653
#> ENSG00000000460                          378                          204

3. UCSC Xena(重点)

浏览器网站下载:  https://xenabrowser.net/datapages/  

优点:分类打包,直接下载

缺点:更新较慢,不是最新数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
exp3 = read.table("HiSeqV2",header = T,row.names = 1,check.names = F)
exp3 = as.matrix(exp3)
clinical3 = data.table::fread("CHOL_clinicalMatrix")
dim(exp3)
#> [1] 20530    45
exp3 = exp3[apply(exp3, 1, function(x) sum(x > 1) > 9), ]
dim(exp3)
#> [1] 17710    45
exp3[1:4,1:4]
#>           TCGA-4G-AAZT-01 TCGA-W5-AA2R-11 TCGA-W5-AA36-01 TCGA-W5-AA2U-01
#> ARHGEF10L         10.1826         11.2568         10.6879         10.4950
#> HIF3A              5.5601          4.6809          1.4479          4.5270
#> RNF10             11.6858         11.9161         12.1831         10.8378
#> RNF11             10.6041         11.1185          9.3851         10.6367
clinical3[1:4,1:4]
#>           sampleID    _INTEGRATION     _PATIENT                      _cohort
#> 1: TCGA-3X-AAV9-01 TCGA-3X-AAV9-01 TCGA-3X-AAV9 TCGA Bile Duct Cancer (CHOL)
#> 2: TCGA-3X-AAVA-01 TCGA-3X-AAVA-01 TCGA-3X-AAVA TCGA Bile Duct Cancer (CHOL)
#> 3: TCGA-3X-AAVB-01 TCGA-3X-AAVB-01 TCGA-3X-AAVB TCGA Bile Duct Cancer (CHOL)
#> 4: TCGA-3X-AAVC-01 TCGA-3X-AAVC-01 TCGA-3X-AAVC TCGA Bile Duct Cancer (CHOL)

4. GDCRNATools(了解)

基于gdc-client下载并简化整理,完全用R语言完成

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
rm(list = ls())
library(GDCRNATools)
project <- "TCGA-CHOL"
rnadir <- paste(project, 'RNAseq', sep='/')
mirdir <- paste(project, 'miRNAs', sep='/')
clinicaldir <- paste(project, 'Clinical', sep='/')
gdcRNADownload(project.id     = project,
               data.type      = 'RNAseq',
               write.manifest = FALSE,
               method         = 'gdc-client',
               directory      = rnadir)
gdcRNADownload(project.id     = project,
               data.type      = 'miRNAs',
               write.manifest = FALSE,
               method         = 'gdc-client',
               directory      = mirdir)
gdcClinicalDownload(project.id     = project,
                    write.manifest = FALSE,
                    method         = 'gdc-client',
                    directory      = clinicaldir)
#2.临床信息解析----
metaMatrix.RNA <- gdcParseMetadata(project.id = project,
                                   data.type  = 'RNAseq',
                                   write.meta = FALSE)
metaMatrix.RNA <- gdcFilterDuplicate(metaMatrix.RNA)
metaMatrix.RNA <- gdcFilterSampleType(metaMatrix.RNA)
dim(metaMatrix.RNA)
#> [1] 45 15
metaMatrix.MIR <- gdcParseMetadata(project.id = project,
                                   data.type  = 'miRNAs',
                                   write.meta = FALSE)
metaMatrix.MIR <- gdcFilterDuplicate(metaMatrix.MIR)
metaMatrix.MIR <- gdcFilterSampleType(metaMatrix.MIR)
dim(metaMatrix.MIR)
#> [1] 45 15
 
clinicalDa <- gdcClinicalMerge(path = clinicaldir,
                               key.info = TRUE)
clinicalDa[1:2,1:5]
#>              age_at_initial_pathologic_diagnosis              ethnicity gender
#> TCGA-W5-AA2Q                                  68 NOT HISPANIC OR LATINO   MALE
#> TCGA-W5-AA2O                                  57 NOT HISPANIC OR LATINO   MALE
#>               race clinical_stage
#> TCGA-W5-AA2Q WHITE             NA
#> TCGA-W5-AA2O WHITE             NA
 
#3.count矩阵----
rnaCounts <- gdcRNAMerge(metadata  = metaMatrix.RNA,
                         path      = rnadir,
                         organized = FALSE,
                         data.type = 'RNAseq')
mirCounts <- gdcRNAMerge(metadata  = metaMatrix.MIR,
                         path      = mirdir,
                         organized = FALSE,
                         data.type = 'miRNAs')
dim(rnaCounts)
#> [1] 60483    45
dim(mirCounts)
#> [1] 2588   45

 

posted on   小高不高  阅读(639)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· 记一次.NET内存居高不下排查解决与启示
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
点击右上角即可分享
微信分享提示