R获取指定GO term和KEGG pathway的gene list基因集
2022年09月06日
新方法:R | 提取GO分类下的所有基因
1 2 3 4 5 6 7 8 9 | library (tidyverse) # library(org.Hs.eg.db) library (org.Mm.eg.db) GOID <- c ( "GO:0042573" ) # GOgeneID <- get(GOID, org.Hs.egGO2ALLEGS) %>% mget(org.Hs.egSYMBOL) %>% unlist() GOgeneID <- get (GOID, org.Mm.egGO2ALLEGS) %>% mget (org.Mm.egSYMBOL) %>% unlist () |
clusterProfiler没有显性的接口,但是可以直接扣取clusterProfiler里的函数。
核心函数就是get_GO_data
1 | GO_DATA <- get_GO_data ( "org.Hs.eg.db" , "BP" , "SYMBOL" ) |
可以看到输入的是GO数据库,选定类别,基因名字类型,输出的就是整个数据库。
但是想调用这个函数没那么简单,得导入一系列的基础函数。
一个常见的任务就是获取GO数据库里所有的cell cycle相关的基因,这需要从我们的基因集里移除。
有了这个函数,我们就可以这么做了,两句R代码搞定。
1 2 3 4 5 | cellCycleGO <- names (GO_DATA$PATHID2NAME[ grep ( "cell cycle|DNA replication|cell division|segregation" , GO_DATA$PATHID2NAME)]) cellCycleGene <- unique ( unlist (GO_DATA$PATHID2EXTID[cellCycleGO])) print ( length (cellCycleGene)) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 | library (DOSE) library (GOSemSim) library (clusterProfiler) library (org.Hs.eg.db) # get_GO_data <- function (OrgDb, ont, keytype) { GO_Env <- get_GO_Env () use_cached <- FALSE if ( exists ( "organism" , envir=GO_Env, inherits= FALSE ) && exists ( "keytype" , envir=GO_Env, inherits= FALSE )) { org <- get ( "organism" , envir=GO_Env) kt <- get ( "keytype" , envir=GO_Env) if (org == DOSE::: get_organism (OrgDb) && keytype == kt && exists ( "goAnno" , envir=GO_Env, inherits= FALSE )) { ## https://github.com/GuangchuangYu/clusterProfiler/issues/182 ## && exists("GO2TERM", envir=GO_Env, inherits=FALSE)){ use_cached <- TRUE } } if (use_cached) { goAnno <- get ( "goAnno" , envir=GO_Env) } else { OrgDb <- GOSemSim::: load_OrgDb (OrgDb) kt <- keytypes (OrgDb) if (! keytype % in % kt) { stop ( "keytype is not supported..." ) } kk <- keys (OrgDb, keytype=keytype) goAnno <- suppressMessages ( select (OrgDb, keys=kk, keytype=keytype, columns= c ( "GOALL" , "ONTOLOGYALL" ))) goAnno <- unique (goAnno[! is.na (goAnno$GOALL), ]) assign ( "goAnno" , goAnno, envir=GO_Env) assign ( "keytype" , keytype, envir=GO_Env) assign ( "organism" , DOSE::: get_organism (OrgDb), envir=GO_Env) } if (ont == "ALL" ) { GO2GENE <- unique (goAnno[, c (2,1)]) } else { GO2GENE <- unique (goAnno[goAnno$ONTOLOGYALL == ont, c (2,1)]) } GO_DATA <- DOSE::: build_Anno (GO2GENE, get_GO2TERM_table ()) goOnt.df <- goAnno[, c ( "GOALL" , "ONTOLOGYALL" )] %>% unique goOnt <- goOnt.df[,2] names (goOnt) <- goOnt.df[,1] assign ( "GO2ONT" , goOnt, envir=GO_DATA) return (GO_DATA) } get_GO_Env <- function () { if (! exists ( ".GO_clusterProfiler_Env" , envir = .GlobalEnv)) { pos <- 1 envir <- as.environment (pos) assign ( ".GO_clusterProfiler_Env" , new.env (), envir=envir) } get ( ".GO_clusterProfiler_Env" , envir = .GlobalEnv) } get_GO2TERM_table <- function () { GOTERM.df <- get_GOTERM () GOTERM.df[, c ( "go_id" , "Term" )] %>% unique } get_GOTERM <- function () { pos <- 1 envir <- as.environment (pos) if (! exists ( ".GOTERM_Env" , envir=envir)) { assign ( ".GOTERM_Env" , new.env (), envir) } GOTERM_Env <- get ( ".GOTERM_Env" , envir = envir) if ( exists ( "GOTERM.df" , envir = GOTERM_Env)) { GOTERM.df <- get ( "GOTERM.df" , envir=GOTERM_Env) } else { GOTERM.df <- toTable (GOTERM) assign ( "GOTERM.df" , GOTERM.df, envir = GOTERM_Env) } return (GOTERM.df) } |
获取KEGG的通路和基因是一样的,也是用clusterProfiler
代码:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 | hsa_kegg <- clusterProfiler:: download_KEGG ( "hsa" ) names (hsa_kegg) head (hsa_kegg$KEGGPATHID2NAME) head (hsa_kegg$KEGGPATHID2EXTID) PATH2ID <- hsa_kegg$KEGGPATHID2EXTID PATH2NAME <- hsa_kegg$KEGGPATHID2NAME PATH_ID_NAME <- merge (PATH2ID, PATH2NAME, by= "from" ) colnames (PATH_ID_NAME) <- c ( "KEGGID" , "ENTREZID" , "DESCRPTION" ) # write.table(PATH_ID_NAME, "HSA_KEGG.txt", sep="\t") library (biomaRt) mart <- useDataset ( "hsapiens_gene_ensembl" , useMart ( "ensembl" )) entrezgene <- PATH_ID_NAME$ENTREZID # This step need some time ensembl_gene_id<- getBM (attributes= c ( "ensembl_gene_id" , "entrezgene" ), filters = "entrezgene" , values=entrezgene , mart= mart) PATH_ID_NAME <- merge (PATH_ID_NAME, ensembl_gene_id, by.x= "ENTREZID" ,by.y= "entrezgene" ) |
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· 记一次.NET内存居高不下排查解决与启示
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· .NET10 - 预览版1新功能体验(一)