R获取指定GO term和KEGG pathway的gene list基因集
2022年09月06日
新方法:R | 提取GO分类下的所有基因
library(tidyverse) # library(org.Hs.eg.db) library(org.Mm.eg.db) GOID <- c("GO:0042573") # GOgeneID <- get(GOID, org.Hs.egGO2ALLEGS) %>% mget(org.Hs.egSYMBOL) %>% unlist() GOgeneID <- get(GOID, org.Mm.egGO2ALLEGS) %>% mget(org.Mm.egSYMBOL) %>% unlist()
clusterProfiler没有显性的接口,但是可以直接扣取clusterProfiler里的函数。
核心函数就是get_GO_data
GO_DATA <- get_GO_data("org.Hs.eg.db", "BP", "SYMBOL")
可以看到输入的是GO数据库,选定类别,基因名字类型,输出的就是整个数据库。
但是想调用这个函数没那么简单,得导入一系列的基础函数。
一个常见的任务就是获取GO数据库里所有的cell cycle相关的基因,这需要从我们的基因集里移除。
有了这个函数,我们就可以这么做了,两句R代码搞定。
cellCycleGO <- names(GO_DATA$PATHID2NAME[grep("cell cycle|DNA replication|cell division|segregation", GO_DATA$PATHID2NAME)]) cellCycleGene <- unique(unlist(GO_DATA$PATHID2EXTID[cellCycleGO])) print(length(cellCycleGene))
library(DOSE) library(GOSemSim) library(clusterProfiler) library(org.Hs.eg.db) # get_GO_data <- function(OrgDb, ont, keytype) { GO_Env <- get_GO_Env() use_cached <- FALSE if (exists("organism", envir=GO_Env, inherits=FALSE) && exists("keytype", envir=GO_Env, inherits=FALSE)) { org <- get("organism", envir=GO_Env) kt <- get("keytype", envir=GO_Env) if (org == DOSE:::get_organism(OrgDb) && keytype == kt && exists("goAnno", envir=GO_Env, inherits=FALSE)) { ## https://github.com/GuangchuangYu/clusterProfiler/issues/182 ## && exists("GO2TERM", envir=GO_Env, inherits=FALSE)){ use_cached <- TRUE } } if (use_cached) { goAnno <- get("goAnno", envir=GO_Env) } else { OrgDb <- GOSemSim:::load_OrgDb(OrgDb) kt <- keytypes(OrgDb) if (! keytype %in% kt) { stop("keytype is not supported...") } kk <- keys(OrgDb, keytype=keytype) goAnno <- suppressMessages( select(OrgDb, keys=kk, keytype=keytype, columns=c("GOALL", "ONTOLOGYALL"))) goAnno <- unique(goAnno[!is.na(goAnno$GOALL), ]) assign("goAnno", goAnno, envir=GO_Env) assign("keytype", keytype, envir=GO_Env) assign("organism", DOSE:::get_organism(OrgDb), envir=GO_Env) } if (ont == "ALL") { GO2GENE <- unique(goAnno[, c(2,1)]) } else { GO2GENE <- unique(goAnno[goAnno$ONTOLOGYALL == ont, c(2,1)]) } GO_DATA <- DOSE:::build_Anno(GO2GENE, get_GO2TERM_table()) goOnt.df <- goAnno[, c("GOALL", "ONTOLOGYALL")] %>% unique goOnt <- goOnt.df[,2] names(goOnt) <- goOnt.df[,1] assign("GO2ONT", goOnt, envir=GO_DATA) return(GO_DATA) } get_GO_Env <- function () { if (!exists(".GO_clusterProfiler_Env", envir = .GlobalEnv)) { pos <- 1 envir <- as.environment(pos) assign(".GO_clusterProfiler_Env", new.env(), envir=envir) } get(".GO_clusterProfiler_Env", envir = .GlobalEnv) } get_GO2TERM_table <- function() { GOTERM.df <- get_GOTERM() GOTERM.df[, c("go_id", "Term")] %>% unique } get_GOTERM <- function() { pos <- 1 envir <- as.environment(pos) if (!exists(".GOTERM_Env", envir=envir)) { assign(".GOTERM_Env", new.env(), envir) } GOTERM_Env <- get(".GOTERM_Env", envir = envir) if (exists("GOTERM.df", envir = GOTERM_Env)) { GOTERM.df <- get("GOTERM.df", envir=GOTERM_Env) } else { GOTERM.df <- toTable(GOTERM) assign("GOTERM.df", GOTERM.df, envir = GOTERM_Env) } return(GOTERM.df) }
获取KEGG的通路和基因是一样的,也是用clusterProfiler
代码:
hsa_kegg <- clusterProfiler::download_KEGG("hsa") names(hsa_kegg) head(hsa_kegg$KEGGPATHID2NAME) head(hsa_kegg$KEGGPATHID2EXTID) PATH2ID <- hsa_kegg$KEGGPATHID2EXTID PATH2NAME <- hsa_kegg$KEGGPATHID2NAME PATH_ID_NAME <- merge(PATH2ID, PATH2NAME, by="from") colnames(PATH_ID_NAME) <- c("KEGGID", "ENTREZID", "DESCRPTION") # write.table(PATH_ID_NAME, "HSA_KEGG.txt", sep="\t") library(biomaRt) mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl")) entrezgene <- PATH_ID_NAME$ENTREZID # This step need some time ensembl_gene_id<- getBM(attributes=c("ensembl_gene_id", "entrezgene"), filters = "entrezgene", values=entrezgene , mart= mart) PATH_ID_NAME <- merge(PATH_ID_NAME, ensembl_gene_id, by.x= "ENTREZID",by.y= "entrezgene")