非参物种富集分析

点击查看代码
#!~/soft/miniconda3/envs/py3/bin/Rscript  需要修改
##############  非模式生物 go kegg 富集分析   ########################
 
 
## 1. 参考链接：https://zhuanlan.zhihu.com/p/389005258   https://zhuanlan.zhihu.com/p/196761601

## 2. 加载R包
#加载R包
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(rtracklayer))
suppressPackageStartupMessages(library(rtracklayer))
suppressPackageStartupMessages(library(limma))
suppressPackageStartupMessages(library(edgeR))
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(survival))
suppressPackageStartupMessages(library(survminer))
suppressPackageStartupMessages(library(glmnet))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(GGally))
suppressPackageStartupMessages(library(rms))
suppressPackageStartupMessages(library(survivalROC))
suppressPackageStartupMessages(library(plotROC))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(openxlsx))
suppressPackageStartupMessages(library(clusterProfiler))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(AnnotationForge))
suppressPackageStartupMessages(library(jsonlite))
suppressPackageStartupMessages(library(purrr))
suppressPackageStartupMessages(library(RCurl))



## 脚本记录
options(stringsAsFactors = F)
option_list <- list(
  make_option(c("-e", "--egg-emapper.txt"), help="the file produced from egg-gmaper" ),
  make_option(c("-g", "--go.term.txt"), help="the file downloded from datasets" ),
  make_option(c("-l", "--gene.list"), help="the gene list you want to do go analysis" ),
  make_option(c("-j", "--ko00001.json"), help="the json orign from kegg database" ),

)
opt_parser = OptionParser(option_list=option_list);
opt <- parse_args(opt_parser);


egg  <-  read.csv(opt$e, sep = "\t")
egg[egg==""]<-NA

# 从在线版下载的name可能不一样
colnames(egg)
gene_info <- egg %>% 
    dplyr::select(GID = query, EGGannot = seed_ortholog, SYMBOL=Preferred_name) %>% 
    na.omit()
    
gterms <- egg %>%
  dplyr::select(query, GOs) %>% na.omit()
  
gene_ids <- egg$query
eggnog_lines_with_go <- egg$GOs!= ""
eggnog_annoations_go <- str_split(egg[eggnog_lines_with_go,]$GOs, ",")
gene_to_go <- data.frame(gene = rep(gene_ids[eggnog_lines_with_go],
                                   times = sapply(eggnog_annoations_go, length)),
                         term = unlist(eggnog_annoations_go))
head(gene_to_go,2)

go_anno <- gene_to_go
names(go_anno) <- c('gene_id','ID') 
go_class <-read.delim(opt$g, header=FALSE, stringsAsFactors =FALSE) 
names(go_class) <- c('ID','Description','Ontology')  
go_anno <-merge(go_anno, go_class, by = 'ID', all.x = TRUE)
head(go_anno,2)

go_anno = go_anno[,c(2,1,3,4)]
go_anno2 = go_anno %>% na.omit()
write.table(go_anno2,file = 'out.emapper.go.txt',sep = '\t',quote = F,col.names = F,row.names = F)

#测试go 

gene_list = opt$l
go_rich <- enricher(gene = gene_list,
                  TERM2GENE = go_anno[c('ID','gene_id')],
                  TERM2NAME = go_anno[c('ID','Description')],
                  pvalueCutoff = 0.05,
                  pAdjustMethod = 'BH',
                  qvalueCutoff = 0.2,
                  maxGSSize = 200) 
barplot(go_rich,drop=T) 

# kegg

KEGG_info='opt$j'  #from https://www.genome.jp/kegg-bin/get_htext?ko00001
gene2ko <- egg %>% dplyr::select(GID = query, Ko = KEGG_ko) %>%  na.omit()
if(F){
  # 需要下载 json文件(这是是经常更新的)

  update_kegg <- function(json = "ko00001.json") {
    pathway2name <- tibble(Pathway = character(), Name = character())
    ko2pathway <- tibble(Ko = character(), Pathway = character())
    kegg <- fromJSON(json)
    for (a in seq_along(kegg[["children"]][["children"]])) {
      A <- kegg[["children"]][["name"]][[a]]
      for (b in seq_along(kegg[["children"]][["children"]][[a]][["children"]])) {
        B <- kegg[["children"]][["children"]][[a]][["name"]][[b]] 
        for (c in seq_along(kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]])) {
          pathway_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["name"]][[c]]
          pathway_id <- str_match(pathway_info, "ko[0-9]{5}")[1]
          pathway_name <- str_replace(pathway_info, " \\[PATH:ko[0-9]{5}\\]", "") %>% str_replace("[0-9]{5} ", "")
          pathway2name <- rbind(pathway2name, tibble(Pathway = pathway_id, Name = pathway_name))
          kos_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]][[c]][["name"]]
          kos <- str_match(kos_info, "K[0-9]*")[,1]
          ko2pathway <- rbind(ko2pathway, tibble(Ko = kos, Pathway = rep(pathway_id, length(kos))))
        }
      }
    }
    save(pathway2name, ko2pathway, file = "kegg_info.RData")
  }
  update_kegg(json = "ko00001.json")
}
                  
colnames(pathway2name)

#########组建通路(Pathway)与蛋白名称(query)信息#########
ko2gene <- tibble(Ko=character(),GID=character())#由于此时存在一蛋白对应多个KO,因此将其拆成一对一的多列储存进新的dataframe中
for (query in gene2ko$GID){
  ko_list <- strsplit(gene2ko$Ko[which(gene2ko[,1]==query)],split = ',')
  for (ko in ko_list){
    if (length(which(ko2gene[,1]==ko))==0){
      tmp <- data.frame(Ko=ko,GID=query)
      ko2gene <- rbind(ko2gene,tmp)
    }
    else{
      old_query <- ko2gene$GID[which(ko2gene[,1]==ko)]
      ko2gene$GID[which(ko2gene[,1]==ko)] <- paste(old_query,query,sep = ',')
    }
  }
}

pathway2gene <- tibble(Pathway = character(), GID = character())
for (ko in ko2pathway$Ko){
  pathway_list <- ko2pathway$Pathway[which(ko2pathway[,1]==ko)]
  for (pathway in pathway_list){
    if (paste('ko:',ko,sep='') %in% ko2gene$Ko){
      ko <- paste('ko:',ko,sep='')
      if (length(which(pathway2gene[,1]==pathway))==0 ){
        ko2gene$GID[which(ko2gene[,1]==ko)]
        tmp <- data.frame(pathway=pathway,GID=ko2gene$GID[which(ko2gene[,1]==ko)])
        pathway2gene <- rbind(pathway2gene,tmp)
      }
      else{
        old_query <- pathway2gene$GID[which(pathway2gene[,1]==pathway)]
        query <- ko2gene$GID[which(ko2gene[,1]==ko)]
        pathway2gene$GID[which(pathway2gene[,1]==pathway)] <- paste(old_query,query,sep=',')
      }
    }
  }
}

library(openxlsx)#读取.xlsx文件
library(ggplot2)#柱状图和点状图
library(stringr)#基因ID转换
library(enrichplot)#GO,KEGG,GSEA
library(clusterProfiler)#GO,KEGG,GSEA
library(GOplot)#弦图，弦表图，系统聚类图
library(DOSE)#
library(ggnewscale)#
library(circlize)#绘制富集分析圈图
library(ComplexHeatmap)#绘制图例

ko2pathway$Ko = paste0("ko:",ko2pathway$Ko)

pathway2gene <- gene2ko %>% left_join(ko2pathway, by = "Ko") %>%
  dplyr::select(pathway=Pathway,gene=GID) %>%
  na.omit()
  
KEGG <- enricher(gene_list,
                TERM2GENE = pathway2gene, 
                TERM2NAME = pathway2name,
                pvalueCutoff = 0.5, 
#                qvalueCutoff = 0.05,
#                pAdjustMethod = "BH",
                minGSSize = 1)
barplot(KEGG)
posted on 2022-08-25 13:43 Bonjour_! 阅读(229) 评论(0) 收藏举报
刷新页面返回顶部
非参物种富集分析

公告