点击查看代码
#!~/soft/miniconda3/envs/py3/bin/Rscript 需要修改
############## 非模式生物 go kegg 富集分析 ########################
## 1. 参考链接:https://zhuanlan.zhihu.com/p/389005258 https://zhuanlan.zhihu.com/p/196761601
## 2. 加载R包
#加载R包
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(rtracklayer))
suppressPackageStartupMessages(library(rtracklayer))
suppressPackageStartupMessages(library(limma))
suppressPackageStartupMessages(library(edgeR))
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(survival))
suppressPackageStartupMessages(library(survminer))
suppressPackageStartupMessages(library(glmnet))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(GGally))
suppressPackageStartupMessages(library(rms))
suppressPackageStartupMessages(library(survivalROC))
suppressPackageStartupMessages(library(plotROC))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(openxlsx))
suppressPackageStartupMessages(library(clusterProfiler))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(AnnotationForge))
suppressPackageStartupMessages(library(jsonlite))
suppressPackageStartupMessages(library(purrr))
suppressPackageStartupMessages(library(RCurl))
## 脚本记录
options(stringsAsFactors = F)
option_list <- list(
make_option(c("-e", "--egg-emapper.txt"), help="the file produced from egg-gmaper" ),
make_option(c("-g", "--go.term.txt"), help="the file downloded from datasets" ),
make_option(c("-l", "--gene.list"), help="the gene list you want to do go analysis" ),
make_option(c("-j", "--ko00001.json"), help="the json orign from kegg database" ),
)
opt_parser = OptionParser(option_list=option_list);
opt <- parse_args(opt_parser);
egg <- read.csv(opt$e, sep = "\t")
egg[egg==""]<-NA
# 从在线版下载的name可能不一样
colnames(egg)
gene_info <- egg %>%
dplyr::select(GID = query, EGGannot = seed_ortholog, SYMBOL=Preferred_name) %>%
na.omit()
gterms <- egg %>%
dplyr::select(query, GOs) %>% na.omit()
gene_ids <- egg$query
eggnog_lines_with_go <- egg$GOs!= ""
eggnog_annoations_go <- str_split(egg[eggnog_lines_with_go,]$GOs, ",")
gene_to_go <- data.frame(gene = rep(gene_ids[eggnog_lines_with_go],
times = sapply(eggnog_annoations_go, length)),
term = unlist(eggnog_annoations_go))
head(gene_to_go,2)
go_anno <- gene_to_go
names(go_anno) <- c('gene_id','ID')
go_class <-read.delim(opt$g, header=FALSE, stringsAsFactors =FALSE)
names(go_class) <- c('ID','Description','Ontology')
go_anno <-merge(go_anno, go_class, by = 'ID', all.x = TRUE)
head(go_anno,2)
go_anno = go_anno[,c(2,1,3,4)]
go_anno2 = go_anno %>% na.omit()
write.table(go_anno2,file = 'out.emapper.go.txt',sep = '\t',quote = F,col.names = F,row.names = F)
#测试go
gene_list = opt$l
go_rich <- enricher(gene = gene_list,
TERM2GENE = go_anno[c('ID','gene_id')],
TERM2NAME = go_anno[c('ID','Description')],
pvalueCutoff = 0.05,
pAdjustMethod = 'BH',
qvalueCutoff = 0.2,
maxGSSize = 200)
barplot(go_rich,drop=T)
# kegg
KEGG_info='opt$j' #from https://www.genome.jp/kegg-bin/get_htext?ko00001
gene2ko <- egg %>% dplyr::select(GID = query, Ko = KEGG_ko) %>% na.omit()
if(F){
# 需要下载 json文件(这是是经常更新的)
update_kegg <- function(json = "ko00001.json") {
pathway2name <- tibble(Pathway = character(), Name = character())
ko2pathway <- tibble(Ko = character(), Pathway = character())
kegg <- fromJSON(json)
for (a in seq_along(kegg[["children"]][["children"]])) {
A <- kegg[["children"]][["name"]][[a]]
for (b in seq_along(kegg[["children"]][["children"]][[a]][["children"]])) {
B <- kegg[["children"]][["children"]][[a]][["name"]][[b]]
for (c in seq_along(kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]])) {
pathway_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["name"]][[c]]
pathway_id <- str_match(pathway_info, "ko[0-9]{5}")[1]
pathway_name <- str_replace(pathway_info, " \\[PATH:ko[0-9]{5}\\]", "") %>% str_replace("[0-9]{5} ", "")
pathway2name <- rbind(pathway2name, tibble(Pathway = pathway_id, Name = pathway_name))
kos_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]][[c]][["name"]]
kos <- str_match(kos_info, "K[0-9]*")[,1]
ko2pathway <- rbind(ko2pathway, tibble(Ko = kos, Pathway = rep(pathway_id, length(kos))))
}
}
}
save(pathway2name, ko2pathway, file = "kegg_info.RData")
}
update_kegg(json = "ko00001.json")
}
colnames(pathway2name)
#########组建通路(Pathway)与蛋白名称(query)信息#########
ko2gene <- tibble(Ko=character(),GID=character())#由于此时存在一蛋白对应多个KO,因此将其拆成一对一的多列储存进新的dataframe中
for (query in gene2ko$GID){
ko_list <- strsplit(gene2ko$Ko[which(gene2ko[,1]==query)],split = ',')
for (ko in ko_list){
if (length(which(ko2gene[,1]==ko))==0){
tmp <- data.frame(Ko=ko,GID=query)
ko2gene <- rbind(ko2gene,tmp)
}
else{
old_query <- ko2gene$GID[which(ko2gene[,1]==ko)]
ko2gene$GID[which(ko2gene[,1]==ko)] <- paste(old_query,query,sep = ',')
}
}
}
pathway2gene <- tibble(Pathway = character(), GID = character())
for (ko in ko2pathway$Ko){
pathway_list <- ko2pathway$Pathway[which(ko2pathway[,1]==ko)]
for (pathway in pathway_list){
if (paste('ko:',ko,sep='') %in% ko2gene$Ko){
ko <- paste('ko:',ko,sep='')
if (length(which(pathway2gene[,1]==pathway))==0 ){
ko2gene$GID[which(ko2gene[,1]==ko)]
tmp <- data.frame(pathway=pathway,GID=ko2gene$GID[which(ko2gene[,1]==ko)])
pathway2gene <- rbind(pathway2gene,tmp)
}
else{
old_query <- pathway2gene$GID[which(pathway2gene[,1]==pathway)]
query <- ko2gene$GID[which(ko2gene[,1]==ko)]
pathway2gene$GID[which(pathway2gene[,1]==pathway)] <- paste(old_query,query,sep=',')
}
}
}
}
library(openxlsx)#读取.xlsx文件
library(ggplot2)#柱状图和点状图
library(stringr)#基因ID转换
library(enrichplot)#GO,KEGG,GSEA
library(clusterProfiler)#GO,KEGG,GSEA
library(GOplot)#弦图,弦表图,系统聚类图
library(DOSE)#
library(ggnewscale)#
library(circlize)#绘制富集分析圈图
library(ComplexHeatmap)#绘制图例
ko2pathway$Ko = paste0("ko:",ko2pathway$Ko)
pathway2gene <- gene2ko %>% left_join(ko2pathway, by = "Ko") %>%
dplyr::select(pathway=Pathway,gene=GID) %>%
na.omit()
KEGG <- enricher(gene_list,
TERM2GENE = pathway2gene,
TERM2NAME = pathway2name,
pvalueCutoff = 0.5,
# qvalueCutoff = 0.05,
# pAdjustMethod = "BH",
minGSSize = 1)
barplot(KEGG)