差异表达基因和GSVA通路富集

无论是单细胞RNA测序还是批量RNA测序，我们都需要进行差异基因表达分析和GSVA途径富集分析。以下代码摘自单节文章

代码说明

加载所需的库和函数。parse_h5ad用于从Scanpy的工作流中读取andata，如我们所见，返回了表达式数据和元数据。

## load data
.libPaths("/data2/csj/tools/Rlib")
library(Seurat)
library(dplyr)
library(monocle)
options(stringsAsFactors=FALSE)
library(reticulate)

parse_h5ad <- function(adata){
    require(reticulate)
    ad <- import("anndata", convert = FALSE)
    ada <- ad$read_h5ad(adata)
    meta <- py_to_r(ada$obs)
    if(class(ada$raw$X)[1] == "scipy.sparse.csr.csr_matrix"){
        exp <- t(py_to_r(ada$raw$X$toarray()))
    }
    else{
        exp <- t(py_to_r(ada$raw$X))
    }
    rownames(exp) <- rownames(py_to_r(ada$raw$var))
    colnames(exp) <- rownames(meta)
    return(
        list(
        metadata = meta,
        expression = exp
        )
    )
}

作者计算了对数倍数变化，p值和调整后的p值。“ pseudocount.use”用于避免零日志。使用Wilcox.test进行测试，然后通过BH法调整p值。

############## different expression
DEGene <- function(h5ad,Cluster1,Cluster2){
    ## h5ad: the result of function parse_h5ad
    ## Cluster1 : a vector of Cluster
    ## Cluster2 : a vector of Cluster

    pseudocount.use =1

    cell_name_1 <- rownames(h5ad$metadata[h5ad$metadata$MajorCluster %in% Cluster1,])
    cell_name_2 <- rownames(h5ad$metadata[h5ad$metadata$MajorCluster %in% Cluster2,])

    Expression_1 <- h5ad$expression[,cell_name_1]
    Expression_2 <- h5ad$expression[,cell_name_2]

    ## log FC
    mean_c1 <- as.data.frame(rowMeans(as.matrix(Expression_1)))
    colnames(mean_c1) <- "mean_c1"
    mean_c2 <- as.data.frame(rowMeans(as.matrix(Expression_2)))
    colnames(mean_c2) <- "mean_c2"
    log2fc <- data.frame(log2fc = log2(mean_c1$mean_c1 + pseudocount.use) - log2(mean_c2$mean_c2 + pseudocount.use))
    rownames(log2fc) <- rownames(mean_c1)
    log2fc$gene <- rownames(log2fc)

    ## wilcox test
    group.info <- data.frame(row.names = c(cell_name_1, cell_name_2))
    group.info[cell_name_1, "group"] <- "Group1"
    group.info[cell_name_2, "group"] <- "Group2"
    group.info[, "group"] <- factor(x = group.info[, "group"])
    data.use <- h5ad$expression[, rownames(x = group.info), drop = FALSE]

    p_val <- sapply(
        X = 1:nrow(x = data.use),
        FUN = function(x) {
        return(wilcox.test(data.use[x, ] ~ group.info[, "group"])$p.value)
    })

    ## BH correction
    adj_p_val <- p.adjust(p_val, method="BH")

    ## DE table
    result <- data.frame(gene=log2fc$gene, log2FC=log2fc$log2fc, Pvalue=p_val, Adj_pval=adj_p_val)
    return(result)
}

clusterProfiler软件包用于运行GSVA分析。

##################################################### GSVA
runGSVA <- function(h5ad,Cluster1,Cluster2,kcdf="Gaussian",AdjPvalueCutoff=0.05){
    ## h5ad: the result of function parse_h5ad
    ## Cluster1 : Cluster1
    ## Cluster2 : Cluster2
    ## kcdf: kcdf="Gaussian" for continuous and 'Poisson for integer counts'

    require(GSVA)
    require(GSEABase)
    require(GSVAdata)
    require(clusterProfiler)
    data(c2BroadSets)
    library(limma)

    expression <- h5ad$expression[,rownames(h5ad$metadata[h5ad$metadata$MajorCluster %in% c(Cluster1,Cluster2),])]

    ## change gene symbol to geneid
    gene_entrezid <- bitr(rownames(expression), fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db")
    expression_filt <- expression[gene_entrezid$SYMBOL,]
    rownames(expression_filt) <- gene_entrezid$ENTREZID
    expression_filt <- as.matrix(expression_filt)

    res_gsva <- gsva(expression_filt, c2BroadSets, parallel.sz=10,kcdf=kcdf) 

    annotation_col = data.frame(CellType = factor(h5ad$metadata[colnames(res_gsva),]$MajorCluster))

    rownames(annotation_col) = colnames(res_gsva)

    ## using limma to conduct DE analysis
    f <- factor(annotation_col$CellType)
    design <- model.matrix(~0+f)
    colnames(design) <- c("C1","C2")
    rownames(design) <- colnames(res_gsva)

    fit <- lmFit(res_gsva, design)
    cont.matrix=makeContrasts('C1-C2',levels = design)
    fit2=contrasts.fit(fit,cont.matrix)
    fit2=eBayes(fit2)
    gs <- topTable(fit2,adjust='BH', number=Inf, p.value=AdjPvalueCutoff)
    gs$cluster <- ifelse(gs$t > 0 , "C1", "C2")
    return(gs)
}

条形图显示了通过基因组变异分析
（GSVA）对每个细胞评分的肺癌中富含C1QC +巨噬细胞和SPP1 +巨噬细胞的不同途径。t值通过limma回归计算

结果解释

尽管没有代码的绘图功能，但很容易在条形图功能中运行它。
图显示，作为验证的结果，SPP1 + TAMs显示出参与血管生成的基因的优先表达。

原始出处

http://www.thecodesearch.com/2021/03/16/differentially-expressed-genes-and-gsva-pathway-enrichment/

ywliao

介绍

代码说明

结果解释

公告