1. 脚本主要内容
* 批量读取下机数据
* 计算双细胞比例
* BBKNN去除批次效应
* 去除细胞周期的影响
* 转换为seurat对象
2. 脚本
点击查看代码
import scanpy as sc
import anndata as an
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import scrublet as scr
import scipy.io
import numpy as np
import os
import pandas as pd
from math import sqrt
import bbknn
import random
from matplotlib.pyplot import rc_context
import datetime
import DIY
from DIY import get_tsne
sc.logging.print_versions()
sc.set_figure_params(facecolor="white", figsize=(8, 8))
sc.settings.verbosity = 3
random.seed(234)
figure_out = './figures'
results_file = 'write/sce.h5ad'
table_out = './write'
datadir='./CellrangerOut/'
# df=pd.read_csv('sample_info.txt',header=0,index_col=0,sep='\t')
sample_type={}
samplelist = []
def InputData(SampleInfo):
metadata = pd.read_csv(SampleInfo,header=0,index_col=0,sep='\t')
filenames = metadata.index
adatas = [sc.read_10x_mtx(datadir + filename+ '/filtered_feature_bc_matrix',cache=True) for filename in filenames] # use list to store separate adata
for i in range(len(adatas)):
adatas[i].obs['sample'] = metadata.index[i]
for col in metadata.columns:
adatas[i].obs[col] = metadata[col][i]
adata = adatas[0].concatenate(adatas[1:],batch_categories = metadata.index)
adata.var_names_make_unique()
sc.pl.highest_expr_genes(adata, n_top=20, save = True )
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
# ribo_gene_df = pd.read_csv(r'KEGG_RIBOSOME.v2023.1.Hs.csv', sep=',')
# ribo_gene_df = ribo_gene_df[ribo_gene_df.STANDARD_NAME=='GENE_SYMBOLS']
# ribo_gene_names = ribo_gene_df.loc[:, 'KEGG_RIBOSOME'].values[0].split(',')
# adata.var['ribo'] = adata.var.index.isin(ribo_gene_names)
adata.var['mito'] = adata.var.index.str.startswith(('MT-', 'mt-', 'Mt-'))
sc.pp.calculate_qc_metrics(adata,
expr_type='counts', # default value
var_type='genes', # default value
qc_vars=['mito'],
percent_top=None,
log1p=False,
inplace=True)
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mito'],
jitter=0.4, multi_panel=True,save=True)
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
plt.subplots_adjust(wspace=.5)
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mito',show=False,ax=ax1)
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts',show=False,ax=ax2)
fig.savefig(os.path.join(figure_out, 'scatter.pdf'),
dpi=300, bbox_inches='tight')
return adata
# 计算doublet
sim_doublet_ratio = 2
def Compute_Doublet(adata,rate):
counts_matrix = adata.to_df()
n_cells = adata.shape[0]
scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=rate,
n_neighbors = round(0.5 * sqrt(n_cells)),
sim_doublet_ratio = sim_doublet_ratio)
### annoy-1.15.1
doublet_scores, predicted_doublets = scrub.scrub_doublets(min_counts=2,
min_cells=3,
min_gene_variability_pctl=85,
n_prin_comps=30)
scrub.plot_histogram()
plt.savefig(os.path.join(figure_out,"histogram.pdf"),dpi=300, bbox_inches='tight')
plt.show()
scrub.set_embedding('TSNE', scr.get_tsne(scrub.manifold_obs_, 0.5,10))
scrub.plot_embedding('TSNE', order_points=True)
plt.savefig(os.path.join(figure_out,"predicted_doublets.pdf"))
plt.show()
out_df = pd.DataFrame()
out_df['barcodes'] = counts_matrix.index
out_df['doublet_scores'] = doublet_scores
out_df['predicted_doublets'] = predicted_doublets
#out_df.to_csv('doublet.txt', index=False,header=True)
out_df.to_csv(os.path.join(table_out,'doublet.txt'), index=False,header=True)
out_df.head()
return out_df,doublet_scores,predicted_doublets
def Filter_cells(adata,doublet_scores,predicted_doublets,mito):
adata.obs["doublet_scores"] = doublet_scores
adata.obs["predicted_doublets"] = predicted_doublets
#~ 可以作为取反的功能
adata = adata[~adata.obs.predicted_doublets, :]
upper_limit = np.quantile(adata.obs.n_genes_by_counts, 0.98)
adata = adata[adata.obs.n_genes_by_counts < upper_limit, :]
adata = adata[adata.obs.pct_counts_mito < mito, :]
return adata,upper_limit
def CellCycleScoring(adata,cycle,species):
data = pd.read_csv(cycle,sep=',',header='infer',usecols=[1,2,3,4])
if species == 'hsa':
s_genes = list(data.loc[ : ,'hsa_s.genes'])
g2m_genes = list(data.loc[ : ,'hsa_g2m.genes'])
else:
s_genes = list(data.loc[ : ,'mmu_s.genes'])
g2m_genes = list(data.loc[ : ,'mmu_g2m.genes'])
cell_cycle_genes = s_genes + g2m_genes
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.scale(adata,zero_center=False)
sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)
cell_cycle_genes = [x for x in cell_cycle_genes if x in adata.var_names]
adata_cc_genes = adata[:, cell_cycle_genes]
sc.tl.pca(adata_cc_genes)
sc.pl.pca_scatter(adata_cc_genes, color='phase',save='.Cycle.pdf')
adata.obs['Difference'] = adata.obs['S_score'] - adata.obs['G2M_score']
adata.write('./write/sce_raw.h5ad')
return adata
def Run_Normalization(adata,n_neighbors,n_pcs,resolution):
adata.layers['count'] = adata.X.copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata,
n_top_genes=3000,
flavor='seurat',
subset=False,
batch_key='sample')
sc.pl.highly_variable_genes(adata,save=True)
adata = adata[:, adata.var.highly_variable]
sc.pp.regress_out(adata, ['Difference'])
#scale数据
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata)
sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs)
sc.tl.umap(adata)
sc.tl.leiden(adata, resolution=resolution, key_added='leiden')
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
plt.subplots_adjust(wspace=.5)
sc.pl.umap(adata, color=['sample'], frameon=False, ax=ax1, show=False)
sc.pl.umap(adata, color=['leiden'], frameon=False, legend_loc='on data', ax=ax2, show=False)
plt.savefig(os.path.join(figure_out,"umap.pdf"))
plt.show()
return adata
def Run_BBKNN(adata,n_neighbors,n_pcs,resolution):
sc.tl.pca(adata)
sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs)
sc.external.pp.bbknn(adata, batch_key='sample')
sc.tl.umap(adata)
sc.tl.leiden(adata, resolution=resolution, key_added='leiden')
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
plt.subplots_adjust(wspace=.5)
sc.pl.umap(adata, color=['sample'], frameon=False, ax=ax1, show=False)
sc.pl.umap(adata, color=['leiden'], frameon=False, legend_loc='on data', ax=ax2, show=False)
plt.savefig(os.path.join(figure_out,"umap-BBKNN.pdf"))
plt.show()
with rc_context({'figure.figsize': (5, 5)}):
sc.pl.umap(adata, color='leiden', add_outline=True, legend_loc='on data',
legend_fontsize=12, legend_fontoutline=2,frameon=False,
title='clustering of cells', palette='Set1',save ='-BBKNN-re.pdf')
adata.write(results_file)
return adata
def Show_Markers(adata):
ax = sc.pl.correlation_matrix(adata, 'leiden', figsize=(5,3.5),save= True)
adata_raw = sc.read_h5ad('./write/sce_raw.h5ad')
adata_raw.obs = adata.obs
adata_raw.obsm['X_umap'] = adata.obsm['X_umap']
adata_raw.obsm['seurat_clusters'] = adata.obsm['leiden']
adata_raw.obsm['nCount_RNA'] = adata.obsm['total_counts']
adata_raw.obsm['nFeature_RNA'] = adata.obsm['n_genes']
adata_raw.obsm['percent.mt'] = adata.obsm['pct_counts_mito']
adata_raw.write('./write/sce_raw.h5ad')
adata = adata_raw
adata.layers['count'] = adata.X.copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.settings.verbosity=2
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')
markers = sc.get.rank_genes_groups_df(adata, group=None, pval_cutoff=.05, log2fc_min=.25)
markers.to_csv(os.path.join(table_out,'all_markers.csv'), index=False,header=True)
top5 = pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(5)
top5.to_csv(os.path.join(table_out,'top5_markers.csv'),index=False,header =True)
fig=plt.figure(figsize=(6,24),dpi=100)
for i in top5.columns:
plt.subplot(2, 7, int(i)+1) #做一个3*3的图 range(9)从0开始,因需要从1开始,所以i+1
sc.pl.rank_genes_groups_violin(adata, groups=str(i), n_genes=5,show=False)
plt.tight_layout()
plt.axis = 'off' #关闭坐标 让图更美观
plt.savefig(os.path.join(figure_out,"top5-markers.png"))
plt.show()
adata.layers['scaled'] = sc.pp.scale(adata, copy=True).X
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')
sc.pl.rank_genes_groups_matrixplot(adata, n_genes=3, use_raw=False, vmin=-3, vmax=3, cmap='bwr', layer='scaled',save =True)
sc.pl.rank_genes_groups_stacked_violin(adata, n_genes=3, cmap='bwr',save = True)
sc.pl.rank_genes_groups_dotplot(adata, n_genes=3, values_to_plot='logfoldchanges', min_logfoldchange=3, vmax=7, vmin=-7, cmap='bwr',save = True)
sc.pl.rank_genes_groups_heatmap(adata, n_genes=10, use_raw=False, swap_axes=True, show_gene_labels=False,
vmin=-3, vmax=3, cmap='bwr',save =True)
os.system('/PERSONALBIO/work/singlecell/s01/.conda/envs/py4/bin/Rscript ./sceasy.R')
return adata
if __name__ == '__main__':
start = datetime.datetime.now()
SampleInfo = './sample_info.txt'
cycle = './cycle.gene.csv'
species = 'hsa' ## or mmu
adata = InputData(SampleInfo)
out_df,doublet_scores,predicted_doublets = Compute_Doublet(adata,0.06) # 双细胞比率 默认0.06
adata,upper_limit = Filter_cells(adata,doublet_scores,predicted_doublets,10) # 分辨率
adata = CellCycleScoring(adata,cycle,species)
adata = Run_Normalization(adata,50,50,0.99) # neibor pc resolution
adata = Run_BBKNN(adata,50,50,0.99)
Show_Markers(adata)
end = datetime.datetime.now()
print("程序运行时间:"+str((end-start).seconds/3600)+"h")
3. 主要结果展示
-
去除批次作用前
-
去除批次作用后
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· winform 绘制太阳,地球,月球 运作规律
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
· 上周热点回顾(3.3-3.9)
· AI 智能体引爆开源社区「GitHub 热点速览」
· 写一个简单的SQL生成工具