Scanpy学习1-sc.tl.score_genes 源代码解析
Scanpy sc.tl.score_genes 源代码解析
现在转到python,想要拥抱人工智能,对于scanpy源码进行解析。20240820
def score_genes(
adata: AnnData,
gene_list: Sequence[str],
ctrl_size: int = 50,
gene_pool: Optional[Sequence[str]] = None,
n_bins: int = 25,
score_name: str = 'score',
random_state: AnyRandom = 0,
copy: bool = False,
use_raw: Optional[bool] = None,
) -> Optional[AnnData]:
"""\
Score a set of genes [Satija15]_.
The score is the average expression of a set of genes subtracted with the
average expression of a reference set of genes. The reference set is
randomly sampled from the `gene_pool` for each binned expression value.
This reproduces the approach in Seurat [Satija15]_ and has been implemented
for Scanpy by Davide Cittaro.
Parameters
----------
adata
The annotated data matrix.
gene_list
The list of gene names used for score calculation.
ctrl_size
Number of reference genes to be sampled from each bin. If `len(gene_list)` is not too
low, you can set `ctrl_size=len(gene_list)`.
gene_pool
Genes for sampling the reference set. Default is all genes.
n_bins
Number of expression level bins for sampling.
score_name
Name of the field to be added in `.obs`.
random_state
The random seed for sampling.
copy
Copy `adata` or modify it inplace.
use_raw
Whether to use `raw` attribute of `adata`. Defaults to `True` if `.raw` is present.
.. versionchanged:: 1.4.5
Default value changed from `False` to `None`.
Returns
-------
Depending on `copy`, returns or updates `adata` with an additional field
`score_name`.
Examples
--------
See this `notebook <https://github.com/scverse/scanpy_usage/tree/master/180209_cell_cycle>`__.
"""
start = logg.info(f'computing score {score_name!r}')
adata = adata.copy() if copy else adata
use_raw = _check_use_raw(adata, use_raw) #检查是否使用原始矩阵
if random_state is not None:
np.random.seed(random_state) #设定随机种子数
gene_list_in_var = []
var_names = adata.raw.var_names if use_raw else adata.var_names
genes_to_ignore = []
for gene in gene_list:
if gene in var_names:
gene_list_in_var.append(gene)
else:
genes_to_ignore.append(gene) #确认genelist中的基因是否在adata对象中,排除
if len(genes_to_ignore) > 0:
logg.warning(f'genes are not in var_names and ignored: {genes_to_ignore}')
gene_list = set(gene_list_in_var[:])
if len(gene_list) == 0:
raise ValueError("No valid genes were passed for scoring.")
if gene_pool is None:
gene_pool = list(var_names) #设定基因池的大小,默认为adata对象检测所有基因
else:
gene_pool = [x for x in gene_pool if x in var_names]
if not gene_pool:
raise ValueError("No valid genes were passed for reference set.")
# Trying here to match the Seurat approach in scoring cells.
# Basically we need to compare genes against random genes in a matched
# interval of expression.
_adata = adata.raw if use_raw else adata
_adata_subset = (
_adata[:, gene_pool] if len(gene_pool) < len(_adata.var_names) else _adata
) #按照上述的分选条件提取sub adata
if issparse(_adata_subset.X): #检测adata.X是否为矩阵
obs_avg = pd.Series(
np.array(_sparse_nanmean(_adata_subset.X, axis=0)).flatten(),
index=gene_pool,
) # average expression of genes
else:
obs_avg = pd.Series(
np.nanmean(_adata_subset.X, axis=0), index=gene_pool
) # average expression of genes pool
#求解基因表达平均值
obs_avg = obs_avg[
np.isfinite(obs_avg)#检测数值是否无穷大或小
] # Sometimes (and I don't know how) missing data may be there, with nansfor
n_items = int(np.round(len(obs_avg) / (n_bins - 1))) #n个基因/24 进而四舍五入
obs_cut = obs_avg.rank(method='min') // n_items #按照表达丰度进行排序
control_genes = set()
# now pick `ctrl_size` genes from every cut
for cut in np.unique(obs_cut.loc[gene_list]):
r_genes = np.array(obs_cut[obs_cut == cut].index)#将gene_list中gene的排序转为array
np.random.shuffle(r_genes)#将基因随机打乱
# uses full r_genes if ctrl_size > len(r_genes)
control_genes.update(set(r_genes[:ctrl_size]))#取前50个gene,取决于len(gene_list)<ctrl_size
# To index, we need a list – indexing implies an order.
control_genes = list(control_genes - gene_list)#排除gene_list
gene_list = list(gene_list)
X_list = _adata[:, gene_list].X
if issparse(X_list):
X_list = np.array(_sparse_nanmean(X_list, axis=1)).flatten()
else:
X_list = np.nanmean(X_list, axis=1, dtype='float64')
X_control = _adata[:, control_genes].X
if issparse(X_control):
X_control = np.array(_sparse_nanmean(X_control, axis=1)).flatten()
else:
X_control = np.nanmean(X_control, axis=1, dtype='float64')
score = X_list - X_control
adata.obs[score_name] = pd.Series(
np.array(score).ravel(), index=adata.obs_names, dtype='float64'
)#输出为adata.obs['names']
logg.info(
' finished',
time=start,
deep=(
'added\n'
f' {score_name!r}, score of gene set (adata.obs).\n'
f' {len(control_genes)} total control genes are used.'
),
)
return adata if copy else None
****
核心点:计算单个细胞中基因集基因的平均表达量减去非基因集基因的平均表达量。
宁在一丝进,不在一丝停。