2022-11-20 12:09:06 Sunday
Purpose
I previously wrote a post on DNA germline variant confidence assessment that described a scoring scheme which rates the confidence of a variant call from several evidence items, but it did not explain where those evidence items come from. This post fills that gap by describing how the evidence items are collected and showing some usable code.
Approach
All of the required evidence items can be extracted by parsing the BAM file, so the collection workflow is:
- Based on the variant position, fetch the read records around the variant from the BAM file.
- Parse each read record and collect the required evidence items (a minimal fetch sketch follows this list).
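As a minimal sketch of this workflow (assuming pysam is installed; the BAM file name and the variant coordinates here are placeholders, not part of the original post):

import pysam

# open an indexed, coordinate-sorted BAM and fetch the reads overlapping 1:386546-386547
bam = pysam.AlignmentFile("sample.bam", "rb")
for read in bam.fetch(region="1:386546-386547"):
    # skip unmapped or unpaired reads, as the full implementation below does
    if read.is_unmapped or not read.is_paired:
        continue
    # each AlignedSegment exposes the fields the evidence items are built from:
    # mapping quality, XA tag, strand, aligned positions and the query sequence
    print(read.query_name, read.mapping_quality, read.is_reverse, read.has_tag("XA"))
bam.close()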
Evidence item definitions
DP: number of reads spanning the variant region (i.e., the sequencing depth at the variant position).
AD: number of reads in which the variant is detected (i.e., the variant depth).
Is_reads: number of reads that read through the variant (in the code, counted as read pairs in which both mates carry the variant).
Uniq: the variant lies in a uniquely mapped region; if at least 80% of the covering reads carry no XA tag, Uniq is True.
complex: the variant lies in a complex sequence region (i.e., the sequence flanking the variant is not a simple repeat).
high_MQ_reads: the mean mapping quality (MQ) of the variant-carrying reads is not lower than 50% of the mean MQ of the other reads, and the mean MQ of the variant-carrying reads is greater than 10 (these threshold rules are sketched right after the definitions).
Double_direction: the variant is observed on reads from both strands.
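To make the thresholds above concrete, here is a small standalone sketch of the decision rules for Uniq, high_MQ and the allele fraction AF, mirroring the implementation below; the function and variable names are illustrative only:

def uniq(xa_reads_num: int, dp: int) -> bool:
    # True when at most 20% of the covering reads carry an XA (alternative alignment) tag
    return dp > 0 and xa_reads_num / dp <= 0.2

def high_mq(var_mq: list, other_mq: list) -> bool:
    # variant-carrying reads must not map markedly worse than the remaining reads
    if not var_mq:
        return False
    var_mean = sum(var_mq) / len(var_mq)
    if not other_mq or sum(other_mq) == 0:
        # no usable comparison group: fall back to an absolute MQ cutoff
        return var_mean > 10
    return var_mean / (sum(other_mq) / len(other_mq)) >= 0.5

def af(ad: int, dp: int) -> float:
    # allele fraction: variant depth over total depth
    return ad / dp if dp else 1.0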
Implementation
import sys
import attr
import pysam
import logging
import argparse
# scoring functions from the previous post on variant confidence assessment
from call_confidence import call_score, cal_level
logging.basicConfig(
level = logging.INFO,
format = "[%(asctime)s - %(levelname)-4s - %(module)s : line %(lineno)s] %(message)s",
datefmt = "%Y-%m-%d %H:%M:%S"
)
@attr.s
class Var_Info:
    var = attr.ib(type=str, default=attr.Factory(str))  # variant in actual-position notation, not VCF-style notation
Bam= attr.ib(type=str, default=attr.Factory(str))
Fasta = attr.ib(type=str, default=attr.Factory(str))
    # when the repeat unit occurs at least this many times, the region is treated as a simple repeat
    dup_num = attr.ib(type=int, default=attr.Factory(int))
def __attrs_post_init__(self):
if not self.dup_num:
self.dup_num = 3
self.fasta = pysam.FastaFile(self.Fasta)
        try:
            self.Bam_reader = pysam.AlignmentFile(self.Bam, "rb")
        except (OSError, ValueError) as Err:
            logging.error(f'Failed to open the BAM file [{self.Bam}]: {Err}')
            sys.exit(1)
self.var_Chr, self.var_Start, self.var_End, \
self.var_Ref, self.var_Alt = self.var.split(' ')
self.var_Start: int = int(self.var_Start)
self.var_End: int = int(self.var_End)
self.__set_Type_Len()
self.DP: int = 0
self.AD: int = 0
self.Is_reads: int = 0
self.XA_reads_num: int = 0
self.var_MQ: list = []
self.other_MQ: list = []
self.var_reads_name: list = []
self.complex: bool = True
self.reads_direction: set = set()
def __set_Type_Len(self) -> None:
""" 分析设置变异类型和长度 """
if self.var_Ref == '-':
self.var_Type = 'INS'
self.var_Len = len(self.var_Alt)
elif self.var_Alt == '-':
self.var_Type = 'DEL'
self.var_Len = len(self.var_Ref)
elif len(self.var_Alt) == len(self.var_Ref) == 1:
self.var_Type: str = 'SNV'
self.var_Len: int = 1
else:
self.var_Type = 'DELINS'
self.var_Len = len(self.var_Alt) if len(self.var_Alt) > len(self.var_Ref) else len(self.var_Ref)
if self.var_Len > 50:
            logging.error(f'Variant is longer than 50 bp and not supported by this program [{self.var}]')
sys.exit(1)
@property
def Uniq(self) -> bool:
""" 判断变异是否处于 reads 的唯一比对位置 """
if self.XA_reads_num/self.DP > 0.2:
return False
return True
@property
def Double_direction(self) -> bool:
""" 判断变异是否处于 reads 的唯一比对位置 """
if len(self.reads_direction) == 1:
return False
return True
@property
def high_MQ(self) -> bool:
""" 判断变异是否处于 reads 的唯一比对位置 """
if len(self.other_MQ) == 0 and len(self.var_MQ) != 0:
if sum(self.var_MQ)/len(self.var_MQ) > 10:
return True
else:
return False
if sum(self.other_MQ) == 0:
if sum(self.var_MQ)/len(self.var_MQ) > 10:
return True
else:
return False
if (sum(self.var_MQ)/len(self.var_MQ)) / (sum(self.other_MQ)/len(self.other_MQ)) < 0.5:
return False
return True
@property
def AF(self) -> float:
if self.DP == 0:
return 1
return self.AD / self.DP
@staticmethod
def min_dup(base: str):
""" 找到 del、ins 的最小重复区域 """
for i in range(len(base)):
if base.count(base[0:i+1]) == len(base) / (i + 1):
return base[0:i+1]
else:
return base
def var_inside(self, Start, End) -> bool:
""" 变异是否在某个区间内 """
# 变异在比对位置内
if self.var_Start >= Start and self.var_End <= End:
return True
return False
def var_near(self, Start, End) -> bool:
""" 变异是否在某个区间内 """
# 变异在比对位置内
if self.var_Start == End or self.var_End == Start:
return True
return False
def add_SNV(self, record):
""" SNV 变异记录处理 """
# 点突变在 reads 上的起始索引位置
for (i, pos) in record.get_aligned_pairs():
if pos == self.var_Start - 1:
snv_index: int = i
break
else:
            logging.error(f'SNV position [{self.var_Start}] not found in read [{record.query_name}]')
sys.exit(1)
        # the variant base is present on this read
        if snv_index is not None and record.query_sequence[snv_index] == self.var_Alt:
self.AD += 1
            # simple-repeat region check
            if self.complex:
                # possible repeat region on the reference downstream of the variant
ref_seq: str = self.fasta.fetch(region=f'{self.var_Chr}:{self.var_Start+1}-{self.var_End + (self.var_Len * self.dup_num)}')
if ref_seq.count(self.var_Alt) == self.dup_num:
self.complex = False
# var MQ
self.var_MQ.append(record.mapping_quality)
            # count read pairs that read through the variant
if record.query_name in self.var_reads_name:
self.Is_reads += 1
else:
self.var_reads_name.append(record.query_name)
else:
self.other_MQ.append(record.mapping_quality)
def add_INS(self, record):
""" INS 变异记录处理 """
# 简单区域判断
if self.complex:
# 变异在参考基因组上的位置之后可能的重复区域
self.dup_base = self.min_dup(self.var_Alt)
ref_seq: str = self.fasta.fetch(
region=f'{self.var_Chr}:{self.var_Start + 1}-{self.var_End + (len(self.dup_base) * self.dup_num)}'
)
if ref_seq.count(self.dup_base) == self.dup_num:
self.complex = False
        # index of the INS anchor base within the read; the inserted sequence starts right after this base
for (i, pos) in record.get_aligned_pairs():
if pos == self.var_Start - 1:
ins_index: int = i
break
else:
            logging.error(f'INS position [{self.var_Start}] not found in read [{record.query_name}]')
sys.exit(1)
        # check whether the inserted sequence is present on this read
var_in_reads: bool = False
if self.complex and record.query_sequence[ins_index+1:ins_index+1+self.var_Len] == self.var_Alt:
var_in_reads = True
elif not self.complex:
ref_seq: str = self.fasta.fetch(
region=f'{self.var_Chr}:{self.var_Start + 1}-{self.var_End + 150}'
)
for i in range(self.dup_num, 150):
if not ref_seq.startswith(self.dup_base * i):
var_seq = record.query_sequence[
ins_index + 1:ins_index + 1 + self.var_Len + len(self.dup_base) * (i - 1)
]
if var_seq == self.var_Alt + self.dup_base * (i - 1):
var_in_reads = True
break
if var_in_reads:
self.AD += 1
# var MQ
self.var_MQ.append(record.mapping_quality)
            # count read pairs that read through the variant
if record.query_name in self.var_reads_name:
self.Is_reads += 1
else:
self.var_reads_name.append(record.query_name)
else:
self.other_MQ.append(record.mapping_quality)
def add_DEL(self, record):
""" DEL 变异记录处理 """
# 简单区域判断
if self.complex:
# 变异在参考基因组上的位置之后可能的重复区域
dup_base = self.min_dup(self.var_Ref)
ref_seq: str = self.fasta.fetch(
region=f'{self.var_Chr}:{self.var_Start}-{self.var_Start - 1 + (len(dup_base) * self.dup_num)}'
)
if ref_seq.count(dup_base) == self.dup_num:
self.complex = False
pos_dict: dict = {v: k for (k, v) in record.get_aligned_pairs()}
        # if the DEL is present on the read, the bases flanking it have query indices while the deleted bases do not
if pos_dict[self.var_Start - 2] and pos_dict[self.var_End] and \
all(not pos_dict[i] for i in range(self.var_Start - 1, self.var_End)):
self.AD += 1
# var MQ
self.var_MQ.append(record.mapping_quality)
            # count read pairs that read through the variant
if record.query_name in self.var_reads_name:
self.Is_reads += 1
else:
self.var_reads_name.append(record.query_name)
else:
self.other_MQ.append(record.mapping_quality)
def add_DELINS(self, record):
""" DELINS 变异记录处理 """
# DELINS 在 reads 上的话,前面能匹配上且有索引,后面匹配上且有索引,del 区域没有索引
pre_var_seq: str = ''
inside: bool = False
for (i, pos) in record.get_aligned_pairs():
if not inside and (not pos or self.var_Start - 2 < pos):
inside = True
if i:
# print(i)
pre_var_seq = f"{pre_var_seq}{record.query_sequence[i]}"
continue
if inside and (not pos or pos < self.var_End):
if i:
# print(i)
pre_var_seq = f"{pre_var_seq}{record.query_sequence[i]}"
continue
if pos and pos >= self.var_End:
break
# logging.debug(pre_var_seq)
ref_seq: str = self.fasta.fetch(region=f'{self.var_Chr}:{self.var_Start}-{self.var_End}')
var_seq: str = ref_seq.replace(self.var_Ref, self.var_Alt, 1)
if var_seq == pre_var_seq:
self.AD += 1
# var MQ
self.var_MQ.append(record.mapping_quality)
            # count read pairs that read through the variant
if record.query_name in self.var_reads_name:
self.Is_reads += 1
else:
self.var_reads_name.append(record.query_name)
else:
self.other_MQ.append(record.mapping_quality)
def run(self):
""" 添加 bam 记录 """
for record in self.Bam_reader.fetch(region=f"{self.var_Chr}:{self.var_Start}-{self.var_End}"):
            # record the read orientation
self.reads_direction.add(record.is_reverse)
            # drop unmapped or unpaired reads
if record.is_unmapped or not record.is_paired:
continue
            # the variant lies within the read's aligned region (reads that do not cover the variant are skipped)
if self.var_inside(record.pos, record.aend):
self.DP += 1
                # dispatch by variant type
if self.var_Type == 'SNV':
self.add_SNV(record)
elif self.var_Type == 'INS':
self.add_INS(record)
elif self.var_Type == 'DEL':
self.add_DEL(record)
else:
self.add_DELINS(record)
                # alternative alignment positions (XA tag)
if record.has_tag('XA'):
self.XA_reads_num += 1
# elif self.var_near(record.pos, record.aend):
# pass
else:
continue
self.score: float = call_score(
self.AD, self.AF, self.Is_reads, self.Uniq, self.complex, self.high_MQ, self.Double_direction
)
self.level: str = cal_level(self.score)
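
With the class in place, a single variant can be checked interactively. A minimal usage sketch (the BAM and reference FASTA paths are placeholders; the variant string is taken from the test data below, and call_score / cal_level are the scoring functions from the previous post):

myvar = Var_Info(
    var="1 2338126 2338126 C A",        # chrom start end ref alt; '-' marks the empty allele for INS/DEL
    Bam="sample.dedup.bam",              # coordinate-sorted, indexed BAM file
    Fasta="human_g1k_v37_decoy.fasta",   # reference genome used for the alignment
    dup_num=4,                           # repeat count that marks a simple-repeat region
)
myvar.run()
print(myvar.DP, myvar.AD, myvar.AF, myvar.Is_reads,
      myvar.Uniq, myvar.complex, myvar.high_MQ, myvar.Double_direction,
      myvar.score, myvar.level)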
Validation
A few representative variants were picked from a BAM file and checked with the scoring code from the previous post to verify that the evidence-collection code is practical. The test code below assumes the Var_Info class above is available in the same module; the collected evidence items show essentially no difference from a manual review of the same sites in IGV, and the final confidence levels are essentially identical as well.
from call_confidence import call_score, cal_level
def check_example():
""" 测试数据检查结果 """
group = [{
"Bam": "NBS22FJ0003275.bam.dedup.bam",
"Info": [
{
"Var": "1 386546 386547 TG C",
"Res": f"1 {1 / 2} 0 True True False True E"
},
{
"Var": "1 386557 386557 G C",
"Res": f"1 {1 / 2} 0 True True False True E"
},
{
"Var": "1 16374185 16374185 G A", # NBS22FJ0003560 样本,该变异处于 reads 边缘位置
"Res": f"12 {12 / 12} 0 True True True False B"
},
{
"Var": "1 16376231 16376232 AA TAT",
"Res": f"245 {245 / 295} 62 False True True True B"
},
{
"Var": "1 16370873 16370873 - ACACACAT",
"Res": f"11 {11 / 51} 0 True True True True B"
},
{
"Var": "1 16370873 16370873 - ACACACACAT", #! 变异在 reads 边缘,原结果也未计算入总深度
"Res": f"32 {32 / 51} 3 True True True True A"
},
{
"Var": "1 16373200 16373202 CTT -",
"Res": f"194 {201 / 201} 54 True True True True A"
},
{
"Var": "1 21880818 21880818 - ATAT",
"Res": f"2 {2 / 9} 0 True False True False E"
},
{
"Var": "1 24019391 24019392 TT -",
"Res": f"2 {2 / 7} 0 True False True False E"
},
{
"Var": "1 24141011 24141011 C T",
"Res": f"4 {4 / 5} 0 True True True False D"
},
{
"Var": "2 211466714 211466715 AT -",
"Res": f"2 {2 / 4} 0 True False True False E"
},
{
"Var": "2 228118596 228118596 A G",
"Res": f"4 {4 / 5} 0 True True True True C"
},
{
"Var": "1 2338126 2338126 C A",
"Res": f"307 {307 / 310} 78 True True True True A"
},
],
}]
all_check: int = 0
fail_check: int = 0
for g in group:
for info in g['Info']:
all_check += 1
myvar = Var_Info(
var=info['Var'],
Bam=g['Bam'],
Fasta='/analysis_s140/reference/GATK_bundle_b37/human_g1k_v37_decoy.fasta',
dup_num=4
)
myvar.run()
# print('Type', myvar.var_Type)
# print('AD', myvar.AD)
# print('DP', myvar.DP)
# print('Is_reads', myvar.Is_reads)
# print('Uniq', myvar.Uniq)
# print('XA_reads', myvar.XA_reads_num)
# print('complex', myvar.complex)
# print('var_MQ', myvar.var_MQ)
# print('other_MQ', myvar.other_MQ)
# print('high_MQ', myvar.high_MQ)
if myvar.DP == 0:
                sys.exit(f'Depth is 0; please check that the variant position is correct {info["Var"]}')
score = call_score(
myvar.AD, myvar.AF, myvar.Is_reads, myvar.Uniq, myvar.complex, myvar.high_MQ, myvar.Double_direction)
level = cal_level(score)
res = f"{myvar.AD} {myvar.AF} {myvar.Is_reads} {myvar.Uniq} {myvar.complex} {myvar.high_MQ} {myvar.Double_direction} {level}"
if info['Res'] != res:
fail_check += 1
print(info["Var"])
print(f"[Error] 结果于预期不一致!\n\t预期: {info['Res']}\n\t结果: {res}")
print('-'*50)
continue
fail_info: str = ''
if fail_check:
fail_info = f"{fail_check} 个不符合要求。"
logging.info(f'一共验证 {all_check} 个变异。{fail_info}')
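
To run the check as a standalone script (again assuming the Var_Info class above is defined in, or imported into, the same file), a standard entry-point guard can be added:

if __name__ == '__main__':
    check_example()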
Summary
- The main purpose of this code is to cut down the time spent manually reviewing variants.
- Supported variants: SNVs, plus DEL, INS and complex DELINS variants smaller than 50 bp.
- The code has not yet been validated against a comprehensive set of variants, so there may be flaws it does not yet account for.