前言
工作中一直接触 DNA 突变,突变检出软件众多,包括 GATK、pisces、vardict、strelka 等,但经常会出现两个、甚至多个突变需要合并起来综合判断其功能等影响的情况;因此,希望能有一个便捷的工具对突变进行合并,而不需要人工从 igv 上进行去判读。
实现逻辑
- 将需要合并的突变进行标准化(左对齐)
- 按突变位置进行排序
- 根据突变位置顺序,依次整理合并出后的结果
输入输出
- 输入需要参考基因组 fa 以对突变进行左对齐和确定合并后的突变 ref,alt 碱基序列
- 合并完成后 Mergr 对象会多一个 result_var 属性,其是 Var 的实例化对象
完整代码
点击查看代码
import re
import pysam
from itertools import combinations
class Merge:
""" 合并多个突变 """
def __init__(self, fasta):
self.fasta = fasta
self.Fa = pysam.FastaFile(fasta)
self.vars = []
def 增加突变(self, in_var):
""" 添加需要合并的单个突变,重复调用添加多个突变 """
self.vars.append(Var(in_var, self.fasta))
def _check_chr(self):
""" 核对所有的突变是否在同一条染色体上 """
Chrs = set(var.Chr for var in self.vars)
if len(Chrs) > 1:
msg = f'染色体不一致 {Chrs}'
raise ValueError(msg)
def _check_pos(self):
""" 核对突变位置是否有交集 """
for a, b in combinations(self.vars, 2):
if a.with_same_pos(b):
msg = f'两个突变存在共同位置,不能合并. {a} {b}'
raise ValueError(msg)
def _sort_var_by_pos(self):
""" 根据位置顺序,排序变异 """
# 插入和点图标的话,插入突变要在其后端
pos_var = {}
for var in self.vars:
if var.Ref == '-':
pos_var[f'{var.Start}_2ins'] = var
else:
pos_var[f'{var.Start}_1other'] = var
new_var_list = []
for k in sorted([key for key in pos_var.keys()]):
# print(k, pos_var[k])
new_var_list.append(pos_var[k])
self.vars = new_var_list
def 合并(self) -> Var:
""" 将输入的突变合并成新的复杂合并 """
self._check_chr()
self._check_pos()
self._sort_var_by_pos()
start_pos = self.vars[0].Start + 1 if self.vars[0].Ref == '-' else self.vars[0].Start
# print(start_pos)
ref_seq = self.Fa.fetch(region=f'{self.vars[0].Chr}:{start_pos}-{self.vars[-1].End}')
base_dict = {p:b for p,b in zip(range(start_pos, self.vars[-1].End+1), ref_seq)}
# print(base_dict)
last_end = 0
alt_seq = ''
for n, var in enumerate(self.vars):
if n > 0:
end = var.Start + 1 if var.Ref == '-' else var.Start
alt_seq += "".join(base_dict[i] for i in range(last_end+1, end))
# print(var)
alt_seq += var.Alt
last_end = var.End
# 去除缺失符号 -
alt_seq = alt_seq.replace('-', '')
if len(ref_seq) > len(alt_seq):
loop = range(len(alt_seq))
else:
loop = range(len(ref_seq))
# print(ref_seq)
# print(alt_seq)
# End 移动
move = 0
for i in loop:
if ref_seq[::-1][i] != alt_seq[::-1][i]:
break
move += 1
if move:
ref_seq = ref_seq[:-move] if ref_seq[:-move] else '-'
alt_seq = alt_seq[:-move] if alt_seq[:-move] else '-'
End = self.vars[-1].End - move
# Start 移动
move = 0
for i in loop:
if ref_seq[i] != alt_seq[i]:
break
move += 1
ref_seq = ref_seq[move:] if ref_seq[move:] else '-'
alt_seq = alt_seq[move:] if alt_seq[move:] else '-'
if ref_seq == '-':
Start = start_pos+move - 1
End = End + 1
else:
Start = start_pos+move
End = End
# print(f'{Start} {End} {ref_seq} {alt_seq}')
o_var = f'{self.vars[0].Chr} {Start} {End} {ref_seq if len(ref_seq) < 10 else (ref_seq[:10] + "...")} {alt_seq if len(alt_seq)<10 else (alt_seq[:10] + "...")}'
print(f'合并后的突变: [{o_var}]')
self.result_var = Var(f'{self.vars[0].Chr} {Start} {End} {ref_seq} {alt_seq}', self.fasta)
def __str__(self):
return str(self.result_var)
class Var:
""" 突变基础类,记录突变的 染色体、起始位置、终止位置、ref序列和突变序列
并将其进行左对齐,以进行比较
"""
def __init__(self, var, fasta):
self.in_var = var
self.fasta = pysam.FastaFile(fasta)
self.Chr, self.Start, self.End, self.Ref, self.Alt, self.Type = self.格式化(var)
self.左对齐()
def 格式化(self, var):
""" # 处理 vcf 格式的插入缺失, 如: T/TC TC/T """
var = re.sub('[ \:\t_]+', ' ', var.strip())
m = re.findall(r'(\d *?\- *?\d)', var)
if m:
var = var.replace(m[0], m[0].replace('-', ' '))
var = re.sub(r' +', ' ', var)
Chr, Start, End, Ref, Alt = var.strip().split(' ')
Ref = Ref.upper()
Alt = Alt.upper()
Start = int(Start)
End = int(End)
if Ref.startswith(Alt):
Ref = Ref[len(Alt):]
Alt = '-'
Start += len(Alt)
elif Alt.startswith(Ref):
Alt = Alt[len(Ref):]
Ref = '-'
elif Ref == '-':
if Start == End - 1:
End -= 1
Chr = Chr.upper()
if Chr.startswith('CHR'):
Chr = Chr.replace('CHR', 'chr')
else:
Chr = f'chr{Chr}'
if Ref == '' or Alt == '' or (Ref == '-' and Alt == '-'):
raise ValueError(f'突变的碱基写法错误 {self.in_var}')
if any([c not in 'ATCG-' for c in Ref+Alt]):
raise ValueError(f'突变的碱基字母错误 {self.in_var}')
if not any([f'chr{c}' == Chr for c in list(range(1,23)) + ['X', 'Y']]):
raise ValueError(f'染色体有误 {Chr}')
Type = self.突变类型判断(Ref, Alt)
return Chr, Start, End, Ref, Alt, Type
@staticmethod
def 最小重复碱基(base: str):
""" 找到输入碱基的最小重复区域,用于确定串联重复的最小单位 """
for i in range(len(base)):
if base.count(base[0:i+1]) == len(base) / (i + 1):
return base[0:i+1]
else:
return base
@staticmethod
def 突变类型判断(Ref, Alt):
""" 根据输入位置判断突变的类型 """
if Ref == '-':
return 'INS'
elif Alt == '-':
return 'DEL'
elif len(Ref) > 1:
return 'DELINS'
else:
return 'SNV'
def 左对齐(self):
""" DEL、INS 左对齐,注意碱基变化 """
if self.Type in ['SNV', 'DELINS']:
return
var_dup_base = self.最小重复碱基(self.Alt) if self.Type == 'INS' else self.最小重复碱基(self.Ref)
local_start = self.Start if self.Type == 'INS' else self.Start - 1
ref_left_seq: str = self.fasta.fetch(
region=f'{self.Chr}:{local_start-500}-{local_start}'
).upper()[::-1]
dup_len = len(var_dup_base)
反向重复碱基 = var_dup_base[::-1]
for count in range(int(500//len(var_dup_base))):
if ref_left_seq[count*dup_len:(count+1)*dup_len] != 反向重复碱基:
break
# 先对重复区域左移
if count:
move_len = count * dup_len
self.Start = self.Start - move_len
self.End = self.End - move_len
# 相同碱基的左对齐
if self.Type == 'DEL':
# print(self.Start)
left_seq: str = self.fasta.fetch(
region=f'{self.Chr}:{self.Start-150}-{self.Start-1}'
).upper()[::-1]
target_seq = self.Ref[::-1]
elif self.Type == 'INS':
left_seq: str = self.fasta.fetch(
region=f'{self.Chr}:{self.Start-150}-{self.Start}'
).upper()[::-1]
target_seq = self.Alt[::-1]
mv_len = 0
for i, base in enumerate(target_seq):
if base == left_seq[i]:
mv_len += 1
continue
break
# 最终位置和碱基调整
if mv_len:
if self.Type == 'DEL':
self.Start = self.Start - mv_len
self.End = self.End - mv_len
self.Ref = self.fasta.fetch(region=f'{self.Chr}:{self.Start}-{self.End}').upper()
elif self.Type == 'INS':
self.Start = self.Start - mv_len
self.End = self.End - mv_len
self.Alt = f'{left_seq[:mv_len][::-1]}{self.Alt[:-mv_len]}'
def with_same_pos(self, in_var):
""" 判断两个突变位置是否有交集 """
# 插入突变不消耗位置
if in_var.Ref == '-' or self.Ref == '-':
return False
return self.Start <= in_var.Start <= self.End or \
self.Start <= in_var.End <= self.End or \
(in_var.Start <= self.Start and \
in_var.End >= self.End)
def __str__(self):
return f'[{self.Chr} {self.Start} {self.End} {self.Ref} {self.Alt}]'
def tab_str(self):
return f'{self.Chr}\t{self.Start}\t{self.End}\t{self.Ref}\t{self.Alt}'
使用示例
点击查看代码
import Merge
fasta = 'genome.fa'
m = Merge(fasta)
# 三个突变合并到一起
m.增加突变('chr6 117632269 117632270 - ACT')
m.增加突变('chr6 117632269 117632269 A T')
m.增加突变('chr6 117632270 117632270 G A')
m.合并()
# 运行后提示为 “合并后的突变: [chr6 117632269 117632270 AG TACTA]”
# 也可使用 print(m.result_var) 打印其内容
拓展
可结合先前开发的可信度评估工具,对检出的突变自动判断是否需要合并,然后自动判读合并的突变是否可信,再将可信度突变增加到检出结果内,随后续流程一并注释: https://www.cnblogs.com/magower/p/18062277