2022-11-20 12:09:06 星期日

目的

原先写过一个 DNA germline 变异可信度判定 ,描述了一套评分机制,根据几项证据项进行变异可信度的判定,但是没有对证据项的来源进行描述的讲解,本篇补充证据项收集内容,并展示一些可用的代码。

思路

所需的证据项,均可从 bam 文件内提取解析得到,因此,证据项收集思路为:

  1. 根据变异位置,获取 bam 文件内变异附近的 reads 记录。
  2. 解析每条 reads 记录,收集所需的证据项信息。

证据项定义

DP: 跨过变异区域的 reads 条目数量 (即变异位置的测序深度)
AD: 检测到变异的 reads 条目数量 (即变异深度)
Is_reads: 测穿变异的 reads 条目数量
Uniq: 变异处于唯一比对区域。 80% 的 reads 都没有 XA,则 uniq 为 True
complex: 变异在复杂序列区域。(变异左右的序列是否是复杂序列)
high_MQ_reads: 变异所在 reads 的平均 MQ ,不低于其他 reads 的平均 MQ 的 50%。且变异的平均 MQ > 10。
Double_direction: 双向 reads 测到。

代码实现

点击查看代码
import sys
import attr
import pysam
import logging
import argparse

logging.basicConfig(
    level = logging.INFO,
    format = "[%(asctime)s - %(levelname)-4s - %(module)s : line %(lineno)s] %(message)s",
    datefmt = "%Y-%m-%d %H:%M:%S"
)

@attr.s
class Var_Info:
    var= attr.ib(type=str, default=attr.Factory(str))    # 实际位置变异,非 vcf 位置变异
    Bam= attr.ib(type=str, default=attr.Factory(str))
    Fasta = attr.ib(type=str, default=attr.Factory(str))
    # 重复次数超过限制认为是简单重复区域
    dup_num= attr.ib(type=int, default=attr.Factory(int))

    def __attrs_post_init__(self):
        if not self.dup_num:
            self.dup_num = 3
        self.fasta = pysam.FastaFile(self.Fasta)
        try:
            self.Bam_reader = pysam.AlignmentFile(self.Bam, "rb")
        except StopIteration as Err:
            logging.error(f'变异位置不存在 reads [{self.var}]')
            sys.exit(1)
        self.var_Chr, self.var_Start, self.var_End, \
            self.var_Ref, self.var_Alt = self.var.split(' ')
        self.var_Start: int = int(self.var_Start)
        self.var_End: int = int(self.var_End)
        self.__set_Type_Len()
        self.DP: int = 0
        self.AD: int = 0
        self.Is_reads: int = 0
        self.XA_reads_num: int = 0
        self.var_MQ: list = []
        self.other_MQ: list = []
        self.var_reads_name: list = []
        self.complex: bool = True
        self.reads_direction: set = set()

    def __set_Type_Len(self) -> None:
        """ 分析设置变异类型和长度 """
        if self.var_Ref == '-':
            self.var_Type = 'INS'
            self.var_Len = len(self.var_Alt)
        elif self.var_Alt == '-':
            self.var_Type = 'DEL'
            self.var_Len = len(self.var_Ref)
        elif len(self.var_Alt) == len(self.var_Ref) == 1:
            self.var_Type: str = 'SNV'
            self.var_Len: int = 1
        else:
            self.var_Type = 'DELINS'
            self.var_Len = len(self.var_Alt) if len(self.var_Alt) > len(self.var_Ref) else len(self.var_Ref)
        if self.var_Len > 50:
            logging.error(f'变异大于 50 bp,不适用于本程序 [{self.var}]')
            sys.exit(1)

    @property
    def Uniq(self) -> bool:
        """ 判断变异是否处于 reads 的唯一比对位置 """
        if self.XA_reads_num/self.DP > 0.2:
            return False
        return True

    @property
    def Double_direction(self) -> bool:
        """ 判断变异是否处于 reads 的唯一比对位置 """
        if len(self.reads_direction) == 1:
            return False
        return True

    @property
    def high_MQ(self) -> bool:
        """ 判断变异是否处于 reads 的唯一比对位置 """
        if len(self.other_MQ) == 0 and len(self.var_MQ) != 0:
            if sum(self.var_MQ)/len(self.var_MQ) > 10:
                return True
            else:
                return False
        if sum(self.other_MQ) == 0:
            if sum(self.var_MQ)/len(self.var_MQ) > 10:
                return True
            else:
                return False
        if (sum(self.var_MQ)/len(self.var_MQ)) / (sum(self.other_MQ)/len(self.other_MQ)) < 0.5:
            return False
        return True

    @property
    def AF(self) -> float:
        if self.DP == 0:
            return 1
        return self.AD / self.DP

    @staticmethod
    def min_dup(base: str):
        """ 找到 del、ins 的最小重复区域 """
        for i in range(len(base)):
            if base.count(base[0:i+1]) == len(base) / (i + 1):
                return base[0:i+1]
        else:
            return base

    def var_inside(self, Start, End) -> bool:
        """ 变异是否在某个区间内 """
        # 变异在比对位置内
        if self.var_Start >= Start and self.var_End <= End:
            return True
        return False

    def var_near(self, Start, End) -> bool:
        """ 变异是否在某个区间内 """
        # 变异在比对位置内
        if self.var_Start == End or self.var_End == Start:
            return True
        return False

    def add_SNV(self, record):
        """ SNV 变异记录处理 """
        # 点突变在 reads 上的起始索引位置
        for (i, pos) in record.get_aligned_pairs():
            if pos == self.var_Start - 1:
                snv_index: int = i
                break
        else:
            logging.error(f'SNV 位置不存在 [{self.var_Start}] in reads [{record.query_name}]')
            sys.exit(1)
        # 变异在这个记录的 reads 上
        if record.query_sequence[snv_index] == self.var_Alt:
            self.AD += 1
            # 简单区域判断
            if self.complex:
                # 变异在参考基因组上的位置之后可能的重复区域
                ref_seq: str = self.fasta.fetch(region=f'{self.var_Chr}:{self.var_Start+1}-{self.var_End + (self.var_Len * self.dup_num)}')
                if ref_seq.count(self.var_Alt) == self.dup_num:
                    self.complex = False
            # var MQ
            self.var_MQ.append(record.mapping_quality)
            # 测穿的 reads 对数量
            if record.query_name in self.var_reads_name:
                self.Is_reads += 1
            else:
                self.var_reads_name.append(record.query_name)
        else:
            self.other_MQ.append(record.mapping_quality)

    def add_INS(self, record):
        """ INS 变异记录处理 """
        # 简单区域判断
        if self.complex:
            # 变异在参考基因组上的位置之后可能的重复区域
            self.dup_base = self.min_dup(self.var_Alt)
            ref_seq: str = self.fasta.fetch(
                region=f'{self.var_Chr}:{self.var_Start + 1}-{self.var_End + (len(self.dup_base) * self.dup_num)}'
            )
            if ref_seq.count(self.dup_base) == self.dup_num:
                self.complex = False
        # INS 在 reads 上的起始索引位置,实际的 INS 实在起始碱基之后才开始插入序列的
        for (i, pos) in record.get_aligned_pairs():
            if pos == self.var_Start - 1:
                ins_index: int = i
                break
        else:
            logging.error(f'INS 位置不存在 [{self.var_Start}] in reads [{record.query_name}]')
            sys.exit(1)
        # 变异在这个记录的 reads 上
        var_in_reads: bool = False
        if self.complex and record.query_sequence[ins_index+1:ins_index+1+self.var_Len] == self.var_Alt:
            var_in_reads = True
        elif not self.complex:
            ref_seq: str = self.fasta.fetch(
                region=f'{self.var_Chr}:{self.var_Start + 1}-{self.var_End + 150}'
            )
            for i in range(self.dup_num, 150):
                if not ref_seq.startswith(self.dup_base * i):
                    var_seq = record.query_sequence[
                        ins_index + 1:ins_index + 1 + self.var_Len + len(self.dup_base) * (i - 1)
                    ]
                    if var_seq == self.var_Alt + self.dup_base * (i - 1):
                        var_in_reads = True
                    break
        if var_in_reads:
            self.AD += 1
            # var MQ
            self.var_MQ.append(record.mapping_quality)
            # 测穿的 reads 对数量
            if record.query_name in self.var_reads_name:
                self.Is_reads += 1
            else:
                self.var_reads_name.append(record.query_name)
        else:
            self.other_MQ.append(record.mapping_quality)

    def add_DEL(self, record):
        """ DEL 变异记录处理 """
        # 简单区域判断
        if self.complex:
            # 变异在参考基因组上的位置之后可能的重复区域
            dup_base = self.min_dup(self.var_Ref)
            ref_seq: str = self.fasta.fetch(
                region=f'{self.var_Chr}:{self.var_Start}-{self.var_Start - 1 + (len(dup_base) * self.dup_num)}'
            )
            if ref_seq.count(dup_base) == self.dup_num:
                self.complex = False
        pos_dict: dict = {v: k for (k, v) in record.get_aligned_pairs()}
        # DEL 在 reads 上的话,前面能匹配上且有索引,后面匹配上且有索引,del 区域没有索引
        if pos_dict[self.var_Start - 2] and pos_dict[self.var_End] and \
            all(not pos_dict[i] for i in range(self.var_Start - 1, self.var_End)):
            self.AD += 1
            # var MQ
            self.var_MQ.append(record.mapping_quality)
            # 测穿的 reads 对数量
            if record.query_name in self.var_reads_name:
                self.Is_reads += 1
            else:
                self.var_reads_name.append(record.query_name)
        else:
            self.other_MQ.append(record.mapping_quality)

    def add_DELINS(self, record):
        """ DELINS 变异记录处理 """
        # DELINS 在 reads 上的话,前面能匹配上且有索引,后面匹配上且有索引,del 区域没有索引
        pre_var_seq: str = ''
        inside: bool = False
        for (i, pos) in record.get_aligned_pairs():
            if not inside and (not pos or self.var_Start - 2 < pos):
                inside = True
                if i:
                    # print(i)
                    pre_var_seq = f"{pre_var_seq}{record.query_sequence[i]}"
                    continue
            if inside and (not pos or pos < self.var_End):
                if i:
                    # print(i)
                    pre_var_seq = f"{pre_var_seq}{record.query_sequence[i]}"
                    continue
            if pos and pos >= self.var_End:
                break
        # logging.debug(pre_var_seq)
        ref_seq: str = self.fasta.fetch(region=f'{self.var_Chr}:{self.var_Start}-{self.var_End}')
        var_seq: str = ref_seq.replace(self.var_Ref, self.var_Alt, 1)
        if var_seq == pre_var_seq:
            self.AD += 1
            # var MQ
            self.var_MQ.append(record.mapping_quality)
            # 测穿的 reads 对数量
            if record.query_name in self.var_reads_name:
                self.Is_reads += 1
            else:
                self.var_reads_name.append(record.query_name)
        else:
            self.other_MQ.append(record.mapping_quality)

    def run(self):
        """ 添加 bam 记录 """
        for record in self.Bam_reader.fetch(region=f"{self.var_Chr}:{self.var_Start}-{self.var_End}"):
            # reads 方向
            self.reads_direction.add(record.is_reverse)
            # 未比对,非配对的 reads 剔除
            if record.is_unmapped or not record.is_paired:
                continue
            # 变异在 read 的比对上的区域( reads 不包含变异位置的 reads 剔除)
            if self.var_inside(record.pos, record.aend):
                self.DP += 1
                # 按变异类型分开处理
                if self.var_Type == 'SNV':
                    self.add_SNV(record)
                elif self.var_Type == 'INS':
                    self.add_INS(record)
                elif self.var_Type == 'DEL':
                    self.add_DEL(record)
                else:
                    self.add_DELINS(record)
                # 其他比对位置
                if record.has_tag('XA'):
                    self.XA_reads_num += 1
            # elif self.var_near(record.pos, record.aend):
            #     pass
            else:
                continue
        self.score: float = call_score(
            self.AD, self.AF, self.Is_reads, self.Uniq, self.complex, self.high_MQ, self.Double_direction
        )
        self.level: str = cal_level(self.score)




代码检验

从 bam 文件内挑选一些示范性的变异,利用上期评分机制代码,检验证据项收集代码的实用性,下方是测试代码,与人工在 igv 上审查证据项基本没有差异,最终的评级也基本一致。

点击查看代码
from call_confidence import call_score, cal_level

def check_example():
    """ 测试数据检查结果 """
    group = [{
        "Bam": "NBS22FJ0003275.bam.dedup.bam",
        "Info": [
            {
                "Var": "1 386546 386547 TG C",
                "Res": f"1 {1 / 2} 0 True True False True E"
            },
            {
                "Var": "1 386557 386557 G C",
                "Res": f"1 {1 / 2} 0 True True False True E"
            },
            {
                "Var": "1 16374185 16374185 G A",   # NBS22FJ0003560 样本,该变异处于 reads 边缘位置
                "Res": f"12 {12 / 12} 0 True True True False B"
            },
            {
                "Var": "1 16376231 16376232 AA TAT",
                "Res": f"245 {245 / 295} 62 False True True True B"
            },
            {
                "Var": "1 16370873 16370873 - ACACACAT",
                "Res": f"11 {11 / 51} 0 True True True True B"
            },
            {
                "Var": "1 16370873 16370873 - ACACACACAT",  #! 变异在 reads 边缘,原结果也未计算入总深度
                "Res": f"32 {32 / 51} 3 True True True True A"
            },
            {
                "Var": "1 16373200 16373202 CTT -",
                "Res": f"194 {201 / 201} 54 True True True True A"
            },
            {
                "Var": "1 21880818 21880818 - ATAT",
                "Res": f"2 {2 / 9} 0 True False True False E"
            },
            {
                "Var": "1 24019391 24019392 TT -",
                "Res": f"2 {2 / 7} 0 True False True False E"
            },
            {
                "Var": "1 24141011 24141011 C T",
                "Res": f"4 {4 / 5} 0 True True True False D"
            },
            {
                "Var": "2 211466714 211466715 AT -",
                "Res": f"2 {2 / 4} 0 True False True False E"
            },
            {
                "Var": "2 228118596 228118596 A G",
                "Res": f"4 {4 / 5} 0 True True True True C"
            },
            {
                "Var": "1 2338126 2338126 C A",
                "Res": f"307 {307 / 310} 78 True True True True A"
            },
        ],
    }]
    all_check: int = 0
    fail_check: int = 0
    for g in group:
        for info in g['Info']:
            all_check += 1
            myvar = Var_Info(
                var=info['Var'],
                Bam=g['Bam'],
                Fasta='/analysis_s140/reference/GATK_bundle_b37/human_g1k_v37_decoy.fasta',
                dup_num=4
            )
            myvar.run()
            # print('Type', myvar.var_Type)
            # print('AD', myvar.AD)
            # print('DP', myvar.DP)
            # print('Is_reads', myvar.Is_reads)
            # print('Uniq', myvar.Uniq)
            # print('XA_reads', myvar.XA_reads_num)
            # print('complex', myvar.complex)
            # print('var_MQ', myvar.var_MQ)
            # print('other_MQ', myvar.other_MQ)
            # print('high_MQ', myvar.high_MQ)
            if myvar.DP == 0:
                sys.exit(f'深度为 0,请确认变异位置是否正确 {info["Var"]}')
            score = call_score(
                myvar.AD, myvar.AF, myvar.Is_reads, myvar.Uniq, myvar.complex, myvar.high_MQ, myvar.Double_direction)
            level = cal_level(score)
            res = f"{myvar.AD} {myvar.AF} {myvar.Is_reads} {myvar.Uniq} {myvar.complex} {myvar.high_MQ} {myvar.Double_direction} {level}"
            if info['Res'] != res:
                fail_check += 1
                print(info["Var"])
                print(f"[Error] 结果于预期不一致!\n\t预期: {info['Res']}\n\t结果: {res}")
                print('-'*50)
                continue
    fail_info: str = ''
    if fail_check:
        fail_info = f"{fail_check} 个不符合要求。"
    logging.info(f'一共验证 {all_check} 个变异。{fail_info}')



总结

  • 该代码主要是为了节省人工核查变异的耗时。
  • 支持的变异包括 SNV, 小于 50bp 的 DEL、INS、DELINS 的复杂变异。
  • 由于代码还未经过全面的变异检验,可能会存在未考虑到的缺陷。