背景
接触过一段时间遗传变异检测,现有公认的 GATK 检测小变异(snv + indel)的 “最佳实践” ,但是对于结构变异(SV),尽管有相当多软件能进行 SV 检测,但在其目前业内没有共识的检测软件和流程。现将自己的调研结果和成果记录下来。
SV 二代的检测原理主要有 4 个:
- RP : 基于双端测序数据,进行 SV 检测, 它不能精确到单个碱基断点。
- SR : 检测到的 SVs 断点能精确到单个碱基。
- RD : 基于 read 覆盖深度, 计算拷贝数变异。
- AS : 基于 de novo assembly。
此外还可能利用 SV 范围内 snv 变异辅助确定,比如二倍体 DUP 上的杂合变异 reads 的 ref/alt 倾向于 1/2 或 2/1, 而二倍体 DEL 范围内应该不存在杂合变异。各检测原理都有各自的优势,其中 AS 是相对最为准确的原理。文献综述推荐使用多种检测原理同时进行检测,最后综合各原理的检测结果进行筛选、过滤,以此提高检测灵敏度和准确度。
文献
在 GRIDSS 文献选用了大家常用,比较可靠的 SV 检测软件进行比较全面的测试,从结果上看,多款软件都有各自的优势,如 gridss 和 lumpy 检测的断点位置最准确, manta、Socrates 检测的 SV 更全面,测试结果如下图:
- 根据多个文献描述,预期 SV 检测流程图:
经验
SV 检测软件: lumpy
,manta
SV vcf 合并:缺乏通用工具
自建 python 工具:
点击查看代码
import re
import os
import sys
import attr
import gzip
import logging
from typing import TextIO
from collections import defaultdict, OrderedDict
logging.basicConfig(
level = logging.INFO,
format = "[%(asctime)s - %(levelname)-4s - %(module)s : line %(lineno)s] %(message)s",
datefmt = "%Y-%m-%d %H:%M:%S"
)
@attr.s
class VCF_header:
""" vcf 表头储存类 """
SAMPLES = attr.ib(type=list, default=attr.Factory(list))
COL_HEARER = attr.ib(type=str, default='#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT')
def __attrs_post_init__(self):
self.REC_order = {
'fileformat': '',
'source': '',
'reference': '',
'contig': '',
'INFO': '',
'FORMAT': '',
'FILTER': '',
'ALT': '',
'OTHER': '',
}
def add(self, info: str):
for line in info.strip().split('\n'):
line = line.strip()
# 空行跳过
if not line:
continue
if line.startswith(f'##'):
key = re.findall(f'^##(\w+)=', line)[0]
if key in self.REC_order:
# vcf 文件类型只选用其中一个
if key == 'fileformat' and self.REC_order[key]:
continue
# 重复记录去除
if line in self.REC_order[key]:
continue
self.REC_order[key] = f"{self.REC_order[key]}{line}\n"
else:
self.REC_order['OTHER'] = f"{self.REC_order['OTHER']}{line}\n"
elif line.startswith(f'#CHROM'):
if not self.SAMPLES:
self.SAMPLES = line.split('\t')[9:]
# 增加表头时,核查样本是否一致
elif self.SAMPLES != line.split('\t')[9:]:
sys.exit(f'【Error】 样本编号不一致!')
else:
continue
# 增加新的 header 信息
add_info = [
['INFO', '##INFO=<ID=SVTOOL,Number=1,Type=String,Description="Tool used to generate the SV">'],
['INFO', '##INFO=<ID=NUM_SVTOOLS,Number=1,Type=Integer,Description="Number of SV callers supporting the event">'],
]
for k, v in add_info:
if k not in self.REC_order:
logging.warning(f'表头字段不存在 {k}, 忽略新增的表头: \n{v}')
continue
if v not in self.REC_order[k]:
logging.debug(f'新增 header 记录 {k} : {v}')
self.REC_order[k] = f"{self.REC_order[k]}{v}\n"
def merge(self, in_header):
""" 合并另一个 VCF_header 实例类 """
# 样本不一致时不合并
if self.SAMPLES != in_header.SAMPLES:
logging.error(f'样本不一致')
for v in in_header.REC_order.values():
self.add(v)
def __str__(self) -> str:
header_str = "".join([info for info in self.REC_order.values() if info])
all_sample = "\t".join(self.SAMPLES)
header_str = f"{header_str.strip()}\n{self.COL_HEARER}\t{all_sample}\n"
return header_str
@attr.s
class Recod:
""" vcf 单条记录储存类 """
CHROM = attr.ib(type=str, default=attr.Factory(str))
POS = attr.ib(type=str, default=attr.Factory(str))
ID = attr.ib(type=str, default=attr.Factory(str))
REF = attr.ib(type=str, default=attr.Factory(str))
ALT = attr.ib(type=str, default=attr.Factory(str))
QUAL = attr.ib(type=str, default=attr.Factory(str))
FILTER = attr.ib(type=str, default=attr.Factory(str))
INFO = attr.ib(type=defaultdict, default=attr.Factory(defaultdict))
FORMAT = attr.ib(type=list, default=attr.Factory(list))
SAMPLE_value = attr.ib(type=OrderedDict, default=attr.Factory(OrderedDict))
def __str__(self) -> str:
res = f"{self.CHROM}\t{self.POS}\t{self.ID}\t{self.REF}\t{self.ALT}\t{self.QUAL}\t{self.FILTER}\t"
for k, v in self.INFO.items():
res = f"{res}{k}={v};"
else:
res = res.strip(';')
res = f"{res}\t{':'.join(self.FORMAT)}\t"
for vs in self.SAMPLE_value.values():
for v in vs.values():
res = f"{res}{v}:"
else:
res = res.strip(':')
return res
@attr.s
class VCF_recods:
""" vcf 记录储存类,自动核查是否按染色体排序 """
SAMPLES = attr.ib(type=list, default=attr.Factory(list))
all_records = attr.ib(type=list, default=attr.Factory(list))
allow_chrom = attr.ib(type=list, default= [
str(i) for i in range(1,23)
] + [
f'CHR{i}' for i in range(1,23)
] + ['X', 'Y', 'CHRX', 'CHRY']
)
# 用于核对 add 添加的 record 是否按指定的染色体顺序
__last_chrom = attr.ib(type=str, default=attr.Factory(str))
__last_pos = attr.ib(type=str, default=attr.Factory(str))
def Iter(self) -> Recod:
""" 记录访问的生成器 """
for rec in self.all_records:
yield rec
def __str__(self) -> str:
return "\n".join([r.__str__() for r in self.all_records])
def add(self, info: str, svtool: str='', min_sv: int=50):
""" 添加新的记录 """
cells = info.strip().split('\t')
# print('cells', cells)
# 空行跳过,列数小于 10 (至少一个样本) 报错退出。
if not cells:
return
if len(cells) < 10:
logging.error(f'vcf 记录小于 10 列: \n{info}')
sys.exit('vcf 记录小于 10 列')
# 染色体合法性
cells[0] = cells[0].upper()
if cells[0] not in self.allow_chrom:
logging.debug(f'染色体不合法: \n{info}')
return
if 'SVTYPE=BND' in cells[7]:
alt_chr = re.findall(r'[\[\]]([\w\.]+):\d+', cells[4])[0]
if alt_chr not in self.allow_chrom:
logging.debug(f'alt 染色体不合法: \n{info}')
return
# 排序检查
if not self.__last_chrom:
self.__last_chrom = cells[0]
self.__last_pos = cells[1]
elif self.allow_chrom.index(cells[0]) < self.allow_chrom.index(self.__last_chrom) or \
(cells[0] == self.__last_chrom and int(self.__last_pos) > int(cells[1])):
# print('self.__last_chrom', self.__last_chrom)
# print('self.__last_pos', self.__last_pos)
logging.error(f'vcf 没有排序: {info}')
sys.exit()
record = Recod()
record.CHROM = cells[0]
record.POS = cells[1]
record.ID = cells[2]
record.REF = cells[3]
record.ALT = cells[4]
record.QUAL = cells[5]
record.FILTER = cells[6]
for val in cells[7].split(';'):
if '=' not in val:
record.INFO.update({val:val})
else:
k, v = val.split('=')
record.INFO.update({k:v})
# INFO 内添加 SVTOOL,NUM_SVTOOLS 字段
if 'NUM_SVTOOLS' not in record.INFO:
record.INFO['NUM_SVTOOLS'] = 1
if 'SVTOOL' not in record.INFO:
if svtool:
record.INFO['SVTOOL'] = svtool
else:
record.INFO['SVTOOL'] = 'unknown'
record.FORMAT = cells[8].split(':')
# 必须要有样本
if not self.SAMPLES:
logging.error(f'没有样本名,不能添加数据!: \n{info}')
return
# 核对样本数量是否一致
if len(self.SAMPLES) != len(cells) - 9:
logging.error(f'样本数量不一致!: \n{self.SAMPLES}\n{info}')
return
# SV 长度要达标
if 'SVLEN' in record.INFO:
if abs(int(record.INFO['SVLEN'])) < min_sv:
logging.debug(f'变异长度不达标, min sv len = {min_sv}bp')
return
# 添加样本的数据
for i, sample in enumerate(self.SAMPLES):
values = cells[9+i].split(':')
for j, k in enumerate(record.FORMAT):
if sample not in record.SAMPLE_value:
record.SAMPLE_value[sample] = OrderedDict()
try:
a = values[j]
except IndexError:
print(cells)
print(record.FORMAT)
print(values, j)
sys.exit()
if ',' in values[j]:
values[j] = values[j].split(',')[-1]
record.SAMPLE_value[sample][k] = values[j]
#* PR 字段与 PE 同义,增加 PE
if k == 'PR':
record.SAMPLE_value[sample]['PE'] = values[j]
# 'PR' 前面加上 'PE'
if 'PR' in record.FORMAT and 'PE' not in record.FORMAT:
record.FORMAT.insert(record.FORMAT.index('PR'), 'PE')
# 增加到记录的列表内
if record in self.all_records:
logging.warning(f'记录重复: {info}')
return
self.__last_chrom = cells[0]
self.__last_pos = cells[1]
self.all_records.append(record)
@attr.s
class VCF:
""""
将 vcf 数据存入 VCF 类对象, 特点:
a. 使用 ‘标签:vcf文件名’ 来标记 vcf 来源,如 GATK:germline.vcf.gz
b. 自动对输入 vcf 记录排序
vcf : 一个 vcf 文件,允许传入压缩格式,允许给 vcf 添加特殊来源标记。
如 GATK 来源的 vcf 文件 germline.vcf.gz, 可写成 GATK:germline.vcf.gz
"""
vcf = attr.ib(type=str, default=attr.Factory(str))
min_sv = attr.ib(type=int, default=attr.Factory(int))
def __attrs_post_init__(self):
""" 解析 vcf 文件,保存为 header, records 对象 """
self.svtool = None
if self.vcf.count(':') == 1:
self.svtool, self.vcf = self.vcf.split(':')
if not os.path.isfile(self.vcf):
logging.error(f'【error】 文件不存在 {self.vcf}')
sys.exit()
if not self.min_sv:
self.min_sv: int = 50
if self.vcf.endswith('.gz'):
Reader = gzip.open(self.vcf, 'rt')
else:
Reader = open(self.vcf)
# 确保 vcf 记录按顺序排列
Reader = self._sort_check(Reader)
self.header = VCF_header()
self.records = VCF_recods()
for line in Reader:
if line.startswith('#'):
self.header.add(line)
self.records.SAMPLES = self.header.SAMPLES
else:
self.records.add(line, self.svtool, self.min_sv)
Reader.close()
logging.info(f"<{self.svtool}> 变异数量: {len(self.records.all_records)}")
def __str__(self) -> str:
return self.header.__str__() + "\n".join([r.__str__() for r in self.records.all_records])
def merge(self, in_vcf, overlap_p=0.8):
""" 合并另一个 VCF_recods 实例类, 两记录相同类型变异重叠区域超过最大区域的 80% 就合并 """
# 没有信息或样本不一致,直接退出
if not in_vcf.header or not in_vcf.records or \
in_vcf.header.SAMPLES != self.header.SAMPLES:
logging.error(f'数据为空,或者样本不一致!')
return
# 合并 header
self.header.merge(in_vcf.header)
id = 1
new_all_recods = VCF_recods()
my_generator = self.Iter()
in_generator = in_vcf.Iter()
my_rec: Recod = next(my_generator, None)
in_rec: Recod = next(in_generator, None)
my_next = None
in_next = None
# couns = 0
while my_rec or in_rec:
# couns += 1
# if my_rec.POS == in_rec.POS== '152807507':
# if couns > 4000 and couns % 1000:
# print('my_rec', my_rec)
# print('in_rec', in_rec, end='\n\n')
# if couns > 5000:
# sys.exit()
# 输入项没有了时,直接保存原记录
if not in_rec:
my_rec.ID = id
id += 1
new_all_recods.all_records.append(my_rec)
my_rec = next(my_generator, None)
continue
# 仅有输入项,直接保存输入项
elif not my_rec:
in_rec.ID = id
id += 1
new_all_recods.all_records.append(in_rec)
in_rec = next(in_generator, None)
continue
# 处理一个 vcf 文件内存在多个位置相同的 SV
my_rec_list = []
in_rec_list = []
same_pos = False
while my_rec.CHROM == in_rec.CHROM and my_rec.POS == in_rec.POS:
if not my_next:
my_next = next(my_generator, None)
if my_next and my_next.CHROM == my_rec.CHROM and my_next.POS == my_rec.POS:
my_rec_list.append(my_rec)
my_rec = my_next
my_next = None
continue
in_next = next(in_generator, None)
if in_next and in_next.CHROM == in_rec.CHROM and in_next.POS == in_rec.POS:
in_rec_list.append(in_rec)
in_rec = in_next
continue
same_pos = True
break
my_rec_list.append(my_rec)
in_rec_list.append(in_rec)
done_rec = []
call_my = False # 判断是否需要载入下一个 my_rec
call_in = False # 判断是否需要载入下一个 in_rec
for my_rec in my_rec_list:
for in_rec in in_rec_list:
res = self._compare_records(my_rec, in_rec, id, overlap_p)
if res:
id += 1
new_all_recods.all_records.append(res)
done_rec.extend([my_rec, in_rec])
if not my_next:
call_my = True
if not in_next:
call_in = True
break
# 比较记录顺序,靠前的先写入 all_records 内
my_chr_index = self.records.allow_chrom.index(my_rec.CHROM)
in_chr_index = self.records.allow_chrom.index(in_rec.CHROM)
if my_chr_index < in_chr_index:
my_rec.ID = id
id += 1
new_all_recods.all_records.append(my_rec)
call_my = True
done_rec.append(my_rec)
continue
elif in_chr_index < my_chr_index:
in_rec.ID = id
id += 1
new_all_recods.all_records.append(in_rec)
call_in = True
done_rec.append(in_rec)
continue
else:
if int(my_rec.POS) < int(in_rec.POS):
my_rec.ID = id
id += 1
new_all_recods.all_records.append(my_rec)
call_my = True
done_rec.append(my_rec)
continue
elif int(in_rec.POS) < int(my_rec.POS):
in_rec.ID = id
id += 1
new_all_recods.all_records.append(in_rec)
call_in = True
done_rec.append(in_rec)
continue
# 位置相等后续一次性处理
for rec in my_rec_list + in_rec_list:
# 多个位置相等的记录比较完后,添加到结果表内,其他情况的记录留着后续再比较
if rec not in done_rec and same_pos:
rec.ID = id
id += 1
new_all_recods.all_records.append(rec)
same_pos = False
call_my = True
call_in = True
if my_next:
my_rec = my_next
my_next = None
else:
if call_my:
my_rec = next(my_generator, None)
call_my = False
if in_next:
in_rec = in_next
in_next = None
else:
if call_in:
in_rec = next(in_generator, None)
call_in = False
# if couns == 4000 and my_rec.POS == in_rec.POS== '24033178':
# print('next')
# print('my_next', my_next)
# print('in_next', in_next)
# print('my_rec', my_rec)
# print('in_rec', in_rec, end='\n\n')
logging.info(f'合并之后变异数量: {id-1}')
self.records = new_all_recods
def _compare_records(self, my_rec, in_rec, ID, overlap_p=0.8):
""" 比较两个位置是否可以合并,可以合并的话,返回一个新的 VCF_record 类,否则返回 None """
if my_rec.CHROM == in_rec.CHROM and \
my_rec.INFO['SVTYPE'] == in_rec.INFO['SVTYPE'] and \
abs(int(my_rec.POS) - int(in_rec.POS)) <= 5:
#* 断点
if my_rec.INFO['SVTYPE'] == 'BND':
my_alt_chr, my_alt_pos = re.findall(r'[\[\]](\w+)\:(\d+)', my_rec.ALT)[0]
in_alt_chr, in_alt_pos = re.findall(r'[\[\]](\w+)\:(\d+)', in_rec.ALT)[0]
# 另一端点染色体一致,断点位置位置偏差小于 5
if my_alt_chr == in_alt_chr and \
abs(int(my_alt_pos) - int(in_alt_pos)) <= 5 and \
abs(int(my_rec.POS) - int(in_rec.POS)) <= 5:
return self._merge_record(my_rec, in_rec, ID)
#* 常规变异有 SVLEN 或 END
elif my_rec.INFO['SVTYPE'] != 'INS' and 'END' in my_rec.INFO and 'END' in in_rec.INFO:
my_end = int(my_rec.INFO['END'])
in_end = int(in_rec.INFO['END'])
my_pos = int(my_rec.POS)
in_pos = int(in_rec.POS)
my_len = my_end - int(my_rec.POS)
in_len = in_end - int(in_rec.POS)
over_p = self.cal_overlap(my_pos, my_end, in_pos, in_end)
if over_p*100 >= overlap_p:
return self._merge_record(my_rec, in_rec, ID)
elif 'SVLEN' in my_rec.INFO and 'SVLEN' in in_rec.INFO:
my_len = abs(int(my_rec.INFO['SVLEN']))
in_len = abs(int(in_rec.INFO['SVLEN']))
over_p = in_len/my_len if my_len > in_len else my_len/in_len
if over_p*100 >= overlap_p:
return self._merge_record(my_rec, in_rec, ID)
@staticmethod
def cal_overlap(a: int, b: int, x: int, y: int) -> float:
"""
计算 2 个区间重叠百分比,返回 0-1
(a, b) 为一个区间, (x, y) 为一个区间
"""
if a > b:
a, b = b, a
if x > y:
x, y = y, x
ord = [a, b, x, y]
ord.sort()
# ab in xy
if a == ord[1] and b == ord[2]:
return (b-a)/(y-x)
# xy in ab
if x == ord[1] and y == ord[2]:
return (y-x)/(b-a)
# ab overlap xy
if a == ord[1] and y == ord[2]:
return (y-a)/(b-x)
if x == ord[1] and b == ord[2]:
return (b-x)/(y-a)
return 0.0
def _merge_record(self, my_rec, in_rec, ID: str):
""" 整理需要合并的 """
# 合并记录
new_record = Recod()
new_record.CHROM = my_rec.CHROM
new_record.ID = ID
# 偏差越小位置越准确
if my_rec.POS == in_rec.POS:
new_record.POS = my_rec.POS
elif 'CIPOS' in my_rec.INFO:
if 'CIPOS' in in_rec.INFO:
my_CIPOS = sum(abs(int(i)) for i in my_rec.INFO['CIPOS'].split(','))
in_CIPOS = sum(abs(int(i)) for i in in_rec.INFO['CIPOS'].split(','))
if my_CIPOS < in_CIPOS:
new_record.POS = my_rec.POS
else:
new_record.POS = in_rec.POS
else:
new_record.POS = my_rec.POS
elif 'CIPOS' in in_rec.INFO:
new_record.POS = in_rec.POS
else:
logging.warning(f'CIPOS 不存在: \n{my_rec}\n{in_rec}')
new_record.POS = my_rec.POS
# 偏差越小位置越准确
if 'CIEND' in my_rec.INFO:
if 'CIEND' in in_rec.INFO:
my_CIEND = sum(abs(int(i)) for i in my_rec.INFO['CIEND'].split(','))
in_CIEND = sum(abs(int(i)) for i in in_rec.INFO['CIEND'].split(','))
if my_CIEND < in_CIEND:
new_record.ALT = my_rec.ALT
new_record.REF = my_rec.REF
else:
new_record.ALT = in_rec.ALT
new_record.REF = in_rec.REF
else:
new_record.ALT = my_rec.ALT
new_record.REF = my_rec.REF
else:
new_record.ALT = in_rec.ALT
new_record.REF = in_rec.REF
new_record.QUAL = self._merge_QUAL(my_rec.QUAL, in_rec.QUAL)
new_record.FILTER = self._merge_FILTER(my_rec.FILTER, in_rec.FILTER)
new_record.INFO = self._merge_INFO(my_rec.INFO, in_rec.INFO)
new_record.FORMAT = self._merge_FORMAT(my_rec.FORMAT, in_rec.FORMAT)
new_record.SAMPLE_value = self._merge_SAMPLE_value(new_record.FORMAT, my_rec, in_rec)
#! 修正明显的错误
if new_record.INFO['SVTYPE'] != 'INS' and \
'SVLEN' in new_record.INFO and \
'END' in new_record.INFO:
if abs(int(new_record.INFO['SVLEN'])) != int(new_record.INFO['END']) - int(new_record.POS):
logging.warning(f'发现 SVLEN 异常,进行修复: [{new_record.CHROM}:{new_record.POS}]')
new_record.INFO['SVLEN'] = str(int(new_record.INFO['END']) - int(new_record.POS))
# DEL 的长度为负值
if new_record.INFO['SVTYPE'] == 'DEL':
new_record.INFO['SVLEN'] = f"-{new_record.INFO['SVLEN']}"
return new_record
@staticmethod
def _merge_QUAL(my_QUAL: str, in_QUAL: str) -> str:
""" 合并 QUAL 字段 """
if my_QUAL.isdigit():
if in_QUAL.isdigit():
if int(my_QUAL) > int(in_QUAL):
return my_QUAL
return in_QUAL
return my_QUAL
return in_QUAL
@staticmethod
def _merge_FILTER(my_FILTER, in_FILTER):
""" 合并 FILTER 字段 """
if my_FILTER != '.':
if in_FILTER != '.' and my_FILTER != in_FILTER:
return f"{my_FILTER},{in_FILTER}"
return my_FILTER
return in_FILTER
@staticmethod
def _merge_INFO(my_INFO: defaultdict, in_INFO: defaultdict) -> defaultdict:
""" 合并 INFO 字段 """
new_INFO = defaultdict()
# 合并必须 SV 类型一致
new_INFO['SVTYPE'] = my_INFO['SVTYPE']
# 不精确变异标识符
if 'IMPRECISE' in my_INFO:
if 'IMPRECISE' in in_INFO:
new_INFO['IMPRECISE'] = 'IMPRECISE'
# CIPOS
if 'CIPOS' in my_INFO:
if 'CIPOS' in in_INFO:
my_CIPOS = sum(abs(int(i)) for i in my_INFO['CIPOS'].split(','))
in_CIPOS = sum(abs(int(i)) for i in in_INFO['CIPOS'].split(','))
if my_CIPOS < in_CIPOS:
new_INFO['CIPOS'] = my_INFO['CIPOS']
else:
new_INFO['CIPOS'] = in_INFO['CIPOS']
else:
new_INFO['CIPOS'] = my_INFO['CIPOS']
elif 'CIPOS' in in_INFO:
new_INFO['CIPOS'] = in_INFO['CIPOS']
# END, CIEND
first = True
if 'CIEND' in my_INFO:
if 'CIEND' in in_INFO:
my_CIEND = sum(abs(int(i)) for i in my_INFO['CIEND'].split(','))
in_CIEND = sum(abs(int(i)) for i in in_INFO['CIEND'].split(','))
if my_CIEND < in_CIEND:
first = True
else:
first = False
else:
first = True
elif 'CIEND' in in_INFO:
first = False
if first:
if 'CIEND' in my_INFO:
new_INFO['CIEND'] = my_INFO['CIEND']
elif 'CIEND' in in_INFO:
new_INFO['CIEND'] = in_INFO['CIEND']
if 'END' in my_INFO:
new_INFO['END'] = my_INFO['END']
elif 'END' in in_INFO:
new_INFO['END'] = in_INFO['END']
# else:
# pass
# logging.warning(f'没有 END')
if 'SVLEN' in my_INFO:
new_INFO['SVLEN'] = my_INFO['SVLEN']
elif 'SVLEN' in in_INFO:
new_INFO['SVLEN'] = in_INFO['SVLEN']
# else:
# pass
# logging.warning(f'没有 SVLEN')
else:
if 'CIEND' in in_INFO:
new_INFO['CIEND'] = in_INFO['CIEND']
elif 'CIEND' in my_INFO:
new_INFO['CIEND'] = my_INFO['CIEND']
if 'END' in in_INFO:
new_INFO['END'] = in_INFO['END']
elif 'END' in my_INFO:
new_INFO['END'] = my_INFO['END']
# else:
# pass
# logging.warning(f'没有 END')
if 'SVLEN' in in_INFO:
new_INFO['SVLEN'] = in_INFO['SVLEN']
elif 'SVLEN' in my_INFO:
new_INFO['SVLEN'] = my_INFO['SVLEN']
# else:
# pass
# logging.warning(f'没有 SVLEN')
# SVTOOL NUM_SVTOOLS
if 'SVTOOL' in my_INFO:
if 'SVTOOL' in in_INFO:
new_INFO['SVTOOL'] = f"{my_INFO['SVTOOL']},{in_INFO['SVTOOL']}"
# new_INFO['SVTOOL'] = ",".join(list(set(my_INFO['SVTOOL'].split(',') + in_INFO['SVTOOL'].split(','))))
new_INFO['NUM_SVTOOLS'] = str(new_INFO['SVTOOL'].count(',') + 1)
else:
logging.error(f'INFO 内 SVTOOL 字段丢失: {in_INFO}')
else:
logging.error(f'INFO 内 SVTOOL 字段丢失: {my_INFO}')
if 'NUM_SVTOOLS' in my_INFO:
if 'NUM_SVTOOLS' in in_INFO:
new_INFO['NUM_SVTOOLS'] = str(int(my_INFO['NUM_SVTOOLS']) + int(in_INFO['NUM_SVTOOLS']))
else:
logging.error(f'INFO 内 NUM_SVTOOLS 字段丢失: {in_INFO}')
else:
logging.error(f'INFO 内 NUM_SVTOOLS 字段丢失: {in_INFO}')
# other
for key in ['BND_DEPTH', 'MATE_BND_DEPTH']:
if key in my_INFO:
if key in in_INFO:
my_value = int(my_INFO[key])
in_value = int(in_INFO[key])
if my_value < in_value:
new_INFO[key] = my_INFO[key]
else:
new_INFO[key] = in_INFO[key]
else:
new_INFO[key] = my_INFO[key]
elif key in in_INFO:
new_INFO[key] = in_INFO[key]
return new_INFO
@staticmethod
def _merge_FORMAT(my_FORMAT: list, in_FORMAT: list) -> list:
""" 合并 样本信息 字段 """
allow_FORMAT = ['GT', 'PE', 'SR']
new_FORMAT = []
for key in my_FORMAT + in_FORMAT:
if key in allow_FORMAT and key not in new_FORMAT:
new_FORMAT.append(key)
return new_FORMAT
@staticmethod
def _merge_SAMPLE_value(FORMAT, my_rec, in_rec) -> OrderedDict:
""" 合并 样本信息 字段 """
new_SAMPLE_value = OrderedDict()
# PR 当作 PE 的别名
my_format = my_rec.FORMAT
in_format = in_rec.FORMAT
for s, _ in my_rec.SAMPLE_value.items():
new_SAMPLE_value[s] = OrderedDict()
for key in FORMAT:
if key == 'GT':
my_gt = my_rec.SAMPLE_value[s][key]
in_gt = in_rec.SAMPLE_value[s][key]
if my_gt != './.':
new_SAMPLE_value[s][key] = my_gt
elif in_gt != './.':
new_SAMPLE_value[s][key] = in_gt
else:
new_SAMPLE_value[s][key] = './.'
continue
if key in my_format:
my_v = my_rec.SAMPLE_value[s][key]
if key in in_format:
in_v = in_rec.SAMPLE_value[s][key]
if int(my_v) > int(in_v):
new_SAMPLE_value[s][key] = my_v
else:
new_SAMPLE_value[s][key] = in_v
else:
new_SAMPLE_value[s][key] = my_v
elif key in in_format:
in_v = in_rec.SAMPLE_value[s][key]
new_SAMPLE_value[s][key] = in_v
else:
logging.error(f'哪里有问题,两个 record 内都没有的话 {key} 哪来的丫!')
return new_SAMPLE_value
def Iter(self) -> Recod:
""" 记录访问的生成器 """
for rec in self.records.all_records:
yield rec
def write(self, out_file: str='out.vcf'):
if out_file.endswith('vcf.gz'):
o_vcf = gzip.open(out_file, 'wt')
else:
o_vcf = open(out_file, 'w')
o_vcf.write(self.header.__str__() + "\n".join([r.__str__() for r in self.records.all_records]))
@staticmethod
def _sort_check(in_reader: TextIO):
""" 对 vcf 记录按染色体编号进行排序 """
all_records = defaultdict(list)
for line in in_reader:
if line.startswith('#'):
yield line
else:
chrom, pos, *_ = line.split('\t')
chrom = chrom.upper().replace('CHR', '')
# 为了保证顺序符合预期,小数位的染色体前面填充 0
if chrom.isdigit():
key = "{:0>2d}_{:0>9d}".format(int(chrom), int(pos))
else:
key = "{:s}_{:0>9d}".format(chrom, int(pos))
all_records[key].append(line)
all_key = list(all_records.keys())
all_key.sort()
for key in all_key:
for info in all_records[key]:
yield info
if __name__ == '__main__':
in_vcf = VCF('lumpy:lumpy.vcf')
ou_vcf = VCF('manta:manta.vcf.gz')
in_vcf.merge(ou_vcf)
in_vcf.write()
SV 注释: AnnotSV
SV 可视化: samplot
注意事项
- SV 的假阳性结果特别多,需要根据一定的条件进行过滤
- 断点明确的 SV 可信度更高