把clinvar转换为annovar的格式
基于python自提
转自:https://blog.csdn.net/Cassiel60/article/details/89399533
下载最新的数据库:ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/
annovar是注释用的比较多的软件,clinvar的数据库经常更新,要跟的上更新,就必须自己进行格式转换,也可以把自己的数据库放在annovar注释,比如hgmd,网上有很多优秀的python代码可以实现,可以自己写,也可以参照别人的
官方的版本使用perl写的:参考https://pzweuj.github.io/2018/07/27/annovar-clinvar.html上进行操作,也可以。
put上一个已经写好的转换代码,具体是哪位大神的已经忘了,好像有一点点小问题,遇到什么问题自己再调试。
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import sys
import gzip
import pandas as pd
__doc__ = '''
USAGE:
python script <in:clinvar.vcf.gz> <out:clinvar.tab.xls> <out:annovar.annot.db>
clinvar.vcf.gz:
archive_2.0
ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz
clinvar.tab.xls:
full vcf infor
annovar.annot.db:
#Chr Start End Ref Alt CLINSIG CLNDBN CLNACC CLNDSDB CLNDSDBID
1 949523 949523 C T Pathogenic Immunodeficiency_38_with_basal_ganglia_calcification RCV000162196.3 MedGen:OMIM CN221808:616126
1 949608 949608 G A Benign not_specified RCV000455759.1 MedGen CN169374
1 949696 949696 - G Pathogenic Immunodeficiency_38_with_basal_ganglia_calcification RCV000148989.5 MedGen:OMIM CN221808:616126
'''
try:
_, clinvar, fulltab, annovar = sys.argv
except:
print(__doc__)
sys.exit(1)
def parse_variant(line):
CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO = line.split('\t')
POS = int(POS)
dat = {}
dat['CHROM'] = CHROM
dat['ID'] = ID
if len(REF)==len(ALT)==1 and REF!=ALT: # SNV
dat['START'], dat['END'] = POS, POS
dat['REF'], dat['ALT'] = REF, ALT
elif len(REF)==1 and len(ALT)!=1 and ALT.startswith(REF): # Insert
dat['START'], dat['END'] = POS, POS
dat['REF'], dat['ALT'] = '-', ALT[1:]
elif len(REF)!=1 and len(ALT)==1 and REF.startswith(ALT): # Delete
dat['START'], dat['END'] = POS+1, POS+len(REF)-1
dat['REF'], dat['ALT'] = REF[1:], '-'
else:
#print(line)
dat['START'], dat['END'] = POS, POS+len(REF)-1
dat['REF'], dat['ALT'] = REF, ALT
INFO = [i.split('=') for i in INFO.split(';')]
INFO = {i[0]:i[1] for i in INFO}
dat.update(INFO)
dat['CLNSIG'] = dat.get('CLNSIG', 'not provided').replace('/', '|')
dat['CLNACC'] = ''
try:
# example CLNDISDB=MedGen:C3808739,OMIM:615120|MedGen:CN169374
tmp = [i.split(',') for i in dat['CLNDISDB'].split('|')]
db, ids = [], []
for i in tmp:
tmp2 = []
for j in i:
tmp2.append(['.', '.'] if j=='.' else j.split(':'))
db.append(','.join([j[0] for j in tmp2]))
ids.append(','.join([j[1] for j in tmp2]))
dat['CLNDSDB'] = '|'.join(db)
dat['CLNDSDBID'] = '|'.join(ids)
except KeyError:
pass
return pd.Series(dat)
fopen = gzip.open if clinvar.endswith('gz') else open
with fopen(clinvar) as f:
vcf = []
for i in f:
if i.startswith('#') or not i.strip():
continue
vcf.append(parse_variant(i.strip()))
dat = pd.concat(vcf, axis=1, join='outer')
dat = dat.T
#dat = dat.fillna('--')
title = ['CHROM', 'START', 'END', 'REF', 'ALT', 'ID']
header = sorted(list(set(dat.columns)-set(title)))
dat = dat[title+header]
dat.to_csv(fulltab, sep='\t', index=False)
# header = ['CHROM', 'START', 'END', 'REF', 'ALT', 'CLNSIG', 'CLNDN', 'CLNACC', 'CLNDSDB', 'CLNDSDBID']
header = ['CHROM', 'START', 'END', 'REF', 'ALT', 'CLNSIG', 'CLNDN', 'CLNDISDB']
dat = dat[header]
header = ['#Chr', 'Start', 'End', 'Ref', 'Alt', 'CLINSIG', 'CLNDBN', 'CLNDISDB']
dat.columns = header
dat.to_csv(annovar, sep='\t', index=False)
然后就可以放进annovar的数据库中,注释的时候填入对应的版本(比如:clinvar_20180729)就可以了。
如果想更快的话,可以给数据库建立一个索引文件
有一个建立annovar索引的perl程序,Annovar_index.pl也是已经写好的。
#!/usr/bin perl;
use warnings;
use strict;
die "$0 <Annovar Database File> <BIN Size>" unless @ARGV == 2;
my $input_file = $ARGV[0];
my $bin_size = $ARGV[1];
if (!-e $input_file) {
die "$input_file not found\n";
}
my $file_size = -s $input_file;
my %index;
open(my $in, "<", $input_file) or die "Couldn't open $input_file for indexing\n";
my $previous_file_position = tell $in;
while (my $ln = <$in>) {
#my ($chr,$start,$stop) = split "\t", $ln;
my ($chr,$start,$stop) = split "\t", $ln;
$chr =~ s/^chr//i;
my $bin_start = int($start/$bin_size) * $bin_size;
my $current_file_position = tell $in;
if (!exists $index{$chr}->{$bin_start}) {
$index{$chr}->{$bin_start} = [$previous_file_position, $current_file_position];
}
else{
$index{$chr}->{$bin_start}->[1] = $current_file_position;
}
$previous_file_position = $current_file_position;
}
close $in;
my $index_file = $input_file . ".idx";
open(OUT, ">$index_file");
print OUT "#BIN\t$bin_size\t$file_size\n";
foreach my $chr ((1,10..19,2,20,21,22,3..9,"MT","X","Y")){ #Ordered array to match other Annovar idx files
foreach my $index_region (sort keys %{$index{$chr}}){
my $start = $index{$chr}->{$index_region}->[0];
my $stop = $index{$chr}->{$index_region}->[1];
print OUT "$chr\t$index_region\t$start\t$stop\n";
}
}
用法:
perl Annovar_index.pl hg19_clinvar_20180729.txt 1000
通过vt工具快速转换
转自:https://pzweuj.github.io/2018/07/27/annovar-clinvar.html
安装vt
git clone https://github.com/atks/vt.git
cd vt
make
./vt -h
下载clinvar数据库
wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar_20180701.vcf.gz
wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar_20180701.vcf.gz.tbi
转换
vt decompose clinvar_20180701.vcf.gz -o temp.split.vcf
perl prepare_annovar_user.pl -dbtype clinvar_preprocess2 temp.split.vcf -out temp.split2.vcf
vt normalize temp.split2.vcf \
-r ~/database/b37/Homo_sapiens.GRCh37.dna.toplevel.fa \
-o temp.norm.vcf \
-w 2000000
perl prepare_annovar_user.pl -dbtype clinvar2 temp.norm.vcf -out hg19_clinvar_20180701.txt
构建索引
https://github.com/pzweuj/practice/blob/master/perl/makeAnnovarIndex/makeAnnovarIndex.pl