[WDL] 7. Practice: Variant Calling with GATK (Human)
Function
A BWA + GATK best-practices variant-calling pipeline, optimized to split the genome by chromosome so that the BQSR and variant-calling steps run in parallel, shortening the overall analysis.
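The scatter-gather pattern behind that parallelism, reduced to a self-contained skeleton (all names here are hypothetical; the real workflow follows below):

version 1.0
workflow scatter_gather_demo {
    input {
        Array[String] regions
    }
    scatter (region in regions) {                  # fan out: one shard per region
        call per_region { input: region=region }
    }
    call gather { input: shards=per_region.out }   # fan in: collect the shards
    output {
        File merged = gather.merged
    }
}
task per_region {
    input { String region }
    command <<< echo ~{region} > shard.txt >>>
    output { File out = "shard.txt" }
}
task gather {
    input { Array[File] shards }
    command <<< cat ~{sep=" " shards} > merged.txt >>>
    output { File merged = "merged.txt" }
}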
Flowchart
(Figure omitted; the flow is: bwa_align → dedup → gen_bqsr_scatter → scatter base_recalibrator → gather_bqsr_report → scatter apply_bqsr → gather_bam_files → gen_hc_scatter → scatter haplotype_caller → merge_vcfs → genotype_gvcfs.)
input.json
{
"wgs.apply_bqsr.cpu": 32,
"wgs.apply_bqsr.disks": "local-disk 250 cloud_ssd",
"wgs.apply_bqsr.gatk_Launcher": "/usr/local/bin/gatk-4.1.4.1/gatk",
"wgs.apply_bqsr.java_opts": "'-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms3000m -Djava.io.tmpdir=/mnt'",
"wgs.apply_bqsr.memory": "64G",
"wgs.base_recalibrator.cpu": 16,
"wgs.base_recalibrator.disks": "local-disk 250 cloud_ssd",
"wgs.base_recalibrator.gatk_Launcher": "/usr/local/bin/gatk-4.1.4.1/gatk",
"wgs.base_recalibrator.java_opts": "'-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms3000m -Djava.io.tmpdir=/mnt'",
"wgs.base_recalibrator.memory": "32G",
"wgs.bwa_align.bwa_opts": "-K 10000000 -M -Y",
"wgs.bwa_align.bwa_release": "/usr/local/bin/bwakit-0.7.15/",
"wgs.bwa_align.cpu": 32,
"wgs.bwa_align.disks": "local-disk 250 cloud_ssd",
"wgs.bwa_align.memory": "64G",
"wgs.bwa_align.samtools_release": "/usr/local/bin/samtools-1.9/",
"wgs.dedup.cpu": 16,
"wgs.dedup.create_index": true,
"wgs.dedup.disks": "local-disk 250 cloud_ssd",
"wgs.dedup.gatk_Launcher": "/usr/local/bin/gatk-4.1.4.1/gatk",
"wgs.dedup.java_opts": "'-Xms4000m -Djava.io.tmpdir=/mnt -Dsamjdk.compression_level=2'",
"wgs.dedup.memory": "32G",
"wgs.dedup.remove_duplicates": true,
"wgs.docker_img": "call_variants:1.0",
"wgs.fastq1": "/path/to/reads_1.fastq",
"wgs.fastq2": "/path/to/reads_2.fastq",
"wgs.gather_bam_files.cpu": 16,
"wgs.gather_bam_files.create_index": true,
"wgs.gather_bam_files.disks": "local-disk 250 cloud_ssd",
"wgs.gather_bam_files.gatk_Launcher": "/usr/local/bin/gatk-4.1.4.1/gatk",
"wgs.gather_bam_files.java_opts": "'-Xms3000m -Djava.io.tmpdir=/mnt -Dsamjdk.compression_level=2'",
"wgs.gather_bam_files.memory": "32G",
"wgs.gather_bqsr_report.cpu": 16,
"wgs.gather_bqsr_report.disks": "local-disk 250 cloud_ssd",
"wgs.gather_bqsr_report.gatk_Launcher": "/usr/local/bin/gatk-4.1.4.1/gatk",
"wgs.gather_bqsr_report.java_opts": "'-Xms3000m -Djava.io.tmpdir=/mnt'",
"wgs.gather_bqsr_report.memory": "32G",
"wgs.genome_dict": "/path/to/Homo_sapiens_assembly38.dict",
"wgs.genome_indexes": [
"/path/to/Homo_sapiens_assembly38.fasta",
"/path/to/Homo_sapiens_assembly38.fasta.64.alt",
"/path/to//Homo_sapiens_assembly38.fasta.64.amb",
"/path/to/Homo_sapiens_assembly38.fasta.64.ann",
"/path/to/Homo_sapiens_assembly38.fasta.64.bwt",
"/path/to/Homo_sapiens_assembly38.fasta.64.pac",
"/path/to/Homo_sapiens_assembly38.fasta.64.sa",
"/path/to/Homo_sapiens_assembly38.dict",
"/path/to/Homo_sapiens_assembly38.fasta.fai"
],
"wgs.genotype_gvcfs.cpu": 16,
"wgs.genotype_gvcfs.disks": "local-disk 250 cloud_ssd",
"wgs.genotype_gvcfs.gatk_Launcher": "/usr/local/bin/gatk-4.1.4.1/gatk",
"wgs.genotype_gvcfs.java_opts": "'-Xms4000m -Djava.io.tmpdir=/mnt'",
"wgs.genotype_gvcfs.memory": "32G",
"wgs.haplotype_caller.cpu": 32,
"wgs.haplotype_caller.disks": "local-disk 250 cloud_ssd",
"wgs.haplotype_caller.gatk_Launcher": "/usr/local/bin/gatk-4.1.4.1/gatk",
"wgs.haplotype_caller.java_opts": "'-Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Djava.io.tmpdir=/mnt'",
"wgs.haplotype_caller.memory": "64G",
"wgs.known_1000GIndels": [
"/path/to/Homo_sapiens_assembly38.known_indels.vcf.gz",
"/path/to/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi"
],
"wgs.known_Mills": [
"/path/to/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz",
"/path/to/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi"
],
"wgs.known_dbsnp": [
"/path/to/Homo_sapiens_assembly38.dbsnp138.vcf",
"/path/to/Homo_sapiens_assembly38.dbsnp138.vcf.idx"
],
"wgs.merge_vcfs.cpu": 16,
"wgs.merge_vcfs.disks": "local-disk 250 cloud_ssd",
"wgs.merge_vcfs.gatk_Launcher": "/usr/local/bin/gatk-4.1.4.1/gatk",
"wgs.merge_vcfs.java_opts": "'-Xms2000m -Djava.io.tmpdir=/mnt'",
"wgs.merge_vcfs.memory": "32G",
"wgs.reads_group": "'@RG\\tID:GROUP_NAME\\tSM:SAMPLE_NAME\\tPL:PLATFORM'",
"wgs.sample_id": "test",
"wgs.wgs_calling_regions": "/path/to/wgs_calling_regions.hg38.interval_list"
}
The interval_list drives the parallelization and must match the reference genome build.
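For reference, an interval_list is a Picard-format text file: a SAM-style header followed by one interval per line with five tab-separated columns (contig, 1-based start, stop, strand, name). A minimal sketch with made-up coordinates:

@HD	VN:1.6
@SQ	SN:chr1	LN:248956422
chr1	10001	207666	+	interval-1
chr1	257667	297968	+	interval-2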
WDL
version 1.0
workflow wgs {
input {
String sample_id
String reads_group
File fastq1
File fastq2
Array[File] genome_indexes
File genome_dict
File wgs_calling_regions
Array[File] known_dbsnp
Array[File] known_Mills
Array[File] known_1000GIndels
String docker_img = "call_variants:1.0"
}
call bwa_align {
input:
sample_id=sample_id,
reads_group=reads_group,
fastq1=fastq1,
fastq2=fastq2,
genome_indexes=genome_indexes,
docker_img=docker_img
}
call dedup {
input:
sample_id=sample_id,
sorted_bam = bwa_align.sorted_bam_output,
docker_img = docker_img
}
call gen_bqsr_scatter {
input:
genome_dict=genome_dict
}
scatter (sequence_group_intervals in gen_bqsr_scatter.sequence_grouping) {
call base_recalibrator {
input:
grouping=sequence_group_intervals,
sample_id=sample_id,
dedup_bam=dedup.dedup_bam_output,
genome_indexes=genome_indexes,
known_dbsnp=known_dbsnp,
known_1000GIndels=known_1000GIndels,
known_Mills=known_Mills,
docker_img=docker_img
}
}
call gather_bqsr_report {
input:
recalibration_reports=base_recalibrator.recalibration_report_output,
sample_id=sample_id,
docker_img=docker_img
}
scatter (subgroup in gen_bqsr_scatter.sequence_grouping_with_unmapped) {
call apply_bqsr {
input:
sample_id=sample_id,
calling_region=subgroup,
dedup_bam=dedup.dedup_bam_output,
genome_indexes=genome_indexes,
bqsr_grp=gather_bqsr_report.bqsr_grp_output,
docker_img=docker_img
}
}
call gather_bam_files {
input:
sample_id=sample_id,
separate_recalibrated_bams = apply_bqsr.separate_recalibrated_bam,
docker_img=docker_img
}
call gen_hc_scatter {
input:
wgs_calling_regions=wgs_calling_regions
}
scatter (intervals in gen_hc_scatter.scatter_interval_list) {
call haplotype_caller {
input:
sample_id=sample_id,
input_bam=gather_bam_files.recalibrated_bam_output,
calling_region=intervals,
genome_indexes=genome_indexes,
known_dbsnp=known_dbsnp,
docker_img=docker_img
}
}
call merge_vcfs {
input:
sample_id=sample_id,
docker_img=docker_img,
vcfs=haplotype_caller.hc_block_gvcf
}
call genotype_gvcfs {
input:
sample_id=sample_id,
genome_indexes=genome_indexes,
gvcf=merge_vcfs.gvcf_output,
known_dbsnp=known_dbsnp,
docker_img=docker_img
}
output {
Pair[File,File] output_bam = gather_bam_files.recalibrated_bam_output
Pair[File,File] output_gvcf = merge_vcfs.gvcf_output
File output_vcf = genotype_gvcfs.vcf_output
}
}
task bwa_align {
input {
# reads pair
File fastq1
File fastq2
# pre-built genome index files
Array[File] genome_indexes
# experiments info
String reads_group
String sample_id
# bwa parameters
String bwa_release = "/usr/local/bin/bwakit-0.7.15/"
String samtools_release = "/usr/local/bin/samtools-1.9/"
String bwa_opts = "-K 10000000 -M -Y"
# Resource
Int cpu = 32
String memory = "64G"
String disks = "local-disk 250 cloud_ssd"
# docker image
String docker_img
}
String sorted_bam = sample_id + ".sorted.bam"
String sorted_bam_index = sorted_bam + ".bai"
command <<<
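# use every core the host exposes for both bwa mem and samtools sort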
cpu_cores=$(nproc)
~{bwa_release}/bwa mem \
~{bwa_opts} \
-t $cpu_cores \
-R ~{reads_group} \
~{genome_indexes[0]} \
~{fastq1} ~{fastq2} \
| ~{samtools_release}/samtools sort -@ $cpu_cores -o ~{sorted_bam}
~{samtools_release}/samtools index ~{sorted_bam}
>>>
runtime {
cpu: cpu
memory: memory
disks: disks
docker: docker_img
}
output {
Pair[File, File] sorted_bam_output = (sorted_bam, sorted_bam_index)
}
}
task dedup {
input {
String sample_id
Pair[File, File] sorted_bam
String gatk_Launcher = "/usr/local/bin/gatk-4.1.4.1/gatk"
String java_opts = "'-Xms4000m -Djava.io.tmpdir=/mnt -Dsamjdk.compression_level=2'"
Boolean remove_duplicates = true
Boolean create_index = true
# Resource
Int cpu = 16
String memory = "32G"
String disks = "local-disk 250 cloud_ssd"
# docker
String docker_img
}
String dedup_bam = sample_id + ".deduplicated.bam"
String dedup_bam_index = sample_id + ".deduplicated.bai"
String dedup_metrics = sample_id + ".deduplicated.metrics"
command <<<
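# Note: a ~{true="..." false="..." flag} placeholder expands to the first string
# when the Boolean input is true and to the second when it is false.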
~{gatk_Launcher} --java-options ~{java_opts} \
MarkDuplicates \
-I ~{sorted_bam.left} \
-O ~{dedup_bam} \
-M ~{dedup_metrics} \
~{true="--REMOVE_DUPLICATES true" false="" remove_duplicates} \
~{true="--CREATE_INDEX true" false="" create_index}
>>>
runtime {
cpu: cpu
memory: memory
disks: disks
docker: docker_img
}
output {
Pair[File,File] dedup_bam_output = (dedup_bam, dedup_bam_index)
File dedup_metrics_output = dedup_metrics
}
}
# Split the whole genome for BQSR scatter-gather
task gen_bqsr_scatter {
input {
File genome_dict
}
## Embedding non-shell scripts in the command block is not recommended; a cleaner alternative is sketched after this task.
command <<<
python - ~{genome_dict} << EOF
import sys

with open(sys.argv[1], "r") as ref_dict_file:
    sequence_tuple_list = []
    longest_sequence = 0
    for line in ref_dict_file:
        if line.startswith("@SQ"):
            line_split = line.split("\t")
            # (Sequence_Name, Sequence_Length)
            sequence_tuple_list.append((line_split[1].split("SN:")[1], int(line_split[2].split("LN:")[1])))
    longest_sequence = sorted(sequence_tuple_list, key=lambda x: x[1], reverse=True)[0][1]

# We are adding this to the intervals because hg38 has contigs named with embedded colons
# and a bug in GATK strips off the last element after a ":", so we add this as a sacrificial element.
hg38_protection_tag = ":1+"

# Initialize the tsv string with the first sequence, then greedily pack subsequent
# sequences into the current group until its total length would exceed the longest contig.
tsv_string = "-L " + sequence_tuple_list[0][0] + hg38_protection_tag
temp_size = sequence_tuple_list[0][1]
for sequence_tuple in sequence_tuple_list[1:]:
    if temp_size + sequence_tuple[1] <= longest_sequence:
        temp_size += sequence_tuple[1]
        tsv_string += " " + "-L " + sequence_tuple[0] + hg38_protection_tag
    else:
        tsv_string += "\n" + "-L " + sequence_tuple[0] + hg38_protection_tag
        temp_size = sequence_tuple[1]

with open("sequence_grouping.txt", "w") as tsv_file:
    tsv_file.write(tsv_string)
    tsv_file.write("\n")

# Add the unmapped sequences as a separate line to ensure that they are recalibrated as well.
tsv_string += "\n" + "-L unmapped\n"
with open("sequence_grouping_with_unmapped.txt", "w") as tsv_file_with_unmapped:
    tsv_file_with_unmapped.write(tsv_string)
EOF
>>>
output {
Array[String] sequence_grouping = read_lines("sequence_grouping.txt")
Array[String] sequence_grouping_with_unmapped = read_lines("sequence_grouping_with_unmapped.txt")
}
}
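Following that advice, a minimal sketch of the same task with the Python saved as a standalone script and passed in as a File input (the script name create_sequence_grouping.py is hypothetical; it must write the same two output files):

task gen_bqsr_scatter {
    input {
        File genome_dict
        File grouping_script   # hypothetical: the Python above, saved as create_sequence_grouping.py
    }
    command <<<
        # the command block stays pure shell; the logic lives in a versioned script file
        python ~{grouping_script} ~{genome_dict}
    >>>
    output {
        Array[String] sequence_grouping = read_lines("sequence_grouping.txt")
        Array[String] sequence_grouping_with_unmapped = read_lines("sequence_grouping_with_unmapped.txt")
    }
}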
task base_recalibrator {
input {
String grouping
Pair[File,File] dedup_bam
Array[File] genome_indexes
Array[File] known_1000GIndels
Array[File] known_Mills
Array[File] known_dbsnp
String sample_id
String gatk_Launcher = "/usr/local/bin/gatk-4.1.4.1/gatk"
String java_opts = "'-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms3000m -Djava.io.tmpdir=/mnt'"
# Resource
Int cpu = 16
String memory = "32G"
String disks = "local-disk 250 cloud_ssd"
# docker
String docker_img
}
String recalibration_report = sample_id + "_bqsr.table"
command <<<
~{gatk_Launcher} --java-options ~{java_opts} \
BaseRecalibrator \
-I ~{dedup_bam.left} \
-R ~{genome_indexes[0]} \
~{"--known-sites " + known_1000GIndels[0]} \
~{"--known-sites " + known_Mills[0]} \
~{"--known-sites " + known_dbsnp[0]} \
~{grouping} \
-O ~{recalibration_report}
>>>
runtime {
cpu: cpu
memory: memory
disks: disks
docker: docker_img
}
output {
File recalibration_report_output = recalibration_report
}
}
task gather_bqsr_report {
input {
Array[File] recalibration_reports
String sample_id
String gatk_Launcher = "/usr/local/bin/gatk-4.1.4.1/gatk"
String java_opts = "'-Xms3000m -Djava.io.tmpdir=/mnt'"
# Resource
Int cpu = 16
String memory = "32G"
String disks = "local-disk 250 cloud_ssd"
# docker
String docker_img
}
String bqsr_grp = sample_id + "-bqsr.grp"
command <<<
~{gatk_Launcher} --java-options ~{java_opts} \
GatherBQSRReports \
-I ~{sep=" -I " recalibration_reports} \
-O ~{bqsr_grp}
>>>
runtime {
cpu: cpu
memory: memory
disks: disks
docker: docker_img
}
output {
File bqsr_grp_output = bqsr_grp
}
}
task apply_bqsr {
input {
Pair[File,File] dedup_bam
Array[File] genome_indexes
String calling_region
File bqsr_grp
String sample_id
String gatk_Launcher = "/usr/local/bin/gatk-4.1.4.1/gatk"
String java_opts = "'-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms3000m -Djava.io.tmpdir=/mnt'"
# Resource
Int cpu = 32
String memory = "64G"
String disks = "local-disk 250 cloud_ssd"
# docker
String docker_img
}
String tmp_recalibrated_bam = sample_id + "_tmp_recaled.bam"
command <<<
~{gatk_Launcher} --java-options ~{java_opts} \
ApplyBQSR \
-R ~{genome_indexes[0]} \
-I ~{dedup_bam.left} \
--bqsr-recal-file ~{bqsr_grp} \
~{calling_region} \
-O ~{tmp_recalibrated_bam}
>>>
runtime {
cpu: cpu
memory: memory
disks: disks
docker: docker_img
}
output {
File separate_recalibrated_bam = tmp_recalibrated_bam
}
}
task gather_bam_files {
input {
Array[File] separate_recalibrated_bams
String sample_id
String gatk_Launcher="/usr/local/bin/gatk-4.1.4.1/gatk"
String java_opts = "'-Xms3000m -Djava.io.tmpdir=/mnt -Dsamjdk.compression_level=2'"
Boolean create_index = true
# Resource
Int cpu = 16
String memory = "32G"
String disks = "local-disk 250 cloud_ssd"
# docker
String docker_img
}
String recalibrated_bam = sample_id + ".recalibrated.bam"
String recalibrated_bam_index = sample_id + ".recalibrated.bai"
command <<<
~{gatk_Launcher} --java-options ~{java_opts} \
GatherBamFiles \
-I ~{sep=" -I " separate_recalibrated_bams} \
~{true="--CREATE_INDEX true " false = "" create_index} \
-O ~{recalibrated_bam}
>>>
runtime {
cpu: cpu
memory: memory
disks: disks
docker: docker_img
}
output {
Pair[File,File] recalibrated_bam_output = (recalibrated_bam, recalibrated_bam_index)
}
}
# Break the calling interval_list into sub-intervals
# Perform variant calling on the sub-intervals, and then gather the results
task gen_hc_scatter {
input {
File wgs_calling_regions
}
command <<<
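# Greedily pack the calling intervals into chunks of roughly 324,860,607 bp each,
# emitting one "-L contig:start-end ..." argument string per chunk (one chunk per output line).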
awk 'BEGIN {
prev_total = 0
frag = 1
container = ""
}
{ if ( $1 !~ /^@/ )
{
len = ($3 - $2 + 1)
if ( prev_total + len < 324860607 ) {
prev_total += len
container = container sprintf("-L %s:%d-%d ", $1, $2, $3)
}
else {
a1 = prev_total + len - 324860607
a2 = 324860607 - prev_total
if ( a1 > a2 ) { print container; container = sprintf("-L %s:%d-%d ", $1, $2, $3); prev_total = len}
else { container = container sprintf("-L %s:%d-%d ", $1, $2, $3); print container; container = ""; prev_total = 0}
frag += 1
}
}
}
END {
if ( container ) { print container }
}' ~{wgs_calling_regions} > "ScatterIntervalList.txt"
>>>
output {
Array[String] scatter_interval_list = read_lines("ScatterIntervalList.txt")
}
}
# Call variants in parallel over WGS calling intervals
task haplotype_caller {
input {
String sample_id
Pair[File,File] input_bam
String calling_region
Array[File] genome_indexes
Array[File] known_dbsnp
String gatk_Launcher = "/usr/local/bin/gatk-4.1.4.1/gatk"
String java_opts = "'-Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Djava.io.tmpdir=/mnt'"
# Resource
Int cpu = 32
String memory = "64G"
String disks = "local-disk 250 cloud_ssd"
# docker
String docker_img
}
String hc_gvcf = sample_id + "_hc.g.vcf.gz"
command <<<
~{gatk_Launcher} --java-options ~{java_opts} \
HaplotypeCaller \
-R ~{genome_indexes[0]} \
-I ~{input_bam.left} \
~{calling_region} \
--dbsnp ~{known_dbsnp[0]} \
--ERC GVCF \
-O ~{hc_gvcf}
>>>
runtime {
cpu: cpu
memory: memory
disks: disks
docker: docker_img
}
output {
File hc_block_gvcf = hc_gvcf
}
}
task merge_vcfs {
input {
String sample_id
Array[File] vcfs
String gatk_Launcher = "/usr/local/bin/gatk-4.1.4.1/gatk"
String java_opts = "'-Xms2000m -Djava.io.tmpdir=/mnt'"
# Resource
Int cpu = 16
String memory = "32G"
String disks = "local-disk 250 cloud_ssd"
# docker
String docker_img
}
# MergeVcfs indexes block-compressed output by default, so the .g.vcf.gz.tbi below is produced automatically
String gvcf = sample_id + ".g.vcf.gz"
String gvcf_idx = sample_id + ".g.vcf.gz.tbi"
command <<<
~{gatk_Launcher} --java-options ~{java_opts} \
MergeVcfs \
-I ~{sep=" -I " vcfs} \
-O ~{gvcf}
>>>
runtime {
cpu: cpu
memory: memory
disks: disks
docker: docker_img
}
output {
Pair[File, File] gvcf_output = (gvcf, gvcf_idx)
}
}
task genotype_gvcfs {
input {
String sample_id
Pair[File, File] gvcf
Array[File] genome_indexes
Array[File] known_dbsnp
String gatk_Launcher = "/usr/local/bin/gatk-4.1.4.1/gatk"
String java_opts = "'-Xms4000m -Djava.io.tmpdir=/mnt'"
# Resource
Int cpu = 16
String memory = "32G"
String disks = "local-disk 250 cloud_ssd"
# docker
String docker_img
}
String vcf = sample_id + ".vcf.gz"
command <<<
~{gatk_Launcher} --java-options ~{java_opts} \
GenotypeGVCFs \
-R ~{genome_indexes[0]} \
--dbsnp ~{known_dbsnp[0]} \
-V ~{gvcf.left} \
-O ~{vcf}
>>>
runtime {
cpu: cpu
memory: memory
disks: disks
docker: docker_img
}
output {
File vcf_output = vcf
}
}
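To run the workflow, save the code above as wgs.wdl alongside input.json and submit it to a WDL engine. A minimal sketch with Cromwell (assuming cromwell.jar and womtool.jar are available locally):

java -jar womtool.jar validate wgs.wdl                  # optional: syntax check first
java -jar cromwell.jar run wgs.wdl --inputs input.json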
Referenced from Alibaba Cloud (Aliyun) documentation.
This article is from cnblogs (author: 生物信息与育种); when reposting, please cite the original link: https://www.cnblogs.com/miyuanbiotech/p/16270691.html.