Variant Phasing¶
This page walks through the steps to use megalodon in conjunction with whatshap to produce the highest quality phased variant calls.
This pipeline produces the variants.haploid_merged.vcf
file containing high quality phased variant calls.
The intermediate variant_mappings.haplotagged.bam
file can be of particular interest to investigate variant calls at the per-read level.
This file contains the reference sequence for each read annotated only with the proposed variant calls, including quality scores for SNVs.
Thus random read errors are masked allowing for more accurate analysis on proposed variants.
See an example of this per-read variant genome browser visualization below.
Workflow¶
reads_dir="fast5s"
ref="reference.fasta"
variants_vcf="variants.vcf.gz"
out_dir="megalodon_results"
nproc=16
gpu_devices="0 1"
# run megalodon to produce variant_mappings
megalodon \
$reads_dir --outputs mappings variants variant_mappings \
--reference $ref --variant-filename $variants_vcf \
--output-directory $out_dir \
--processes $nproc --devices $gpu_devices \
--verbose-read-progress 3
# filter whatshap incompatible variants and create indices
megalodon_extras \
phase_variants whatshap_filter \
$out_dir/variants.sorted.vcf \
$out_dir/variants.sorted.whatshap_filt.vcf \
--filtered-records $out_dir/whatshap_filt.txt
bgzip $out_dir/variants.sorted.whatshap_filt.vcf
tabix $out_dir/variants.sorted.whatshap_filt.vcf.gz
samtools index $out_dir/variant_mappings.sorted.bam
# run whatshap with produced mappings and variants
whatshap \
phase --distrust-genotypes \
-o $out_dir/variants.phased.vcf \
$out_dir/variants.sorted.whatshap_filt.vcf.gz \
$out_dir/variant_mappings.sorted.bam
# assign haplotypes to reads
bgzip $out_dir/variants.phased.vcf
tabix $out_dir/variants.phased.vcf.gz
whatshap \
haplotag $out_dir/variants.phased.vcf.gz \
$out_dir/variant_mappings.sorted.bam \
-o $out_dir/variant_mappings.haplotagged.bam
# extract haplotype reads and call haploid variants
megalodon_extras \
phase_variants extract_haplotype_reads \
$out_dir/variant_mappings.haplotagged.bam \
$out_dir/variant_mappings
megalodon_extras \
aggregate run \
--megalodon-directory $out_dir --output-suffix haplotype_1 \
--read-ids-filename $out_dir/variant_mappings.haplotype_1_read_ids.txt \
--outputs variants --haploid --processes $nproc
megalodon_extras \
aggregate run \
--megalodon-directory $out_dir --output-suffix haplotype_2 \
--read-ids-filename $out_dir/variant_mappings.haplotype_2_read_ids.txt \
--outputs variants --haploid --processes $nproc
# merge haploid variants to produce diploid variants
megalodon_extras \
phase_variants merge_haploid_variants \
$out_dir/variants.sorted.vcf.gz \
$out_dir/variants.haplotype_1.sorted.vcf.gz \
$out_dir/variants.haplotype_2.sorted.vcf.gz \
--out-vcf $out_dir/variants.haploid_merged.vcf