Variant Phasing

This page walks through the steps to use megalodon in conjunction with whatshap to produce the highest quality phased variant calls.

This pipeline produces the variants.haploid_merged.vcf file containing high quality phased variant calls. The intermediate variant_mappings.haplotagged.bam file can be of particular interest to investigate variant calls at the per-read level. This file contains the reference sequence for each read annotated only with the proposed variant calls, including quality scores for SNVs. Thus random read errors are masked allowing for more accurate analysis on proposed variants. See an example of this per-read variant genome browser visualization below.


_images/whatshap_haplotagged_variant_viz.png

Genome browser visualization. Megalodon variant_mappings haplotagged with whatshap (upper panel) and raw read mappings (lower panel).


Workflow

reads_dir="fast5s"
ref="reference.fasta"
variants_vcf="variants.vcf.gz"
out_dir="megalodon_results"
nproc=16
gpu_devices="0 1"

# run megalodon to produce variant_mappings
megalodon \
    $reads_dir --outputs mappings variants variant_mappings \
    --reference $ref --variant-filename $variants_vcf \
    --output-directory $out_dir \
    --processes $nproc --devices $gpu_devices \
    --verbose-read-progress 3

# filter whatshap incompatible variants and create indices
megalodon_extras \
    phase_variants whatshap_filter \
    $out_dir/variants.sorted.vcf \
    $out_dir/variants.sorted.whatshap_filt.vcf \
    --filtered-records $out_dir/whatshap_filt.txt
bgzip $out_dir/variants.sorted.whatshap_filt.vcf
tabix $out_dir/variants.sorted.whatshap_filt.vcf.gz
samtools index $out_dir/variant_mappings.sorted.bam

# run whatshap with produced mappings and variants
whatshap \
    phase --distrust-genotypes \
    -o $out_dir/variants.phased.vcf \
    $out_dir/variants.sorted.whatshap_filt.vcf.gz \
    $out_dir/variant_mappings.sorted.bam

# assign haplotypes to reads
bgzip $out_dir/variants.phased.vcf
tabix $out_dir/variants.phased.vcf.gz
whatshap \
    haplotag $out_dir/variants.phased.vcf.gz \
    $out_dir/variant_mappings.sorted.bam \
    -o $out_dir/variant_mappings.haplotagged.bam

# extract haplotype reads and call haploid variants
megalodon_extras \
    phase_variants extract_haplotype_reads \
    $out_dir/variant_mappings.haplotagged.bam \
    $out_dir/variant_mappings
megalodon_extras \
    aggregate run \
    --megalodon-directory $out_dir --output-suffix haplotype_1  \
    --read-ids-filename $out_dir/variant_mappings.haplotype_1_read_ids.txt \
    --outputs variants --haploid --processes $nproc
megalodon_extras \
    aggregate run \
    --megalodon-directory $out_dir --output-suffix haplotype_2  \
    --read-ids-filename $out_dir/variant_mappings.haplotype_2_read_ids.txt \
    --outputs variants --haploid --processes $nproc

# merge haploid variants to produce diploid variants
megalodon_extras \
    phase_variants merge_haploid_variants \
    $out_dir/variants.sorted.vcf.gz \
    $out_dir/variants.haplotype_1.sorted.vcf.gz \
    $out_dir/variants.haplotype_2.sorted.vcf.gz \
    --out-vcf $out_dir/variants.haploid_merged.vcf