Source code for pomoxis.coverage_from_bam

import argparse
import logging
import numpy as np
import os
import pandas as pd
import pysam
from pomoxis.util import parse_regions, Region


[docs]def coverage_of_region(region, bam_fp, stride): """Get coverage data for a region""" bins = np.arange(region.start, region.end, stride) msg = 'Processing region {}:{}-{}' logging.info(msg.format(region.ref_name, bins[0], bins[-1])) coverage_by_is_rev = {True: np.zeros(len(bins)), False: np.zeros(len(bins))} for r_obj in pysam.AlignmentFile(bam_fp).fetch(contig=region.ref_name, start=region.start, end=region.end): start_i = max((r_obj.reference_start - bins[0]) // stride, 0) end_i = min((r_obj.reference_end - bins[0]) // stride, len(bins)) coverage_by_is_rev[r_obj.is_reverse][start_i: end_i] += 1 total_depth = coverage_by_is_rev[True] + coverage_by_is_rev[False] df = pd.DataFrame({'pos': bins, 'depth': total_depth, 'depth_fwd': coverage_by_is_rev[False], 'depth_rev': coverage_by_is_rev[True], } ) return df
[docs]def coverage_summary_of_region(*args): df = coverage_of_region(*args) return df.describe()
[docs]def main(): logging.basicConfig(format='[%(asctime)s - %(name)s] %(message)s', datefmt='%H:%M:%S', level=logging.INFO) parser = argparse.ArgumentParser( prog='coverage_from_bam', description='Calculate read coverage depth from a bam.', epilog='By default a file is written per reference sequence, this can be changed with the `--one_file` ' 'option. If overlapping regions are specified, `--one_file` should not be used.', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('bam', help='.fasta/fastq file.') parser.add_argument('-r', '--regions', nargs='+', help='Only process given regions.') grp = parser.add_mutually_exclusive_group() grp.add_argument('-p', '--prefix', help='Prefix for output, defaults to basename of bam.') grp.add_argument('-o', '--one_file', help='Single output file with "region" column.') parser.add_argument('-s', '--stride', type=int, default=1000, help='Stride in genomic coordinate.') parser.add_argument('--summary_only', action='store_true', help='Output only the depth_summary.txt file') args = parser.parse_args() bam = pysam.AlignmentFile(args.bam) ref_lengths = dict(zip(bam.references, bam.lengths)) if args.regions is not None: regions = parse_regions(args.regions, ref_lengths=ref_lengths) else: regions = [Region(ref_name=r, start=0, end=ref_lengths[r]) for r in bam.references] summary = {} first = True if args.one_file is not None: args.one_file = open(args.one_file, 'w') for region in regions: prefix = args.prefix if prefix is None: prefix = os.path.splitext(os.path.basename(args.bam))[0] region_str = '{}_{}_{}'.format(region.ref_name, region.start, region.end) df = coverage_of_region(region, args.bam, args.stride) summary[region_str] = df['depth'].describe() if not args.summary_only: if args.one_file is not None: df['rname'] = region.ref_name df.to_csv(args.one_file, sep='\t', index=False, header=first) else: depth_fp = '{}_{}.depth.txt'.format(prefix, region_str) df.to_csv(depth_fp, sep='\t', index=False) first = False if args.one_file is not None: args.one_file.close() summary_fp = '{}_depth_summary.txt'.format(prefix) summary_df = pd.DataFrame(summary).T.reset_index().rename(columns={'index': 'region'}) summary_df.to_csv(summary_fp, index=False, sep='\t')
if __name__ == '__main__': main()