Source code for pomoxis.stats_from_bam

"""
Tabulate some simple alignment stats from sam.
"""

import argparse
from collections import Counter
from concurrent.futures import ProcessPoolExecutor
import functools
import itertools
from math import ceil
import sys

import pysam

from pomoxis.util import intervaltrees_from_bed

parser = argparse.ArgumentParser(
        prog='stats_from_bam',
        description="""Parse a bamfile (from a stream) and output summary stats for each read.""",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

parser.add_argument('bam', type=str, help='Path to bam file.')
parser.add_argument('--bed', default=None, help='.bed file of reference regions to include.')
parser.add_argument('-m', '--min_length', type=int, default=None)
parser.add_argument('-a', '--all_alignments', action='store_true',
                    help='Include secondary and supplementary alignments.')
parser.add_argument('-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
                    help='Output alignment stats to file instead of stdout.')
parser.add_argument('-s', '--summary', type=argparse.FileType('w'), default=sys.stderr,
                    help='Output summary to file instead of stderr.')
parser.add_argument('-t', '--threads', type=int, default=1,
                    help='Number of threads for parallel processing.')


[docs]def stats_from_aligned_read(read, references, lengths): """Create summary information for an aligned read :param read: :class:`pysam.AlignedSegment` object """ try: NM = read.get_tag('NM') except: raise IOError("Read is missing required 'NM' tag. Try running 'samtools fillmd -S - ref.fa'.") name = read.qname start_offset = 0 if read.is_secondary or read.is_supplementary: first_cig = read.cigartuples[0] if first_cig[0] == 5: # should always be true for minimap2 start_offset = first_cig[1] counts, _ = read.get_cigar_stats() match = counts[0] # alignment match (can be a sequence match or mismatch) ins = counts[1] delt = counts[2] # NM is edit distance: NM = INS + DEL + SUB sub = NM - ins - delt length = match + ins + delt iden = 100 * float(match - sub) / match acc = 100 - 100 * float(NM) / length read_length = read.infer_read_length() coverage = 100 * float(read.query_alignment_length) / read_length direction = '-' if read.is_reverse else '+' results = { "name": name, "coverage": coverage, "direction": direction, "length": length, "read_length": read_length, "match": match, "ins": ins, "del": delt, "sub": sub, "iden": iden, "acc": acc, "qstart": read.query_alignment_start + start_offset, "qend": read.query_alignment_end + start_offset, "rstart": read.reference_start, "rend": read.reference_end, "ref": references[read.reference_id], "aligned_ref_len": read.reference_length, "ref_coverage": 100*float(read.reference_length) / lengths[read.reference_id], } return results
[docs]def masked_stats_from_aligned_read(read, references, lengths, tree): """Create summary information for an aligned read over regions in bed file. :param read: :class:`pysam.AlignedSegment` object """ try: MD = read.get_tag('MD') except: raise IOError("Read is missing required 'MD' tag. Try running 'samtools callmd - ref.fa'.") correct, delt, ins, sub, aligned_ref_len, masked = 0, 0, 0, 0, 0, 0 pairs = read.get_aligned_pairs(with_seq=True) qseq = read.query_sequence pos_is_none = lambda x: (x[1] is None or x[0] is None) pos = None insertions = [] # TODO: refactor to use get_trimmed_pairs (as in catalogue_errors)? for qp, rp, rb in itertools.dropwhile(pos_is_none, pairs): if rp == read.reference_end or (qp == read.query_alignment_end): break pos = rp if rp is not None else pos if not tree.overlaps(pos) or (rp is None and not tree.overlaps(pos + 1)): # if rp is None, we are in an insertion, check if pos + 1 overlaps # (ref position of ins is arbitrary) # print('Skipping ref {}:{}'.format(read.reference_name, pos)) masked += 1 continue else: if rp is not None: # we don't have an insertion aligned_ref_len += 1 if qp is None: # deletion delt += 1 elif qseq[qp] == rb: # correct correct += 1 elif qseq[qp] != rb: # sub sub += 1 else: # we have an insertion ins += 1 name = read.qname match = correct + sub length = match + ins + delt if match == 0: # no matches within bed regions - all bed ref positions were deleted. # skip this alignment. return None iden = 100 * float(match - sub) / match acc = 100 - 100 * float(sub + ins + delt) / length read_length = read.infer_read_length() masked_query_alignment_length = correct + sub + ins coverage = 100*float(masked_query_alignment_length) / read_length direction = '-' if read.is_reverse else '+' results = { "name": name, "coverage": coverage, "direction": direction, "length": length, "read_length": read_length, "match": match, "ins": ins, "del": delt, "sub": sub, "iden": iden, "acc": acc, "qstart": read.query_alignment_start, "qend": read.query_alignment_end, "rstart": read.reference_start, "rend": read.reference_end, "ref": references[read.reference_id], "aligned_ref_len": aligned_ref_len, "ref_coverage": 100 * float(aligned_ref_len) / lengths[read.reference_id], "masked": masked, } return results
def _process_reads(bam_fp, start_stop, all_alignments=False, min_length=None, bed_file=None): start, stop = start_stop counts = Counter() results = [] with pysam.AlignmentFile(bam_fp, 'rb') as bam: if bed_file is not None: trees = intervaltrees_from_bed(bed_file) for i, read in enumerate(bam.fetch(until_eof=True)): if i < start: continue elif i == stop: break if read.is_unmapped: counts['unmapped'] += 1 continue if not all_alignments and (read.is_secondary or read.is_supplementary): continue if bed_file is not None: if not trees[read.reference_name].overlaps(read.reference_start, read.reference_end): sys.stderr.write('read {} does not overlap with any regions in bedfile\n'.format(read.query_name)) counts['masked'] += 1 continue else: result = masked_stats_from_aligned_read(read, bam.references, bam.lengths, trees[read.reference_name]) if result is None: # no matches within bed regions counts['all_matches_masked'] += 1 continue else: result = stats_from_aligned_read(read, bam.references, bam.lengths) if min_length is not None and result['length'] < min_length: counts['short'] += 1 continue results.append(result) return counts, results
[docs]def main(arguments=None): args = parser.parse_args(arguments) if args.threads > 1 and args.bed is not None: pool = ProcessPoolExecutor(args.threads) mapper = pool.map else: pool = None mapper = map args.threads = 1 headers = ['name', 'ref', 'coverage', 'ref_coverage', 'qstart', 'qend', 'rstart', 'rend', 'aligned_ref_len', 'direction', 'length', 'read_length', 'match', 'ins', 'del', 'sub', 'iden', 'acc'] masked_headers = ['masked'] if args.bed is not None: headers.extend(masked_headers) args.output.write('\t'.join(headers)) args.output.write('\n') # create a slice of reads to process in each thread to avoid looping through # bam n read times and reduce mp overhead ranges = [(0, float('inf'))] if args.threads > 1: with pysam.AlignmentFile(args.bam) as bam: n_reads = bam.count(until_eof=True) n_reads_per_proc = ceil(n_reads / args.threads) ranges = [(start, min(start + n_reads_per_proc, n_reads)) for start in range(0, n_reads, n_reads_per_proc)] func = functools.partial(_process_reads, args.bam, all_alignments=args.all_alignments, min_length=args.min_length, bed_file=args.bed) counts = Counter() for batch_counts, results in mapper(func, ranges): counts.update(batch_counts) counts['total'] += len(results) for result in results: out_row = (str(result[x]) for x in headers) args.output.write('\t'.join(out_row)) args.output.write('\n') if pool is not None: pool.shutdown(wait=True) if counts['total'] == 0: args.summary.write('No alignments processed. Check your bam and filtering options.\n') args.summary.write('Mapped/Unmapped/Short/Masked/Skipped(all matches masked): {total}/{unmapped}/{short}/{masked}/{all_matches_masked}\n'.format_map(counts))
if __name__ == '__main__': main()