"""
Tabulate some simple alignment stats from sam.
"""
import argparse
import sys
import numpy as np
import pandas as pd
import warnings
from collections import OrderedDict
parser = argparse.ArgumentParser(
description="""Summarise output of `stats_from_bam`.""",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
    help='Input alignment stats (default: stdin).')
parser.add_argument('-o', '--output', type=argparse.FileType('w'), default=sys.stderr,
help='Output summary to file instead of stderr.')
parser.add_argument('-p', '--percentiles', type=int, nargs='+', default=(10,50,90),
help='Percentiles for summary.')
parser.add_argument('-pr', '--per_reference', action='store_true',
help='Also output a summary for each reference.')


def qscore(d):
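    """Convert error rates to Phred-style quality scores, Q = -10*log10(err);
    e.g. an error rate of 0.01 corresponds to Q20.
    """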
with warnings.catch_warnings():
        # some error rates may be zero, which would otherwise raise
        # divide-by-zero warnings in log10
warnings.simplefilter("ignore")
q = -10 * np.log10(d)
return q


def summarise_stats(d, percentiles=(10, 50, 90)):
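    """Summarise per-alignment error statistics.

    :param d: `pandas.DataFrame` of per-alignment stats with columns 'length',
        'rstart', 'rend', 'sub', 'ins' and 'del' (as output by `stats_from_bam`).
    :param percentiles: percentiles to report for each metric.

    :returns: list of `OrderedDict`s, one per metric, containing the overall
        (weighted) mean and the requested per-alignment percentiles.
    """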
s = []
def summarise_stat(name, f):
        # compute the overall (weighted) mean and per-alignment percentiles
        # for a single error metric
r = OrderedDict()
r['name'] = name
r['mean'] = f[0](d)
vals = f[1](d)
for p, pval in zip(percentiles, np.percentile(vals, percentiles)):
r['q{}'.format(p)] = pval
return r
# create functions to get overall length-weighted averages
# and per-alignment errors from which to calculate percentiles
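    # err_ont: errors per read length; err_bal: errors per aligned reference
    # span; del/ins: deletions/insertions per aligned reference span;
    # iden: substitutions per aligned reference span excluding deletions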
f = OrderedDict()
f['err_ont'] = (lambda d: np.sum(d['sub'] + d['ins'] + d['del']) / np.sum(d['length']), # weighted
lambda d: (d['sub'] + d['ins'] + d['del'])/ d['length']) # per alignment
f['err_bal'] = (lambda d: np.sum(d['sub'] + d['ins'] + d['del']) / np.sum(d['rend'] - d['rstart']),
lambda d: (d['sub'] + d['ins'] + d['del'])/ (d['rend'] - d['rstart']))
f['iden'] = (lambda d: np.sum(d['sub']) / (np.sum(d['rend'] - d['rstart'] - d['del'])),
lambda d: d['sub'] / (d['rend'] - d['rstart'] - d['del']))
f['del'] = (lambda d: np.sum(d['del']) / (np.sum(d['rend'] - d['rstart'])),
lambda d: d['del'] / (d['rend'] - d['rstart']))
f['ins'] = (lambda d: np.sum(d['ins']) / (np.sum(d['rend'] - d['rstart'])),
lambda d: d['ins'] / (d['rend'] - d['rstart']))
    for name, funcs in f.items():
        s.append(summarise_stat(name, funcs))
return s


def main(arguments=None):
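    """Entry point: read per-alignment stats produced by `stats_from_bam` and
    write summaries of percentage errors, Q scores and reference coverage.
    """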
args = parser.parse_args(arguments)
stats = pd.read_csv(args.input, sep='\t')
if len(stats) == 0:
        raise ValueError('No alignment stats found.')
def output_summary(stats, prefix=''):
        to_str_opts = {'buf': args.output, 'col_space': 8, 'index': False, 'justify': 'center'}
errors = pd.DataFrame(summarise_stats(stats, percentiles=args.percentiles))
pcnt_formatter = {}
for col in errors.columns:
if np.issubdtype(errors[col].dtype, np.number):
pcnt_formatter[col] = '{:5.3%}'.format
else:
pcnt_formatter[col] = '{}'.format
args.output.write('\n\n# {} Percentage Errors\n'.format(prefix))
errors.to_string(formatters=pcnt_formatter, **to_str_opts)
args.output.write('\n\n# {} Q Scores\n'.format(prefix))
for col in errors.columns:
if np.issubdtype(errors[col].dtype, np.number):
errors[col] = qscore(errors[col])
errors.to_string(float_format='%5.2f', **to_str_opts)
output_summary(stats)
args.output.write('\n\n# Ref Coverage\n')
for ref, grp in stats.groupby('ref'):
args.output.write('{} {:5.3f}\n'.format(ref, grp['ref_coverage'].sum()))
if args.per_reference:
for ref, grp in stats.groupby('ref'):
output_summary(grp, prefix=ref)


if __name__ == '__main__':
main()