#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# $Id: avg.py 1226 2015-08-15 19:14:57Z coelho $
#

import argparse
# Command line interface. Options are registered in the order in which
# they should be listed by --help.
ap = argparse.ArgumentParser(
	description='show stats about data: count average stddev [min q1 median q3 max]...')

# --median / --no-median both write to opt.median, which is on by default
ap.add_argument(
	'--median',
	action='store_true',
	default=True,
	help='compute median and quartile values')
ap.add_argument(
	'--no-median',
	dest='median',
	action='store_false',
	default=True,
	help='do not compute median and quartile values')

# extra statistics and their tuning
ap.add_argument(
	'--more',
	action='store_true',
	default=False,
	help='show some more stats')
ap.add_argument(
	'--limit',
	type=float,
	default=None,
	help='set limit for counting below limit values')
ap.add_argument(
	'--length',
	type=int,
	default=None,
	help='set expected length, assume 0 if beyond')

# anything left over is an input file name
ap.add_argument('file', nargs='*', help='list of files to process')

opt = ap.parse_args()

# option consistency: each option implies the one it depends on
# --limit only matters for the extra stats; compare against None so that
# an explicit "--limit 0" (a falsy 0.0) is honoured as well
if opt.limit is not None:
	opt.more = True
# the extra stats are derived from the sorted values kept for the median
if opt.more:
	opt.median = True
# hand the positional file names over to fileinput through sys.argv
import sys
sys.argv[1:] = opt.file

import fileinput

# counters: accepted values, rejected lines; kept values (median only)
n = skipped = 0
vals = []
# k is the first value seen, used as a shift for the running sums;
# it and the extrema stay None until some data shows up
k = vmin = vmax = None
# running sums of (v - k) and (v - k)^2
sum1 = sum2 = 0.0

for text in fileinput.input():
	try:
		sample = float(text)
	except ValueError: # float conversion failed
		skipped += 1
		continue
	if opt.median: # keep track only if needed
		vals.append(sample)
	if k is None: # first time
		k = vmin = vmax = sample
	else: # next time
		vmin = min(vmin, sample)
		vmax = max(vmax, sample)
	n += 1
	shifted = sample - k
	sum1 += shifted
	sum2 += shifted * shifted

# Pad the data with zeros up to the expected --length, keeping the count,
# extrema and running sums consistent with the added values.
if opt.length:
	# k, vmin and vmax are still None when nothing was read
	if n == 0:
		sys.exit("error: no data seen, cannot pad to expected length %d" %
			opt.length)
	# count from n rather than len(vals): vals is only filled with --median
	missing = opt.length - n
	if missing < 0:
		sys.exit("error: %d values read, more than expected length %d" %
			(n, opt.length))
	if missing > 0:
		print("warning: %d missing data, extending with zeros" % missing)
		if opt.median:
			vals += [ 0.0 ] * missing
		# the added zeros take part in both extrema
		vmin = min(vmin, 0.0)
		vmax = max(vmax, 0.0)
		# each zero contributes (0 - k) and (0 - k)^2 to the shifted sums
		sum1 += - k * missing
		sum2 += k * k * missing
		n += missing

# internal sanity check: one kept value per counted value
if opt.median:
	assert len(vals) == n, "consistent length"

# five numbers...
# numpy.percentile requires numpy at least 1.9 to use 'midpoint'
# statistics.median requires python 3.4 or later
def median(vals, start, length):
	"""Return the median of the `length` items of `vals` starting at `start`.

	The function only picks the middle item (odd length) or averages the
	two middle items (even length), so `vals` is expected to be sorted.

	Raises ValueError if `length` is not positive: an empty range has no
	median, and indexing it would silently read a neighbouring item.
	"""
	if length <= 0:
		raise ValueError("median of an empty range")
	m, odd = divmod(length, 2)
	if odd:
		return vals[start + m]
	return 0.5 * (vals[start + m - 1] + vals[start + m])

def below(vals, limit):
	"""Return the fraction of `vals` that is strictly smaller than `limit`."""
	under = sum(1 for item in vals if item < limit)
	return float(under) / len(vals)

# Final report: skipped-line warning, then the statistics line(s).
if skipped:
	print("warning: %d lines skipped" % skipped)

if n > 0:
	# show result (hmmm, precision is truncated...)
	from math import sqrt
	# sum1 and sum2 are sums of (v - k) and (v - k)^2, hence the shift by k
	avg = k + sum1 / n
	variance = (sum2 - (sum1 * sum1) / n) / n
	# rounding may yield a tiny negative value, which sqrt would reject
	if variance < 0.0:
		variance = 0.0
	stddev = sqrt(variance)
	if opt.median:
		vals.sort()
		size = len(vals)
		half = size // 2
		med = median(vals, 0, size)
		if half:
			# quartiles are the medians of the lower and upper halves;
			# with an odd count the middle value belongs to neither
			q1 = median(vals, 0, half)
			q3 = median(vals, size - half, half)
		else:
			# single value: both halves are empty, use the value itself
			q1 = q3 = med
		print("avg over %d: %f ± %f [%f, %f, %f, %f, %f]" %
			(n, avg, stddev, vmin, q1, med, q3, vmax))
		if opt.more:
			# default threshold is a tenth of the median; test against None
			# so that an explicit "--limit 0" is not replaced by the default
			limit = opt.limit if opt.limit is not None else 0.1 * med
			print("percent of values below %.1f: %.1f%%" %
				(limit, 100.0 * below(vals, limit)))
	else:
		print("avg over %d: %f ± %f [%f, %f]" %
			(n, avg, stddev, vmin, vmax))
else:
	print("no data seen.")
