#!/usr/bin/python

import datetime
import math
import numpy
import psycopg2
import random
import re
import sys

HEAD_PORT = 5430
RSE_PORT = 5431
SSD_20_PORT = 5432
SSD_25_PORT = 5425
SSD_30_PORT = 5433

TEST_DATA_FILE = '/tmp/mcv-test-data'
NUMBER_OF_ANAYLYSE_TESTS = 10

conn0 = psycopg2.connect('port=%d dbname=john user=john' % HEAD_PORT)
conn0.set_session(autocommit = True)
conn1 = psycopg2.connect('port=%d dbname=john user=john' % RSE_PORT)
conn1.set_session(autocommit = True)
conn20 = psycopg2.connect('port=%d dbname=john user=john' % SSD_20_PORT)
conn20.set_session(autocommit = True)
conn25 = psycopg2.connect('port=%d dbname=john user=john' % SSD_25_PORT)
conn25.set_session(autocommit = True)
conn30 = psycopg2.connect('port=%d dbname=john user=john' % SSD_30_PORT)
conn30.set_session(autocommit = True)

branches = {}
branches['master'] = conn0.cursor()
branches['RSE'] = conn1.cursor()
branches['V2_20'] = conn20.cursor()
branches['V2_25'] = conn25.cursor()
branches['V2_30'] = conn30.cursor()

# Store test results in the master DB.
result_store = branches['master']

# Create one results table each time the script is run.
timestamp = datetime.datetime.now().strftime('%m%d%H%M')
results_table = 'mcv_results_' + timestamp
result_store.execute('DROP TABLE IF EXISTS %s' % (results_table,))
result_store.execute('CREATE TABLE %s ' \
                        '(branch text,' \
                        'title text,' \
                        'count int,' \
                        'dcount int,' \
                        'num_mcvs int,' \
                        'ratio_least_common_mcv float4,' \
                        'ratio_most_common_non_mcv float4)' % (results_table,))

def test_stats(cur, x):
    cur.execute('EXPLAIN ANALYSE'\
                ' SELECT * FROM mcv_test WHERE x = %s', (x,))
    plan = cur.fetchone()[0]
    #~ print '***' + plan
    match = re.search('.*rows=([0-9]*).*rows=([0-9]*)', plan)
    estimate = float(match.group(1))
    actual = float(match.group(2))
    return actual / estimate

def test_analyse(title, branch, cur):
    for i in range(NUMBER_OF_ANAYLYSE_TESTS):
        cur.execute('ANALYSE mcv_test')
        cur.execute('SELECT coalesce(cardinality(most_common_vals), 0)'\
                    '  FROM pg_stats WHERE tablename = \'mcv_test\'')
        num_mcvs = (cur.fetchone()[0])

        cur.execute('SELECT count(x), sum(count)'\
                    '  FROM mcv_test_counts')
        (dcount, count) = cur.fetchone()

        # Returns MCVs as rows
        mcv_query = 'SELECT unnest('\
                    '         string_to_array('\
                    '           array_to_string('\
                    '             most_common_vals, \',\'), \',\'))::int'\
                    '  FROM pg_stats WHERE tablename = \'mcv_test\''

        # Report ratio between estimated and actual cardinality of most common non-MCV
        cur.execute('SELECT x' \
                    '  FROM mcv_test_counts' \
                    '  WHERE x NOT IN ( %s )' \
                    '  ORDER BY count DESC LIMIT 1' % (mcv_query,))

        most_common_non_mcv = cur.fetchone()
        if most_common_non_mcv is None:
            ratio_most_common_non_mcv = 0
        else:
            ratio_most_common_non_mcv = test_stats(cur, most_common_non_mcv)

        # Report ratio between estimated and actual cardinality of least common MCV
        cur.execute('SELECT x' \
                    '  FROM mcv_test_counts' \
                    '  WHERE x IN ( %s )' \
                    '  ORDER BY count ASC LIMIT 1' % (mcv_query,))

        least_common_mcv = cur.fetchone()
        if least_common_mcv is None:
            ratio_least_common_mcv = 0
        else:
            ratio_least_common_mcv = test_stats(cur, least_common_mcv)

        # Save results
        result_store.execute(
        'INSERT INTO %s' \
        '       (branch, title,  count, dcount, num_mcvs, ratio_most_common_non_mcv, ratio_least_common_mcv)' \
        'VALUES (\'%s\', \'%s\', %d,    %d,     %d,       %.4f,                      %.4f)' %
            (results_table,
                 branch, title,  count, dcount, num_mcvs, ratio_most_common_non_mcv, ratio_least_common_mcv))

        print
        print branch
        print title
        print 'Table size:', count
        print 'Distinct values:', dcount
        print 'Number of MCVs:', num_mcvs
        print 'Ratio MC non-MCV', ratio_most_common_non_mcv
        print 'Ratio LC MCV', ratio_least_common_mcv

def test(dist_fn, *args):
    with open(TEST_DATA_FILE, 'w') as fd:
        title = dist_fn(fd, *args)

    for branch, cur in branches.iteritems():
        # Recreate table
        cur.execute('DROP TABLE IF EXISTS mcv_test')
        cur.execute('CREATE TABLE mcv_test(x int)')

        # Load data
        cur.execute('COPY mcv_test FROM \'%s\'' % TEST_DATA_FILE)
        #~ cur.execute('CREATE INDEX mcv_test_x_idx ON mcv_test(x)')

        # Populate counts table
        cur.execute('DROP TABLE IF EXISTS mcv_test_counts')
        cur.execute('CREATE TABLE mcv_test_counts(x int, count int)')
        cur.execute('INSERT INTO mcv_test_counts (x, count)' \
                    'SELECT x, count(*)' \
                    '  FROM mcv_test' \
                    '  GROUP BY 1' \
                    '  ORDER BY 2 DESC')

        test_analyse(title, branch, cur)

def uniform_correlated(fd, N, Nd):
    x = 0
    while x < N:
        val = int(x*float(Nd)/N)
        print >>fd, val
        x += 1
    return 'Correlated uniform distribution (N=%d, Nd=%d)' % (N, Nd)

def uniform_random(fd, N, Nd):
    vals = range(0, N)
    random.shuffle(vals)
    x = 0
    while x < N:
        val = int(vals[x]*float(Nd)/N)
        print >>fd, val
        x += 1
    return 'Random uniform distribution (N=%d, Nd=%d)' % (N, Nd)

def normal(fd, N, sd):
    for val in numpy.random.normal(0, sd, N):
        print >>fd, int(round(val))
    return 'Normal distribution (N=%d, sd=%d)' % (N, sd)
    
def binomial(fd, N, n, p):
    for val in numpy.random.binomial(n, p, N):
        print >>fd, int(round(val))
    return 'Binomial distribution (N=%d, n=%d, p=%f)' % (N, n, p)

def exponential(fd, N, sd):
    for val in numpy.random.exponential(sd, N):
        print >>fd, int(round(val))
    return 'Exponential distribution (N=%d, sd=%f (beta))' % (N, sd)

def laplace(fd, N, b):
    for val in numpy.random.laplace(0, b, N):
        print >>fd, int(round(val))
    return 'Laplace distribution (N=%d, b=%f (lambda))' % (N, b)

def pareto(fd, N, a):
    for val in numpy.random.pareto(a, N):
        print >>fd, int(round(val))
    return 'Pareto distribution (N=%d, a=%f)' % (N, a)

def multimodal(fd, N, peaks, sep, sd):
    for x in range(0, N):
        mean = random.randrange(0, peaks)*sep
        print >>fd, int(round(numpy.random.normal(mean, sd)))
    return 'Multimodal normal mix (N=%d, peaks=%d, sep=%d, sd=%d)' % (N, peaks, sep, sd)

def uniform_correlated_tests():
    test(uniform_correlated, 30000, 100)
    test(uniform_correlated, 30000, 150)

    test(uniform_correlated, 40000, 100)
    test(uniform_correlated, 40000, 150)
    test(uniform_correlated, 40000, 200)
    test(uniform_correlated, 40000, 500)
    test(uniform_correlated, 40000, 1000)
    test(uniform_correlated, 40000, 2000)
    test(uniform_correlated, 40000, 3000)
    test(uniform_correlated, 40000, 4000)
    test(uniform_correlated, 40000, 5000)
    test(uniform_correlated, 40000, 10000)
    test(uniform_correlated, 40000, 20000)

    test(uniform_correlated, 50000, 100)
    test(uniform_correlated, 50000, 150)
    test(uniform_correlated, 50000, 200)
    test(uniform_correlated, 50000, 500)
    test(uniform_correlated, 50000, 1000)
    test(uniform_correlated, 50000, 2000)
    test(uniform_correlated, 50000, 3000)
    test(uniform_correlated, 50000, 6000)
    test(uniform_correlated, 50000, 12000)
    test(uniform_correlated, 50000, 25000)

    test(uniform_correlated, 60000, 100)
    test(uniform_correlated, 60000, 150)
    test(uniform_correlated, 60000, 200)
    test(uniform_correlated, 60000, 500)
    test(uniform_correlated, 60000, 1000)
    test(uniform_correlated, 60000, 2000)
    test(uniform_correlated, 60000, 3000)
    test(uniform_correlated, 60000, 5000)
    test(uniform_correlated, 60000, 7000)
    test(uniform_correlated, 60000, 15000)
    test(uniform_correlated, 60000, 30000)

    test(uniform_correlated, 100000, 100)
    test(uniform_correlated, 100000, 150)
    test(uniform_correlated, 100000, 200)
    test(uniform_correlated, 100000, 300)
    test(uniform_correlated, 100000, 400)
    test(uniform_correlated, 100000, 500)
    test(uniform_correlated, 100000, 1000)
    test(uniform_correlated, 100000, 2000)
    test(uniform_correlated, 100000, 3000)
    test(uniform_correlated, 100000, 4000)
    test(uniform_correlated, 100000, 5000)
    test(uniform_correlated, 100000, 10000)
    test(uniform_correlated, 100000, 20000)
    test(uniform_correlated, 100000, 50000)

    test(uniform_correlated, 1000000, 100)
    test(uniform_correlated, 1000000, 200)
    test(uniform_correlated, 1000000, 500)
    test(uniform_correlated, 1000000, 750)
    test(uniform_correlated, 1000000, 1000)
    test(uniform_correlated, 1000000, 1500)
    test(uniform_correlated, 1000000, 2000)
    test(uniform_correlated, 1000000, 3000)
    test(uniform_correlated, 1000000, 4000)
    test(uniform_correlated, 1000000, 5000)
    test(uniform_correlated, 1000000, 10000)
    test(uniform_correlated, 1000000, 100000)
    test(uniform_correlated, 1000000, 500000)

def uniform_random_tests():
    test(uniform_random, 30000, 100)
    test(uniform_random, 30000, 150)

    test(uniform_random, 40000, 100)
    test(uniform_random, 40000, 150)
    test(uniform_random, 40000, 200)
    test(uniform_random, 40000, 500)
    test(uniform_random, 40000, 1000)
    test(uniform_random, 40000, 2000)
    test(uniform_random, 40000, 3000)
    test(uniform_random, 40000, 4000)
    test(uniform_random, 40000, 5000)
    test(uniform_random, 40000, 10000)
    test(uniform_random, 40000, 20000)

    test(uniform_random, 50000, 100)
    test(uniform_random, 50000, 150)
    test(uniform_random, 50000, 200)
    test(uniform_random, 50000, 500)
    test(uniform_random, 50000, 1000)
    test(uniform_random, 50000, 2000)
    test(uniform_random, 50000, 3000)
    test(uniform_random, 50000, 6000)
    test(uniform_random, 50000, 12000)
    test(uniform_random, 50000, 25000)

    test(uniform_random, 60000, 100)
    test(uniform_random, 60000, 150)
    test(uniform_random, 60000, 200)
    test(uniform_random, 60000, 500)
    test(uniform_random, 60000, 1000)
    test(uniform_random, 60000, 2000)
    test(uniform_random, 60000, 3000)
    test(uniform_random, 60000, 5000)
    test(uniform_random, 60000, 7000)
    test(uniform_random, 60000, 15000)
    test(uniform_random, 60000, 30000)

    test(uniform_random, 100000, 100)
    test(uniform_random, 100000, 150)
    test(uniform_random, 100000, 200)
    test(uniform_random, 100000, 300)
    test(uniform_random, 100000, 400)
    test(uniform_random, 100000, 500)
    test(uniform_random, 100000, 1000)
    test(uniform_random, 100000, 2000)
    test(uniform_random, 100000, 3000)
    test(uniform_random, 100000, 4000)
    test(uniform_random, 100000, 5000)
    test(uniform_random, 100000, 10000)
    test(uniform_random, 100000, 20000)
    test(uniform_random, 100000, 50000)

    test(uniform_random, 1000000, 100)
    test(uniform_random, 1000000, 200)
    test(uniform_random, 1000000, 500)
    test(uniform_random, 1000000, 750)
    test(uniform_random, 1000000, 1000)
    test(uniform_random, 1000000, 1500)
    test(uniform_random, 1000000, 2000)
    test(uniform_random, 1000000, 3000)
    test(uniform_random, 1000000, 4000)
    test(uniform_random, 1000000, 5000)
    test(uniform_random, 1000000, 10000)
    test(uniform_random, 1000000, 100000)
    test(uniform_random, 1000000, 500000)

def normal_tests():
    test(normal, 500000, 1)
    test(normal, 500000, 2)
    test(normal, 500000, 3)
    test(normal, 500000, 4)
    test(normal, 500000, 5)
    test(normal, 500000, 6)
    test(normal, 500000, 6)
    test(normal, 500000, 7)
    test(normal, 500000, 8)
    test(normal, 500000, 9)
    test(normal, 500000, 10)
    test(normal, 500000, 15)
    test(normal, 500000, 20)
    test(normal, 500000, 30)
    test(normal, 500000, 40)
    test(normal, 500000, 50)
    test(normal, 500000, 75)
    test(normal, 500000, 100)
    test(normal, 500000, 150)
    test(normal, 500000, 200)
    test(normal, 500000, 300)
    test(normal, 500000, 400)
    test(normal, 500000, 500)
    test(normal, 500000, 600)
    test(normal, 500000, 800)
    test(normal, 500000, 1000)

def binomial_tests():
    test(binomial, 500000, 100, 0.05)
    test(binomial, 500000, 100, 0.1)
    test(binomial, 500000, 100, 0.2)
    test(binomial, 500000, 100, 0.3)
    test(binomial, 500000, 100, 0.4)
    test(binomial, 500000, 100, 0.5)

    test(binomial, 500000, 1000, 0.1)
    test(binomial, 500000, 1000, 0.2)
    test(binomial, 500000, 1000, 0.3)
    test(binomial, 500000, 1000, 0.4)
    test(binomial, 500000, 1000, 0.5)

    test(binomial, 500000, 2000, 0.5)
    test(binomial, 500000, 3000, 0.5)
    test(binomial, 500000, 4000, 0.5)
    test(binomial, 500000, 5000, 0.5)

def exponential_tests():
    test(exponential, 500000, 0.25)
    test(exponential, 500000, 0.5)
    test(exponential, 500000, 1)
    test(exponential, 500000, 2)
    test(exponential, 500000, 3)
    test(exponential, 500000, 4)
    test(exponential, 500000, 5)
    test(exponential, 500000, 10)
    test(exponential, 500000, 20)
    test(exponential, 500000, 30)

def laplace_tests():
    test(laplace, 500000, 0.1)
    test(laplace, 500000, 0.25)
    test(laplace, 500000, 0.5)
    test(laplace, 500000, 1)
    test(laplace, 500000, 2)
    test(laplace, 500000, 3)
    test(laplace, 500000, 4)
    test(laplace, 500000, 5)
    test(laplace, 500000, 10)
    test(laplace, 500000, 25)
    test(laplace, 500000, 50)
    test(laplace, 500000, 100)

def pareto_tests():
    test(pareto, 500000, 1)
    test(pareto, 500000, 2)
    test(pareto, 500000, 3)
    test(pareto, 500000, 4)
    test(pareto, 500000, 5)

def multimodal_tests():
    test(multimodal, 500000, 2, 100, 10)
    test(multimodal, 500000, 3, 50, 10)
    test(multimodal, 500000, 5, 30, 10)
    test(multimodal, 1000000, 10, 20, 10)
    test(multimodal, 2000000, 20, 20, 15)

uniform_correlated_tests()
uniform_random_tests()
normal_tests()
binomial_tests()
exponential_tests()
laplace_tests()
pareto_tests()
multimodal_tests()
