#!/usr/bin/env python
from __future__ import print_function

import collections
from optparse import OptionParser
import os
import sys

__author__ = 'Marco Nenciarini <marco.nenciarini@2ndquadrant.it>'

# Command-line interface: the script takes exactly two positional arguments,
# the two backup directories to compare.
usage = "usage: %prog [options] backup_1 backup_2\n"
parser = OptionParser(usage=usage)
options, args = parser.parse_args()

# Any other number of positional arguments is a usage error: show the help
# text and exit with a non-zero status.
if len(args) != 2:
    parser.print_help()
    sys.exit(1)

# One scanned file: size in bytes, modification time and path relative to the
# scanned directory.  Being a tuple it is hashable, so items can be stored in
# sets, and two items compare equal only when all three fields match.
FileItem = collections.namedtuple('FileItem', 'size time path')


def get_files(target_dir):
    """Return a set of FileItem describing every file below target_dir.

    The tree is walked recursively, following symbolic links to
    directories.  Each file is stat()ed (so a symlink to a file reports the
    size and mtime of its target) and recorded with its path relative to
    target_dir.

    The relative path is computed with os.path.relpath() instead of slicing
    a fixed number of characters off the joined path, so the result is the
    same whether or not target_dir ends with a path separator.

    Raises OSError if one of the listed files cannot be stat()ed.
    """
    files = set()
    for dir_path, _, file_names in os.walk(target_dir, followlinks=True):
        # Computed once per directory; os.curdir ('.') means target_dir itself.
        rel_dir = os.path.relpath(dir_path, target_dir)
        for filename in file_names:
            stats = os.stat(os.path.join(dir_path, filename))
            if rel_dir == os.curdir:
                rel_path = filename
            else:
                rel_path = os.path.join(rel_dir, filename)
            files.add(FileItem(stats.st_size, stats.st_mtime, rel_path))
    return files


def size_fmt(num, suffix='B'):
    """Render num as a human-readable size using binary (1024-based) prefixes.

    The value is divided by 1024 until its magnitude drops below 1024 or the
    known prefixes run out, then formatted with one decimal place followed by
    the prefix and suffix.  Anything too large for 'Zi' is expressed in 'Yi'.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    value = num
    index = 0
    while index < len(prefixes) and not abs(value) < 1024.0:
        value = value / 1024.0
        index += 1
    if index == len(prefixes):
        # ran past the last listed prefix
        return "%.1f%s%s" % (value, 'Yi', suffix)
    return "%3.1f%s%s" % (value, prefixes[index], suffix)

def percent_fmt(a, b):
    """Format a as a percentage of b with one decimal place.

    Returns a string such as "25.0%".  When b is zero the ratio is
    undefined, so "N/A" is returned instead of raising ZeroDivisionError.
    """
    if not b:
        return "N/A"
    return "%.1f%%" % (100.0 * a / b)


def get_size(file_set):
    """Return the sum of the ``size`` attribute of every item in file_set.

    An empty collection yields 0.  The built-in sum() is used instead of
    reduce(), which is not a builtin on Python 3.
    """
    return sum(item.size for item in file_set)


def report(a, b):
    """Print a comparison summary for two sets of FileItem.

    Reports the total size of each set, how many items appear in both, and
    the size of those matching items, both as a byte count and as a
    percentage of the second set's total.  The same figures are then given
    for the "LSN" subset of the matches: items whose path starts with
    'base', 'global' or 'pg_tblspc' and does not contain '_fsm', '_vm' or
    '_init'.
    NOTE(review): presumably these are the PostgreSQL main-fork relation
    files -- confirm.
    """
    fork_markers = ('_fsm', '_vm', '_init')
    data_prefixes = ('base', 'global', 'pg_tblspc')

    # items present in both sets, i.e. with equal size, time and path
    common = a & b

    # keep only matches that are inside the data directories and are not
    # one of the non-main forks
    common_lsn = set(
        entry for entry in common
        if not any(marker in entry.path for marker in fork_markers)
        and entry.path.startswith(data_prefixes))

    # compute every total before printing anything
    a_size = get_size(a)
    b_size = get_size(b)
    fs_based_size = get_size(common)
    lsn_based_size = get_size(common_lsn)

    print("First backup size: %d (%s)" % (a_size, size_fmt(a_size)))
    print("Second backup size: %d (%s)" % (b_size, size_fmt(b_size)))
    print("Matching files count: %d" % (len(common)))
    print("Matching LSN count: %d" % (len(common_lsn)))

    fs_details = (fs_based_size,
                  size_fmt(fs_based_size),
                  percent_fmt(fs_based_size, b_size))
    print("Matching files size: %d (%s, %s)" % fs_details)

    lsn_details = (lsn_based_size,
                   size_fmt(lsn_based_size),
                   percent_fmt(lsn_based_size, b_size))
    print("Matching LSN size: %d (%s, %s)" % lsn_details)


# Scan the two backup directories named on the command line, first then
# second, and print the comparison report.
a, b = get_files(args[0]), get_files(args[1])
report(a, b)

