import io
import re
import sys
import random

## Configuration ##
# Number of characters requested from the input stream per read; each
# chunk read this way becomes one element of the chunk list.
CHUNK_LEN = 5000

# "*fix" stands for either "prefix" or "suffix".
# Sample lengths used in --prefix / --suffix mode, in ascending order:
# 10, 50, 90, ..., 250 (steps of 40).
starfix_lengths = list(range(10, 251, 40))

def copyEscape(ustring):
    r"""
    Return ``ustring`` with a backslash inserted in front of every
    backslash, carriage return, newline and tab character.

    The escaped character itself is kept as-is (a tab stays a literal
    tab, now preceded by a backslash).

    From the COPY docs:
        Backslash characters (\) can be used in the COPY data to
        quote data characters that might otherwise be taken as
        row or column delimiters.
        In particular, the following characters must be preceded
        by a backslash if they appear as part of a column value:
            backslash itself, newline, carriage return, and the
            current delimiter character.

    :LINK: https://www.postgresql.org/docs/9.1/static/sql-copy.html
    """
    def _prefix_backslash(match):
        # Keep the matched character and put one backslash before it.
        return '\\' + match.group(1)

    return re.sub(r'([\\\r\n\t])', _prefix_backslash, ustring, flags=re.UNICODE)

if __name__ == '__main__':
    # Usage: <script> INPUT_FILE [--prefix | --suffix]
    #
    # Without a mode flag (or with any other second argument) every chunk
    # of the input is COPY-escaped and the chunks are written to stdout,
    # one per line, encoded as UTF-8.
    # With --prefix / --suffix a random position inside a random chunk is
    # chosen and, for each length in starfix_lengths, the repr() of the
    # slice starting (prefix) or ending (suffix) at that position is printed.
    if len(sys.argv) < 2:
        sys.exit('usage: %s INPUT_FILE [--prefix|--suffix]' % sys.argv[0])

    input_filename = sys.argv[1]

    # The mode flag is optional; reading sys.argv[2] unconditionally would
    # raise IndexError when only the input file is given.
    mode = sys.argv[2] if len(sys.argv) > 2 else None
    prefix_mode = mode == '--prefix'
    suffix_mode = mode == '--suffix'

    # Read the whole file as a list of chunks of up to CHUNK_LEN characters.
    # read() returns an empty string at EOF, which ends the iteration; the
    # with-block guarantees the file is closed afterwards.
    with io.open(input_filename, mode='r', encoding='Latin-1') as input_stream:
        chunks = list(iter(lambda: input_stream.read(CHUNK_LEN), ''))

    if prefix_mode or suffix_mode:
        longest = max(starfix_lengths)

        # Only chunks longer than the longest sample can supply every slice
        # in full. Picking a shorter one (typically the final chunk) would
        # leave an empty range for randint() and raise ValueError.
        candidates = [c for c in chunks if len(c) > longest]
        if not candidates:
            sys.exit(
                'error: %s has no chunk longer than %d characters'
                % (input_filename, longest)
            )
        rand_chunk = random.choice(candidates)

        # Valid seed positions: leave room for the longest slice after the
        # seed (prefix) or before it (suffix).
        rand_start = 0
        rand_end = len(rand_chunk) - 1
        if prefix_mode:
            rand_end -= longest
        if suffix_mode:
            rand_start += longest

        seed_i = random.randint(rand_start, rand_end)
        for i in starfix_lengths:
            # Each slice includes the seed character, so it holds i + 1
            # characters.
            if prefix_mode:
                print(repr(rand_chunk[seed_i : seed_i + i + 1]))
            if suffix_mode:
                print(repr(rand_chunk[seed_i - i : seed_i + 1]))
    else:
        payload = u'\n'.join([copyEscape(c) for c in chunks]).encode('utf-8')
        # Write the encoded bytes plus the trailing newline that the print
        # statement used to add. On Python 3 the bytes go to the underlying
        # binary buffer; Python 2's sys.stdout accepts them directly.
        out = getattr(sys.stdout, 'buffer', sys.stdout)
        out.write(payload + b'\n')
