#!/usr/bin/env python2

import csv
import sys
from sys import stdout, stderr
import os
import argparse

class RetainValueIterator(object):
    def __init__(self, f):
        self.f=f
        self.val=None

    def __iter__(self):
        return self

    def next(self):
        self.val=self.f.next()
        return self.val

def compare(files, id_column, ignore_columns=None, verbose=False):
    """
    Compare multiple csv files using a dict

    files - list of file names
    id_column - name of the column containing the unique ID
    ignore_columns - columns to ignore
    verbose - verbose indicator

    """
    if not ignore_columns:
        ignore_columns=[]

    data=[]
    ignore_indexes=[]
    _last_header=None
    for f in files:
        _f=open(f, 'rb')
        _rv=RetainValueIterator(_f)
        _csv=csv.reader(_rv)
        _header=_csv.next()

        if _last_header and _header != _last_header:
            raise Exception('The header rows do not match')
        _last_header=_header

        # d[0]: filename
        # d[1]: csv_reader
        # d[2]: retain_value_iterator
        # d[3]: row_number
        # d[4]: completion_status
        # d[5]: dict
        data.append([f, _csv, _rv, 1, False, {},])

    # ensure the id_column exists and get the column index
    if id_column not in _last_header:
        raise Exception('The id column specified does not exist')
    id_column_index=_last_header.index(id_column)

    # get the indexes for the ignore_columns
    try:
        ignore_indexes=[_last_header.index(a) for a in ignore_columns]
    except ValueError:
        raise Exception('One or more of the specified ignore columns does ' \
            'not exist')
        
    # identify hashable indexes
    hashable_indexes = \
        [a for a in range(len(_last_header)) if a not in ignore_indexes]

    if verbose:
        stderr.write('Loading data from %d files\n' % (len(files),))

    # write the mismatch header
    stdout.write(
        'file,line,%s\n' % (','.join(_last_header)),)

    # default to first data
    current_index=0

    # While any data has yet to complete
    while any(not d[4] for d in data):
        # keep in range
        if current_index > (len(data)-1):
            current_index=0
        # find the next incomplete data
        while data[current_index][4]==True:
            current_index+=1
        # set the current data
        d=data[current_index]

        try:
            # next row
            row=d[1].next()
            # get the raw csv from the retain value iterator
            csv_text=d[2].val
            # increment row count
            d[3]+=1
            if len(row):
                ident=row[id_column_index]
                hashed=hash(''.join(row[x] for x in hashable_indexes))

                # determine if id exists in the other data dicts
                found_in_all=True
                for od in [x for x in data if x != d]:
                    if ident not in od[5]:
                        found_in_all=False
                        break

                if found_in_all: # id exists in all other data dicts
                    # determine whether they all match
                    all_match=True
                    for od in [x for x in data if x != d]:
                        if od[5][ident][0]!=hashed:
                            all_match=False
                            break

                    # if the data did not match, print results
                    if not all_match:
                        # iterate thru each data
                        for od in [x for x in data if x != d]:
                            stdout.write('%s,%s,%s' % \
                                (od[0], od[5][ident][1], od[5][ident][2]))
                        stdout.write('%s,%s,%s' % \
                            (d[0], d[3], csv_text))

                    # iterate thru each data and delete the items
                    for od in [x for x in data if x != d]:
                        del od[5][ident]

                else: # id does not yet exists in all other dicts
                    d[5][ident]=(hashed, d[3], csv_text)

                # file switching every 100,000 rows
                if d[3] % 100000 == 0:
                    current_index+=1

        except StopIteration:
            # set completion status and increment data index   
            d[4]=True
            current_index+=1

    if verbose:
        stderr.write('Scanning for unique IDs\n')

    # Find Rows that are not in ALL files.
    stderr.write('%s,%s' % (id_column, ','.join([d[0] for d in data])))
    if len(files) > 2:
        stderr.write(',consistency')
    stderr.write('\n')

    # create a set of all remaining IDs
    id_set=set().union(*[d[5].keys() for d in data])

    # iterate the set
    for id in id_set:
        if len(files) > 2:
            _last_hash=None
            matched=True
        stderr.write('%s' % (id,))
        for d in data:
            if id in d[5]:
                stderr.write(',True')
                if len(files) > 2:
                    # check last hash
                    if _last_hash and d[5][id][0]!= _last_hash:
                        matched=False
                    _last_hash=d[5][id][0]
            else:
                stderr.write(',False')
        if len(files) > 2:
            if matched:
                stderr.write(',consistent')
            else:
                stderr.write(',mismatched')
        stderr.write('\n')

def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser=argparse.ArgumentParser(description='compare two csv files')
    parser.add_argument('files', help='file to compare',
        metavar='file', nargs='+')
    parser.add_argument('identity_col', metavar='identity_column',
        help='identity column name')

    parser.add_argument('--ignore', help='columns to ignore',
        nargs='+', default=[])
    parser.add_argument('--verbose', '-v', help='verbose indicator',
        action='store_true', default=False)

    args=parser.parse_args(argv[1:])

    files=args.files
    identity_col=[] or args.identity_col
    ignore=args.ignore
    verbose=args.verbose

    compare(files, identity_col, ignore_columns=ignore, verbose=verbose)

if __name__ == "__main__":
        sys.exit(main())