Skip to content

Instantly share code, notes, and snippets.

@sobernaut
Last active July 24, 2020 16:05
Show Gist options
  • Save sobernaut/cb6b87ac7d570d3e9ce01fbd51fb6a4f to your computer and use it in GitHub Desktop.
Save sobernaut/cb6b87ac7d570d3e9ce01fbd51fb6a4f to your computer and use it in GitHub Desktop.

Revisions

  1. sobernaut revised this gist Jul 24, 2020. 1 changed file with 46 additions and 20 deletions.
    66 changes: 46 additions & 20 deletions compare.py
    Original file line number Diff line number Diff line change
    @@ -1,24 +1,21 @@
    import sys, csv, os
    import json


    original = './data'
    new = './data/new'
    old = './updated'


    year = input('Enter year')

    def stripp(x):
        """Return ``x`` with every space character (``' '``) removed.

        Only the plain space is dropped; tabs, newlines and other whitespace
        characters are left untouched.
        """
        # Splitting on a single space and re-joining removes each occurrence.
        pieces = x.split(' ')
        return ''.join(pieces)

    def getYearFile(path):
    def getYearFile(path, year):
        """Return the path of the CSV file in ``path`` whose name starts with ``year``.

        Args:
            path: Directory to search (its direct entries only).
            year: Prefix the file name has to start with.

        Returns:
            ``path + '/' + name`` for the alphabetically first entry that ends
            with ``.csv`` and starts with ``year``, or ``None`` when nothing
            matches.
        """
        # os.listdir() yields entries in arbitrary order; sorting makes the
        # pick reproducible when several files share the same prefix.
        for name in sorted(os.listdir(path)):
            if name.endswith(".csv") and name.startswith(year):
                return path + '/' + name
        # Make the "nothing found" outcome explicit for callers.
        return None




    def getProcessedArr(file, rowNo, cb=None):
    arr = []
    with open(file, 'r') as f:
    @@ -46,23 +43,52 @@ def compare(x, y):
    def formatCb(val):
        """Return the second ``'_'``-separated field of ``val``.

        Raises:
            IndexError: If ``val`` contains no underscore.
        """
        # Field 1 is fully determined by the first two separators, so the
        # split can stop there.
        fields = val.split('_', 2)
        return fields[1]

    def dump(data, filename):
        """Write ``data`` as indented JSON to ``./data/rnd/<filename>.json``.

        Args:
            data: Object to serialise with ``json.dump``.
            filename: Base name of the output file, without the ``.json``
                extension.

        The target directory is created when it does not exist yet. The path
        of the written file is printed afterwards.
        """
        f = './data/rnd/' + filename + '.json'
        # Without this the very first run fails with FileNotFoundError
        # because open(..., 'w') does not create missing directories.
        os.makedirs(os.path.dirname(f), exist_ok=True)
        with open(f, 'w', encoding='utf-8') as file:
            json.dump(data, file, ensure_ascii=False, indent=2)
        print('Dumped data on file {}'.format(f))

    def diff(a, b):
        """Return the distinct items of ``a`` that do not occur in ``b``.

        Args:
            a: Iterable of hashable items to keep from.
            b: Iterable of hashable items to exclude.

        Returns:
            A list without duplicates, ordered by first appearance in ``a``.
            A bare set difference has no stable order, which would make any
            output built from the result differ from run to run.
        """
        excluded = set(b)
        # dict keys preserve insertion order and drop repeated items.
        return list(dict.fromkeys(item for item in a if item not in excluded))

    def analyze(year):
    print('\n------------------------Year {}------------------------'.format(year))
    old_csv = getYearFile(old, year)
    old_str_no = getProcessedArr(old_csv, 3, formatCb)
    new_csv = getYearFile(new, year)
    new_str_no = getProcessedArr(new_csv, 1)



    print('Total ids in new', len(new_str_no))
    print('Total ids in old', len(old_str_no))
    print('Same or not?', compare(old_str_no, new_str_no))

    newminusold = diff(new_str_no, old_str_no)
    oldminusnew = diff(old_str_no, new_str_no)

    print('Ids in new that are not in old', len(newminusold))


    dump(newminusold, year + 'newminusold')
    dump(oldminusnew, year + 'oldminusnew')


    original_csv = getYearFile(original, year)
    original_str_no = getProcessedArr(original_csv, 1)

    old_csv = getYearFile(old)
    old_str_no = getProcessedArr(old_csv, 3, formatCb)
    new_csv = getYearFile(new)
    new_str_no = getProcessedArr(new_csv, 1)
    print('original', len(original_str_no))
    print('Same or not?', compare(old_str_no, original_str_no), compare(new_str_no, original_str_no))
    print('Ids in new that are not in original', len(list(set(new_str_no) - set(original_str_no))))
    print('Ids in old that are not in original', len(list(set(old_str_no) - set(original_str_no))))
    print('/END/')

    print('Total ids in new', len(new_str_no))
    print('Total ids in old', len(old_str_no))
    print('Same or not?', compare(old_str_no, new_str_no))
    print('Ids in new that are not in old', len(list(set(new_str_no) - set(old_str_no))))


    original_csv = getYearFile(original)
    original_str_no = getProcessedArr(original_csv, 1)
    user_input = input('Enter year')

    print('original', len(original_str_no))
    print('Same or not?', compare(old_str_no, original_str_no), compare(new_str_no, original_str_no))
    print('Ids in new that are not in original', len(list(set(new_str_no) - set(original_str_no))))
    print('Ids in old that are not in original', len(list(set(old_str_no) - set(original_str_no))))
    for splitted in user_input.split(','):
    analyze(splitted.replace(' ', ''))


  2. sobernaut revised this gist Jul 19, 2020. 1 changed file with 9 additions and 19 deletions.
    28 changes: 9 additions & 19 deletions compare.py
    Original file line number Diff line number Diff line change
    @@ -2,8 +2,8 @@


    original = './data'
    old = './data/new'
    new = './updated'
    new = './data/new'
    old = './updated'


    year = input('Enter year')
    @@ -48,31 +48,21 @@ def formatCb(val):


    old_csv = getYearFile(old)
    old_str_no = getProcessedArr(old_csv, 3, formatCb)
    new_csv = getYearFile(new)
    new_str_no = getProcessedArr(new_csv, 1)


    old_str_no = getProcessedArr(old_csv, 1)
    new_str_no = getProcessedArr(new_csv, 3, formatCb)

    print('new', len(new_str_no), 'old', len(old_str_no))
    print('Total ids in new', len(new_str_no))
    print('Total ids in old', len(old_str_no))
    print('Same or not?', compare(old_str_no, new_str_no))

    u = list(set(old_str_no) - set(new_str_no))
    p = list(set(new_str_no) - set(old_str_no))

    print('Change in U', len(u))
    print('Change in P', len(p))
    print('Ids in new that are not in old', len(list(set(new_str_no) - set(old_str_no))))


    original_csv = getYearFile(original)
    original_str_no = getProcessedArr(original_csv, 1)

    print('original', len(original_str_no))
    print('Same or not?', compare(old_str_no, original_str_no), compare(new_str_no, original_str_no))

    u = list(set(old_str_no) - set(original_str_no))
    p = list(set(new_str_no) - set(original_str_no))

    print('Change in U', len(u))
    print('Change in P', len(p))
    print('Ids in new that are not in original', len(list(set(new_str_no) - set(original_str_no))))
    print('Ids in old that are not in original', len(list(set(old_str_no) - set(original_str_no))))

  3. sobernaut revised this gist Jul 19, 2020. 1 changed file with 10 additions and 12 deletions.
    22 changes: 10 additions & 12 deletions compare.py
    Original file line number Diff line number Diff line change
    @@ -63,18 +63,16 @@ def formatCb(val):
    print('Change in U', len(u))
    print('Change in P', len(p))

    arguments = sys.argv
    if len(arguments) > 2:
    original_csv = getYearFile(original)
    original_str_no = getProcessedArr(original_csv, 1)

    print('original', len(original_str_no))
    print('Same or not?', compare(old_str_no, original_str_no), compare(new_str_no, original_str_no))

    u = list(set(old_str_no) - set(original_str_no))
    p = list(set(new_str_no) - set(original_str_no))

    print('Change in U', len(u))
    print('Change in P', len(p))
    original_csv = getYearFile(original)
    original_str_no = getProcessedArr(original_csv, 1)

    print('original', len(original_str_no))
    print('Same or not?', compare(old_str_no, original_str_no), compare(new_str_no, original_str_no))

    u = list(set(old_str_no) - set(original_str_no))
    p = list(set(new_str_no) - set(original_str_no))

    print('Change in U', len(u))
    print('Change in P', len(p))

  4. sobernaut revised this gist Jul 15, 2020. 1 changed file with 16 additions and 3 deletions.
    19 changes: 16 additions & 3 deletions compare.py
    Original file line number Diff line number Diff line change
    @@ -19,7 +19,7 @@ def getYearFile(path):



    def getProcessedArr(file, rowNo):
    def getProcessedArr(file, rowNo, cb=None):
    arr = []
    with open(file, 'r') as f:
    reader = csv.reader(f)
    @@ -29,7 +29,8 @@ def getProcessedArr(file, rowNo):
    if line_count == 0:
    line_count += 1
    else:
    structure_no = stripp(row[rowNo])
    stripped = stripp(row[rowNo])
    structure_no = cb(stripped) if cb else stripped
    arr.append(structure_no)
    line_count += 1

    @@ -42,15 +43,25 @@ def compare(x, y):

    return [comp_set, comp_set_none]

    def formatCb(val):
        """Return the second ``'_'``-separated field of ``val``.

        Raises:
            IndexError: If ``val`` contains no underscore.
        """
        # Field 1 is fully determined by the first two separators, so the
        # split can stop there.
        fields = val.split('_', 2)
        return fields[1]


    old_csv = getYearFile(old)
    new_csv = getYearFile(new)


    old_str_no = getProcessedArr(old_csv, 1)
    new_str_no = getProcessedArr(new_csv, 11)
    new_str_no = getProcessedArr(new_csv, 3, formatCb)

    print('new', len(new_str_no), 'old', len(old_str_no))
    print('Same or not?', compare(old_str_no, new_str_no))

    u = list(set(old_str_no) - set(new_str_no))
    p = list(set(new_str_no) - set(old_str_no))

    print('Change in U', len(u))
    print('Change in P', len(p))

    arguments = sys.argv
    if len(arguments) > 2:
    @@ -65,3 +76,5 @@ def compare(x, y):

    print('Change in U', len(u))
    print('Change in P', len(p))


  5. sobernaut created this gist Jul 15, 2020.
    67 changes: 67 additions & 0 deletions compare.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,67 @@
    import sys, csv, os


    # Directories that getYearFile() is pointed at to find the per-year CSVs.
    original = './data'
    # NOTE(review): `old` points at './data/new' while `new` points at
    # './updated' — confirm the two names are not swapped.
    old = './data/new'
    new = './updated'


    # Prefix a CSV file name must start with to be selected by getYearFile().
    # Read once, interactively, when the module is executed.
    year = input('Enter year')

    def stripp(x):
        """Return ``x`` with every space character (``' '``) removed.

        Only the plain space is dropped; tabs, newlines and other whitespace
        characters are left untouched.
        """
        # Splitting on a single space and re-joining removes each occurrence.
        pieces = x.split(' ')
        return ''.join(pieces)

    def getYearFile(path, prefix=None):
        """Return the path of the CSV file in ``path`` whose name starts with a prefix.

        Args:
            path: Directory to search (its direct entries only).
            prefix: Required start of the file name. When omitted, the
                module-level ``year`` value is used, which keeps existing
                one-argument calls working unchanged.

        Returns:
            ``path + '/' + name`` for the alphabetically first entry that ends
            with ``.csv`` and starts with the prefix, or ``None`` when nothing
            matches.
        """
        if prefix is None:
            prefix = year
        # os.listdir() yields entries in arbitrary order; sorting makes the
        # pick reproducible when several files share the same prefix.
        for name in sorted(os.listdir(path)):
            if name.endswith(".csv") and name.startswith(prefix):
                return path + '/' + name
        # Make the "nothing found" outcome explicit for callers.
        return None



    def getProcessedArr(file, rowNo):
        """Read one column of a CSV file, skipping the header row.

        Args:
            file: Path of the CSV file to read.
            rowNo: Zero-based index of the column to extract from each row
                (despite the name, it selects a column, not a row).

        Returns:
            A list holding the value of column ``rowNo`` with spaces removed,
            one entry per non-empty data row, in file order.

        Raises:
            IndexError: If a non-empty data row has no column ``rowNo``.
        """
        arr = []
        line_count = 0
        # newline='' is what the csv module expects, so that line breaks
        # inside quoted fields are parsed correctly.
        with open(file, 'r', newline='') as f:
            for line_count, row in enumerate(csv.reader(f), start=1):
                if line_count == 1:
                    continue  # header row
                if not row:
                    continue  # blank line: csv.reader yields [] for it
                arr.append(stripp(row[rowNo]))

        # line_count is the number of rows read, header included.
        print("Processed file {} with {} lines".format(file, line_count))
        return arr

    def compare(x, y):
        """Compare ``x`` and ``y`` both as sets and as they are.

        Returns:
            A two-item list ``[same_members, identical]``: whether both hold
            the same distinct items, and whether ``x == y`` holds directly.
        """
        # Two collections have equal sets exactly when no item is exclusive
        # to either side.
        same_members = not set(x).symmetric_difference(y)
        identical = x == y
        return [same_members, identical]

    # Locate the CSV for the requested year in the `old` and `new` directories.
    old_csv = getYearFile(old)
    new_csv = getYearFile(new)

    # Extract the column to compare from each file: index 1 in the old file,
    # index 11 in the new one.
    old_str_no = getProcessedArr(old_csv, 1)
    new_str_no = getProcessedArr(new_csv, 11)

    print('new', len(new_str_no), 'old', len(old_str_no))
    # compare() yields [equal as sets, equal as lists].
    print('Same or not?', compare(old_str_no, new_str_no))


    # The comparison against the `original` directory is gated on at least two
    # command-line arguments being present; their values are never read.
    # NOTE(review): presumably every statement below belongs to the `if` body —
    # confirm against the properly indented file.
    arguments = sys.argv
    if len(arguments) > 2:
    original_csv = getYearFile(original)
    original_str_no = getProcessedArr(original_csv, 1)

    print('original', len(original_str_no))
    print('Same or not?', compare(old_str_no, original_str_no), compare(new_str_no, original_str_no))

    # Values present in old / new that are missing from the original file.
    u = list(set(old_str_no) - set(original_str_no))
    p = list(set(new_str_no) - set(original_str_no))

    print('Change in U', len(u))
    print('Change in P', len(p))