Skip to content

Instantly share code, notes, and snippets.

@sobernaut
Last active July 24, 2020 16:05
Show Gist options
  • Save sobernaut/cb6b87ac7d570d3e9ce01fbd51fb6a4f to your computer and use it in GitHub Desktop.
Save sobernaut/cb6b87ac7d570d3e9ce01fbd51fb6a4f to your computer and use it in GitHub Desktop.
import sys, csv, os
import json
# Directories that analyze() searches (via getYearFile) for each year's CSV.
# `original`: IDs are read from column index 1.
original = './data'
# `new`: IDs are read from column index 1.
new = './data/new'
# `old`: IDs are read from column index 3 and passed through formatCb.
# NOTE(review): the variable is called `old` but points at './updated' --
# confirm the name/path pairing is intentional.
old = './updated'
def stripp(x):
    """Return *x* with every space character removed.

    Only the plain space (' ') is dropped; tabs, newlines and other
    whitespace are left untouched.
    """
    return ''.join(x.split(' '))
def getYearFile(path, year):
    """Return the path of the CSV file in *path* whose name starts with *year*.

    Args:
        path: Directory to search (not recursive).
        year: Filename prefix to match, as a string.

    Returns:
        The joined path of the first matching ``.csv`` file in sorted
        filename order, or ``None`` when no file matches.

    Raises:
        OSError: If *path* cannot be listed (propagated from os.listdir).
    """
    # os.listdir returns entries in arbitrary order; sort so that the same
    # file is picked on every run when several files share the prefix.
    for name in sorted(os.listdir(path)):
        if name.startswith(year) and name.endswith('.csv'):
            return os.path.join(path, name)
    return None
def getProcessedArr(file, rowNo, cb=None):
    """Read one column of a CSV file into a list, skipping the header row.

    Each value has its spaces removed with stripp() and is then passed
    through *cb* when one is given.

    Args:
        file: Path of the CSV file to read.
        rowNo: Zero-based index of the column to extract from every row
            (despite the name, this is a column index).
        cb: Optional callable applied to each space-stripped value.

    Returns:
        List of processed values, one per non-blank data row, in file order.

    Raises:
        IndexError: If a non-blank data row has no column *rowNo*.
    """
    arr = []
    line_count = 0
    # newline='' is what the csv module expects so that newlines embedded in
    # quoted fields are parsed correctly.
    with open(file, 'r', newline='') as f:
        for line_count, row in enumerate(csv.reader(f), start=1):
            if line_count == 1:
                continue  # first row is the header
            if not row:
                continue  # blank line: csv.reader yields an empty list
            stripped = stripp(row[rowNo])
            arr.append(cb(stripped) if cb else stripped)
    # line_count is the total number of rows read, header included.
    print("Processed file {} with {} lines".format(file, line_count))
    return arr
def compare(x, y):
    """Compare two collections in two ways.

    Returns:
        A two-element list ``[same_members, same_sequence]`` where
        ``same_members`` ignores order and duplicates (set equality) and
        ``same_sequence`` is plain ``x == y``.
    """
    return [set(x) == set(y), x == y]
def formatCb(val):
    """Return the second ``'_'``-separated field of *val*.

    Raises:
        IndexError: If *val* contains no underscore.
    """
    # Only the first two separators matter for picking field 1.
    fields = val.split('_', 2)
    return fields[1]
def dump(data, filename, directory='./data/rnd/'):
    """Write *data* as pretty-printed UTF-8 JSON to ``<directory>/<filename>.json``.

    Args:
        data: Any JSON-serialisable object.
        filename: Output file name without the ``.json`` extension.
        directory: Target directory; created if it does not exist yet.
            Defaults to ``'./data/rnd/'``.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
        TypeError: If *data* is not JSON-serialisable.
    """
    # Create the output directory up front so a fresh checkout does not fail
    # with FileNotFoundError on the first dump.
    if directory:
        os.makedirs(directory, exist_ok=True)
    target = os.path.join(directory, filename + '.json')
    with open(target, 'w', encoding='utf-8') as out:
        json.dump(data, out, ensure_ascii=False, indent=2)
    print('Dumped data on file {}'.format(target))
def diff(a, b):
    """Return the distinct items of *a* that do not occur in *b*.

    The result holds the same elements as ``set(a) - set(b)`` but keeps the
    order in which they first appear in *a*, so output (and any file dumped
    from it) is reproducible between runs.

    Args:
        a: Iterable of hashable items.
        b: Iterable of hashable items to exclude.

    Returns:
        A list without duplicates.
    """
    excluded = set(b)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return [item for item in dict.fromkeys(a) if item not in excluded]
def _load_ids(directory, year, column, cb=None):
    """Find *year*'s CSV in *directory* and return its processed ID column."""
    csv_path = getYearFile(directory, year)
    if csv_path is None:
        # Fail with a clear message instead of the TypeError that
        # open(None) would raise further down.
        raise FileNotFoundError(
            'No CSV file starting with {!r} found in {}'.format(year, directory))
    return getProcessedArr(csv_path, column, cb)


def analyze(year):
    """Compare the ID columns of the old, new and original CSVs for *year*.

    Prints the ID counts and the compare() results, and dumps the two
    old/new differences to JSON files named ``<year>newminusold`` and
    ``<year>oldminusnew``.

    Args:
        year: Filename prefix (as a string) used to find the CSV in each of
            the ``old``, ``new`` and ``original`` directories.

    Raises:
        FileNotFoundError: If one of the directories has no CSV for *year*.
    """
    print('\n------------------------Year {}------------------------'.format(year))
    # IDs in the old files sit in column 3 and are reduced by formatCb.
    old_str_no = _load_ids(old, year, 3, formatCb)
    new_str_no = _load_ids(new, year, 1)
    print('Total ids in new', len(new_str_no))
    print('Total ids in old', len(old_str_no))
    print('Same or not?', compare(old_str_no, new_str_no))

    newminusold = diff(new_str_no, old_str_no)
    oldminusnew = diff(old_str_no, new_str_no)
    print('Ids in new that are not in old', len(newminusold))
    dump(newminusold, year + 'newminusold')
    dump(oldminusnew, year + 'oldminusnew')

    original_str_no = _load_ids(original, year, 1)
    print('original', len(original_str_no))
    print('Same or not?', compare(old_str_no, original_str_no), compare(new_str_no, original_str_no))
    # Use the shared diff() helper rather than repeating the set arithmetic.
    print('Ids in new that are not in original', len(diff(new_str_no, original_str_no)))
    print('Ids in old that are not in original', len(diff(old_str_no, original_str_no)))
    print('/END/')
def main():
    """Prompt for a comma-separated list of years and analyze each one."""
    user_input = input('Enter year')
    for splitted in user_input.split(','):
        year = splitted.replace(' ', '')
        if not year:
            # An empty prefix would match every CSV file (startswith('') is
            # always true), so blank entries and trailing commas are skipped.
            continue
        analyze(year)


# Only prompt when run as a script, so importing the module has no side effects.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment