Last active
July 24, 2020 16:05
-
-
Save sobernaut/cb6b87ac7d570d3e9ce01fbd51fb6a4f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import json
import os
import sys
# Directories (relative to the current working directory) that are searched
# for the per-year CSV files compared by this script.
original = './data'
new = './data/new'
# NOTE(review): the "old" data set is read from './updated' -- confirm the
# naming is intentional.
old = './updated'
def stripp(x):
    """Return ``x`` with every space character (``' '``) removed.

    Only the plain space is dropped; tabs, newlines and any other
    whitespace are left in place.
    """
    return x.replace(' ', '')
def getYearFile(path, year):
    """Return the path of a ``.csv`` file in ``path`` whose name starts with ``year``.

    Args:
        path: Directory to search (not recursive).
        year: Filename prefix to match, as a string.

    Returns:
        ``path + '/' + filename`` for the alphabetically first match, or
        ``None`` when no file matches.

    Raises:
        OSError: If ``path`` cannot be listed (propagated from ``os.listdir``).
    """
    # os.listdir() returns entries in arbitrary order; sort so that the file
    # chosen is the same on every run when several files match.
    for file in sorted(os.listdir(path)):
        if file.endswith(".csv") and file.startswith(year):
            return path + '/' + file
    return None
def getProcessedArr(file, rowNo, cb=None):
    """Collect one column of a CSV file, skipping the first (header) row.

    Args:
        file: Path of the CSV file to read.
        rowNo: Zero-based index of the column to extract from each row
            (despite the name, this indexes into a row, not the file).
        cb: Optional callable applied to each space-stripped cell value;
            its return value is stored instead of the stripped value.

    Returns:
        A list with one entry per non-blank data row, in file order.

    Raises:
        IndexError: If a non-blank row has no column ``rowNo``.
    """
    arr = []
    line_count = 0
    # newline='' lets the csv module handle line endings itself, as the csv
    # documentation requires for files passed to csv.reader.
    with open(file, 'r', newline='') as f:
        for row in csv.reader(f):
            line_count += 1
            if line_count == 1:
                # First row is the header.
                continue
            if not row:
                # csv.reader yields [] for blank lines; there is no cell to read.
                continue
            stripped = stripp(row[rowNo])
            arr.append(cb(stripped) if cb else stripped)
    print("Processed file {} with {} lines".format(file, line_count))
    return arr
def compare(x, y):
    """Compare two sequences both as sets and as-is.

    Returns:
        A two-item list ``[same_members, same_exact]`` where ``same_members``
        is ``set(x) == set(y)`` (order and duplicates ignored) and
        ``same_exact`` is ``x == y``.
    """
    same_members = set(x) == set(y)
    same_exact = x == y
    return [same_members, same_exact]
def formatCb(val):
    """Return the second ``'_'``-separated field of ``val``.

    Raises:
        IndexError: If ``val`` contains no underscore.
    """
    return val.split('_')[1]
def dump(data, filename):
    """Write ``data`` as indented UTF-8 JSON to ``./data/rnd/<filename>.json``.

    Args:
        data: Any JSON-serialisable object.
        filename: Base name of the output file, without the ``.json`` suffix.
    """
    f = './data/rnd/' + filename + '.json'
    # Create the output directory on first use instead of failing with
    # FileNotFoundError when it does not exist yet.
    os.makedirs(os.path.dirname(f), exist_ok=True)
    with open(f, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=2)
    print('Dumped data on file {}'.format(f))
def diff(a, b):
    """Return the distinct items of ``a`` that do not occur in ``b``.

    The result holds the same elements as ``set(a) - set(b)``, but in the
    order of their first appearance in ``a``, so the output (and any file
    dumped from it) is identical from run to run.
    """
    excluded = set(b)
    # dict.fromkeys() de-duplicates while keeping first-occurrence order.
    return [item for item in dict.fromkeys(a) if item not in excluded]
def _requireYearFile(path, year):
    """Return the CSV path for ``year`` under ``path``, raising if there is none."""
    found = getYearFile(path, year)
    if found is None:
        raise FileNotFoundError(
            'No .csv file starting with {!r} found in {}'.format(year, path))
    return found


def analyze(year):
    """Compare the ids of the old, new and original CSV files for ``year``.

    Reads column 3 of the ``old`` file (passed through ``formatCb``) and
    column 1 of the ``new`` and ``original`` files, prints the counts and
    equality checks, and dumps the two old/new differences to JSON files
    named ``<year>newminusold`` and ``<year>oldminusnew``.

    Args:
        year: Filename prefix (string) selecting the CSV in each directory.

    Raises:
        FileNotFoundError: If one of the directories has no matching CSV.
    """
    print('\n------------------------Year {}------------------------'.format(year))

    old_csv = _requireYearFile(old, year)
    old_str_no = getProcessedArr(old_csv, 3, formatCb)

    new_csv = _requireYearFile(new, year)
    new_str_no = getProcessedArr(new_csv, 1)

    print('Total ids in new', len(new_str_no))
    print('Total ids in old', len(old_str_no))
    print('Same or not?', compare(old_str_no, new_str_no))

    newminusold = diff(new_str_no, old_str_no)
    oldminusnew = diff(old_str_no, new_str_no)
    print('Ids in new that are not in old', len(newminusold))
    dump(newminusold, year + 'newminusold')
    dump(oldminusnew, year + 'oldminusnew')

    original_csv = _requireYearFile(original, year)
    original_str_no = getProcessedArr(original_csv, 1)
    print('original', len(original_str_no))
    print('Same or not?', compare(old_str_no, original_str_no), compare(new_str_no, original_str_no))
    # Use the shared diff() helper rather than repeating the set arithmetic.
    print('Ids in new that are not in original', len(diff(new_str_no, original_str_no)))
    print('Ids in old that are not in original', len(diff(old_str_no, original_str_no)))
    print('/END/')
if __name__ == '__main__':
    # Prompt for one or more comma-separated years and analyze each in turn.
    # Guarded so that importing this module does not block on input().
    user_input = input('Enter year')
    for splitted in user_input.split(','):
        year = splitted.replace(' ', '')
        # An empty prefix would match every CSV file, so skip blank entries
        # (e.g. from a trailing comma or empty input).
        if year:
            analyze(year)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment