"""Compute the phi coefficient between journal events and a boolean outcome.

The input is a JSON array of entries. Each entry is an object with an
``"events"`` list and a ``"squirrel"`` flag. For every distinct event a
2x2 contingency table against the flag is built, and the phi coefficient
of each table is printed as JSON.
"""

import json
import math
import sys

# Contingency-table cells, keyed as "<event present><squirrel>".
_CELL_KEYS = ("11", "10", "01", "00")


def parse_file(filename):
    """Load and return the JSON document stored in *filename*.

    The file is read as UTF-8 and closed before returning.
    """
    with open(filename, "rt", encoding="utf-8") as handle:
        return json.load(handle)


def set_events(data):
    """Return the distinct events found in *data* as a list.

    Events are listed in order of first appearance, so the result is
    deterministic for a given input. Each entry of *data* must have an
    ``"events"`` iterable of hashable items.
    """
    seen = {}
    for entry in data:
        for event in entry["events"]:
            seen.setdefault(event, None)
    return list(seen)


def calc_metrics(data, distinct_events):
    """Build a 2x2 contingency table for every event in *distinct_events*.

    Returns a mapping ``event -> {"11": n, "10": n, "01": n, "00": n}``.
    The first digit of a key says whether the event occurred in an entry
    and the second digit is ``int(entry["squirrel"])``. The tables are
    derived from *distinct_events*, so no event names are hard-coded.
    """
    tables = {
        event: {key: 0 for key in _CELL_KEYS} for event in distinct_events
    }
    for entry in data:
        # A set makes the per-event membership test O(1).
        todays_events = set(entry["events"])
        squirrel = int(entry["squirrel"])
        for event, table in tables.items():
            key = "{}{}".format(int(event in todays_events), squirrel)
            # .get covers keys outside _CELL_KEYS (a flag other than 0/1).
            table[key] = table.get(key, 0) + 1
    return tables


def calc_phi(dict):
    """Return a mapping ``event -> phi coefficient`` for the given tables.

    *dict* maps each event to its contingency table as produced by
    ``calc_metrics``; cells that are missing count as 0. The argument is
    not modified. When the denominator is 0 (a row or column of the table
    is empty) phi is undefined and 0.0 is reported instead of raising
    ``ZeroDivisionError``.
    """
    # The parameter name shadows the builtin; it is kept only so existing
    # keyword callers keep working. Use a clearer local name from here on.
    tables = dict
    phi_by_event = {}
    for event, table in tables.items():
        n11 = table.get("11", 0)
        n10 = table.get("10", 0)
        n01 = table.get("01", 0)
        n00 = table.get("00", 0)
        numerator = n11 * n00 - n10 * n01
        denominator = math.sqrt(
            (n11 + n10) * (n00 + n01) * (n11 + n01) * (n00 + n10)
        )
        phi_by_event[event] = numerator / denominator if denominator else 0.0
    return phi_by_event


def main(filename):
    """Print the phi coefficient of every event in *filename* as JSON."""
    data = parse_file(filename)
    events = set_events(data)
    tables = calc_metrics(data, events)
    print(json.dumps(calc_phi(tables), indent=10))


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: {} JOURNAL_JSON".format(sys.argv[0]))
    main(sys.argv[1])