"""Attach COCO annotation ids to caption/image datasets.

Loads train/val/test ``.npy`` dicts of captions and image features, matches
each image's five captions against the COCO ``captions_val2014`` annotations
to recover annotation ids and image ids, and saves the augmented dicts.
"""
import collections
import json
import os
import string

import numpy as np
from tqdm import tqdm

DATA_PATH = "data"

################################################################################################
# IMPORT NUMPY FILES
################################################################################################

# allow_pickle=True is required on numpy >= 1.16.3: each file stores a pickled
# dict inside a 0-d object array (accessed below via .item()).
np_train_data = np.load(os.path.join(DATA_PATH, 'train_data.npy'), allow_pickle=True)
np_val_data = np.load(os.path.join(DATA_PATH, 'val_data.npy'), allow_pickle=True)
np_test_data = np.load(os.path.join(DATA_PATH, 'test_data.npy'), allow_pickle=True)


def unpack_data(np_data):
    """Extract the {'caps', 'ims'} dict stored inside a 0-d object array.

    The original code looped ``len(np_data.item())`` times re-assigning the
    same two keys on every iteration; a single extraction is equivalent and
    calls ``.item()`` (which unpickles the dict) only once.
    """
    stored = np_data.item()
    data = collections.OrderedDict()
    data['caps'] = stored['caps']
    data['ims'] = stored['ims']
    return data


train_data = unpack_data(np_train_data)
val_data = unpack_data(np_val_data)
test_data = unpack_data(np_test_data)

################################################################################################
# IMPORT JSON FILES
################################################################################################

# NOTE(review): coco_instances_val is loaded but never used below — kept in
# case other tooling relies on this script defining it; confirm and drop.
with open(os.path.join(DATA_PATH, 'instances_val2014.json')) as json_file:
    coco_instances_val = json.load(json_file)

with open(os.path.join(DATA_PATH, 'captions_val2014.json')) as json_file:
    coco_caption_val = json.load(json_file)

################################################################################################
# HELPER FUNCTIONS
################################################################################################


def group_captions(data):
    """Group the flat caption list into rows of 5 captions per image."""
    return np.array(data['caps']).reshape(-1, 5)


def process_annotations(annotations):
    """Lowercase each annotation's caption and strip punctuation.

    Adds a 'processed' key to each annotation dict (mutates the dicts in
    place) and returns the list of annotations.
    """
    # str.translate removes all punctuation in one C-level pass instead of a
    # per-character generator join.
    strip_punct = str.maketrans('', '', string.punctuation)
    result = []
    for ann in annotations:
        ann["processed"] = ann["caption"].translate(strip_punct).lower()
        result.append(ann)
    return result


def get_matches(captions, lookup):
    """For each row of captions, find the first matching annotation.

    Returns a list parallel to ``captions``: the matching annotation dict,
    or None when none of the row's captions is found in ``lookup``.
    """
    result = []
    for row in tqdm(captions):
        for caption in row:
            # If the first caption isn't in the lookup (rare), try the next.
            match = get_match(query=caption, lookup=lookup)
            if match:
                result.append(match)
                break
        else:
            result.append(None)
    return result


def get_match(query, lookup):
    """Return the first annotation whose processed caption contains ``query``.

    Linear scan over ``lookup``; returns None when nothing matches.
    """
    # The original assumed ``query`` is bytes (str(query, "utf8") raises
    # TypeError on str); accept both representations.
    q = query.decode("utf8") if isinstance(query, bytes) else str(query)
    for ann in lookup:
        if q in ann["processed"]:
            return ann
    return None

################################################################################################
# MAIN FUNCTIONS
################################################################################################


def get_ids(coco_caption, data):
    """Use the captions to find annotation ids and image ids.

    Returns ``(ids, image_ids, caps)``. Unmatched rows produce None entries
    (the original raised TypeError on a None match), keeping all three lists
    aligned with the rows of ``caps``.
    """
    caps = group_captions(data)
    lookup = process_annotations(coco_caption["annotations"])
    matches = get_matches(caps, lookup)
    ids = [m["id"] if m is not None else None for m in matches]
    image_ids = [m["image_id"] if m is not None else None for m in matches]
    return ids, image_ids, caps


def add_ids(coco_caption, data):
    """Add 'ids' and 'image_ids' columns to ``data`` and group its captions."""
    ids, image_ids, caps = get_ids(coco_caption, data)
    data["caps"] = caps
    data["ids"] = ids
    data["image_ids"] = image_ids
    return data

################################################################################################
# ADD COLUMNS IDS AND IMAGE IDS, AND GROUP CAPTIONS
################################################################################################

# NOTE(review): all three splits are matched against the *val2014* captions;
# train/test presumably need their own annotation files — confirm upstream.
val_data = add_ids(coco_caption_val, val_data)
train_data = add_ids(coco_caption_val, train_data)
test_data = add_ids(coco_caption_val, test_data)

np.save("val_data_with_ids", val_data)
np.save("train_data_with_ids", train_data)
np.save("test_data_with_ids", test_data)