# -*- coding: utf-8 -*-
""" Use torchMoji to encode texts as emotion feature embeddings.

Reads pipe-separated "audio_path|text" filelists, runs each text through the
pretrained torchMoji feature-encoding model (the 2304-dim penultimate layer of
the emoji classifier; the 64 emoji classes it was trained on are shown in
emoji_overview.png at the root of the torchMoji repo), and saves one .npy
embedding per input file next to the original file.
"""
from __future__ import print_function, division, unicode_literals

import json
import numpy as np
from tqdm import tqdm

import example_helper  # kept from the torchMoji example scripts; adjusts sys.path
from utils.torchmoji.sentence_tokenizer import SentenceTokenizer
from utils.torchmoji.model_def import torchmoji_feature_encoding
from utils.torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH


def files_to_list(filename):
    """Read a text file of filenames and return its lines as a list of stripped strings."""
    with open(filename, encoding='utf-8') as f:
        files = f.readlines()
    files = [f.rstrip() for f in files]
    return files


def top_elements(array, k):
    """Return the indices of the k largest elements of array, sorted descending."""
    ind = np.argpartition(array, -k)[-k:]
    return ind[np.argsort(array[ind])][::-1]


INPUT_PATHS = [
    '/media/cookie/Samsung 860 QVO/ClipperDatasetV2/filelists/train_taca2.txt',
    '/media/cookie/Samsung 860 QVO/ClipperDatasetV2/filelists/validation_taca2.txt',
]
BATCH_SIZE = 50

# Build the dataset from the filelists: each line is "audio_path|transcript".
dataset = [j.split("|") for i in [files_to_list(x) for x in INPUT_PATHS] for j in i]
paths = [x[0] for x in dataset]
texts = [x[1] for x in dataset]

# Strip dataset-specific marker characters that torchMoji should not see.
filtered_chars = ["☺", "␤"]
for i, text in enumerate(texts):
    for filtered_char in filtered_chars:
        texts[i] = texts[i].replace(filtered_char, "")

data = list(zip(paths, texts))

maxlen = 120  # maximum number of tokens per sentence

print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)

st = SentenceTokenizer(vocabulary, maxlen)

print('Loading model from {}.'.format(PRETRAINED_PATH))
model = torchmoji_feature_encoding(PRETRAINED_PATH)
print(model)

print('Running predictions.')
for i in tqdm(range(0, len(data), BATCH_SIZE), smoothing=0.01):
    batch = data[i:i + BATCH_SIZE]
    paths = [x[0] for x in batch]
    texts = [x[1] for x in batch]
    tokenized, _, _ = st.tokenize_sentences(texts)
    embedding = model(tokenized)  # numpy array of shape [batch, embed_dim]
    for j in range(len(embedding)):
        # Save the embedding next to the source file, replacing its extension.
        filepath_without_ext = ".".join(paths[j].split(".")[:-1])
        file_path_safe = filepath_without_ext[:999]  # guard against overly long paths
        np.save(file_path_safe + "_.npy", embedding[j])
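
# ---------------------------------------------------------------------------
# Usage sketch (not executed by this script): the embeddings written above can
# be read back with np.load in a downstream data loader. The path below is
# hypothetical; substitute any "<audio_path_without_ext>_.npy" file produced
# by the loop, e.g.:
#
#   emotion_embedding = np.load('/path/to/clip_0001_.npy')
#   print(emotion_embedding.shape)  # expected (2304,) for torchMoji feature encodings
# ---------------------------------------------------------------------------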