# Build a word -> integer index from `sentences` and persist it as JSON.
#
# NOTE(review): `sentences` is expected to be an iterable of strings defined
# earlier in the file -- confirm against the surrounding code.

# Replace punctuation with a space so that tokens such as "word," and "word"
# collapse to the same entry. The characters must be inside a character class:
# a bare '.,:?{}' would be read as "any char, comma, optional colon, '{}'"
# and match almost nothing.
sentences = [re.sub(r'[.,:?{}]', ' ', sentence) for sentence in sentences]

# Join everything into a single string so the vocabulary can be taken in one
# whitespace split.
corpus = " ".join(sentences)

# Unique tokens of the cleaned corpus.
words = set(corpus.split())

# Enumerate in sorted order: set iteration order for strings varies between
# interpreter runs (hash randomization), which would make the saved indices
# non-reproducible.
word_index = {word: index for index, word in enumerate(sorted(words))}

# json.dump escapes non-ASCII by default; the explicit encoding just makes
# the on-disk format independent of the platform locale.
with open('word_index.json', 'w', encoding='utf-8') as fh:
    json.dump(word_index, fh)