# Build the set of distinct tokens in the WikiText training split and report
# its size. The first ten tokenised lines are echoed as a sanity check.
vocab = set()

# A context manager guarantees the file handle is closed even if reading
# fails; the encoding is pinned so the result does not depend on the
# platform's default locale (assumes the corpus is UTF-8 -- TODO confirm).
with open('wiki.train.tokens', encoding='utf-8') as corpus:
    for i, line in enumerate(corpus):
        # Split on single spaces only (not arbitrary whitespace) and drop the
        # empty strings produced by leading or repeated spaces. The line's
        # trailing newline is NOT stripped, so it stays part of the last
        # token; presumably the counts quoted below rely on this -- verify
        # before changing the tokenisation.
        words = [x for x in line.split(' ') if x]
        vocab.update(words)
        if i < 10:
            print(words)

print('Vocab size:', len(vocab))
# Vocab sizes noted by the original author:
#   33,278 for WikiText-2
#   267,735 for WikiText-103