#!/usr/bin/env python
"""Counts the number of times each word occurs in a very large text file."""

import argparse
import multiprocessing
import os
import sys
from collections import Counter
from itertools import zip_longest

from tqdm import tqdm


def grouper(n, iterable, padvalue=None):
    """Collect lines from *iterable* into fixed-size chunks of *n*,
    padding the final, short chunk with *padvalue*."""
    return zip_longest(*[iter(iterable)] * n, fillvalue=padvalue)


def process_chunk(chunk):
    """Count the words in one line; return (Counter, approximate byte size)."""
    if chunk:
        # Subtract roughly the per-object overhead of a Python str so the
        # progress bar tracks bytes of text rather than interpreter bookkeeping.
        size = sys.getsizeof(chunk) - 40
        wordcount = Counter(chunk.split())
        return (wordcount, size)
    # The final chunk from grouper() is padded with None; return an empty
    # Counter (not None) so the later sum() over counters does not raise.
    return (Counter(), 0)


def main(arguments):
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('infile', help="Input file",
                        type=argparse.FileType('r'))
    parser.add_argument('outfile', help="Output file",
                        type=argparse.FileType('w'))
    args = parser.parse_args(arguments)

    pool = multiprocessing.Pool(8)
    filesize = os.path.getsize(args.infile.name)

    total_word_counts = Counter()
    with tqdm(total=filesize) as pbar:
        for chunk in grouper(1000, args.infile):
            results = pool.map(process_chunk, chunk)
            word_counts, sizes = zip(*results)
            pbar.update(sum(sizes))
            total_word_counts += sum(word_counts, Counter())
    pool.close()
    pool.join()

    for word, count in total_word_counts.items():
        args.outfile.write("{} {}\n".format(word, count))


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
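
# Example invocation (the file names below are hypothetical; any readable
# text file works as input):
#
#   $ python wordcount.py big_corpus.txt counts.txt
#
# Each output line is a word followed by its total count, separated by a
# space, in arbitrary (hash) order. Pipe through `sort -k2 -rn` if a
# most-frequent-first listing is wanted.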