Last active
July 28, 2018 22:34
-
-
Save Helw150/51c5f78f920e659866a295e283a8883f to your computer and use it in GitHub Desktop.
Revisions
-
Helw150 revised this gist
Jul 28, 2018. 1 changed file with 1 addition and 2 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,6 +1,5 @@ #!/usr/bin/env python """Counts the number of times a word occurs in a very large text file""" from __future__ import print_function import os -
Helw150 created this gist
Jul 28, 2018 .There are no files selected for viewing
#!/usr/bin/env python
"""Counts the number of times each word occurs in a very large text file."""
from __future__ import print_function
import os
import sys
import argparse
import multiprocessing
from tqdm import tqdm
from collections import Counter
from itertools import zip_longest


def grouper(n, iterable, padvalue=None):
    """Yield tuples of *n* consecutive items from *iterable*.

    The last tuple is padded with *padvalue* when the iterable is
    exhausted mid-group (standard zip_longest grouper recipe).
    """
    return zip_longest(*[iter(iterable)] * n, fillvalue=padvalue)


def process_chunk(chunk):
    """Count the words in one line of text.

    Returns a ``(Counter, size)`` pair, where *size* approximates the
    number of bytes of raw text consumed. For a padding/empty chunk
    (``None`` or ``""``) returns ``(None, 0)``.
    """
    if chunk:
        # sys.getsizeof includes the CPython str object header (~40+
        # bytes); subtract a rough constant so the progress bar tracks
        # file content, not interpreter bookkeeping.
        size = sys.getsizeof(chunk) - 40
        return (Counter(chunk.split()), size)
    # grouper() pads the final group with None; contribute nothing.
    return (None, 0)


def main(arguments):
    """Count word frequencies in a large text file using a process pool.

    Parameters: *arguments* — argv-style list (e.g. ``sys.argv[1:]``)
    naming an input file and an output file.
    Side effects: writes one "word count" line per distinct word to the
    output file, and renders a byte-based progress bar on stderr.
    Returns None (interpreted as exit status 0 by ``sys.exit``).
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'infile', help="Input file", type=argparse.FileType('r'))
    parser.add_argument(
        'outfile', help="Output file", type=argparse.FileType('w'))
    args = parser.parse_args(arguments)

    filesize = os.path.getsize(args.infile.name)
    # Start from an empty Counter so an empty input file still produces
    # (empty) output instead of crashing on None.items().
    total_word_counts = Counter()
    # Context managers make sure the worker processes are terminated and
    # the progress bar is closed even if an exception escapes the loop
    # (the original never closed the pool).
    with multiprocessing.Pool(8) as pool, tqdm(total=filesize) as pbar:
        for chunk in grouper(1000, args.infile):
            results = pool.map(process_chunk, chunk)
            word_counts, sizes = zip(*results)
            pbar.update(sum(sizes))
            # The final group is padded with None, for which
            # process_chunk returns (None, 0). Summing None into a
            # Counter raises TypeError, so filter those out (bug fix).
            total_word_counts += sum(
                (wc for wc in word_counts if wc is not None), Counter())

    for word, count in total_word_counts.items():
        args.outfile.write("{} {}\n".format(word, count))


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))