
@Helw150
Last active July 28, 2018 22:34

Revisions

  1. Helw150 revised this gist Jul 28, 2018. 1 changed file with 1 addition and 2 deletions.
    3 changes: 1 addition & 2 deletions large-file-processing.py
    @@ -1,6 +1,5 @@
     #!/usr/bin/env python
    -"""Clean english text for NLP use
    -"""
    +"""Counts the number of times a word occurs in a very large text file"""

     from __future__ import print_function
     import os
  2. Helw150 created this gist Jul 28, 2018.
    62 changes: 62 additions & 0 deletions large-file-processing.py
    @@ -0,0 +1,62 @@
    #!/usr/bin/env python
    """Clean english text for NLP use
    """

    from __future__ import print_function
    import os
    import sys
    import argparse
    import textacy  # imported in the gist but not used below
    import multiprocessing
    from tqdm import tqdm
    from collections import Counter
    from itertools import zip_longest


    def grouper(n, iterable, padvalue=None):
        # Batch the iterable into tuples of n items, padding the final
        # tuple with padvalue when the input runs out.
        return zip_longest(*[iter(iterable)] * n, fillvalue=padvalue)


    def process_chunk(chunk):
        # Each chunk is a single line of the input file, or the None
        # padding value that grouper appends to the final batch.
        if chunk:
            # Roughly the overhead of a Python String
            size = sys.getsizeof(chunk) - 40
            wordcount = Counter(chunk.split())
            return (wordcount, size)
        else:
            return (None, 0)


    def main(arguments):

        parser = argparse.ArgumentParser(
            description=__doc__,
            formatter_class=argparse.RawDescriptionHelpFormatter)
        parser.add_argument(
            'infile', help="Input file", type=argparse.FileType('r'))
        parser.add_argument(
            'outfile', help="Output file", type=argparse.FileType('w'))

        args = parser.parse_args(arguments)
        p = multiprocessing.Pool(8)

        filesize = os.path.getsize(args.infile.name)

        total_word_counts = None
        # Track progress in approximate bytes against the total file size.
        with tqdm(total=filesize) as pbar:
            for chunk in grouper(1000, args.infile):
                results = p.map(process_chunk, chunk)
                word_counts, sizes = zip(*results)
                pbar.update(sum(sizes))
                # Skip the None results produced for grouper's padding,
                # which would otherwise break the Counter sum.
                chunk_wc = sum(
                    (wc for wc in word_counts if wc is not None), Counter())
                if total_word_counts is not None:
                    total_word_counts += chunk_wc
                else:
                    total_word_counts = chunk_wc

        for item in total_word_counts.items():
            args.outfile.write("{} {}\n".format(*item))


    if __name__ == '__main__':
        sys.exit(main(sys.argv[1:]))
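
For reference, a minimal sketch of how the grouper/process_chunk pairing behaves, using hypothetical sample lines rather than a real corpus: grouper pads its final batch with None, which is why per-line results must be filtered before summing the Counters.

    from collections import Counter
    from itertools import zip_longest

    def grouper(n, iterable, padvalue=None):
        return zip_longest(*[iter(iterable)] * n, fillvalue=padvalue)

    lines = ["the cat sat", "on the mat", "the end"]  # hypothetical input
    for batch in grouper(2, lines):
        print(batch)
    # ('the cat sat', 'on the mat')
    # ('the end', None)

    # Merging per-line Counters while skipping the None padding:
    batch = ("the end", None)
    counts = [Counter(l.split()) if l else None for l in batch]
    print(sum((c for c in counts if c is not None), Counter()))
    # Counter({'the': 1, 'end': 1})

Assuming the reconstructed file is saved as large-file-processing.py, it would be invoked as python large-file-processing.py corpus.txt counts.txt (the corpus and output names here are hypothetical), writing one "word count" pair per line to the output file.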