
@Helw150
Last active July 28, 2018 22:34

Revisions

  1. Helw150 revised this gist Jul 28, 2018. 1 changed file with 1 addition and 2 deletions.
    3 changes: 1 addition & 2 deletions large-file-processing.py
    @@ -1,6 +1,5 @@
     #!/usr/bin/env python
    -"""Clean english text for NLP use
    -"""
    +"""Counts the number of times a word occurs in a very large text file"""

     from __future__ import print_function
     import os
  2. Helw150 created this gist Jul 28, 2018.
    62 changes: 62 additions & 0 deletions large-file-processing.py
    @@ -0,0 +1,62 @@
    #!/usr/bin/env python
    """Clean english text for NLP use
    """

    from __future__ import print_function
    import os
    import sys
    import argparse
    import textacy  # imported in the gist but not used below
    import multiprocessing
    from tqdm import tqdm
    from collections import Counter
    from itertools import zip_longest


    def grouper(n, iterable, padvalue=None):
        # Batch the iterable into tuples of n items, padding the final
        # tuple with padvalue when the input runs out.
        return zip_longest(*[iter(iterable)] * n, fillvalue=padvalue)


    def process_chunk(chunk):
        # Each chunk is a single line of the input file, or the None
        # padding value that grouper appends to the final batch.
        if chunk:
            # Roughly the overhead of a Python String
            size = sys.getsizeof(chunk) - 40
            wordcount = Counter(chunk.split())
            return (wordcount, size)
        else:
            return (None, 0)


    def main(arguments):

        parser = argparse.ArgumentParser(
            description=__doc__,
            formatter_class=argparse.RawDescriptionHelpFormatter)
        parser.add_argument(
            'infile', help="Input file", type=argparse.FileType('r'))
        parser.add_argument(
            'outfile', help="Output file", type=argparse.FileType('w'))

        args = parser.parse_args(arguments)
        p = multiprocessing.Pool(8)

        filesize = os.path.getsize(args.infile.name)

        total_word_counts = None
        # Track progress in approximate bytes against the total file size.
        with tqdm(total=filesize) as pbar:
            for chunk in grouper(1000, args.infile):
                results = p.map(process_chunk, chunk)
                word_counts, sizes = zip(*results)
                pbar.update(sum(sizes))
                # Skip the None results produced for grouper's padding,
                # which would otherwise break the Counter sum.
                chunk_wc = sum(
                    (wc for wc in word_counts if wc is not None), Counter())
                if total_word_counts is not None:
                    total_word_counts += chunk_wc
                else:
                    total_word_counts = chunk_wc

        for item in total_word_counts.items():
            args.outfile.write("{} {}\n".format(*item))


    if __name__ == '__main__':
        sys.exit(main(sys.argv[1:]))
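
For reference, a minimal sketch of how the grouper/process_chunk pairing behaves, using hypothetical sample lines rather than a real corpus: grouper pads its final batch with None, which is why per-line results must be filtered before summing the Counters.

    from collections import Counter
    from itertools import zip_longest

    def grouper(n, iterable, padvalue=None):
        return zip_longest(*[iter(iterable)] * n, fillvalue=padvalue)

    lines = ["the cat sat", "on the mat", "the end"]  # hypothetical input
    for batch in grouper(2, lines):
        print(batch)
    # ('the cat sat', 'on the mat')
    # ('the end', None)

    # Merging per-line Counters while skipping the None padding:
    batch = ("the end", None)
    counts = [Counter(l.split()) if l else None for l in batch]
    print(sum((c for c in counts if c is not None), Counter()))
    # Counter({'the': 1, 'end': 1})

Assuming the reconstructed file is saved as large-file-processing.py, it would be invoked as python large-file-processing.py corpus.txt counts.txt (the corpus and output names here are hypothetical), writing one "word count" pair per line to the output file.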