#!/usr/bin/env python
# coding: utf-8

import collections
import glob
import os
import re
import string


def merge_frequency_files(paths, max_token_length=30):
    """Sum per-file token frequencies into a single counter.

    Each input file is read line by line; every non-blank line is split on
    whitespace into exactly two fields, ``token`` and an integer
    ``frequency``.

    Args:
        paths: Iterable of paths to frequency files.
        max_token_length: Tokens longer than this many characters are
            dropped. Defaults to 30.

    Returns:
        A ``collections.Counter`` mapping each kept token to its frequency
        summed over all files.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            whitespace-separated fields, or the second field is not an
            integer.
    """
    counter = collections.Counter()
    for i, path in enumerate(paths):
        with open(path) as handle:
            # Progress report: current file, distinct tokens so far, index.
            print(path, len(counter), i)
            for line in handle:
                fields = line.split()
                if not fields:
                    # Blank lines (e.g. a trailing empty line) carry no
                    # data; skip them instead of failing the unpack below.
                    continue
                token, frequency = fields
                frequency = int(frequency)
                if len(token) > max_token_length:
                    continue
                counter[token] += frequency
    return counter


def write_frequencies(counter, path):
    """Write ``counter`` to ``path`` as one ``token<TAB>frequency`` per line.

    Args:
        counter: Mapping of token to frequency.
        path: Destination file; it is created or overwritten.
    """
    with open(path, 'w') as output:
        # items() exists on both Python 2 and 3; iteritems() is Python 2
        # only and raises AttributeError on Python 3.
        for key, value in counter.items():
            output.write('%s\t%s\n' % (key, value))


if __name__ == '__main__':

    # Disabled earlier step, kept for reference. It appears to be what
    # produced the per-file '<path>.freq' inputs merged below (it lowercases
    # each line, strips non-word characters and underscores from every
    # token, and counts what remains).
    #
    # pattern = re.compile('[\W_]+')
    # for path in glob.glob('/media/mtc/Data/tmp/gberg-*'):
    #     counter = collections.Counter()
    #     outfile = '%s.freq' % path
    #     print(path, outfile)
    #     if os.path.exists(outfile):
    #         continue
    #     with open(path) as handle:
    #         for line in handle:
    #             line = line.lower()
    #             for token in line.split():
    #                 token = pattern.sub('', token)
    #                 if token:
    #                     counter[token] += 1
    #     with open('%s.freq' % path, 'w') as output:
    #         for key, value in counter.iteritems():
    #             output.write('%s\t%s\n' % (key, value))

    counter = merge_frequency_files(
        glob.glob('/media/mtc/Data/tmp/gberg-*freq'))
    write_frequencies(counter, './gutenberg.freq')