rede_dia_brasil.py
@JonnatasCabral, created June 20, 2018

# pip install python-Levenshtein gensim
import csv
import pickle
import re
import unicodedata
from collections import defaultdict

from gensim import corpora
from gensim import models
from gensim import similarities
from Levenshtein import distance
from Levenshtein import ratio


class attrdict(dict):
    """dict subclass whose keys are also readable/writable as attributes."""

    def __setattr__(self, key, value):
        self[key] = value

    def __dir__(self):
        return self.keys()

    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError:
            raise AttributeError(key)

    def __setstate__(self, state):
        # no-op: attributes live in the mapping itself, so there is no
        # extra instance state to restore on unpickling
        pass
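
# Illustrative usage of attrdict (values hypothetical):
#
#   d = attrdict()
#   d.tokens = [['maria', 'silva']]   # stored as d['tokens']
#   d.tokens is d['tokens']           # -> True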


class IndexMixin:
    _join_str = ' '
    _distance = True

    def __init__(self):
        self.sections = defaultdict(attrdict)

    def __repr__(self):
        return f'{self.__class__.__name__}(sections=[{len(self.sections)}])'

    def normalize(self, value):
        # lowercase and strip accents (e.g. 'João' -> 'joao')
        try:
            return unicodedata.normalize(
                'NFKD', str(value).lower()
            ).encode('ascii', 'ignore').decode('ascii')
        except Exception:
            return value

    def tokenize(self, documents):
        # normalize() lowercases first, so only the [a-z]+ branch can match
        return [
            [
                word for word in re.findall(
                    r'[A-Z]+|[a-z]+', self.normalize(doc)
                )
            ] for doc in documents
        ]

    def fit(self, sec, documents, **kwargs):
        sec = self.sections[sec]
        sec.documents = documents
        sec.tokens = self.tokenize(documents)
        sec.stems = [
            self._join_str.join(token) for token in sec.tokens
        ]
        sec.dictionary = corpora.Dictionary(sec.tokens)
        sec.corpus = [sec.dictionary.doc2bow(token) for token in sec.tokens]
        sec.tfidf = models.TfidfModel(sec.corpus)
        sec.lsi = models.LsiModel(
            sec.tfidf[sec.corpus],
            id2word=sec.dictionary,
            num_topics=kwargs.pop('num_topics', 2000),
            power_iters=kwargs.pop('power_iters', 2)
        )
        sec.index = similarities.MatrixSimilarity(sec.lsi[sec.corpus])
        return self
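
    # Sketch of what fit() builds per section (document values hypothetical):
    #
    #   idx = Index().fit('nomes', ['maria silva', 'joao souza'])
    #   sec = idx.sections['nomes']
    #   sec.corpus  # bag-of-words vector per document
    #   sec.lsi     # LSI model over the TF-IDF-weighted corpus
    #   sec.index   # MatrixSimilarity queried by similar()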

    def similar(self, sec, document, **kwargs):
        sec = self.sections[sec]
        token = self.tokenize([document])[0]
        stem = self._join_str.join(token)
        if not stem:
            return []
        bow = sec.dictionary.doc2bow(token)
        lsi = sec.lsi[bow]
        index = sec.index[lsi]
        # weights for the LSI similarity, edit distance and Levenshtein ratio
        w = [
            kwargs.pop('sim_weight', 1),
            kwargs.pop('dist_weight', 1),
            kwargs.pop('ratio_weight', 1)
        ]
        min_score = kwargs.pop('min_score', 0.95)

        results = (
            (
                i, sec.stems[i], sim,
                distance(stem, sec.stems[i]) if w[1] > 0 else 0,
                ratio(stem, sec.stems[i]) if w[2] > 0 else 0
            ) for i, sim in enumerate(index)
        )
        score = (
            (sec.documents[i], j, s, d, r, (
                s * w[0] + ((1 / d) * w[1] if d > 0 else w[1]) + r * w[2]
            ) / sum(w)) for i, j, s, d, r in results
        )

        return sorted((
            i for i in score if i[-1] >= min_score
        ), key=lambda x: -x[-1])
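
    # The blended score computed above, per candidate with LSI similarity s,
    # Levenshtein distance d and ratio r against weights w, is
    #
    #   (s * w[0] + (w[1] / d if d > 0 else w[1]) + r * w[2]) / sum(w)
    #
    # and only candidates scoring at least min_score are returned, best first.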

    def match(self, sec, document, **kwargs):
        # returns (best match, True) only when the match is unambiguous
        similar = self.similar(sec, document, **kwargs)
        if similar:
            return (similar[0][0], len(similar) == 1)
        return (None, False)

    def save(self, path):
        with open(path, 'wb') as f:
            pickle.dump(self, f)
        return self

    def load(self, path):
        with open(path, 'rb') as f:
            instance = pickle.load(f)
        self.__dict__ = instance.__dict__
        return self
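
    # Hypothetical persistence round trip via save()/load():
    #
    #   Index().fit('nomes', nomes).save('nomes.idx')
    #   idx = Index().load('nomes.idx')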


class Index(IndexMixin):

    def get_words(self, sec):
        sec = self.sections[sec]
        return list({word for token in sec.tokens for word in token})
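
    # get_words returns the deduplicated vocabulary of a fitted section, e.g.
    #
    #   idx.get_words('nomes')  # -> ['maria', 'silva', ...] (order not guaranteed)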


def get_names(file_name, key_name):
    with open(file_name) as f:
        return [row[key_name] for row in csv.DictReader(f)]


def get_advogados_file(file_name):
    with open(file_name) as f:
        return [
            {'nome': row['nome_parte'], 'processo_mongoid': row['processo_mongoid']}
            for row in csv.DictReader(f)
        ]
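
# Expected inputs, as read by the helpers above:
#   etl-nomes.csv        -> columns 'nome_parte', 'processo_mongoid'
#   cliente-por-nome.csv -> column 'Nome Empregado'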


if __name__ == '__main__':
    final_names = get_names('etl-nomes.csv', 'nome_parte')
    index = Index()
    index.fit('nomes', final_names)

    client_names = get_names('cliente-por-nome.csv', 'Nome Empregado')
    matches = []
    for name in client_names:
        nome_final, status = index.match('nomes', name)
        if status:
            matches.append((name, nome_final))
            print(name, nome_final)
    print(len(matches))

    names_matched = list(set(matches))
    print('matched: {}'.format(len(names_matched)))
    d = defaultdict(list)
    advogados_dict = get_advogados_file('etl-nomes.csv')
    for name in advogados_dict:
        d[name['nome']].append(name['processo_mongoid'])

    result = []

    for fn in names_matched:
        for processo_mongoid in d[fn[1]]:
            result.append(
                {'nome': fn[0], 'processo_mongoid': processo_mongoid, 'nome_final': fn[1]}
            )

    with open('resultado_final.csv', 'w', newline='') as f:
        csv_final = csv.DictWriter(f, fieldnames=['nome', 'processo_mongoid', 'nome_final'])
        csv_final.writeheader()
        csv_final.writerows(result)