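"""Fuzzy name matching with gensim LSI + Levenshtein.

Builds a latent-semantic index over a reference list of names, scores
candidates by a weighted mix of cosine similarity, inverse edit distance and
Levenshtein ratio, then matches client names against `etl-nomes.csv` and
writes the matched pairs (with their `processo_mongoid`) to
`resultado_final.csv`.
"""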
# pip install python-Levenshtein gensim
from collections import defaultdict

from gensim import corpora
from gensim import models
from gensim import similarities
from Levenshtein import distance
from Levenshtein import ratio

import csv
import pickle
import re
import unicodedata


class attrdict(dict):
    """Dict whose keys are also readable/writable as attributes."""

    def __setattr__(self, key, value):
        self[key] = value

    def __dir__(self):
        return self.keys()

    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError:
            raise AttributeError(key)

    def __setstate__(self, state):
        # No-op so unpickling does not route through __getattr__
        # before the underlying dict is populated.
        pass


class IndexMixin:
    _join_str = ' '
    _distance = True

    def __init__(self):
        self.sections = defaultdict(attrdict)

    def __repr__(self):
        return f'{self.__class__.__name__}(sections=[{len(self.sections)}])'

    def normalize(self, value):
        # Lowercase and strip accents (NFKD decomposition, drop non-ASCII).
        try:
            return unicodedata.normalize(
                'NFKD', str(value).lower()
            ).encode('ascii', 'ignore').decode('ascii')
        except Exception:
            return value

    def tokenize(self, documents):
        # Split each normalized document into alphabetic words.
        return [
            re.findall(r'[A-Z]+|[a-z]+', self.normalize(doc))
            for doc in documents
        ]

    def fit(self, sec, documents, **kwargs):
        # Build the TF-IDF -> LSI pipeline and a similarity index
        # for one named section.
        sec = self.sections[sec]
        sec.documents = documents
        sec.tokens = self.tokenize(documents)
        sec.stems = [self._join_str.join(token) for token in sec.tokens]
        sec.dictionary = corpora.Dictionary(sec.tokens)
        sec.corpus = [sec.dictionary.doc2bow(token) for token in sec.tokens]
        sec.tfidf = models.TfidfModel(sec.corpus)
        sec.lsi = models.LsiModel(
            sec.tfidf[sec.corpus],
            id2word=sec.dictionary,
            num_topics=kwargs.pop('num_topics', 2000),
            power_iters=kwargs.pop('power_iters', 2)
        )
        sec.index = similarities.MatrixSimilarity(sec.lsi[sec.corpus])
        return self

    def similar(self, sec, document, **kwargs):
        # Rank fitted documents against `document` by a weighted mix of LSI
        # cosine similarity, inverse Levenshtein distance and Levenshtein ratio.
        sec = self.sections[sec]
        token = self.tokenize([document])[0]
        stem = self._join_str.join(token)
        if not stem:
            return []
        bow = sec.dictionary.doc2bow(token)
        lsi = sec.lsi[bow]
        index = sec.index[lsi]
        w = [
            kwargs.pop('sim_weight', 1),
            kwargs.pop('dist_weight', 1),
            kwargs.pop('ratio_weight', 1)
        ]
        min_score = kwargs.pop('min_score', 0.95)
        results = (
            (
                i,
                sec.stems[i],
                sim,
                distance(stem, sec.stems[i]) if w[1] > 0 else 0,
                ratio(stem, sec.stems[i]) if w[2] > 0 else 0
            )
            for i, sim in enumerate(index)
        )
        score = (
            (
                sec.documents[i], j, s, d, r,
                (
                    s * w[0]
                    + ((1 / d) * w[1] if d > 0 else w[1])
                    + r * w[2]
                ) / sum(w)
            )
            for i, j, s, d, r in results
        )
        # Keep only candidates scoring at least min_score, best first.
        return sorted(
            (i for i in score if i[-1] >= min_score),
            key=lambda x: x[-1],
            reverse=True
        )

    def match(self, sec, document, **kwargs):
        # Return (best match, whether it was the only candidate).
        similar = self.similar(sec, document, **kwargs)
        if similar:
            return (similar[0][0], len(similar) == 1)
        return (None, False)

    def save(self, path):
        with open(path, 'wb') as f:
            pickle.dump(self, f)
        return self

    def load(self, path):
        with open(path, 'rb') as f:
            instance = pickle.load(f)
        self.__dict__ = instance.__dict__
        return self


class Index(IndexMixin):
    def get_words(self, sec):
        sec = self.sections[sec]
        return list({word for token in sec.tokens for word in token})


def get_names(file_name, key_name):
    # Read one column from a CSV file.
    with open(file_name) as f:
        return [row[key_name] for row in csv.DictReader(f)]


def get_advogados_file(file_name):
    # Read (nome_parte, processo_mongoid) pairs from a CSV file.
    with open(file_name) as f:
        return [
            {'nome': row['nome_parte'],
             'processo_mongoid': row['processo_mongoid']}
            for row in csv.DictReader(f)
        ]
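
# Illustrative sketch (not part of the original gist): Index.fit()/match() on a
# tiny in-memory list, with made-up names, so the matcher can be tried without
# the CSV inputs used below. min_score is lowered here to make the toy match
# easier; call _demo() manually to run it.
def _demo():
    index = Index()
    index.fit('nomes', ['Maria da Silva', 'Joao Pereira de Souza'])
    # match() returns (best_matching_document, is_unique_match).
    print(index.match('nomes', 'maria silva', min_score=0.5))
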
if __name__ == '__main__':
    final_names = get_names('etl-nomes.csv', 'nome_parte')
    index = Index()
    index.fit('nomes', final_names)

    client_names = get_names('cliente-por-nome.csv', 'Nome Empregado')
    matches = []
    for name in client_names:
        nome_final, status = index.match('nomes', name)
        if status:
            matches.append((name, nome_final))
            print(name, nome_final)
    print(len(matches))

    names_matched = list(set(matches))
    print(f'matched count: {len(names_matched)}')

    # Map each reference name to all of its processo_mongoid values.
    d = defaultdict(list)
    for row in get_advogados_file('etl-nomes.csv'):
        d[row['nome']].append(row['processo_mongoid'])

    result = []
    for nome, nome_final in names_matched:
        for processo_mongoid in d[nome_final]:
            result.append({
                'nome': nome,
                'processo_mongoid': processo_mongoid,
                'nome_final': nome_final,
            })

    with open('resultado_final.csv', 'w', newline='') as f:
        csv_final = csv.DictWriter(
            f, fieldnames=['nome', 'processo_mongoid', 'nome_final'])
        csv_final.writeheader()
        csv_final.writerows(result)