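"""Fuzzy name matching with gensim LSI + Levenshtein.

Builds a latent-semantic index over a reference list of names, scores
candidates by a weighted mix of cosine similarity, inverse edit distance and
Levenshtein ratio, then matches client names against `etl-nomes.csv` and
writes the matched pairs (with their `processo_mongoid`) to
`resultado_final.csv`.
"""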
# pip install python-Levenshtein gensim
from collections import defaultdict

from gensim import corpora
from gensim import models
from gensim import similarities
from Levenshtein import distance
from Levenshtein import ratio

import csv
import pickle
import re
import unicodedata


class attrdict(dict):
    """Dict whose keys are also readable/writable as attributes."""

    def __setattr__(self, key, value):
        self[key] = value

    def __dir__(self):
        return self.keys()

    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError:
            raise AttributeError(key)

    def __setstate__(self, state):
        # No-op so unpickling does not route through __getattr__
        # before the underlying dict is populated.
        pass


class IndexMixin:
    _join_str = ' '
    _distance = True

    def __init__(self):
        self.sections = defaultdict(attrdict)

    def __repr__(self):
        return f'{self.__class__.__name__}(sections=[{len(self.sections)}])'

    def normalize(self, value):
        # Lowercase and strip accents (NFKD decomposition, drop non-ASCII).
        try:
            return unicodedata.normalize(
                'NFKD', str(value).lower()
            ).encode('ascii', 'ignore').decode('ascii')
        except Exception:
            return value

    def tokenize(self, documents):
        # Split each normalized document into alphabetic words.
        return [
            re.findall(r'[A-Z]+|[a-z]+', self.normalize(doc))
            for doc in documents
        ]

    def fit(self, sec, documents, **kwargs):
        # Build the TF-IDF -> LSI pipeline and a similarity index
        # for one named section.
        sec = self.sections[sec]
        sec.documents = documents
        sec.tokens = self.tokenize(documents)
        sec.stems = [self._join_str.join(token) for token in sec.tokens]
        sec.dictionary = corpora.Dictionary(sec.tokens)
        sec.corpus = [sec.dictionary.doc2bow(token) for token in sec.tokens]
        sec.tfidf = models.TfidfModel(sec.corpus)
        sec.lsi = models.LsiModel(
            sec.tfidf[sec.corpus],
            id2word=sec.dictionary,
            num_topics=kwargs.pop('num_topics', 2000),
            power_iters=kwargs.pop('power_iters', 2)
        )
        sec.index = similarities.MatrixSimilarity(sec.lsi[sec.corpus])
        return self

    def similar(self, sec, document, **kwargs):
        # Rank fitted documents against `document` by a weighted mix of LSI
        # cosine similarity, inverse Levenshtein distance and Levenshtein ratio.
        sec = self.sections[sec]
        token = self.tokenize([document])[0]
        stem = self._join_str.join(token)
        if not stem:
            return []
        bow = sec.dictionary.doc2bow(token)
        lsi = sec.lsi[bow]
        index = sec.index[lsi]
        w = [
            kwargs.pop('sim_weight', 1),
            kwargs.pop('dist_weight', 1),
            kwargs.pop('ratio_weight', 1)
        ]
        min_score = kwargs.pop('min_score', 0.95)
        results = (
            (
                i,
                sec.stems[i],
                sim,
                distance(stem, sec.stems[i]) if w[1] > 0 else 0,
                ratio(stem, sec.stems[i]) if w[2] > 0 else 0
            )
            for i, sim in enumerate(index)
        )
        score = (
            (
                sec.documents[i], j, s, d, r,
                (
                    s * w[0]
                    + ((1 / d) * w[1] if d > 0 else w[1])
                    + r * w[2]
                ) / sum(w)
            )
            for i, j, s, d, r in results
        )
        # Keep only candidates scoring at least min_score, best first.
        return sorted(
            (i for i in score if i[-1] >= min_score),
            key=lambda x: x[-1],
            reverse=True
        )

    def match(self, sec, document, **kwargs):
        # Return (best match, whether it was the only candidate).
        similar = self.similar(sec, document, **kwargs)
        if similar:
            return (similar[0][0], len(similar) == 1)
        return (None, False)

    def save(self, path):
        with open(path, 'wb') as f:
            pickle.dump(self, f)
        return self

    def load(self, path):
        with open(path, 'rb') as f:
            instance = pickle.load(f)
        self.__dict__ = instance.__dict__
        return self


class Index(IndexMixin):
    def get_words(self, sec):
        sec = self.sections[sec]
        return list({word for token in sec.tokens for word in token})


def get_names(file_name, key_name):
    # Read one column from a CSV file.
    with open(file_name) as f:
        return [row[key_name] for row in csv.DictReader(f)]


def get_advogados_file(file_name):
    # Read (nome_parte, processo_mongoid) pairs from a CSV file.
    with open(file_name) as f:
        return [
            {'nome': row['nome_parte'],
             'processo_mongoid': row['processo_mongoid']}
            for row in csv.DictReader(f)
        ]
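
# Illustrative sketch (not part of the original gist): Index.fit()/match() on a
# tiny in-memory list, with made-up names, so the matcher can be tried without
# the CSV inputs used below. min_score is lowered here to make the toy match
# easier; call _demo() manually to run it.
def _demo():
    index = Index()
    index.fit('nomes', ['Maria da Silva', 'Joao Pereira de Souza'])
    # match() returns (best_matching_document, is_unique_match).
    print(index.match('nomes', 'maria silva', min_score=0.5))
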
if __name__ == '__main__':
    final_names = get_names('etl-nomes.csv', 'nome_parte')
    index = Index()
    index.fit('nomes', final_names)

    client_names = get_names('cliente-por-nome.csv', 'Nome Empregado')
    matches = []
    for name in client_names:
        nome_final, status = index.match('nomes', name)
        if status:
            matches.append((name, nome_final))
            print(name, nome_final)
    print(len(matches))

    names_matched = list(set(matches))
    print(f'matched count: {len(names_matched)}')

    # Map each reference name to all of its processo_mongoid values.
    d = defaultdict(list)
    for row in get_advogados_file('etl-nomes.csv'):
        d[row['nome']].append(row['processo_mongoid'])

    result = []
    for nome, nome_final in names_matched:
        for processo_mongoid in d[nome_final]:
            result.append({
                'nome': nome,
                'processo_mongoid': processo_mongoid,
                'nome_final': nome_final,
            })

    with open('resultado_final.csv', 'w', newline='') as f:
        csv_final = csv.DictWriter(
            f, fieldnames=['nome', 'processo_mongoid', 'nome_final'])
        csv_final.writeheader()
        csv_final.writerows(result)