# pip install python-Levenshtein gensim
from collections import defaultdict
import csv
import pickle
import re
import unicodedata

from gensim import corpora
from gensim import models
from gensim import similarities
from Levenshtein import distance
from Levenshtein import ratio


class attrdict(dict):
    """A dict whose keys are also readable and writable as attributes."""

    def __setattr__(self, key, value):
        self[key] = value

    def __dir__(self):
        return list(self.keys())

    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError:
            raise AttributeError(key)

    def __setstate__(self, state):
        # No-op: all state lives in the dict items, which pickle
        # restores on its own.
        pass
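

# Example of attrdict in use (illustrative, not from the original gist):
#   d = attrdict()
#   d.foo = 1
#   d['foo']  # -> 1; attribute and item access share the same storage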


class IndexMixin:
    _join_str = ' '
    _distance = True

    def __init__(self):
        self.sections = defaultdict(attrdict)

    def __repr__(self):
        return f'{self.__class__.__name__}(sections=[{len(self.sections)}])'

    def normalize(self, value):
        # Lowercase and strip accents, e.g. 'João' -> 'joao'.
        try:
            return unicodedata.normalize(
                'NFKD', str(value).lower()
            ).encode('ascii', 'ignore').decode('ascii')
        except Exception:
            return value

    def tokenize(self, documents):
        # Split each normalized document into alphabetic words.
        return [
            re.findall(r'[A-Z]+|[a-z]+', self.normalize(doc))
            for doc in documents
        ]
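
    # Illustrative check of the two helpers above (hypothetical values,
    # not part of the original gist):
    #
    #   IndexMixin().normalize('José da Silva')  -> 'jose da silva'
    #   IndexMixin().tokenize(['José da Silva']) -> [['jose', 'da', 'silva']]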

    def fit(self, sec, documents, **kwargs):
        # Build a TF-IDF weighted LSI similarity index for one section.
        sec = self.sections[sec]
        sec.documents = documents
        sec.tokens = self.tokenize(documents)
        sec.stems = [
            self._join_str.join(token) for token in sec.tokens
        ]
        sec.dictionary = corpora.Dictionary(sec.tokens)
        sec.corpus = [sec.dictionary.doc2bow(token) for token in sec.tokens]
        sec.tfidf = models.TfidfModel(sec.corpus)
        sec.lsi = models.LsiModel(
            sec.tfidf[sec.corpus],
            id2word=sec.dictionary,
            num_topics=kwargs.pop('num_topics', 2000),
            power_iters=kwargs.pop('power_iters', 2)
        )
        sec.index = similarities.MatrixSimilarity(sec.lsi[sec.corpus])
        return self
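
    # Sketch of what `fit` leaves on a section, using hypothetical toy
    # data (illustration only, not from the original gist):
    #
    #   sec = Index().fit('nomes', ['maria souza', 'joao silva']).sections['nomes']
    #   sec.dictionary  # gensim Dictionary mapping words to integer ids
    #   sec.corpus      # bag-of-words vectors, one per document
    #   sec.lsi         # LSI model trained on the TF-IDF weighted corpus
    #   sec.index       # MatrixSimilarity answering cosine queries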

    def similar(self, sec, document, **kwargs):
        # Rank indexed documents against `document` by blending LSI cosine
        # similarity with Levenshtein distance and ratio on the stems.
        sec = self.sections[sec]
        token = self.tokenize([document])[0]
        stem = self._join_str.join(token)
        if not stem:
            return []
        bow = sec.dictionary.doc2bow(token)
        lsi = sec.lsi[bow]
        index = sec.index[lsi]
        w = [
            kwargs.pop('sim_weight', 1),
            kwargs.pop('dist_weight', 1),
            kwargs.pop('ratio_weight', 1)
        ]
        min_score = kwargs.pop('min_score', 0.95)
        results = (
            (
                i, sec.stems[i], sim,
                distance(stem, sec.stems[i]) if w[1] > 0 else 0,
                ratio(stem, sec.stems[i]) if w[2] > 0 else 0
            ) for i, sim in enumerate(index)
        )
        score = (
            (sec.documents[i], j, s, d, r, (
                s * w[0] + ((1 / d) * w[1] if d > 0 else w[1]) + r * w[2]
            ) / sum(w)) for i, j, s, d, r in results
        )
        return sorted((
            i for i in score if i[-1] >= min_score
        ), key=lambda x: -x[-1])
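
    # Worked example of the blended score above with the default weights
    # w = [1, 1, 1] (numbers are hypothetical, for illustration only):
    #
    #   cosine s = 0.98, distance d = 2, ratio r = 0.90
    #   score = (0.98 * 1 + (1 / 2) * 1 + 0.90 * 1) / 3 ≈ 0.79
    #
    # An exact match (s = 1, d = 0, r = 1) scores (1 + 1 + 1) / 3 = 1.0,
    # so the default min_score of 0.95 keeps only near-exact candidates.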

    def match(self, sec, document, **kwargs):
        # Return (best candidate, True if it was the only hit).
        similar = self.similar(sec, document, **kwargs)
        if similar:
            return (similar[0][0], len(similar) == 1)
        return (None, False)

    def save(self, path):
        with open(path, 'wb') as f:
            pickle.dump(self, f)
        return self

    def load(self, path):
        with open(path, 'rb') as f:
            instance = pickle.load(f)
        self.__dict__ = instance.__dict__
        return self


class Index(IndexMixin):
    def get_words(self, sec):
        sec = self.sections[sec]
        return list({word for token in sec.tokens for word in token})
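

# Minimal usage sketch (assumed data, not part of the original gist):
#
#   index = Index().fit('nomes', ['maria souza', 'joao silva'])
#   index.match('nomes', 'Maria Souza')  # best candidate, plus whether it
#                                        # was the only hit above min_score
#   index.save('nomes.pkl')              # persist the fitted index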


def get_names(file_name, key_name):
    # Read a single column from a CSV file.
    with open(file_name) as f:
        return [row[key_name] for row in csv.DictReader(f)]


def get_advogados_file(file_name):
    # Read (party name, case id) pairs from the ETL export.
    with open(file_name) as f:
        return [
            {'nome': row['nome_parte'], 'processo_mongoid': row['processo_mongoid']}
            for row in csv.DictReader(f)
        ]
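

# Expected inputs, inferred from the column names used above:
#   etl-nomes.csv         columns 'nome_parte' and 'processo_mongoid'
#   cliente-por-nome.csv  column 'Nome Empregado'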


if __name__ == '__main__':
    final_names = get_names('etl-nomes.csv', 'nome_parte')
    index = Index()
    index.fit('nomes', final_names)

    client_names = get_names('cliente-por-nome.csv', 'Nome Empregado')
    matches = []
    for name in client_names:
        nome_final, status = index.match('nomes', name)
        if status:
            matches.append((name, nome_final))
            print(name, nome_final)
    print(len(matches))

    names_matched = list(set(matches))
    print('matched count: {}'.format(len(names_matched)))

    # Map each final name to its case ids.
    d = defaultdict(list)
    advogados_dict = get_advogados_file('etl-nomes.csv')
    for name in advogados_dict:
        d[name['nome']].append(name['processo_mongoid'])

    result = []
    for fn in names_matched:
        for processo_mongoid in d[fn[1]]:
            result.append({
                'nome': fn[0],
                'processo_mongoid': processo_mongoid,
                'nome_final': fn[1],
            })

    with open('resultado_final.csv', 'w') as f:
        csv_final = csv.DictWriter(
            f, fieldnames=['nome', 'processo_mongoid', 'nome_final']
        )
        csv_final.writeheader()
        csv_final.writerows(result)