# pip install python-Levenshtein gensim
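"""Fuzzy name matching.

Builds a gensim TF-IDF/LSI similarity index over a list of names and combines
the LSI cosine similarity with Levenshtein distance and ratio to match client
names against an ETL name list, writing the matches to a CSV file.
"""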
from collections import defaultdict
from gensim import corpora
from gensim import models
from gensim import similarities
from Levenshtein import distance
from Levenshtein import ratio
import pickle
import re
import unicodedata
import csv


class attrdict(dict):
    """Dict whose keys can also be read and written as attributes."""

    def __setattr__(self, key, value):
        self[key] = value

    def __dir__(self):
        return self.keys()

    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError:
            raise AttributeError(key)

    def __setstate__(self, state):
        # All state lives in the dict items, which pickle restores through
        # __setitem__, so there is no instance __dict__ to recover here.
        pass
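
# Example: attribute and item access are interchangeable on attrdict:
#   a = attrdict(); a.tokens = []; a['tokens'] is a.tokens  # -> True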


class IndexMixin:
    _join_str = ' '
    _distance = True

    def __init__(self):
        self.sections = defaultdict(attrdict)

    def __repr__(self):
        return f'{self.__class__.__name__}(sections=[{len(self.sections)}])'

    def normalize(self, value):
        # Lowercase and strip accents, e.g. 'José' -> 'jose'.
        try:
            return unicodedata.normalize(
                'NFKD', str(value).lower()
            ).encode('ascii', 'ignore').decode('ascii')
        except Exception:
            return value

    def tokenize(self, documents):
        # Split each normalized document into alphabetic words.
        return [
            re.findall(r'[A-Z]+|[a-z]+', self.normalize(doc))
            for doc in documents
        ]
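
    # Example: tokenize(['José da Silva']) -> [['jose', 'da', 'silva']];
    # normalize() lowercases and strips accents before the regex split.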

    def fit(self, sec, documents, **kwargs):
        sec = self.sections[sec]
        sec.documents = documents
        sec.tokens = self.tokenize(documents)
        sec.stems = [
            self._join_str.join(token) for token in sec.tokens
        ]
        # gensim pipeline: bag-of-words -> TF-IDF -> LSI -> similarity index.
        sec.dictionary = corpora.Dictionary(sec.tokens)
        sec.corpus = [sec.dictionary.doc2bow(token) for token in sec.tokens]
        sec.tfidf = models.TfidfModel(sec.corpus)
        sec.lsi = models.LsiModel(
            sec.tfidf[sec.corpus],
            id2word=sec.dictionary,
            num_topics=kwargs.pop('num_topics', 2000),
            power_iters=kwargs.pop('power_iters', 2),
        )
        sec.index = similarities.MatrixSimilarity(sec.lsi[sec.corpus])
        return self

    def similar(self, sec, document, **kwargs):
        sec = self.sections[sec]
        token = self.tokenize([document])[0]
        stem = self._join_str.join(token)
        if not stem:
            return []
        bow = sec.dictionary.doc2bow(token)
        lsi = sec.lsi[bow]
        index = sec.index[lsi]
        # Weights for the three signals: LSI cosine similarity, inverted
        # Levenshtein distance and Levenshtein ratio.
        w = [
            kwargs.pop('sim_weight', 1),
            kwargs.pop('dist_weight', 1),
            kwargs.pop('ratio_weight', 1),
        ]
        min_score = kwargs.pop('min_score', 0.95)
        results = (
            (
                i, sec.stems[i], sim,
                distance(stem, sec.stems[i]) if w[1] > 0 else 0,
                ratio(stem, sec.stems[i]) if w[2] > 0 else 0,
            ) for i, sim in enumerate(index)
        )
        # Weighted average of the three signals; an exact stem (d == 0)
        # receives the full distance weight.
        score = (
            (sec.documents[i], j, s, d, r, (
                s * w[0] + ((1 / d) * w[1] if d > 0 else w[1]) + r * w[2]
            ) / sum(w)) for i, j, s, d, r in results
        )
        return sorted((
            i for i in score if i[-1] >= min_score
        ), key=lambda x: -x[-1])
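
    # Worked example with the default weights w = [1, 1, 1]: an exact match
    # gives s ~= 1.0, d == 0 (so the middle term is w[1] == 1) and r == 1.0,
    # hence a score of about (1 + 1 + 1) / 3 = 1.0, clearing min_score = 0.95.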

    def match(self, sec, document, **kwargs):
        # Best match plus a flag saying whether it was the only candidate
        # above min_score.
        similar = self.similar(sec, document, **kwargs)
        if similar:
            return (similar[0][0], len(similar) == 1)
        return (None, False)

    def save(self, path):
        with open(path, 'wb') as f:
            pickle.dump(self, f)
        return self

    def load(self, path):
        with open(path, 'rb') as f:
            instance = pickle.load(f)
        self.__dict__ = instance.__dict__
        return self


class Index(IndexMixin):

    def get_words(self, sec):
        sec = self.sections[sec]
        return list({word for token in sec.tokens for word in token})
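

# Minimal usage sketch (illustrative data, not part of the original script):
#
#   idx = Index().fit('nomes', ['maria souza', 'joao silva'])
#   idx.similar('nomes', 'Maria Souza', min_score=0.9)
#   idx.save('nomes.idx')
#   restored = Index().load('nomes.idx')
#   restored.match('nomes', 'joao silva')  # -> ('joao silva', True)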


def get_names(file_name, key_name):
    with open(file_name) as f:
        return [row[key_name] for row in csv.DictReader(f)]


def get_advogados_file(file_name):
    with open(file_name) as f:
        return [
            {'nome': row['nome_parte'],
             'processo_mongoid': row['processo_mongoid']}
            for row in csv.DictReader(f)
        ]


if __name__ == '__main__':
    final_names = get_names('etl-nomes.csv', 'nome_parte')
    index = Index()
    index.fit('nomes', final_names)

    client_names = get_names('cliente-por-nome.csv', 'Nome Empregado')
    matches = []
    for name in client_names:
        nome_final, status = index.match('nomes', name)
        if status:
            matches.append((name, nome_final))
            print(name, nome_final)
    print(len(matches))
    names_matched = list(set(matches))
    print('matched: {}'.format(len(names_matched)))

    # Map each final name to the process ids it appears in.
    d = defaultdict(list)
    for row in get_advogados_file('etl-nomes.csv'):
        d[row['nome']].append(row['processo_mongoid'])

    result = []
    for fn in names_matched:
        for processo_mongoid in d[fn[1]]:
            result.append({
                'nome': fn[0],
                'processo_mongoid': processo_mongoid,
                'nome_final': fn[1],
            })

    # newline='' avoids blank lines in the CSV output on Windows.
    with open('resultado_final.csv', 'w', newline='') as f:
        csv_final = csv.DictWriter(
            f, fieldnames=['nome', 'processo_mongoid', 'nome_final'])
        csv_final.writeheader()
        csv_final.writerows(result)
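
# Expected inputs, as read above: etl-nomes.csv with columns 'nome_parte' and
# 'processo_mongoid', and cliente-por-nome.csv with a 'Nome Empregado' column.
# The matches are written to resultado_final.csv.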