yingminc created this gist Sep 26, 2017.
# -*- coding: utf-8 -*-
from __future__ import division
import math
import codecs
import argparse
import numpy as np
import scipy.spatial
from sklearn.manifold import TSNE
from bokeh.plotting import figure
from bokeh.io import output_file, show
from bokeh.models import LabelSet, ColumnDataSource

parser = argparse.ArgumentParser()
parser.add_argument('input', help='the input text file')
args = parser.parse_args()


# a vocabulary item: the word itself and its corpus frequency
class ivoc:
    def __init__(self, word):
        self.word = word
        self.freq = 0


# process the vocabulary from the data
class Vocs:
    def __init__(self, input_file, min_fq):
        # load the data
        input_file = open(input_file, 'r')
        lines = input_file.read().decode('utf-8').lower().split('\n')

        voc_list = []   # list of all vocs
        voc_dict = {}   # word -> index
        voc_rdict = {}  # index -> word
        line_list = []  # lines as lists of voc indices
        voc_count = 0
        word_count = 0
        for line in lines:
            word_list = []
            words = line.split(' ')
            for word in words:
                if word in ('', ' '):
                    continue
                word_count += 1
                if word not in voc_dict:
                    voc_dict[word] = voc_count
                    voc_rdict[voc_count] = word
                    voc_count += 1
                    voc_list.append(ivoc(word))
                voc_list[voc_dict[word]].freq += 1  # count the frequency of the voc
                word_list.append(voc_dict[word])
            line_list.append(word_list)

        self.voc_list = voc_list
        self.dict = voc_dict
        self.rdict = voc_rdict
        self.word_count = word_count
        self.line_list = line_list
        self.fq_sort(min_fq)

    # discard the rare vocs and sort the list by frequency
    def fq_sort(self, min_fq):
        nvoc_list = [ivoc('<unk>')]
        unk_list = []
        unk_index = 0
        unk_count = 0
        for i in self.voc_list:
            # if i is a rare voc: fold it into <unk>
            if i.freq < min_fq:
                unk_count += 1
                nvoc_list[unk_index].freq += i.freq
                unk_list.append(self.dict[i.word])
            # otherwise: keep it in the new list
            else:
                nvoc_list.append(i)

        # sort the list by frequency
        nvoc_list.sort(key=lambda voc: voc.freq, reverse=True)

        # rebuild the index dicts
        nvoc_dict = {}
        nvoc_rdict = {}
        for ind, i in enumerate(nvoc_list):
            nvoc_dict[i.word] = ind
            nvoc_rdict[ind] = i.word

        # re-index every line with the new indices
        unk_set = set(unk_list)
        nline_list = []
        for line in self.line_list:
            word_list = []
            for iword in line:
                if iword in unk_set:
                    word_list.append(nvoc_dict['<unk>'])
                else:
                    word_list.append(nvoc_dict[self.rdict[iword]])
            nline_list.append(word_list)

        self.line_list = nline_list
        self.voc_list = nvoc_list
        self.dict = nvoc_dict
        self.rdict = nvoc_rdict

    # look up the index of a voc, falling back to <unk>
    def index(self, voc):
        if voc in self.dict:
            return self.dict[voc]
        else:
            return self.dict['<unk>']


# unigram table for negative sampling, generated from the vocs
class unigramtable:
    def __init__(self, vocs):
        # raise the distribution to the (3/4) power
        pw = 0.75
        # normalizing factor
        nf = sum([math.pow(i.freq, pw) for i in vocs.voc_list])

        # create an empty table
        table_size = int(1e7)
        table = np.zeros(table_size, dtype=np.uint32)

        # fill the table with voc indices, each in proportion to its raised frequency
        p = 0  # cumulative probability
        i = 0
        for voc in vocs.voc_list:
            p += float(math.pow(voc.freq, pw)) / nf
            while i < table_size and float(i) / table_size < p:
                table[i] = vocs.dict[voc.word]
                i += 1
        self.table = table

    # pick vocs for negative sampling at random
    def neg_sample(self, neg_num):
        indices = np.random.randint(0, len(self.table), size=neg_num)
        return [self.table[i] for i in indices]  # the indices of the sampled vocs


# set initial weights for the hidden layer
def init_w(dim, voc_size):  # dimensions and number of vocs
    syn0 = np.random.uniform(-0.5 / dim, 0.5 / dim, (voc_size, dim))  # input vectors
    syn1 = np.zeros((voc_size, dim))  # output vectors
    return (syn0, syn1)


def sigm(z):
    # clip the sigmoid symmetrically to avoid overflow in exp()
    if z > 6:
        return 1.0
    elif z < -6:
        return 0.0
    else:
        return 1 / (1 + math.exp(-z))


def train(input_file):
    # hyperparameters
    min_freq = 3
    neg_num = 20
    dim = 100

    vocs = Vocs(input_file, min_freq)
    voc_size = len(vocs.voc_list)

    # set up the initial net
    syn0, syn1 = init_w(dim, voc_size)
    table = unigramtable(vocs)

    alpha = 0.05  # learning rate
    window_size = 10
    word_processed = 0

    print 'making data'
    for line_num, line in enumerate(vocs.line_list):
        for pos, word in enumerate(line):
            # make the (x, y) skip-gram pairs, with a random window size for each x
            current_window = np.random.randint(1, window_size + 1)
            dataset = []
            for i in range(1, current_window + 1):
                if pos - i >= 0:
                    dataset.append((word, line[pos - i]))
                if pos + i <= len(line) - 1:
                    dataset.append((word, line[pos + i]))

            for x, y in dataset:
                # negative sampling: the true context plus neg_num noise words
                classifiers = [(y, 1)] + [(neg, 0) for neg in table.neg_sample(neg_num)]
                neule = np.zeros(dim)
                for target, tag in classifiers:
                    z = np.dot(syn0[x], syn1[target])
                    p = sigm(z)
                    g = alpha * (tag - p)        # scaled gradient of the loss
                    neule += g * syn1[target]    # accumulate the error for the input vector
                    syn1[target] += g * syn0[x]  # update the output vector
                syn0[x] += neule                 # update the input vector
            word_processed += 1

    # write the vectors next to the input file (assumes the name ends in '.txt')
    index = input_file.find('.txt')
    out_path = input_file[:index] + '_vec' + input_file[index:]
    with codecs.open(out_path, 'w', 'utf-8') as op:
        op.write('%d %d\n' % (len(syn0), dim))
        for voc, vector in zip(vocs.voc_list, syn0):
            vector_str = ' '.join([str(v) for v in vector])
            op.write('%s %s\n' % (voc.word, vector_str))
    print 'done'
    return vocs, syn0


# project the vectors to 2d with t-SNE and plot the words with bokeh
def tsne(voclist, vec):
    model = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    r = model.fit_transform(vec)
    source = ColumnDataSource(data=dict(x=r[:, 0], y=r[:, 1], la=voclist))
    p = figure(plot_height=1000, plot_width=2000)
    p.scatter(x='x', y='y', size=0, source=source)  # invisible points; only the labels show
    labels = LabelSet(x='x', y='y', text='la', x_offset=0, y_offset=0,
                      level='glyph', source=source)
    p.add_layout(labels)
    output_file('w2c.html')
    show(p)


# cosine similarity between two words
def pair_similar(w1, w2, voclist, vec):
    nw1 = voclist.index(w1)
    nw2 = voclist.index(w2)
    sim = 1 - scipy.spatial.distance.cosine(vec[nw1], vec[nw2])
    return sim


# the num words most similar to inp
def top_similar(inp, voclist, vec, num=20):
    inpn = voclist.index(inp)
    sims = [1 - scipy.spatial.distance.cosine(vec[inpn], v) for v in vec]
    wsim = zip(voclist, sims)
    swsim = sorted(wsim, key=lambda w: w[1], reverse=True)
    return swsim[:num]


# w1 - w2 + w3: words similar to w1 that are closer to w3 than to w2
def word_analogy(w1, w2, w3, voclist, vec):
    xlist = []
    for x in top_similar(w1, voclist, vec, num=(len(voclist) - 1)):
        if pair_similar(x[0], w3, voclist, vec) > pair_similar(x[0], w2, voclist, vec):
            xlist.append(x)
    print xlist[:20]


def make_voclist(vocs):
    return [i.word for i in vocs.voc_list]


# load vectors in the format written by train()
def load_vec(file):
    input_file = open(file, 'r')
    lines = input_file.read().split('\n')
    voc_num, dim = map(int, lines[0].split(' '))
    lines = [line for line in lines[1:] if line]  # drop the trailing empty line
    voclist = [line.split(' ')[0].decode('utf-8') for line in lines]
    veclist = []
    for line in lines:
        veclist.extend(line.split(' ')[1:])
    vec = np.array(veclist, dtype=float)  # parse the strings as floats
    vec = np.reshape(vec, (voc_num, dim))
    return voclist, vec


#vocs, vec = train(args.input)
#voclist = make_voclist(vocs)
voclist, vec = load_vec(args.input)
tsne(voclist, vec)
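
# ---------------------------------------------------------------------------
# Usage sketch (not part of the original gist): a minimal example of how this
# script can be driven. The file name 'corpus.txt' and the query words below
# are hypothetical; any plain-text corpus whose name ends in '.txt' should do.
#
# Train and write vectors (uncomment the train() lines above and comment out
# the load_vec()/tsne() lines):
#   python <this_script>.py corpus.txt       # writes corpus_vec.txt
#
# Visualize previously trained vectors (the default code path above):
#   python <this_script>.py corpus_vec.txt   # renders w2c.html
#
# Query the vectors programmatically, e.g. from an interactive session:
#   voclist, vec = load_vec('corpus_vec.txt')
#   print pair_similar(u'car', u'bus', voclist, vec)   # cosine similarity
#   print top_similar(u'car', voclist, vec, num=10)    # 10 nearest neighbours
#   word_analogy(u'king', u'man', u'woman', voclist, vec)
# ---------------------------------------------------------------------------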