word2vec_test_jp_bokeh.py
@yingminc · created September 26, 2017
#-*- encoding: utf-8 -*-
from __future__ import division
import math
import numpy as np
from sklearn.manifold import TSNE
import scipy.spatial.distance
import codecs
import argparse
from bokeh.plotting import figure
from bokeh.io import output_file, show
from bokeh.models import LabelSet, ColumnDataSource

parser = argparse.ArgumentParser()
parser.add_argument('input', help='the file of input text')
args = parser.parse_args()
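
#what this script does: a from-scratch word2vec (skip-gram with negative
#sampling) plus a t-SNE + bokeh visualization of the learned vectors.
#the input is assumed to be a plain-text file with one sentence per line and
#tokens separated by single spaces (Japanese text must be pre-tokenized).
#two modes, switched by (un)commenting the lines at the bottom of the file
#(the file names here are only examples):
#   python word2vec_test_jp_bokeh.py corpus.txt      #train: writes corpus_vec.txt
#   python word2vec_test_jp_bokeh.py corpus_vec.txt  #visualize saved vectors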


class ivoc:
    def __init__(self, word):
        self.word = word
        self.freq = 0

#process the vocabulary from the data
class Vocs:
    def __init__(self, input_file, min_fq):
        #load data
        f = open(input_file, 'r')
        lines = f.read().decode('utf-8').lower().split('\n')
        f.close()

        #list data and data info
        voc_list = []   #list of all vocs
        voc_dict = {}   #word -> index
        voc_rdict = {}  #index -> word
        line_list = []
        voc_count = 0
        word_count = 0
        for line in lines:
            word_list = []
            words = line.split(' ')
            for word in words:
                if word in (' ', ''):  #skip empty tokens
                    continue
                word_count += 1
                if word not in voc_dict:
                    voc_dict[word] = voc_count
                    voc_rdict[voc_count] = word
                    voc_count += 1
                    voc_list.append(ivoc(word))
                voc_list[voc_dict[word]].freq += 1  #count the frequency of the voc
                word_list.append(voc_dict[word])
            line_list.append(word_list)

        self.voc_list = voc_list
        self.dict = voc_dict
        self.rdict = voc_rdict
        self.word_count = word_count
        self.line_list = line_list
        self.fq_sort(min_fq)

    #discard the rare vocs and sort the list by frequency
    def fq_sort(self, min_fq):
        nvoc_list = []
        nvoc_list.append(ivoc('<unk>'))
        unk_list = []
        unk_index = 0
        unk_count = 0
        for i in self.voc_list:
            #if i is a rare voc: fold its counts into <unk>
            if i.freq < min_fq:
                unk_count += 1
                nvoc_list[unk_index].freq += i.freq
                unk_list.append(self.dict[i.word])

            #if i is not a rare voc: add it to the new list
            else:
                nvoc_list.append(i)

        #sort the list according to frequency
        nvoc_list.sort(key=lambda voc: voc.freq, reverse=True)

        #rebuild the dicts of voc indices
        nvoc_rdict = {}
        nvoc_dict = {}
        for ind, i in enumerate(nvoc_list):
            nvoc_dict[i.word] = ind
            nvoc_rdict[ind] = i.word

        #re-encode every line with the new indices, mapping rare vocs to <unk>
        unk_set = set(unk_list)  #set membership is O(1)
        nline_list = []
        for line in self.line_list:
            word_list = []
            for iword in line:
                if iword in unk_set:
                    i = nvoc_dict['<unk>']
                    word_list.append(i)
                else:
                    word = self.rdict[iword]
                    i = nvoc_dict[word]
                    word_list.append(i)
            nline_list.append(word_list)

        self.line_list = nline_list
        self.voc_list = nvoc_list
        self.dict = nvoc_dict
        self.rdict = nvoc_rdict

    #look up the index of a voc, falling back to <unk>
    def index(self, voc):
        if voc in self.dict:
            return self.dict[voc]
        else:
            return self.dict['<unk>']
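
#a minimal sketch of how Vocs is used (the file name is only an example):
#   vocs = Vocs('corpus.txt', min_fq=3)
#   vocs.index(u'word')   #-> integer index, or the <unk> index for rare words
#   vocs.line_list        #the corpus re-encoded as lists of word indices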

#unigram table for negative sampling, generated from vocs
class unigramtable:
    def __init__(self, vocs):
        #raise the distribution to the (3/4) power
        pw = 0.75

        #normalizing factor
        nf = sum([math.pow(i.freq, pw) for i in vocs.voc_list])

        #create an empty table
        table_size = int(1e7)
        table = np.zeros(table_size, dtype=np.uint32)

        #fill the table with indices
        p = 0  #cumulative probability
        i = 0
        for voc in vocs.voc_list:
            p += float(math.pow(voc.freq, pw))/nf
            while i < table_size and float(i)/table_size < p:
                table[i] = vocs.dict[voc.word]
                i += 1
        self.table = table

    #pick vocs for negative sampling at random
    def neg_sample(self, neg_num):
        indices = np.random.randint(0, len(self.table), size=neg_num)

        return [self.table[i] for i in indices]  #return the indices of the sampled vocs
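
#the table approximates the word2vec noise distribution
#P(w) = freq(w)^0.75 / sum(freq(w')^0.75): each voc fills a share of the 1e7
#slots proportional to freq^0.75, so uniformly random slots sample vocs with
#that probability. e.g. two vocs with freqs (8, 1) get weights
#(8^0.75, 1^0.75) ≈ (4.76, 1.00), i.e. P ≈ (0.83, 0.17).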

#set initial weights for the two layers
def init_w(dim, voc_size):  #dimensions and number of vocs
    #input vectors start as small random values, output vectors as zeros
    syn0 = np.random.uniform(-0.5/dim, 0.5/dim, (voc_size, dim))
    syn1 = np.zeros((voc_size, dim))

    return (syn0, syn1)

def sigm(z):
    #clamp the sigmoid outside [-6, 6], as in the original word2vec
    if z > 6:
        return 1.0
    elif z < -6:
        return 0.0
    else:
        return 1/(1 + math.exp(-z))

def train(input_file):
    #build the vocabulary from the input file
    min_freq = 3
    neg_num = 20
    dim = 100
    vocs = Vocs(input_file, min_freq)
    voc_size = len(vocs.voc_list)

    #set initial net
    syn0, syn1 = init_w(dim, voc_size)
    table = unigramtable(vocs)

    alpha = 0.05
    window_size = 10

    word_processed = 0
    print 'making data'
    for line_num, line in enumerate(vocs.line_list):
        for pos, word in enumerate(line):
            #make dataset (x, y) with skip-gram, with a random window size for each x
            current_window = np.random.randint(1, window_size)
            dataset = []

            for i in range(1, current_window + 1):
                if pos - i >= 0:
                    dataset.append((word, line[pos - i]))
                if pos + i <= len(line) - 1:
                    dataset.append((word, line[pos + i]))

            for x, y in dataset:
                #negative sampling: one positive target plus neg_num negatives
                classifiers = [(y, 1)] + [(neg, 0) for neg in table.neg_sample(neg_num)]

                neule = np.zeros(dim)

                for target, tag in classifiers:
                    z = np.dot(syn0[x], syn1[target])
                    p = sigm(z)
                    g = alpha*(tag - p)        #gradient scaled by the learning rate
                    neule += g*syn1[target]    #accumulate the error to backpropagate to syn0
                    syn1[target] += g*syn0[x]  #update syn1

                syn0[x] += neule

            word_processed += 1

    #write the vectors next to the input, e.g. foo.txt -> foo_vec.txt
    index = input_file.find('.txt')
    output_path = input_file[:index] + '_vec' + input_file[index:]

    with codecs.open(output_path, 'w', 'utf-8') as op:
        op.write('%d %d\n' % (len(syn0), dim))
        for voc, vector in zip(vocs.voc_list, syn0):
            word = voc.word
            vector_str = ' '.join([str(v) for v in vector])
            op.write('%s %s\n' % (word, vector_str))

    print 'done'

    return vocs, syn0
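
#note on the update above: for each (x, y) pair the inner loop minimizes the
#negative-sampling loss -log sigm(syn0[x].syn1[y]) - sum over negatives of
#log(1 - sigm(syn0[x].syn1[neg])); g = alpha*(tag - p) is the gradient of that
#loss w.r.t. the dot product, so each syn1 row is updated in place while the
#error for syn0[x] is accumulated in neule and applied once per pair.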

def tsne(voclist, vec):
    #project the vectors to 2d with t-SNE and plot the words with bokeh
    model = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    r = model.fit_transform(vec)
    source = ColumnDataSource(data=dict(x=r[:, 0], y=r[:, 1], la=voclist))
    p = figure(plot_height=1000, plot_width=2000)
    p.scatter(x='x', y='y', size=0, source=source)  #invisible points; only the labels are drawn
    labels = LabelSet(x='x', y='y', text='la', x_offset=0, y_offset=0, level='glyph', source=source)
    p.add_layout(labels)
    output_file('w2c.html')
    show(p)

def pair_similar(w1, w2, voclist, vec):
    nw1 = voclist.index(w1)
    nw2 = voclist.index(w2)
    sim = 1 - scipy.spatial.distance.cosine(vec[nw1], vec[nw2])
    return sim
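
#e.g. (hypothetical words, assuming both appear in the vocabulary):
#   pair_similar(u'東京', u'大阪', voclist, vec)  #-> cosine similarity in [-1, 1]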


def top_similar(inp, voclist, vec, num=20):
    inpn = voclist.index(inp)
    sims = [1 - scipy.spatial.distance.cosine(vec[inpn], i) for i in vec]
    wsim = zip(voclist, sims)
    swsim = sorted(wsim, key=lambda w: w[1], reverse=True)
    return swsim[:num]

def word_analogy(w1, w2, w3, voclist, vec):
    #w1 - w2 + w3
    xlist = []
    for x in top_similar(w1, voclist, vec, num=(len(voclist) - 1)):
        if pair_similar(x[0], w3, voclist, vec) > pair_similar(x[0], w2, voclist, vec):
            xlist.append(x)
    print xlist[:20]
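
#word_analogy answers "w2 is to w1 as w3 is to ?" via the vector offset
#w1 - w2 + w3: it keeps the words near w1 that are more similar to w3 than
#to w2. e.g. (hypothetical words) word_analogy(u'王', u'男', u'女', voclist, vec)
#would look for queen-like words near u'王'.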

def make_voclist(vocs):
    voclist = [i.word for i in vocs.voc_list]
    return voclist

def load_vec(file):
    f = open(file, 'r')
    lines = f.read().split('\n')
    f.close()
    voc_num, dim = map(int, lines[0].split(' '))
    lines = [line for line in lines[1:] if line]  #drop trailing empty lines
    voclist = [line.split(' ')[0].decode('utf-8') for line in lines]
    veclist = []
    for line in lines:
        veclist.extend(line.split(' ')[1:])
    vec = np.array(veclist, dtype=float)  #parse the vector components as floats
    vec = np.reshape(vec, (voc_num, dim))
    return voclist, vec
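
#the vector file uses the plain-text word2vec format that train() writes:
#a header line 'voc_num dim', then one 'word v1 v2 ... v_dim' line per voc.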



    #vocs, vec = train(args.input)
    #voclist = make_voclist(vocs)

    voclist, vec = load_vec(args.input)
    tsne(voclist, vec)
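
#once vectors are loaded, the similarity helpers can be explored the same way
#(the words are only examples):
#print top_similar(u'東京', voclist, vec, num=10)
#print pair_similar(u'東京', u'大阪', voclist, vec)
#word_analogy(u'王', u'男', u'女', voclist, vec)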