Forked from quadrismegistus/gensim_word2vec_measure_semantic_shift_by_local_neighborhood.py
Created March 19, 2019 20:40
def measure_semantic_shift_by_neighborhood(model1, model2, word, k=25, verbose=False):
    """
    Basic implementation of William Hamilton (@williamleif) et al.'s measure of semantic change
    proposed in their paper "Cultural Shift or Linguistic Drift?" (https://arxiv.org/abs/1606.02821),
    which they call the "local neighborhood measure." They find this measure better suited to
    understanding the semantic change of nouns owing to "cultural shift," i.e. changes in meaning
    "local" to that word, rather than global changes in language use ("linguistic drift"), which
    are better captured by a Procrustes-alignment method (also described in the same paper).

    Arguments:
    - `model1`, `model2`: gensim word2vec models.
    - `word`: a string representation of a given word.
    - `k`: the size of the word's neighborhood (# of its closest words in its vector space).
    """
    # Import function for cosine distance
    from scipy.spatial.distance import cosine

    # Check that this word is present in both models
    if word not in model1.vocab or word not in model2.vocab:
        print("!! Word %s not present in both models." % word)
        return None

    # Get the two neighborhoods
    neighborhood1 = [w for w, c in model1.most_similar(word, topn=k)]
    neighborhood2 = [w for w, c in model2.most_similar(word, topn=k)]

    # Print?
    if verbose:
        print('>> Neighborhood of associations of the word "%s" in model1:' % word)
        print(', '.join(neighborhood1))
        print()
        print('>> Neighborhood of associations of the word "%s" in model2:' % word)
        print(', '.join(neighborhood2))

    # Get the 'meta' neighborhood (both combined)
    meta_neighborhood = list(set(neighborhood1) | set(neighborhood2))

    # Filter the meta neighborhood so that it contains only words present in both models
    meta_neighborhood = [w for w in meta_neighborhood if w in model1.vocab and w in model2.vocab]

    # For both models, get a similarity vector between the focus word and all words in the meta neighborhood
    vector1 = [model1.similarity(word, w) for w in meta_neighborhood]
    vector2 = [model2.similarity(word, w) for w in meta_neighborhood]

    # Compute the cosine distance *between* those similarity vectors
    dist = cosine(vector1, vector2)

    # Return this cosine distance -- a measure of the relative semantic shift for this word between these two models
    return dist


"""
Example usage:

model1 = [a gensim model I have for text published in the 1750s]
model2 = [a gensim model I have for text published in the 1850s]

# The word 'god' does not change much in meaning:

In [61]: measure_semantic_shift_by_neighborhood(model1, model2, 'god', k=10, verbose=True)
>> Neighborhood of associations of the word "god" in model1:
almighty, jehovah, creator, uncreated, omniscient, logos, righteousness, christ, redeemer, salvation

>> Neighborhood of associations of the word "god" in model2:
almighty, heaven, jehovah, creator, redeemer, christ, divine, righteousness, providence, saviour

Out[61]: 0.011609088245951749

# The word 'matter' does, moving from meaning mainly the "matter" of the universe to "what is the matter":

In [62]: measure_semantic_shift_by_neighborhood(model1, model2, 'matter', k=10, verbose=True)
>> Neighborhood of associations of the word "matter" in model1:
cohesion, sediment, menstruum, purulent, conceivable, gelatinous, morbific, compression, cerebellum, divisible

>> Neighborhood of associations of the word "matter" in model2:
matters, question, subject, affair, substance, concernment, concerns, questions, controversy, discussion

Out[62]: 0.0847526073498025

# The word 'station' changes even more, moving from meaning one's social rank or "station" to a train station:

In [63]: measure_semantic_shift_by_neighborhood(model1, model2, 'station', k=10, verbose=True)
>> Neighborhood of associations of the word "station" in model1:
stations, dation, sphere, employments, deg, vocation, personate, lowest, district, apprenticeship

>> Neighborhood of associations of the word "station" in model2:
stations, train, posts, position, situation, town, carriage, stationed, rank, cab

Out[63]: 0.14173381265358098
"""
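Note: the function above targets the pre-4.0 gensim API, where `vocab`, `most_similar()`, and `similarity()` are attributes of the model object itself. In gensim >= 4.0 these live on the KeyedVectors object (`model.wv`), and the vocabulary is exposed as `key_to_index` rather than `vocab`. A minimal compatibility sketch, assuming two trained diachronic models saved at hypothetical paths:

from gensim.models import Word2Vec

# Hypothetical paths -- point these at your own period-specific models.
model1 = Word2Vec.load('w2v_1750s.model').wv  # KeyedVectors for the earlier corpus
model2 = Word2Vec.load('w2v_1850s.model').wv  # KeyedVectors for the later corpus

# KeyedVectors exposes most_similar() and similarity() directly, so passing
# model.wv in place of the model otherwise works as-is; the only edits needed
# for gensim >= 4.0 are the two vocabulary checks inside the function,
# replacing `w in model.vocab` with `w in model.key_to_index`.
print(measure_semantic_shift_by_neighborhood(model1, model2, 'station', k=10))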