stephenleo · February 17, 2024 08:32 · Feb 17, 2024 · Apr 29, 2022 · Apr 29, 2022 · Apr 19, 2022
diff --git a/11_openai.py b/11_openai.py
@@ -1,29 +1,41 @@
 import openai
 import os
 import pickle
-openai.api_key = 'update_your_openai_API_key_here'
-
-if os.path.exists('../data/nlp/davinci_emb.pkl'):
-    print('Loading Davinci Embeddings')
-    with open('../data/nlp/davinci_emb.pkl', 'rb') as f:
-        davinci_emb = pickle.load(f)
-else:
-    print('Querying Davinci Embeddings')
-    davinci_emb = {}
-    engine='text-similarity-davinci-001'
-
-    unique_sentences = list(set(stsb_test['sentence1'].values.tolist() + stsb_test['sentence2'].values.tolist()))
-    for sentence in tqdm(unique_sentences):
-        if sentence not in davinci_emb.keys():
-            davinci_emb[sentence] = openai.Embedding.create(input = [sentence], 
-                                                            engine=engine)['data'][0]['embedding']
-    # Save embeddings to file      
-    with open('../data/nlp/davinci_emb.pkl', 'wb') as f:
-        pickle.dump(davinci_emb, f)
-
-# Generate Embeddings
-sentence1_emb = [davinci_emb[sentence] for sentence in stsb_test['sentence1']]
-sentence2_emb = [davinci_emb[sentence] for sentence in stsb_test['sentence2']]
-
-# Cosine Similarity
-stsb_test['OpenAI Davinci_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
+
+client = openai.OpenAI(api_key='update_your_openai_API_key_here')
+
+models = ["ada-002", "3-small", "3-large"]
+
+for model in models:
+    cache_path = f"{model}.pkl"  # per-model embedding cache, relative to the working directory
+    if os.path.exists(cache_path):
+        print(f"Loading OpenAI {model} Embeddings")
+        with open(cache_path, "rb") as f:
+            openai_emb = pickle.load(f)
+    else:
+        print(f"Querying OpenAI {model} Embeddings")
+        openai_emb = {}
+
+        unique_sentences = list(
+            set(
+                stsb_test["sentence1"].values.tolist()
+                + stsb_test["sentence2"].values.tolist()
+            )
+        )
+
+        # unique_sentences is already de-duplicated, so each one needs exactly one request
+        for sentence in tqdm(unique_sentences):
+            response = client.embeddings.create(
+                input=sentence, model=f"text-embedding-{model}"
+            )
+            openai_emb[sentence] = response.data[0].embedding
+
+        with open(cache_path, "wb") as f:
+            pickle.dump(openai_emb, f)
+
+    # Generate Embeddings
+    sentence1_emb = [openai_emb[sentence] for sentence in stsb_test["sentence1"]]
+    sentence2_emb = [openai_emb[sentence] for sentence in stsb_test["sentence2"]]
+
+    # Cosine Similarity
+    stsb_test[f"OpenAI {model}_cosine_score"] = cos_sim(sentence1_emb, sentence2_emb)
diff --git a/00_sota_sts.md → 00_sts.md b/00_sota_sts.md → 00_sts.md
diff --git a/00_sota_sts.md b/00_sota_sts.md
@@ -1,3 +1,3 @@
-# State of the Art Semantic Textual Similarity
+# Semantic Textual Similarity
 
-Code for the Medium post [Link]()
+Code for the Medium post [Link](https://towardsdatascience.com/semantic-textual-similarity-83b3ca4a840e?sk=8389935eda3449a172a5905b53150d30)
diff --git a/01_setup.sh b/01_setup.sh
@@ -5,10 +5,6 @@ mkdir -p semantic_similarity/notebooks semantic_similarity/data/nlp
 conda create -n semantic_similarity python=3.8
 conda activate semantic_similarity
 
-## Create Virtual Environment using venv if not using conda
-# python -m venv semantic_similarity
-# source semantic_similarity/bin/activate
-
 # Pip install the necessary libraries
 pip install -U jupyterlab pandas datasets matplotlib plotly scikit-learn tqdm ipywidgets 
 pip install -U numpy spacy textdistance fasttext gensim 

diff --git a/13_plot.py b/13_plot.py
@@ -1 +1,25 @@
-‎‎
+from plotly.subplots import make_subplots
+import plotly.graph_objects as go
+
+ncols = 3
+# Keep the original 4-row grid, but grow it when there are more than 12 methods to plot
+nrows = max(4, -(-len(spearman_rank_corr) // ncols))
+
+subplot_titles = [f'{row.Index.split("_")[0]}: {row.similarity_score:.2f}' for row in spearman_rank_corr.itertuples()]
+fig = make_subplots(rows=nrows, cols=ncols, subplot_titles=subplot_titles)
+
+for index, score in enumerate(spearman_rank_corr.index):
+    # Fill the grid row by row; divmod gives 0-based (row, col), plotly expects 1-based
+    row, col = divmod(index, ncols)
+    fig.add_trace(
+        go.Scatter(
+            x=stsb_test[score_cols[0]],
+            y=stsb_test[score],
+            mode='markers',
+        ),
+        row=row+1, col=col+1
+    )
+
+
+fig.update_layout(height=700, width=1000, title_text='Spearman Rank Correlation (ρ × 100)', showlegend=False)
+fig.show()
diff --git a/13_plot.py b/13_plot.py
@@ -0,0 +1 @@
+‎‎
diff --git a/12_spearman_rank_correlation.py b/12_spearman_rank_correlation.py
@@ -1 +1,5 @@
-‎‎
+score_cols = [col for col in stsb_test.columns if '_score' in col]
+
+# Spearman rank correlation (x100) of each later score column against the first one
+spearman_rank_corr = stsb_test[score_cols].corr(method='spearman').iloc[1:, 0:1]*100
+spearman_rank_corr.head(10)
diff --git a/12_spearman_rank_correlation.py b/12_spearman_rank_correlation.py
@@ -0,0 +1 @@
+‎‎
diff --git a/11_openai.py b/11_openai.py
@@ -1 +1,29 @@
-‎‎
+import openai
+import os
+import pickle
+openai.api_key = 'update_your_openai_API_key_here'
+
+if os.path.exists('../data/nlp/davinci_emb.pkl'):
+    print('Loading Davinci Embeddings')
+    with open('../data/nlp/davinci_emb.pkl', 'rb') as f:
+        davinci_emb = pickle.load(f)
+else:
+    print('Querying Davinci Embeddings')
+    davinci_emb = {}
+    engine='text-similarity-davinci-001'
+
+    unique_sentences = list(set(stsb_test['sentence1'].values.tolist() + stsb_test['sentence2'].values.tolist()))
+    for sentence in tqdm(unique_sentences):
+        if sentence not in davinci_emb.keys():
+            davinci_emb[sentence] = openai.Embedding.create(input = [sentence], 
+                                                            engine=engine)['data'][0]['embedding']
+    # Save embeddings to file      
+    with open('../data/nlp/davinci_emb.pkl', 'wb') as f:
+        pickle.dump(davinci_emb, f)
+
+# Generate Embeddings
+sentence1_emb = [davinci_emb[sentence] for sentence in stsb_test['sentence1']]
+sentence2_emb = [davinci_emb[sentence] for sentence in stsb_test['sentence2']]
+
+# Cosine Similarity
+stsb_test['OpenAI Davinci_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
diff --git a/11_openai.py b/11_openai.py
@@ -0,0 +1 @@
+‎‎
diff --git a/10_simcse.py b/10_simcse.py
@@ -1 +1,22 @@
-‎‎
+########## Supervised ##########
+# Load the pre-trained model
+model = SentenceTransformer('princeton-nlp/sup-simcse-roberta-large')
+
+# Generate Embeddings
+sentence1_emb = model.encode(stsb_test['sentence1'], show_progress_bar=True)
+sentence2_emb = model.encode(stsb_test['sentence2'], show_progress_bar=True)
+
+# Cosine Similarity
+stsb_test['SimCSE Supervised_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
+
+
+########## Un-Supervised ##########
+# Load the pre-trained model
+model = SentenceTransformer('princeton-nlp/unsup-simcse-roberta-large')
+
+# Generate Embeddings
+sentence1_emb = model.encode(stsb_test['sentence1'], show_progress_bar=True)
+sentence2_emb = model.encode(stsb_test['sentence2'], show_progress_bar=True)
+
+# Cosine Similarity
+stsb_test['SimCSE Unsupervised_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
diff --git a/10_simcse.py b/10_simcse.py
@@ -0,0 +1 @@
+‎‎
diff --git a/09_bi_encoder.py b/09_bi_encoder.py
@@ -1 +1,11 @@
-‎‎
+from sentence_transformers import SentenceTransformer
+
+# Load the pre-trained model
+model = SentenceTransformer('stsb-mpnet-base-v2')
+
+# Generate Embeddings
+sentence1_emb = model.encode(stsb_test['sentence1'], show_progress_bar=True)
+sentence2_emb = model.encode(stsb_test['sentence2'], show_progress_bar=True)
+
+# Cosine Similarity
+stsb_test['SBERT BiEncoder_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
diff --git a/09_bi_encoder.py b/09_bi_encoder.py
@@ -0,0 +1 @@
+‎‎
diff --git a/08_cross_encoder.py b/08_cross_encoder.py
@@ -1 +1,10 @@
-‎‎
+from sentence_transformers import CrossEncoder
+
+# Load the pre-trained model
+model = CrossEncoder('cross-encoder/stsb-roberta-base')
+
+# Build the list of [sentence1, sentence2] pairs that is passed to predict
+sentence_pairs = [[first, second] for first, second in zip(stsb_test['sentence1'], stsb_test['sentence2'])]
+
+# Store the model's prediction for each pair
+stsb_test['SBERT CrossEncoder_score'] = model.predict(sentence_pairs, show_progress_bar=True)
diff --git a/08_cross_encoder.py b/08_cross_encoder.py
@@ -0,0 +1 @@
+‎‎
diff --git a/07_use.py b/07_use.py
@@ -1 +1,18 @@
-‎‎
+import tensorflow as tf
+import tensorflow_hub as hub
+
+# Turn on memory growth for every visible GPU, then load the pre-trained model
+gpus = tf.config.list_physical_devices('GPU')
+for gpu in gpus:
+    # Control GPU memory usage
+    tf.config.experimental.set_memory_growth(gpu, True)
+
+module_url = 'https://tfhub.dev/google/universal-sentence-encoder/4'
+model = hub.load(module_url)
+
+# Generate Embeddings
+sentence1_emb = model(stsb_test['sentence1']).numpy()
+sentence2_emb = model(stsb_test['sentence2']).numpy()
+
+# Cosine Similarity
+stsb_test['USE_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
diff --git a/07_use.py b/07_use.py
@@ -0,0 +1 @@
+‎‎
diff --git a/06_wmd.py b/06_wmd.py
@@ -1 +1,16 @@
-‎‎
+import gensim.downloader as api
+
+# Load the pre-trained model
+model = api.load('fasttext-wiki-news-subwords-300')
+
+def word_movers_distance(row):
+    # Pre-process both sentences of the row into token lists
+    tokens1, tokens2 = (text_processing(row[col]) for col in ('sentence1', 'sentence2'))
+
+    # Negate the distance (presumably so a higher score means "more similar", like the other scores)
+    distance = model.wmdistance(tokens1, tokens2)
+    return -distance
+
+
+# Negative Word Movers Distance
+stsb_test['NegWMD_score'] = stsb_test.progress_apply(word_movers_distance, axis=1)
diff --git a/06_wmd.py b/06_wmd.py
@@ -0,0 +1 @@
+‎‎
diff --git a/05_tfidf.py b/05_tfidf.py
@@ -1 +1,13 @@
-‎‎
+from sklearn.feature_extraction.text import TfidfVectorizer
+model = TfidfVectorizer(lowercase=True, stop_words='english')
+
+# Train the model
+X_train = pd.concat([stsb_train['sentence1'], stsb_train['sentence2']]).unique()
+model.fit(X_train)
+
+# Generate Embeddings on Test
+sentence1_emb = model.transform(stsb_test['sentence1'])
+sentence2_emb = model.transform(stsb_test['sentence2'])
+
+# Cosine Similarity
+stsb_test['TFIDF_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
diff --git a/05_tfidf.py b/05_tfidf.py
@@ -0,0 +1 @@
+‎‎
diff --git a/04_jaccard.py b/04_jaccard.py
@@ -1 +1,13 @@
-‎‎
+import textdistance
+
+def jaccard_sim(row):
+    # Pre-process both sentences of the row into token lists
+    tokens1, tokens2 = (text_processing(row[col]) for col in ('sentence1', 'sentence2'))
+
+    # Normalized Jaccard similarity of the two token lists
+    jaccard = textdistance.jaccard
+    return jaccard.normalized_similarity(tokens1, tokens2)
+
+
+# Jaccard Similarity
+stsb_test['Jaccard_score'] = stsb_test.progress_apply(jaccard_sim, axis=1)
diff --git a/04_jaccard.py b/04_jaccard.py
@@ -0,0 +1 @@
+‎‎
diff --git a/03_helpers.py b/03_helpers.py
@@ -1 +1,36 @@
-‎‎
+from sklearn.metrics.pairwise import cosine_similarity
+import spacy
+nlp = spacy.load("en_core_web_sm")
+
+def text_processing(sentence):
+    """
+    Turn a raw sentence into a list of lowercased lemmas.
+
+    Args:
+      sentence: The text to run through the spaCy pipeline `nlp`.
+
+    Returns:
+      Lowercased lemmas of the tokens that are alphabetic and not stop words.
+    """
+    lemmas = []
+    for token in nlp(sentence):
+        if token.is_alpha and not token.is_stop:
+            lemmas.append(token.lemma_.lower())
+    return lemmas
+
+
+def cos_sim(sentence1_emb, sentence2_emb):
+    """
+    Row-wise cosine similarity between two columns of sentence embeddings.
+
+    Args:
+      sentence1_emb: sentence1 embedding column
+      sentence2_emb: sentence2 embedding column
+
+    Returns:
+      The diagonal of the pairwise similarity matrix. For instance, if
+      sentence1_emb=[a,b,c] and sentence2_emb=[x,y,z], the result is
+      [cosine_similarity(a,x), cosine_similarity(b,y), cosine_similarity(c,z)]
+    """
+    pairwise_scores = cosine_similarity(sentence1_emb, sentence2_emb)
+    return np.diag(pairwise_scores)
diff --git a/03_helpers.py b/03_helpers.py
@@ -0,0 +1 @@
+‎‎
diff --git a/02_download_data.py b/02_download_data.py
@@ -1 +1,15 @@
-‎‎
+# Imports
+from datasets import load_dataset
+import pandas as pd
+import numpy as np
+from tqdm import tqdm
+tqdm.pandas()
+
+# Load the English STSB dataset
+stsb_dataset = load_dataset('stsb_multi_mt', 'en')
+stsb_train = pd.DataFrame(stsb_dataset['train'])
+stsb_test = pd.DataFrame(stsb_dataset['test'])
+
+# Check loaded data
+print(stsb_train.shape, stsb_test.shape)
+stsb_test.head()
diff --git a/02_download_data.py b/02_download_data.py
@@ -0,0 +1 @@
+‎‎
diff --git a/01_setup.sh b/01_setup.sh
@@ -1 +1,19 @@
-‎‎
+# Create the necessary directories 
+mkdir -p semantic_similarity/notebooks semantic_similarity/data/nlp
+
+# Create and activate a conda environment
+conda create -n semantic_similarity python=3.8
+conda activate semantic_similarity
+
+## Create Virtual Environment using venv if not using conda
+# python -m venv semantic_similarity
+# source semantic_similarity/bin/activate
+
+# Pip install the necessary libraries
+pip install -U jupyterlab pandas datasets matplotlib plotly scikit-learn tqdm ipywidgets 
+pip install -U numpy spacy textdistance fasttext gensim 
+pip install -U tensorflow tensorflow_hub sentence-transformers openai
+conda install pyemd
+
+# Download the Spacy Model
+python -m spacy download en_core_web_sm
diff --git a/01_setup.sh b/01_setup.sh
@@ -0,0 +1 @@
+‎‎
No results found