Last active: February 17, 2024 08:32
Revisions
stephenleo revised this gist
Feb 17, 2024 · 1 changed file with 38 additions and 26 deletions.
import openai
import os
import pickle

client = openai.OpenAI(api_key='update_your_openai_API_key_here')

models = ["ada-002", "3-small", "3-large"]

for model in models:
    if os.path.exists(f"{model}.pkl"):
        print(f"Loading OpenAI {model} Embeddings")
        with open(f"{model}.pkl", "rb") as f:
            openai_emb = pickle.load(f)
    else:
        print(f"Querying OpenAI {model} Embeddings")
        openai_emb = {}
        unique_sentences = list(
            set(
                stsb_test["sentence1"].values.tolist()
                + stsb_test["sentence2"].values.tolist()
            )
        )
        for sentence in tqdm(unique_sentences):
            if sentence not in openai_emb.keys():
                response = client.embeddings.create(
                    input=sentence, model=f"text-embedding-{model}"
                )
                openai_emb[sentence] = response.data[0].embedding

        with open(f"{model}.pkl", "wb") as f:
            pickle.dump(openai_emb, f)

    # Generate Embeddings
    sentence1_emb = [openai_emb[sentence] for sentence in stsb_test["sentence1"]]
    sentence2_emb = [openai_emb[sentence] for sentence in stsb_test["sentence2"]]

    # Cosine Similarity
    stsb_test[f"OpenAI {model}_cosine_score"] = cos_sim(sentence1_emb, sentence2_emb)
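The loop above issues one API request per sentence. The embeddings endpoint also accepts a list of inputs, so the inner per-sentence loop could be batched to cut the number of round-trips; a minimal sketch, assuming the same client, openai_emb dict, unique_sentences list, and model variable as above (the batch size of 100 is an arbitrary choice, not from the original gist):

batch_size = 100
for start in range(0, len(unique_sentences), batch_size):
    batch = unique_sentences[start:start + batch_size]
    response = client.embeddings.create(input=batch, model=f"text-embedding-{model}")
    # response.data preserves the input order, so zip each sentence back to its embedding
    for sentence, item in zip(batch, response.data):
        openai_emb[sentence] = item.embedding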
stephenleo renamed this gist
Apr 29, 2022 · 1 changed file with 0 additions and 0 deletions.
File renamed without changes.
stephenleo revised this gist
Apr 29, 2022 · 1 changed file with 2 additions and 2 deletions.
# Semantic Textual Similarity

Code for the Medium post [Link](https://towardsdatascience.com/semantic-textual-similarity-83b3ca4a840e?sk=8389935eda3449a172a5905b53150d30)
stephenleo revised this gist
Apr 19, 2022 · 1 changed file with 0 additions and 4 deletions.
mkdir -p semantic_similarity/notebooks semantic_similarity/data/nlp

conda create -n semantic_similarity python=3.8
conda activate semantic_similarity

# Pip install the necessary libraries
pip install -U jupyterlab pandas datasets matplotlib plotly scikit-learn tqdm ipywidgets
pip install -U numpy spacy textdistance fasttext gensim
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 25 additions and 1 deletion.
from plotly.subplots import make_subplots
import plotly.graph_objects as go

nrows = 4
ncols = 3
plot_array = np.arange(0, nrows*ncols).reshape(nrows, ncols)
subplot_titles = [f'{row.Index.split("_")[0]}: {row.similarity_score:.2f}'
                  for row in spearman_rank_corr.itertuples()]

fig = make_subplots(rows=nrows, cols=ncols, subplot_titles=subplot_titles)

for index, score in enumerate(spearman_rank_corr.index):
    row, col = np.argwhere(plot_array == index)[0]

    fig.add_trace(
        go.Scatter(
            x=stsb_test[score_cols[0]],
            y=stsb_test[score],
            mode='markers',
        ),
        row=row+1, col=col+1
    )

fig.update_layout(height=700, width=1000,
                  title_text='Spearman Rank Correlation (ρ × 100)',
                  showlegend=False)
fig.show()
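The np.argwhere lookup is what maps the flat subplot index onto the 4×3 grid; a small illustrative check (the index value is made up for the example):

np.argwhere(plot_array == 5)[0]  # -> array([1, 2]): index 5 sits at grid row 1, col 2, i.e. row=2, col=3 after the +1 for plotly's 1-based indexing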
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 1 addition and 0 deletions.
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 5 additions and 1 deletion.
score_cols = [col for col in stsb_test.columns if '_score' in col]

# Spearman Rank Correlation
spearman_rank_corr = stsb_test[score_cols].corr(method='spearman').iloc[1:, 0:1]*100
spearman_rank_corr.head(10)
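As a sanity check, the correlation for any single method can be reproduced with scipy; a sketch, assuming the gold-label column is named similarity_score (as in stsb_multi_mt) and using the TFIDF_cosine_score column computed in one of the other snippets in this gist as an example:

from scipy.stats import spearmanr

rho, _ = spearmanr(stsb_test['similarity_score'], stsb_test['TFIDF_cosine_score'])
print(rho * 100)  # should match the corresponding entry in spearman_rank_corr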
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 1 addition and 0 deletions.
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 29 additions and 1 deletion.
import openai
import os
import pickle

openai.api_key = 'update_your_openai_API_key_here'

if os.path.exists('../data/nlp/davinci_emb.pkl'):
    print('Loading Davinci Embeddings')
    with open('../data/nlp/davinci_emb.pkl', 'rb') as f:
        davinci_emb = pickle.load(f)
else:
    print('Querying Davinci Embeddings')
    davinci_emb = {}
    engine = 'text-similarity-davinci-001'
    unique_sentences = list(set(stsb_test['sentence1'].values.tolist()
                                + stsb_test['sentence2'].values.tolist()))

    for sentence in tqdm(unique_sentences):
        if sentence not in davinci_emb.keys():
            davinci_emb[sentence] = openai.Embedding.create(input=[sentence],
                                                            engine=engine)['data'][0]['embedding']

    # Save embeddings to file
    with open('../data/nlp/davinci_emb.pkl', 'wb') as f:
        pickle.dump(davinci_emb, f)

# Generate Embeddings
sentence1_emb = [davinci_emb[sentence] for sentence in stsb_test['sentence1']]
sentence2_emb = [davinci_emb[sentence] for sentence in stsb_test['sentence2']]

# Cosine Similarity
stsb_test['OpenAI Davinci_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
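Note that openai.Embedding.create is the pre-1.0 OpenAI SDK interface that was current when this revision was written; the Feb 17, 2024 revision at the top of this page switches to the client-based API, and the davinci similarity engine has since been deprecated in favour of the text-embedding models used there. A rough sketch of the equivalent call under the newer SDK (model name is one of the current embedding models, not the original engine):

client = openai.OpenAI(api_key='update_your_openai_API_key_here')
response = client.embeddings.create(input=[sentence], model='text-embedding-3-small')
embedding = response.data[0].embedding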
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 1 addition and 0 deletions.
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 22 additions and 1 deletion.
########## Supervised ##########
# Load the pre-trained model
model = SentenceTransformer('princeton-nlp/sup-simcse-roberta-large')

# Generate Embeddings
sentence1_emb = model.encode(stsb_test['sentence1'], show_progress_bar=True)
sentence2_emb = model.encode(stsb_test['sentence2'], show_progress_bar=True)

# Cosine Similarity
stsb_test['SimCSE Supervised_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)

########## Un-Supervised ##########
# Load the pre-trained model
model = SentenceTransformer('princeton-nlp/unsup-simcse-roberta-large')

# Generate Embeddings
sentence1_emb = model.encode(stsb_test['sentence1'], show_progress_bar=True)
sentence2_emb = model.encode(stsb_test['sentence2'], show_progress_bar=True)

# Cosine Similarity
stsb_test['SimCSE Unsupervised_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 1 addition and 0 deletions.
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 11 additions and 1 deletion.
from sentence_transformers import SentenceTransformer

# Load the pre-trained model
model = SentenceTransformer('stsb-mpnet-base-v2')

# Generate Embeddings
sentence1_emb = model.encode(stsb_test['sentence1'], show_progress_bar=True)
sentence2_emb = model.encode(stsb_test['sentence2'], show_progress_bar=True)

# Cosine Similarity
stsb_test['SBERT BiEncoder_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
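A quick single-pair sanity check, assuming the cos_sim helper defined elsewhere in this gist (the example sentences are illustrative, not drawn from STS-B):

emb_a = model.encode(['A plane is taking off.'])
emb_b = model.encode(['An air plane is taking off.'])
print(cos_sim(emb_a, emb_b))  # expect a value close to 1.0 for near-paraphrases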
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 1 addition and 0 deletions.
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 10 additions and 1 deletion.
from sentence_transformers import CrossEncoder

# Load the pre-trained model
model = CrossEncoder('cross-encoder/stsb-roberta-base')

sentence_pairs = []
for sentence1, sentence2 in zip(stsb_test['sentence1'], stsb_test['sentence2']):
    sentence_pairs.append([sentence1, sentence2])

stsb_test['SBERT CrossEncoder_score'] = model.predict(sentence_pairs, show_progress_bar=True)
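Unlike the bi-encoder, the cross-encoder scores each sentence pair jointly rather than comparing two independently computed embeddings, so there is no cos_sim step here. A minimal usage sketch with made-up sentences:

score = model.predict([['A man is eating food.', 'A man is eating a piece of bread.']])
print(score)  # a single similarity score for the pair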
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 1 addition and 0 deletions.
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 18 additions and 1 deletion.
import tensorflow as tf
import tensorflow_hub as hub

# Load the pre-trained model
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    # Control GPU memory usage
    tf.config.experimental.set_memory_growth(gpu, True)

module_url = 'https://tfhub.dev/google/universal-sentence-encoder/4'
model = hub.load(module_url)

# Generate Embeddings
sentence1_emb = model(stsb_test['sentence1']).numpy()
sentence2_emb = model(stsb_test['sentence2']).numpy()

# Cosine Similarity
stsb_test['USE_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
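A quick shape check, assuming the model loaded above (the Universal Sentence Encoder v4 produces a 512-dimensional vector per sentence):

emb = model(['Hello world']).numpy()
print(emb.shape)  # (1, 512)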
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 1 addition and 0 deletions.
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 16 additions and 1 deletion.
import gensim.downloader as api

# Load the pre-trained model
model = api.load('fasttext-wiki-news-subwords-300')

def word_movers_distance(row):
    # Text Processing
    sentence1 = text_processing(row['sentence1'])
    sentence2 = text_processing(row['sentence2'])

    # Negative Word Movers Distance
    return -model.wmdistance(sentence1, sentence2)

# Negative Word Movers Distance
stsb_test['NegWMD_score'] = stsb_test.progress_apply(word_movers_distance, axis=1)
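wmdistance operates on lists of tokens (hence the text_processing call) and relies on the pyemd package installed in the setup step. Identical token lists give a distance of 0 and larger values mean less similar, which is why the distance is negated so that higher scores mean more similar, consistent with the other methods. An illustrative call with made-up tokens:

model.wmdistance(['man', 'play', 'guitar'], ['man', 'play', 'piano'])  # small positive distance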
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 1 addition and 0 deletions.
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 13 additions and 1 deletion.
from sklearn.feature_extraction.text import TfidfVectorizer

model = TfidfVectorizer(lowercase=True, stop_words='english')

# Train the model
X_train = pd.concat([stsb_train['sentence1'], stsb_train['sentence2']]).unique()
model.fit(X_train)

# Generate Embeddings on Test
sentence1_emb = model.transform(stsb_test['sentence1'])
sentence2_emb = model.transform(stsb_test['sentence2'])

# Cosine Similarity
stsb_test['TFIDF_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
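The transform calls return scipy sparse matrices, which sklearn's cosine_similarity (used inside the cos_sim helper) accepts directly. A small sketch of embedding a single new sentence with the fitted vectorizer (the query text is illustrative):

query_emb = model.transform(['A man is playing a guitar'])
print(query_emb.shape)  # (1, vocabulary size learned from the training sentences)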
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 1 addition and 0 deletions.
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 13 additions and 1 deletion.
import textdistance

def jaccard_sim(row):
    # Text Processing
    sentence1 = text_processing(row['sentence1'])
    sentence2 = text_processing(row['sentence2'])

    # Jaccard similarity
    return textdistance.jaccard.normalized_similarity(sentence1, sentence2)

# Jaccard Similarity
stsb_test['Jaccard_score'] = stsb_test.progress_apply(jaccard_sim, axis=1)
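A quick check of the metric on token lists (the example tokens are illustrative, not from the dataset):

textdistance.jaccard.normalized_similarity(['a', 'cat', 'sits'], ['a', 'dog', 'sits'])
# -> 0.5: 2 shared tokens out of 4 distinct tokens across both lists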
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 1 addition and 0 deletions.
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 36 additions and 1 deletion.
from sklearn.metrics.pairwise import cosine_similarity
import spacy

nlp = spacy.load("en_core_web_sm")

def text_processing(sentence):
    """
    Lemmatize, lowercase, remove numbers and stop words

    Args:
        sentence: The sentence we want to process.

    Returns:
        A list of processed words
    """
    sentence = [token.lemma_.lower()
                for token in nlp(sentence)
                if token.is_alpha and not token.is_stop]

    return sentence

def cos_sim(sentence1_emb, sentence2_emb):
    """
    Cosine similarity between two columns of sentence embeddings

    Args:
        sentence1_emb: sentence1 embedding column
        sentence2_emb: sentence2 embedding column

    Returns:
        The row-wise cosine similarity between the two columns.
        For instance, if sentence1_emb=[a,b,c] and sentence2_emb=[x,y,z],
        then the result is [cosine_similarity(a,x), cosine_similarity(b,y), cosine_similarity(c,z)]
    """
    cos_sim = cosine_similarity(sentence1_emb, sentence2_emb)
    return np.diag(cos_sim)
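A tiny usage example for cos_sim with toy 2-D vectors (not real embeddings), showing that it returns one similarity per row pair rather than the full pairwise matrix:

a = np.array([[1.0, 0.0], [0.0, 1.0]])
b = np.array([[1.0, 0.0], [1.0, 0.0]])
print(cos_sim(a, b))  # [1. 0.]: a[0] vs b[0], then a[1] vs b[1]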
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 1 addition and 0 deletions.
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 15 additions and 1 deletion.
# Imports
from datasets import load_dataset
import pandas as pd
import numpy as np
from tqdm import tqdm

tqdm.pandas()

# Load the English STSB dataset
stsb_dataset = load_dataset('stsb_multi_mt', 'en')
stsb_train = pd.DataFrame(stsb_dataset['train'])
stsb_test = pd.DataFrame(stsb_dataset['test'])

# Check loaded data
print(stsb_train.shape, stsb_test.shape)
stsb_test.head()
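The loaded frames contain sentence1, sentence2, and a gold similarity_score label on a 0 to 5 scale (column name and scale assumed from the stsb_multi_mt dataset card); a quick look at the label distribution:

print(stsb_test['similarity_score'].describe())  # values should fall between 0 and 5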
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 1 addition and 0 deletions.
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 19 additions and 1 deletion.
# Create the necessary directories
mkdir -p semantic_similarity/notebooks semantic_similarity/data/nlp

# Create and activate a conda environment
conda create -n semantic_similarity python=3.8
conda activate semantic_similarity

## Create Virtual Environment using venv if not using conda
# python -m venv semantic_similarity
# source semantic_similarity/bin/activate

# Pip install the necessary libraries
pip install -U jupyterlab pandas datasets matplotlib plotly scikit-learn tqdm ipywidgets
pip install -U numpy spacy textdistance fasttext gensim
pip install -U tensorflow tensorflow_hub sentence-transformers openai
conda install pyemd

# Download the Spacy Model
python -m spacy download en_core_web_sm
stephenleo revised this gist
Apr 17, 2022 · 1 changed file with 1 addition and 0 deletions.