Skip to content

Instantly share code, notes, and snippets.

@stephenleo
Last active February 17, 2024 08:32
Show Gist options
  • Select an option

  • Save stephenleo/b1b09bb70443d98b01e96332f6f687d2 to your computer and use it in GitHub Desktop.

Select an option

Save stephenleo/b1b09bb70443d98b01e96332f6f687d2 to your computer and use it in GitHub Desktop.

Revisions

  1. stephenleo revised this gist Feb 17, 2024. 1 changed file with 38 additions and 26 deletions.
    64 changes: 38 additions & 26 deletions 11_openai.py
    Original file line number Diff line number Diff line change
    @@ -1,29 +1,41 @@
    import openai
    import os
    import pickle
    # NOTE: placeholder credential; replace it with a real key before running.
    openai.api_key = 'update_your_openai_API_key_here'

    # Reuse the cached embeddings when the pickle exists; otherwise query the API
    # once per distinct sentence and cache the result.
    # NOTE(review): only load pickle files you created yourself; unpickling
    # untrusted data can execute arbitrary code.
    if os.path.exists('../data/nlp/davinci_emb.pkl'):
        print('Loading Davinci Embeddings')
        with open('../data/nlp/davinci_emb.pkl', 'rb') as f:
            davinci_emb = pickle.load(f)
    else:
        print('Querying Davinci Embeddings')
        davinci_emb = {}
        engine = 'text-similarity-davinci-001'

        # De-duplicate across both columns so each sentence is requested only once.
        unique_sentences = list(set(stsb_test['sentence1'].values.tolist() + stsb_test['sentence2'].values.tolist()))
        for sentence in tqdm(unique_sentences):
            if sentence not in davinci_emb:
                response = openai.Embedding.create(input=[sentence], engine=engine)
                davinci_emb[sentence] = response['data'][0]['embedding']

        # Save embeddings to file (written only after every sentence was embedded)
        with open('../data/nlp/davinci_emb.pkl', 'wb') as f:
            pickle.dump(davinci_emb, f)

    # Generate Embeddings: look up the cached vector for every row, in row order
    sentence1_emb = [davinci_emb[sentence] for sentence in stsb_test['sentence1']]
    sentence2_emb = [davinci_emb[sentence] for sentence in stsb_test['sentence2']]

    # Cosine Similarity
    stsb_test['OpenAI Davinci_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)

    # NOTE: placeholder credential; replace it with a real key before running.
    client = openai.OpenAI(api_key='update_your_openai_API_key_here')

    # Suffixes appended to "text-embedding-" to form the model names queried below.
    models = ["ada-002", "3-small", "3-large"]

    for model in models:
        # One cache file per model, stored in the current working directory.
        cache_path = f"{model}.pkl"

        # NOTE(review): only load pickle files you created yourself; unpickling
        # untrusted data can execute arbitrary code.
        if os.path.exists(cache_path):
            print(f"Loading OpenAI {model} Embeddings")
            with open(cache_path, "rb") as f:
                openai_emb = pickle.load(f)

        else:
            print(f"Querying OpenAI {model} Embeddings")
            openai_emb = {}

            # De-duplicate across both columns so each sentence is requested once.
            unique_sentences = list(
                set(
                    stsb_test["sentence1"].values.tolist()
                    + stsb_test["sentence2"].values.tolist()
                )
            )

            for sentence in tqdm(unique_sentences):
                if sentence not in openai_emb:
                    response = client.embeddings.create(
                        input=sentence, model=f"text-embedding-{model}"
                    )
                    openai_emb[sentence] = response.data[0].embedding

            # Written only after every sentence was embedded.
            with open(cache_path, "wb") as f:
                pickle.dump(openai_emb, f)

        # Generate Embeddings: look up the cached vector for every row, in row order
        sentence1_emb = [openai_emb[sentence] for sentence in stsb_test["sentence1"]]
        sentence2_emb = [openai_emb[sentence] for sentence in stsb_test["sentence2"]]

        # Cosine Similarity (one score column per model)
        stsb_test[f"OpenAI {model}_cosine_score"] = cos_sim(sentence1_emb, sentence2_emb)
  2. stephenleo renamed this gist Apr 29, 2022. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  3. stephenleo revised this gist Apr 29, 2022. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions 00_sota_sts.md
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,3 @@
    # State of the Art Semantic Textual Similarity
    # Semantic Textual Similarity

    Code for the Medium post [Link]()
    Code for the Medium post [Link](https://towardsdatascience.com/semantic-textual-similarity-83b3ca4a840e?sk=8389935eda3449a172a5905b53150d30)
  4. stephenleo revised this gist Apr 19, 2022. 1 changed file with 0 additions and 4 deletions.
    4 changes: 0 additions & 4 deletions 01_setup.sh
    Original file line number Diff line number Diff line change
    @@ -5,10 +5,6 @@ mkdir -p semantic_similarity/notebooks semantic_similarity/data/nlp
    conda create -n semantic_similarity python=3.8
    conda activate semantic_similarity

    ## Create Virtual Environment using venv if not using conda
    # python -m venv semantic_similarity
    # source semantic_similarity/bin/activate

    # Pip install the necessary libraries
    pip install -U jupyterlab pandas datasets matplotlib plotly scikit-learn tqdm ipywidgets
    pip install -U numpy spacy textdistance fasttext gensim
  5. stephenleo revised this gist Apr 17, 2022. 1 changed file with 25 additions and 1 deletion.
    26 changes: 25 additions & 1 deletion 13_plot.py
    Original file line number Diff line number Diff line change
    @@ -1 +1,25 @@
    ‎‎​
    from plotly.subplots import make_subplots
    import plotly.graph_objects as go

    # Subplot grid dimensions; the grid holds at most nrows * ncols scatter plots.
    nrows = 4
    ncols = 3
    # plot_array[r, c] holds the flat subplot index shown at grid cell (r, c).
    plot_array = np.arange(0, nrows*ncols).reshape(nrows, ncols)

    # Title per subplot: the text before the first "_" of the score column name,
    # followed by that column's correlation value with two decimals.
    subplot_titles = [f'{row.Index.split("_")[0]}: {row.similarity_score:.2f}' for row in spearman_rank_corr.itertuples()]
    fig = make_subplots(rows=nrows, cols=ncols, subplot_titles=subplot_titles)

    for index, score in enumerate(spearman_rank_corr.index):
        # Translate the flat index into a zero-based (row, col) grid position.
        row, col = np.argwhere(plot_array == index)[0]

        fig.add_trace(
            go.Scatter(
                # x is always the first score column; y is the column being compared.
                x=stsb_test[score_cols[0]],
                y=stsb_test[score],
                mode='markers',
            ),
            # make_subplots positions are one-based.
            row=row+1, col=col+1
        )


    fig.update_layout(height=700, width=1000, title_text='Spearman Rank Correlation (ρ × 100)', showlegend=False)
    fig.show()
  6. stephenleo revised this gist Apr 17, 2022. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions 13_plot.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1 @@
    ‎‎​
  7. stephenleo revised this gist Apr 17, 2022. 1 changed file with 5 additions and 1 deletion.
    6 changes: 5 additions & 1 deletion 12_spearman_rank_correlation.py
    Original file line number Diff line number Diff line change
    @@ -1 +1,5 @@
    ‎‎​
    # Every column whose name contains "_score" takes part in the comparison.
    score_cols = [name for name in stsb_test.columns if '_score' in name]

    # Spearman Rank Correlation, scaled by 100: keep only the first column of the
    # correlation matrix and drop its first row (that column against itself).
    spearman_rank_corr = stsb_test[score_cols].corr(method='spearman').iloc[1:, :1] * 100
    spearman_rank_corr.head(10)
  8. stephenleo revised this gist Apr 17, 2022. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions 12_spearman_rank_correlation.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1 @@
    ‎‎​
  9. stephenleo revised this gist Apr 17, 2022. 1 changed file with 29 additions and 1 deletion.
    30 changes: 29 additions & 1 deletion 11_openai.py
    Original file line number Diff line number Diff line change
    @@ -1 +1,29 @@
    ‎‎​
    import openai
    import os
    import pickle
    # NOTE: placeholder credential; replace it with a real key before running.
    openai.api_key = 'update_your_openai_API_key_here'

    # Reuse the cached embeddings when the pickle exists; otherwise query the API
    # once per distinct sentence and cache the result.
    # NOTE(review): only load pickle files you created yourself; unpickling
    # untrusted data can execute arbitrary code.
    if os.path.exists('../data/nlp/davinci_emb.pkl'):
        print('Loading Davinci Embeddings')
        with open('../data/nlp/davinci_emb.pkl', 'rb') as f:
            davinci_emb = pickle.load(f)
    else:
        print('Querying Davinci Embeddings')
        davinci_emb = {}
        engine = 'text-similarity-davinci-001'

        # De-duplicate across both columns so each sentence is requested only once.
        unique_sentences = list(set(stsb_test['sentence1'].values.tolist() + stsb_test['sentence2'].values.tolist()))
        for sentence in tqdm(unique_sentences):
            if sentence not in davinci_emb:
                response = openai.Embedding.create(input=[sentence], engine=engine)
                davinci_emb[sentence] = response['data'][0]['embedding']

        # Save embeddings to file (written only after every sentence was embedded)
        with open('../data/nlp/davinci_emb.pkl', 'wb') as f:
            pickle.dump(davinci_emb, f)

    # Generate Embeddings: look up the cached vector for every row, in row order
    sentence1_emb = [davinci_emb[sentence] for sentence in stsb_test['sentence1']]
    sentence2_emb = [davinci_emb[sentence] for sentence in stsb_test['sentence2']]

    # Cosine Similarity
    stsb_test['OpenAI Davinci_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
  10. stephenleo revised this gist Apr 17, 2022. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions 11_openai.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1 @@
    ‎‎​
  11. stephenleo revised this gist Apr 17, 2022. 1 changed file with 22 additions and 1 deletion.
    23 changes: 22 additions & 1 deletion 10_simcse.py
    Original file line number Diff line number Diff line change
    @@ -1 +1,22 @@
    ‎‎​
    ########## Supervised, then Un-Supervised ##########
    # Both checkpoints go through the same encode-and-score steps, so they are
    # processed in one loop; each writes its own score column.
    for checkpoint, score_column in (
        ('princeton-nlp/sup-simcse-roberta-large', 'SimCSE Supervised_cosine_score'),
        ('princeton-nlp/unsup-simcse-roberta-large', 'SimCSE Unsupervised_cosine_score'),
    ):
        # Load the pre-trained model
        model = SentenceTransformer(checkpoint)

        # Generate Embeddings
        sentence1_emb = model.encode(stsb_test['sentence1'], show_progress_bar=True)
        sentence2_emb = model.encode(stsb_test['sentence2'], show_progress_bar=True)

        # Cosine Similarity
        stsb_test[score_column] = cos_sim(sentence1_emb, sentence2_emb)
  12. stephenleo revised this gist Apr 17, 2022. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions 10_simcse.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1 @@
    ‎‎​
  13. stephenleo revised this gist Apr 17, 2022. 1 changed file with 11 additions and 1 deletion.
    12 changes: 11 additions & 1 deletion 09_bi_encoder.py
    Original file line number Diff line number Diff line change
    @@ -1 +1,11 @@
    ‎‎​
    from sentence_transformers import SentenceTransformer

    # Load the pre-trained model
    model = SentenceTransformer('stsb-mpnet-base-v2')

    # Generate Embeddings for both sentence columns (sentence1 first, then sentence2)
    sentence1_emb, sentence2_emb = (
        model.encode(stsb_test[column], show_progress_bar=True)
        for column in ('sentence1', 'sentence2')
    )

    # Cosine Similarity
    stsb_test['SBERT BiEncoder_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
  14. stephenleo revised this gist Apr 17, 2022. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions 09_bi_encoder.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1 @@
    ‎‎​
  15. stephenleo revised this gist Apr 17, 2022. 1 changed file with 10 additions and 1 deletion.
    11 changes: 10 additions & 1 deletion 08_cross_encoder.py
    Original file line number Diff line number Diff line change
    @@ -1 +1,10 @@
    ‎‎​
    from sentence_transformers import CrossEncoder

    # Load the pre-trained model
    model = CrossEncoder('cross-encoder/stsb-roberta-base')

    # The model is given each row's two sentences together as one
    # [sentence1, sentence2] pair, in row order.
    sentence_pairs = [
        [sentence1, sentence2]
        for sentence1, sentence2 in zip(stsb_test['sentence1'], stsb_test['sentence2'])
    ]

    # The predicted value for each pair is stored directly as the score.
    stsb_test['SBERT CrossEncoder_score'] = model.predict(sentence_pairs, show_progress_bar=True)
  16. stephenleo revised this gist Apr 17, 2022. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions 08_cross_encoder.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1 @@
    ‎‎​
  17. stephenleo revised this gist Apr 17, 2022. 1 changed file with 18 additions and 1 deletion.
    19 changes: 18 additions & 1 deletion 07_use.py
    Original file line number Diff line number Diff line change
    @@ -1 +1,18 @@
    ‎‎​
    import tensorflow as tf
    import tensorflow_hub as hub

    # Control GPU memory usage: enable memory growth on every visible GPU
    # before the model is loaded.
    gpus = tf.config.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

    # Load the pre-trained model
    module_url = 'https://tfhub.dev/google/universal-sentence-encoder/4'
    model = hub.load(module_url)

    # Generate Embeddings (converted to NumPy arrays for cos_sim)
    sentence1_emb = model(stsb_test['sentence1']).numpy()
    sentence2_emb = model(stsb_test['sentence2']).numpy()

    # Cosine Similarity
    stsb_test['USE_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
  18. stephenleo revised this gist Apr 17, 2022. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions 07_use.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1 @@
    ‎‎​
  19. stephenleo revised this gist Apr 17, 2022. 1 changed file with 16 additions and 1 deletion.
    17 changes: 16 additions & 1 deletion 06_wmd.py
    Original file line number Diff line number Diff line change
    @@ -1 +1,16 @@
    ‎‎​
    import gensim.downloader as api

    # Load the pre-trained model
    model = api.load('fasttext-wiki-news-subwords-300')

    def word_movers_distance(row):
        """
        Negative Word Mover's Distance between the two sentences of one row

        Args:
            row: A record with 'sentence1' and 'sentence2' entries.

        Returns:
            The negated value of model.wmdistance on the processed sentences.
        """
        # Text Processing
        sentence1 = text_processing(row['sentence1'])
        sentence2 = text_processing(row['sentence2'])

        # Negative Word Movers Distance
        # NOTE(review): negated presumably so that a larger value means
        # "more similar", like the other score columns - confirm.
        return -model.wmdistance(sentence1, sentence2)


    # Negative Word Movers Distance, computed row by row with a progress bar
    stsb_test['NegWMD_score'] = stsb_test.progress_apply(word_movers_distance, axis=1)
  20. stephenleo revised this gist Apr 17, 2022. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions 06_wmd.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1 @@
    ‎‎​
  21. stephenleo revised this gist Apr 17, 2022. 1 changed file with 13 additions and 1 deletion.
    14 changes: 13 additions & 1 deletion 05_tfidf.py
    Original file line number Diff line number Diff line change
    @@ -1 +1,13 @@
    ‎‎​
    from sklearn.feature_extraction.text import TfidfVectorizer
    # Train the model on every distinct sentence of the training split
    X_train = pd.concat([stsb_train['sentence1'], stsb_train['sentence2']]).unique()
    model = TfidfVectorizer(lowercase=True, stop_words='english').fit(X_train)

    # Generate Embeddings on Test
    sentence1_emb = model.transform(stsb_test['sentence1'])
    sentence2_emb = model.transform(stsb_test['sentence2'])

    # Cosine Similarity
    stsb_test['TFIDF_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
  22. stephenleo revised this gist Apr 17, 2022. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions 05_tfidf.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1 @@
    ‎‎​
  23. stephenleo revised this gist Apr 17, 2022. 1 changed file with 13 additions and 1 deletion.
    14 changes: 13 additions & 1 deletion 04_jaccard.py
    Original file line number Diff line number Diff line change
    @@ -1 +1,13 @@
    ‎‎​
    import textdistance

    def jaccard_sim(row):
        """
        Jaccard similarity between the two sentences of one row

        Args:
            row: A record with 'sentence1' and 'sentence2' entries.

        Returns:
            The normalized Jaccard similarity of the two processed word lists.
        """
        # Text Processing
        sentence1 = text_processing(row['sentence1'])
        sentence2 = text_processing(row['sentence2'])

        # Jaccard similarity
        return textdistance.jaccard.normalized_similarity(sentence1, sentence2)


    # Jaccard Similarity, computed row by row with a progress bar
    stsb_test['Jaccard_score'] = stsb_test.progress_apply(jaccard_sim, axis=1)
  24. stephenleo revised this gist Apr 17, 2022. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions 04_jaccard.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1 @@
    ‎‎​
  25. stephenleo revised this gist Apr 17, 2022. 1 changed file with 36 additions and 1 deletion.
    37 changes: 36 additions & 1 deletion 03_helpers.py
    Original file line number Diff line number Diff line change
    @@ -1 +1,36 @@
    ‎‎​
    from sklearn.metrics.pairwise import cosine_similarity
    import spacy
    nlp = spacy.load("en_core_web_sm")

    def text_processing(sentence):
        """
        Lemmatize and lowercase a sentence, dropping non-alphabetic tokens
        (such as numbers) and stop words

        Args:
            sentence: The sentence we want to process.

        Returns:
            A list of processed words
        """
        # Build the result under its own expression instead of rebinding the
        # `sentence` parameter from a string to a list.
        return [
            token.lemma_.lower()
            for token in nlp(sentence)
            if token.is_alpha and not token.is_stop
        ]


    def cos_sim(sentence1_emb, sentence2_emb):
        """
        Cosine similarity between two columns of sentence embeddings

        Args:
            sentence1_emb: sentence1 embedding column
            sentence2_emb: sentence2 embedding column

        Returns:
            The row-wise cosine similarity between the two columns.
            For instance, if sentence1_emb=[a,b,c] and sentence2_emb=[x,y,z]
            then the result is [cosine_similarity(a,x), cosine_similarity(b,y), cosine_similarity(c,z)]
        """
        # cosine_similarity scores every row of the first input against every
        # row of the second; the matching (i, i) pairs are on the diagonal.
        # NOTE: this computes all pairs and keeps only the diagonal, so cost
        # grows with the product of the two input lengths.
        pairwise_sim = cosine_similarity(sentence1_emb, sentence2_emb)
        return np.diag(pairwise_sim)
  26. stephenleo revised this gist Apr 17, 2022. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions 03_helpers.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1 @@
    ‎‎​
  27. stephenleo revised this gist Apr 17, 2022. 1 changed file with 15 additions and 1 deletion.
    16 changes: 15 additions & 1 deletion 02_download_data.py
    Original file line number Diff line number Diff line change
    @@ -1 +1,15 @@
    ‎‎​
    # Imports
    from datasets import load_dataset
    import pandas as pd
    import numpy as np
    from tqdm import tqdm
    # Enable the tqdm integration for pandas
    tqdm.pandas()

    # Load the English STSB dataset and turn its train/test splits into DataFrames
    stsb_dataset = load_dataset('stsb_multi_mt', 'en')
    stsb_train, stsb_test = (
        pd.DataFrame(stsb_dataset[split]) for split in ('train', 'test')
    )

    # Check loaded data
    print(stsb_train.shape, stsb_test.shape)
    stsb_test.head()
  28. stephenleo revised this gist Apr 17, 2022. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions 02_download_data.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1 @@
    ‎‎​
  29. stephenleo revised this gist Apr 17, 2022. 1 changed file with 19 additions and 1 deletion.
    20 changes: 19 additions & 1 deletion 01_setup.sh
    Original file line number Diff line number Diff line change
    @@ -1 +1,19 @@
    ‎‎​
    # Create the necessary directories (notebooks and data/nlp under semantic_similarity/)
    mkdir -p semantic_similarity/notebooks semantic_similarity/data/nlp

    # Create and activate a conda environment named semantic_similarity (Python 3.8)
    conda create -n semantic_similarity python=3.8
    conda activate semantic_similarity

    ## Create Virtual Environment using venv if not using conda
    ## (alternative to the two conda commands above; uncomment to use)
    # python -m venv semantic_similarity
    # source semantic_similarity/bin/activate

    # Pip install the necessary libraries (-U upgrades packages that are already installed)
    pip install -U jupyterlab pandas datasets matplotlib plotly scikit-learn tqdm ipywidgets
    pip install -U numpy spacy textdistance fasttext gensim
    pip install -U tensorflow tensorflow_hub sentence-transformers openai
    # NOTE(review): pyemd is installed through conda rather than pip; presumably
    # it is needed for the Word Mover's Distance step - confirm.
    conda install pyemd

    # Download the Spacy Model (the en_core_web_sm English pipeline)
    python -m spacy download en_core_web_sm
  30. stephenleo revised this gist Apr 17, 2022. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions 01_setup.sh
    Original file line number Diff line number Diff line change
    @@ -0,0 +1 @@
    ‎‎​