Skip to content

Instantly share code, notes, and snippets.

@amir-saniyan
Last active October 26, 2024 12:47
Show Gist options
  • Select an option

  • Save amir-saniyan/e102de09b01c4ed1632e3d1a1a1cbf64 to your computer and use it in GitHub Desktop.

Select an option

Save amir-saniyan/e102de09b01c4ed1632e3d1a1a1cbf64 to your computer and use it in GitHub Desktop.

Revisions

  1. amir-saniyan revised this gist Oct 26, 2024. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions Embedding Similarity Measurement.md
    Original file line number Diff line number Diff line change
    @@ -178,7 +178,7 @@ float CalculateAngularSimilarity(const std::vector<float>& embedding1, const std
    }
    ```

    # Python Version using Scipy
    ## Python Version using Scipy

    ```python
    import math
    @@ -207,7 +207,7 @@ def calculate_angular_similarity(a, b):
    return angular_similarity
    ```

    # Python Version using Numpy
    ## Python Version using Numpy

    ```python
    import math
  2. amir-saniyan revised this gist Oct 26, 2024. 1 changed file with 32 additions and 1 deletion.
    33 changes: 32 additions & 1 deletion Embedding Similarity Measurement.md
    Original file line number Diff line number Diff line change
    @@ -178,7 +178,7 @@ float CalculateAngularSimilarity(const std::vector<float>& embedding1, const std
    }
    ```

    # Python Version
    # Python Version using Scipy

    ```python
    import math
    @@ -207,6 +207,37 @@ def calculate_angular_similarity(a, b):
    return angular_similarity
    ```

    # Python Version using Numpy

    ```python
    import math

    import numpy as np


    def calculate_cosine_similarity(a, b):
        """Return the cosine similarity of ``a`` and ``b``.

        The value is ``np.dot(a, b)`` divided by the product of the two
        ``np.linalg.norm`` magnitudes. No check is made for zero-magnitude
        inputs, so the division is then by zero.
        """
        dot_product = np.dot(a, b)
        magnitude_a = np.linalg.norm(a)
        magnitude_b = np.linalg.norm(b)
        cosine_similarity = dot_product / (magnitude_a * magnitude_b)
        return cosine_similarity

    def calculate_cosine_distance(a, b):
        """Return the cosine distance of ``a`` and ``b``: ``1 - cosine similarity``."""
        cosine_similarity = calculate_cosine_similarity(a, b)
        cosine_distance = 1 - cosine_similarity
        return cosine_distance

    def calculate_angular_similarity(a, b):
        """Return the angular similarity of ``a`` and ``b``.

        Computed as ``1 - acos(cosine_similarity) / pi``.
        """
        cosine_similarity = calculate_cosine_similarity(a, b)
        # Floating-point rounding can leave the cosine a hair outside [-1, 1]
        # (for example for parallel vectors), and math.acos then raises
        # "ValueError: math domain error". Explicit comparisons are used so a
        # NaN input is passed through unchanged instead of being replaced.
        if cosine_similarity > 1.0:
            cosine_similarity = 1.0
        elif cosine_similarity < -1.0:
            cosine_similarity = -1.0
        angular_similarity = 1 - (math.acos(cosine_similarity) / math.pi)
        return angular_similarity

    def calculate_angular_distance(a, b):
        """Return the angular distance of ``a`` and ``b``: ``1 - angular similarity``."""
        angular_similarity = calculate_angular_similarity(a, b)
        angular_distance = 1 - angular_similarity
        return angular_distance
    ```

    ## Similarity Search Using Tensorflow

    ```python
  3. amir-saniyan revised this gist Nov 29, 2022. 1 changed file with 52 additions and 0 deletions.
    52 changes: 52 additions & 0 deletions Embedding Similarity Measurement.md
    Original file line number Diff line number Diff line change
    @@ -206,3 +206,55 @@ def calculate_angular_similarity(a, b):
    angular_similarity = 1 - calculate_angular_distance(a, b)
    return angular_similarity
    ```

    ## Similarity Search Using Tensorflow

    ```python
    import time

    import numpy as np # np.__version__ == '1.23.5'
    import tensorflow as tf # tf.__version__ == '2.11.0'

    EMBEDDINGS_LENGTH = 512
    NUMBER_OF_EMBEDDINGS = 1000 * 1000


    def calculate_cosine_similarities(x, embeddings):
        """Return the cosine similarities between ``x`` and ``embeddings`` as a NumPy array.

        The result of ``tf.keras.losses.cosine_similarity`` is multiplied by -1
        before being converted with ``.numpy()``.
        """
        # NOTE(review): per the TensorFlow docs the Keras loss is the negated
        # cosine similarity, which is why the sign is flipped here - confirm
        # against the installed TensorFlow version.
        cosine_similarities = -1 * tf.keras.losses.cosine_similarity(x, embeddings)
        return cosine_similarities.numpy()


    def find_closest_embeddings(x, embeddings, top_k=1):
        """Return the ``top_k`` largest cosine similarities to ``x`` and their indices.

        Returns a ``(values, indices)`` pair of NumPy arrays obtained from
        ``tf.math.top_k`` over the output of ``calculate_cosine_similarities``.
        """
        cosine_similarities = calculate_cosine_similarities(x, embeddings)
        values, indices = tf.math.top_k(cosine_similarities, k=top_k)
        return values.numpy(), indices.numpy()


    def main():
        """Time ``find_closest_embeddings`` on random data and print the results.

        Builds one random query vector and a random embedding matrix, runs the
        search ``n`` times, and prints each duration plus the average.
        """
        # x shape: (512)
        # Embeddings shape: (1000000, 512)
        x = np.random.rand(EMBEDDINGS_LENGTH).astype(np.float32)
        embeddings = np.random.rand(NUMBER_OF_EMBEDDINGS, EMBEDDINGS_LENGTH).astype(np.float32)

        print('Embeddings shape: ', embeddings.shape)

        n = 100
        sum_duration = 0
        # NOTE(review): the loop extent was reconstructed because the original
        # indentation was lost; the per-iteration print is assumed to belong
        # inside the loop - confirm.
        for i in range(n):
            start = time.time()
            best_values, best_indices = find_closest_embeddings(x, embeddings, top_k=1)
            end = time.time()

            duration = end - start
            sum_duration += duration

            print('Duration (seconds): {}, Best value: {}, Best index: {}'.format(duration, best_values[0], best_indices[0]))

        # Average duration (seconds): 1.707 for Intel(R) Core(TM) i7-10700 CPU @ 2.90GHz
        # Average duration (seconds): 0.961 for NVIDIA 1080 ti
        print('Average duration (seconds): ', sum_duration / n)


    # Run the benchmark only when this file is executed as a script.
    if __name__ == '__main__':
        main()
    ```
  4. amir-saniyan revised this gist May 16, 2021. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion Embedding Similarity Measurement.md
    Original file line number Diff line number Diff line change
    @@ -180,7 +180,7 @@ float CalculateAngularSimilarity(const std::vector<float>& embedding1, const std

    # Python Version

    ```
    ```python
    import math

    from scipy import spatial
  5. amir-saniyan revised this gist May 16, 2021. 1 changed file with 30 additions and 1 deletion.
    31 changes: 30 additions & 1 deletion Embedding Similarity Measurement.md
    Original file line number Diff line number Diff line change
    @@ -2,7 +2,7 @@ In the name of God

    # Embedding Similarity Measurement

    This gist contains implementation of Embedding Similarity Measurement in `C++`.
    This gist contains implementations of embedding similarity measures in `C++` and `Python`.

    ## Manhattan Distance

    @@ -177,3 +177,32 @@ float CalculateAngularSimilarity(const std::vector<float>& embedding1, const std
    return angularSimilarity;
    }
    ```

    # Python Version

    ```
    import math
    from scipy import spatial
    def calculate_cosine_distance(a, b):
        """Return ``spatial.distance.cosine(a, b)`` converted to a built-in ``float``."""
        cosine_distance = float(spatial.distance.cosine(a, b))
        return cosine_distance
    def calculate_cosine_similarity(a, b):
        """Return the cosine similarity of ``a`` and ``b``: ``1 - cosine distance``."""
        cosine_similarity = 1 - calculate_cosine_distance(a, b)
        return cosine_similarity
    def calculate_angular_distance(a, b):
        """Return the angular distance of ``a`` and ``b``: ``acos(cosine_similarity) / pi``."""
        cosine_similarity = calculate_cosine_similarity(a, b)
        # Rounding can leave the cosine marginally outside [-1, 1], where
        # math.acos raises "ValueError: math domain error". Explicit
        # comparisons are used so a NaN input is passed through unchanged.
        if cosine_similarity > 1.0:
            cosine_similarity = 1.0
        elif cosine_similarity < -1.0:
            cosine_similarity = -1.0
        angular_distance = math.acos(cosine_similarity) / math.pi
        return angular_distance
    def calculate_angular_similarity(a, b):
        """Return the angular similarity of ``a`` and ``b``: ``1 - angular distance``."""
        angular_similarity = 1 - calculate_angular_distance(a, b)
        return angular_similarity
    ```
  6. amir-saniyan revised this gist Jan 1, 2021. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion Embedding Similarity Measurement.md
    Original file line number Diff line number Diff line change
    @@ -57,7 +57,7 @@ float CalculateEuclideanDistance(const std::vector<float>& embedding1, const std
    for(std::size_t i = 0; i < embeddingSize; i++)
    {
    float distance = embedding2[i] - embedding1[i];
    sum += std::pow(distance, 2);
    sum += distance * distance;
    }
    float euclideanDistance = std::sqrt(sum);
  7. amir-saniyan created this gist Jan 1, 2021.
    179 changes: 179 additions & 0 deletions Embedding Similarity Measurement.md
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,179 @@
    In the name of God

    # Embedding Similarity Measurement

    This gist contains implementations of embedding similarity measures in `C++`.

    ## Manhattan Distance

    - https://en.wikipedia.org/wiki/Taxicab_geometry

    ```cpp
    #include <cstddef>
    #include <cmath>
    #include <stdexcept>

    /// Manhattan (taxicab) distance: the sum of the absolute component-wise
    /// differences of two equally sized embeddings.
    ///
    /// @param embedding1 First embedding.
    /// @param embedding2 Second embedding; must have the same size as embedding1.
    /// @return The accumulated absolute differences (0 for two empty embeddings).
    /// @throws std::invalid_argument if the sizes differ.
    float CalculateManhattanDistance(const std::vector<float>& embedding1, const std::vector<float>& embedding2)
    {
        if(embedding2.size() != embedding1.size())
        {
            throw std::invalid_argument("Embedding sizes should be equal.");
        }

        float total = 0;

        // Walk both embeddings in lock-step; the sizes were checked above.
        auto lhs = embedding1.begin();
        auto rhs = embedding2.begin();
        for(; lhs != embedding1.end(); ++lhs, ++rhs)
        {
            total += std::abs(*rhs - *lhs);
        }

        return total;
    }
    ```
    ## Euclidean Distance
    - https://en.wikipedia.org/wiki/Euclidean_distance
    ```cpp
    #include <cstddef>
    #include <cmath>
    #include <stdexcept>
    /// Euclidean distance between two equally sized embeddings.
    ///
    /// @param embedding1 First embedding.
    /// @param embedding2 Second embedding; must have the same size as embedding1.
    /// @return Square root of the sum of squared component-wise differences
    ///         (0 for two empty embeddings).
    /// @throws std::invalid_argument if the sizes differ.
    float CalculateEuclideanDistance(const std::vector<float>& embedding1, const std::vector<float>& embedding2)
    {
        if(embedding1.size() != embedding2.size())
        {
            throw std::invalid_argument("Embedding sizes should be equal.");
        }

        float sum = 0;

        std::size_t embeddingSize = embedding1.size();
        for(std::size_t i = 0; i < embeddingSize; i++)
        {
            float distance = embedding2[i] - embedding1[i];
            // Plain multiplication rather than std::pow(distance, 2): with an
            // integer exponent std::pow promotes to double and runs the
            // general power routine just to square a value.
            sum += distance * distance;
        }

        float euclideanDistance = std::sqrt(sum);

        return euclideanDistance;
    }
    ```

    ## Minkowski Distance

    - https://en.wikipedia.org/wiki/Minkowski_distance

    ```cpp
    #include <cstddef>
    #include <cmath>
    #include <stdexcept>

    /// Minkowski distance of order p between two equally sized embeddings.
    ///
    /// @param embedding1 First embedding.
    /// @param embedding2 Second embedding; must have the same size as embedding1.
    /// @param p Order of the distance; must be at least 1.
    /// @return (sum of |embedding2[i] - embedding1[i]|^p) raised to the power 1/p.
    /// @throws std::invalid_argument if the sizes differ or if p is less than 1.
    float CalculateMinkowskiDistance(const std::vector<float>& embedding1, const std::vector<float>& embedding2, int p)
    {
        if(embedding1.size() != embedding2.size())
        {
            throw std::invalid_argument("Embedding sizes should be equal.");
        }

        // p == 0 would divide by zero in the 1/p exponent below, and a negative
        // p turns zero differences into infinities; reject both up front.
        if(p < 1)
        {
            throw std::invalid_argument("Minkowski order p should be at least 1.");
        }

        float sum = 0;

        std::size_t embeddingSize = embedding1.size();
        for(std::size_t i = 0; i < embeddingSize; i++)
        {
            float distance = std::abs(embedding2[i] - embedding1[i]);
            sum += std::pow(distance, p);
        }

        float minkowskiDistance = std::pow(sum, 1.0f / p);

        return minkowskiDistance;
    }
    ```
    ## L1, L2, Lp Norms
    - https://en.wikipedia.org/wiki/Lp_space
    ```cpp
    /// L1 variant: forwards both embeddings unchanged to
    /// CalculateManhattanDistance and returns its result.
    float CalculateL1Norm(const std::vector<float>& embedding1, const std::vector<float>& embedding2)
    {
        const float l1Norm = CalculateManhattanDistance(embedding1, embedding2);
        return l1Norm;
    }
    /// L2 variant: forwards both embeddings unchanged to
    /// CalculateEuclideanDistance and returns its result.
    float CalculateL2Norm(const std::vector<float>& embedding1, const std::vector<float>& embedding2)
    {
        const float l2Norm = CalculateEuclideanDistance(embedding1, embedding2);
        return l2Norm;
    }
    /// Lp variant: forwards both embeddings and the order p unchanged to
    /// CalculateMinkowskiDistance and returns its result.
    float CalculateLPNorm(const std::vector<float>& embedding1, const std::vector<float>& embedding2, int p)
    {
        const float lpNorm = CalculateMinkowskiDistance(embedding1, embedding2, p);
        return lpNorm;
    }
    ```

    ## Cosine Similarity

    - https://en.wikipedia.org/wiki/Cosine_similarity

    ```cpp
    #include <cstddef>
    #include <cmath>
    #include <stdexcept>

    /// Cosine similarity of two equally sized embeddings:
    /// dot(a, b) / (|a| * |b|).
    ///
    /// @param embedding1 First embedding.
    /// @param embedding2 Second embedding; must have the same size as embedding1.
    /// @return The cosine of the angle between the embeddings. If either
    ///         embedding has zero magnitude the division is 0/0, which yields
    ///         NaN under IEEE 754 arithmetic.
    /// @throws std::invalid_argument if the sizes differ.
    float CalculateCosineSimilarity(const std::vector<float>& embedding1, const std::vector<float>& embedding2)
    {
        if(embedding1.size() != embedding2.size())
        {
            throw std::invalid_argument("Embedding sizes should be equal.");
        }

        float aa = 0;
        float bb = 0;
        float ab = 0;

        std::size_t embeddingSize = embedding1.size();
        for(std::size_t i = 0; i < embeddingSize; i++)
        {
            // Plain multiplication rather than std::pow(x, 2), which promotes
            // to double and runs the general power routine just to square.
            aa += embedding1[i] * embedding1[i];
            bb += embedding2[i] * embedding2[i];
            ab += embedding1[i] * embedding2[i];
        }

        // Take the square roots separately: the product aa * bb can overflow
        // to infinity (or underflow to zero) in float even when each magnitude
        // is representable, which would collapse the result to 0 or infinity.
        float cosineSimilarity = ab / (std::sqrt(aa) * std::sqrt(bb));

        return cosineSimilarity;
    }
    ```
    ## Angular Distance, Angular Similarity
    - https://en.wikipedia.org/wiki/Cosine_similarity
    ```cpp
    # define PI 3.14159265358979323846
    /// Angular distance of two embeddings: acos(cosine similarity) / PI.
    ///
    /// @param embedding1 First embedding.
    /// @param embedding2 Second embedding.
    /// @return The angle between the embeddings divided by PI. A NaN cosine
    ///         similarity is passed through and yields NaN.
    float CalculateAngularDistance(const std::vector<float>& embedding1, const std::vector<float>& embedding2)
    {
        float cosineSimilarity = CalculateCosineSimilarity(embedding1, embedding2);

        // Rounding can leave the cosine marginally outside [-1, 1], where
        // std::acos returns NaN. Clamp with plain comparisons so that a NaN
        // input is not silently replaced by a boundary value.
        if(cosineSimilarity > 1.0f)
        {
            cosineSimilarity = 1.0f;
        }
        else if(cosineSimilarity < -1.0f)
        {
            cosineSimilarity = -1.0f;
        }

        float angularDistance = std::acos(cosineSimilarity) / PI;
        return angularDistance;
    }
    /// Angular similarity of two embeddings: one minus the value returned by
    /// CalculateAngularDistance for the same arguments.
    float CalculateAngularSimilarity(const std::vector<float>& embedding1, const std::vector<float>& embedding2)
    {
        return 1 - CalculateAngularDistance(embedding1, embedding2);
    }
    ```