library(tidyverse)
library(openai)


#-------------------------------------------------------------------------------
# Step 1: Open AI API key
#-------------------------------------------------------------------------------
# Your OpenAI API key should be an environment variable you set in ~.Renviron
# ... you should never put your API key directly in your code!


#-------------------------------------------------------------------------------
# Step 2: Define functions for later use:
#-------------------------------------------------------------------------------
# Function 1: calculate the cosine similarity between two vectors
cosine_similarity <- function(x, y) {
  sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2)))
}

# Function 2: compute the maximum cosine similarity and its index between a vector and a list of vectors
max_cosine_similarity_index <- function(x, y_list) {
    # Use map_dbl() to compute the cosine similarity with each vector in y_list
    cos_sim <- map_dbl(y_list, cosine_similarity, x = x)
    # Return a list with the maximum value and its index of cosine similarity
    list(max_cos_sim = max(cos_sim), max_cos_sim_index = which.max(cos_sim))
}


#-------------------------------------------------------------------------------
# Step 3: Read in the data
#
# Assume you have two data frames called df1 and df2
# Each has a field with a school name
# For example: df1 has columns "id" and "HS"
#              df2 has columns "schid" and "name"
#-------------------------------------------------------------------------------
df1 <- read_csv("test1.csv")
print(df1)

df2 <- read_csv("test2.csv")
print(df2, n=100)


#-------------------------------------------------------------------------------
# Step 4: Get the embeddings for the key in each dataset
#
# This may take some time (took about 5 minutes for about 130 API calls)
# Cost is $0.0001 / 1K tokens
#-------------------------------------------------------------------------------
# Get the embeddings for the school names in df1 using the model text-embedding-ada-002
df1$embedding <- map(df1$HS, function(x) create_embedding(input = x, model = "text-embedding-ada-002")$data$embedding[[1]])

# Get the embeddings for the school names in df2 using the same model
df2$embedding <- map(df2$name, function(x) create_embedding(input = x, model = "text-embedding-ada-002")$data$embedding[[1]])


#-------------------------------------------------------------------------------
# Step 5: For each primary key, find the maximally similar foreign key in the 
#         other dataset
#-------------------------------------------------------------------------------
# Get the school name with the most similar embeddings
df1$similarity        <- map(df1$embedding, max_cosine_similarity_index, y_list = df2$embedding) %>% map_dbl("max_cos_sim")
df1$argmax_similarity <- map(df1$embedding, max_cosine_similarity_index, y_list = df2$embedding) %>% map_int("max_cos_sim_index")
df1$matched_name      <- df2$name[df1$argmax_similarity]


#-------------------------------------------------------------------------------
# Step 6: Inspect match performance
#-------------------------------------------------------------------------------
df1 %>% select(HS,matched_name,similarity) %>% print