library(tidyverse) library(openai) #------------------------------------------------------------------------------- # Step 1: Open AI API key #------------------------------------------------------------------------------- # Your OpenAI API key should be an environment variable you set in ~.Renviron # ... you should never put your API key directly in your code! #------------------------------------------------------------------------------- # Step 2: Define functions for later use: #------------------------------------------------------------------------------- # Function 1: calculate the cosine similarity between two vectors cosine_similarity <- function(x, y) { sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2))) } # Function 2: compute the maximum cosine similarity and its index between a vector and a list of vectors max_cosine_similarity_index <- function(x, y_list) { # Use map_dbl() to compute the cosine similarity with each vector in y_list cos_sim <- map_dbl(y_list, cosine_similarity, x = x) # Return a list with the maximum value and its index of cosine similarity list(max_cos_sim = max(cos_sim), max_cos_sim_index = which.max(cos_sim)) } #------------------------------------------------------------------------------- # Step 3: Read in the data # # Assume you have two data frames called df1 and df2 # Each has a field with a school name # For example: df1 has columns "id" and "HS" # df2 has columns "schid" and "name" #------------------------------------------------------------------------------- df1 <- read_csv("test1.csv") print(df1) df2 <- read_csv("test2.csv") print(df2, n=100) #------------------------------------------------------------------------------- # Step 4: Get the embeddings for the key in each dataset # # This may take some time (took about 5 minutes for about 130 API calls) # Cost is $0.0001 / 1K tokens #------------------------------------------------------------------------------- # Get the embeddings for the school names in df1 using the model text-embedding-ada-002 df1$embedding <- map(df1$HS, function(x) create_embedding(input = x, model = "text-embedding-ada-002")$data$embedding[[1]]) # Get the embeddings for the school names in df2 using the same model df2$embedding <- map(df2$name, function(x) create_embedding(input = x, model = "text-embedding-ada-002")$data$embedding[[1]]) #------------------------------------------------------------------------------- # Step 5: For each primary key, find the maximally similar foreign key in the # other dataset #------------------------------------------------------------------------------- # Get the school name with the most similar embeddings df1$similarity <- map(df1$embedding, max_cosine_similarity_index, y_list = df2$embedding) %>% map_dbl("max_cos_sim") df1$argmax_similarity <- map(df1$embedding, max_cosine_similarity_index, y_list = df2$embedding) %>% map_int("max_cos_sim_index") df1$matched_name <- df2$name[df1$argmax_similarity] #------------------------------------------------------------------------------- # Step 6: Inspect match performance #------------------------------------------------------------------------------- df1 %>% select(HS,matched_name,similarity) %>% print