#Efficient fuzzy match of two data frames by one common column
library(dplyr)
library(fuzzyjoin)
library(stringdist)

#' Match two data frames on one shared column, exactly first and fuzzily after
#'
#' Rows are first paired by exact equality of the `by_what` column
#' (`inner_join`). Rows that found no exact partner on either side are then
#' paired with `stringdist_inner_join` using the "jw" method, and the string
#' distance of every fuzzy pair is stored in a new `stdist` column.
#'
#' @param data_frame_A,data_frame_B Data frames or tibbles that both contain
#'   the column named by `by_what`.
#' @param by_what Name of the single column to match on, given as a string.
#' @param choose_p Value passed as `p` to the "jw" string distance.
#' @param choose_max_dist Value passed as `max_dist` to
#'   `stringdist_inner_join`; pairs further apart are not produced.
#' @param best_only If `TRUE`, after sorting by ascending distance only the
#'   first fuzzy pair per A-value is kept, and then only the first remaining
#'   pair per B-value.
#' @param make_lower_case If `TRUE`, the matching column of both inputs is
#'   lower-cased before any matching takes place.
#' @return A named list with four elements: `exact` (exactly matched rows),
#'   `fuzzy` (fuzzily matched rows with the `stdist` column, closest first),
#'   `remainderA` and `remainderB` (rows of A and B that appear in neither
#'   the exact nor the retained fuzzy matches).
eff_fuzzy_match <- function(data_frame_A,
                            data_frame_B,
                            by_what,
                            choose_p = 0.1,
                            choose_max_dist = 0.4,
                            best_only = TRUE,
                            make_lower_case = TRUE) {
  # Fail early with a clear message instead of deep inside a join.
  stopifnot(
    is.character(by_what),
    length(by_what) == 1L,
    by_what %in% names(data_frame_A),
    by_what %in% names(data_frame_B)
  )

  # `[[` always yields the column vector. `df[, col]` returns a one-column
  # tibble for tibble inputs, which tolower()/stringdist() would coerce to a
  # single deparsed string and silently corrupt the result.
  if (make_lower_case) {
    data_frame_A[[by_what]] <- tolower(data_frame_A[[by_what]])
    data_frame_B[[by_what]] <- tolower(data_frame_B[[by_what]])
  }

  # Drop duplicated rows so they do not multiply the join results.
  data_frame_A <- unique(data_frame_A)
  data_frame_B <- unique(data_frame_B)

  # Stage 1: exact matches on the key column.
  exact_matches <- inner_join(x = data_frame_A,
                              y = data_frame_B,
                              by = by_what)

  # Rows without any exact partner, separately for each side.
  unmatched_a <- anti_join(x = data_frame_A,
                           y = data_frame_B,
                           by = by_what)
  unmatched_b <- anti_join(x = data_frame_B,
                           y = data_frame_A,
                           by = by_what)

  # Stage 2: fuzzy matching restricted to the leftovers.
  fuzzy_matches <- stringdist_inner_join(x = unmatched_a,
                                         y = unmatched_b,
                                         by = by_what,
                                         method = "jw",
                                         p = choose_p,
                                         max_dist = choose_max_dist)

  # The join suffixes the shared key column with ".x" (A side) and ".y"
  # (B side); build those names once.
  key_x <- paste0(by_what, ".x")
  key_y <- paste0(by_what, ".y")

  # Record the distance of every fuzzy pair, using the same method and p as
  # the join itself.
  fuzzy_matches$stdist <- stringdist(a = fuzzy_matches[[key_x]],
                                     b = fuzzy_matches[[key_y]],
                                     method = "jw",
                                     p = choose_p)

  # Closest pairs first.
  fuzzy_matches <- fuzzy_matches[order(fuzzy_matches$stdist,
                                       decreasing = FALSE), ]

  # Because rows are sorted by distance, !duplicated() keeps the closest
  # pair per A-value, then the closest surviving pair per B-value.
  if (best_only) {
    fuzzy_matches <- fuzzy_matches[!duplicated(fuzzy_matches[[key_x]]), ]
    fuzzy_matches <- fuzzy_matches[!duplicated(fuzzy_matches[[key_y]]), ]
  }

  # Whatever is still unpaired after the fuzzy stage.
  not_matched_a <- anti_join(x = unmatched_a,
                             y = fuzzy_matches,
                             by = setNames(key_x, by_what))
  not_matched_b <- anti_join(x = unmatched_b,
                             y = fuzzy_matches,
                             by = setNames(key_y, by_what))

  list(exact = exact_matches,
       fuzzy = fuzzy_matches,
       remainderA = not_matched_a,
       remainderB = not_matched_b)
}


# #testing:
# 
# dfa = data.frame(FirstNames=c("George",
#                             "Ketut",
#                             "Harriet",
#                             "Zhu",
#                             "Sarika",
#                             "Apple"))
# dfb = data.frame(FirstNames=c("GeorGe",
#                               "Ketut",
#                               "Harry",
#                               "Z.",
#                               "Rika",
#                               "Marion"))
# eff_fuzzy_match(dfa,dfb,by_what = "FirstNames")