#Efficient fuzzy match of two data frames by one common column library(dplyr) library(fuzzyjoin) library(stringdist) eff_fuzzy_match<-function(data_frame_A, data_frame_B, by_what, choose_p = 0.1, choose_max_dist = 0.4, best_only = TRUE, make_lower_case = TRUE){ #set matching variables to lowercase if make_lower_case is true if (make_lower_case == TRUE){ data_frame_A[,by_what]<-tolower(data_frame_A[,by_what]) data_frame_B[,by_what]<-tolower(data_frame_B[,by_what]) } #take only unique entries data_frame_A = unique(data_frame_A) data_frame_B = unique(data_frame_B) #Merge data frames with dplyr ExactMatches<-inner_join(x = data_frame_A, y = data_frame_B, by = by_what) #Now look at those which didn't match at all NoMatchesA<-anti_join(x = data_frame_A, y = data_frame_B, by = by_what) NoMatchesB<-anti_join(x = data_frame_B, y = data_frame_A, by = by_what) #FuzzyMatch the remaining ones by string distance FuzzyMatch<-stringdist_inner_join(x = NoMatchesA, y = NoMatchesB, by = by_what, method= 'jw', p = choose_p, max_dist = choose_max_dist) #Add a column specifying the string distance FuzzyMatch$stdist<-stringdist(a = FuzzyMatch[,paste0(by_what,".x")], b = FuzzyMatch[,paste0(by_what,".y")], method = 'jw', p = choose_p) #order by smallest string distance first FuzzyMatch <- FuzzyMatch[order(FuzzyMatch$stdist,decreasing=F),] #take the best matching one only if best_only is TRUE if(best_only==TRUE){ FuzzyMatch<-FuzzyMatch[!duplicated(FuzzyMatch[,c(paste0(by_what,".x"))]),] FuzzyMatch<-FuzzyMatch[!duplicated(FuzzyMatch[,c(paste0(by_what,".y"))]),] } NotMatchedA<-anti_join(x = NoMatchesA, y = FuzzyMatch, by = setNames(paste0(by_what,".x"), by_what)) NotMatchedB<-anti_join(x = NoMatchesB, y = FuzzyMatch, by = setNames(paste0(by_what,".y"), by_what)) completeReturnList = list(exact = ExactMatches, fuzzy = FuzzyMatch, remainderA = NotMatchedA, remainderB = NotMatchedB) return(completeReturnList) } # #testing: # # dfa = data.frame(FirstNames=c("George", # "Ketut", # "Harriet", # "Zhu", # "Sarika", # "Apple")) # dfb = data.frame(FirstNames=c("GeorGe", # "Ketut", # "Harry", # "Z.", # "Rika", # "Marion")) # eff_fuzzy_match(dfa,dfb,by_what = "FirstNames")