prokopyev · May 21, 2018 18:59 · Feb 22, 2018 · Feb 21, 2018 · Feb 21, 2018
diff --git a/eff_fuzzy_match.R b/eff_fuzzy_match.R
@@ -18,7 +18,7 @@ eff_fuzzy_match<-function(data_frame_A,
   }
 
   #take only unique entries
-  data_frame_A = unique(data_frame_A),
+  data_frame_A = unique(data_frame_A)
   data_frame_B = unique(data_frame_B)
 
   #Merge data frames with dplyr

diff --git a/eff_fuzzy_match.R b/eff_fuzzy_match.R
@@ -17,6 +17,10 @@ eff_fuzzy_match<-function(data_frame_A,
     data_frame_B[,by_what]<-tolower(data_frame_B[,by_what])
   }
 
+  #take only unique entries
+  data_frame_A = unique(data_frame_A),
+  data_frame_B = unique(data_frame_B)
+
   #Merge data frames with dplyr
   ExactMatches<-inner_join(x = data_frame_A,
                            y = data_frame_B,

diff --git a/eff_fuzzy_match.R b/eff_fuzzy_match.R
@@ -0,0 +1,90 @@
+#Efficient fuzzy match of two data frames by one common column
+library(dplyr)
+library(fuzzyjoin)
+library(stringdist)
+
+eff_fuzzy_match<-function(data_frame_A,
+                          data_frame_B,
+                          by_what,
+                          choose_p = 0.1,
+                          choose_max_dist = 0.4,
+                          best_only = TRUE,
+                          make_lower_case = TRUE){
+
+  #set matching variables to lowercase if make_lower_case is true
+  if (make_lower_case == TRUE){
+    data_frame_A[,by_what]<-tolower(data_frame_A[,by_what])
+    data_frame_B[,by_what]<-tolower(data_frame_B[,by_what])
+  }
+
+  #Merge data frames with dplyr
+  ExactMatches<-inner_join(x = data_frame_A,
+                           y = data_frame_B,
+                           by = by_what)
+
+  #Now look at those which didn't match at all 
+  NoMatchesA<-anti_join(x = data_frame_A,
+                        y = data_frame_B,
+                        by = by_what)
+  NoMatchesB<-anti_join(x = data_frame_B,
+                        y = data_frame_A,
+                        by = by_what)
+
+  #FuzzyMatch the remaining ones by string distance
+  FuzzyMatch<-stringdist_inner_join(x = NoMatchesA,
+                                    y = NoMatchesB, 
+                                    by = by_what,
+                                    method= 'jw',
+                                    p = choose_p,
+                                    max_dist = choose_max_dist)
+
+  #Add a column specifying the string distance
+  FuzzyMatch$stdist<-stringdist(a = FuzzyMatch[,paste0(by_what,".x")],
+                                b = FuzzyMatch[,paste0(by_what,".y")],
+                                method = 'jw',
+                                p = choose_p)
+
+  #order by smallest string distance first
+  FuzzyMatch <- FuzzyMatch[order(FuzzyMatch$stdist,decreasing=F),]
+
+  #take the best matching one only if best_only is TRUE
+  if(best_only==TRUE){
+    FuzzyMatch<-FuzzyMatch[!duplicated(FuzzyMatch[,c(paste0(by_what,".x"))]),]
+    FuzzyMatch<-FuzzyMatch[!duplicated(FuzzyMatch[,c(paste0(by_what,".y"))]),]
+  }
+
+  NotMatchedA<-anti_join(x = NoMatchesA,
+                         y = FuzzyMatch,
+                         by = setNames(paste0(by_what,".x"), 
+                                       by_what))
+  NotMatchedB<-anti_join(x = NoMatchesB,
+                         y = FuzzyMatch,
+                         by = setNames(paste0(by_what,".y"), 
+                                       by_what))
+
+  completeReturnList = list(exact = ExactMatches,
+                            fuzzy = FuzzyMatch,
+                            remainderA = NotMatchedA,
+                            remainderB = NotMatchedB)
+
+  return(completeReturnList)
+}
+
+
+
+
+# #testing:
+# 
+# dfa = data.frame(FirstNames=c("George",
+#                             "Ketut",
+#                             "Harriet",
+#                             "Zhu",
+#                             "Sarika",
+#                             "Apple"))
+# dfb = data.frame(FirstNames=c("GeorGe",
+#                               "Ketut",
+#                               "Harry",
+#                               "Z.",
+#                               "Rika",
+#                               "Marion"))
+# eff_fuzzy_match(dfa,dfb,by_what = "FirstNames")
No results found