|
|
@@ -0,0 +1,90 @@ |
|
|
# Efficient fuzzy match of two data frames by one common column
|
|
library(dplyr) |
|
|
library(fuzzyjoin) |
|
|
library(stringdist) |
|
|
|
|
|
#' Match two data frames on one shared column: exactly first, then fuzzily
#'
#' Rows are first paired by exact equality of the `by_what` column
#' (`dplyr::inner_join`). Rows of either frame whose key has no exact partner
#' are then paired by Jaro-Winkler string distance
#' (`fuzzyjoin::stringdist_inner_join` with `method = "jw"`).
#'
#' @param data_frame_A,data_frame_B Data frames (base data frames or tibbles)
#'   that both contain the column named by `by_what`.
#' @param by_what Name of the single common column to match on, given as a
#'   length-one character vector.
#' @param choose_p Passed as `p` to the Jaro-Winkler distance, both in the
#'   fuzzy join and when computing the reported distance.
#' @param choose_max_dist Passed as `max_dist` to the fuzzy join; pairs with a
#'   larger distance are not reported.
#' @param best_only If `TRUE`, the fuzzy pairs (sorted by ascending distance)
#'   are de-duplicated first on the A key and then on the B key, so every key
#'   value occurs in at most one fuzzy pair.
#' @param make_lower_case If `TRUE`, the key column of both inputs is
#'   lower-cased before any matching; the returned frames contain the
#'   lower-cased keys.
#'
#' @return A named list with four data frames:
#'   \describe{
#'     \item{exact}{Inner join of the two inputs on `by_what`.}
#'     \item{fuzzy}{Fuzzy pairs of the rows without an exact match, with the
#'       key columns suffixed `.x` / `.y` and a `stdist` column holding the
#'       Jaro-Winkler distance, sorted by ascending `stdist`.}
#'     \item{remainderA}{Rows of `data_frame_A` whose key appears in neither
#'       `exact` nor `fuzzy`.}
#'     \item{remainderB}{Rows of `data_frame_B` whose key appears in neither
#'       `exact` nor `fuzzy`.}
#'   }
eff_fuzzy_match <- function(data_frame_A,
                            data_frame_B,
                            by_what,
                            choose_p = 0.1,
                            choose_max_dist = 0.4,
                            best_only = TRUE,
                            make_lower_case = TRUE) {
  # Fail early with a readable message instead of deep inside a join.
  if (!is.character(by_what) || length(by_what) != 1L || is.na(by_what)) {
    stop("`by_what` must be a single column name given as a string.",
         call. = FALSE)
  }
  if (!by_what %in% names(data_frame_A) ||
      !by_what %in% names(data_frame_B)) {
    stop("`by_what` column '", by_what,
         "' must be present in both data frames.",
         call. = FALSE)
  }

  # `[[` extracts the column as a vector for base data frames and tibbles
  # alike; `df[, col]` would return a one-column tibble for tibble input.
  if (make_lower_case) {
    data_frame_A[[by_what]] <- tolower(data_frame_A[[by_what]])
    data_frame_B[[by_what]] <- tolower(data_frame_B[[by_what]])
  }

  # Exact matches on the key column.
  exact_matches <- dplyr::inner_join(
    x = data_frame_A,
    y = data_frame_B,
    by = by_what
  )

  # Rows on either side whose key has no exact partner.
  unmatched_a <- dplyr::anti_join(
    x = data_frame_A,
    y = data_frame_B,
    by = by_what
  )
  unmatched_b <- dplyr::anti_join(
    x = data_frame_B,
    y = data_frame_A,
    by = by_what
  )

  # Fuzzy-match the leftovers by Jaro-Winkler string distance.
  fuzzy_matches <- fuzzyjoin::stringdist_inner_join(
    x = unmatched_a,
    y = unmatched_b,
    by = by_what,
    method = "jw",
    p = choose_p,
    max_dist = choose_max_dist
  )

  # The shared key column comes back suffixed with .x (A side) and .y (B side).
  key_a <- paste0(by_what, ".x")
  key_b <- paste0(by_what, ".y")

  # Record the distance of every fuzzy pair and sort best matches first.
  fuzzy_matches$stdist <- stringdist::stringdist(
    a = fuzzy_matches[[key_a]],
    b = fuzzy_matches[[key_b]],
    method = "jw",
    p = choose_p
  )
  fuzzy_matches <- fuzzy_matches[
    order(fuzzy_matches$stdist, decreasing = FALSE), ,
    drop = FALSE
  ]

  # Keep only the closest pair per key value: first per A key, then per B key.
  # NOTE(review): an A key whose best B partner is claimed by a closer A key
  # ends up without any fuzzy pair, even if it had another candidate within
  # `choose_max_dist` - confirm this is the intended meaning of `best_only`.
  if (best_only) {
    fuzzy_matches <- fuzzy_matches[
      !duplicated(fuzzy_matches[[key_a]]), ,
      drop = FALSE
    ]
    fuzzy_matches <- fuzzy_matches[
      !duplicated(fuzzy_matches[[key_b]]), ,
      drop = FALSE
    ]
  }

  # Whatever is still unpaired after the fuzzy step. The named `by` vector
  # maps the original key column (x side) onto the suffixed one (y side).
  remainder_a <- dplyr::anti_join(
    x = unmatched_a,
    y = fuzzy_matches,
    by = stats::setNames(key_a, by_what)
  )
  remainder_b <- dplyr::anti_join(
    x = unmatched_b,
    y = fuzzy_matches,
    by = stats::setNames(key_b, by_what)
  )

  list(
    exact = exact_matches,
    fuzzy = fuzzy_matches,
    remainderA = remainder_a,
    remainderB = remainder_b
  )
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Example usage (commented out):
#
# dfa <- data.frame(FirstNames = c("George",
#                                  "Ketut",
#                                  "Harriet",
#                                  "Zhu",
#                                  "Sarika",
#                                  "Apple"))
# dfb <- data.frame(FirstNames = c("GeorGe",
#                                  "Ketut",
#                                  "Harry",
#                                  "Z.",
#                                  "Rika",
#                                  "Marion"))
# eff_fuzzy_match(dfa, dfb, by_what = "FirstNames")