Skip to content

Instantly share code, notes, and snippets.

@prokopyev
Forked from gdmcdonald/eff_fuzzy_match.R
Created May 21, 2018 18:59
Show Gist options
  • Select an option

  • Save prokopyev/139b6e9fa99b8c4065fc0e029f041bc8 to your computer and use it in GitHub Desktop.

Select an option

Save prokopyev/139b6e9fa99b8c4065fc0e029f041bc8 to your computer and use it in GitHub Desktop.

Revisions

  1. @gdmcdonald gdmcdonald revised this gist Feb 22, 2018. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion eff_fuzzy_match.R
    Original file line number Diff line number Diff line change
    @@ -18,7 +18,7 @@ eff_fuzzy_match<-function(data_frame_A,
    }

    #take only unique entries
    data_frame_A = unique(data_frame_A),
    data_frame_A = unique(data_frame_A)
    data_frame_B = unique(data_frame_B)

    #Merge data frames with dplyr
  2. @gdmcdonald gdmcdonald revised this gist Feb 21, 2018. 1 changed file with 4 additions and 0 deletions.
    4 changes: 4 additions & 0 deletions eff_fuzzy_match.R
    Original file line number Diff line number Diff line change
    @@ -17,6 +17,10 @@ eff_fuzzy_match<-function(data_frame_A,
    data_frame_B[,by_what]<-tolower(data_frame_B[,by_what])
    }

    #take only unique entries
    data_frame_A = unique(data_frame_A),
    data_frame_B = unique(data_frame_B)

    #Merge data frames with dplyr
    ExactMatches<-inner_join(x = data_frame_A,
    y = data_frame_B,
  3. @gdmcdonald gdmcdonald created this gist Feb 21, 2018.
    90 changes: 90 additions & 0 deletions eff_fuzzy_match.R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,90 @@
    #Efficient fuzzy match of two data frames by one common column
    library(dplyr)
    library(fuzzyjoin)
    library(stringdist)

    eff_fuzzy_match<-function(data_frame_A,
    data_frame_B,
    by_what,
    choose_p = 0.1,
    choose_max_dist = 0.4,
    best_only = TRUE,
    make_lower_case = TRUE){

    #set matching variables to lowercase if make_lower_case is true
    if (make_lower_case == TRUE){
    data_frame_A[,by_what]<-tolower(data_frame_A[,by_what])
    data_frame_B[,by_what]<-tolower(data_frame_B[,by_what])
    }

    #Merge data frames with dplyr
    ExactMatches<-inner_join(x = data_frame_A,
    y = data_frame_B,
    by = by_what)

    #Now look at those which didn't match at all
    NoMatchesA<-anti_join(x = data_frame_A,
    y = data_frame_B,
    by = by_what)
    NoMatchesB<-anti_join(x = data_frame_B,
    y = data_frame_A,
    by = by_what)

    #FuzzyMatch the remaining ones by string distance
    FuzzyMatch<-stringdist_inner_join(x = NoMatchesA,
    y = NoMatchesB,
    by = by_what,
    method= 'jw',
    p = choose_p,
    max_dist = choose_max_dist)

    #Add a column specifying the string distance
    FuzzyMatch$stdist<-stringdist(a = FuzzyMatch[,paste0(by_what,".x")],
    b = FuzzyMatch[,paste0(by_what,".y")],
    method = 'jw',
    p = choose_p)

    #order by smallest string distance first
    FuzzyMatch <- FuzzyMatch[order(FuzzyMatch$stdist,decreasing=F),]

    #take the best matching one only if best_only is TRUE
    if(best_only==TRUE){
    FuzzyMatch<-FuzzyMatch[!duplicated(FuzzyMatch[,c(paste0(by_what,".x"))]),]
    FuzzyMatch<-FuzzyMatch[!duplicated(FuzzyMatch[,c(paste0(by_what,".y"))]),]
    }

    NotMatchedA<-anti_join(x = NoMatchesA,
    y = FuzzyMatch,
    by = setNames(paste0(by_what,".x"),
    by_what))
    NotMatchedB<-anti_join(x = NoMatchesB,
    y = FuzzyMatch,
    by = setNames(paste0(by_what,".y"),
    by_what))

    completeReturnList = list(exact = ExactMatches,
    fuzzy = FuzzyMatch,
    remainderA = NotMatchedA,
    remainderB = NotMatchedB)

    return(completeReturnList)
    }




    # #testing:
    #
    # dfa = data.frame(FirstNames=c("George",
    # "Ketut",
    # "Harriet",
    # "Zhu",
    # "Sarika",
    # "Apple"))
    # dfb = data.frame(FirstNames=c("GeorGe",
    # "Ketut",
    # "Harry",
    # "Z.",
    # "Rika",
    # "Marion"))
    # eff_fuzzy_match(dfa,dfb,by_what = "FirstNames")