Skip to content

Instantly share code, notes, and snippets.

@a8dx
Created July 9, 2018 17:59
Show Gist options
  • Save a8dx/20caedaf942f1810e16994bfdb57e8dc to your computer and use it in GitHub Desktop.
Save a8dx/20caedaf942f1810e16994bfdb57e8dc to your computer and use it in GitHub Desktop.

Revisions

  1. a8dx created this gist Jul 9, 2018.
    109 changes: 109 additions & 0 deletions DistrictNameMatching.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,109 @@
    # -- DistrictNameMatching.py
    # Author: Anthony Louis D'Agostino (ald2187 [at] columbia.edu)
    # Purpose: Given CSV lists of district-state name pairs, identifies the best matches using the fuzzywuzzy library
    # Notes: The default number of matches is 3; it can be changed via the num_match argument.

    import os
    import numpy as np
    from fuzzywuzzy import fuzz
    from fuzzywuzzy import process
    import pandas as pd



    def districtMatch(master, using, master_dist, master_state, using_dist, using_state, outFile, num_match = 3):
        """
        Fuzzy-match every "district, state" name in the using file against the
        names in the master file and write the best candidates to a CSV.

        For each using name the top `num_match` master names (as ranked by
        fuzzywuzzy's process.extract) are stored in columns Match1..MatchN,
        next to the original using name and a flag telling whether Match1 is
        an exact string match. Rows where the flag is False need manual review.

        master: path of the CSV containing the master list of districts
        using: path of the CSV whose districts are each compared against the
            universe of master districts
        master_dist: column name holding districts in the master file
        master_state: column name holding states in the master file
        using_dist: column name holding districts in the using file
        using_state: column name holding states in the using file
        outFile: output path and filename WITHOUT extension; ".csv" is appended
        num_match: number of candidate matches kept per using name, default is 3

        Returns the pandas DataFrame that was written to disk.
        Raises ValueError if num_match is smaller than 1.
        """
        if num_match < 1:
            raise ValueError("num_match must be at least 1")

        # sep=None asks pandas to sniff the delimiter, which only the python
        # engine supports; naming the engine avoids the fallback warning.
        master_dists = pd.read_csv(master, quotechar='"', skipinitialspace=True,
                                   sep=None, engine='python')
        print(" *** Now printing column values for master file *** ")
        print(list(master_dists.columns.values))

        using_dists = pd.read_csv(using, quotechar='"', skipinitialspace=True,
                                  sep=None, engine='python')
        print(" *** Now printing column values for using file *** ")
        print(list(using_dists.columns.values))

        # -- concatenate district and state names; both parts are cast to str so
        #    that numeric codes in either column do not break the concatenation
        master_names = master_dists[master_dist].map(str) + ", " + master_dists[master_state].map(str)
        using_names = using_dists[using_dist].map(str) + ", " + using_dists[using_state].map(str)

        # One list of hits per using name; the first element of each hit is the
        # matched master name.
        all_hits = [process.extract(name, master_names, limit=num_match) for name in using_names]

        # -- build the Match1..MatchN columns; a master list with fewer than
        #    num_match rows yields short hit lists, which are padded with None
        d = {}
        for rank in range(num_match):
            d["Match{0}".format(rank + 1)] = [
                hits[rank][0] if rank < len(hits) else None for hits in all_hits
            ]

        d['using_original'] = using_names

        # Element-wise comparison of the best match with the original name.
        d['perfect_match'] = using_names == d['Match1']

        out = pd.DataFrame(d)
        out.to_csv(outFile + ".csv")
        print("******************************************")
        print("*** Your analysis has been completed! *** ")
        print("******************************************")

        return out


    """
    BASIC FILES/PATHS WHOSE USE IS REPEATED
    """


    baseDir = os.path.join("<insert path>")


    outDir = os.path.join(baseDir, "Matched_Results")

    if not os.path.exists(outDir):
    os.makedirs(outDir)





    """
    ICRISAT and 1971 Polygon borders
    """

    master_file = os.path.join(baseDir, "ICRISAT_Meso_Names.csv")
    input_1971 = os.path.join(baseDir,"District_Records_1971_Clean.csv")

    outFile = os.path.join(outDir, "ICRI_1971_DistrictMatches")
    icri_1971_match = districtMatch(input_1971, master_file, 'NAME', 'STATE_UT', 'distname', 'stname', outFile)

    # -- alternatively, don't save as a workspace object
    districtMatch(input_1971, master_file, 'NAME', 'STATE_UT', 'distname', 'stname', outFile)