Created
July 9, 2018 17:59
-
-
Save a8dx/20caedaf942f1810e16994bfdb57e8dc to your computer and use it in GitHub Desktop.
Revisions
-
a8dx created this gist
Jul 9, 2018. There are no files selected for viewing.
# -- DistrictNameMatching.py
# Author: Anthony Louis D'Agostino (ald2187 [at] columbia.edu)
# Purpose: Given CSV lists of district-state name pairs, identifies the best match given the fuzzywuzzy library
# Notes: Default number of matches currently set to 3, though can be modified as input argument.

import os
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas as pd


def districtMatch(master, using, master_dist, master_state, using_dist, using_state, outFile, num_match = 3):
    """
    Fuzzy-match every "district, state" name in the using file against the
    names in the master file, and write the best candidates to a CSV.

    For each using name the `num_match` highest-scoring master names (as
    ranked by fuzzywuzzy's `process.extract`) are kept, together with a flag
    for whether the top candidate is an exact string match. Manual work is
    then required for districts where a perfect match has not been made.

    master: file containing the master list of districts
    using: file containing using list of districts, e.g., each of these
        districts is compared against the universe of master districts
    master_dist: variable name pertaining to districts in master file
    master_state: variable name pertaining to states in master file
    using_dist: variable name pertaining to districts in using file
    using_state: variable name pertaining to states in using file
    outFile: path and filename of the output WITHOUT extension; ".csv" is
        appended and the result is written as a CSV (not a Stata DTA)
    num_match: number of matches generated, default is 3; must be >= 1

    Returns the DataFrame that was written, with columns Match1..Match<n>,
    using_original and perfect_match.

    Raises ValueError if num_match is smaller than 1.
    """
    # Fail before the (slow) matching step rather than with a KeyError on
    # 'Match1' after all the work has been done.
    if num_match < 1:
        raise ValueError("num_match must be at least 1, got {0!r}".format(num_match))

    # sep=None asks pandas to sniff the delimiter, which only the python
    # engine supports; naming the engine avoids the fallback ParserWarning.
    master_dists = pd.read_csv(master, quotechar='"', skipinitialspace=True,
                               sep=None, engine="python")
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(" *** Now printing column values for master file *** ")
    print(list(master_dists.columns.values))

    using_dists = pd.read_csv(using, quotechar='"', skipinitialspace=True,
                              sep=None, engine="python")
    print(" *** Now printing column values for using file *** ")
    print(list(using_dists.columns.values))

    # -- concatenate district and state names
    master_names = master_dists[master_dist].map(str) + ", " + master_dists[master_state]
    using_names = using_dists[using_dist].map(str) + ", " + using_dists[using_state]

    # One list of result tuples per using name; element [0] of each tuple is
    # the matched master name.
    candidates = [process.extract(name, master_names, limit=num_match)
                  for name in using_names]

    # -- one output column per match rank; pad with None when the master list
    #    yields fewer than num_match candidates
    d = {}
    for rank in range(num_match):
        d["Match{0}".format(rank + 1)] = [
            found[rank][0] if rank < len(found) else None
            for found in candidates
        ]

    d['using_original'] = using_names
    # Element-wise comparison of the top candidate with the original name.
    best = pd.Series(d['Match1'], index=using_names.index, dtype=object)
    d['perfect_match'] = best == using_names

    out = pd.DataFrame(d)
    out.to_csv(outFile + ".csv")

    print("******************************************")
    print("*** Your analysis has been completed! *** ")
    print("******************************************")

    return out


# -- BASIC FILES/PATHS WHOSE USE IS REPEATED
baseDir = "<insert path>"  # placeholder: replace with the project directory
outDir = os.path.join(baseDir, "Matched_Results")
if not os.path.exists(outDir):
    os.makedirs(outDir)

# -- ICRISAT and 1971 Polygon borders
master_file = os.path.join(baseDir, "ICRISAT_Meso_Names.csv")
input_1971 = os.path.join(baseDir, "District_Records_1971_Clean.csv")
outFile = os.path.join(outDir, "ICRI_1971_DistrictMatches")

# NOTE(review): the 1971 file is passed as `master` (columns NAME/STATE_UT)
# and the ICRISAT file as `using` (columns distname/stname), despite the
# variable being called `master_file` -- confirm this ordering is intended.
icri_1971_match = districtMatch(input_1971, master_file, 'NAME', 'STATE_UT', 'distname', 'stname', outFile)

# -- alternatively, don't save as a workspace object
#    (this re-runs the whole match and rewrites the same CSV)
districtMatch(input_1971, master_file, 'NAME', 'STATE_UT', 'distname', 'stname', outFile)