a8dx · July 9, 2018 17:59 · Jul 9, 2018
diff --git a/DistrictNameMatching.py b/DistrictNameMatching.py
@@ -0,0 +1,109 @@
+# -- DistrictNameMatching.py 
+# Author: Anthony Louis D'Agostino (ald2187 [at] columbia.edu)
+# Purpose: Given CSV lists of district-state name pairs, identifies the best matches using the fuzzywuzzy library
+# Notes: The number of matches defaults to 3, but can be changed through the num_match argument.
+
+import os
+import numpy as np
+from fuzzywuzzy import fuzz
+from fuzzywuzzy import process
+import pandas as pd
+
+
+
+def districtMatch(master, using, master_dist, master_state, using_dist, using_state, outFile, num_match = 3):
+	"""
+	This function takes two sets of district-state names, and produces a DTA with a set number (default=3) 
+		of matches with a flag for whether the district name has been completely matched.  
+
+	Manual work is then required for districts where a perfect match has not been made.  	
+
+	master: file containing the master list of districts 
+	using: file containing using list of districts, eg., each of these districts is compared against these
+			universe of master districts from the master file 
+	master_dist: variable name pertaining to districts in master file
+	master_state: variable name pertaining to states in master file
+	using_dist: variable name pertaining to districts in using file 
+	using_state: variable name pertaining to states in using file 
+	num_match: number of matches generated, default is 3 
+	outFile: includes path and filename for an outputted DTA file - should be "*.dta"
+	"""
+
+	master_dists = pd.read_csv(master, quotechar = '"', skipinitialspace = True, sep = None) 
+	print " *** Now printing column values for master file *** "
+	print list(master_dists.columns.values)
+
+	using_dists = pd.read_csv(using, quotechar = '"', skipinitialspace = True, sep = None)
+	print " *** Now printing column values for using file *** "
+	print list(using_dists.columns.values)	
+
+	# -- concatenate district and state names 
+	master_names = master_dists[master_dist].map(str) + ", " + master_dists[master_state]
+	using_names = using_dists[using_dist].map(str) + ", " + using_dists[using_state]
+
+	fhp_new = [process.extract(x, master_names, limit=num_match) for x in using_names]
+
+	# -- generate column names 
+	lab = "" 
+	i = 1 
+	while i <= num_match:
+		lab = lab + " " + "Match" + str(i)
+		i += 1 
+
+
+
+	fhp_matches = pd.DataFrame(fhp_new, columns = lab.split())
+
+	d={}
+	for x in range(1,num_match+1):
+	    d["Match{0}".format(x)]=[y[0] for y in fhp_matches['Match'+str(x)]] 	
+
+
+	d['using_original'] = using_names
+
+
+	#match1 = [x[0] for x in fhp_matches['Match1']] 	
+	d['perfect_match'] = d['Match1'] == d['using_original']
+
+	#fhp_matches['perfect_match'] = pd.Series(perfect_match, index = fhp_matches.index)
+	out = pd.DataFrame(d)
+	#out.to_stata(str(outFile + ".dta"))
+	out.to_csv(str(outFile + ".csv"))
+	print "******************************************"
+	print "*** Your analysis has been completed! *** "
+	print "******************************************"
+
+	return out  
+
+
+"""
+BASIC FILES/PATHS WHOSE USE IS REPEATED
+"""
+
+
+baseDir = os.path.join("<insert path>") 
+
+
+outDir = os.path.join(baseDir, "Matched_Results")
+
+if not os.path.exists(outDir):
+	os.makedirs(outDir)
+
+
+
+
+
+"""
+ICRISAT and 1971 Polygon borders 
+"""
+
+master_file = os.path.join(baseDir, "ICRISAT_Meso_Names.csv")
+input_1971 = os.path.join(baseDir,"District_Records_1971_Clean.csv")
+
+outFile = os.path.join(outDir, "ICRI_1971_DistrictMatches")
+icri_1971_match = districtMatch(input_1971, master_file, 'NAME', 'STATE_UT', 'distname', 'stname', outFile)
+
+# -- alternatively, don't save as a workspace object
+districtMatch(input_1971, master_file, 'NAME', 'STATE_UT', 'distname', 'stname', outFile)
+
+