|
|
@@ -0,0 +1,54 @@ |
|
|
""" |
|
|
* requires usaddress https://github.com/datamade/usaddress |
|
|
* check out the website! https://parserator.datamade.us/usaddress |
|
|
|
|
|
The usaddress package is pretty great for normalizing inconsistent address data, |
|
|
especially when you have a lot to process and can't rely on using a geocoding API.
|
|
The results are really granular, probably more so than you'll need, and definitely
|
|
more than most CRM systems if integrating these addresses is your goal. |
|
|
|
|
|
This is just a simple wrapper around the usaddress.tag() function that I feel will |
|
|
fit most use cases. The usaddress.parse() function works as well, but tag has some |
|
|
nice things I like such as combining certain address components, |
|
|
stripping out extra commas, etc. |
|
|
|
|
|
For example: |
|
|
usaddress.tag('123 Fake Street, ste 123, san diego, CA 12345, USA') |
|
|
>> (OrderedDict([('AddressNumber', u'123'), |
|
|
>> ('StreetName', u'Fake'), |
|
|
>> ('StreetNamePostType', u'Street'), |
|
|
>> ('OccupancyType', u'ste'), |
|
|
>> ('OccupancyIdentifier', u'123'), |
|
|
>> ('PlaceName', u'san diego'), |
|
|
>> ('StateName', u'CA'), |
|
|
>> ('ZipCode', u'12345'), |
|
|
>> ('CountryName', u'USA')]), |
|
|
>> 'Street Address') |
|
|
|
|
|
""" |
|
|
import collections |
|
|
import usaddress # This is based on 0.5.4 |
|
|
|
|
|
# Coarse-grained address record returned by parse_address(); each field holds
# the space-joined text of the matching usaddress labels, or None when absent.
Address = collections.namedtuple(
    "Address",
    ["street", "city", "state", "zip_code", "country"],
)
|
|
|
|
|
def parse_address(address_string):
    """Parse a free-form US address string into a coarse ``Address`` tuple.

    The string is handed to ``usaddress.tag()`` and the fine-grained labels it
    returns are collapsed, by label-name prefix, into the five ``Address``
    fields.

    Args:
        address_string: The raw address text to parse.

    Returns:
        An ``Address`` namedtuple. Each field is the matching components
        joined with single spaces, or ``None`` when no label matched (or the
        joined text is empty).

    Raises:
        Whatever ``usaddress.tag()`` raises is propagated unchanged.
        NOTE(review): per the usaddress docs this includes
        ``RepeatedLabelError`` for strings it cannot tag unambiguously --
        confirm against the installed version.
    """
    # The second element of the result (the address type) is not needed here.
    tags, _ = usaddress.tag(address_string)

    def _combine(*label_prefixes):
        """Join every component whose label starts with one of the prefixes."""
        # .items() instead of the Python-2-only .iteritems(), so this runs on
        # both Python 2 and 3. Components are emitted in the order the labels
        # appear in ``tags`` (not the order of ``label_prefixes``), which keeps
        # the pieces in their original address order.
        # str.startswith accepts a tuple, so one call covers all prefixes.
        components = [
            component
            for label, component in tags.items()
            if label.startswith(label_prefixes)
        ]
        return ' '.join(components) or None

    return Address(
        # Note: If you prefer street_1 and street_2, the "Occupancy" labels
        # are generally what falls under street_2.
        # 'StreetName' is a prefix match, so it also picks up labels such as
        # 'StreetNamePostType'.
        street=_combine('AddressNumber', 'StreetName', 'Occupancy'),
        city=_combine('PlaceName'),
        state=_combine('StateName'),
        zip_code=_combine('ZipCode'),
        country=_combine('CountryName'),
    )
|
|
|
|
|
if __name__ == "__main__":
    # Demo only: guarded so that importing this module does not run a parse
    # (and discard its result) as an import-time side effect.
    print(parse_address('123 Fake Street , ste 123,, San Diego, CA 12345-1234, USA'))
    # Expected result as recorded by the original author (the u'' prefixes are
    # Python 2 repr output):
    # >> Address(street=u'123 Fake Street ste 123', city=u'San Diego', state=u'CA', zip_code=u'12345-1234', country=u'USA')