"""
* requires usaddress https://github.com/datamade/usaddress
* check out the website! https://parserator.datamade.us/usaddress

The usaddress package is pretty great for normalizing inconsistent address
data, especially if you have a lot to process and can't rely on using a
geocoding api. The results are really granular, probably more so than you'll
need, and definitely more than most CRM systems if integrating these
addresses is your goal.

This is just a simple wrapper around the usaddress.tag() function that I feel
will fit most use cases. The usaddress.parse() function works as well, but
tag has some nice things I like such as combining certain address components,
stripping out extra commas, etc.

For example:

    usaddress.tag('123 Fake Street, ste 123, san diego, CA 12345, USA')
    >> (OrderedDict([('AddressNumber', u'123'),
    >>               ('StreetName', u'Fake'),
    >>               ('StreetNamePostType', u'Street'),
    >>               ('OccupancyType', u'ste'),
    >>               ('OccupancyIdentifier', u'123'),
    >>               ('PlaceName', u'san diego'),
    >>               ('StateName', u'CA'),
    >>               ('ZipCode', u'12345'),
    >>               ('CountryName', u'USA')]),
    >>  'Street Address')
"""
import collections

import usaddress  # This is based on 0.5.4

# Flat, CRM-friendly view of an address. Each field holds the matching
# usaddress components joined by single spaces, or None when nothing matched.
Address = collections.namedtuple("Address", "street, city, state, zip_code, country")


def parse_address(address_string):
    """Parse a free-form US address string into an ``Address`` tuple.

    The string is handed to ``usaddress.tag()`` and the granular labels it
    returns are collapsed into five coarse fields by label prefix.

    Args:
        address_string: The raw address text to parse.

    Returns:
        An ``Address`` namedtuple ``(street, city, state, zip_code, country)``.
        A field is ``None`` when no tagged component matched it.

    Raises:
        Nothing is caught here: any exception raised by ``usaddress.tag()``
        propagates to the caller (see the usaddress documentation, e.g.
        ``RepeatedLabelError``).
    """
    # tag() returns (components, address_type); the address type is not used.
    tags, _ = usaddress.tag(address_string)

    def _combine(*label_prefixes):
        """Join, in parsed order, every component whose label starts with one
        of ``label_prefixes``; return None if there are none."""
        # tags is an OrderedDict, so iterating it keeps the components in the
        # order they were tagged. items() (rather than the Python 2-only
        # iteritems()) works on both Python 2 and Python 3.
        # str.startswith accepts a tuple of prefixes, so one call covers all.
        components = [
            component
            for label, component in tags.items()
            if label.startswith(label_prefixes)
        ]
        return ' '.join(components) or None

    return Address(
        # Note: If you prefer street_1 and street_2, the "Occupancy" labels
        # are generally what falls under street_2
        street=_combine('AddressNumber', 'StreetName', 'Occupancy'),
        city=_combine('PlaceName'),
        state=_combine('StateName'),
        zip_code=_combine('ZipCode'),
        country=_combine('CountryName'),
    )


if __name__ == "__main__":
    # Demo only: guarded so that importing this module does no parsing work.
    print(parse_address('123 Fake Street , ste 123,, San Diego, CA 12345-1234, USA'))
    # >> Address(street=u'123 Fake Street ste 123', city=u'San Diego',
    # >>         state=u'CA', zip_code=u'12345-1234', country=u'USA')
    # (the u'' prefixes are the Python 2 repr; Python 3 shows plain '...')