liquidgenius · August 20, 2020 17:15 · Nov 11, 2015 · Nov 11, 2015
diff --git a/usaddress_adapter.py b/usaddress_adapter.py
@@ -42,13 +42,15 @@ def _combine(*label_prefixes):
         return ' '.join(components) or None
 
     return Address(
-        # Note: If you prefer street_1 and street_2, the "Occupancy" labels are generally what falls under street_2
+        # Note: If you prefer street_1 and street_2, the "Occupancy" labels 
+        # are generally what falls under street_2
         street=_combine('AddressNumber', 'StreetName', 'Occupancy'),
         city=_combine('PlaceName'),
         state=_combine('StateName'),
         zip_code=_combine('ZipCode'),
         country=_combine('CountryName'),
     )
 
-parse_address('123 Fake Street     ,    ste 123,,     San Diego, CA 12345-1234, USA')
-# >> Address(street=u'123 Fake Street ste 123', city=u'San Diego', state=u'CA', zip_code=u'12345-1234', country=u'USA')
+parse_address('123 Fake Street   ,   ste 123,,    San Diego, CA 12345-1234, USA')
+# >> Address(street=u'123 Fake Street ste 123', city=u'San Diego', 
+# >>         state=u'CA', zip_code=u'12345-1234', country=u'USA')
diff --git a/usaddress_adapter.py b/usaddress_adapter.py
@@ -0,0 +1,54 @@
+"""
+* requires usaddress https://github.com/datamade/usaddress
+* check out the website! https://parserator.datamade.us/usaddress
+
+The usaddress package is pretty great for normalizing inconsistent address data,
+especially when you have a lot to process and can't rely on using a geocoding API.
+The results are really granular, probably more so than you'll need, and definitely
+more than most CRM systems if integrating these addresses is your goal. 
+
+This is just a simple wrapper around the usaddress.tag() function that I feel will 
+fit most use cases. The usaddress.parse() function works as well, but tag has some 
+nice things I like such as combining certain address components, 
+stripping out extra commas, etc.
+
+For example:
+usaddress.tag('123 Fake Street, ste 123, san diego, CA 12345, USA')
+>> (OrderedDict([('AddressNumber', u'123'),
+>>               ('StreetName', u'Fake'),
+>>               ('StreetNamePostType', u'Street'),
+>>               ('OccupancyType', u'ste'),
+>>               ('OccupancyIdentifier', u'123'),
+>>               ('PlaceName', u'san diego'),
+>>               ('StateName', u'CA'),
+>>               ('ZipCode', u'12345'),
+>>               ('CountryName', u'USA')]),
+>>  'Street Address')
+
+"""
+import collections
+import usaddress  # This is based on 0.5.4
+
+Address = collections.namedtuple("Address", "street, city, state, zip_code, country")
+
+def parse_address(address_string):
+    tags, _ = usaddress.tag(address_string)
+
+    def _combine(*label_prefixes):
+        components = []
+        for label, component in tags.iteritems():  # tags is an OrderedDict so this is fine
+            if any(map(label.startswith, label_prefixes)):
+                components.append(component)
+        return ' '.join(components) or None
+
+    return Address(
+        # Note: If you prefer street_1 and street_2, the "Occupancy" labels are generally what falls under street_2
+        street=_combine('AddressNumber', 'StreetName', 'Occupancy'),
+        city=_combine('PlaceName'),
+        state=_combine('StateName'),
+        zip_code=_combine('ZipCode'),
+        country=_combine('CountryName'),
+    )
+
+parse_address('123 Fake Street     ,    ste 123,,     San Diego, CA 12345-1234, USA')
+# >> Address(street=u'123 Fake Street ste 123', city=u'San Diego', state=u'CA', zip_code=u'12345-1234', country=u'USA')