
@mdobson
Last active June 22, 2018 18:34

Revisions

  1. mdobson revised this gist Jun 22, 2018. 1 changed file with 43 additions and 0 deletions.
    2.py
    import pandas as pd
    import requests as r
    import urllib.parse as u

    df = pd.read_csv('deq_asbestos_notifications.csv')

    request_url = 'https://gis.detroitmi.gov/arcgis/rest/services/DoIT/AddressPointGeocoder/GeocodeServer/findAddressCandidates?{}&ZIP=&Single+Line+Input=&category=&outFields=*&maxLocations=&outSR=4326&searchExtent=&location=&distance=&magicKey=&f=pjson'
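    # findAddressCandidates (hit via request_url above) returns ranked geocode
    # candidates as JSON; the {} placeholder takes the url-encoded Street parameter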

    def geocode_address(addr):
        qs = {'Street': addr}
        encoded_qs = u.urlencode(qs)
        res = r.get(request_url.format(encoded_qs))
        data_dict = res.json()
        # Take the top candidate for now, for the sake of greed
        data_dict_candidates = data_dict['candidates']
        if len(data_dict_candidates) > 0:
            first_candidate = data_dict_candidates[0]
            first_candidate_location = first_candidate['location']
            first_candidate_attributes = first_candidate['attributes']
            return pd.Series({'Parcel': first_candidate_attributes['User_fld'].strip(),
                              'Score': first_candidate_attributes['Score'],
                              'Long': first_candidate_location['x'],
                              'Lat': first_candidate_location['y']})
        else:
            return pd.Series({'Parcel': 'Unknown', 'Score': 0})

    # Get all unique addresses in the asbestos notification entries and geocode them
    uniq_addrs = df.Address.unique()
    addrs = {'Address': uniq_addrs}
    addr_df = pd.DataFrame(data=addrs)
    #geocode_slice = addr_df[:20]  # uncomment (and swap in below) to test on a small batch
    geocode_result = addr_df.apply(lambda data_row: geocode_address(data_row['Address']), axis=1)
    merged = addr_df.merge(geocode_result, left_index=True, right_index=True)


    # This will find imperfectly matched geocode results
    #unmatched_geocodes = merged.query('Score < 100.00')

    # Join geocoded addresses back to the original dataframe
    joined_df = df.set_index('Address').join(merged.set_index('Address'))
    queried_df = joined_df.query('Score > 0')
    bad_addresses = joined_df.query('Score == 0')

    queried_df.to_csv('deq_asbestos_notifications_with_geocodes.csv')
    bad_addresses.to_csv('deq_asbestos_notifications_anomalous_addresses.csv')
  2. mdobson created this gist Jun 22, 2018.
    1.py
    import requests
    import urllib
    from bs4 import BeautifulSoup
    from urllib.parse import urlparse


    # Initial request: fetch the search form page
    r = requests.get('http://www.deq.state.mi.us/asbestos_notifications/Pages/AbSearch.aspx')

    # Parse out the nasty ASP.NET form stuff
    soup = BeautifulSoup(r.text, 'html.parser')

    #print(soup)

    hidden_field_token = 'ctl00_BodyContent_smAjax_HiddenField'
    parsed_hidden_field_key = '_TSM_CombinedScripts_'
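    # The page's ASP.NET script manager embeds a _TSM_CombinedScripts_ token in a
    # script src query string; the loop below digs it out so it can be echoed back
    # in the hidden field on the postback (an assumption based on the field names above)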

    for script in soup.find_all('script'):
        src = script.get('src')
        if src is not None and hidden_field_token in src:
            o = urlparse(src)
            qs = urllib.parse.parse_qs(o.query)
            #print(qs)

    payload = {}

    wayne_county_code = '82'
    city_name = 'Detroit'

    county_key = 'ctl00$BodyContent$ddlCounty'
    city_key = 'ctl00$BodyContent$txtCity'
    clear_key = 'ctl00$BodyContent$_btnClear'

    # Copy every existing form input into the payload so the hidden ASP.NET state
    # fields (e.g. __VIEWSTATE) round-trip with the POST
    for inp in soup.find_all('input'):
        name = inp.get('name')
        value = inp.get('value') if inp.get('value') is not None else ''
        payload[name] = value

    # Set up the form payload to be sent back to the server, hard-coding the
    # county code for Wayne and the city of Detroit
    payload[county_key] = wayne_county_code
    payload[city_key] = city_name
    payload[hidden_field_token] = qs[parsed_hidden_field_key][0]
    # Drop the Clear button so the postback isn't read as a click on Clear
    payload.pop(clear_key)


    #print(payload)

    # Send a POST request with the assembled form payload
    req_url = 'http://www.deq.state.mi.us/asbestos_notifications/Pages/AbSearch.aspx'

    second_r = requests.post(req_url, data=payload)
    #print(second_r.status_code)

    parsed_data_page = BeautifulSoup(second_r.text, 'html.parser')
    print(parsed_data_page)

    #scraped_data = parsed_data_page.find_all('div', attrs={'id': 'divPrint'})
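    # A possible next step (hypothetical, not part of the original gist): pull the
    # results table out of the response with pandas and write the CSV that 2.py
    # reads. Assumes the results render as a plain HTML table.
    #import pandas as pd
    #pd.read_html(second_r.text)[0].to_csv('deq_asbestos_notifications.csv', index=False)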