Revisions
mdobson revised this gist on Jun 22, 2018. 1 changed file with 43 additions and 0 deletions.
The revised file geocodes the scraped DEQ asbestos notification addresses against Detroit's ArcGIS address point geocoder:

```python
import pandas as pd
import requests as r
import urllib.parse as u

df = pd.read_csv('deq_asbestos_notifications.csv')

request_url = 'https://gis.detroitmi.gov/arcgis/rest/services/DoIT/AddressPointGeocoder/GeocodeServer/findAddressCandidates?{}&ZIP=&Single+Line+Input=&category=&outFields=*&maxLocations=&outSR=4326&searchExtent=&location=&distance=&magicKey=&f=pjson'

def geocode_address(addr):
    """Geocode one street address and return the top candidate as a Series."""
    qs = {'Street': addr}
    encoded_qs = u.urlencode(qs)
    res = r.get(request_url.format(encoded_qs))
    data_dict = res.json()
    # Take the top candidate for now, for the sake of greed
    data_dict_candidates = data_dict['candidates']
    if len(data_dict_candidates) > 0:
        first_candidate = data_dict_candidates[0]
        first_candidate_location = first_candidate['location']
        first_candidate_attributes = first_candidate['attributes']
        return pd.Series({'Parcel': first_candidate_attributes['User_fld'].strip(),
                          'Score': first_candidate_attributes['Score'],
                          'Long': first_candidate_location['x'],
                          'Lat': first_candidate_location['y']})
    else:
        return pd.Series({'Parcel': 'Unknown', 'Score': 0})

# Get all unique addresses in the asbestos notification entries and geocode them
uniq_addrs = df.Address.unique()
addrs = {'Address': uniq_addrs}
addr_df = pd.DataFrame(data=addrs)
#geocode_slice = addr_df[:20]  # uncomment to trial-run on a small slice
geocode_result = addr_df.apply(lambda data_row: geocode_address(data_row['Address']), axis=1)
merged = addr_df.merge(geocode_result, left_index=True, right_index=True)

# This will find imperfectly matched geocode results
#unmatched_geocodes = merged.query('Score < 100.00')

# Join the geocoded addresses back to the original dataframe
joined_df = df.set_index('Address').join(merged.set_index('Address'))
queried_df = joined_df.query('Score > 0')
bad_addresses = joined_df.query('Score == 0')
queried_df.to_csv('deq_asbestos_notifications_with_geocodes.csv')
bad_addresses.to_csv('deq_asbestos_notifications_anomalous_addresses.csv')
```
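Before geocoding every address, it can help to smoke-test the endpoint on a single street and eyeball the candidates. A minimal sketch, assuming the same response shape the script above relies on; the sample address is hypothetical, so substitute one from the CSV:

```python
import requests as r
import urllib.parse as u

# Same endpoint as above, trimmed to the parameters the script actually uses
url = ('https://gis.detroitmi.gov/arcgis/rest/services/DoIT/'
       'AddressPointGeocoder/GeocodeServer/findAddressCandidates'
       '?{}&outFields=*&outSR=4326&f=pjson')

# Hypothetical address for illustration only
qs = u.urlencode({'Street': '2 Woodward Ave'})
for c in r.get(url.format(qs)).json().get('candidates', []):
    # Print score and coordinates the same way geocode_address reads them
    print(c['attributes']['Score'], c['location']['x'], c['location']['y'])
```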
mdobson created this gist on Jun 22, 2018.
The original file scrapes the Michigan DEQ asbestos notification search, posting the ASP.NET form back with Wayne County and the city of Detroit filled in:

```python
import requests
import urllib
from bs4 import BeautifulSoup
from urllib.parse import urlparse

# Initial request
r = requests.get('http://www.deq.state.mi.us/asbestos_notifications/Pages/AbSearch.aspx')

# Parse out the nasty ASP.NET form stuff
soup = BeautifulSoup(r.text, 'html.parser')
#print(soup)

hidden_field_token = 'ctl00_BodyContent_smAjax_HiddenField'
parsed_hidden_field_key = '_TSM_CombinedScripts_'

# Pull the combined-scripts token out of the script tag that references it
for script in soup.find_all('script'):
    src = script.get('src')
    if src is not None and hidden_field_token in src:
        o = urlparse(src)
        qs = urllib.parse.parse_qs(o.query)
        #print(qs)

payload = {}
wayne_county_code = '82'
city_name = 'Detroit'
county_key = 'ctl00$BodyContent$ddlCounty'
city_key = 'ctl00$BodyContent$txtCity'
clear_key = 'ctl00$BodyContent$_btnClear'

# Echo every input on the form back in the payload, defaulting missing values to ''
for inp in soup.find_all('input'):
    name = inp.get('name')
    value = inp.get('value') if inp.get('value') is not None else ''
    payload[name] = value

# Set up the form object to be sent to their servers, hard-coding the county
# code for Wayne and the city of Detroit
payload[county_key] = wayne_county_code
payload[city_key] = city_name
payload[hidden_field_token] = qs[parsed_hidden_field_key][0]
payload.pop(clear_key)
#print(payload)

# Send a POST request with the completed form
req_url = 'http://www.deq.state.mi.us/asbestos_notifications/Pages/AbSearch.aspx'
second_r = requests.post(req_url, data=payload)
#print(second_r.status_code)

parsed_data_page = BeautifulSoup(second_r.text, 'html.parser')
print(parsed_data_page)
#scraped_data = parsed_data_page.find_all('div', attrs={'id': 'divPrint'})
```
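The commented-out last line hints at the next step. As a follow-on sketch, assuming the results render as an HTML table inside that divPrint div (not verified against the live page), the table could be handed to pandas and written out as the CSV the geocoding revision above reads:

```python
import pandas as pd

# Assumes the search results are an HTML table inside the 'divPrint' div,
# per the commented-out hint above; adjust if the page structure differs.
div = parsed_data_page.find('div', attrs={'id': 'divPrint'})
if div is not None:
    try:
        tables = pd.read_html(str(div))
        # Write the first table out as the input for the geocoding script
        tables[0].to_csv('deq_asbestos_notifications.csv', index=False)
    except ValueError:
        print('No table found inside divPrint')
```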