import requests from bs4 import BeautifulSoup import time # We've now imported the two packages that will do the heavy lifting # for us, reqeusts and BeautifulSoup # This is the URL that lists the current inmates # Should this URL go away, and archive is available at # http://perma.cc/2HZR-N38X url_to_scrape = 'http://apps2.polkcountyiowa.gov/inmatesontheweb/' # Tell the requests package to retreive the contents our page (it'll be # grabbing what you see when you use the View Source feature in your browser) r = requests.get(url_to_scrape) # We now have the source HTML of the page. Let's ask BeaultifulSoup # to parse it for us. soup = BeautifulSoup(r.text) # Down below we'll add our inmates to this list. For now, # it's just a placeholder. inmates_links = [] # Our source document puts each inmate in an HTML table row. Let's # loop through all of those rows for table_row in soup.select(".inmatesList tr"): # Each table row has a set of tabel cells, or tds. Let's # get all of those. table_cells = table_row.findAll('td') # Our table has one exception -- a row without any cells. # Let's handle that special case here by making sure we # have more than zero cells before processing the cells if len(table_cells) > 0: # By looking at our source (probably easiest in your browser), we can # see that the link is in the first td of each row. Let's extract the # value of that link here. # # Should this link pattern change, find an archive of an # example at http://perma.cc/RTU7-57DL relative_link_to_inmate_details = table_cells[0].find('a')['href'] # The links to the inmates are relative (they look # like Details.aspx?bi=212840). We need to make them absolute links. # We do that by prepending our base URL (which conveniently is the same # one we used to get the list of inmates.) absolute_link_to_inmate_details = url_to_scrape + relative_link_to_inmate_details # We're done getting the link to the inmate details. Let's add it # to our list of inmates for later use inmates_links.append(absolute_link_to_inmate_details) # Down below we'll add our inmates details to this list. For now, # it's just a placeholder. inmates = [] # Loop through the list of inmate links we built # Since the inmate list is several hunderd links in total, # we might want to slice just a few off for testing. Here, we start with five. for inmate_link in inmates_links[:10]: # Once again we'll use requests to get the HTML of our link # and use beautiful soup to process it. r = requests.get(inmate_link) soup = BeautifulSoup(r.text) # We'll put the details we want to hang on to in this dictionary inmate_details = {} # Get all of our table rows in the inmateProfile table inmate_profile_rows = soup.select("#inmateProfile tr") # Inmate age # From looking at the HTML source (using View Source in our browser) # we see that age is in the first row and the first table cell (td) # We use the strip function to cleanup unwanted spaces inmate_details['age'] = inmate_profile_rows[0].findAll('td')[0].text.strip() # Inmate race # Race and naem are in our same inmateProfile table, we just find # the correct row inmate_details['race'] = inmate_profile_rows[3].findAll('td')[0].text.strip() # Inmate sex inmate_details['sex'] = inmate_profile_rows[4].findAll('td')[0].text.strip() # Get all of our table rows in the inmateNameDate table inmate_name_date_rows = soup.select("#inmateNameDate tr") # Inmate name inmate_details['name'] = inmate_name_date_rows[1].findAll('td')[0].text.strip() # Inmate booking time inmate_details['booked_at'] = inmate_name_date_rows[2].findAll('td')[0].text.strip() # Get all of our table rows in the inmateNameDate table inmate_address_container = soup.select("#inmateAddress") inmate_details['city'] = inmate_address_container[0].text.split('\n')[2].strip() # Now that we have all of the inmate details extracted and placed in a # dictionary, let's append that dictionary to our list inmates.append(inmate_details) # We don't want to overwhelm the Polk County site. Let's pause for one # second between each inmate request. time.sleep(1) # We now have details (in our dictionary) for each inmate. Let's print those out. for inmate in inmates: print '{0}, {1}'.format(inmate['name'], inmate['age']) print '{0} {1} from {2}'.format(inmate['race'], inmate['sex'], inmate['city']) print 'Booked at {0}'.format(inmate['booked_at']) print '' # We might want to do more than just print out our numbers though. Maybe # we want to see count up each inmate's city and print it out.s inmate_cities = {} for inmate in inmates: # If we haven't seen the inmate's city already, add it to our # dictionary with the value of 1. Otherwise, just add 1. if inmate['city'] in inmate_cities: inmate_cities[inmate['city']] += 1 else: inmate_cities[inmate['city']] = 1 print inmate_cities # Or, each inmate's race inmate_races = {} for inmate in inmates: if inmate['race'] in inmate_races: inmate_races[inmate['race']] += 1 else: inmate_races[inmate['race']] = 1 print inmate_races