@nickolasclarke
Last active June 14, 2022 18:48
Extract Power Plant Data
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Each USBR project/facility page lives at this URL plus a numeric id.
BASE_URL = 'https://www.usbr.gov/projects/index.php?id='

def scrape_power_plant(id: int) -> dict:
    """
    Return power plant information scraped from the page's "Main-well" div
    as a dict. Note: highly brittle, since it depends on the page layout.
    """
    try:
        page = requests.get(BASE_URL + str(id))
        soup = BeautifulSoup(page.content, 'html.parser')
        main_div = soup.find("div", {"class": "Main-well"})
        # Group a flat sequence into n-tuples, e.g. pairwise('abcd', 2) -> ('a','b'), ('c','d')
        pairwise = lambda x, n: zip(*[iter(x)] * n)
        # Fall back to "missing" when a lookup comes back empty/None
        attempt = lambda q: q if q else "missing"
        res = {
            "name": attempt(main_div.h1.string),
            "state": attempt(soup.select('a[href^="/projects/facilities.php?state="]')[0].text),
            "region": attempt(soup.select('a[href^="/projects/facilities.php?region="]')[0].text),
            #"related_links": attempt([link.get('href') for link in main_div.find_all("p")[3].find_all('a')]),
            "overview": attempt(main_div.find("div", id='History').p.text),
            "plan": attempt(main_div.find("div", id='Plan').p.text),
            "contact": attempt(list(main_div.find("div", {"class": "contactRow"}).stripped_strings)),
        }
        # The "Details" table is a flat run of <td> cells; pair them up into
        # key/value tuples and flatten them into the main dict.
        details = dict(pairwise([cell.text for cell in main_div.find("div", {"id": "Details"}).find_all('td')], 2))
        res.update(details)
        print(f'{res["name"]} processed. . .')
        return res
    except Exception as e:
        print(f'Plant {id} failed to process. . .', e)
        return {}
# Map of plant name -> USBR facility page id.
power_plants = {
    'Alcova Powerplant': 524,
    'Anderson Ranch Powerplant': 525,
    'Big Thompson Powerplant': 578,
    'Black Canyon Powerplant': 527,
    'Blue Mesa Powerplant': 529,
    'Boise River Diversion Powerplant': 530,
    'Boysen Powerplant': 531,
    'Buffalo Bill Powerplant': 533,
    'Canyon Ferry Powerplant': 536,
    'Chandler Powerplant': 538,
    'Crystal Powerplant': 539,
    'Davis Powerplant': 541,
    'Deer Creek Powerplant': 542,
    'Elephant Butte Powerplant': 543,
    'Estes Powerplant': 544,
    'Flaming Gorge Powerplant': 545,
    'Flatiron Powerplant': 546,
    'Folsom Powerplant': 547,
    'Fontenelle Powerplant': 549,
    'Fremont Canyon Powerplant': 550,
    'Glen Canyon Powerplant': 522,
    'Glendo Powerplant': 523,
    'Grand Coulee Powerplant': 526,
    'Green Mountain Powerplant': 528,
    'Green Springs Powerplant': 534,
    'Guernsey Powerplant': 535,
    'Heart Mountain Powerplant': 537,
    'Hoover Powerplant': 540,
    'Hungry Horse Powerplant': 548,
    'Judge Francis Carr Powerplant': 532,
    'Keswick Powerplant': 579,
    'Kortes Powerplant': 555,
    'Lewiston Powerplant': 557,
    'Lower Molina Powerplant': 558,
    'Marys Lake Powerplant': 561,
    'McPhee Powerplant': 563,
    'Minidoka Powerplant': 565,
    'Morrow Point Powerplant': 567,
    'Mount Elbert Powerplant': 568,
    'New Melones Powerplant': 569,
    'Nimbus Powerplant': 570,
    "O'Neill Powerplant": 571,
    'Palisades Powerplant': 572,
    'Parker Powerplant': 573,
    'Pilot Butte Powerplant': 574,
    'Pole Hill Powerplant': 575,
    'Roza Powerplant': 576,
    'San Luis (William R. Gianelli) Powerplant': 577,
    'Seminoe Powerplant': 551,
    'Shasta Powerplant': 552,
    'Shoshone Powerplant': 553,
    'Spirit Mountain Powerplant': 554,
    'Spring Creek Powerplant': 556,
    'Stampede Powerplant': 559,
    'Towaoc Powerplant': 560,
    'Trinity Powerplant': 562,
    'Upper Molina Powerplant': 564,
    'Yellowtail Powerplant': 566,
}
results = [scrape_power_plant(plant_id) for plant_id in power_plants.values()]
pd.DataFrame(results).to_csv('power_plant_data.csv')
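
# Optional hardening (a sketch, not part of the original gist): usbr.gov can be
# slow or return transient server errors across ~60 sequential requests, so a
# requests.Session with retry/backoff makes the scrape more robust. To use it,
# scrape_power_plant() would need to call session.get() instead of requests.get().
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
session.mount('https://', HTTPAdapter(max_retries=Retry(
    total=3,                                   # retry each request up to 3 times
    backoff_factor=1,                          # wait 1s, 2s, 4s between attempts
    status_forcelist=[500, 502, 503, 504],     # retry only on server-side errors
)))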