Last active
June 14, 2022 18:48
-
-
Save nickolasclarke/fe353a1801bb6c91902f27f5d974b5b7 to your computer and use it in GitHub Desktop.
Extract Power Plant Data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| import pandas as pd | |
| from bs4 import BeautifulSoup | |
# Prefix of every project page URL; the numeric project id is appended to it.
BASE_URL = "https://www.usbr.gov/projects/index.php?id="
def scrape_power_plant(id: int, *, timeout: float = 30.0) -> dict:
    """
    Return power plant information from the page's "Main-well" div as a dict.

    The page at ``BASE_URL + str(id)`` is downloaded and parsed. The result
    holds the fixed keys "name", "state", "region", "overview", "plan" and
    "contact", plus one key per label/value pair of <td> cells found in the
    "Details" div.

    Args:
        id: Numeric project id appended to ``BASE_URL``. (The name shadows
            the builtin, but is kept so keyword callers keep working.)
        timeout: Seconds to wait for the server before giving up, so one
            stalled connection cannot hang a whole scraping run.

    Returns:
        The flat dict described above. A fixed field whose markup is absent
        or whose value is empty is reported as the string "missing". If the
        page cannot be fetched, or has no "Main-well" div at all, the failure
        is printed and an empty dict is returned.

    note: highly brittle -- everything depends on the page's current markup.
    """

    def field(extract):
        """Return extract(), or "missing" if the markup is absent or empty."""
        try:
            value = extract()
        except (AttributeError, IndexError):
            # A tag lookup gave None (AttributeError on the next attribute
            # access) or a selector matched nothing (IndexError on [0]).
            return "missing"
        return value if value else "missing"

    try:
        page = requests.get(BASE_URL + str(id), timeout=timeout)
        # Fail on HTTP errors instead of trying to parse an error page.
        page.raise_for_status()
        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(page.content, "html.parser")

        # main div class "Main-well"
        main_div = soup.find("div", {"class": "Main-well"})
        if main_div is None:
            # Without the main container nothing can be extracted; treat the
            # whole plant as failed rather than returning a row of "missing".
            raise ValueError('no "Main-well" div on page')

        res = {
            "name": field(lambda: main_div.h1.string),
            "state": field(
                lambda: soup.select('a[href^="/projects/facilities.php?state="]')[0].text
            ),
            "region": field(
                lambda: soup.select('a[href^="/projects/facilities.php?region="]')[0].text
            ),
            # NOTE: "related_links" (hrefs inside the fourth <p> of the main
            # div) is currently not collected.
            "overview": field(lambda: main_div.find("div", id='History').p.text),
            "plan": field(lambda: main_div.find("div", id='Plan').p.text),
            "contact": field(
                lambda: list(main_div.find("div", {"class": "contactRow"}).stripped_strings)
            ),
        }

        # Extract details and flatten them into the main dict: the <td> cells
        # alternate label, value, label, value, ...; a trailing unpaired cell
        # is dropped by zip().
        details_div = main_div.find("div", {"id": "Details"})
        if details_div is not None:
            cells = [cell.text for cell in details_div.find_all('td')]
            res.update(zip(cells[0::2], cells[1::2]))

        print(f'{res["name"]} processed. . .')
        return res
    except Exception as e:
        # Top-level boundary for one plant: report the problem and let the
        # caller continue with the remaining plants.
        print(f'Plant {id} failed to process. . .', e)
        return {}
# Plants to scrape, as a list of single-entry {display name: project id}
# dicts. The id is the number appended to the project page URL.
power_plants = [
    {name: project_id}
    for name, project_id in (
        ("Alcova Powerplant", 524),
        ("Anderson Ranch Powerplant", 525),
        ("Big Thompson Powerplant", 578),
        ("Black Canyon Powerplant", 527),
        ("Blue Mesa Powerplant", 529),
        ("Boise River Diversion Powerplant", 530),
        ("Boysen Powerplant", 531),
        ("Buffalo Bill Powerplant", 533),
        ("Canyon Ferry Powerplant", 536),
        ("Chandler Powerplant", 538),
        ("Crystal Powerplant", 539),
        ("Davis Powerplant", 541),
        ("Deer Creek Powerplant", 542),
        ("Elephant Butte Powerplant", 543),
        ("Estes Powerplant", 544),
        ("Flaming Gorge Powerplant", 545),
        ("Flatiron Powerplant", 546),
        ("Folsom Powerplant", 547),
        ("Fontenelle Powerplant", 549),
        ("Fremont Canyon Powerplant", 550),
        ("Glen Canyon Powerplant", 522),
        ("Glendo Powerplant", 523),
        ("Grand Coulee Powerplant", 526),
        ("Green Mountain Powerplant", 528),
        ("Green Springs Powerplant", 534),
        ("Guernsey Powerplant", 535),
        ("Heart Mountain Powerplant", 537),
        ("Hoover Powerplant", 540),
        ("Hungry Horse Powerplant", 548),
        ("Judge Francis Carr Powerplant", 532),
        ("Keswick Powerplant", 579),
        ("Kortes Powerplant", 555),
        ("Lewiston Powerplant", 557),
        ("Lower Molina Powerplant", 558),
        ("Marys Lake Powerplant", 561),
        ("McPhee Powerplant", 563),
        ("Minidoka Powerplant", 565),
        ("Morrow Point Powerplant", 567),
        ("Mount Elbert Powerplant", 568),
        ("New Melones Powerplant", 569),
        ("Nimbus Powerplant", 570),
        ("O`Neill Powerplant", 571),
        ("Palisades Powerplant", 572),
        ("Parker Powerplant", 573),
        ("Pilot Butte Powerplant", 574),
        ("Pole Hill Powerplant", 575),
        ("Roza Powerplant", 576),
        ("San Luis (William R. Gianelli) Powerplant", 577),
        ("Seminoe Powerplant", 551),
        ("Shasta Powerplant", 552),
        ("Shoshone Powerplant", 553),
        ("Spirit Mountain Powerplant", 554),
        ("Spring Creek Powerplant", 556),
        ("Stampede Powerplant", 559),
        ("Towaoc Powerplant", 560),
        ("Trinity Powerplant", 562),
        ("Upper Molina Powerplant", 564),
        ("Yellowtail Powerplant", 566),
    )
]
# Scrape each listed plant in order. Every entry of power_plants holds exactly
# one {name: id} pair, so the first value is the project id. A plant that
# fails to scrape contributes an empty dict.
results = list(
    map(scrape_power_plant, (next(iter(plant.values())) for plant in power_plants))
)

# Write one CSV row per plant (including the DataFrame index column).
pd.DataFrame.from_dict(results).to_csv('power_plant_data.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment