"""Scrape Bureau of Reclamation power-plant pages into a CSV.

Fetches each plant's project page from usbr.gov, extracts fields from the
page's "Main-well" section, and writes the combined records to
``power_plant_data.csv``.
"""

import requests
import pandas as pd
from bs4 import BeautifulSoup

BASE_URL = 'https://www.usbr.gov/projects/index.php?id='
REQUEST_TIMEOUT = 30  # seconds; without this a dead server hangs the whole run


def _pairwise(cells, n):
    """Group *cells* into non-overlapping tuples of length *n*.

    The classic ``zip(*[iter(x)] * n)`` grouper: all n zip arguments share
    one iterator, so consecutive items land in the same tuple.
    """
    return zip(*[iter(cells)] * n)


def _or_missing(value):
    """Return *value* unless it is falsy, in which case return ``"missing"``."""
    return value if value else "missing"


def scrape_power_plant(plant_id: int) -> dict:
    """Return power-plant information scraped from the "Main-well" div.

    note: highly brittle — depends on the exact usbr.gov page layout.

    Args:
        plant_id: Numeric project id appended to ``BASE_URL``.

    Returns:
        A dict of scraped fields (name, state, region, overview, plan,
        contact, plus the flattened key/value "Details" table), or an
        empty dict if anything goes wrong for this page.
    """
    try:
        page = requests.get(BASE_URL + str(plant_id), timeout=REQUEST_TIMEOUT)
        # Fail fast on 404/500 instead of scraping an error page as data.
        page.raise_for_status()
        # Explicit parser: the implicit default varies by environment.
        soup = BeautifulSoup(page.content, "html.parser")
        main_div = soup.find("div", {"class": "Main-well"})
        res = {
            "name": _or_missing(main_div.h1.string),
            "state": _or_missing(soup.select('a[href^="/projects/facilities.php?state="]')[0].text),
            "region": _or_missing(soup.select('a[href^="/projects/facilities.php?region="]')[0].text),
            "overview": _or_missing(main_div.find("div", id='History').p.text),
            "plan": _or_missing(main_div.find("div", id='Plan').p.text),
            "contact": _or_missing(list(main_div.find("div", {"class": "contactRow"}).stripped_strings)),
        }
        # The "Details" div is a table of alternating key/value <td> cells;
        # pair them up and flatten into the main record.
        detail_cells = [cell.text for cell in main_div.find("div", {"id": "Details"}).find_all('td')]
        res.update(dict(_pairwise(detail_cells, 2)))
        print(f'{res["name"]} processed. . .')
        return res
    except Exception as e:
        # Boundary handler: one malformed/unreachable page must not abort
        # the whole batch; log it and move on with an empty record.
        print(f'Plant {plant_id} failed to process. . .', e)
        return {}


# Plant name -> usbr.gov project id.
power_plants = {
    'Alcova Powerplant': 524,
    'Anderson Ranch Powerplant': 525,
    'Big Thompson Powerplant': 578,
    'Black Canyon Powerplant': 527,
    'Blue Mesa Powerplant': 529,
    'Boise River Diversion Powerplant': 530,
    'Boysen Powerplant': 531,
    'Buffalo Bill Powerplant': 533,
    'Canyon Ferry Powerplant': 536,
    'Chandler Powerplant': 538,
    'Crystal Powerplant': 539,
    'Davis Powerplant': 541,
    'Deer Creek Powerplant': 542,
    'Elephant Butte Powerplant': 543,
    'Estes Powerplant': 544,
    'Flaming Gorge Powerplant': 545,
    'Flatiron Powerplant': 546,
    'Folsom Powerplant': 547,
    'Fontenelle Powerplant': 549,
    'Fremont Canyon Powerplant': 550,
    'Glen Canyon Powerplant': 522,
    'Glendo Powerplant': 523,
    'Grand Coulee Powerplant': 526,
    'Green Mountain Powerplant': 528,
    'Green Springs Powerplant': 534,
    'Guernsey Powerplant': 535,
    'Heart Mountain Powerplant': 537,
    'Hoover Powerplant': 540,
    'Hungry Horse Powerplant': 548,
    'Judge Francis Carr Powerplant': 532,
    'Keswick Powerplant': 579,
    'Kortes Powerplant': 555,
    'Lewiston Powerplant': 557,
    'Lower Molina Powerplant': 558,
    'Marys Lake Powerplant': 561,
    'McPhee Powerplant': 563,
    'Minidoka Powerplant': 565,
    'Morrow Point Powerplant': 567,
    'Mount Elbert Powerplant': 568,
    'New Melones Powerplant': 569,
    'Nimbus Powerplant': 570,
    'O`Neill Powerplant': 571,
    'Palisades Powerplant': 572,
    'Parker Powerplant': 573,
    'Pilot Butte Powerplant': 574,
    'Pole Hill Powerplant': 575,
    'Roza Powerplant': 576,
    'San Luis (William R. Gianelli) Powerplant': 577,
    'Seminoe Powerplant': 551,
    'Shasta Powerplant': 552,
    'Shoshone Powerplant': 553,
    'Spirit Mountain Powerplant': 554,
    'Spring Creek Powerplant': 556,
    'Stampede Powerplant': 559,
    'Towaoc Powerplant': 560,
    'Trinity Powerplant': 562,
    'Upper Molina Powerplant': 564,
    'Yellowtail Powerplant': 566,
}


def main() -> None:
    """Scrape every plant in ``power_plants`` and write the CSV."""
    results = [scrape_power_plant(pid) for pid in power_plants.values()]
    # index=False: the row number carries no information for this dataset.
    pd.DataFrame(results).to_csv('power_plant_data.csv', index=False)


if __name__ == "__main__":
    main()