from datetime import datetime

import requests
from bs4 import BeautifulSoup as BSoup
from lxml import html


def get_html():
    # Download the Wikipedia page once; both scrapers reuse the same HTML.
    url = 'https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States'
    r = requests.get(url)
    return r.text


def bs_scraping(page_source, parser):
    # Parse the first table with BeautifulSoup and pull selected columns from each row.
    bs_obj = BSoup(page_source, parser)
    rows = bs_obj.find_all('table')[0].find_all('tr')
    data = []
    for row in rows[2:]:  # skip the two header rows
        cells = row.find_all('td')
        name = row.find('th').get_text()
        abbr = cells[0].get_text()
        # Remaining columns are read from the right-hand end of the row.
        reps = cells[-1].get_text()
        water_km = cells[-2].get_text()
        land_km = cells[-4].get_text()
        total_km = cells[-6].get_text()
        population = cells[-8].get_text()
        data.append([name, abbr, reps, water_km, land_km, total_km, population])
    return data


def lxml_scraping(page_source):
    # Same extraction, but using lxml and XPath directly.
    tree = html.fromstring(page_source)
    table = tree.xpath('//*[@id="mw-content-text"]/div/table[1]')[0]
    rows = table.findall('tr')
    data = []
    for row in rows[2:]:  # skip the two header rows
        name = row.xpath('./th')[0].text_content()
        cells = row.xpath('./td')
        abbr = cells[0].text_content()
        reps = cells[-1].text_content()
        water_km = cells[-2].text_content()
        land_km = cells[-4].text_content()
        total_km = cells[-6].text_content()
        population = cells[-8].text_content()
        data.append([name, abbr, reps, water_km, land_km, total_km, population])
    return data


if __name__ == '__main__':
    # Time each BeautifulSoup parser and lxml over the same page source.
    repeats = 100
    page_source = get_html()

    bs_parsers = ['lxml', 'html.parser', 'html5lib']
    for parser in bs_parsers:
        bs_start = datetime.now()
        for _ in range(repeats):
            bs_result = bs_scraping(page_source, parser)
        bs_finish = datetime.now() - bs_start
        print('BeautifulSoup {} time: {}'.format(parser, bs_finish))

    lxml_start = datetime.now()
    for _ in range(repeats):
        lxml_result = lxml_scraping(page_source)
    lxml_finish = datetime.now() - lxml_start
    print('lxml time:', lxml_finish)

# BeautifulSoup lxml time: 0:00:12.774159
# BeautifulSoup html.parser time: 0:00:20.097766
# BeautifulSoup html5lib time: 0:00:50.156767
# lxml time: 0:00:02.027748