Last active
March 13, 2024 02:22
-
-
Save dmitriiweb/9ee9c2438231e74e5bc6b02802afd22f to your computer and use it in GitHub Desktop.
Revisions
-
Dmitrii K revised this gist
Aug 22, 2019 — 1 changed file with 13 additions and 8 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,4 +1,6 @@ from datetime import datetime from datetime import datetime import requests from bs4 import BeautifulSoup as BSoup from lxml import html @@ -46,21 +48,24 @@ def lxml_scraping(page_source): if __name__ == '__main__': repeats = 100 page_source = get_html() bs_parsers = ['lxml', 'html.parser', 'html5lib'] for parser in bs_parsers: bs_start = datetime.now() for _ in range(repeats): bs_result = bs_scraping(page_source, parser) bs_finish = datetime.now() - bs_start print('BeautifulSoup {} time: {}'.format(parser, bs_finish)) lxml_start = datetime.now() for _ in range(repeats): lxml_result = lxml_scraping(page_source) lxml_finish = datetime.now() - lxml_start print('lxml time:', lxml_finish) # BeautifulSoup lxml time: 0:00:12.774159 # BeautifulSoup html.parser time: 0:00:20.097766 # BeautifulSoup html5lib time: 0:00:50.156767 # lxml time: 0:00:02.027748 -
Dmitrii K revised this gist
Oct 29, 2017 — 1 changed file with 1 addition and 1 deletion. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -60,7 +60,7 @@ def lxml_scraping(page_source): print('lxml time:', lxml_finish) # BeautifulSoup lxml time: 0:00:00.328582 # BeautifulSoup html.parser time: 0:00:00.484112 # BeautifulSoup html5lib time: 0:00:01.028619 # # lxml time: 0:00:00.038192 -
Dmitrii K revised this gist
Oct 29, 2017 — 1 changed file with 7 additions and 1 deletion. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -57,4 +57,10 @@ def lxml_scraping(page_source): lxml_start = datetime.now() lxml_result = lxml_scraping(page_source) lxml_finish = datetime.now() - lxml_start print('lxml time:', lxml_finish) # BeautifulSoup lxml time: 0:00:00.328582 # BeautifulSoup lxml time: 0:00:00.328582 # BeautifulSoup html5lib time: 0:00:01.028619 # # lxml time: 0:00:00.038192 -
Dmitrii K created this gist
Oct 29, 2017. There are no files selected for viewing.
from datetime import datetime

import requests
from bs4 import BeautifulSoup as BSoup
from lxml import html

# Number of benchmark iterations per parser; a single run is too noisy to
# compare parsers meaningfully (the gist's later revision made the same fix).
REPEATS = 100

WIKI_URL = ('https://en.wikipedia.org/wiki/'
            'List_of_states_and_territories_of_the_United_States')


def get_html():
    """Download the Wikipedia "List of US states" page and return its HTML.

    Raises:
        requests.HTTPError: on a non-2xx response (so we never benchmark
            parsing of an error page).
        requests.Timeout: if the server does not respond within 30 seconds.
    """
    r = requests.get(WIKI_URL, timeout=30)
    r.raise_for_status()
    return r.text


def bs_scraping(page_source, parser):
    """Extract the states table with BeautifulSoup using *parser*.

    Args:
        page_source: full HTML text of the Wikipedia page.
        parser: BeautifulSoup backend name ('lxml', 'html.parser', 'html5lib').

    Returns:
        A list of ``[name, abbr, reps, water_km, land_km, total_km,
        population]`` rows, each value being the raw cell text.
    """
    bs_obj = BSoup(page_source, parser)
    # The first table on the page holds the states; the first two rows
    # are header rows, so data starts at index 2.
    rows = bs_obj.find_all('table')[0].find_all('tr')
    data = []
    for row in rows[2:]:
        cells = row.find_all('td')
        # NOTE(review): the negative indices assume a fixed Wikipedia
        # column layout (population is 8 cells from the end, etc.) —
        # verify against the live page if extraction looks wrong.
        name = row.find('th').get_text()
        abbr = cells[0].get_text()
        reps = cells[-1].get_text()
        water_km = cells[-2].get_text()
        land_km = cells[-4].get_text()
        total_km = cells[-6].get_text()
        population = cells[-8].get_text()
        data.append([name, abbr, reps, water_km, land_km, total_km, population])
    return data


def lxml_scraping(page_source):
    """Extract the states table with raw lxml/XPath.

    Produces the same row layout as :func:`bs_scraping` so the two
    results are directly comparable.
    """
    tree = html.fromstring(page_source)
    # Same table as in bs_scraping, addressed by XPath; rows 0-1 are headers.
    table = tree.xpath('//*[@id="mw-content-text"]/div/table[1]')[0]
    rows = table.findall('tr')
    data = []
    for row in rows[2:]:
        name = row.xpath('./th')[0].text_content()
        cells = row.xpath('./td')
        # Same fixed-layout assumption as in bs_scraping (see note there).
        abbr = cells[0].text_content()
        reps = cells[-1].text_content()
        water_km = cells[-2].text_content()
        land_km = cells[-4].text_content()
        total_km = cells[-6].text_content()
        population = cells[-8].text_content()
        data.append([name, abbr, reps, water_km, land_km, total_km, population])
    return data


if __name__ == '__main__':
    # Fetch the page once so every parser is timed on identical input.
    page_source = get_html()

    bs_parsers = ['lxml', 'html.parser', 'html5lib']
    for parser in bs_parsers:
        bs_start = datetime.now()
        for _ in range(REPEATS):
            bs_result = bs_scraping(page_source, parser)
        bs_finish = datetime.now() - bs_start
        print('BeautifulSoup {} time: {}'.format(parser, bs_finish))

    lxml_start = datetime.now()
    for _ in range(REPEATS):
        lxml_result = lxml_scraping(page_source)
    lxml_finish = datetime.now() - lxml_start
    print('lxml time:', lxml_finish)