Skip to content

Instantly share code, notes, and snippets.

@dmitriiweb
Last active March 13, 2024 02:22
Show Gist options
  • Save dmitriiweb/9ee9c2438231e74e5bc6b02802afd22f to your computer and use it in GitHub Desktop.

Revisions

  1. Dmitrii K revised this gist Aug 22, 2019. 1 changed file with 13 additions and 8 deletions.
    21 changes: 13 additions & 8 deletions bs_vs_lxml.py
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,6 @@
    from datetime import datetime
    from datetime import datetime

    import requests
    from bs4 import BeautifulSoup as BSoup
    from lxml import html
    @@ -46,21 +48,24 @@ def lxml_scraping(page_source):


    if __name__ == '__main__':
        # Benchmark entry point (revised version): run each scraper many
        # times so fixed per-call overhead averages out and the parser
        # differences dominate the measurement.
        repeats = 100
        page_source = get_html()

        # Time each BeautifulSoup backend over `repeats` iterations.
        bs_parsers = ['lxml', 'html.parser', 'html5lib']
        for parser in bs_parsers:
            bs_start = datetime.now()
            for _ in range(repeats):
                bs_result = bs_scraping(page_source, parser)
            bs_finish = datetime.now() - bs_start
            print('BeautifulSoup {} time: {}'.format(parser, bs_finish))

        # Time raw lxml over the same number of iterations.
        lxml_start = datetime.now()
        for _ in range(repeats):
            lxml_result = lxml_scraping(page_source)
        lxml_finish = datetime.now() - lxml_start
        print('lxml time:', lxml_finish)

        # Single-run results (before the repeats loop was added):
        # BeautifulSoup lxml time: 0:00:00.328582
        # BeautifulSoup html.parser time: 0:00:00.484112
        # BeautifulSoup html5lib time: 0:00:01.028619
        #
        # lxml time: 0:00:00.038192

        # Results for 100 repeats:
        # BeautifulSoup lxml time: 0:00:12.774159
        # BeautifulSoup html.parser time: 0:00:20.097766
        # BeautifulSoup html5lib time: 0:00:50.156767
        #
        # lxml time: 0:00:02.027748
  2. Dmitrii K revised this gist Oct 29, 2017. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion bs_vs_lxml.py
    Original file line number Diff line number Diff line change
    @@ -60,7 +60,7 @@ def lxml_scraping(page_source):
    print('lxml time:', lxml_finish)

    # BeautifulSoup lxml time: 0:00:00.328582
    # BeautifulSoup lxml time: 0:00:00.328582
    # BeautifulSoup html.parser time: 0:00:00.484112
    # BeautifulSoup html5lib time: 0:00:01.028619
    #
    # lxml time: 0:00:00.038192
  3. Dmitrii K revised this gist Oct 29, 2017. 1 changed file with 7 additions and 1 deletion.
    8 changes: 7 additions & 1 deletion bs_vs_lxml.py
    Original file line number Diff line number Diff line change
    @@ -57,4 +57,10 @@ def lxml_scraping(page_source):
    lxml_start = datetime.now()
    lxml_result = lxml_scraping(page_source)
    lxml_finish = datetime.now() - lxml_start
    print('lxml time:', lxml_finish)
    print('lxml time:', lxml_finish)

    # BeautifulSoup lxml time: 0:00:00.328582
    # BeautifulSoup lxml time: 0:00:00.328582
    # BeautifulSoup html5lib time: 0:00:01.028619
    #
    # lxml time: 0:00:00.038192
  4. Dmitrii K created this gist Oct 29, 2017.
    60 changes: 60 additions & 0 deletions bs_vs_lxml.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,60 @@
    from datetime import datetime
    import requests
    from bs4 import BeautifulSoup as BSoup
    from lxml import html


    def get_html():
        """Download the Wikipedia page used as the benchmark fixture.

        Returns:
            str: Raw HTML of the "List of states and territories of the
            United States" article.

        Raises:
            requests.HTTPError: If the server responds with a 4xx/5xx status.
            requests.Timeout: If the server does not respond within 30s.
        """
        url = 'https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States'
        # A timeout keeps the benchmark from hanging forever on a stalled
        # connection; raise_for_status surfaces HTTP errors instead of
        # silently benchmarking the text of an error page.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        return r.text


    def bs_scraping(page_source, parser):
        """Scrape the first table of *page_source* with BeautifulSoup.

        Returns a list of [name, abbr, reps, water_km, land_km, total_km,
        population] lists, one per data row (the first two rows are headers).
        """
        soup = BSoup(page_source, parser)
        table_rows = soup.find_all('table')[0].find_all('tr')
        # <td> positions are taken from the end of the row because the
        # leading cell layout varies; order matches the record layout above.
        offsets = (0, -1, -2, -4, -6, -8)
        records = []
        for tr in table_rows[2:]:
            tds = tr.find_all('td')
            record = [tr.find('th').get_text()]
            record.extend(tds[i].get_text() for i in offsets)
            records.append(record)
        return records


    def lxml_scraping(page_source):
        """Scrape the first content table of *page_source* with raw lxml.

        Returns the same record layout as ``bs_scraping``:
        [name, abbr, reps, water_km, land_km, total_km, population].
        """
        doc = html.fromstring(page_source)
        first_table = doc.xpath('//*[@id="mw-content-text"]/div/table[1]')[0]
        records = []
        # Skip the two header rows, then pull cells by position from the end.
        for tr in first_table.findall('tr')[2:]:
            state = tr.xpath('./th')[0].text_content()
            tds = tr.xpath('./td')
            records.append([
                state,
                tds[0].text_content(),
                tds[-1].text_content(),
                tds[-2].text_content(),
                tds[-4].text_content(),
                tds[-6].text_content(),
                tds[-8].text_content(),
            ])
        return records


    if __name__ == '__main__':
        # Benchmark entry point (original version): one timed run of each
        # BeautifulSoup backend, then one timed run of raw lxml.
        page_source = get_html()

        for parser in ('lxml', 'html.parser', 'html5lib'):
            started = datetime.now()
            bs_result = bs_scraping(page_source, parser)
            elapsed = datetime.now() - started
            print('BeautifulSoup {} time: {}'.format(parser, elapsed))

        started = datetime.now()
        lxml_result = lxml_scraping(page_source)
        elapsed = datetime.now() - started
        print('lxml time:', elapsed)