@phi-line
Created February 23, 2019 00:08

import asyncio

import pyppeteer as pyp
from pyppeteer import launch
from bs4 import BeautifulSoup


async def main():
    # Launch Chromium with devtools enabled and keep it open after the
    # script finishes (autoClose=False).
    browser = await launch(
        headless=True,
        devtools=True,
        autoClose=False,
    )

    url = 'https://duckduckgo.com/'

    page = await browser.newPage()

    await page.goto(url)

    # Type the query into the DuckDuckGo search box on the homepage.
    await page.type(selector='#search_form_input_homepage',
                    text='cats')

    # Click the search button and wait for the resulting navigation in
    # parallel; 'waitUntil' is a waitForNavigation option, not a click option.
    await asyncio.gather(
        page.click("#search_button_homepage"),
        page.waitForNavigation(options={"waitUntil": "networkidle0"}),
    )

    # Serialize the rendered DOM so BeautifulSoup can parse it.
    page_source = await page.evaluate(
        "new XMLSerializer().serializeToString(document);"
    )

    soup = BeautifulSoup(page_source, "html.parser")

    # Follow every result link on the first page of search results.
    links = soup.find_all('a', {'class': 'result__a'})
    for link in links:
        await spider(browser, link['href'])


async def spider(browser, link):
    # Open each link in a new tab and recurse over every anchor it contains.
    # There is no visited set or depth limit, so the recursion is unbounded;
    # links whose navigation raises a NetworkError are simply skipped.
    try:
        page = await browser.newPage()

        await page.goto(link)

        page_source = await page.evaluate(
            "new XMLSerializer().serializeToString(document);"
        )

        soup = BeautifulSoup(page_source, "html.parser")

        links = soup.find_all('a')
        for link in links:
            await spider(browser, link['href'])
    except pyp.errors.NetworkError:
        return


if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(main())