Created February 23, 2019 00:08
Revisions
phi-line renamed this gist (Feb 23, 2019); the file was renamed without changes.
phi-line created this gist (Feb 23, 2019).
import asyncio

import pyppeteer as pyp
from pyppeteer import launch
from bs4 import BeautifulSoup


async def main():
    # Launch a headless Chromium instance via pyppeteer.
    browser = await launch(
        headless=True,
        devtools=True,
        autoClose=False,
    )

    # Search DuckDuckGo for "cats".
    url = 'https://duckduckgo.com/'
    page = await browser.newPage()
    await page.goto(url)
    await page.type(selector='#search_form_input_homepage', text='cats')

    # Click the search button and wait for the resulting navigation to settle.
    # waitUntil is a navigation option, so it belongs on waitForNavigation,
    # not on click.
    await asyncio.gather(
        page.click("#search_button_homepage"),
        page.waitForNavigation(options={"waitUntil": "networkidle0"}),
    )

    # Serialize the rendered DOM and parse it with BeautifulSoup.
    page_source = await page.evaluate(
        "new XMLSerializer().serializeToString(document);"
    )
    soup = BeautifulSoup(page_source, "html.parser")

    # Follow each search result link.
    links = soup.find_all('a', {'class': 'result__a'})
    for link in links:
        await spider(browser, link['href'])


async def spider(browser, link):
    # Open the link in a new tab and recursively follow every anchor on it.
    try:
        page = await browser.newPage()
        await page.goto(link)

        page_source = await page.evaluate(
            "new XMLSerializer().serializeToString(document);"
        )
        soup = BeautifulSoup(page_source, "html.parser")

        # Only consider anchors that actually carry an href attribute.
        links = soup.find_all('a', href=True)
        for link in links:
            await spider(browser, link['href'])
    except pyp.errors.NetworkError:
        # Skip links that fail to load (bad hrefs, timeouts, etc.).
        return


if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(main())
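
One caveat the gist does not address: spider() follows every anchor on every page with no depth limit and no record of visited URLs, so on real pages the recursion effectively never ends, and each recursive call leaves its tab open. A minimal sketch of a bounded, de-duplicated variant is below; the max_depth and visited parameters, the http(s)-only filter, and the page.close() call are assumptions added here, not part of the original gist.

from bs4 import BeautifulSoup
from pyppeteer.errors import NetworkError


async def spider_bounded(browser, link, max_depth=2, visited=None):
    # Hypothetical variant of the gist's spider(): max_depth and visited
    # are assumptions added for illustration.
    if visited is None:
        visited = set()
    if max_depth <= 0 or link in visited:
        return
    visited.add(link)

    page = await browser.newPage()
    try:
        await page.goto(link)
        page_source = await page.evaluate(
            "new XMLSerializer().serializeToString(document);"
        )
        soup = BeautifulSoup(page_source, "html.parser")
        # Only follow absolute http(s) links that actually carry an href.
        hrefs = [a['href'] for a in soup.find_all('a', href=True)
                 if a['href'].startswith('http')]
    except NetworkError:
        hrefs = []
    finally:
        # Unlike the original, close each tab so memory stays bounded.
        await page.close()

    for href in hrefs:
        await spider_bounded(browser, href, max_depth - 1, visited)

Dropped into the gist's main(), the call would simply become await spider_bounded(browser, link['href']); everything else stays the same.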