@pnhuyduy
Created March 23, 2020 07:22

insta-crawler

// Source: https://intoli.com/blog/scrape-infinite-scroll/

const fs = require('fs');
const puppeteer = require('puppeteer');

// Runs in the browser context: collects the src of every post image
// currently in the DOM. The nth-child selector is tied to Instagram's
// markup at the time of writing and will break when the layout changes.
function extractItems() {
  const extractedElements = document.querySelectorAll(
    'main > div > div:nth-child(4) > article > div > div > div img',
  );
  const items = [];
  for (const element of extractedElements) {
    items.push(element.src);
  }
  console.log(items); // logs to the browser console, not the Node process
  return items;
}

// Scrolls to the bottom of the page, re-running extractItems after each
// scroll, until at least itemTargetCount items have been collected or the
// page stops growing.
async function scrapeInfiniteScrollItems(
  page,
  extractItems,
  itemTargetCount,
  scrollDelay = 1000,
) {
  let items = [];
  try {
    let previousHeight;
    while (items.length < itemTargetCount) {
      items = await page.evaluate(extractItems);
      previousHeight = await page.evaluate('document.body.scrollHeight');
      await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
      await page.waitForFunction(`document.body.scrollHeight > ${previousHeight}`);
      // page.waitFor() was removed in later Puppeteer releases; a plain
      // setTimeout-based delay works on any version.
      await new Promise((resolve) => setTimeout(resolve, scrollDelay));
    }
  } catch (e) {
    // waitForFunction times out once the page stops growing; swallow the
    // error and return whatever has been collected so far.
  }
  return items;
}

(async () => {
  // Set up the browser and page.
  const browser = await puppeteer.launch({
    headless: false,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
  const page = await browser.newPage();
  await page.setViewport({ width: 1280, height: 926 });

  // Navigate to the target profile.
  await page.goto('https://www.instagram.com/diq.ng/');

  // Scroll and extract items from the page.
  const items = await scrapeInfiniteScrollItems(page, extractItems, 100);

  // Save the extracted image URLs to a file, one per line.
  fs.writeFileSync('./items.txt', items.join('\n') + '\n');

  // Close the browser.
  await browser.close();
})();
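
To run the script, save it as insta-crawler.js, install the dependency with "npm install puppeteer", and start it with "node insta-crawler.js"; the collected image URLs are written to items.txt.

The hard-coded "main > div > div:nth-child(4) > ..." selector is the most fragile part of the script: Instagram regenerates its markup regularly, and when the structure shifts, extractItems silently returns an empty list. Below is a minimal sketch of a looser variant, under the assumption (not guaranteed by the original) that post images still sit inside article elements; being broader, it may also pick up non-post images such as avatars.

function extractItems() {
  // Hypothetical looser selector: any image inside an article element.
  const extractedElements = document.querySelectorAll('article img');
  return Array.from(extractedElements, (element) => element.src);
}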