@Im0rtality
Last active October 4, 2017 15:02

Revisions

  1. Im0rtality renamed this gist Oct 4, 2017. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  2. Im0rtality renamed this gist Oct 4, 2017. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  3. Im0rtality created this gist Oct 4, 2017.

crawl.js

    /**
     * Created by Andy Hulstkamp
     */

    var webpage = require("webpage"),
        fs = require("fs");

    var debug = false,
        pageIndex = 0,
        allLinks = [],
        url = "https://www.google.com/?hl=en",
        searchTerm = "mongodb vs couchdb",
        maxSearchPages = 3;

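    //create and configure the PhantomJS page: request headers, viewport size and logging callbacks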
    var createPage = function () {

        var page = webpage.create();

        //set some headers to get the content we want
        page.customHeaders = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:22.0) Gecko/20130404 Firefox/22.0",
            "Accept-Language": "en"
        };

        //a smaller size might get you the mobile version of a site
        page.viewportSize = { width: 1280, height: 800 };

        //useful for debugging and for aborting requests we do not want to make,
        //since they slow things down (e.g. tons of plugins)
        page.onResourceRequested = function (requestData, networkRequest) {
            log(["onResourceRequested", JSON.stringify(networkRequest), JSON.stringify(requestData)]);
            //in case we do not want to make the request
            //networkRequest.abort();
        };

        //what did we get
        page.onResourceReceived = function (response) {
            log(["onResourceReceived", JSON.stringify(response)]);
        };

        //what went wrong
        page.onResourceError = function (error) {
            log(["onResourceError", JSON.stringify(error)]);
        };

        page.onLoadStarted = function () {
            console.log("loading page...");
        };

        page.onLoadFinished = function (status) {
            var currentUrl = page.evaluate(function () {
                return window.location.href;
            });
            console.log("onLoadFinished", currentUrl, status);
        };

        return page;
    };


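    //collect the href of every organic result link ("h3.r a") on the current results page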
    var collectLinks = function () {
        var hrefs = page.evaluate(function () {
            var links = document.querySelectorAll("h3.r a");
            return Array.prototype.map.call(links, function (anchor) {
                return anchor.getAttribute("href");
            });
        });
        return hrefs;
    };

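    //open the Google start page, type the search term, submit the form and start collecting results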
    var search = function (url) {

        page.open(url, function () {

            //give scripts and ajax calls some time to execute, and throttle execution
            //so we do not look like a robot; Google might block us otherwise
            setTimeout(function () {

                //for debugging purposes, to see what's happening
                page.render("search.png");

                //any js can be injected into the page and used inside evaluate; inject jQuery for convenience.
                //injectJs returns true if all went well
                var injected = page.injectJs('node_modules/jquery/dist/jquery.js');
                if (!injected) {
                    throw Error("jquery could not be injected");
                }

                //anything that runs on the page must be executed inside evaluate.
                //evaluate is sandboxed: only simple types are allowed as arguments and return values
                page.evaluate(function (searchTerm) {
                    $("input").val(searchTerm);
                    $("form").submit();
                }, searchTerm);

                //give it some time to execute
                setTimeout(function () {
                    //collect links and go to the next page
                    collectAndNext();
                }, 2000);

            }, 2000);
        });
    };

    /*
     * collect all links on the search page, and use paging to go to the next page
     */
    var collectAndNext = function () {

        //for debugging purposes
        //page.render("./snapshots/searchPage-" + pageIndex + ".png");

        //collect all links on the page
        var links = collectLinks();
        console.log(links);
        allLinks = allLinks.concat(links);

        //evaluate and invoke the request for the next page
        page.evaluate(function () {
            //next button on the google search page
            var btn = document.getElementById("pnnext");
            //invoke a click event on the next button
            var ev = document.createEvent("MouseEvent");
            ev.initEvent("click", true, true);
            btn.dispatchEvent(ev);
        });

        //allow the next page to load
        setTimeout(function () {

            //go to the next page and collect links or, once we reach the maximum,
            //process all collected links and scrape the pages
            if (++pageIndex >= maxSearchPages) {
                scrapeAll(allLinks);
            } else {
                collectAndNext();
            }
        }, 2000);
    };

    /**
     * scrape all pages
     * @param links
     */
    var scrapeAll = function (links) {
        var index = 0;

        //scrape a page at url
        var scrapePage = function (index, url) {

            log(["scrape page ", index, url]);

            //open the page
            page.open(url, function (status) {

                log(["page loaded", status]);

                //write the content of the page as plainText to disc;
                //more advanced processing could be done in page.evaluate
                fs.write("./scraped/page" + index + ".txt", page.plainText, "w");

                page.render("./snapshots/page" + index + ".png");

                //scrape the next link or abort
                index++;
                var u = links[index];
                if (u) {

                    //give it some time to process
                    setTimeout(function () {
                        scrapePage(index, u);
                    }, 7000);
                } else {
                    phantom.exit();
                }
            });
        };

        //kick off scraping with the first collected link
        scrapePage(index, links[index]);
    };

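    //tiny logger, only active when debug is true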
    var log = function (args) {
        if (debug) {
            console.log(args);
        }
    };

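    //entry point: create the page and start the search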
    var page = createPage();

    search(url);
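
The search term and page limit are hard-coded at the top of crawl.js. One optional extension, not part of the original gist, would be to read the search term from the command line via PhantomJS's system module; the sketch below, placed after the variable declarations at the top of the script, would override the hard-coded default whenever an argument is given.

    //optional sketch, not in the original file: take the search term from the command line.
    //system.args[0] is the script path; everything after it is treated as the search term.
    var system = require("system");
    if (system.args.length > 1) {
        searchTerm = Array.prototype.slice.call(system.args, 1).join(" ");
    }

With that in place the crawler could be started with, for example, phantomjs crawl.js "postgres vs mysql".
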
install.sh

    #!/bin/sh
    brew install phantomjs
    brew install nodejs


    echo {} > package.json
    npm install jquery --save

    # crawl.js writes its output into these directories, so create them up front
    mkdir -p scraped snapshots

    phantomjs crawl.js