@Im0rtality
Last active October 4, 2017 15:02

Revisions

  1. Im0rtality renamed this gist Oct 4, 2017. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  2. Im0rtality renamed this gist Oct 4, 2017. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  3. Im0rtality created this gist Oct 4, 2017.

crawl.js

    /**
     * Created by Andy Hulstkamp
     */

    var webpage = require("webpage"),
        fs = require("fs");

    var debug = false,
        pageIndex = 0,
        allLinks = [],
        url = "https://www.google.com/?hl=en",
        searchTerm = "mongodb vs couchdb",
        maxSearchPages = 3;

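    //create and configure the PhantomJS page: request headers, viewport size and logging callbacks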
    var createPage = function () {

        var page = webpage.create();

        //set some headers to get the content we want
        page.customHeaders = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:22.0) Gecko/20130404 Firefox/22.0",
            "Accept-Language": "en"
        };

        //a smaller size might get you the mobile version of a site
        page.viewportSize = { width: 1280, height: 800 };

        //useful for debugging and for aborting requests we do not want to make,
        //since they slow things down (e.g. tons of plugins)
        page.onResourceRequested = function (requestData, networkRequest) {
            log(["onResourceRequested", JSON.stringify(networkRequest), JSON.stringify(requestData)]);
            //in case we do not want to make the request
            //networkRequest.abort();
        };

        //what did we get
        page.onResourceReceived = function (response) {
            log(["onResourceReceived", JSON.stringify(response)]);
        };

        //what went wrong
        page.onResourceError = function (error) {
            log(["onResourceError", JSON.stringify(error)]);
        };

        page.onLoadStarted = function () {
            console.log("loading page...");
        };

        page.onLoadFinished = function (status) {
            var currentUrl = page.evaluate(function () {
                return window.location.href;
            });
            console.log("onLoadFinished", currentUrl, status);
        };

        return page;
    };


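    //collect the href of every organic result link ("h3.r a") on the current results page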
    var collectLinks = function () {
        var hrefs = page.evaluate(function () {
            var links = document.querySelectorAll("h3.r a");
            return Array.prototype.map.call(links, function (anchor) {
                return anchor.getAttribute("href");
            });
        });
        return hrefs;
    };

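    //open the Google start page, type the search term, submit the form and start collecting results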
    var search = function (url) {

        page.open(url, function () {

            //give scripts and ajax calls some time to execute, and throttle execution
            //so we do not look like a robot; Google might block us otherwise
            setTimeout(function () {

                //for debugging purposes, to see what's happening
                page.render("search.png");

                //any js can be injected into the page and used inside evaluate; inject jQuery for convenience.
                //injectJs returns true if all went well
                var injected = page.injectJs('node_modules/jquery/dist/jquery.js');
                if (!injected) {
                    throw Error("jquery could not be injected");
                }

                //anything that runs on the page must be executed inside evaluate.
                //evaluate is sandboxed: only simple types are allowed as arguments and return values
                page.evaluate(function (searchTerm) {
                    $("input").val(searchTerm);
                    $("form").submit();
                }, searchTerm);

                //give it some time to execute
                setTimeout(function () {
                    //collect links and go to the next page
                    collectAndNext();
                }, 2000);

            }, 2000);
        });
    };

    /*
     * collect all links on the search page, and use paging to go to the next page
     */
    var collectAndNext = function () {

        //for debugging purposes
        //page.render("./snapshots/searchPage-" + pageIndex + ".png");

        //collect all links on the page
        var links = collectLinks();
        console.log(links);
        allLinks = allLinks.concat(links);

        //evaluate and invoke the request for the next page
        page.evaluate(function () {
            //next button on the google search page
            var btn = document.getElementById("pnnext");
            //invoke a click event on the next button
            var ev = document.createEvent("MouseEvent");
            ev.initEvent("click", true, true);
            btn.dispatchEvent(ev);
        });

        //allow the next page to load
        setTimeout(function () {

            //go to the next page and collect links or, once we reach the maximum,
            //process all collected links and scrape the pages
            if (++pageIndex >= maxSearchPages) {
                scrapeAll(allLinks);
            } else {
                collectAndNext();
            }
        }, 2000);
    };

    /**
     * scrape all pages
     * @param links
     */
    var scrapeAll = function (links) {
        var index = 0;

        //scrape a page at url
        var scrapePage = function (index, url) {

            log(["scrape page ", index, url]);

            //open the page
            page.open(url, function (status) {

                log(["page loaded", status]);

                //write the content of the page as plainText to disc;
                //more advanced processing could be done in page.evaluate
                fs.write("./scraped/page" + index + ".txt", page.plainText, "w");

                page.render("./snapshots/page" + index + ".png");

                //scrape the next link or abort
                index++;
                var u = links[index];
                if (u) {

                    //give it some time to process
                    setTimeout(function () {
                        scrapePage(index, u);
                    }, 7000);
                } else {
                    phantom.exit();
                }
            });
        };

        //kick off scraping with the first collected link
        scrapePage(index, links[index]);
    };

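    //tiny logger, only active when debug is true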
    var log = function (args) {
        if (debug) {
            console.log(args);
        }
    };

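    //entry point: create the page and start the search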
    var page = createPage();

    search(url);
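
The search term and page limit are hard-coded at the top of crawl.js. One optional extension, not part of the original gist, would be to read the search term from the command line via PhantomJS's system module; the sketch below, placed after the variable declarations at the top of the script, would override the hard-coded default whenever an argument is given.

    //optional sketch, not in the original file: take the search term from the command line.
    //system.args[0] is the script path; everything after it is treated as the search term.
    var system = require("system");
    if (system.args.length > 1) {
        searchTerm = Array.prototype.slice.call(system.args, 1).join(" ");
    }

With that in place the crawler could be started with, for example, phantomjs crawl.js "postgres vs mysql".
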
install.sh

    #!/bin/sh
    brew install phantomjs
    brew install nodejs


    echo {} > package.json
    npm install jquery --save

    # crawl.js writes its output into these directories, so create them up front
    mkdir -p scraped snapshots

    phantomjs crawl.js