Skip to content

Instantly share code, notes, and snippets.

@openainext
Forked from elliotbonneville/topkeywords.js
Created September 3, 2016 19:04
Show Gist options
  • Save openainext/742f5f2879ea3746d68145b350341c33 to your computer and use it in GitHub Desktop.
Save openainext/742f5f2879ea3746d68145b350341c33 to your computer and use it in GitHub Desktop.

Revisions

  1. @elliotbonneville elliotbonneville revised this gist Jan 22, 2015. 1 changed file with 7 additions and 1 deletion.
    8 changes: 7 additions & 1 deletion topkeywords.js
    Original file line number Diff line number Diff line change
    @@ -33,7 +33,8 @@ function callback () {
    }

    request(url, function (error, response, body) {
    if (error || response.statusCode !== 200) {
    if (error) {
    console.log("Couldn't get page because of error: " + error);
    return;
    }

    @@ -57,6 +58,11 @@ request(url, function (error, response, body) {

    // download that page
    request(url, function (error, response, body) {
    if (error) {
    console.log("Couldn't get page because of error: " + error);
    return;
    }

    // load the page into cheerio
    var $page = cheerio.load(body),
    text = $page("body").text();
  2. @elliotbonneville elliotbonneville created this gist Dec 11, 2014.
    92 changes: 92 additions & 0 deletions topkeywords.js
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,92 @@
    // Third-party modules: `request` performs the HTTP downloads, `cheerio`
    // parses the returned HTML so it can be queried with selectors.
    var request = require("request");
    var cheerio = require("cheerio");

    // Search results page whose outbound links get crawled.
    var url = "https://www.google.com/search?q=data+mining";

    // Shared crawl state: word -> number of occurrences seen so far.
    var corpus = {};

    // Progress counters: pages queued for download vs. pages finished.
    var totalResults = 0;
    var resultsDownloaded = 0;

    /**
     * Completion hook, invoked once for every result page that has finished.
     *
     * Increments `resultsDownloaded`; while that count differs from
     * `totalResults` it returns without doing anything else. On the call that
     * makes the two equal, it turns `corpus` into a list of
     * `{ word, count }` records, sorts the list by descending count and logs
     * the first twenty entries.
     */
    function callback () {
        resultsDownloaded++;

        if (resultsDownloaded !== totalResults) {
            return;
        }

        // Build one record per counted word. Object.keys keeps the iteration
        // variable local; the previous `for (prop in corpus)` assigned to an
        // undeclared `prop`, leaking a global and throwing in strict mode.
        var words = Object.keys(corpus).map(function (word) {
            return {
                word: word,
                count: corpus[word]
            };
        });

        // sort array based on how often the words occur, most frequent first
        words.sort(function (a, b) {
            return b.count - a.count;
        });

        // finally, log the twenty most popular words
        console.log(words.slice(0, 20));
    }

    /**
     * Adds every usable word found in `text` to the shared `corpus` tally.
     *
     * @param {string} text - visible text extracted from one downloaded page
     */
    function tallyWords (text) {
        // throw away extra whitespace and non-alphabetic characters
        var cleaned = text.replace(/\s+/g, " ")
            .replace(/[^a-zA-Z ]/g, "")
            .toLowerCase();

        // split on spaces for a list of all the words on that page and
        // loop through that list
        cleaned.split(" ").forEach(function (word) {
            // we don't want to include very short or long words, as they're
            // probably bad data
            if (word.length < 4 || word.length > 20) {
                return;
            }

            // Own-property check rather than truthiness: the word
            // "constructor" would otherwise find the inherited
            // Object.prototype.constructor and be miscounted.
            if (Object.prototype.hasOwnProperty.call(corpus, word)) {
                // already in our "corpus", our collection of terms
                corpus[word]++;
            } else {
                // otherwise, say that we've found one of that word so far
                corpus[word] = 1;
            }
        });
    }

    // Fetch the search results page, then download every linked result and
    // tally its words; `callback` prints the summary once all are finished.
    request(url, function (error, response, body) {
        if (error || response.statusCode !== 200) {
            // report the failure instead of exiting without any output
            console.log("Couldn't get search results: " +
                (error || "HTTP status " + response.statusCode));
            return;
        }

        // load the body of the page into Cheerio so we can traverse the DOM
        var $ = cheerio.load(body),
            links = $(".r a");

        links.each(function (i, link) {
            // get the href attribute of each link
            var href = $(link).attr("href");

            // an anchor without an href cannot be followed
            if (!href) {
                return;
            }

            // strip out unnecessary junk
            var target = href.replace("/url?q=", "").split("&")[0];

            // skip empty targets and site-relative links
            if (!target || target.charAt(0) === "/") {
                return;
            }

            // this link counts as a result, so increment results
            totalResults++;

            // download that page
            request(target, function (error, response, body) {
                if (error) {
                    console.log("Couldn't get page because of error: " + error);
                } else {
                    // load the page into cheerio and count its visible text
                    var $page = cheerio.load(body);

                    tallyWords($page("body").text());
                }

                // Always report completion, even for a failed download;
                // otherwise the finished count never reaches totalResults
                // and the summary is never printed.
                callback();
            });
        });

        if (totalResults === 0) {
            // nothing was queued, so callback() will never run
            console.log("No result links found on the search page.");
        }
    });