Skip to content

Instantly share code, notes, and snippets.

@rampadc
Created July 28, 2019 15:08
Show Gist options
  • Select an option

  • Save rampadc/f3c92359d75e0d72bee496fc7f79b316 to your computer and use it in GitHub Desktop.

Select an option

Save rampadc/f3c92359d75e0d72bee496fc7f79b316 to your computer and use it in GitHub Desktop.

Revisions

  1. rampadc created this gist Jul 28, 2019.
    191 changes: 191 additions & 0 deletions kindle-reader.js
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,191 @@
    // modified based on:
    // - https://lowrey.me/scraping-a-book-from-kindle-read-amazon-com/
    // Clear the browser console before the run.
    console.clear();

    // The whole script lives in an immediately-invoked function, so the
    // `var` declarations below are function-scoped.
    (function() {
    // Lookup table keyed by hashString(page innerText); scrapeFrames sets an
    // entry to true once a page body has been collected.
    var hashes = {};
    // Running concatenation of collected page HTML; scrape() appends to it and
    // done() logs it.
    var all = "";

    /**
     * Compute a 32-bit polynomial rolling hash (base 31) of a string.
     *
     * Each step multiplies the running hash by 31 and adds the next UTF-16
     * code unit, so every character position contributes to the result.
     * Math.imul and `| 0` keep the arithmetic in signed 32-bit integers; a
     * floating-point power would overflow to Infinity on long inputs and
     * wipe out the contribution of all but the last few characters.
     *
     * @param {string} str - Text to hash.
     * @returns {number} Signed 32-bit integer hash; 0 for the empty string.
     */
    function hashString(str) {
      let hash = 0;
      for (let i = 0; i < str.length; i++) {
        hash = (Math.imul(hash, 31) + str.charCodeAt(i)) | 0;
      }
      return hash;
    }

    /**
     * Find the element matching "#KindleReaderIFrame" in the top-level
     * document and return its `contentDocument`.
     *
     * @returns {Document} The contentDocument of the matched element.
     */
    function getKindleBookAppFrame() {
      const readerFrame = document.querySelector("#KindleReaderIFrame");
      return readerFrame.contentDocument;
    }

    /**
     * Click the element with id "kindleReader_pageTurnAreaRight" inside the
     * reader document, then wait 200 ms before resolving.
     *
     * @returns {Promise<void>} Resolves 200 ms after the click; rejects if the
     *   lookup or the click throws.
     */
    function turnPage() {
      return new Promise(function (resolve) {
        const nextPageArea = getKindleBookAppFrame().getElementById(
          "kindleReader_pageTurnAreaRight"
        );
        nextPageArea.click();
        setTimeout(resolve, 200);
      });
    }

    /**
     * Decide whether a block is a heading: either the block itself is an
     * h1-h4 element, or one of its direct child nodes is.
     *
     * The child check uses Array.prototype.some so that a match actually
     * produces the function's return value; returning from inside a jQuery
     * `.each` callback only controls the iteration, not the outer function.
     *
     * @param {*} block - Anything accepted by `$()` (element, HTML string, ...).
     * @returns {boolean} true when the block or a direct child is h1-h4.
     */
    function isHeading(block) {
      const headingSelector = "h1, h2, h3, h4";
      const $block = $(block);
      if ($block.is(headingSelector)) {
        return true;
      }
      return $block
        .contents()
        .toArray()
        .some(node => $(node).is(headingSelector));
    }

    /**
     * Collect the top-level block elements (div, h1-h6, ol, ul, li that are
     * direct children of <body>) from one content frame document.
     *
     * @param {Document[]} contentFrames - Content frame documents to pick from.
     * @param {number} [frameIndex] - Index of the frame to read. When omitted,
     *   a `currentContentFrameIndex` variable is used if one is in scope;
     *   otherwise the first frame (index 0) is read. The script itself never
     *   declares `currentContentFrameIndex`, so reading it unguarded would
     *   throw a ReferenceError.
     * @returns {Element[]} Matching elements in document order.
     */
    function getContentFramesSubElements(contentFrames, frameIndex) {
      let index = frameIndex;
      if (index === undefined) {
        index =
          typeof currentContentFrameIndex === "undefined"
            ? 0
            : currentContentFrameIndex;
      }
      const selector = [
        "div",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "ol",
        "ul",
        "li"
      ]
        .map(tag => `body > ${tag}`)
        .join(", ");
      return Array.from(contentFrames[index].querySelectorAll(selector));
    }

    /**
     * Grab the HTML of the currently displayed page, skipping pages whose text
     * has already been collected.
     *
     * The page body is the second <body> (index 1) found inside iframes nested
     * in the page's iframes. Its innerText is hashed with hashString and
     * looked up in the shared `hashes` table; only unseen pages are returned
     * and recorded.
     * NOTE(review): why index 1 is the visible page is not evident from this
     * script - confirm against the reader's DOM.
     *
     * @returns {Promise<string[]>} Resolves with an array holding the page's
     *   innerHTML, or an empty array when the page was seen before. Rejects
     *   when no page body can be found.
     */
    function scrapeFrames() {
      return new Promise(resolve => {
        // Local on purpose: an undeclared `frames` would overwrite
        // window.frames (and throw in strict mode).
        const frames = [];
        const frameBody = $("iframe")
          .contents()
          .find("iframe")
          .contents()
          .find("body")
          .get(1);
        if (!frameBody) {
          throw new Error("Kindle page content frame not found");
        }
        const hash = hashString(frameBody.innerText);
        if (hashes[hash] === undefined) {
          hashes[hash] = true;
          frames.push(frameBody.innerHTML);
        }
        resolve(frames);
      });
    }

    /**
     * Convert scraped frames into plain records describing their text.
     *
     * - A frame that is a <ul> or <ol> yields one record per child node with
     *   non-blank text, flagged `is_list_item: true`.
     * - Any other frame with non-blank text yields one record whose
     *   `is_heading` flag comes from isHeading(frame).
     * - Frames with blank text are skipped.
     *
     * Every record is a fresh object; pushing one shared object repeatedly
     * would leave all list items showing the last item's text.
     *
     * @param {Array} frames - Items accepted by `$()` (elements or HTML strings).
     * @returns {Promise<Array<{is_heading: boolean, is_list_item: boolean, text: string}>>}
     *   Resolves with the formatted records in input order.
     */
    function formatFrames(frames) {
      console.log("unformatted");
      console.log(frames);
      return new Promise(resolve => {
        const formattedFrames = [];

        for (const frame of frames) {
          const $frame = $(frame);

          if ($frame.is("ul, ol")) {
            // Break the list up into one record per non-blank child.
            $frame
              .contents()
              .toArray()
              .map(el => $(el).text())
              .filter(itemText => itemText.trim().length !== 0)
              .forEach(itemText => {
                formattedFrames.push({
                  is_heading: false,
                  is_list_item: true,
                  text: itemText
                });
              });
            continue;
          }

          // Otherwise treat the frame as a paragraph (possibly a heading).
          const text = $frame.text();
          if (text.trim().length === 0) {
            continue;
          }
          formattedFrames.push({
            is_heading: isHeading(frame),
            is_list_item: false,
            text: text
          });
        }

        resolve(formattedFrames);
      });
    }

    /**
     * Scrape the current page and pass the result through formatFrames.
     *
     * @returns {Promise} Resolves with whatever formatFrames produces for the
     *   frames that scrapeFrames resolved with.
     */
    function getFormattedFrames() {
      const scraped = scrapeFrames();
      return scraped.then(rawFrames => formatFrames(rawFrames));
    }

    /**
     * Report whether the element with id "kindle_sample_end_message" in the
     * reader document is currently visible, per jQuery's ":visible" test.
     *
     * @returns {boolean} Result of `$(element).is(":visible")`.
     */
    function hasReachedEndSample() {
      const endMessage = getKindleBookAppFrame().getElementById(
        "kindle_sample_end_message"
      );
      return $(endMessage).is(":visible");
    }

    /**
     * Report whether the text of the element with id "kindleReader_footer"
     * contains "100%".
     *
     * @returns {boolean} true when the footer's innerText includes "100%".
     */
    function hasReachedEnd() {
      const footer = getKindleBookAppFrame().getElementById(
        "kindleReader_footer"
      );
      const footerText = footer.innerText;
      return footerText.includes("100%");
    }

    /**
     * Print everything collected so far (the shared `all` string) to the
     * console.
     */
    function done() {
      const collected = all;
      console.log(collected);
    }

    /**
     * Run one step of the scrape loop after an 800 ms delay: collect the
     * current page, append it to `all`, turn the page, then either finish
     * (end of sample or book reached) or schedule the next step.
     *
     * Any failure along the way is logged and the text gathered so far is
     * still printed via done(), instead of the loop dying silently with the
     * collected output stuck inside the closure.
     */
    function scrape() {
      setTimeout(() => {
        scrapeFrames()
          .then(frames => {
            console.log("."); // progress marker, one per page
            all += frames.join("\n");
          })
          .then(turnPage)
          .then(() => {
            if (hasReachedEndSample() || hasReachedEnd()) {
              done();
            } else {
              scrape();
            }
          })
          .catch(err => {
            console.error("Scraping stopped early:", err);
            done();
          });
      }, 800);
    }
    // Kick off the first scrape step; each step schedules the next one itself.
    scrape();
    })();