Created
July 28, 2019 15:08
-
-
Save rampadc/f3c92359d75e0d72bee496fc7f79b316 to your computer and use it in GitHub Desktop.
Revisions
-
rampadc created this gist
Jul 28, 2019. There are no files selected for viewing.
// modified based on:
// - https://lowrey.me/scraping-a-book-from-kindle-read-amazon-com/
//
// Browser-console snippet: repeatedly reads the markup of the page shown in
// the "#KindleReaderIFrame" reader, clicks the page-turn area, and logs the
// accumulated markup once the reader reports the end of the book/sample.
// NOTE(review): `$` is assumed to be the jQuery instance already loaded by
// the host page — confirm before running elsewhere.
console.clear();

(function() {
  "use strict";

  // Delay after clicking the page-turn area before the next step runs.
  const PAGE_TURN_DELAY_MS = 200;
  // Delay before each scrape pass.
  const SCRAPE_DELAY_MS = 800;
  // Tags that count as headings for isHeading().
  const HEADING_SELECTOR = "h1, h2, h3, h4";
  // Direct children of <body> considered by getContentFramesSubElements().
  const CONTENT_BLOCK_SELECTOR = [
    "body > div",
    "body > h1",
    "body > h2",
    "body > h3",
    "body > h4",
    "body > h5",
    "body > h6",
    "body > ol",
    "body > ul",
    "body > li"
  ].join(", ");

  // Hashes of page texts that were already collected (used to skip repeats).
  const seenHashes = new Set();
  // Accumulated markup of every collected page.
  let all = "";

  /**
   * Computes a 32-bit rolling hash (multiplier 31) of a string.
   *
   * Math.imul and `| 0` keep every intermediate value inside the signed
   * 32-bit range, so every character of the input influences the result
   * regardless of the string length.
   *
   * @param {string} str - Text to hash.
   * @returns {number} Signed 32-bit integer hash.
   */
  function hashString(str) {
    let hash = 0;
    for (let i = 0; i < str.length; i++) {
      hash = (Math.imul(hash, 31) + str.charCodeAt(i)) | 0;
    }
    return hash;
  }

  /**
   * Returns the document loaded inside the "#KindleReaderIFrame" iframe.
   *
   * @returns {Document} The reader iframe's content document.
   * @throws {Error} If the iframe is missing or its document is not accessible.
   */
  function getKindleBookAppFrame() {
    const readerFrame = document.querySelector("#KindleReaderIFrame");
    if (readerFrame === null || !readerFrame.contentDocument) {
      throw new Error("#KindleReaderIFrame document is not available");
    }
    return readerFrame.contentDocument;
  }

  /**
   * Clicks the "kindleReader_pageTurnAreaRight" element and resolves after
   * PAGE_TURN_DELAY_MS.
   *
   * @returns {Promise<void>} Resolves once the delay has elapsed; rejects if
   *   the page-turn element cannot be found.
   */
  function turnPage() {
    return new Promise(resolve => {
      const appFrame = getKindleBookAppFrame();
      const turnArea = appFrame.getElementById(
        "kindleReader_pageTurnAreaRight"
      );
      if (turnArea === null) {
        throw new Error("kindleReader_pageTurnAreaRight element not found");
      }
      turnArea.click();
      setTimeout(resolve, PAGE_TURN_DELAY_MS);
    });
  }

  /**
   * Tells whether a block is an h1-h4 heading or has one as a direct child.
   *
   * @param {*} block - Anything accepted by `$()` (element, markup, ...).
   * @returns {boolean} True when the block or one of its child elements
   *   matches HEADING_SELECTOR.
   */
  function isHeading(block) {
    const $block = $(block);
    if ($block.is(HEADING_SELECTOR)) {
      return true;
    }
    // Only element children can be headings, so text nodes are not inspected.
    return $block.children(HEADING_SELECTOR).length > 0;
  }

  /**
   * Collects the top-level content elements of one content document.
   *
   * @param {Document[]} contentFrames - Documents of the content iframes.
   * @param {number} [currentContentFrameIndex=0] - Index of the document to
   *   query.
   * @returns {Element[]} Elements matching CONTENT_BLOCK_SELECTOR.
   */
  function getContentFramesSubElements(
    contentFrames,
    currentContentFrameIndex = 0
  ) {
    return Array.from(
      contentFrames[currentContentFrameIndex].querySelectorAll(
        CONTENT_BLOCK_SELECTOR
      )
    );
  }

  /**
   * Reads the markup of one nested iframe body and returns it unless a page
   * with the same text hash was collected before.
   *
   * @returns {Promise<string[]>} Resolves with an array holding the body's
   *   innerHTML, or an empty array when the body is missing or was already
   *   seen.
   */
  function scrapeFrames() {
    return new Promise(resolve => {
      const frames = [];
      // Index 1 picks the second <body> found inside the nested iframes.
      // NOTE(review): presumably the page currently displayed — confirm.
      const frameBody = $("iframe")
        .contents()
        .find("iframe")
        .contents()
        .find("body")
        .get(1);

      if (!frameBody) {
        console.warn("No content frame body found; skipping this page");
        resolve(frames);
        return;
      }

      const hash = hashString(frameBody.innerText);
      if (!seenHashes.has(hash)) {
        seenHashes.add(hash);
        frames.push(frameBody.innerHTML);
      }
      resolve(frames);
    });
  }

  /**
   * Extracts the non-blank texts of the child nodes of a jQuery set.
   *
   * @param {*} $list - jQuery object wrapping a list.
   * @returns {string[]} Text of every child node that is not whitespace-only.
   */
  function getListItemTexts($list) {
    return Array.from($list.contents())
      .map(el => $(el).text())
      .filter(text => text.trim().length !== 0);
  }

  /**
   * Converts scraped frames into plain records.
   *
   * Lists (ul/ol) yield one record per non-blank child; any other frame
   * yields a single record, skipped when its text is blank.
   *
   * @param {Array} frames - Values accepted by `$()`.
   *   NOTE(review): scrapeFrames() produces innerHTML strings; verify that
   *   jQuery parses them as markup rather than as selectors.
   * @returns {Promise<Array<{is_heading: boolean, is_list_item: boolean,
   *   text: string}>>} Resolves with the formatted records.
   */
  function formatFrames(frames) {
    console.log("unformatted");
    console.log(frames);
    return new Promise(resolve => {
      const formattedFrames = [];
      for (const frame of frames) {
        const $frame = $(frame);

        if ($frame.is("ul") || $frame.is("ol")) {
          // A fresh record per item: sharing one object would make every
          // entry show the text of the last item.
          getListItemTexts($frame).forEach(text => {
            formattedFrames.push({
              is_heading: false,
              is_list_item: true,
              text: text
            });
          });
          continue;
        }

        // Otherwise, treat as a paragraph.
        const text = $frame.text();
        if (text.trim().length === 0) {
          continue;
        }
        formattedFrames.push({
          is_heading: isHeading(frame),
          is_list_item: false,
          text: text
        });
      }
      resolve(formattedFrames);
    });
  }

  /**
   * Scrapes the current page and formats the result.
   *
   * @returns {Promise<Array>} Resolves with the records built by
   *   formatFrames().
   */
  function getFormattedFrames() {
    return scrapeFrames().then(formatFrames);
  }

  /**
   * Tells whether the "kindle_sample_end_message" element is visible.
   *
   * @returns {boolean} Result of jQuery's ":visible" check on that element.
   */
  function hasReachedEndSample() {
    const appFrame = getKindleBookAppFrame();
    const endSampleMessageDiv = appFrame.getElementById(
      "kindle_sample_end_message"
    );
    return $(endSampleMessageDiv).is(":visible");
  }

  /**
   * Tells whether the "kindleReader_footer" text contains "100%".
   *
   * @returns {boolean} False when the footer is missing or lacks "100%".
   */
  function hasReachedEnd() {
    const footer = getKindleBookAppFrame().getElementById(
      "kindleReader_footer"
    );
    return footer !== null && footer.innerText.includes("100%");
  }

  /** Logs everything collected so far. */
  function done() {
    console.log(all);
  }

  /**
   * Runs one scrape pass after SCRAPE_DELAY_MS: collect the page, turn it,
   * then either finish or schedule the next pass.
   */
  function scrape() {
    setTimeout(() => {
      scrapeFrames()
        .then(frames => {
          console.log(".");
          all += frames.join("\n");
        })
        .then(turnPage)
        .then(() => {
          if (hasReachedEndSample() || hasReachedEnd()) {
            done();
          } else {
            scrape();
          }
        })
        .catch(err => {
          // Stop the loop but still print what was collected before failing.
          console.error("Scraping stopped:", err);
          done();
        });
    }, SCRAPE_DELAY_MS);
  }

  scrape();
})();