"use strict";

const http = require('http');
const https = require('https');
const Sitemapper = require('sitemapper');
const jsdom = require('jsdom');
const fs = require('fs');
const { JSDOM } = jsdom;

// Command line: node <script> <sitemapUrl> [elementSelector] [removeMultipleNewlines]
if (!process.argv[2]) {
  console.error('No path to sitemap given');
  // Nothing useful can happen without a sitemap, so stop instead of failing later.
  process.exit(1);
}

const sitemapUrl = process.argv[2];
const elementSelector = process.argv[3] || 'body';
// Any value other than an absent/empty argument, "false" or "0" enables newline collapsing.
const removeMultipleNewlines = !['', 'false', '0'].includes(
  String(process.argv[4] || '').toLowerCase()
);

// DOM nodeType values for the node kinds that get special treatment.
const TEXT_NODE = 3;
const COMMENT_NODE = 8;

// Elements whose content is code/styling rather than readable text.
const SKIPPED_TAGS = new Set(['STYLE', 'SCRIPT']);

// Markdown prefix emitted in front of the text of selected elements.
const MARKDOWN_PREFIXES = {
  H1: '# ',
  H2: '## ',
  H3: '### ',
  H4: '#### ',
  H5: '##### ',
  H6: '###### ',
  LI: '* ',
  OPTION: '* ',
};

/**
 * Downloads a URL and resolves with the response body decoded as UTF-8.
 *
 * The HTTP status code is not inspected and redirects are not followed:
 * whatever body the server sends is returned.
 *
 * @param {string} url - Absolute http:// or https:// URL.
 * @returns {Promise<string>} The full response body.
 */
const get = async (url) => {
  console.log(`Fetching html from webpage (${url})`);
  // https.get rejects plain http:// URLs, so pick the client by protocol.
  const client = /^http:/i.test(String(url)) ? http : https;
  return new Promise((resolve, reject) => {
    client
      .get(url, (res) => {
        res.setEncoding('utf8');
        let response = '';
        res.on('data', (data) => {
          response += data;
        });
        res.on('end', () => resolve(response));
        res.on('error', reject);
      })
      .on('error', reject);
  });
};

/**
 * Fetches a sitemap through Sitemapper.
 *
 * @param {string} sitemapUrl - URL of the sitemap to read.
 * @returns {Promise<object>} Whatever `Sitemapper#fetch` resolves to; the
 *   script's entry point reads the page list from its `sites` property.
 */
const getUrlsFromSitemap = async (sitemapUrl) => {
  const sitemap = new Sitemapper();
  return sitemap.fetch(sitemapUrl);
};

/**
 * Downloads one page and converts it to text.
 *
 * @param {string} url - Page URL.
 * @returns {Promise<string>} Text extracted from the page.
 */
const scrapeUrl = async (url) => {
  console.log('Scraping webpage');
  const html = await get(url);
  return getTextFromHtml(html);
};

/**
 * Recursively converts a DOM node into lightly Markdown-flavoured text.
 *
 * - Comments, <style> and <script> produce nothing.
 * - <br> produces a newline.
 * - Text nodes produce their trimmed text.
 * - Any other node produces a newline followed by the trimmed, concatenated
 *   text of its children, prefixed per MARKDOWN_PREFIXES for headings, list
 *   items and options. Nodes without any text produce nothing.
 *
 * @param {Node} htmlNode - Node to convert.
 * @returns {string} Extracted text, possibly empty.
 */
const getTextFromHtmlNode = (htmlNode) => {
  // Normalise case: elements in foreign namespaces (e.g. SVG <style>) report lowercase tag names.
  const tagName =
    typeof htmlNode.tagName === 'string' ? htmlNode.tagName.toUpperCase() : '';

  if (htmlNode.nodeType === COMMENT_NODE || SKIPPED_TAGS.has(tagName)) {
    return '';
  }
  if (tagName === 'BR') {
    return "\n";
  }
  if (htmlNode.nodeType === TEXT_NODE) {
    return htmlNode.textContent.trim();
  }

  let output = '';
  for (const child of htmlNode.childNodes) {
    output += getTextFromHtmlNode(child);
  }
  if (!output) {
    return '';
  }

  const prefix = MARKDOWN_PREFIXES[tagName] || '';
  return "\n" + prefix + output.trim();
};

/**
 * Parses an HTML document and extracts the text of the element matching the
 * `elementSelector` command-line argument (default: `body`).
 *
 * @param {string} html - HTML source of the page.
 * @returns {string} Extracted text; empty when no element matches the selector.
 */
const getTextFromHtml = (html) => {
  console.log('Getting text from webpage');
  const dom = new JSDOM(html);
  const containerEl = dom.window.document.querySelector(elementSelector);
  if (!containerEl) {
    // A page without the requested element should not abort the whole run.
    console.warn(`No element matches selector "${elementSelector}"; skipping page content`);
    return '';
  }

  let output = '';
  for (const child of containerEl.childNodes) {
    output += getTextFromHtmlNode(child);
  }
  return output;
};
/**
 * Collapses every run of two or more CR/LF characters into a single "\n".
 *
 * @param {string} string - Text to normalise.
 * @returns {string} Text without consecutive line breaks.
 */
const removeNewlines = (string) => string.replace(/[\r\n]{2,}/g, "\n");

/**
 * Scrapes each URL in order and concatenates the results, placing a
 * "# <url>" heading line before the text of every page.
 *
 * Pages are fetched one at a time; the first failing page rejects the
 * returned promise.
 *
 * @param {string[]} urls - Page URLs to scrape.
 * @returns {Promise<string>} Combined text of all pages ('' when there are none).
 */
const scrapeUrls = async (urls) => {
  const sections = [];
  for (const url of urls || []) {
    const text = await scrapeUrl(url);
    sections.push(`\n# ${url}\n${text}`);
  }
  return sections.join('');
};

/**
 * Writes `content` to `filename` asynchronously and reports the outcome on
 * the console.
 *
 * @param {string} filename - Destination path.
 * @param {string} content - Text to write.
 * @returns {void}
 */
const writeFile = (filename, content) => {
  // Report the real destination; the message used to hard-code "output.txt".
  console.log(`Writing all text to ${filename}`);
  fs.writeFile(filename, content, (err) => {
    if (err) {
      console.error(err);
      return;
    }
    console.log('The file was saved!');
  });
};

// Entry point: read the sitemap, scrape every listed page, write output.md.
(async function () {
  const urls = await getUrlsFromSitemap(sitemapUrl);
  const content = await scrapeUrls(urls.sites);
  writeFile('output.md', removeMultipleNewlines ? removeNewlines(content) : content);
})().catch((error) => {
  // Without this handler a failed fetch/parse surfaces as an unhandled rejection.
  console.error(error);
  process.exitCode = 1;
});