Last active
July 1, 2017 13:51
-
-
Save dzwarg/aea276a6ab5bc5d4ab052fc1434b7020 to your computer and use it in GitHub Desktop.
Revisions
-
dzwarg revised this gist
Jul 1, 2017. 1 changed file with 1 addition and 1 deletion. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters.
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
  "main": "crawl.js",
  "repository": {
    "type": "git",
    "url": "https://gist.github.com/dzwarg/aea276a6ab5bc5d4ab052fc1434b7020.git"
  },
  "author": "David Zwarg <[email protected]>",
  "license": "MIT"
-
dzwarg created this gist
Jul 1, 2017. There are no files selected for viewing.
// This file contains hidden or bidirectional Unicode text that may be
// interpreted or compiled differently than what appears below. To review, open
// the file in an editor that reveals hidden Unicode characters.
// Learn more about bidirectional Unicode characters.
//
// Original file line number | Diff line number | Diff line change
// @@ -0,0 +1,135 @@
//
// In the gist, this script starts with the shebang line:
//   #!/usr/bin/env node
// (a shebang is only valid as the very first line of a standalone file).
//
// Usage: node <script> <host>
// Crawls http://<host>/ with node-spider and mirrors HTML, CSS, JavaScript and
// image responses into the current working directory.

const Spider = require('node-spider');
const fs = require('fs');
const nodePath = require('path');

// process.argv[0] is the node executable and process.argv[1] is the script
// path, so the first user-supplied argument (the host) is process.argv[2].
const host = process.argv.length > 2 ? process.argv[2] : '';

// Every saved file must stay inside this directory.
const outputRoot = process.cwd();

// Countdown of files still allowed to be saved; the process exits when the
// counter is found at exactly 0. Starting at -1 means the check never fires
// in practice, i.e. there is no limit.
let filesToSave = -1;

// Number of files handed to fs.writeFile so far (reported when done).
let filesSaved = 0;

const start = Date.now();

/**
 * Log a failure in the same three-line format for crawl and disk errors.
 *
 * @param {*} err - the error that occurred
 * @param {string} subject - URL or file path the error relates to
 */
function reportError(err, subject) {
    console.log('ERROR!');
    console.log(err);
    console.log(subject);
}

// configure the spider
// NOTE(review): option semantics are defined by node-spider; `encoding: null`
// presumably makes response bodies arrive as Buffers (styleScript() below
// calls .toString('utf8') on them) -- confirm against the node-spider docs.
const spider = new Spider({
    concurrent: 1,
    delay: 0,
    logs: process.stderr,
    allowDuplicates: false,
    catchErrors: true,
    error: function (err, url) {
        reportError(err, url);
    },
    done: function () {
        console.log('DONE:' + filesSaved);
        console.log(((Date.now() - start) / 1000).toFixed(2) + 's');
    },
    headers: { 'accept-encoding': 'identity' },
    encoding: null
});

/**
 * Make a directory tree, based on a path.
 *
 * @param {string} prefix - existing directory under which the tree is created
 * @param {string} path - '/'-separated relative directory path; '' is a no-op
 * @param {function(?Error)} done - called with null on success, or with the
 *     error (for example when a path component exists but is not a directory)
 */
function mktree(prefix, path, done) {
    if (path.length === 0) {
        return done(null);
    }
    // A recursive mkdir succeeds when directories already exist, so concurrent
    // saves into the same directory cannot fail with EEXIST, and it reports an
    // error (instead of never calling back) when a component is a plain file.
    fs.mkdir([prefix, path].join('/'), { recursive: true }, function (err) {
        done(err || null);
    });
}

/**
 * Do some filtering of content here. Currently only decodes the body as UTF-8.
 *
 * @param {Buffer|string} content - response body
 * @returns {string} the decoded text
 */
function styleScript(content) {
    return content.toString('utf8');
}

/**
 * Tell whether a relative path, resolved against outputRoot, stays inside it.
 *
 * @param {string} relativePath - candidate path of the file to save
 * @returns {boolean} true when the resolved path is strictly below outputRoot
 */
function isInsideOutputRoot(relativePath) {
    const rel = nodePath.relative(outputRoot, nodePath.resolve(outputRoot, relativePath));
    return rel !== '' &&
        rel !== '..' &&
        !rel.startsWith('..' + nodePath.sep) &&
        !nodePath.isAbsolute(rel);
}

/**
 * Save a document/image/etc. to disk, at a path derived from the URL path.
 *
 * @param {object} doc - node-spider document; reads doc.res and doc.url
 */
function save(doc) {
    // A response without a content-type header is treated as an unknown type.
    const ct = doc.res.headers['content-type'] || '';
    let full = decodeURIComponent(doc.res.request.uri.pathname.substr(1));
    let body = doc.res.body;

    if (full === '' || full === 'index.php/') {
        full = 'index.html';
        body = styleScript(body);
    } else if (ct.startsWith('text/html')) {
        full += '/index.html';
        body = styleScript(body);
    } else if (ct.startsWith('text/css')) {
        // Only add the extension when it is missing, so "a.css" is not saved
        // as "a.css.css" (which links in the saved pages would not match).
        if (!full.endsWith('.css')) {
            full += '.css';
        }
    } else if (ct.startsWith('application/javascript')) {
        if (!full.endsWith('.js')) {
            full += '.js';
        }
    }

    // The path comes from a crawled URL and was percent-decoded above, so it
    // may contain ".." segments; never write outside the output directory.
    if (!isInsideOutputRoot(full)) {
        console.log('Skipping: ' + doc.url);
        return;
    }

    const lastSlash = full.lastIndexOf('/');
    const dir = lastSlash === -1 ? '' : full.slice(0, lastSlash);

    mktree('.', dir, function (err) {
        if (err) {
            // One unusable path should not abort the whole crawl.
            reportError(err, full);
            return;
        }
        if (filesToSave-- === 0) {
            process.exit(0);
        }
        filesSaved++;
        // No explicit encoding: Buffers are written byte-for-byte and strings
        // as UTF-8. Writing a decoded string as 'binary' (latin1) would
        // corrupt every character outside Latin-1.
        fs.writeFile(full, body, function (writeErr) {
            if (writeErr) {
                reportError(writeErr, full);
                return;
            }
            console.log('Saved file: ' + full);
        });
    });
}

/**
 * Queue every URL found in the given attribute that starts with
 * 'http://' + host.
 *
 * @param {object} doc - node-spider document (uses doc.$ and doc.resolve)
 * @param {string} attr - attribute name to harvest ('href' or 'src')
 * @param {boolean} stripFragment - drop everything from '#' on before resolving
 */
function enqueueAttr(doc, attr, stripFragment) {
    doc.$('[' + attr + ']').each(function (i, element) {
        let value = doc.$(element).attr(attr);
        if (!value) {
            return;
        }
        if (stripFragment) {
            value = value.split('#')[0];
        }
        const url = doc.resolve(value);
        if (!url.startsWith('http://' + host)) {
            return;
        }
        spider.queue(url, req);
    });
}

// queue up all found 'href' attributes
function enqueueHrefs(doc) {
    enqueueAttr(doc, 'href', true);
}

// queue up all found 'src' attributes
function enqueueImages(doc) {
    enqueueAttr(doc, 'src', false);
}

/**
 * Handle one fetched document: save supported content types and, for HTML,
 * queue the links and sources it references.
 *
 * @param {object} doc - node-spider document
 */
function req(doc) {
    const ct = doc.res.headers['content-type'] || '';

    if (ct.startsWith('text/')) {
        save(doc);
        if (ct.startsWith('text/html')) {
            enqueueHrefs(doc);
            enqueueImages(doc);
        }
    } else if (ct.startsWith('image/')) {
        save(doc);
    } else if (ct.startsWith('application/javascript')) {
        save(doc);
    } else {
        console.log('Skipping: ' + doc.url);
    }
}

// queue up the index page if the hostname was provided
if (host !== '') {
    spider.queue('http://' + host + '/', req);
} else {
    console.error('Usage: node crawl.js <host>');
}

/*
This file contains hidden or bidirectional Unicode text that may be
interpreted or compiled differently than what appears below. To review, open
the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters.

Original file line number | Diff line number | Diff line change
@@ -0,0 +1,12 @@
{
  "name": "simple-crawl",
  "version": "1.0.0",
  "description": "A simple web crawler that saves HTML, IMG, CSS, and JavaScript sources.",
  "main": "crawl.js",
  "repository": {
    "type": "git",
    "url": "https://gist.github.com/dzwarg/TBD.git"
  },
  "author": "David Zwarg <[email protected]>",
  "license": "MIT"
}
*/