-
-
Save matagus/6961f5afd6faa22ef13f685f8683ab6f to your computer and use it in GitHub Desktop.
Revisions
-
mdamien revised this gist
Jul 5, 2017. 2 changed files with 11 additions and 12 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -6,6 +6,5 @@ ## Usage - `scrapy runspider -o items.csv -a site="https://yoursite.org" 1spider.py` - `python3 2format_results.py` This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,17 +1,17 @@ import scrapy class BrokenLinksSpider(scrapy.Spider): name = 'brokenlink-checker' handle_httpstatus_list = [404, 500] def __init__(self, site, *args, **kwargs): super().__init__(*args, **kwargs) self.start_urls = [site] self.DOMAIN = site.split('//')[1] def parse(self, response): if response.status in (404, 500): item = {} @@ -23,7 +23,7 @@ def parse(self, response): yield item if self.DOMAIN in response.url: for link in response.css('a'): href = link.xpath('@href').extract() text = link.xpath('text()').extract() -
mdamien revised this gist
Jul 4, 2017. 1 changed file with 2 additions and 0 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,3 +1,5 @@ # List all the broken links on your website ## Requirements: `python3` and `scrapy` (`pip install scrapy`) -
mdamien revised this gist
Jul 4, 2017. 3 changed files with 14 additions and 6 deletions. There are no files selected for viewing.
"""Pretty-print broken-link results grouped by the page they were found on.

Reads ``items.csv`` (produced by the companion scrapy spider) and prints,
for each source page, the text and URL of every broken link it contains.
``itertools.groupby`` only merges *consecutive* rows with the same key,
which matches the crawler's output order — presumably all links from one
page are emitted together (TODO confirm against the spider's output).
"""
import csv
import itertools

# Fix: use a context manager so the CSV file handle is closed
# deterministically (the original left `open('items.csv')` unclosed).
# ``newline=''`` is the csv-module convention for reading.
with open('items.csv', newline='') as f:
    items = csv.DictReader(f)
    for page, links in itertools.groupby(items, lambda item: item['prev_page']):
        if page:  # skip rows with an empty source page (e.g. the start URL)
            print('PAGE:', page)
            for line in links:
                print(' LINK TEXT:', line['prev_link_text'])
                print(' LINK URL:', line['prev_link_url'])
                print()
            print()
mdamien revised this gist
Jul 4, 2017. 1 changed file with 12 additions and 0 deletions. There are no files selected for viewing.
"""Pretty-print broken-link results from the spider's ``items.csv`` output.

First revision of the formatter: prints every group (including an empty
source page) and also shows the HTTP status of each broken link.
``itertools.groupby`` only merges *consecutive* rows with the same key,
which matches the crawler's output order — presumably all links from one
page are emitted together (TODO confirm against the spider's output).
"""
import csv
import itertools

# Fix: use a context manager so the CSV file handle is closed
# deterministically (the original left `open('items.csv')` unclosed).
# ``newline=''`` is the csv-module convention for reading.
with open('items.csv', newline='') as f:
    items = csv.DictReader(f)
    for page, links in itertools.groupby(items, lambda item: item['prev_page']):
        print('PAGE:', page)
        for line in links:
            if line['prev_page']:  # only rows that carry referrer data
                print(' LINK TEXT:', line['prev_link_text'])
                print(' LINK URL:', line['prev_link_url'])
                print(' LINK STATUS:', line['status'])
            print()
        print()
mdamien created this gist
Jul 4, 2017. There are no files selected for viewing.
"""Crawl a site and yield one item for every broken (404/500) link found.

Usage: ``scrapy runspider -o items.csv 1spider.py``
Each yielded item records the broken URL, its HTTP status, and the page /
anchor that linked to it, for later grouping by ``2format_results.py``.
"""
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.item import Item, Field

SITE = 'https://bastamag.net'    # site to check — change before running
DOMAIN = SITE.split('//')[1]     # host part, used to stay on-site


class BrokenLinksSpider(CrawlSpider):
    """Follows every on-site <a> link and reports 404/500 responses."""

    name = 'brokenlink-checker'
    start_urls = [SITE]
    # Let scrapy hand 404/500 responses to parse() instead of dropping them.
    handle_httpstatus_list = [404, 500]

    def parse(self, response):
        """Yield an item for a broken response, then follow on-site links."""
        if response.status in (404, 500):
            # Fix: the start-URL request carries no referrer meta, so the
            # original ``response.meta['prev_url']`` raised KeyError when
            # the first response was itself broken; ``.get()`` returns
            # None in that case instead of crashing the spider.
            yield {
                'url': response.url,
                'prev_page': response.meta.get('prev_url'),
                'prev_link_url': response.meta.get('prev_href'),
                'prev_link_text': response.meta.get('prev_link_text'),
                'status': response.status,
            }
        if DOMAIN in response.url:  # only expand the crawl from on-site pages
            for link in response.css('a'):
                href = link.xpath('@href').extract()
                text = link.xpath('text()').extract()
                if href:  # maybe should show an error if no href
                    # Thread referrer info through meta so a broken target
                    # can be traced back to the page and anchor linking it.
                    yield response.follow(link, self.parse, meta={
                        'prev_link_text': text,
                        'prev_href': href,
                        'prev_url': response.url,
                    })