
@matagus
Forked from mdamien/0readme.md
Created July 7, 2017 15:12

Revisions

  1. @mdamien revised this gist Jul 5, 2017. 2 changed files with 11 additions and 12 deletions.
    3 changes: 1 addition & 2 deletions 0readme.md
    @@ -6,6 +6,5 @@

     ## Usage

    -- change the site name in the script
    -- `scrapy runspider -o items.csv 1spider.py`
    +- `scrapy runspider -o items.csv -a site="https://yoursite.org" 1spider.py`
     - `python3 2format_results.py`
    20 changes: 10 additions & 10 deletions 1spider.py
    @@ -1,17 +1,17 @@
    -from scrapy.selector import HtmlXPathSelector
    -from scrapy.contrib.spiders import CrawlSpider, Rule
    -from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    -from scrapy.item import Item, Field
    +import scrapy

     SITE = 'https://bastamag.net'
     DOMAIN = SITE.split('//')[1]


    -class BrokenLinksSpider(CrawlSpider):
    +class BrokenLinksSpider(scrapy.Spider):
         name = 'brokenlink-checker'
         start_urls = [SITE]
         handle_httpstatus_list = [404, 500]

    +    def __init__(self, site, *args, **kwargs):
    +        super().__init__(*args, **kwargs)
    +        self.start_urls = [site]
    +        self.DOMAIN = site.split('//')[1]
    +
         def parse(self, response):
             if response.status in (404, 500):
                 item = {}

    @@ -23,7 +23,7 @@ def parse(self, response):

                 yield item

    -        if DOMAIN in response.url:
    +        if self.DOMAIN in response.url:
                 for link in response.css('a'):
                     href = link.xpath('@href').extract()
                     text = link.xpath('text()').extract()
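
    For reference, a sketch of how `1spider.py` most likely reads after this revision, reconstructed from the diff above. The diff does not show whether the legacy `scrapy.contrib` imports and the hard-coded `SITE`/`DOMAIN` constants were kept, so dropping them here is an assumption, and `response.meta.get()` is used instead of the gist's direct indexing so the start page cannot raise a `KeyError` if it itself returns an error status:

    # Reconstructed sketch; not the gist's exact file.
    import scrapy


    class BrokenLinksSpider(scrapy.Spider):
        name = 'brokenlink-checker'
        # Let Scrapy hand 404/500 responses to parse() instead of dropping them.
        handle_httpstatus_list = [404, 500]

        def __init__(self, site, *args, **kwargs):
            # `site` comes from the command line: scrapy runspider ... -a site="https://yoursite.org"
            super().__init__(*args, **kwargs)
            self.start_urls = [site]
            self.DOMAIN = site.split('//')[1]

        def parse(self, response):
            if response.status in (404, 500):
                # Report the broken URL together with the page and link that led to it.
                item = {}
                item['url'] = response.url
                item['prev_page'] = response.meta.get('prev_url')
                item['prev_link_url'] = response.meta.get('prev_href')
                item['prev_link_text'] = response.meta.get('prev_link_text')
                item['status'] = response.status

                yield item

            # Only follow links found on pages of the crawled domain, so external
            # pages get status-checked but are not crawled any deeper.
            if self.DOMAIN in response.url:
                for link in response.css('a'):
                    href = link.xpath('@href').extract()
                    text = link.xpath('text()').extract()
                    if href:  # maybe should show an error if no href
                        yield response.follow(link, self.parse, meta={
                            'prev_link_text': text,
                            'prev_href': href,
                            'prev_url': response.url,
                        })

    Scrapy passes `-a name=value` options to the spider's `__init__` as keyword arguments, which is what makes `-a site="https://yoursite.org"` work here.
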
  2. @mdamien revised this gist Jul 4, 2017. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions 0readme.md
    @@ -1,3 +1,5 @@
    +# List all the broken links on your website
    +
     ## Requirements:

     `python3` and `scrapy` (`pip install scrapy`)
  3. @mdamien revised this gist Jul 4, 2017. 3 changed files with 14 additions and 6 deletions.
    9 changes: 9 additions & 0 deletions 0readme.md
    @@ -0,0 +1,9 @@
    ## Requirements:

    `python3` and `scrapy` (`pip install scrapy`)

    ## Usage

    - change the site name in the script
    - `scrapy runspider -o items.csv 1spider.py`
    - `python3 2format_results.py`
    File renamed without changes.
    11 changes: 5 additions & 6 deletions format_results.py → 2format_results.py
    @@ -1,12 +1,11 @@
     import csv, itertools

     items = csv.DictReader(open('items.csv'))
    -for page, links in itertools.groupby(items, lambda item: item['prev_page']):
    -    print('PAGE:', page)
    -    for line in links:
    -        if line['prev_page']:
    +for page, links in itertools.groupby(items, lambda item: item['prev_page']):
    +    if page:
    +        print('PAGE:', page)
    +        for line in links:
                 print(' LINK TEXT:', line['prev_link_text'])
                 print(' LINK URL:', line['prev_link_url'])
                 print(' LINK STATUS:', line['status'])
    -            print()
    -    print()
    +        print()
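
    Reconstructed from the diff above, `2format_results.py` after this revision most likely reads as follows. This is a sketch; note that `itertools.groupby` only merges consecutive rows, so the report groups links per page only as long as `items.csv` keeps rows from the same page together:

    # Reconstructed sketch; groups the scraped rows by the page that contained the broken link.
    import csv, itertools

    items = csv.DictReader(open('items.csv'))
    # groupby() merges only *consecutive* rows sharing the same prev_page value.
    for page, links in itertools.groupby(items, lambda item: item['prev_page']):
        if page:  # skip rows that have no referring page recorded
            print('PAGE:', page)
            for line in links:
                print(' LINK TEXT:', line['prev_link_text'])
                print(' LINK URL:', line['prev_link_url'])
                print(' LINK STATUS:', line['status'])
            print()
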
  4. @mdamien revised this gist Jul 4, 2017. 1 changed file with 12 additions and 0 deletions.
    12 changes: 12 additions & 0 deletions format_results.py
    @@ -0,0 +1,12 @@
    import csv, itertools

    items = csv.DictReader(open('items.csv'))
    for page, links in itertools.groupby(items, lambda item: item['prev_page']):
        print('PAGE:', page)
        for line in links:
            if line['prev_page']:
                print(' LINK TEXT:', line['prev_link_text'])
                print(' LINK URL:', line['prev_link_url'])
                print(' LINK STATUS:', line['status'])
                print()
        print()
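
    This script reads the `items.csv` that the spider writes via `-o items.csv`. The column names below are the field names the spider yields; the sample row is made up, and the actual column order and the serialization of the list-valued link fields depend on Scrapy's CSV exporter:

    url,prev_page,prev_link_url,prev_link_text,status
    https://example.org/old-page,https://example.org/,/old-page,Read more,404
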
  5. @mdamien created this gist Jul 4, 2017.
    35 changes: 35 additions & 0 deletions spider.py
    @@ -0,0 +1,35 @@
    from scrapy.selector import HtmlXPathSelector
    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from scrapy.item import Item, Field

    SITE = 'https://bastamag.net'
    DOMAIN = SITE.split('//')[1]


    class BrokenLinksSpider(CrawlSpider):
        name = 'brokenlink-checker'
        start_urls = [SITE]
        handle_httpstatus_list = [404, 500]

        def parse(self, response):
            if response.status in (404, 500):
                item = {}
                item['url'] = response.url
                item['prev_page'] = response.meta['prev_url']
                item['prev_link_url'] = response.meta['prev_href']
                item['prev_link_text'] = response.meta['prev_link_text']
                item['status'] = response.status

                yield item

            if DOMAIN in response.url:
                for link in response.css('a'):
                    href = link.xpath('@href').extract()
                    text = link.xpath('text()').extract()
                    if href:  # maybe should show an error if no href
                        yield response.follow(link, self.parse, meta={
                            'prev_link_text': text,
                            'prev_href': href,
                            'prev_url': response.url,
                        })
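
    As the readme added in the later revisions describes, this first version is used by editing `SITE` at the top of the script and running the spider directly with `scrapy runspider -o items.csv spider.py` (the file was still named `spider.py` at this point); the broken links collected in `items.csv` are then summarised by the formatting script added in a later revision.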