followall.py (Gist by @Parth-Vader, created July 10, 2017): a Scrapy benchmark spider that crawls a local mirror of books.toscrape.com and reports its average scraping speed in items/sec.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import datetime

from six.moves.urllib.parse import urlparse
import click
import scrapy
from scrapy.http import Request, HtmlResponse
from scrapy.linkextractors import LinkExtractor

# from books.items import Page  -- the Page item is defined inline below instead

from scrapy.crawler import CrawlerProcess
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor, defer
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging
class Page(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    size = scrapy.Field()
    referer = scrapy.Field()
    newcookies = scrapy.Field()
    body = scrapy.Field()
    rating = scrapy.Field()
    price = scrapy.Field()
    category = scrapy.Field()
    stock = scrapy.Field()

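# FollowAllSpider requests the start page, emits a Page item for every
# response it parses, and queues a new request for every link it finds,
# tracking crawl throughput through Scrapy's stats collector.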
class FollowAllSpider(scrapy.Spider):

    name = 'followall'
    ratings_map = {
        'one': 1,
        'two': 2,
        'three': 3,
        'four': 4,
        'five': 5,
    }
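    # NOTE: ratings_map is never consulted below; _get_item() stores the raw
    # CSS class suffix (e.g. 'Three') in the 'rating' field as-is. Converting
    # it to a number would take something like ratings_map[rating_word.lower()].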

    def __init__(self, **kw):
        super(FollowAllSpider, self).__init__(**kw)
        url = 'http://localhost/books.toscrape.com/index.html'
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'http://%s/' % url
        self.url = url
        self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
        self.link_extractor = LinkExtractor()
        self.cookies_seen = set()
        self.previtem = 0
        self.items = 0
        self.timesec = datetime.datetime.utcnow()

    def start_requests(self):
        # dont_filter=True lets the start URL through the duplicate filter
        return [Request(self.url, callback=self.parse, dont_filter=True)]

    def parse(self, response):
        """Return a Page item for this response plus requests to follow."""
        page = self._get_item(response)
        r = [page]

        r.extend(self._extract_requests(response))
        # pull running totals from the stats collector; 'item_scraped_count',
        # 'response_received_count' and 'start_time' are standard CoreStats keys
        self.items = self.crawler.stats.get_value('item_scraped_count', 0)
        pages = self.crawler.stats.get_value('response_received_count', 0)
        a = self.crawler.stats.get_value('start_time')
        b = datetime.datetime.utcnow()
        self.timesec = b - a  # elapsed time since the crawl started

        return r

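    # Connected to the spider_closed signal: appends the run's items/sec to
    # Benchmark.txt and echoes it to the terminal.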
    def close(self, reason):
        with open("Benchmark.txt", 'a') as f:
            f.write(" {0}".format(self.items / self.timesec.total_seconds()))
        click.secho(
            "\nThe average speed of the spider is {0} items/sec\n".format(
                self.items / self.timesec.total_seconds()),
            bold=True)

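    # The .product_main selectors only match book detail pages on
    # books.toscrape.com; on listing pages title/price/stock come back
    # None or empty, and the item is emitted anyway.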
    def _get_item(self, response):
        item = Page(
            url=response.url,
            size=str(len(response.body)),
            referer=response.request.headers.get('Referer'),
            # the class attribute looks like 'star-rating Three'; keep the last
            # word, guarding against pages where extract_first() returns None
            rating=(response.css('p.star-rating::attr(class)').extract_first()
                    or '').split(' ')[-1],
            title=response.css('.product_main h1::text').extract_first(),
            price=response.css('.product_main p.price_color::text').re_first('£(.*)'),
            stock=''.join(response.css('.product_main .instock.availability ::text').re(r'(\d+)')),
            category=''.join(response.css('ul.breadcrumb li:nth-last-child(2) ::text').extract()).strip(),
        )

        self._set_new_cookies(item, response)
        return item

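    # LinkExtractor pulls every link from HTML responses; Scrapy's offsite
    # middleware then drops requests outside allowed_domains, keeping the
    # crawl on the local mirror.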
    def _extract_requests(self, response):
        r = []
        if isinstance(response, HtmlResponse):
            links = self.link_extractor.extract_links(response)
            r.extend(Request(x.url, callback=self.parse) for x in links)
        return r

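    # Set-Cookie header values arrive as bytes, hence the b';' split; only
    # cookies not seen earlier in the crawl are attached to the item.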
    def _set_new_cookies(self, page, response):
        cookies = []
        for cookie in [x.split(b';', 1)[0] for x in
                       response.headers.getlist('Set-Cookie')]:
            if cookie not in self.cookies_seen:
                self.cookies_seen.add(cookie)
                cookies.append(cookie)
        if cookies:
            page['newcookies'] = cookies


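# Driver: this follows Scrapy's documented CrawlerRunner pattern for running
# spiders sequentially inside a single Twisted reactor.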
configure_logging()
runner = CrawlerRunner()

@defer.inlineCallbacks
def crawl():
    # crawl the site twice in a row with the same spider class
    yield runner.crawl(FollowAllSpider)
    yield runner.crawl(FollowAllSpider)
    # yield runner.crawl(FollowAllSpider)
    reactor.stop()

crawl()
reactor.run()  # blocks here until crawl() stops the reactor
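
# The unused CrawlerProcess / get_project_settings imports hint at a simpler
# single-run alternative; a minimal sketch (assuming no sequential re-runs
# are needed) would be:
#
#     process = CrawlerProcess(get_project_settings())
#     process.crawl(FollowAllSpider)
#     process.start()  # starts the reactor and blocks until the crawl ends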