@redlumE01
Created February 9, 2020 09:32
content.py
# Run with: scrapy crawl content -o content_output.json
import scrapy
import re

# CSS selector matching the <body> element of blog detail pages
bodyClass = "body.blog-detail"


class ContentsSpider(scrapy.Spider):
    name = "content"

    def start_requests(self):
        # URLs to crawl (left as an empty placeholder in the original)
        urls = [
            '',
        ]

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        for quote in response.css(bodyClass):
            # Relative image URLs found inside the article's sections
            relative_img_urls = quote.css("section img::attr(src)").getall()
            # Strip every class="..." attribute from the raw HTML of div.main
            replacedContent = re.sub(r'class="[^"]*"', "", quote.css('div.main').get())

            yield {
                # Header and publication date saved as strings;
                # ::text selects the text nodes of the matched element
                'header': quote.css('h1::text').get(),
                'publication_date': quote.css('span.date::text').get(),
                # Content saved as raw HTML
                'content': replacedContent,
                'image_urls': self.url_join(relative_img_urls, response),
            }

    def url_join(self, urls, response):
        # Resolve each relative URL against the URL of the current response
        joined_urls = []
        for url in urls:
            joined_urls.append(response.urljoin(url))

        return joined_urls
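
For reference, the re.sub call in parse() strips every class="..." attribute from the extracted HTML before it is stored. A minimal standalone sketch of the same substitution (the sample HTML below is made up for illustration):

import re

html = '<div class="main"><p class="intro">Hello</p></div>'
cleaned = re.sub(r'class="[^"]*"', "", html)
print(cleaned)  # <div ><p >Hello</p></div>

Note that the substitution leaves a stray space where each attribute used to be, which is harmless in rendered HTML.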
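
The image_urls field uses the field name that Scrapy's built-in ImagesPipeline expects, but the pipeline itself is not configured in this gist. A minimal sketch of the settings.py entries that would make Scrapy download the collected images (the storage directory name is an assumption, and the ImagesPipeline requires the Pillow library):

# settings.py -- sketch, assuming the built-in ImagesPipeline is wanted
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = 'images'  # hypothetical local directory for downloaded files

With this enabled, Scrapy fetches each URL in image_urls and records the download results in an images field on the scraped item.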