# Run with: scrapy crawl content -o content_output.json
import logging
import re

import scrapy

# CSS selector that marks a blog detail page
bodyClass = "body.blog-detail"


class ContentsSpider(scrapy.Spider):
    name = "content"

    def start_requests(self):
        # URLs to crawl (target left blank in the gist)
        urls = [
            '',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        for quote in response.css(bodyClass):
            # Collect the relative image URLs found in the page sections
            relative_img_urls = quote.css("section img::attr(src)").extract()
            # Strip class attributes from the raw HTML of the main content block
            replacedContent = re.sub(r'class="[^"]*"', "", quote.css('div.main').extract()[0])
            # logging.info(replacedContent)  # uncomment to inspect the cleaned HTML
            yield {
                # Header/publication_date saved as strings
                # ::text selects the text nodes of the matched element
                'header': quote.css('h1::text').get(),
                'publication_date': quote.css('span.date::text').get(),
                # Content saved as raw HTML
                'content': replacedContent,
                # Absolute image URLs, resolved against the response URL
                'image_urls': self.url_join(relative_img_urls, response),
            }

    def url_join(self, urls, response):
        # Resolve each relative URL against the page that was crawled
        joined_urls = []
        for url in urls:
            joined_urls.append(response.urljoin(url))
        return joined_urls
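
The 'image_urls' field uses the key that Scrapy's built-in ImagesPipeline reads by default, so the crawled images can be downloaded alongside the JSON output. A minimal settings sketch, assuming the stock pipeline and a local images/ directory (the store path is an assumption, not part of this gist); the pipeline also requires Pillow to be installed:

# settings.py -- sketch: enable the stock ImagesPipeline so the
# 'image_urls' values yielded by ContentsSpider are downloaded.
ITEM_PIPELINES = {
    "scrapy.pipelines.images.ImagesPipeline": 1,
}
# Directory where downloaded images are written (assumed location).
IMAGES_STORE = "images"

With these settings, the pipeline adds an 'images' field to each yielded item describing the files it stored.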