redlumE01 · February 9, 2020 09:32
diff --git a/content.py b/content.py
 # scrapy crawl content -o content_output.json
 import scrapy
 import logging
 import re

 # CSS selector for the page root element; ContentsSpider.parse iterates over
 # every match of this selector in the response.
 bodyClass = "body.blog-detail"

 class ContentsSpider(scrapy.Spider):
    """Spider that scrapes header, date, HTML content and image URLs.

    Each element matching the module-level ``bodyClass`` selector yields one
    item (a plain dict) with the keys ``header``, ``publication_date``,
    ``content`` and ``image_urls``.
    """

    name = "content"

    def start_requests(self):
        """Yield one request per configured URL, handled by ``parse``."""
        # urls to crawl (the empty string is a placeholder to be filled in)
        urls = [
            '',
        ]

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Yield one item dict per element matching ``bodyClass``.

        ``content`` is the raw HTML of the first ``div.main`` with every
        ``class="..."`` occurrence stripped, or ``None`` when the element has
        no ``div.main``. ``image_urls`` holds the ``src`` of every
        ``section img``, joined against the response URL.
        """
        for quote in response.css(bodyClass):
            relative_img_urls = quote.css("section img::attr(src)").getall()

            # .get() returns None instead of raising IndexError when the
            # element is missing, so one incomplete page does not abort the
            # whole generator and drop the remaining items.
            main_html = quote.css('div.main').get()
            if main_html is None:
                self.logger.warning("No div.main found on %s", response.url)
                replaced_content = None
            else:
                replaced_content = re.sub(r'class=\"[^\"]*\"', "", main_html)

            yield {
                # Header/publication_date saved as string;
                # ::text selects the direct text nodes of the matched element
                'header': quote.css('h1::text').get(),
                'publication_date': quote.css('span.date::text').get(),
                # Content saved as raw HTML
                'content': replaced_content,
                'image_urls': self.url_join(relative_img_urls, response),
            }

    def url_join(self, urls, response):
        """Return ``urls`` with each entry passed through ``response.urljoin``."""
        return [response.urljoin(url) for url in urls]
	# scrapy crawl content -o content_output.json
	import scrapy
	import logging
	import re

	# CSS selector for the page root element; ContentsSpider.parse iterates over
	# every match of this selector in the response.
	bodyClass = "body.blog-detail"

	class ContentsSpider(scrapy.Spider):
	    """Spider that scrapes header, date, HTML content and image URLs.

	    Each element matching the module-level ``bodyClass`` selector yields one
	    item (a plain dict) with the keys ``header``, ``publication_date``,
	    ``content`` and ``image_urls``.
	    """

	    name = "content"

	    def start_requests(self):
	        """Yield one request per configured URL, handled by ``parse``."""
	        # urls to crawl (the empty string is a placeholder to be filled in)
	        urls = [
	            '',
	        ]

	        for url in urls:
	            yield scrapy.Request(url=url, callback=self.parse)

	    def parse(self, response):
	        """Yield one item dict per element matching ``bodyClass``.

	        ``content`` is the raw HTML of the first ``div.main`` with every
	        ``class="..."`` occurrence stripped, or ``None`` when the element has
	        no ``div.main``. ``image_urls`` holds the ``src`` of every
	        ``section img``, joined against the response URL.
	        """
	        for quote in response.css(bodyClass):
	            relative_img_urls = quote.css("section img::attr(src)").getall()

	            # .get() returns None instead of raising IndexError when the
	            # element is missing, so one incomplete page does not abort the
	            # whole generator and drop the remaining items.
	            main_html = quote.css('div.main').get()
	            if main_html is None:
	                self.logger.warning("No div.main found on %s", response.url)
	                replaced_content = None
	            else:
	                replaced_content = re.sub(r'class=\"[^\"]*\"', "", main_html)

	            yield {
	                # Header/publication_date saved as string;
	                # ::text selects the direct text nodes of the matched element
	                'header': quote.css('h1::text').get(),
	                'publication_date': quote.css('span.date::text').get(),
	                # Content saved as raw HTML
	                'content': replaced_content,
	                'image_urls': self.url_join(relative_img_urls, response),
	            }

	    def url_join(self, urls, response):
	        """Return ``urls`` with each entry passed through ``response.urljoin``."""
	        return [response.urljoin(url) for url in urls]