@redlumE01
Created February 9, 2020 09:32
content.py
# Run with: scrapy crawl content -o content_output.json
import scrapy
import re

# CSS selector matching the <body> element of blog detail pages
bodyClass = "body.blog-detail"


class ContentsSpider(scrapy.Spider):
    name = "content"

    def start_requests(self):
        # URLs to crawl (left as an empty placeholder in the original)
        urls = [
            '',
        ]

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        for quote in response.css(bodyClass):
            # Relative image URLs found inside the article's sections
            relative_img_urls = quote.css("section img::attr(src)").getall()
            # Strip every class="..." attribute from the raw HTML of div.main
            replacedContent = re.sub(r'class="[^"]*"', "", quote.css('div.main').get())

            yield {
                # Header and publication date saved as strings;
                # ::text selects the text nodes of the matched element
                'header': quote.css('h1::text').get(),
                'publication_date': quote.css('span.date::text').get(),
                # Content saved as raw HTML
                'content': replacedContent,
                'image_urls': self.url_join(relative_img_urls, response),
            }

    def url_join(self, urls, response):
        # Resolve each relative URL against the URL of the current response
        joined_urls = []
        for url in urls:
            joined_urls.append(response.urljoin(url))

        return joined_urls
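
For reference, the re.sub call in parse() strips every class="..." attribute from the extracted HTML before it is stored. A minimal standalone sketch of the same substitution (the sample HTML below is made up for illustration):

import re

html = '<div class="main"><p class="intro">Hello</p></div>'
cleaned = re.sub(r'class="[^"]*"', "", html)
print(cleaned)  # <div ><p >Hello</p></div>

Note that the substitution leaves a stray space where each attribute used to be, which is harmless in rendered HTML.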
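
The image_urls field uses the field name that Scrapy's built-in ImagesPipeline expects, but the pipeline itself is not configured in this gist. A minimal sketch of the settings.py entries that would make Scrapy download the collected images (the storage directory name is an assumption, and the ImagesPipeline requires the Pillow library):

# settings.py -- sketch, assuming the built-in ImagesPipeline is wanted
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = 'images'  # hypothetical local directory for downloaded files

With this enabled, Scrapy fetches each URL in image_urls and records the download results in an images field on the scraped item.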