
@lammoth
Last active August 23, 2016 10:55

Revisions

  1. lammoth revised this gist Aug 23, 2016. 2 changed files with 16 additions and 0 deletions.
    File renamed without changes.
    items.py: 16 additions, 0 deletions
    # -*- coding: utf-8 -*-

    # See documentation in:
    # http://doc.scrapy.org/en/latest/topics/items.html

    import scrapy


    class SpiderItem(scrapy.Item):
        date = scrapy.Field()
        title = scrapy.Field()
        desc = scrapy.Field()
        summary = scrapy.Field()
        category = scrapy.Field()
        source_link = scrapy.Field()
        link = scrapy.Field()
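
    A quick sketch of how these declared fields behave. The values below are hypothetical, not taken from the crawl: a scrapy.Item is populated like a dict, and only declared fields can be set.

    from spider.items import SpiderItem

    # Hypothetical example values.
    item = SpiderItem(title='Example dork', category='Footholds')
    item['date'] = '2016-08-23'
    print(dict(item))       # the three populated fields as a plain dict
    # item['author'] = 'x'  # would raise KeyError: SpiderItem has no 'author' field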
  2. lammoth created this gist Aug 1, 2016.
    GHDB Scrapy BOT: 40 additions, 0 deletions
    # -*- coding: utf-8 -*-

    import scrapy
    import re

    from spider.items import SpiderItem


    class GHDBSpider(scrapy.Spider):
        name = "ghdb"
        allowed_domains = ["www.exploit-db.com"]
        start_urls = [
            "https://www.exploit-db.com/google-hacking-database/?action=search&ghdb_search_page=1&ghdb_search_text=&ghdb_search_cat_id=0"
        ]

        def parse(self, response):
            # One row per dork in the category listing; follow each entry
            # to collect the remaining fields.
            for sel in response.xpath('//table[@class="category-list"]/tbody//tr'):
                item = SpiderItem()
                item['category'] = sel.xpath('td[@class="gd-description"]/a/text()').extract()[0]
                yield scrapy.Request(
                    sel.xpath('td/a[1]/@href').extract()[0],
                    callback=self.enrich_item,
                    # Wrap the item in a dict so it is not mixed in with
                    # Scrapy's own request meta keys.
                    meta={'item': item},
                )

            # Follow the "next" link in the pagination bar, if there is one.
            next_page = None
            for link in response.xpath('//div[@class="pagination"]//a'):
                if link.xpath('text()').extract()[0] == 'next':
                    next_page = link.xpath('@href').extract()[0]
                    break
            if next_page:
                url = response.urljoin(re.sub(r'\s', '', next_page))
                yield scrapy.Request(url, self.parse)

        def enrich_item(self, response):
            item = response.meta['item']
            rows = response.xpath('//table[@class="category-list"]/tbody/tr/td')
            # Remove all whitespace from the extracted cells.
            item['desc'] = re.sub(r'\s', '', rows.xpath('text()').extract()[1])
            item['link'] = re.sub(r'\s', '', rows.xpath('a/@href').extract()[0])
            item['date'] = re.sub(r'\s', '', rows.xpath('text()').extract()[4])
            item['summary'] = re.sub(r'\s', '', rows.xpath('text()').extract()[5])
            return item
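
    One way to try the spider end to end, sketched here since the gist itself does not show how it is run (the feed settings and output filename are assumptions): drive it with Scrapy's CrawlerProcess and export the scraped items as JSON.

    from scrapy.crawler import CrawlerProcess

    # GHDBSpider is the class defined above; 'ghdb.json' is an assumed
    # output path. Running `scrapy crawl ghdb -o ghdb.json` inside the
    # project would do the same.
    process = CrawlerProcess({
        'FEED_FORMAT': 'json',
        'FEED_URI': 'ghdb.json',
    })
    process.crawl(GHDBSpider)
    process.start()  # blocks until the crawl finishes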