# -*- coding: utf-8 -*-
"""Scrapy spider for the Exploit-DB Google Hacking Database (GHDB).

Walks the paginated category listing, emits one request per entry,
and enriches each item from its detail page.
"""
import re

import scrapy

from spider.items import SpiderItem


class GHDBSpider(scrapy.Spider):
    """Crawl GHDB listing pages and fill a SpiderItem per entry."""

    name = "ghdb"
    allowed_domains = ["www.exploit-db.com"]
    start_urls = [
        "https://www.exploit-db.com/google-hacking-database/?action=search"
        "&ghdb_search_page=1&ghdb_search_text=&ghdb_search_cat_id=0"
    ]

    def parse(self, response):
        """Yield one detail-page request per listing row, then follow pagination.

        :param response: listing-page response
        :yields: scrapy.Request for each entry's detail page and the next page
        """
        for sel in response.xpath('//table[@class="category-list"]/tbody//tr'):
            categories = sel.xpath('td[@class="gd-description"]/a/text()').extract()
            detail_urls = sel.xpath('td/a[1]/@href').extract()
            if not categories or not detail_urls:
                # Malformed/empty row: skip instead of crashing with IndexError.
                continue
            item = SpiderItem()
            item['category'] = categories[0]
            # BUG FIX: wrap the item in a dict. Scrapy writes bookkeeping keys
            # (depth, download_timeout, download_slot, ...) into request.meta;
            # passing the Item itself as meta makes those writes hit the Item
            # and fail on undeclared fields.
            yield scrapy.Request(detail_urls[0],
                                 callback=self.enrich_item,
                                 meta={'item': item})

        # Pagination: find the anchor whose text is exactly 'next'.
        next_page = None
        for link in response.xpath('//div[@class="pagination"]//a'):
            titles = link.xpath('text()').extract()
            if titles and titles[0] == 'next':
                hrefs = link.xpath('@href').extract()
                if hrefs:
                    next_page = hrefs[0]
        if next_page:
            # Strip all whitespace from the href before resolving it.
            url = response.urljoin(re.sub(r'\s', '', next_page))
            yield scrapy.Request(url, self.parse)

    def enrich_item(self, response):
        """Fill the remaining fields of the item from the entry's detail page.

        :param response: detail-page response carrying meta['item']
        :returns: the completed SpiderItem
        """
        item = response.meta['item']
        rows = response.xpath('//table[@class="category-list"]/tbody/tr/td')
        # Hoist the text extraction once; original called it three times.
        # NOTE: re.sub(r'\s', '') removes ALL whitespace (including interior),
        # matching the original behavior.
        texts = [re.sub(r'\s', '', t) for t in rows.xpath('text()').extract()]
        item['desc'] = texts[1]
        item['link'] = re.sub(r'\s', '', rows.xpath('a/@href').extract()[0])
        item['date'] = texts[4]
        item['summary'] = texts[5]
        return item