Last active
August 23, 2016 10:55
-
-
Save lammoth/63e19a45fea48e65a10b78f1377f2563 to your computer and use it in GitHub Desktop.
Revisions
-
lammoth revised this gist
Aug 23, 2016. 2 changed files with 16 additions and 0 deletions. There are no files selected for viewing.
# -*- coding: utf-8 -*-
"""Item definitions for the GHDB spider.

See documentation in:
http://doc.scrapy.org/en/latest/topics/items.html
"""
import scrapy


class SpiderItem(scrapy.Item):
    """Container for one Google Hacking Database entry."""

    # Entry metadata
    title = scrapy.Field()
    date = scrapy.Field()
    category = scrapy.Field()
    # Entry content
    desc = scrapy.Field()
    summary = scrapy.Field()
    # Links
    source_link = scrapy.Field()
    link = scrapy.Field()
lammoth created this gist
Aug 1, 2016. There are no files selected for viewing.
# -*- coding: utf-8 -*-
"""Scrapy spider that crawls the Exploit-DB Google Hacking Database."""
import re

import scrapy

from spider.items import SpiderItem


class GHDBSpider(scrapy.Spider):
    """Crawl GHDB category listings and enrich each entry from its detail page."""

    name = "ghdb"
    allowed_domains = ["www.exploit-db.com"]
    start_urls = [
        "https://www.exploit-db.com/google-hacking-database/?action=search&ghdb_search_page=1&ghdb_search_text=&ghdb_search_cat_id=0"
    ]

    def parse(self, response):
        """Yield one detail-page request per listing row, then follow pagination.

        BUG FIX: the original passed the item itself as ``meta`` and later
        treated ``response.meta`` as the item.  ``response.meta`` is a plain
        dict that Scrapy augments with internal keys (``depth``,
        ``download_timeout``, ...), so the "item" returned downstream was not
        a ``SpiderItem`` and contained garbage keys.  The item is now carried
        under an explicit ``'item'`` meta key.
        """
        for sel in response.xpath('//table[@class="category-list"]/tbody//tr'):
            detail_url = sel.xpath('td/a[1]/@href').extract_first()
            if not detail_url:
                # Malformed row: the original's .extract()[0] would raise
                # IndexError here and kill the whole page.
                continue
            item = SpiderItem()
            item['category'] = sel.xpath(
                'td[@class="gd-description"]/a/text()').extract_first()
            yield scrapy.Request(detail_url, callback=self.enrich_item,
                                 meta={'item': item})

        # Follow the "next" pagination link, if present.
        next_page = None
        for link in response.xpath('//div[@class="pagination"]//a'):
            if link.xpath('text()').extract_first() == 'next':
                next_page = link.xpath('@href').extract_first()
                break  # found it; no need to scan the remaining links
        if next_page:
            # The scraped href contains stray whitespace; strip before joining.
            url = response.urljoin(re.sub(r'\s', '', next_page))
            yield scrapy.Request(url, callback=self.parse)

    def enrich_item(self, response):
        """Fill in the remaining fields of the item carried in the request meta.

        Reads the detail-page table cells by position (indices 1/4/5 of the
        text nodes) — presumably desc/date/summary columns; verify against
        the live page layout if the site changes.
        """
        item = response.meta['item']
        rows = response.xpath('//table[@class="category-list"]/tbody/tr/td')
        texts = rows.xpath('text()').extract()  # extract once, index repeatedly
        item['desc'] = re.sub(r'\s', '', texts[1])
        item['link'] = re.sub(r'\s', '', rows.xpath('a/@href').extract_first())
        item['date'] = re.sub(r'\s', '', texts[4])
        item['summary'] = re.sub(r'\s', '', texts[5])
        return item