
@lammoth
Last active August 23, 2016 10:55

Revisions

  1. lammoth revised this gist Aug 23, 2016. 2 changed files with 16 additions and 0 deletions.
    File renamed without changes.
    items.py: 16 additions, 0 deletions
    # -*- coding: utf-8 -*-

    # See documentation in:
    # http://doc.scrapy.org/en/latest/topics/items.html

    import scrapy


    class SpiderItem(scrapy.Item):
        date = scrapy.Field()
        title = scrapy.Field()
        desc = scrapy.Field()
        summary = scrapy.Field()
        category = scrapy.Field()
        source_link = scrapy.Field()
        link = scrapy.Field()
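
    A quick sketch of how these declared fields behave. The values below are hypothetical, not taken from the crawl: a scrapy.Item is populated like a dict, and only declared fields can be set.

    from spider.items import SpiderItem

    # Hypothetical example values.
    item = SpiderItem(title='Example dork', category='Footholds')
    item['date'] = '2016-08-23'
    print(dict(item))       # the three populated fields as a plain dict
    # item['author'] = 'x'  # would raise KeyError: SpiderItem has no 'author' field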
  2. lammoth created this gist Aug 1, 2016.
    GHDB Scrapy BOT: 40 additions, 0 deletions
    # -*- coding: utf-8 -*-

    import scrapy
    import re

    from spider.items import SpiderItem


    class GHDBSpider(scrapy.Spider):
        name = "ghdb"
        allowed_domains = ["www.exploit-db.com"]
        start_urls = [
            "https://www.exploit-db.com/google-hacking-database/?action=search&ghdb_search_page=1&ghdb_search_text=&ghdb_search_cat_id=0"
        ]

        def parse(self, response):
            # One row per dork in the category listing; follow each entry
            # to collect the remaining fields.
            for sel in response.xpath('//table[@class="category-list"]/tbody//tr'):
                item = SpiderItem()
                item['category'] = sel.xpath('td[@class="gd-description"]/a/text()').extract()[0]
                yield scrapy.Request(
                    sel.xpath('td/a[1]/@href').extract()[0],
                    callback=self.enrich_item,
                    # Wrap the item in a dict so it is not mixed in with
                    # Scrapy's own request meta keys.
                    meta={'item': item},
                )

            # Follow the "next" link in the pagination bar, if there is one.
            next_page = None
            for link in response.xpath('//div[@class="pagination"]//a'):
                if link.xpath('text()').extract()[0] == 'next':
                    next_page = link.xpath('@href').extract()[0]
                    break
            if next_page:
                url = response.urljoin(re.sub(r'\s', '', next_page))
                yield scrapy.Request(url, self.parse)

        def enrich_item(self, response):
            item = response.meta['item']
            rows = response.xpath('//table[@class="category-list"]/tbody/tr/td')
            # Remove all whitespace from the extracted cells.
            item['desc'] = re.sub(r'\s', '', rows.xpath('text()').extract()[1])
            item['link'] = re.sub(r'\s', '', rows.xpath('a/@href').extract()[0])
            item['date'] = re.sub(r'\s', '', rows.xpath('text()').extract()[4])
            item['summary'] = re.sub(r'\s', '', rows.xpath('text()').extract()[5])
            return item
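
    One way to try the spider end to end, sketched here since the gist itself does not show how it is run (the feed settings and output filename are assumptions): drive it with Scrapy's CrawlerProcess and export the scraped items as JSON.

    from scrapy.crawler import CrawlerProcess

    # GHDBSpider is the class defined above; 'ghdb.json' is an assumed
    # output path. Running `scrapy crawl ghdb -o ghdb.json` inside the
    # project would do the same.
    process = CrawlerProcess({
        'FEED_FORMAT': 'json',
        'FEED_URI': 'ghdb.json',
    })
    process.crawl(GHDBSpider)
    process.start()  # blocks until the crawl finishes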