# -*- coding: utf-8 -*-
"""Scrapy spider for the Exploit-DB Google Hacking Database (GHDB).

Walks the paginated category listing, emits one request per entry,
and enriches each item from its detail page.
"""
import re

import scrapy

from spider.items import SpiderItem


class GHDBSpider(scrapy.Spider):
    """Crawl GHDB listing pages and fill a SpiderItem per entry."""

    name = "ghdb"
    allowed_domains = ["www.exploit-db.com"]
    start_urls = [
        "https://www.exploit-db.com/google-hacking-database/?action=search"
        "&ghdb_search_page=1&ghdb_search_text=&ghdb_search_cat_id=0"
    ]

    def parse(self, response):
        """Yield one detail-page request per listing row, then follow pagination.

        :param response: listing-page response
        :yields: scrapy.Request for each entry's detail page and the next page
        """
        for sel in response.xpath('//table[@class="category-list"]/tbody//tr'):
            categories = sel.xpath('td[@class="gd-description"]/a/text()').extract()
            detail_urls = sel.xpath('td/a[1]/@href').extract()
            if not categories or not detail_urls:
                # Malformed/empty row: skip instead of crashing with IndexError.
                continue
            item = SpiderItem()
            item['category'] = categories[0]
            # BUG FIX: wrap the item in a dict. Scrapy writes bookkeeping keys
            # (depth, download_timeout, download_slot, ...) into request.meta;
            # passing the Item itself as meta makes those writes hit the Item
            # and fail on undeclared fields.
            yield scrapy.Request(detail_urls[0],
                                 callback=self.enrich_item,
                                 meta={'item': item})

        # Pagination: find the anchor whose text is exactly 'next'.
        next_page = None
        for link in response.xpath('//div[@class="pagination"]//a'):
            titles = link.xpath('text()').extract()
            if titles and titles[0] == 'next':
                hrefs = link.xpath('@href').extract()
                if hrefs:
                    next_page = hrefs[0]
        if next_page:
            # Strip all whitespace from the href before resolving it.
            url = response.urljoin(re.sub(r'\s', '', next_page))
            yield scrapy.Request(url, self.parse)

    def enrich_item(self, response):
        """Fill the remaining fields of the item from the entry's detail page.

        :param response: detail-page response carrying meta['item']
        :returns: the completed SpiderItem
        """
        item = response.meta['item']
        rows = response.xpath('//table[@class="category-list"]/tbody/tr/td')
        # Hoist the text extraction once; original called it three times.
        # NOTE: re.sub(r'\s', '') removes ALL whitespace (including interior),
        # matching the original behavior.
        texts = [re.sub(r'\s', '', t) for t in rows.xpath('text()').extract()]
        item['desc'] = texts[1]
        item['link'] = re.sub(r'\s', '', rows.xpath('a/@href').extract()[0])
        item['date'] = texts[4]
        item['summary'] = texts[5]
        return item