Skip to content

Instantly share code, notes, and snippets.

@pawelmhm
Last active January 23, 2024 15:03
Show Gist options
  • Select an option

  • Save pawelmhm/8917867 to your computer and use it in GitHub Desktop.

Select an option

Save pawelmhm/8917867 to your computer and use it in GitHub Desktop.

Revisions

  1. pawelmhm revised this gist Feb 10, 2014. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion gistfile1.py
    Original file line number Diff line number Diff line change
    @@ -15,7 +15,7 @@ class Question(Item):
    class ArgSpider(CrawlSpider):
    """
    Scrapes all stackoverflow.com questions containing "query" within a given "tag" and
    Scrapes first 15 stackoverflow.com questions containing "query" within a given "tag" and
    displays links, number of votes etc in the terminal.
    Usage:
  2. pawelmhm revised this gist Feb 10, 2014. 1 changed file with 6 additions and 1 deletion.
    7 changes: 6 additions & 1 deletion gistfile1.py
    Original file line number Diff line number Diff line change
    @@ -19,7 +19,12 @@ class ArgSpider(CrawlSpider):
    displays links, number of votes etc in the terminal.
    Usage:
    ~: scrapy crawl StackSpider -a tag=python -a query="crawling a website"
    ~: scrapy crawl StackSpider -a tag=[your tag] -a query=[your query]
    For example
    ~: scrapy crawl StackSpider -a tag=python -a query="crawling a website"
    """
  3. pawelmhm revised this gist Feb 10, 2014. 1 changed file with 3 additions and 2 deletions.
    5 changes: 3 additions & 2 deletions gistfile1.py
    Original file line number Diff line number Diff line change
    @@ -14,8 +14,9 @@ class Question(Item):

    class ArgSpider(CrawlSpider):
    """
    No need to google anymore.
    Spider scrapes all stackoverflow.com questions containing "query" within a given "tag" and displays links, number of votes etc in the terminal
    Scrapes all stackoverflow.com questions containing "query" within a given "tag" and
    displays links, number of votes etc in the terminal.
    Usage:
    ~: scrapy crawl StackSpider -a tag=python -a query="crawling a website"
  4. pawelmhm renamed this gist Feb 10, 2014. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  5. pawelmhm created this gist Feb 10, 2014.
    57 changes: 57 additions & 0 deletions gistfile1.txt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,57 @@
    from scrapy.spider import Spider
    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from scrapy.selector import Selector
    from scrapy.item import Item, Field
    import urllib

class Question(Item):
    """Scrapy item holding one StackOverflow search result.

    Every field is populated from ``Selector...extract()`` in
    ``ArgSpider.parse``, so each value is a list of strings, not a scalar.
    """
    tags = Field()     # post tags, from '.post-tag::text'
    answers = Field()  # answer count, from '.status' <strong> text
    votes = Field()    # vote count, from '.vote-count-post' <strong> text
    date = Field()     # timestamp, from the '.relativetime' title attribute
    link = Field()     # question URL, from the '.result-link' anchor href

    class ArgSpider(CrawlSpider):
    """
    No need to google anymore.
    Spider scrapes all stackoverflow.com questions containing "query" within a given "tag" and displays links, number of votes etc in the terminal

    Usage:
    ~: scrapy crawl StackSpider -a tag=python -a query="crawling a website"


    """

    name = "StackSpider"

    def __init__(self,tag=None,query=None,*args,**kwargs):
    super(ArgSpider,self).__init__(*args,**kwargs)
    self.start_urls = []
    urlTemplate = "http://stackoverflow.com/search?q=%5B{tag}%5D{query}"
    query = urllib.quote(query)
    self.start_urls.append(urlTemplate.format(tag=tag,query=query))


    def parse(self,response):
    """

    @url http://stackoverflow.com/search?q=%5Bpython%5Dfiltering"
    @returns items 15
    @returns requests 0 1
    @scrapes votes answers date link

    """
    sel = Selector(response)
    elems = sel.css('.question-summary')
    results = []
    for elem in elems:
    item = Question()
    item["tags"] = elem.css('.post-tag::text').extract()
    item["votes"] = elem.css('.vote-count-post').xpath('.//strong/text()').extract()
    item["answers"] = elem.css('.status').xpath('.//strong/text()').extract()
    item["date"] = elem.css('.relativetime').xpath('.//@title').extract()
    link = elem.css('.result-link').xpath('.//a/@href').extract()
    item["link"] = link
    results.append(item)
    return results