Skip to content

Instantly share code, notes, and snippets.

@pawelmhm
Last active January 23, 2024 15:03
Show Gist options
  • Select an option

  • Save pawelmhm/8917867 to your computer and use it in GitHub Desktop.

Select an option

Save pawelmhm/8917867 to your computer and use it in GitHub Desktop.

Revisions

  1. pawelmhm revised this gist Feb 10, 2014. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion gistfile1.py
    Original file line number Diff line number Diff line change
    @@ -15,7 +15,7 @@ class Question(Item):
    class ArgSpider(CrawlSpider):
    """
    Scrapes all stackoverflow.com questions containing "query" within a given "tag" and
    Scrapes first 15 stackoverflow.com questions containing "query" within a given "tag" and
    displays links, number of votes etc in the terminal.
    Usage:
  2. pawelmhm revised this gist Feb 10, 2014. 1 changed file with 6 additions and 1 deletion.
    7 changes: 6 additions & 1 deletion gistfile1.py
    Original file line number Diff line number Diff line change
    @@ -19,7 +19,12 @@ class ArgSpider(CrawlSpider):
    displays links, number of votes etc in the terminal.
    Usage:
    ~: scrapy crawl StackSpider -a tag=python -a query="crawling a website"
    ~: scrapy crawl StackSpider -a tag=[your tag] -a query=[your query]
    For example
    ~: scrapy crawl StackSpider -a tag=python -a query="crawling a website"
    """
  3. pawelmhm revised this gist Feb 10, 2014. 1 changed file with 3 additions and 2 deletions.
    5 changes: 3 additions & 2 deletions gistfile1.py
    Original file line number Diff line number Diff line change
    @@ -14,8 +14,9 @@ class Question(Item):

    class ArgSpider(CrawlSpider):
    """
    No need to google anymore.
    Spider scrapes all stackoverflow.com questions containing "query" within a given "tag" and displays links, number of votes etc in the terminal
    Scrapes all stackoverflow.com questions containing "query" within a given "tag" and
    displays links, number of votes etc in the terminal.
    Usage:
    ~: scrapy crawl StackSpider -a tag=python -a query="crawling a website"
  4. pawelmhm renamed this gist Feb 10, 2014. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  5. pawelmhm created this gist Feb 10, 2014.
    57 changes: 57 additions & 0 deletions gistfile1.txt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,57 @@
    from scrapy.spider import Spider
    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from scrapy.selector import Selector
    from scrapy.item import Item, Field
    import urllib

class Question(Item):
    """Scrapy item holding one StackOverflow search result.

    Every field is populated from ``Selector...extract()`` in
    ``ArgSpider.parse``, so each value is a list of strings, not a scalar.
    """
    tags = Field()     # post tags, from '.post-tag::text'
    answers = Field()  # answer count, from '.status' <strong> text
    votes = Field()    # vote count, from '.vote-count-post' <strong> text
    date = Field()     # timestamp, from the '.relativetime' title attribute
    link = Field()     # question URL, from the '.result-link' anchor href

    class ArgSpider(CrawlSpider):
    """
    No need to google anymore.
    Spider scrapes all stackoverflow.com questions containing "query" within a given "tag" and displays links, number of votes etc in the terminal

    Usage:
    ~: scrapy crawl StackSpider -a tag=python -a query="crawling a website"


    """

    name = "StackSpider"

    def __init__(self,tag=None,query=None,*args,**kwargs):
    super(ArgSpider,self).__init__(*args,**kwargs)
    self.start_urls = []
    urlTemplate = "http://stackoverflow.com/search?q=%5B{tag}%5D{query}"
    query = urllib.quote(query)
    self.start_urls.append(urlTemplate.format(tag=tag,query=query))


    def parse(self,response):
    """

    @url http://stackoverflow.com/search?q=%5Bpython%5Dfiltering"
    @returns items 15
    @returns requests 0 1
    @scrapes votes answers date link

    """
    sel = Selector(response)
    elems = sel.css('.question-summary')
    results = []
    for elem in elems:
    item = Question()
    item["tags"] = elem.css('.post-tag::text').extract()
    item["votes"] = elem.css('.vote-count-post').xpath('.//strong/text()').extract()
    item["answers"] = elem.css('.status').xpath('.//strong/text()').extract()
    item["date"] = elem.css('.relativetime').xpath('.//@title').extract()
    link = elem.css('.result-link').xpath('.//a/@href').extract()
    item["link"] = link
    results.append(item)
    return results