{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#IPython-Scrapy\n",
"\n",
"This notebook is a minimal proof-of-concept Scrapy-IPython integration.\n",
"\n",
"To try this notebook, create a 'tmp' subfolder (in the folder 'ipython notebook' is executed from) and run\n",
"\n",
" python -m SimpleHTTPServer\n",
" \n",
"from this 'tmp' folder."
]
},
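{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next cell is an optional convenience: a minimal sketch that creates the 'tmp' folder and starts `SimpleHTTPServer` from inside the notebook, assuming Python 2 and the default port 8000 (running the command manually in a separate terminal works just as well)."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import os\n",
"import subprocess\n",
"import sys\n",
"\n",
"# Create the 'tmp' folder next to the notebook and serve it over HTTP.\n",
"# SimpleHTTPServer listens on port 8000 by default (Python 2 only).\n",
"if not os.path.isdir('tmp'):\n",
"    os.makedirs('tmp')\n",
"server = subprocess.Popen([sys.executable, '-m', 'SimpleHTTPServer'], cwd='tmp')"
],
"language": "python",
"metadata": {},
"outputs": []
},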
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Code for downloading webpages via Scrapy:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import print_function\n",
"import os\n",
"import sys\n",
"import multiprocessing\n",
"from multiprocessing.queues import Queue\n",
"import lxml.etree\n",
"import lxml.html\n",
"from scrapy import project, signals\n",
"from scrapy.spider import BaseSpider\n",
"from scrapy.item import Item, Field\n",
"from scrapy.crawler import CrawlerProcess\n",
"from scrapy.xlib.pydispatch import dispatcher\n",
"from scrapy.utils.project import get_project_settings\n",
"from scrapy.http import Request\n",
"from scrapy.selector import XPathSelector, XmlXPathSelector, HtmlXPathSelector\n",
"\n",
"TMP_DIR = './tmp'\n",
"\n",
"class ResponseItem(Item):\n",
" response = Field()\n",
"\n",
"class ResponseSpider(BaseSpider):\n",
" name = 'response_spider'\n",
" \n",
" def __init__(self, url):\n",
" self.url = url\n",
" super(ResponseSpider, self).__init__()\n",
" \n",
" def start_requests(self):\n",
" return [Request(self.url, self.parse, dont_filter=True)]\n",
" \n",
" def parse(self, response):\n",
" # request with callback fails to serialize - why?\n",
" req = response.request.replace(callback=None)\n",
" return ResponseItem(\n",
" response=response.replace(request=req),\n",
" )\n",
" \n",
" \n",
"class CrawlerWorker(multiprocessing.Process):\n",
" def __init__(self, result_queue, spider, settings=None):\n",
" multiprocessing.Process.__init__(self)\n",
" self.settings = settings or get_project_settings()\n",
" self.result_queue = result_queue\n",
" self.spider = spider\n",
" self.items = []\n",
" dispatcher.connect(self._item_passed, signals.item_passed)\n",
" \n",
" def _item_passed(self, item):\n",
" self.items.append(item)\n",
" \n",
" def run(self):\n",
" self.crawler = CrawlerProcess(self.settings)\n",
" self.crawler.install()\n",
" self.crawler.configure() \n",
" self.crawler.crawl(self.spider)\n",
" self.crawler.start() \n",
" self.crawler.stop()\n",
" self.result_queue.put(self.items)\n",
" \n",
"\n",
"def _download(url):\n",
" result_queue = Queue()\n",
" spider = ResponseSpider(url)\n",
" crawler = CrawlerWorker(result_queue, spider)\n",
" crawler.start() \n",
" item = result_queue.get()[0]\n",
" result_queue.cancel_join_thread()\n",
" crawler.join()\n",
" return item['response']\n",
"\n",
"def set_base(body, base):\n",
" if '
\n", "
\n", "