# -*- coding: utf-8 -*-
import logging
from collections import OrderedDict
from time import time
from urllib.parse import urldefrag

from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler, \
    ScrapyAgent, _RequestBodyProducer
from scrapy.utils.python import to_bytes
from twisted.internet import reactor
from twisted.web.http_headers import Headers as TxHeaders

logger = logging.getLogger(__name__)

# To use this, do the following:
# 1. Save it in a file, for example my_project/ordered_downloader.py
# 2. Register the downloader in your Scrapy project's settings.py (or in the
#    spider's custom_settings). DOWNLOAD_HANDLERS is merged with the built-in
#    DOWNLOAD_HANDLERS_BASE, so only the scheme you want to override needs to
#    be listed. For example, in settings:
#
# DOWNLOAD_HANDLERS = {
#     # Custom downloader that preserves header order. Use the path where
#     # you store your downloader.
#     'https': 'my_project.ordered_downloader.HeaderOrderDownloader',
# }


class OrderedHeaders(TxHeaders):
    # Tweaked Twisted Headers object that stores headers in an OrderedDict
    # and enforces a defined order: headers in the `ordering` list are set
    # first, then any remaining headers are appended.

    def __init__(self, rawHeaders=None):
        # Define your preferred header order here.
        ordering = [b'Host', b'User-Agent', b'Accept-Encoding', b'Accept',
                    b'Connection', b'Cookie']
        self._rawHeaders = OrderedDict()
        if rawHeaders is not None:
            # Set the ordered headers first, skipping any that are absent
            # from this request.
            for key in ordering:
                values = rawHeaders.get(key)
                if values is None:
                    continue
                if not isinstance(values, list):
                    values = rawHeaders.getlist(key)
                self.setRawHeaders(key, values)
            # Append the remaining headers.
            for name, values in rawHeaders.items():
                if name not in ordering:
                    self.setRawHeaders(name, values)

    def setRawHeaders(self, name, values):
        """
        Copy-pasted from Twisted; the only addition is the move_to_end()
        call after the header is stored.
        """
        if not isinstance(values, list):
            raise TypeError("Header entry %r should be list but found "
                            "instance of %r instead" % (name, type(values)))
        name = self._encodeName(name)
        self._rawHeaders[name] = self._encodeValues(values)
        self._rawHeaders.move_to_end(name)


class ScrapyHeaderOrderAgent(ScrapyAgent):
    # Copy-pasted from Scrapy's ScrapyAgent; the only difference is that it
    # builds OrderedHeaders instead of the usual Twisted Headers (see the
    # marked line below).

    def download_request(self, request):
        timeout = request.meta.get('download_timeout') or self._connectTimeout
        agent = self._get_agent(request, timeout)

        # request details
        url = urldefrag(request.url)[0]
        method = to_bytes(request.method)
        # The only change from Scrapy: use OrderedHeaders here.
        headers = OrderedHeaders(request.headers)
        if isinstance(agent, self._TunnelingAgent):
            headers.removeHeader(b'Proxy-Authorization')
        if request.body:
            bodyproducer = _RequestBodyProducer(request.body)
        elif method == b'POST':
            # Setting Content-Length: 0 even for POST requests is not a
            # MUST per the HTTP RFCs, but it is common behavior, and some
            # servers require it, otherwise returning HTTP 411 Length
            # Required.
            #
            # RFC 7230#section-3.3.2:
            # "a Content-Length header field is normally sent in a POST
            # request even when the value is 0 (indicating an empty payload
            # body)."
            #
            # Twisted < 17 will not add "Content-Length: 0" by itself;
            # Twisted >= 17 fixes this. Using a producer with an empty
            # body sends "Content-Length: 0" for all versions of Twisted.
            bodyproducer = _RequestBodyProducer(b'')
        else:
            bodyproducer = None
        start_time = time()
        d = agent.request(
            method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
        # set download latency
        d.addCallback(self._cb_latency, request, start_time)
        # response body is ready to be consumed
        d.addCallback(self._cb_bodyready, request)
        d.addCallback(self._cb_bodydone, request, url)
        # check download timeout
        self._timeout_cl = reactor.callLater(timeout, d.cancel)
        d.addBoth(self._cb_timeout, request, url, timeout)
        return d


class HeaderOrderDownloader(HTTP11DownloadHandler):
    # Copy-pasted from Scrapy's HTTP11DownloadHandler, tweaked to use
    # ScrapyHeaderOrderAgent instead of ScrapyAgent.

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download."""
        agent = ScrapyHeaderOrderAgent(
            contextFactory=self._contextFactory,
            pool=self._pool,
            maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
            warnsize=getattr(spider, 'download_warnsize',
                             self._default_warnsize),
            fail_on_dataloss=self._fail_on_dataloss,
        )
        return agent.download_request(request)
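

# A minimal sketch of how the resulting order can be checked outside a
# running crawl, assuming a plain Scrapy Headers object as input. The header
# names and values below are illustrative only, not part of the handler.
if __name__ == '__main__':
    from scrapy.http.headers import Headers

    raw = Headers({
        'X-Custom': 'foo',         # not in `ordering`, so appended last
        'Cookie': 'session=abc',
        'Accept': '*/*',
        'User-Agent': 'my-bot/1.0',
        'Host': 'example.com',
    })
    ordered = OrderedHeaders(raw)
    # getAllRawHeaders() iterates the underlying OrderedDict in insertion
    # order, so this should print: Host, User-Agent, Accept, Cookie, X-Custom,
    # regardless of the input order above.
    for name, values in ordered.getAllRawHeaders():
        print(name, values)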