# -*- coding: utf8 -*-
from __future__ import absolute_import, division, print_function

import socket

try:
    from urlparse import urlparse  # Python 2
except ImportError:
    from urllib.parse import urlparse  # Python 3


class DnsResolverMiddleware(object):
    """
    Downloader middleware class for address resolving.
    It resolves a domain to IPv4 / IPv6 addresses.
    Originally it was designed for IPv6-only hosts.

    Algorithm:
    1) Replace all domain names with addresses in a request.
    2) Form a new request.
    3) Perform this request (in the Scrapy engine).
    4) Get a response.
    5) Replace addresses with names in this response.
    6) Return the response.

    It works without any caching, so it may be quite slow.

    TODO:
    * Add caching [?].
    """
    dns_dict = None

    def process_request(self, request, spider):
        """
        Replaces all domain names with addresses and forms a new request.
        It also replaces addresses with domain names in the Referer header.

        :param request: the request being processed;
        :param spider: dummy parameter in this case;
        :return: None or a new request with replaced links.
        """
        meta = getattr(request, 'meta', None)
        if not meta:
            meta = dict()
        if not self.dns_dict:
            self.dns_dict = dict()
        # To resist an infinite loop.
        if meta.get('resolved_request'):
            return None

        domain_url = request.url
        # Check if the domain and address are computed already.
        domain = meta.get('domain', str())
        address = meta.get('address', str())
        if not domain or not address:
            # Compute the domain and IP-address.
            domain = self.parse_host(domain_url)
            address = self.compute_address(domain)
            # Maintain a bidirectional mapping between domains and addresses.
            self.dns_dict.setdefault(domain, [])
            self.dns_dict.setdefault(address, [])
            if address not in self.dns_dict[domain]:
                self.dns_dict[domain] += [address]
            if domain not in self.dns_dict[address]:
                self.dns_dict[address] += [domain]

        # Get a new url with an address instead of a domain.
        address_url = self.convert_to_address(domain_url)

        # To replace `Referer`.
        headers = request.headers
        if not headers:
            headers = dict()
        # Get the old `Referer` with an address instead of a domain.
        address_referer = headers.get('Referer', str())
        # Get a new `Referer` with a domain.
        domain_referer = self.convert_to_domain(address_referer)

        # Form a new headers dict.
        new_headers = dict(
            headers,
            # Store a new Host header
            # for correct resolving on the requested server.
            Host=domain,
        )
        if domain_referer:
            new_headers = dict(
                new_headers,
                # Store a new Referer header
                # for correct handling on the requested server.
                Referer=domain_referer,
            )

        # Form a new meta dict.
        new_meta = dict(
            meta,
            # Mark this request as processed to resist an infinite loop.
            resolved_request=True,
            # Store some auxiliary data.
            # It helps us to escape from unnecessary computations
            # and debug 'em all.
            dns_dict=self.dns_dict,
            domain=domain,
            address=address,
            url=dict(
                domain=domain_url,
                address=address_url,
            ),
            referer=dict(
                domain=domain_referer,
                address=address_referer,
            ),
        )

        # Form a new request.
        new_request = request.replace(
            url=address_url,
            headers=new_headers,
            meta=new_meta,
        )
        return new_request
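
    # A minimal illustration of the rewrite above (hypothetical values,
    # assuming the domain resolves to the documentation address 192.0.2.1):
    #
    #   in:  url=http://example.com/page
    #   out: url=http://192.0.2.1/page,
    #        headers include Host: example.com,
    #        meta['resolved_request'] = True
    #
    # For an IPv6-only host the address is bracketed,
    # e.g. http://[2001:db8::1]/page.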
    def process_response(self, request, response, spider):
        """
        Tries to replace addresses with domain names.
        The replacement is not as reliable as expected;
        it may be a bug in the Scrapy engine.

        Nevertheless:
        1) It gets `meta` from the request.
        2) It replaces the `address` with the `domain` in the response's url.
        3) It forms a new request with that url.
        4) It forms a new response with that url and request.

        :param request: the request that originated the response;
        :param response: the response being processed;
        :param spider: dummy parameter in this case;
        :return: the response, possibly with replaced links.
        """
        # Get meta from the request. It is important.
        meta = getattr(request, 'meta', None)
        if not meta:
            meta = dict()
        if meta.get('resolved_response'):
            # A downloader middleware must return a response (or a request),
            # never None, from process_response.
            return response

        # Form a new url with a domain instead of the IP-address.
        url = response.url
        new_url = self.convert_to_domain(url)

        # Form a new meta dict.
        new_meta = dict(
            meta,
            # Mark this response as processed.
            resolved_response=True,
        )

        # Form a new request with our new url.
        new_request = request.replace(
            url=new_url,
            meta=new_meta,
        )
        # Form a new response with our new url and request.
        # Note: `replace()` returns a new object, so its result must be kept.
        response = response.replace(
            url=new_url,
            request=new_request,
        )
        return response

    def convert_to_address(self, url):
        """
        Replaces a domain with an address in the url.
        This function is a stub for future caching.

        :param str url: original url;
        :return: a new url.
        :rtype: str
        """
        domain = self.parse_host(url)
        address_list = self.dns_dict.get(domain, [])
        for address in address_list:
            url = url.replace(domain, address)
        return url

    def convert_to_domain(self, url):
        """
        Replaces an address with a domain in the url.
        This function is a stub for future caching.

        :param str url: original url;
        :return: a new url.
        :rtype: str
        """
        address = self.parse_host(url)
        domain_list = self.dns_dict.get(address, [])
        for domain in domain_list:
            url = url.replace(address, domain)
        return url

    def parse_host(self, url):
        """
        Returns the host part of the url.

        :param str url: source uri with an IP or domain;
        :return: host of the given url.
        :rtype: str
        """
        parsed_uri = urlparse(url)
        # Note: `netloc` may also contain a port or credentials;
        # such urls are not handled here.
        host = parsed_uri.netloc
        return host

    def compute_address(self, host_name):
        """
        Returns an http-compatible IP-address of the given host.
        This function is a stub for future caching.

        :param str host_name: symbolic name of the host;
        :return: string with the IP-address.
        :rtype: str
        """
        address = self._compute_address(host_name)
        return address

    def _compute_address(self, host_name):
        """
        Returns an http-compatible IP-address of the given host.
        For IPv6 it wraps the address into square brackets.

        :param str host_name: symbolic name of the host;
        :return: string with the IP-address.
        :rtype: str
        """
        addr_list = socket.getaddrinfo(host_name, 0)
        for addr in addr_list:
            (family, _socktype, _proto, _canonname, sockaddr) = addr
            if socket.AF_INET6 == family:
                (address, _port, _flow_info, _scope_id) = sockaddr
                # Wrap an IPv6 literal into square brackets (RFC 3986)
                # so it can be substituted into a url.
                address = '[{address}]'.format(
                    address=address,
                )
                return address
            elif socket.AF_INET == family:
                (address, _port) = sockaddr
                return address
        # Fall back to the unresolved name.
        return host_name
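
# A minimal sketch of enabling this middleware in a Scrapy project's
# settings.py. The module path and the priority value (543) are
# assumptions; adjust them to your project's layout:
#
#     DOWNLOADER_MIDDLEWARES = {
#         'myproject.middlewares.DnsResolverMiddleware': 543,
#     }

if __name__ == '__main__':
    # Standalone smoke test (requires network access): print the
    # http-compatible address the middleware would substitute for a host.
    # The host name here is only an illustration.
    middleware = DnsResolverMiddleware()
    print(middleware.compute_address('example.com'))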