-
-
Save IAlwaysBeCoding/438464eec197f546ad1b707dba1f7551 to your computer and use it in GitHub Desktop.
Revisions
-
w495 revised this gist
Feb 9, 2017. No changes. There are no files selected for viewing.
-
w495 revised this gist
Feb 9, 2017. 1 changed file with 62 additions and 28 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -28,6 +28,8 @@ class DnsResolverMiddleware(object): """ dns_dict = None def process_request(self, request, spider): """ Replaces all domain names to addresses and forms a new request. @@ -45,25 +47,32 @@ def process_request(self, request, spider): if not meta: meta = dict() if not self.dns_dict: self.dns_dict = dict() # To resist infinite loop. if meta.get('resolved_request'): return None domain_url = request.url # Check if domain and address are computed already. domain = meta.get('domain', str()) address = meta.get('address', str()) if not address or not address: # Compute domain and IP-address. domain = self.parse_host(domain_url) address = self.compute_address(domain) self.dns_dict.setdefault(domain, []) self.dns_dict.setdefault(address, []) if address not in self.dns_dict[domain]: self.dns_dict[domain] += [address] if domain not in self.dns_dict[address]: self.dns_dict[address] += [domain] # Get a new url with an address instead of domain. address_url = self.convert_to_address(domain_url) # To replace `Referer`. headers = request.headers @@ -73,20 +82,24 @@ def process_request(self, request, spider): # Get a old `Referer` with an address instead of domain. address_referer = request.headers.get('Referer', str()) # Get a new `Referer` with a domain. domain_referer = self.convert_to_domain(address_referer) # Form a new headers dict. new_headers = dict( headers, # Store a new Host-header # for correct resolving on the requested server. Host=domain, ) if domain_referer: new_headers = dict( new_headers, # Store a new Referer-header # for correct handling on the requested server Referer=domain_referer ) # Form a new meta dict. 
new_meta = dict( meta, @@ -95,6 +108,7 @@ def process_request(self, request, spider): # Store some auxiliary data. # It helps us to escape from unnecessary computations # and debug'em all. dns_dict=self.dns_dict, domain=domain, address=address, url=dict( @@ -142,10 +156,8 @@ def process_response(self, request, response, spider): return None # Form a new url with domain instead of IP-address. url = response.url new_url = self.convert_to_domain(url) # Form a new meta dict. new_meta = dict( @@ -165,40 +177,61 @@ def process_response(self, request, response, spider): ) return response def convert_to_address(self, url): """ Replaces a domain to an address in the url. This function is a stub for caching. :param str url: original url; :return: a new url. :rtype: str """ domain = self.parse_host(url) address_list = self.dns_dict.get(domain, []) for address in address_list: url = url.replace(domain, address) return url def convert_to_domain(self, url): """ Replaces an address to a domain in the url. This function is a stub for caching. :param str url: original url; :return: a new url. :rtype: str """ address = self.parse_host(url) domain_list = self.dns_dict.get(address, []) for domain in domain_list: url = url.replace(address, domain) return url def parse_host(self, url): """ Returns a host part of url. :param str url: source uri with IP or domain :return: host of given url :rtype: str """ parsed_uri = urlparse(url) host = parsed_uri.netloc return host def compute_address(self, host_name): """ Returns a http-compatible IP-address of the given host. This function is a stub for caching. :param str host_name: symbolic name of host; :return: string with IP-address. 
:rtype: str """ address = self._compute_address(host_name) return address @@ -216,12 +249,13 @@ def _compute_address(self, host_name): addr_list = socket.getaddrinfo(host_name, 0) for addr in addr_list: (family, _socktype, _proto, _canonname, sockaddr) = addr if socket.AF_INET6 == family: (address, _port, _flow_info, _scope_id) = sockaddr address = '[{address}]'.format( address=address ) return address elif socket.AF_INET == family: (address, _port) = sockaddr return address return host_name -
w495 created this gist
Jan 3, 2017. There are no files selected for viewing.
# -*- coding: utf8 -*-

from __future__ import absolute_import, division, print_function

import socket

try:
    # Python 2.
    from urlparse import urlparse
except ImportError:
    # Python 3.
    from urllib.parse import urlparse


class DnsResolverMiddleware(object):
    """
    Downloader middleware for address resolving.

    It resolves a domain name to an IPv4 / IPv6 address before the
    request is sent, and maps the address back to the domain name in
    the response. Originally designed for IPv6-only hosts.

    Algorithm:
        1) Replace the domain name with an address in the request url.
        2) Form a new request.
        3) Perform this request (at the Scrapy engine).
        4) Get a response.
        5) Replace the address back with the domain name.
        6) Return the response.

    It works without any caching, so it may be quite slow.

    TODO:
        * Add caching [?].
    """

    def process_request(self, request, spider):
        """
        Replaces the domain name with an address and forms a new request.

        Also replaces an address with the domain name in the
        Referer-header, so the remote server sees symbolic names.

        :param request: the request being processed;
        :param spider: dummy parameter in this case;
        :return: None or a new request with replaced links.
        """
        meta = getattr(request, 'meta', None)
        if not meta:
            meta = dict()

        # To resist infinite loop: requests produced by this
        # middleware carry the `resolved_request` mark.
        if meta.get('resolved_request'):
            return None

        domain_url = request.url

        # Check if domain and address are computed already.
        domain = meta.get('domain', '')
        address = meta.get('address', '')
        # BUG FIX: the original tested `not address or not address`
        # (duplicated operand), so a meta with an address but no
        # domain would have produced an empty Host header.
        if not domain or not address:
            # Compute domain and IP-address.
            parsed_uri = urlparse(domain_url)
            domain = parsed_uri.netloc
            address = self.compute_address(domain)

        # Get a new url with an address instead of domain.
        address_url = self.domain_to_address(domain_url, domain, address)

        # To replace `Referer`.
        headers = request.headers
        if not headers:
            headers = dict()

        # Get an old `Referer` with an address instead of domain.
        address_referer = request.headers.get('Referer', str())

        # Get a new `Referer` with a domain.
        domain_referer = self.address_to_domain(address_referer, address, domain)

        # Form a new headers dict.
        new_headers = dict(
            headers,
            # Store a new Host-header
            # for correct resolving on the requested server.
            Host=domain,
        )
        # Only set Referer when there actually was one: an empty
        # Referer header would be sent verbatim otherwise.
        if domain_referer:
            new_headers = dict(
                new_headers,
                # Store a new Referer-header
                # for correct handling on the requested server.
                Referer=domain_referer,
            )

        # Form a new meta dict.
        new_meta = dict(
            meta,
            # Mark this request as processed to resist infinite loop.
            resolved_request=True,
            # Store some auxiliary data.
            # It helps us to escape from unnecessary computations
            # and debug'em all.
            domain=domain,
            address=address,
            url=dict(
                domain=domain_url,
                address=address_url,
            ),
            referer=dict(
                domain=domain_referer,
                address=address_referer,
            ),
        )

        # Form a new request.
        new_request = request.replace(
            url=address_url,
            headers=new_headers,
            meta=new_meta,
        )
        return new_request

    def process_response(self, request, response, spider):
        """
        Tries to replace addresses with domain names in the response.

        1) Gets `domain` and `address` from the request's meta.
        2) Replaces `address` with `domain` in the response's url.
        3) Forms a new request and a new response with the domain url.

        :param request: the request that originated the response;
        :param response: the response being processed;
        :param spider: dummy parameter in this case;
        :return: None or a new response with replaced links.
        """
        # Get meta from the request. It is important:
        # the response itself carries no meta.
        meta = getattr(request, 'meta', None)
        if not meta:
            meta = dict()

        if meta.get('resolved_response'):
            return None

        # Form a new url with domain instead of IP-address.
        domain = meta.get('domain', '')
        address = meta.get('address', '')
        url = response.url
        new_url = self.address_to_domain(url, address, domain)

        # Form a new meta dict.
        new_meta = dict(
            meta,
            # Mark this response as processed.
            resolved_response=True,
        )

        # Form a new request with our new url.
        new_request = request.replace(
            url=new_url,
            meta=new_meta,
        )

        # BUG FIX: `replace` returns a NEW response object; the
        # original discarded it and returned the untouched response.
        new_response = response.replace(
            url=new_url,
            request=new_request,
        )
        return new_response

    def domain_to_address(self, url, domain, address):
        """
        Replaces a domain with an address in the url.
        This function is a stub for caching.

        :param str url: original url;
        :param str domain: symbolic host name;
        :param str address: IP-address of the host.
        :return: a new url.
        :rtype: str
        """
        new_url = url.replace(domain, address)
        return new_url

    def address_to_domain(self, url, address, domain):
        """
        Replaces an address with a domain in the url.
        This function is a stub for caching.

        NOTE(review): the original signature was
        `(url, domain, address)` while every call site passed
        `(url, address, domain)`; the names are fixed here to match
        the call sites (all callers are positional, so behavior is
        unchanged).

        :param str url: original url;
        :param str address: IP-address of the host;
        :param str domain: symbolic host name.
        :return: a new url.
        :rtype: str
        """
        new_url = url.replace(address, domain)
        return new_url

    def compute_address(self, host_name):
        """
        Returns an http-compatible IP-address of the given host.
        This function is a stub for caching.

        :param str host_name: symbolic name of host;
        :return: string with IP-address.
        :rtype: str
        """
        address = self._compute_address(host_name)
        return address

    def _compute_address(self, host_name):
        """
        Returns an http-compatible IP-address of the given host.
        For IPv6 it wraps the address into square brackets.

        Performs a blocking DNS lookup via `socket.getaddrinfo`.
        Falls back to the host name itself when no usable
        address family is found.

        :param str host_name: symbolic name of host;
        :return: string with IP-address.
        :rtype: str
        """
        addr_list = socket.getaddrinfo(host_name, 0)
        for addr in addr_list:
            (family, _socktype, _proto, _canonname, sockaddr) = addr
            if socket.AF_INET6 == family:
                # IPv6 sockaddr is a 4-tuple.
                (address, _port, _flow_info, _scope_id) = sockaddr
                address = u'[{address}]'.format(
                    address=address,
                )
                return address
            elif socket.AF_INET == family:
                # BUG FIX: IPv4 sockaddr is a 2-tuple; the original
                # unpacked a 4-tuple before the family check and
                # raised ValueError on any IPv4 result.
                (address, _port) = sockaddr
                return address
        return host_name