Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save IAlwaysBeCoding/438464eec197f546ad1b707dba1f7551 to your computer and use it in GitHub Desktop.
Save IAlwaysBeCoding/438464eec197f546ad1b707dba1f7551 to your computer and use it in GitHub Desktop.

Revisions

  1. @w495 w495 revised this gist Feb 9, 2017. No changes.
  2. @w495 w495 revised this gist Feb 9, 2017. 1 changed file with 62 additions and 28 deletions.
    90 changes: 62 additions & 28 deletions dns_resolver_middleware.py
    Original file line number Diff line number Diff line change
    @@ -28,6 +28,8 @@ class DnsResolverMiddleware(object):
    """

    dns_dict = None

    def process_request(self, request, spider):
    """
    Replaces all domain names to addresses and forms a new request.
    @@ -45,25 +47,32 @@ def process_request(self, request, spider):
    if not meta:
    meta = dict()

    if not self.dns_dict:
    self.dns_dict = dict()

    # To resist infinite loop.
    if meta.get('resolved_request'):
    return None

    domain_url = request.url

    # Check if domain and address are computed already.
    domain = meta.get('domain', '')
    address = meta.get('address', '')
    domain = meta.get('domain', str())
    address = meta.get('address', str())
    if not address or not address:
    # Compute domain and IP-address.
    parsed_uri = urlparse(domain_url)
    domain = parsed_uri.netloc
    domain = self.parse_host(domain_url)
    address = self.compute_address(domain)

    self.dns_dict.setdefault(domain, [])
    self.dns_dict.setdefault(address, [])
    if address not in self.dns_dict[domain]:
    self.dns_dict[domain] += [address]
    if domain not in self.dns_dict[address]:
    self.dns_dict[address] += [domain]

    # Get a new url with an address instead of domain.
    address_url = self.domain_to_address(domain_url,
    domain,
    address)
    address_url = self.convert_to_address(domain_url)

    # To replace `Referer`.
    headers = request.headers
    @@ -73,20 +82,24 @@ def process_request(self, request, spider):
    # Get a old `Referer` with an address instead of domain.
    address_referer = request.headers.get('Referer', str())
    # Get a new `Referer` with a domain.
    domain_referer = self.address_to_domain(address_referer,
    address,
    domain)
    domain_referer = self.convert_to_domain(address_referer)

    # Form a new headers dict.
    new_headers = dict(
    headers,
    # Store a new Host-header
    # for correct resolving on the requested server.
    Host=domain,
    # Store a new Referer-header
    # for correct handling on the requested server
    Referer=domain_referer
    )

    if domain_referer:
    new_headers = dict(
    new_headers,
    # Store a new Referer-header
    # for correct handling on the requested server
    Referer=domain_referer
    )

    # Form a new meta dict.
    new_meta = dict(
    meta,
    @@ -95,6 +108,7 @@ def process_request(self, request, spider):
    # Store some auxiliary data.
    # It helps us to escape from unnecessary computations
    # and debug'em all.
    dns_dict=self.dns_dict,
    domain=domain,
    address=address,
    url=dict(
    @@ -142,10 +156,8 @@ def process_response(self, request, response, spider):
    return None

    # Form a new url with domain instead of IP-address.
    domain = meta.get('domain', '')
    address = meta.get('address', '')
    url = response.url
    new_url = self.address_to_domain(url, address, domain)
    new_url = self.convert_to_domain(url)

    # Form a new meta dict.
    new_meta = dict(
    @@ -165,40 +177,61 @@ def process_response(self, request, response, spider):
    )
    return response

    def domain_to_address(self, url, domain, address):
    def convert_to_address(self, url):
    """
    Replaces a domain to an address in the url.
    This function is a stub for caching.
    :param str url: original url;
    :param str domain: symbolic host name;
    :param str address: IP-address of the host.
    :return: a new url.
    :rtype: str
    """
    new_url = url.replace(domain, address)
    return new_url

    def address_to_domain(self, url, domain, address):
    domain = self.parse_host(url)
    address_list = self.dns_dict.get(domain, [])
    for address in address_list:
    url = url.replace(domain, address)
    return url

    def convert_to_domain(self, url):
    """
    Replaces an address to a domain in the url.
    This function is a stub for caching.
    :param str url: original url;
    :param str address: IP-address of the host.
    :param str domain: symbolic host name;
    :return: a new url.
    :rtype: str
    """

    new_url = url.replace(domain, address)
    return new_url
    address = self.parse_host(url)
    domain_list = self.dns_dict.get(address, [])
    for domain in domain_list:
    url = url.replace(address, domain)
    return url

    def parse_host(self, url):
    """
    Returns a host part of url.
    :param str url: source uri with IP or domain
    :return: host of given url
    :rtype: str
    """
    parsed_uri = urlparse(url)
    host = parsed_uri.netloc
    return host

    def compute_address(self, host_name):
    """
    Returns a http-compatible IP-address of the given host.
    This function is a stub for caching.
    :param str host_name: symbolic name of host;
    :return: string with IP-address.
    :rtype: str
    """
    address = self._compute_address(host_name)
    return address
    @@ -216,12 +249,13 @@ def _compute_address(self, host_name):
    addr_list = socket.getaddrinfo(host_name, 0)
    for addr in addr_list:
    (family, _socktype, _proto, _canonname, sockaddr) = addr
    (address, _port, _flow_info, _scope_id) = sockaddr
    if socket.AF_INET6 == family:
    address = u'[{address}]'.format(
    (address, _port, _flow_info, _scope_id) = sockaddr
    address = '[{address}]'.format(
    address=address
    )
    return address
    elif socket.AF_INET == family:
    (address, _port) = sockaddr
    return address
    return host_name
  3. @w495 w495 created this gist Jan 3, 2017.
    227 changes: 227 additions & 0 deletions dns_resolver_middleware.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,227 @@
    # -*- coding: utf8 -*-

    from __future__ import absolute_import, division, print_function

    import socket
    from urlparse import urlparse


    class DnsResolverMiddleware(object):
        """
        Downloader middleware that resolves domain names by itself.

        It resolves a domain to an IPv4 / IPv6 address.
        Previously it was designed for IPv6-only hosts.

        Algorithm:
        1) Replace the domain name with its address in the request url.
        2) Form a new request (the original domain goes to `Host`).
        3) Perform this request (at Scrapy Engine).
        4) Get a response.
        5) Replace the address with the domain name in the response url.
        6) Return the response.

        It works without any caching. So it may be quite slow.

        TODO:
        * Add caching [?].
        """

        def process_request(self, request, spider):
            """
            Form a new request whose url holds an address, not a domain.

            The `Host` header of the new request carries the domain and
            the `Referer` header (if any) is converted back from its
            address form to its domain form.

            :param request: the request being processed;
            :param spider: dummy parameter in this case;
            :return: None if the request is already resolved,
                otherwise a new request with replaced links.
            """
            meta = getattr(request, 'meta', None) or dict()

            # The request we return passes through this method again,
            # so the flag below is what stops an infinite loop.
            if meta.get('resolved_request'):
                return None

            domain_url = request.url

            # Reuse the pair stored in meta only if it is complete.
            domain = meta.get('domain', '')
            address = meta.get('address', '')
            if not domain or not address:
                # NOTE(review): `netloc` keeps any port / userinfo part
                # of the url, and such a string is handed to the
                # resolver as is -- confirm such urls are not expected.
                domain = urlparse(domain_url).netloc
                address = self.compute_address(domain)

            # Get a new url with an address instead of domain.
            address_url = self.domain_to_address(domain_url, domain, address)

            headers = request.headers or dict()

            # The incoming `Referer` may hold an address (it can come
            # from an already rewritten url); turn it into a domain.
            address_referer = headers.get('Referer', str())
            domain_referer = self.address_to_domain(
                address_referer, address, domain
            )

            # `Host` tells the requested server which site we want,
            # because the url itself now holds only the address.
            new_headers = dict(headers, Host=domain)
            if domain_referer:
                # Do not invent an empty `Referer` header for requests
                # that had none.
                new_headers['Referer'] = domain_referer

            new_meta = dict(
                meta,
                # Mark this request as processed to resist infinite loop.
                resolved_request=True,
                # Auxiliary data: spares repeated computations
                # and helps with debugging.
                domain=domain,
                address=address,
                url=dict(
                    domain=domain_url,
                    address=address_url,
                ),
                referer=dict(
                    domain=domain_referer,
                    address=address_referer,
                ),
            )
            return request.replace(
                url=address_url,
                headers=new_headers,
                meta=new_meta,
            )

        def process_response(self, request, response, spider):
            """
            Put the domain name back into the url of the response.

            1) It gets `domain` and `address` from request's meta.
            2) It replaces `address` with `domain` in response's url.
            3) It forms a new request with that url.
            4) It forms a new response with that url and that request.

            :param request: the request that originated the response;
            :param response: the response being processed;
            :param spider: dummy parameter in this case;
            :return: the given response if it is already processed,
                otherwise a new response with replaced links.
            """
            # Get meta from request. It is important.
            meta = getattr(request, 'meta', None) or dict()
            if meta.get('resolved_response'):
                # A downloader middleware has to hand a response (or a
                # request) back from this hook, never None.
                return response

            domain = meta.get('domain', '')
            address = meta.get('address', '')
            new_url = self.address_to_domain(response.url, address, domain)

            new_request = request.replace(
                url=new_url,
                # Mark this response as processed.
                meta=dict(meta, resolved_response=True),
            )
            # `replace` builds a new object and leaves `response`
            # untouched, so its result is what has to be returned.
            return response.replace(
                url=new_url,
                request=new_request,
            )

        def domain_to_address(self, url, domain, address):
            """
            Replace every occurrence of the domain with the address.

            This function is a stub for caching.

            :param str url: original url;
            :param str domain: symbolic host name;
            :param str address: IP-address of the host.
            :return: a new url (the same url if `domain` is empty).
            """
            if not domain:
                # Replacing an empty string would insert the address
                # between all characters of the url.
                return url
            return url.replace(domain, address)

        def address_to_domain(self, url, domain, address):
            """
            Replace every occurrence of an address with the domain.

            This function is a stub for caching.

            NOTE: the parameter names are the reverse of their roles.
            The callers in this class pass `(url, address, domain)`
            positionally, so the text to search for arrives in the
            parameter named `domain` and its replacement in the one
            named `address`. The names are kept for compatibility.

            :param str url: original url;
            :param str domain: text to search for (the IP-address);
            :param str address: replacement (the symbolic host name).
            :return: a new url (the same url if the search text is empty).
            """
            if not domain:
                # Replacing an empty string would insert the replacement
                # between all characters of the url.
                return url
            return url.replace(domain, address)

        def compute_address(self, host_name):
            """
            Return a http-compatible IP-address of the given host.

            This function is a stub for caching.

            :param str host_name: symbolic name of host;
            :return: string with IP-address.
            """
            return self._compute_address(host_name)

        def _compute_address(self, host_name):
            """
            Return a http-compatible IP-address of the given host.

            The first IPv4 or IPv6 entry reported by the resolver wins.
            For IPv6 it wraps address into square brackets.

            :param str host_name: symbolic name of host;
            :return: string with IP-address, or `host_name` itself
                if the resolver reports neither IPv4 nor IPv6 entries.
            :rtype: str
            """
            for addr in socket.getaddrinfo(host_name, 0):
                family, sockaddr = addr[0], addr[4]
                # `sockaddr` has 2 items for IPv4 and 4 items for IPv6,
                # so take the address by index instead of unpacking
                # a fixed number of items.
                if family == socket.AF_INET6:
                    return u'[{address}]'.format(address=sockaddr[0])
                if family == socket.AF_INET:
                    return sockaddr[0]
            return host_name