Skip to content

Instantly share code, notes, and snippets.

@mdaniel
Created May 10, 2015 20:17
Show Gist options
  • Save mdaniel/032cae7d161912d2a2ce to your computer and use it in GitHub Desktop.
Save mdaniel/032cae7d161912d2a2ce to your computer and use it in GitHub Desktop.

Revisions

  1. mdaniel created this gist May 10, 2015.
    185 changes: 185 additions & 0 deletions hidemyass_proxylist.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,185 @@
    #! /usr/bin/env python
    # -*- coding: utf-8 -*-
    from __future__ import absolute_import, print_function, unicode_literals
    # Docstring markup format advertised to documentation tools (PEP 258);
    # the recognised name is "restructuredtext".
    __docformat__ = 'restructuredtext'
    import logging
    import re
    import sys

    import html5lib
    from xml.etree.ElementTree import tostring

    # Configure the root logger with no level threshold of its own (NOTSET);
    # the effective verbosity is controlled on the module logger below.
    logging.basicConfig(level=logging.NOTSET)
    # Module-wide logger used by every function in this file.
    LOG = logging.getLogger(__name__)
    # Default verbosity is INFO; main() switches it to DEBUG for -v/--verbose.
    LOG.setLevel(logging.INFO)

    # Matches a CSS rule of the form ``.name { display: none }`` that starts a
    # line (optionally indented by spaces/tabs) and captures ``name``.
    # A trailing ``;`` after ``none`` is tolerated so that declarations such
    # as ``display:none;`` are recognised as well.
    DISPLAY_NONE_RE = re.compile(
        r'(?m)^[ \t]*\.(\S+)\s*\{\s*display\s*:\s*none\s*;?\s*\}')


    def find_display_none_classes(css_text):
        """
        Finds the class names in the given css style text
        that are ``display:none``.

        For example, feeding this::

            .thing { display:inline }
            .other { display:none }

        will return::

            ['other']

        :param css_text: the text of a stylesheet; ``None`` or an empty
            string yields an empty list
        :type css_text: unicode | None
        :returns: the matching class names, without the leading dot,
            in the order they appear
        :rtype: list[unicode]
        """
        if not css_text:
            # an element without text has ``.text`` of None, and
            # ``findall(None)`` would raise TypeError
            return []
        return DISPLAY_NONE_RE.findall(css_text)


    def test_parse_css():
        """Print the hidden class names found in a small sample stylesheet."""
        sample_css = """
    .s-Aw{display:none}
    .zJsC{display:inline}
    .IQvs{display:none}
    .yBZq{display:inline}
    .CyxL{display:none}
    .dt09{display:inline}
    """
        hidden_names = find_display_none_classes(sample_css)
        print(hidden_names)


    def extract_proxy_info(tr_el):
        """
        Returns an (ip:port, proxy type, anon level) tuple
        from the provided table row element.

        The port, proxy type and anonymity level are read from the text of
        the row's cells.  The IP address is assembled from the children of
        the element that holds the row's ``<style>`` child: the text of a
        child is kept when none of its classes is ``display:none`` in that
        stylesheet (or its inline ``style`` is not ``display:none``), and
        the tail text of every child is always kept.

        :type tr_el: xml.etree.ElementTree.Element
        :returns: the tuple, or ``None`` (after logging) when the row has no
            ``<style>`` element or yields no IP fragments
        :rtype: (unicode, unicode, unicode) | None
        :raises ValueError: if no port cell is found, or the assembled
            text does not start with a dotted-quad IP
        :raises Exception: if more than one cell looks like a port
        """
        port = None
        kind = None
        level = None

        for td_num, td_el in enumerate(tr_el):
            cell_text = td_el.text
            if not cell_text:
                continue
            # anchored at the end so only an all-digit cell counts as the
            # port; text such as "16 secs" is skipped instead of blowing up
            # inside int()
            if re.match(r'\s*\d+\s*$', cell_text):
                if port is not None:
                    raise Exception('Cannot cope with multiple ports: port=%s text=<<%s>>'
                                    % (port, cell_text))
                port = int(re.sub(r'\s+', '', cell_text))
            elif re.match(r'HTTPS?|SOCKS[45]', cell_text):
                kind = cell_text
            elif 7 == td_num:  # re.match(r'(?i)low|high')
                level = cell_text

        if port is None:
            raise ValueError('Expected port, found None')

        # we have to grab the style's parent because ElementTree
        # doesn't permit .parentNode and "dom" doesn't have .findall :-/
        #: :type: list[xml.etree.ElementTree.Element]
        style_parent_nl = tr_el.findall('.//*[style]')
        if not style_parent_nl:
            LOG.error('Expected "style" parent, found None in %s', tostring(tr_el))
            return None
        if 1 != len(style_parent_nl):
            LOG.warning('Expected only one style parent, found %d of them',
                        len(style_parent_nl))
        style_parent = style_parent_nl[0]

        #: :type: xml.etree.ElementTree.Element
        sty = style_parent.find('.//style')
        if sty is None:
            LOG.warning('Expected "style" child, found None: %s',
                        tostring(style_parent))
            return None
        # an empty <style/> has .text of None; treat it as "no rules"
        bad_classes = find_display_none_classes(sty.text or '')
        LOG.debug('style[%s] -= %s', sty.text, bad_classes)

        #: :type: list[unicode]
        parts = []

        for ip_el in style_parent:
            LOG.debug('IP_EL=((%s))', tostring(ip_el))
            visible = False
            if 'style' == ip_el.tag:
                # the <style> element contributes no text of its own, but
                # don't continue or you'll eat the .tail text
                # which very well could contain a number or dot
                pass
            elif 'class' in ip_el.attrib:
                # class names are whitespace separated
                css = ip_el.attrib['class'].split()
                if all(name not in bad_classes for name in css):
                    LOG.debug('#class(%s)=%s', css, ip_el.text)
                    visible = True
            elif 'style' in ip_el.attrib:
                st = ip_el.attrib['style']
                # wrap the inline declaration in a fake ".x" rule so the
                # stylesheet matcher can be reused on it
                if 'x' not in find_display_none_classes('.x{%s}' % st):
                    LOG.debug('#style(%s)=%s', st, ip_el.text)
                    visible = True
            elif ip_el.text:
                LOG.warning('??=<<%s>>', ip_el.text)

            # a visible but empty element has .text of None, which would
            # break the join below
            if visible and ip_el.text:
                parts.append(ip_el.text)
            if ip_el.tail:
                parts.append(ip_el.tail)

        if not parts:
            LOG.warning('Your TD contained no IP parts')
            return None

        ip_addr = u''.join(parts)
        if not re.match(r'\d+\.\d+\.\d+\.\d+', ip_addr):
            raise ValueError('That does not appear to be an IP: %s' % ip_addr)
        ip_port = '%s:%d' % (ip_addr, port)
        LOG.debug('ip=%s', ip_port)
        return ip_port, kind, level


    def run_body(body):
        """
        Enumerate and print the proxies found in `body`.

        Every ``<tr rel=...>`` row beneath the element whose id is
        ``listable`` is passed to ``extract_proxy_info``; each result is
        printed as one line of three tab-separated fields (ip:port, proxy
        type, anonymity level).  Rows that produce no result are skipped.
        If the page has no ``listable`` element an error is logged and
        nothing is printed.

        :param body: the HTML text of the proxy list page
        :type body: unicode
        """
        # dom: :type: xml.dom.minidom.Document
        # but using it means giving up .find and friends

        #: :type: xml.etree.ElementTree.Element
        html_el = html5lib.parse(body, namespaceHTMLElements=False)

        #: :type: xml.etree.ElementTree.Element
        tab = html_el.find('.//*[@id="listable"]')
        if tab is None:
            # without this guard the .findall below fails with an opaque
            # AttributeError on None
            LOG.error('Expected an element with id="listable", found None')
            return

        #: :type: list[xml.etree.ElementTree.Element]
        tr_nl = tab.findall('.//tr[@rel]')
        for tr_el in tr_nl:
            info = extract_proxy_info(tr_el)
            if not info:
                continue
            ip, proxy_kind, anon_level = info
            print('%s\t%s\t%s' % (ip, proxy_kind, anon_level))


    def main(argv):
        """
        Command-line entry point: read a saved proxy list page and print
        the proxies it contains.

        Usage: ``prog [-v|--verbose] FILE`` where ``FILE`` is the path of a
        UTF-8 encoded HTML file, or ``-`` to read the page from stdin.
        ``-v``/``--verbose`` switches the module logger to DEBUG.

        :param argv: the full argument vector, program name first
        :type argv: list[str]
        :raises SystemExit: with a usage message when no file is given
        """
        import io
        from getopt import getopt
        opts, args = getopt(argv[1:], 'v', ['verbose'])

        is_verbose = any(flag in ('-v', '--verbose') for flag, _ in opts)
        if is_verbose:
            LOG.setLevel(logging.DEBUG)

        if not args:
            # fail with a readable usage message rather than an IndexError
            # on args[0]
            prog = argv[0] if argv else 'hidemyass_proxylist.py'
            raise SystemExit('Usage: %s [-v|--verbose] <file|->' % prog)

        filename = args[0]
        if '-' == filename:
            raw = sys.stdin.read()
            # Python 2 hands back bytes here, Python 3 already-decoded text
            body = raw.decode('utf-8') if isinstance(raw, bytes) else raw
        else:
            # io.open decodes on both Python 2 and 3; the text returned by
            # the builtin open() on Python 3 has no .decode()
            with io.open(filename, encoding='utf-8') as fh:
                body = fh.read()
        run_body(body)

    # Script entry point: hand the raw argument vector to main() only when
    # this file is executed directly, not when it is imported.
    if '__main__' == __name__:
        main(sys.argv)