
@mowentian
Created August 21, 2014 08:08

hash_parser.py

#!/usr/bin/python
# coding:utf8
# @IgnorePep8

import os
import sys
dirname = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, dirname + '/..')

import subprocess
from cdgen.post_func import send_hash
from crawler.utils import (server_list, get_play_url, get_hid_list,
                           get_hash_list, get_quote_keyward, write_hid_list,
                           build_pipe, send_start_signal)

server_num = len(server_list)
crawler_cmd = ['python', '%s/web_crawler.py' % dirname]
p_crawler = None
def get_hids(keyword, fd):
    """Walk the search result pages until an empty page, collecting hids."""
    global p_crawler
    hid_list = []
    base_url = 'http://www.yezibo.com/SearchPlayFile.aspx?key=%s&OrType=False&taxismode=refweb&lock=True&page=%d'
    page = 1
    while True:
        url = base_url % (keyword, page)
        page += 1
        print url
        page_hid_list = get_hid_list(url)
        if not page_hid_list:
            break
        hid_list += page_hid_list
    hid_list = list(set(hid_list))
    write_hid_list(hid_list)
    send_start_signal(fd)
    # Kick off the URL crawler as a separate process; run() waits on it later.
    p_crawler = subprocess.Popen(crawler_cmd)
    return hid_list

def get_hid_hash(hid):
    hash_list = []
    base_url = '%s/OpenPlayMovie.aspx?hid=%s&modular=15&ntime=%d'

    print 'HID: %s' % hid
    count = 0
    tried = {}
    # Try roughly a tenth of the mirror servers, skipping any play URL that
    # has already failed three times, and stop at the first page with hashes.
    while count < 0.1 * server_num:
        url = get_play_url(base_url % (server_list[count % server_num], hid, count))
        count += 1
        tried[url] = tried.get(url, 0) + 1
        if tried[url] > 3:
            continue
        print 'TRY %02d: %s' % (count, url)
        page_hash_list = get_hash_list(url)
        if not page_hash_list:
            continue
        hash_list += page_hash_list
        break
    hash_list = list(set(hash_list))
    return hash_list

def get_hash(hid_list, failed_hid):
    hash_list = []
    for hid in hid_list:
        hid_hash_list = get_hid_hash(hid)
        hash_list += hid_hash_list
        for hash_s in hid_hash_list:
            send_hash(hash_s)
        if not hid_hash_list:
            failed_hid.append(hid)
    hash_list = list(set(hash_list))
    send_hash('END')
    return hash_list

def run(keyword):
    fd = build_pipe()
    quote_keyward = get_quote_keyward(keyword)

    hid_list = get_hids(quote_keyward, fd)
    failed_hid = []
    print 'HID LIST: %d' % len(hid_list)
    open('/tmp/%s.hid' % keyword, 'w').write(str(hid_list))

    hash_list = get_hash(hid_list, failed_hid)
    print 'HASH LIST: %d' % len(hash_list)
    open('/tmp/%s.hash' % keyword, 'w').write(str(hash_list))

    # Wait for the web_crawler.py subprocess started in get_hids().
    p_crawler.wait()

if __name__ == '__main__':
    run(sys.argv[1])
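
The /tmp/<keyword>.hid and /tmp/<keyword>.hash files written by run() are plain str() dumps of Python lists, so they can be read back with ast.literal_eval, the safe counterpart of the bare eval() used in utils.py below. A minimal sketch (the keyword 'demo' is a hypothetical placeholder):

    import ast
    hid_list = ast.literal_eval(open('/tmp/demo.hid').read())
    hash_list = ast.literal_eval(open('/tmp/demo.hash').read())
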
utils.py

#!/usr/bin/python
# coding:gbk
# @IgnorePep8

import os
import sys
import urllib
import urllib2

# Mirror servers of the same Qvod search site; note that
# 'http://www.yezibo.com' appears twice in the original list.
server_list = [
    'http://www.yezibo.com', 'http://www.dadou.tv', 'http://www.qso365.com',
    'http://www.qvodso.cc', 'http://www.6so.cc', 'http://www.babykan.com',
    'http://www.77sou.net', 'http://www.yes80.net', 'http://www.76du.cn',
    'http://www.9ishou.com', 'http://www.zhiyehui.net', 'http://www.100soo.cn',
    'http://www.yezibo.com', 'http://www.ivdy.cc', 'http://www.h888.net',
    'http://www.kkso.cc', 'http://www.ediansou.com', 'http://www.bobo1314.com',
    'http://www.19taoba.com', 'http://www.36so.com', 'http://www.dy135.com',
    'http://www.nr54.com',
    # 'http://www.9skb.com',
]
pipe_name = '/tmp/START_SIGNAL'

def get_html_text(url):
    # Fetch a page and decode %-escapes; returns '' on any network error.
    text = ''
    try:
        responce = urllib2.urlopen(url, timeout=60)
        html = responce.read()
        text = urllib.url2pathname(html)
    except Exception:
        pass
    return text

def get_play_url(url):
    # Follow the JavaScript redirect on the "open play" page; fall back to
    # the final HTTP URL, or 'http://' if the request fails outright.
    play_url = ''
    res = None
    try:
        res = urllib2.urlopen(url, timeout=60)
        text = res.read()
        l = text.split('window.location.href=\'', 2)
        if len(l) >= 2:
            play_url = l[1].split('\'</script>')[0]
    except Exception:
        return 'http://'
    if play_url:
        return play_url
    else:
        return res.geturl()
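# Example (hypothetical page content, for illustration): a response body such as
#   <script>window.location.href='http://www.yezibo.com/play?x=1'</script>
# makes get_play_url() return 'http://www.yezibo.com/play?x=1'; pages without
# that snippet fall back to the post-redirect URL from res.geturl().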

def get_http_url_list(url):
    url_list = []
    text = get_html_text(url)
    l = text.split('href=\'http://')
    if len(l) >= 2:
        l = l[1:]
        for i in l:
            u = i.split('\'', 2)[0]
            if u:
                url_list.append('http://%s' % u)

    l = text.split('href=\"http://')
    if len(l) >= 2:
        l = l[1:]
        for i in l:
            u = i.split('\"', 2)[0]
            if u:
                url_list.append('http://%s' % u)
    return url_list

def get_hid_list(url):
    hid_list = []
    text = get_html_text(url)

    #l = text.split('openBox(')
    #if len(l) >= 2:
    #    l = l[1:]
    #    for i in l:
    #        hid = i.split(',', 2)[0]
    #        if hid:
    #            hid_list.append(hid)

    l = text.split('openPlay(\'')
    if len(l) >= 2:
        l = l[1:]
        for i in l:
            hid = i.split('\')', 2)[0]
            if hid:
                hid_list.append(hid)

    l = text.split('openPlay(\"')
    if len(l) >= 2:
        l = l[1:]
        for i in l:
            hid = i.split('\")', 2)[0]
            if hid:
                hid_list.append(hid)
    return hid_list

def get_hash_list(url):
    hash_list = []
    text = get_html_text(url)
    l = text.split('qvod://')
    if len(l) >= 2:
        l = l[1:]
        for i in l:
            hl = i.split('|', 3)
            if len(hl) >= 3:
                hash_list.append(hl[1])
    return hash_list
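# Example (hypothetical input, for illustration): a page containing
#   qvod://123456789|0123456789ABCDEF0123456789ABCDEF01234567|movie.rmvb|
# yields ['0123456789ABCDEF0123456789ABCDEF01234567']; the hash is the
# second '|'-separated field after the qvod:// prefix.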

def get_quote_keyward(keyword):
    # Re-encode the command-line keyword to GBK (the site's encoding) and
    # percent-quote it; assumes sys.stdin.encoding matches the terminal.
    keyword = keyword.decode(sys.stdin.encoding).encode('gbk')
    keyword = urllib.quote(keyword)
    return keyword

def get_tn(keyword):
    tn = keyword.decode('utf8')
    return tn

def write_hid_list(hid_list):
    fout = open('/tmp/hid_list', 'wb')
    fout.write(str(hid_list))
    fout.close()

def read_hid_list():
    fin = open('/tmp/hid_list', 'rb')
    hid_text = fin.read()
    fin.close()
    # The file is a str() dump of a list, so eval() round-trips it.
    hid_list = eval(hid_text)
    return hid_list

def write_hash_url_dic(hash_url):
    fout = open('/tmp/hash_url_dic', 'wb')
    fout.write(str(hash_url))
    fout.close()

def read_hash_url_dic():
    fin = open('/tmp/hash_url_dic', 'rb')
    hash_url_text = fin.read()
    fin.close()
    hash_url = eval(hash_url_text)
    return hash_url

def build_pipe():
    # Despite the name, this creates an ordinary file (O_CREAT, not mkfifo)
    # that both processes use as a start-signal channel.
    fd = os.open(pipe_name, os.O_NONBLOCK | os.O_CREAT | os.O_RDWR)
    return fd

def send_start_signal(fd):
    os.write(fd, "START")

def wait_start_signal():
    # Busy-poll the signal file until the parent has written 'START'.
    fd = os.open(pipe_name, os.O_RDONLY)
    while os.read(fd, 5) != 'START':
        continue
    os.close(fd)
    os.remove(pipe_name)
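
Since build_pipe() creates a regular file rather than a named pipe, wait_start_signal() has to busy-poll. A true FIFO would let the reader block instead; a minimal sketch of that alternative, assuming the same pipe_name:

    import os
    if not os.path.exists(pipe_name):
        os.mkfifo(pipe_name)               # a real FIFO instead of a plain file
    fd = os.open(pipe_name, os.O_RDONLY)   # blocks until a writer opens it
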
web_crawler.py

#!/usr/bin/python
# coding:utf8
# @IgnorePep8

import os
import sys
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)) + '/..')

from crawler.utils import (server_list, get_http_url_list, get_hash_list,
                           read_hid_list, write_hash_url_dic,
                           read_hash_url_dic, wait_start_signal)

def get_hid_server_url_list(hid, server):
    url = '%s/ShowMovie.aspx?name=v&hid=%s&tn=v' % (server, hid)
    url_list = get_http_url_list(url)
    return url_list

def get_hid_url_list(hid):
    hid_url_list = []
    for server in server_list:
        hid_server_url_list = get_hid_server_url_list(hid, server)
        hid_url_list += hid_server_url_list
    hid_url_list = list(set(hid_url_list))
    return hid_url_list

def get_url_list(hid_list):
    url_list = []
    for hid in hid_list:
        hid_url_list = get_hid_url_list(hid)
        url_list += hid_url_list
        print 'HID: %s URL NUM: %s' % (hid, len(hid_url_list))
    url_list = list(set(url_list))
    print 'URL NUM: %s' % len(url_list)
    return url_list

def get_hash_url_dic(url_list):
    # Invert the crawl: map each qvod hash to the list of pages it appears on.
    hash_url = {}
    for url in url_list:
        hash_list = get_hash_list(url)
        for hash_s in hash_list:
            l = hash_url.get(hash_s, [])
            l.append(url)
            hash_url[hash_s] = l
    for hash_s in hash_url.keys():
        hash_url[hash_s] = list(set(hash_url.get(hash_s, [])))
    return hash_url

def search_url(hash_s):
    hash_url = read_hash_url_dic()
    url_list = hash_url.get(hash_s, [])
    print 'HASH: %s\n%s' % (hash_s, '\t' + '\n\t'.join(url_list))
    open('result', 'a').write('HASH: %s\n%s' % (hash_s, '\t' + '\n\t'.join(url_list)))
    return url_list

def run():
    hid_list = read_hid_list()
    url_list = get_url_list(hid_list)
    hash_url = get_hash_url_dic(url_list)
    write_hash_url_dic(hash_url)

if __name__ == '__main__':
    #print 'WAITING FOR HID LIST......'
    #wait_start_signal()
    run()
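
Once web_crawler.py's run() has written /tmp/hash_url_dic, search_url() can be used on its own to map a hash back to the pages it appeared on. A usage sketch, assuming the crawler package layout implied by the imports above (the hash value is a hypothetical placeholder):

    from crawler.web_crawler import search_url
    urls = search_url('0123456789ABCDEF0123456789ABCDEF01234567')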