
@mowentian
Created August 21, 2014 08:08

hash_parser.py

#!/usr/bin/python
# coding:utf8
# @IgnorePep8

import os
import sys
dirname = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, dirname + '/..')

import subprocess
from cdgen.post_func import send_hash
from crawler.utils import (server_list, get_play_url, get_hid_list,
                           get_hash_list, get_quote_keyward, write_hid_list,
                           build_pipe, send_start_signal)

server_num = len(server_list)
crawler_cmd = ['python', '%s/web_crawler.py' % dirname]
p_crawler = None
def get_hids(keyword, fd):
    """Walk the search result pages until an empty page, collecting hids."""
    global p_crawler
    hid_list = []
    base_url = 'http://www.yezibo.com/SearchPlayFile.aspx?key=%s&OrType=False&taxismode=refweb&lock=True&page=%d'
    page = 1
    while True:
        url = base_url % (keyword, page)
        page += 1
        print url
        page_hid_list = get_hid_list(url)
        if not page_hid_list:
            break
        hid_list += page_hid_list
    hid_list = list(set(hid_list))
    write_hid_list(hid_list)
    send_start_signal(fd)
    # Kick off the URL crawler as a separate process; run() waits on it later.
    p_crawler = subprocess.Popen(crawler_cmd)
    return hid_list

def get_hid_hash(hid):
    hash_list = []
    base_url = '%s/OpenPlayMovie.aspx?hid=%s&modular=15&ntime=%d'

    print 'HID: %s' % hid
    count = 0
    tried = {}
    # Try roughly a tenth of the mirror servers, skipping any play URL that
    # has already failed three times, and stop at the first page with hashes.
    while count < 0.1 * server_num:
        url = get_play_url(base_url % (server_list[count % server_num], hid, count))
        count += 1
        tried[url] = tried.get(url, 0) + 1
        if tried[url] > 3:
            continue
        print 'TRY %02d: %s' % (count, url)
        page_hash_list = get_hash_list(url)
        if not page_hash_list:
            continue
        hash_list += page_hash_list
        break
    hash_list = list(set(hash_list))
    return hash_list

def get_hash(hid_list, failed_hid):
    hash_list = []
    for hid in hid_list:
        hid_hash_list = get_hid_hash(hid)
        hash_list += hid_hash_list
        for hash_s in hid_hash_list:
            send_hash(hash_s)
        if not hid_hash_list:
            failed_hid.append(hid)
    hash_list = list(set(hash_list))
    send_hash('END')
    return hash_list

def run(keyword):
    fd = build_pipe()
    quote_keyward = get_quote_keyward(keyword)

    hid_list = get_hids(quote_keyward, fd)
    failed_hid = []
    print 'HID LIST: %d' % len(hid_list)
    open('/tmp/%s.hid' % keyword, 'w').write(str(hid_list))

    hash_list = get_hash(hid_list, failed_hid)
    print 'HASH LIST: %d' % len(hash_list)
    open('/tmp/%s.hash' % keyword, 'w').write(str(hash_list))

    # Wait for the web_crawler.py subprocess started in get_hids().
    p_crawler.wait()

if __name__ == '__main__':
    run(sys.argv[1])
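
The /tmp/<keyword>.hid and /tmp/<keyword>.hash files written by run() are plain str() dumps of Python lists, so they can be read back with ast.literal_eval, the safe counterpart of the bare eval() used in utils.py below. A minimal sketch (the keyword 'demo' is a hypothetical placeholder):

    import ast
    hid_list = ast.literal_eval(open('/tmp/demo.hid').read())
    hash_list = ast.literal_eval(open('/tmp/demo.hash').read())
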
utils.py

#!/usr/bin/python
# coding:gbk
# @IgnorePep8

import os
import sys
import urllib
import urllib2

# Mirror servers of the same Qvod search site; note that
# 'http://www.yezibo.com' appears twice in the original list.
server_list = [
    'http://www.yezibo.com', 'http://www.dadou.tv', 'http://www.qso365.com',
    'http://www.qvodso.cc', 'http://www.6so.cc', 'http://www.babykan.com',
    'http://www.77sou.net', 'http://www.yes80.net', 'http://www.76du.cn',
    'http://www.9ishou.com', 'http://www.zhiyehui.net', 'http://www.100soo.cn',
    'http://www.yezibo.com', 'http://www.ivdy.cc', 'http://www.h888.net',
    'http://www.kkso.cc', 'http://www.ediansou.com', 'http://www.bobo1314.com',
    'http://www.19taoba.com', 'http://www.36so.com', 'http://www.dy135.com',
    'http://www.nr54.com',
    # 'http://www.9skb.com',
]
pipe_name = '/tmp/START_SIGNAL'

def get_html_text(url):
    # Fetch a page and decode %-escapes; returns '' on any network error.
    text = ''
    try:
        responce = urllib2.urlopen(url, timeout=60)
        html = responce.read()
        text = urllib.url2pathname(html)
    except Exception:
        pass
    return text

def get_play_url(url):
    # Follow the JavaScript redirect on the "open play" page; fall back to
    # the final HTTP URL, or 'http://' if the request fails outright.
    play_url = ''
    res = None
    try:
        res = urllib2.urlopen(url, timeout=60)
        text = res.read()
        l = text.split('window.location.href=\'', 2)
        if len(l) >= 2:
            play_url = l[1].split('\'</script>')[0]
    except Exception:
        return 'http://'
    if play_url:
        return play_url
    else:
        return res.geturl()
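# Example (hypothetical page content, for illustration): a response body such as
#   <script>window.location.href='http://www.yezibo.com/play?x=1'</script>
# makes get_play_url() return 'http://www.yezibo.com/play?x=1'; pages without
# that snippet fall back to the post-redirect URL from res.geturl().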

def get_http_url_list(url):
    url_list = []
    text = get_html_text(url)
    l = text.split('href=\'http://')
    if len(l) >= 2:
        l = l[1:]
        for i in l:
            u = i.split('\'', 2)[0]
            if u:
                url_list.append('http://%s' % u)

    l = text.split('href=\"http://')
    if len(l) >= 2:
        l = l[1:]
        for i in l:
            u = i.split('\"', 2)[0]
            if u:
                url_list.append('http://%s' % u)
    return url_list

def get_hid_list(url):
    hid_list = []
    text = get_html_text(url)

    #l = text.split('openBox(')
    #if len(l) >= 2:
    #    l = l[1:]
    #    for i in l:
    #        hid = i.split(',', 2)[0]
    #        if hid:
    #            hid_list.append(hid)

    l = text.split('openPlay(\'')
    if len(l) >= 2:
        l = l[1:]
        for i in l:
            hid = i.split('\')', 2)[0]
            if hid:
                hid_list.append(hid)

    l = text.split('openPlay(\"')
    if len(l) >= 2:
        l = l[1:]
        for i in l:
            hid = i.split('\")', 2)[0]
            if hid:
                hid_list.append(hid)
    return hid_list

def get_hash_list(url):
    hash_list = []
    text = get_html_text(url)
    l = text.split('qvod://')
    if len(l) >= 2:
        l = l[1:]
        for i in l:
            hl = i.split('|', 3)
            if len(hl) >= 3:
                hash_list.append(hl[1])
    return hash_list
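# Example (hypothetical input, for illustration): a page containing
#   qvod://123456789|0123456789ABCDEF0123456789ABCDEF01234567|movie.rmvb|
# yields ['0123456789ABCDEF0123456789ABCDEF01234567']; the hash is the
# second '|'-separated field after the qvod:// prefix.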

def get_quote_keyward(keyword):
    # Re-encode the command-line keyword to GBK (the site's encoding) and
    # percent-quote it; assumes sys.stdin.encoding matches the terminal.
    keyword = keyword.decode(sys.stdin.encoding).encode('gbk')
    keyword = urllib.quote(keyword)
    return keyword

def get_tn(keyword):
    tn = keyword.decode('utf8')
    return tn

def write_hid_list(hid_list):
    fout = open('/tmp/hid_list', 'wb')
    fout.write(str(hid_list))
    fout.close()

def read_hid_list():
    fin = open('/tmp/hid_list', 'rb')
    hid_text = fin.read()
    fin.close()
    # The file is a str() dump of a list, so eval() round-trips it.
    hid_list = eval(hid_text)
    return hid_list

def write_hash_url_dic(hash_url):
    fout = open('/tmp/hash_url_dic', 'wb')
    fout.write(str(hash_url))
    fout.close()

def read_hash_url_dic():
    fin = open('/tmp/hash_url_dic', 'rb')
    hash_url_text = fin.read()
    fin.close()
    hash_url = eval(hash_url_text)
    return hash_url

def build_pipe():
    # Despite the name, this creates an ordinary file (O_CREAT, not mkfifo)
    # that both processes use as a start-signal channel.
    fd = os.open(pipe_name, os.O_NONBLOCK | os.O_CREAT | os.O_RDWR)
    return fd

def send_start_signal(fd):
    os.write(fd, "START")

def wait_start_signal():
    # Busy-poll the signal file until the parent has written 'START'.
    fd = os.open(pipe_name, os.O_RDONLY)
    while os.read(fd, 5) != 'START':
        continue
    os.close(fd)
    os.remove(pipe_name)
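
Since build_pipe() creates a regular file rather than a named pipe, wait_start_signal() has to busy-poll. A true FIFO would let the reader block instead; a minimal sketch of that alternative, assuming the same pipe_name:

    import os
    if not os.path.exists(pipe_name):
        os.mkfifo(pipe_name)               # a real FIFO instead of a plain file
    fd = os.open(pipe_name, os.O_RDONLY)   # blocks until a writer opens it
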
web_crawler.py

#!/usr/bin/python
# coding:utf8
# @IgnorePep8

import os
import sys
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)) + '/..')

from crawler.utils import (server_list, get_http_url_list, get_hash_list,
                           read_hid_list, write_hash_url_dic,
                           read_hash_url_dic, wait_start_signal)

def get_hid_server_url_list(hid, server):
    url = '%s/ShowMovie.aspx?name=v&hid=%s&tn=v' % (server, hid)
    url_list = get_http_url_list(url)
    return url_list

def get_hid_url_list(hid):
    hid_url_list = []
    for server in server_list:
        hid_server_url_list = get_hid_server_url_list(hid, server)
        hid_url_list += hid_server_url_list
    hid_url_list = list(set(hid_url_list))
    return hid_url_list

def get_url_list(hid_list):
    url_list = []
    for hid in hid_list:
        hid_url_list = get_hid_url_list(hid)
        url_list += hid_url_list
        print 'HID: %s URL NUM: %s' % (hid, len(hid_url_list))
    url_list = list(set(url_list))
    print 'URL NUM: %s' % len(url_list)
    return url_list

def get_hash_url_dic(url_list):
    # Invert the crawl: map each qvod hash to the list of pages it appears on.
    hash_url = {}
    for url in url_list:
        hash_list = get_hash_list(url)
        for hash_s in hash_list:
            l = hash_url.get(hash_s, [])
            l.append(url)
            hash_url[hash_s] = l
    for hash_s in hash_url.keys():
        hash_url[hash_s] = list(set(hash_url.get(hash_s, [])))
    return hash_url

def search_url(hash_s):
    hash_url = read_hash_url_dic()
    url_list = hash_url.get(hash_s, [])
    print 'HASH: %s\n%s' % (hash_s, '\t' + '\n\t'.join(url_list))
    open('result', 'a').write('HASH: %s\n%s' % (hash_s, '\t' + '\n\t'.join(url_list)))
    return url_list

def run():
    hid_list = read_hid_list()
    url_list = get_url_list(hid_list)
    hash_url = get_hash_url_dic(url_list)
    write_hash_url_dic(hash_url)

if __name__ == '__main__':
    #print 'WAITING FOR HID LIST......'
    #wait_start_signal()
    run()
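
Once web_crawler.py's run() has written /tmp/hash_url_dic, search_url() can be used on its own to map a hash back to the pages it appeared on. A usage sketch, assuming the crawler package layout implied by the imports above (the hash value is a hypothetical placeholder):

    from crawler.web_crawler import search_url
    urls = search_url('0123456789ABCDEF0123456789ABCDEF01234567')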