#!/usr/bin/python
# coding:utf8
# @IgnorePep8
# Usage: run with a single search keyword as the first argument.
import os
import sys

dirname = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, dirname + '/..')

import subprocess

from cdgen.post_func import send_hash
from crawler.utils import (server_list, get_play_url, get_hid_list,
                           get_hash_list, get_quote_keyward, write_hid_list,
                           build_pipe, send_start_signal)

server_num = len(server_list)
crawler_cmd = ['python', '%s/web_crawler.py' % dirname]
p_crawler = None


def get_hids(keyword, fd):
    """Collect all hids for a keyword by paging through the search results,
    then write them out, signal readiness, and launch the crawler subprocess."""
    global p_crawler
    hid_list = []
    base_url = ('http://www.yezibo.com/SearchPlayFile.aspx'
                '?key=%s&OrType=False&taxismode=refweb&lock=True&page=%d')
    page = 1
    while True:
        url = base_url % (keyword, page)
        page += 1
        print url
        page_hid_list = get_hid_list(url)
        if not page_hid_list:
            # An empty page means we have walked past the last result page.
            break
        hid_list += page_hid_list
    hid_list = list(set(hid_list))
    write_hid_list(hid_list)
    send_start_signal(fd)
    p_crawler = subprocess.Popen(crawler_cmd)
    return hid_list


def get_hid_hash(hid):
    """Resolve one hid to its hash list, rotating through the play servers
    until one answers or the attempt budget runs out."""
    hash_list = []
    base_url = '%s/OpenPlayMovie.aspx?hid=%s&modular=15&ntime=%d'
    print 'HID: %s' % hid
    count = 0
    tried = {}
    while count < 0.1 * server_num:
        url = get_play_url(base_url % (server_list[count % server_num], hid, count))
        count += 1
        tried[url] = tried.get(url, 0) + 1
        if tried[url] > 3:
            # Do not hit the same resolved URL more than three times.
            continue
        print 'TRY %02d: %s' % (count, url)
        page_hash_list = get_hash_list(url)
        if not page_hash_list:
            continue
        hash_list += page_hash_list
        break
    hash_list = list(set(hash_list))
    return hash_list


def get_hash(hid_list, failed_hid):
    """Fetch hashes for every hid, stream each one to the crawler via
    send_hash, and record hids that yielded nothing in failed_hid."""
    hash_list = []
    for hid in hid_list:
        hid_hash_list = get_hid_hash(hid)
        hash_list += hid_hash_list
        for hash_s in hid_hash_list:
            send_hash(hash_s)
        if not hid_hash_list:
            failed_hid.append(hid)
    hash_list = list(set(hash_list))
    send_hash('END')
    return hash_list


def run(keyword):
    fd = build_pipe()
    quote_keyward = get_quote_keyward(keyword)
    hid_list = get_hids(quote_keyward, fd)
    failed_hid = []
    print 'HID LIST: %d' % len(hid_list)
    with open('/tmp/%s.hid' % keyword, 'w') as f:
        f.write(str(hid_list))
    hash_list = get_hash(hid_list, failed_hid)
    print 'HASH LIST: %d' % len(hash_list)
    with open('/tmp/%s.hash' % keyword, 'w') as f:
        f.write(str(hash_list))
    p_crawler.wait()


if __name__ == '__main__':
    run(sys.argv[1])