#!/usr/bin/python
# coding:utf8
# @IgnorePep8
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)) + '/..')
from crawler.utils import server_list, get_http_url_list, get_hash_list, \
    read_hid_list, write_hash_url_dic, read_hash_url_dic, wait_start_signal


def get_hid_server_url_list(hid, server):
    # Build the ShowMovie query URL for this hid on one server and
    # return every HTTP URL scraped from the resulting page.
    url = '%s/ShowMovie.aspx?name=v&hid=%s&tn=v' % (server, hid)
    return get_http_url_list(url)


def get_hid_url_list(hid):
    # Collect the URLs for one hid across all servers, de-duplicated.
    hid_url_list = []
    for server in server_list:
        hid_url_list += get_hid_server_url_list(hid, server)
    return list(set(hid_url_list))


def get_url_list(hid_list):
    # Collect the URLs for every hid, de-duplicating the combined list.
    url_list = []
    for hid in hid_list:
        hid_url_list = get_hid_url_list(hid)
        url_list += hid_url_list
        print 'HID: %s URL NUM: %s' % (hid, len(hid_url_list))
    url_list = list(set(url_list))
    print 'URL NUM: %s' % len(url_list)
    return url_list


def get_hash_url_dic(url_list):
    # Invert the crawl: map each hash found on a page to the list of
    # URLs it appeared on, with each URL list de-duplicated.
    hash_url = {}
    for url in url_list:
        for hash_s in get_hash_list(url):
            hash_url.setdefault(hash_s, []).append(url)
    for hash_s in hash_url.keys():
        hash_url[hash_s] = list(set(hash_url[hash_s]))
    return hash_url


def search_url(hash_s):
    # Look up the URLs recorded for a hash, print them, and append the
    # same report to the 'result' file.
    hash_url = read_hash_url_dic()
    url_list = hash_url.get(hash_s, [])
    report = 'HASH: %s\n%s' % (hash_s, '\t' + '\n\t'.join(url_list))
    print report
    with open('result', 'a') as result_file:
        result_file.write(report + '\n')
    return url_list


def run():
    hid_list = read_hid_list()
    url_list = get_url_list(hid_list)
    hash_url = get_hash_url_dic(url_list)
    write_hash_url_dic(hash_url)


if __name__ == '__main__':
    # print 'WAITING FOR HID LIST......'
    # wait_start_signal()
    run()
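
# Usage sketch (not part of the original script; the module path below is
# hypothetical and the hash value is just a placeholder): after run() has
# crawled the hid list and persisted the {hash: [url, ...]} dictionary via
# write_hash_url_dic(), a later lookup can reuse search_url(), which reads
# the dictionary back with read_hash_url_dic():
#
#     >>> from crawler.hash_url import search_url  # hypothetical module path
#     >>> matching_urls = search_url('d41d8cd98f00b204e9800998ecf8427e')
#     >>> len(matching_urls)  # number of crawled pages carrying that hash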