Created
August 21, 2014 08:08
-
-
Save mowentian/d5fb8ffe15a792edc97f to your computer and use it in GitHub Desktop.
Revisions
-
mowentian created this gist
Aug 21, 2014. There are no files selected for viewing
# NOTE: This listing holds THREE separate files from one gist, each introduced
# on the gist page by a diff hunk header and a banner warning that the file
# "contains hidden or bidirectional Unicode text" (review the originals in an
# editor that reveals hidden Unicode characters).  Two of the files define
# run() and a __main__ guard, so split them into their own files before use.

# ===========================================================================
# Gist file 1 of 3  (diff hunk: @@ -0,0 +1,88 @@)
# Driver script: searches for a keyword, then resolves every hit to hashes.
# ===========================================================================
#!/usr/bin/python
# coding:utf8
# @IgnorePep8
import os
import sys
dirname = os.path.dirname(os.path.abspath(__file__))
# Make the parent directory importable before the project imports below.
sys.path.insert(0, dirname + '/..')
import subprocess

from cdgen.post_func import send_hash
from crawler.utils import server_list, get_play_url, get_hid_list, get_hash_list, get_quote_keyward, write_hid_list, build_pipe, send_start_signal

server_num = len(server_list)
crawler_cmd = ['python', '%s/web_crawler.py' % dirname]
# Handle of the web_crawler.py subprocess; set by get_hids(), awaited by run().
p_crawler = None


def get_hids(keyword, fd):
    """Collect every hid found on the search result pages for *keyword*.

    Pages are requested one after another until a page yields no hid.  The
    de-duplicated list is stored with write_hid_list(), the start signal is
    written to *fd*, and the crawler subprocess is launched.

    :param keyword: already URL-quoted search keyword.
    :param fd: file descriptor handed to send_start_signal().
    :returns: list of unique hids (order not preserved).
    """
    global p_crawler
    hid_list = []
    base_url = 'http://www.yezibo.com/SearchPlayFile.aspx?key=%s&OrType=False&taxismode=refweb&lock=True&page=%d'
    page = 1
    while True:
        url = base_url % (keyword, page)
        page += 1
        print(url)
        page_hid_list = get_hid_list(url)
        if not page_hid_list:
            # An empty page marks the end of the result set.
            break
        hid_list += page_hid_list
    hid_list = list(set(hid_list))
    write_hid_list(hid_list)
    send_start_signal(fd)
    p_crawler = subprocess.Popen(crawler_cmd)
    return hid_list


def get_hid_hash(hid):
    """Return the unique hashes found on the first play page that has any.

    :param hid: identifier taken from the search results.
    :returns: list of unique hashes, empty when no tried server yielded one.
    """
    hash_list = []
    base_url = '%s/OpenPlayMovie.aspx?hid=%s&modular=15&ntime=%d'
    print('HID: %s' % hid)
    count = 0
    tried = {}
    # NOTE(review): the bound is 0.1 * server_num, so only about a tenth of
    # server_list is ever tried -- confirm this limit is intentional.
    while count < 0.1 * server_num:
        url = get_play_url(base_url % (server_list[count % server_num], hid, count))
        count += 1
        tried[url] = tried.get(url, 0) + 1
        if tried[url] > 3:
            # The same resolved URL came back more than three times; skip it.
            continue
        print('TRY %02d: %s' % (count, url))
        page_hash_list = get_hash_list(url)
        if not page_hash_list:
            continue
        hash_list += page_hash_list
        break
    hash_list = list(set(hash_list))
    return hash_list


def get_hash(hid_list, failed_hid):
    """Resolve each hid to its hashes and forward every hash via send_hash().

    :param hid_list: hids to resolve.
    :param failed_hid: list that is appended to (in place) with every hid
        for which no hash was found.
    :returns: list of unique hashes over all hids.
    """
    hash_list = []
    for hid in hid_list:
        hid_hash_list = get_hid_hash(hid)
        hash_list += hid_hash_list
        for hash_s in hid_hash_list:
            send_hash(hash_s)
        if not hid_hash_list:
            failed_hid.append(hid)
    hash_list = list(set(hash_list))
    # 'END' is sent after the last hash as a terminator.
    send_hash('END')
    return hash_list


def run(keyword):
    """Run the whole search for *keyword* and wait for the crawler to exit.

    The hid and hash lists are dumped as their str() form to
    /tmp/<keyword>.hid and /tmp/<keyword>.hash.
    """
    fd = build_pipe()
    quote_keyward = get_quote_keyward(keyword)
    hid_list = get_hids(quote_keyward, fd)
    failed_hid = []
    print('HID LIST: %d' % len(hid_list))
    # "with" guarantees the dump files are flushed and closed.
    with open('/tmp/%s.hid' % keyword, 'w') as fout:
        fout.write(str(hid_list))
    hash_list = get_hash(hid_list, failed_hid)
    print('HASH LIST: %d' % len(hash_list))
    with open('/tmp/%s.hash' % keyword, 'w') as fout:
        fout.write(str(hash_list))
    p_crawler.wait()


if __name__ == '__main__':
    run(sys.argv[1])


# ===========================================================================
# Gist file 2 of 3  (diff hunk: @@ -0,0 +1,143 @@)
# Helper module -- presumably crawler/utils.py, judging by the imports in the
# other two files (TODO confirm).
# ===========================================================================
#!/usr/bin/python
#coding:gbk
# @IgnorePep8
import os
import sys
import urllib
import urllib2

server_list = [
    'http://www.yezibo.com',
    'http://www.dadou.tv',
    'http://www.qso365.com',
    'http://www.qvodso.cc',
    'http://www.6so.cc',
    'http://www.babykan.com',
    'http://www.77sou.net',
    'http://www.yes80.net',
    'http://www.76du.cn',
    'http://www.9ishou.com',
    'http://www.zhiyehui.net',
    'http://www.100soo.cn',
    'http://www.yezibo.com',
    'http://www.ivdy.cc',
    'http://www.h888.net',
    'http://www.kkso.cc',
    'http://www.ediansou.com',
    'http://www.bobo1314.com',
    'http://www.19taoba.com',
    'http://www.36so.com',
    'http://www.dy135.com',
    'http://www.nr54.com',
]  #,'http://www.9skb.com']
pipe_name = '/tmp/START_SIGNAL'


def _extract_between(text, prefix, terminator):
    """Return each non-empty substring that follows *prefix* up to *terminator*."""
    found = []
    # Element 0 of the split is the text before the first prefix; skip it.
    for chunk in text.split(prefix)[1:]:
        value = chunk.split(terminator, 1)[0]
        if value:
            found.append(value)
    return found


def get_html_text(url):
    """Fetch *url* and return its body passed through urllib.url2pathname().

    Best effort: any failure (network error, timeout, ...) yields ''.
    """
    try:
        response = urllib2.urlopen(url, timeout=60)
        try:
            html = response.read()
        finally:
            # Close the connection even when read() fails.
            response.close()
        return urllib.url2pathname(html)
    except Exception:
        # Deliberately swallowed: callers treat '' as "nothing found".
        return ''


def get_play_url(url):
    """Resolve *url* to the play URL it points at.

    If the page contains a ``window.location.href='...'</script>`` redirect,
    that target is returned; otherwise the URL the request ended up at is
    returned.  On any request failure the placeholder 'http://' is returned.
    """
    try:
        res = urllib2.urlopen(url, timeout=60)
        try:
            final_url = res.geturl()
            text = res.read()
        finally:
            res.close()
    except Exception:
        return 'http://'
    parts = text.split('window.location.href=\'', 2)
    if len(parts) >= 2:
        play_url = parts[1].split('\'</script>')[0]
        if play_url:
            return play_url
    return final_url


def get_http_url_list(url):
    """Return every absolute http:// href on the page at *url*.

    Single-quoted hrefs come first, then double-quoted ones; duplicates are
    kept.
    """
    text = get_html_text(url)
    url_list = []
    for prefix, quote in (('href=\'http://', '\''), ('href=\"http://', '\"')):
        for rest in _extract_between(text, prefix, quote):
            url_list.append('http://%s' % rest)
    return url_list


def get_hid_list(url):
    """Return the argument of every ``openPlay('...')`` / ``openPlay("...")``
    call found on the page at *url* (single-quoted matches first)."""
    text = get_html_text(url)
    hid_list = []
    for prefix, terminator in (('openPlay(\'', '\')'), ('openPlay(\"', '\")')):
        hid_list.extend(_extract_between(text, prefix, terminator))
    return hid_list


def get_hash_list(url):
    """Return the second '|'-separated field of every ``qvod://`` link on the
    page at *url*; links with fewer than three fields are ignored."""
    hash_list = []
    text = get_html_text(url)
    for chunk in text.split('qvod://')[1:]:
        fields = chunk.split('|', 3)
        if len(fields) >= 3:
            hash_list.append(fields[1])
    return hash_list


def get_quote_keyward(keyword):
    """Re-encode *keyword* from the terminal encoding to GBK and URL-quote it."""
    # sys.stdin.encoding is None when stdin is not a terminal (e.g. piped),
    # which would make decode() raise TypeError; fall back to utf8 then.
    source_encoding = sys.stdin.encoding or 'utf8'
    keyword = keyword.decode(source_encoding).encode('gbk')
    return urllib.quote(keyword)


def get_tn(keyword):
    """Decode the utf8 byte string *keyword* to unicode."""
    tn = keyword.decode('utf8')
    return tn


def write_hid_list(hid_list):
    """Store str(hid_list) in /tmp/hid_list."""
    with open('/tmp/hid_list', 'wb') as fout:
        fout.write(str(hid_list))


def read_hid_list():
    """Load the list previously stored by write_hid_list()."""
    with open('/tmp/hid_list', 'rb') as fin:
        hid_text = fin.read()
    # SECURITY NOTE(review): eval() executes whatever is in the file, and
    # /tmp/hid_list lives in a shared directory.  Consider ast.literal_eval.
    hid_list = eval(hid_text)
    return hid_list


def write_hash_url_dic(hash_url):
    """Store str(hash_url) in /tmp/hash_url_dic."""
    with open('/tmp/hash_url_dic', 'wb') as fout:
        fout.write(str(hash_url))


def read_hash_url_dic():
    """Load the dict previously stored by write_hash_url_dic()."""
    with open('/tmp/hash_url_dic', 'rb') as fin:
        hash_url_text = fin.read()
    # SECURITY NOTE(review): same eval() concern as in read_hid_list().
    hash_url = eval(hash_url_text)
    return hash_url


def build_pipe():
    """Open (creating if needed) pipe_name read/write, non-blocking.

    :returns: the OS-level file descriptor.
    """
    # NOTE(review): O_CREAT creates a regular file, not a FIFO -- confirm
    # that is what the reader side expects.
    fd = os.open(pipe_name, os.O_NONBLOCK | os.O_CREAT | os.O_RDWR)
    return fd


def send_start_signal(fd):
    """Write the literal "START" to *fd*."""
    os.write(fd, "START")


def wait_start_signal():
    """Spin until "START" is read from pipe_name, then delete pipe_name."""
    fd = os.open(pipe_name, os.O_RDONLY)
    # Busy-waits: every iteration attempts another 5-byte read.
    while os.read(fd, 5) != 'START':
        continue
    os.close(fd)
    os.remove(pipe_name)


# ===========================================================================
# Gist file 3 of 3  (diff hunk: @@ -0,0 +1,63 @@)
# Crawler -- presumably the web_crawler.py launched by file 1 (TODO confirm).
# ===========================================================================
#!/usr/bin/python
# coding:utf8
# @IgnorePep8
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)) + '/..')
from crawler.utils import server_list, get_http_url_list, get_hash_list, read_hid_list, write_hash_url_dic, read_hash_url_dic, wait_start_signal


def get_hid_server_url_list(hid, server):
    """Return the http:// links on *server*'s ShowMovie page for *hid*."""
    url = '%s/ShowMovie.aspx?name=v&hid=%s&tn=v' % (server, hid)
    return get_http_url_list(url)


def get_hid_url_list(hid):
    """Return the unique links for *hid* gathered from every server."""
    hid_url_list = []
    for server in server_list:
        hid_url_list += get_hid_server_url_list(hid, server)
    return list(set(hid_url_list))


def get_url_list(hid_list):
    """Return the unique links gathered for all hids, printing progress."""
    url_list = []
    for hid in hid_list:
        hid_url_list = get_hid_url_list(hid)
        url_list += hid_url_list
        print('HID: %s URL NUM: %s' % (hid, len(hid_url_list)))
    url_list = list(set(url_list))
    print('URL NUM: %s' % len(url_list))
    return url_list


def get_hash_url_dic(url_list):
    """Map every hash found on the given pages to the unique URLs showing it.

    :returns: dict of hash -> list of URLs (no duplicates, arbitrary order).
    """
    urls_by_hash = {}
    for url in url_list:
        for hash_s in get_hash_list(url):
            # A set per hash removes duplicate URLs as they arrive.
            urls_by_hash.setdefault(hash_s, set()).add(url)
    return {hash_s: list(urls) for hash_s, urls in urls_by_hash.items()}


def search_url(hash_s):
    """Look up *hash_s* in the stored hash/URL dict and report the result.

    The report is printed and appended to the file 'result'.

    :returns: list of URLs for the hash, empty when the hash is unknown.
    """
    hash_url = read_hash_url_dic()
    url_list = hash_url.get(hash_s, [])
    report = 'HASH: %s\n%s' % (hash_s, '\t' + '\n\t'.join(url_list))
    print(report)
    with open('result', 'a') as fout:
        # Terminate the record so successive appends do not run together.
        fout.write(report + '\n')
    return url_list


def run():
    """Build the hash -> URL dict from the stored hid list and save it."""
    hid_list = read_hid_list()
    url_list = get_url_list(hid_list)
    hash_url = get_hash_url_dic(url_list)
    write_hash_url_dic(hash_url)


if __name__ == '__main__':
    #print 'WAITING FOR HID LIST......'
    #wait_start_signal()
    run()