Created
August 21, 2014 08:08
-
-
Save mowentian/d5fb8ffe15a792edc97f to your computer and use it in GitHub Desktop.
Some of wjl's code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# coding:utf8
# @IgnorePep8
# Search driver: pages through the yezibo search results for a keyword,
# collects the hids, forwards the resolved hashes, and spawns the
# background web_crawler process.
import os
import sys
dirname = os.path.dirname(os.path.abspath(__file__))
# Make the project root importable when run as a script.
sys.path.insert(0, dirname + '/..')
import subprocess
from cdgen.post_func import send_hash
from crawler.utils import server_list, get_play_url, get_hid_list, get_hash_list, get_quote_keyward, write_hid_list, build_pipe, send_start_signal
# Number of known mirror servers (used to bound retries in get_hid_hash).
server_num = len(server_list)
# Command line for the companion crawler process, launched by get_hids.
crawler_cmd = ['python', '%s/web_crawler.py' % dirname]
# Handle of the spawned crawler; set by get_hids, awaited by run.
p_crawler = None
| def get_hids(keyword, fd): | |
| global p_crawler | |
| hid_list = [] | |
| base_url = 'http://www.yezibo.com/SearchPlayFile.aspx?key=%s&OrType=False&taxismode=refweb&lock=True&page=%d' | |
| page = 1 | |
| while True: | |
| url = base_url % (keyword, page) | |
| page += 1 | |
| print url | |
| page_hid_list = get_hid_list(url) | |
| if not page_hid_list: | |
| break | |
| hid_list += page_hid_list | |
| hid_list = list(set(hid_list)) | |
| write_hid_list(hid_list) | |
| send_start_signal(fd) | |
| p_crawler = subprocess.Popen(crawler_cmd) | |
| return hid_list | |
| def get_hid_hash(hid): | |
| hash_list = [] | |
| base_url = '%s/OpenPlayMovie.aspx?hid=%s&modular=15&ntime=%d' | |
| print 'HID: %s' % hid | |
| count = 0 | |
| tried = {} | |
| while count < 0.1 * server_num: | |
| url = get_play_url(base_url % (server_list[count % server_num], hid, count)) | |
| count += 1 | |
| tried[url] = tried.get(url, 0) + 1 | |
| if tried[url] > 3: | |
| continue | |
| print 'TRY %02d: %s' % (count, url) | |
| page_hash_list = get_hash_list(url) | |
| if not page_hash_list: | |
| continue | |
| hash_list += page_hash_list | |
| break | |
| hash_list = list(set(hash_list)) | |
| return hash_list | |
def get_hash(hid_list, failed_hid):
    """Resolve every hid in *hid_list* to hashes, pushing each hash
    through send_hash as it is found.

    hids that produced no hash are appended to *failed_hid* (mutated in
    place).  Sends the 'END' sentinel when done and returns the
    de-duplicated hash list.
    """
    all_hashes = []
    for hid in hid_list:
        found = get_hid_hash(hid)
        if found:
            all_hashes += found
            for h in found:
                send_hash(h)
        else:
            failed_hid.append(hid)
    send_hash('END')
    return list(set(all_hashes))
| def run(keyword): | |
| fd = build_pipe() | |
| quote_keyward = get_quote_keyward(keyword) | |
| hid_list = get_hids(quote_keyward, fd) | |
| failed_hid = [] | |
| print 'HID LIST: %d' % len(hid_list) | |
| open('/tmp/%s.hid' % keyword, 'w').write(str(hid_list)) | |
| hash_list = get_hash(hid_list, failed_hid) | |
| print 'HASH LIST: %d' % len(hash_list) | |
| open('/tmp/%s.hash' % keyword, 'w').write(str(hash_list)) | |
| p_crawler.wait() | |
if __name__ == '__main__':
    # Usage: <script> <keyword> -- the keyword is passed verbatim to run().
    run(sys.argv[1])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
#coding:gbk
# @IgnorePep8
# Shared helpers for the qvod-hash crawler scripts: HTML fetching,
# hid/hash scraping, /tmp persistence and the start-signal handshake.
import os, sys
import urllib, urllib2
# Mirror sites that all expose the same search / play endpoints.
# NOTE(review): 'http://www.yezibo.com' appears twice in this list --
# confirm the duplicate is intentional.
server_list = ['http://www.yezibo.com','http://www.dadou.tv','http://www.qso365.com','http://www.qvodso.cc','http://www.6so.cc','http://www.babykan.com','http://www.77sou.net','http://www.yes80.net','http://www.76du.cn','http://www.9ishou.com','http://www.zhiyehui.net','http://www.100soo.cn','http://www.yezibo.com','http://www.ivdy.cc','http://www.h888.net','http://www.kkso.cc','http://www.ediansou.com','http://www.bobo1314.com','http://www.19taoba.com','http://www.36so.com','http://www.dy135.com','http://www.nr54.com']#,'http://www.9skb.com']
# Rendezvous path for the start-signal handshake (see build_pipe /
# wait_start_signal -- created with os.open, not os.mkfifo).
pipe_name = '/tmp/START_SIGNAL'
def get_html_text(url):
    """Fetch *url* and return its body with percent-escapes decoded.

    Best effort: any network or HTTP error yields the empty string,
    which callers treat as "no page".
    """
    text = ''
    response = None  # fix: typo 'responce' and leaked response handle
    try:
        response = urllib2.urlopen(url, timeout=60)
        html = response.read()
        # url2pathname is used here purely as a percent-decoder.
        text = urllib.url2pathname(html)
    except Exception:
        pass  # deliberate best-effort fetch; swallow and return ''
    finally:
        if response is not None:
            response.close()
    return text
def get_play_url(url):
    """Follow *url* and return the player page URL.

    The server either JS-redirects via window.location.href='...'
    (that target is scraped out of the body) or serves the page
    directly (the final URL after HTTP redirects is returned).  On any
    error the placeholder 'http://' is returned; callers use repeats of
    it to detect dead servers.
    """
    res = None
    try:
        res = urllib2.urlopen(url, timeout=60)
        text = res.read()
        parts = text.split('window.location.href=\'', 2)
        play_url = parts[1].split('\'</script>')[0] if len(parts) >= 2 else ''
        return play_url if play_url else res.geturl()
    except Exception:
        return 'http://'
    finally:
        # fix: the response handle was previously leaked on every call.
        if res is not None:
            res.close()
def get_http_url_list(url):
    """Return every http:// link found in the page at *url*.

    Handles both single- and double-quoted href attributes.  Order is
    preserved from the original implementation: all single-quoted
    matches first, then all double-quoted ones.
    """
    text = get_html_text(url)
    url_list = []
    # fix: the two quote styles were handled by copy-pasted loops;
    # fold them into one parameterized pass.
    for quote in ('\'', '"'):
        for chunk in text.split('href=%shttp://' % quote)[1:]:
            target = chunk.split(quote, 2)[0]
            if target:
                url_list.append('http://%s' % target)
    return url_list
def get_hid_list(url):
    """Scrape the hids from a search-result page.

    A hid appears as the argument of openPlay('<hid>') or
    openPlay("<hid>") calls embedded in the page markup.
    """
    text = get_html_text(url)
    hid_list = []
    # fix: dropped the commented-out openBox() scraper (dead code) and
    # folded the copy-pasted single-/double-quote loops into one pass.
    for opener, closer in (('openPlay(\'', '\')'), ('openPlay(\"', '\")')):
        for chunk in text.split(opener)[1:]:
            hid = chunk.split(closer, 2)[0]
            if hid:
                hid_list.append(hid)
    return hid_list
def get_hash_list(url):
    """Extract qvod hashes from the page at *url*.

    A qvod link looks like qvod://<size>|<hash>|<name>|... -- the hash
    is the second |-separated field.
    """
    text = get_html_text(url)
    hashes = []
    for fragment in text.split('qvod://')[1:]:
        fields = fragment.split('|', 3)
        if len(fields) >= 3:
            hashes.append(fields[1])
    return hashes
def get_quote_keyward(keyword):
    """Re-encode *keyword* from the terminal encoding to GBK and
    percent-quote it for use in the search URL.

    Fix: sys.stdin.encoding is None when stdin is not a tty (piped
    input), which previously crashed decode() with a TypeError; fall
    back to UTF-8 in that case.
    """
    encoding = sys.stdin.encoding or 'utf8'
    keyword = keyword.decode(encoding).encode('gbk')
    return urllib.quote(keyword)
def get_tn(keyword):
    """Decode *keyword* from UTF-8 bytes into unicode text."""
    return keyword.decode('utf8')
def write_hid_list(hid_list):
    """Persist *hid_list* to /tmp/hid_list as a Python literal, to be
    loaded back by read_hid_list.

    Fix: use open() in a with-block instead of the deprecated file()
    builtin with a manual close; text mode since the payload is text.
    """
    with open('/tmp/hid_list', 'w') as fout:
        fout.write(str(hid_list))
def read_hid_list():
    """Load the hid list previously saved by write_hid_list.

    Fix: parse with ast.literal_eval instead of eval so that arbitrary
    code placed in /tmp/hid_list cannot execute; also close the file
    via a with-block instead of the deprecated file() builtin.
    """
    import ast
    with open('/tmp/hid_list', 'r') as fin:
        return ast.literal_eval(fin.read())
def write_hash_url_dic(hash_url):
    """Persist the hash -> [source URLs] mapping to /tmp/hash_url_dic
    as a Python literal, to be loaded back by read_hash_url_dic.

    Fix: use open() in a with-block instead of the deprecated file()
    builtin with a manual close; text mode since the payload is text.
    """
    with open('/tmp/hash_url_dic', 'w') as fout:
        fout.write(str(hash_url))
def read_hash_url_dic():
    """Load the hash -> [source URLs] mapping previously saved by
    write_hash_url_dic.

    Fix: parse with ast.literal_eval instead of eval so that arbitrary
    code placed in /tmp/hash_url_dic cannot execute; also close the
    file via a with-block instead of the deprecated file() builtin.
    """
    import ast
    with open('/tmp/hash_url_dic', 'r') as fin:
        return ast.literal_eval(fin.read())
def build_pipe():
    # Create (or open) the signal file and return its fd, opened
    # non-blocking for read/write so the writer never stalls.
    # NOTE(review): despite the name, this creates a REGULAR file via
    # os.open, not a FIFO via os.mkfifo -- confirm that is intended,
    # since reads on it never block and can see stale contents.
    fd = os.open(pipe_name, os.O_NONBLOCK | os.O_CREAT | os.O_RDWR)
    return fd
def send_start_signal(fd):
    # Write the 5-byte sentinel that wait_start_signal polls for.
    os.write(fd,"START")
def wait_start_signal():
    """Block until the 'START' sentinel appears in the signal file,
    then close and remove the file.

    Fix: replaced the long-deprecated `<>` operator with `!=`.
    NOTE(review): this busy-polls and assumes all 5 sentinel bytes
    arrive in a single read; a partial read would spin past the
    sentinel -- confirm the writer always writes 'START' atomically.
    """
    fd = os.open(pipe_name, os.O_RDONLY)
    while os.read(fd, 5) != 'START':
        continue
    os.close(fd)
    os.remove(pipe_name)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# coding:utf8
# @IgnorePep8
# Background crawler: expands the hid list written by the search script
# into a hash -> source-URL mapping and persists it to /tmp.
import os
import sys
# Make the project root importable when run as a script.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)) + '/..')
from crawler.utils import server_list, get_http_url_list, get_hash_list, read_hid_list, write_hash_url_dic, read_hash_url_dic, wait_start_signal
def get_hid_server_url_list(hid, server):
    """Return the outbound http:// links on *server*'s ShowMovie page
    for *hid*."""
    show_url = '%s/ShowMovie.aspx?name=v&hid=%s&tn=v' % (server, hid)
    return get_http_url_list(show_url)
def get_hid_url_list(hid):
    """Gather the de-duplicated URL list for *hid* across every known
    server."""
    collected = []
    for server in server_list:
        collected += get_hid_server_url_list(hid, server)
    return list(set(collected))
| def get_url_list(hid_list): | |
| url_list = [] | |
| for hid in hid_list: | |
| hid_url_list = get_hid_url_list(hid) | |
| url_list += hid_url_list | |
| print 'HID: %s URL NUM: %s' % (hid, len(hid_url_list)) | |
| url_list = list(set(url_list)) | |
| print 'URL NUM: %s' % len(url_list) | |
| return url_list | |
def get_hash_url_dic(url_list):
    """Build a hash -> [source URLs] mapping from the given pages.

    Each URL's page is scraped for qvod hashes; every hash records the
    URLs it was seen on, de-duplicated (set order, as before).
    """
    hash_url = {}
    for url in url_list:
        for hash_s in get_hash_list(url):
            # fix: idiomatic setdefault replaces the manual
            # get / append / reassign triple.
            hash_url.setdefault(hash_s, []).append(url)
    for hash_s in hash_url:
        hash_url[hash_s] = list(set(hash_url[hash_s]))
    return hash_url
| def search_url(hash_s): | |
| hash_url = read_hash_url_dic() | |
| url_list = hash_url.get(hash_s, []) | |
| print 'HASH: %s\n%s' % (hash_s, '\t'+'\n\t'.join(url_list)) | |
| open('result','a').write('HASH: %s\n%s' % (hash_s, '\t'+'\n\t'.join(url_list))) | |
| return url_list | |
def run():
    """Full crawl pass: load the saved hids, expand them to page URLs,
    index the hashes found there, and persist the mapping."""
    hids = read_hid_list()
    urls = get_url_list(hids)
    write_hash_url_dic(get_hash_url_dic(urls))
if __name__ == '__main__':
    # NOTE(review): the pipe handshake is disabled -- the parent script
    # appears to write /tmp/hid_list before spawning this process (see
    # get_hids in the search driver); confirm before re-enabling.
    #print 'WAITING FOR HID LIST......'
    #wait_start_signal()
    run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment