@mowentian · Created August 21, 2014
Some of wjl's code.
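Three Python 2 scripts, delimited below by their #!/usr/bin/python shebang lines: a driver that searches a set of video-search mirrors for a keyword and feeds each qvod hash to send_hash(); a shared crawler/utils scraping module; and a web_crawler.py worker that maps each hash back to the URLs it appeared on.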
#!/usr/bin/python
# coding:utf8
# @IgnorePep8
import os
import sys
dirname = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, dirname + '/..')
import subprocess
from cdgen.post_func import send_hash
from crawler.utils import (server_list, get_play_url, get_hid_list,
                           get_hash_list, get_quote_keyward, write_hid_list,
                           build_pipe, send_start_signal)
server_num = len(server_list)
crawler_cmd = ['python', '%s/web_crawler.py' % dirname]
p_crawler = None

def get_hids(keyword, fd):
    global p_crawler
    hid_list = []
    base_url = ('http://www.yezibo.com/SearchPlayFile.aspx'
                '?key=%s&OrType=False&taxismode=refweb&lock=True&page=%d')
    page = 1
    while True:
        url = base_url % (keyword, page)
        page += 1
        print url
        page_hid_list = get_hid_list(url)
        if not page_hid_list:
            break
        hid_list += page_hid_list
    hid_list = list(set(hid_list))
    write_hid_list(hid_list)
    send_start_signal(fd)
    p_crawler = subprocess.Popen(crawler_cmd)
    return hid_list

def get_hid_hash(hid):
    hash_list = []
    base_url = '%s/OpenPlayMovie.aspx?hid=%s&modular=15&ntime=%d'
    print 'HID: %s' % hid
    count = 0
    tried = {}
    # Rotate through the mirrors until one page yields hashes; the
    # 0.1 * server_num bound caps the attempts at a small handful.
    while count < 0.1 * server_num:
        url = get_play_url(base_url % (server_list[count % server_num],
                                       hid, count))
        count += 1
        tried[url] = tried.get(url, 0) + 1
        if tried[url] > 3:
            continue
        print 'TRY %02d: %s' % (count, url)
        page_hash_list = get_hash_list(url)
        if not page_hash_list:
            continue
        hash_list += page_hash_list
        break
    hash_list = list(set(hash_list))
    return hash_list

def get_hash(hid_list, failed_hid):
    hash_list = []
    for hid in hid_list:
        hid_hash_list = get_hid_hash(hid)
        hash_list += hid_hash_list
        for hash_s in hid_hash_list:
            send_hash(hash_s)
        if not hid_hash_list:
            failed_hid.append(hid)
    hash_list = list(set(hash_list))
    send_hash('END')
    return hash_list

def run(keyword):
    fd = build_pipe()
    quote_keyward = get_quote_keyward(keyword)
    hid_list = get_hids(quote_keyward, fd)
    failed_hid = []
    print 'HID LIST: %d' % len(hid_list)
    open('/tmp/%s.hid' % keyword, 'w').write(str(hid_list))
    hash_list = get_hash(hid_list, failed_hid)
    print 'HASH LIST: %d' % len(hash_list)
    open('/tmp/%s.hash' % keyword, 'w').write(str(hash_list))
    p_crawler.wait()


if __name__ == '__main__':
    run(sys.argv[1])
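# Example invocation (the filename is hypothetical; the script only
# reads sys.argv[1] as the search keyword):
#   python run_search.py keyword
# Output lands in /tmp/<keyword>.hid and /tmp/<keyword>.hash.
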
#!/usr/bin/python
# coding:gbk
# @IgnorePep8
import os
import sys
import urllib
import urllib2

server_list = [
    'http://www.yezibo.com', 'http://www.dadou.tv', 'http://www.qso365.com',
    'http://www.qvodso.cc', 'http://www.6so.cc', 'http://www.babykan.com',
    'http://www.77sou.net', 'http://www.yes80.net', 'http://www.76du.cn',
    'http://www.9ishou.com', 'http://www.zhiyehui.net', 'http://www.100soo.cn',
    'http://www.yezibo.com', 'http://www.ivdy.cc', 'http://www.h888.net',
    'http://www.kkso.cc', 'http://www.ediansou.com', 'http://www.bobo1314.com',
    'http://www.19taoba.com', 'http://www.36so.com', 'http://www.dy135.com',
    'http://www.nr54.com',
    # 'http://www.9skb.com',
]
pipe_name = '/tmp/START_SIGNAL'

def get_html_text(url):
    text = ''
    try:
        response = urllib2.urlopen(url, timeout=60)
        html = response.read()
        # url2pathname() is used here only to undo %xx escapes in the body.
        text = urllib.url2pathname(html)
    except Exception:
        pass
    return text

def get_play_url(url):
    play_url = ''
    res = None
    try:
        res = urllib2.urlopen(url, timeout=60)
        text = res.read()
        # The mirrors redirect via an inline "window.location.href='...'"
        # script tag; pull the target out of the page text.
        l = text.split('window.location.href=\'', 2)
        if len(l) >= 2:
            play_url = l[1].split('\'</script>')[0]
    except Exception:
        return 'http://'
    if play_url:
        return play_url
    else:
        return res.geturl()

def get_http_url_list(url):
    url_list = []
    text = get_html_text(url)
    l = text.split('href=\'http://')
    if len(l) >= 2:
        l = l[1:]
        for i in l:
            u = i.split('\'', 2)[0]
            if u:
                url_list.append('http://%s' % u)
    l = text.split('href=\"http://')
    if len(l) >= 2:
        l = l[1:]
        for i in l:
            u = i.split('\"', 2)[0]
            if u:
                url_list.append('http://%s' % u)
    return url_list

def get_hid_list(url):
    hid_list = []
    text = get_html_text(url)
    # l = text.split('openBox(')
    # if len(l) >= 2:
    #     l = l[1:]
    #     for i in l:
    #         hid = i.split(',', 2)[0]
    #         if hid:
    #             hid_list.append(hid)
    l = text.split('openPlay(\'')
    if len(l) >= 2:
        l = l[1:]
        for i in l:
            hid = i.split('\')', 2)[0]
            if hid:
                hid_list.append(hid)
    l = text.split('openPlay(\"')
    if len(l) >= 2:
        l = l[1:]
        for i in l:
            hid = i.split('\")', 2)[0]
            if hid:
                hid_list.append(hid)
    return hid_list

def get_hash_list(url):
    hash_list = []
    text = get_html_text(url)
    l = text.split('qvod://')
    if len(l) >= 2:
        l = l[1:]
        for i in l:
            # Field 1 of the pipe-delimited tail is the hash.
            hl = i.split('|', 3)
            if len(hl) >= 3:
                hash_list.append(hl[1])
    return hash_list
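# For reference, a qvod:// link looks roughly like (made-up values):
#   qvod://123456789|0123456789ABCDEF0123456789ABCDEF01234567|name.rmvb|
# so splitting the tail on '|' and keeping field 1 yields the 40-character
# hash that get_hash_list() collects.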

def get_quote_keyward(keyword):
    # The mirrors expect the query GBK-encoded and then URL-quoted.
    keyword = keyword.decode(sys.stdin.encoding).encode('gbk')
    keyword = urllib.quote(keyword)
    return keyword


def get_tn(keyword):
    tn = keyword.decode('utf8')
    return tn

def write_hid_list(hid_list):
    fout = open('/tmp/hid_list', 'wb')
    fout.write(str(hid_list))
    fout.close()


def read_hid_list():
    fin = open('/tmp/hid_list', 'rb')
    hid_text = fin.read()
    fin.close()
    # The list was stored as its repr(), so eval() it back (the file is
    # local and written only by write_hid_list above).
    hid_list = eval(hid_text)
    return hid_list


def write_hash_url_dic(hash_url):
    fout = open('/tmp/hash_url_dic', 'wb')
    fout.write(str(hash_url))
    fout.close()


def read_hash_url_dic():
    fin = open('/tmp/hash_url_dic', 'rb')
    hash_url_text = fin.read()
    fin.close()
    hash_url = eval(hash_url_text)
    return hash_url

def build_pipe():
    # Despite the name, this opens a plain file (O_CREAT on a regular
    # path, not mkfifo); it only carries the one-shot START handshake.
    fd = os.open(pipe_name, os.O_NONBLOCK | os.O_CREAT | os.O_RDWR)
    return fd


def send_start_signal(fd):
    os.write(fd, "START")


def wait_start_signal():
    fd = os.open(pipe_name, os.O_RDONLY)
    while os.read(fd, 5) != 'START':
        continue
    os.close(fd)
    os.remove(pipe_name)
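# Handshake sketch: the driver calls build_pipe() and send_start_signal()
# once the hid list is on disk; a consumer can spin in wait_start_signal()
# until the five bytes "START" appear, then removes the signal file.
# (web_crawler.py below currently has its wait_start_signal() call
# commented out and is launched directly via subprocess instead.)
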
#!/usr/bin/python
# coding:utf8
# @IgnorePep8
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)) + '/..')
from crawler.utils import (server_list, get_http_url_list, get_hash_list,
                           read_hid_list, write_hash_url_dic,
                           read_hash_url_dic, wait_start_signal)

def get_hid_server_url_list(hid, server):
    url = '%s/ShowMovie.aspx?name=v&hid=%s&tn=v' % (server, hid)
    url_list = get_http_url_list(url)
    return url_list

def get_hid_url_list(hid):
    hid_url_list = []
    for server in server_list:
        hid_server_url_list = get_hid_server_url_list(hid, server)
        hid_url_list += hid_server_url_list
    hid_url_list = list(set(hid_url_list))
    return hid_url_list

def get_url_list(hid_list):
    url_list = []
    for hid in hid_list:
        hid_url_list = get_hid_url_list(hid)
        url_list += hid_url_list
        print 'HID: %s URL NUM: %s' % (hid, len(hid_url_list))
    url_list = list(set(url_list))
    print 'URL NUM: %s' % len(url_list)
    return url_list

def get_hash_url_dic(url_list):
    hash_url = {}
    for url in url_list:
        hash_list = get_hash_list(url)
        for hash_s in hash_list:
            l = hash_url.get(hash_s, [])
            l.append(url)
            hash_url[hash_s] = l
    for hash_s in hash_url.keys():
        hash_url[hash_s] = list(set(hash_url.get(hash_s, [])))
    return hash_url

def search_url(hash_s):
    hash_url = read_hash_url_dic()
    url_list = hash_url.get(hash_s, [])
    print 'HASH: %s\n%s' % (hash_s, '\t' + '\n\t'.join(url_list))
    open('result', 'a').write('HASH: %s\n%s' % (hash_s,
                                                '\t' + '\n\t'.join(url_list)))
    return url_list

def run():
    hid_list = read_hid_list()
    url_list = get_url_list(hid_list)
    hash_url = get_hash_url_dic(url_list)
    write_hash_url_dic(hash_url)


if __name__ == '__main__':
    # print 'WAITING FOR HID LIST......'
    # wait_start_signal()
    run()
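# Pipeline, as wired above: the driver script writes /tmp/hid_list and
# signals /tmp/START_SIGNAL, then spawns this crawler with subprocess.Popen;
# the crawler reads the hid list back, fetches ShowMovie.aspx from every
# mirror for each hid, and stores a {hash: [urls]} dict in
# /tmp/hash_url_dic for later lookup via search_url().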