Skip to content

Instantly share code, notes, and snippets.

@foursking1
Last active May 20, 2016 04:36
Show Gist options
  • Save foursking1/261b04b85a0de587b989bb32c189a8f4 to your computer and use it in GitHub Desktop.
Save foursking1/261b04b85a0de587b989bb32c189a8f4 to your computer and use it in GitHub Desktop.
#-*- coding: utf-8 -*-
import logging
import random
import threading
import time
import urllib
import urllib2
from Queue import Queue

from bs4 import BeautifulSoup
# Configure logging at INFO level so the crawler's progress messages are shown.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Input file: read line by line, each stripped line is used as a search keyword.
entity_file_path = "./entity.txt"
# Output file: receives one crawled product title per line.
output_file_path = "./output.txt"
# Number of result pages requested per keyword (page indices 0 .. max_page - 1).
max_page = 10
# URL template; the two placeholders are filled with (keyword, page index).
target_url = 'http://search.jd.com/s_new.php?keyword=%s&enc=utf-8&page=%d&s=28&scrolling=y&pos=28'
class ThreadCrawl(threading.Thread):
    """Worker thread that crawls search-result pages for queued keywords.

    Each worker repeatedly takes a keyword from ``queue``, requests
    ``max_page`` result pages for it and pushes every product title found
    (UTF-8 encoded) onto ``out_queue``.

    Args:
        url: URL template with two placeholders, filled with
            ``(keyword, page)``.
        queue: Queue of keywords to crawl; ``task_done()`` is called once
            per keyword so the producer can ``join()`` on it.
        out_queue: Queue receiving the extracted product titles.
        name: Thread name, used in log messages.
        timeout: Socket timeout in seconds for each HTTP request.
    """

    def __init__(self, url, queue, out_queue, name, timeout=10):
        threading.Thread.__init__(self)
        self.url = url
        self.queue = queue
        self.out_queue = out_queue
        self.name = name
        self.timeout = timeout

    def run(self):
        """Consume keywords forever; intended to run as a daemon thread."""
        while True:
            item = self.queue.get()
            try:
                for page in range(max_page):
                    logger.info(
                        "Start crawl word = %s, page = %d, thread_name = %s",
                        item, page, self.name)
                    self._get_and_parse_data(item, page)
                    # Random sub-second pause between page requests.
                    time.sleep(random.random())
            finally:
                # Always acknowledge the item, otherwise a failure in this
                # worker would leave queue.join() blocked forever.
                self.queue.task_done()

    def _get_and_parse_data(self, item, page):
        """Fetch one result page for ``item`` and enqueue its product titles.

        Failures (network errors, bad markup, formatting errors) are logged
        and swallowed so that one bad page does not stop the worker.
        """
        try:
            # Percent-encode the keyword so spaces, '&', '#' and non-ASCII
            # bytes cannot corrupt the query string.
            request_url = self.url % (urllib.quote_plus(item), page)
            req = urllib2.Request(url=request_url)
            # The timeout keeps a stalled connection from blocking the worker.
            res = urllib2.urlopen(req, timeout=self.timeout)
            try:
                html = res.read()
            finally:
                res.close()

            # Name the parser explicitly so results do not depend on which
            # optional parsers happen to be installed.
            soup = BeautifulSoup(html, "html.parser")
            for node in soup.find_all("div", class_="p-name"):
                link = node.a
                title = link.get('title') if link is not None else None
                if not title:
                    # Skip entries without a titled link instead of
                    # abandoning the remaining products on the page.
                    continue
                # Encoding issue: store titles as UTF-8 byte strings.
                self.out_queue.put(title.encode('utf-8'))
        except Exception as e:
            # Best-effort crawl: report the failure and move on.
            logger.warning(
                "Failed to crawl word = %s, page = %d: %s", item, page, e)
def main(num_threads=8):
    """Crawl product titles for every keyword in the entity file.

    Reads keywords (one per line) from ``entity_file_path``, crawls them with
    ``num_threads`` daemon worker threads, prints the elapsed time and writes
    the collected titles, one per line, to ``output_file_path``.

    Args:
        num_threads: Number of ``ThreadCrawl`` workers to start.
    """
    entity_queue = Queue()
    title_queue = Queue()

    # Close the input file deterministically and ignore blank lines, which
    # would otherwise be crawled as empty keywords.
    with open(entity_file_path) as entity_file:
        stripped = (line.strip() for line in entity_file)
        entities = [word for word in stripped if word]

    for i in range(num_threads):
        worker = ThreadCrawl(
            target_url, entity_queue, title_queue, 'thread-%d' % i)
        # Daemon threads: the workers loop forever and must not keep the
        # process alive once the main thread is done.
        worker.daemon = True
        worker.start()

    start = time.time()
    for word in entities:
        entity_queue.put(word)
    # Blocks until every keyword has been acknowledged with task_done().
    entity_queue.join()
    print("Elapsed Time: %s" % (time.time() - start))

    # All workers are idle after join(), so nothing is adding titles any
    # more and empty() can be used to drain the queue.
    with open(output_file_path, 'w') as f:
        while not title_queue.empty():
            f.write(title_queue.get() + '\n')
# Start crawling only when executed as a script, so that importing this
# module (e.g. to reuse ThreadCrawl) does not trigger network requests.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment