
Python crawler

2014-04-14 21:05  Polarisary

A few days ago, while learning Python multithreading, I wrote a crawler that can fetch some articles from Huxiu. The original plan was to follow up with a classifier that would categorize the crawled articles for display, but work has been busy and there is new material to learn, so the classifier will have to wait. Recording the crawler here for future reference.

# !/usr/bin/env python
# -*- coding:utf-8 -*-

import Queue
import threading
import time
import re
from BeautifulSoup import BeautifulSoup
from Dbpool import DBTools
import urllib2
import logging
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

sBaseUrl = 'http://www.huxiu.com/focus/'
sMainUrl = 'http://www.huxiu.com'
pool = DBTools('localhost', 'root', '1q2w3e4r', 'selfblog')
logger = logging.getLogger(__name__)

class WorkManager(object):
    def __init__(self, work_num=1000, thread_num=6):
        self.work_queue = Queue.Queue()
        self.threads = []
        self.__init_work_queue(work_num)
        self.__init_thread_pool(thread_num)

    # initialize the worker threads
    def __init_thread_pool(self, thread_num):
        for i in range(thread_num):
            self.threads.append(Spider(self.work_queue))

    # initialize the work queue with one URL per listing page
    def __init_work_queue(self, jobs_num):
        for i in range(jobs_num):
            page_url = '%s%s%s' % (sBaseUrl, i, '.html')
            self.add_job(page_url)

    # enqueue one job; Queue handles the locking internally
    def add_job(self, args):
        self.work_queue.put(args)

    # number of tasks still waiting in the queue
    def check_queue(self):
        return self.work_queue.qsize()

    # wait until all worker threads have finished
    def wait_allcomplete(self):
        for item in self.threads:
            if item.isAlive():
                item.join()

class Spider(threading.Thread):
    def __init__(self, work_queue):
        threading.Thread.__init__(self)
        self.work_queue = work_queue
        self.start()

    def run(self):
        # loop until the queue is drained, then let the thread exit
        while True:
            try:
                page_url = self.work_queue.get(block=False)  # non-blocking dequeue
            except Queue.Empty:
                break
            print 'thread [%s], url: %s' % (self.name, page_url)
            logger.info('thread [%s], url: %s' % (self.name, page_url))
            try:
                self.doAnalize(page_url)
            except Exception, e:
                logger.error('failed to process %s: %s' % (page_url, e))
            finally:
                self.work_queue.task_done()  # tell the queue this task is finished

    def doAnalize(self, page_url):
        try:
            content = urllib2.urlopen(page_url).read()
        except IOError, e:
            print e
            return  # nothing to parse if the listing page could not be fetched

        # match the article links on this listing page
        re_url = re.compile(r'(/article/[0-9]{2,5}/1.html)')
        urls = re_url.findall(content)
        for aurl in set(urls):
            news_url = '%s%s' % (sMainUrl, aurl)
            try:
                # fetch the article HTML; replace ' with " so the SQL string literal stays intact
                news_content = urllib2.urlopen(news_url).read().replace("\'", "\"")
                # parse the HTML with BeautifulSoup to pull out the article title
                soup = BeautifulSoup(news_content)
                title = soup.findAll("h1")[1].text.strip()

                sql = 'INSERT INTO web_news(title, content_html, ref_url) VALUES (\'%s\', \'%s\', \'%s\')' % (title, news_content, news_url)
                pool.operateDB('insert', sql)
            except IOError:
                continue

if __name__ == '__main__':
    start = time.time()
    work_manager = WorkManager(10, 5)
    work_manager.wait_allcomplete()
    end = time.time()
    print "cost all time: %s" % (end - start)
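The same worker-pool idea also maps directly onto Python 3, where Queue and urllib2 became the queue and urllib.request modules. The sketch below is a minimal illustration of that pattern under those assumptions, fetching pages only (no parsing or database work); the worker/main function names and the page/thread counts are illustrative, not part of the original code.

# A minimal Python 3 sketch of the same worker-pool pattern; only the
# standard library (queue, threading, urllib.request) is used.
import queue
import threading
import urllib.request

BASE_URL = 'http://www.huxiu.com/focus/'  # same listing URL as above

def worker(work_queue):
    # drain the queue; exit the thread when no work is left
    while True:
        try:
            page_url = work_queue.get(block=False)
        except queue.Empty:
            break
        try:
            html = urllib.request.urlopen(page_url, timeout=10).read()
            print('%s fetched %d bytes from %s'
                  % (threading.current_thread().name, len(html), page_url))
        except IOError as e:
            print('failed to fetch %s: %s' % (page_url, e))
        finally:
            work_queue.task_done()

def main(page_count=10, thread_count=5):
    work_queue = queue.Queue()
    for i in range(page_count):
        work_queue.put('%s%d.html' % (BASE_URL, i))
    threads = [threading.Thread(target=worker, args=(work_queue,))
               for _ in range(thread_count)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

if __name__ == '__main__':
    main()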