Python 爬虫修正

总感觉有什么地方出错了:这个爬虫处理链接的方式不够通用,下面逐一修正。

#coding:utf-8
import Queue
import re
import threading
import traceback
import urllib2
import urlparse

from BeautifulSoup import BeautifulSoup

# Seed URL: the page the spider starts crawling from.
task_url = "http://www.baidu.com/"
# Discovered URLs mapped to a crawl flag (0 = not yet crawled).
result_list = {task_url: 0}
# Bounded dispatch queue: at most 10 URLs handed out per crawl round.
spider_list = Queue.Queue(10)

class ThreadPool(object):
    """Fixed-size pool of worker threads fed by an internal task queue.

    Workers sleep on a condition variable until a task is added via
    add_task() or the pool is shut down via join().
    """

    def __init__(self, size):
        # size: number of worker threads spawned immediately.
        self._queue = Queue.Queue()
        self._data_ready = threading.Condition()
        self._exit_flag = threading.Event()
        self._threads = []
        for i in range(size):
            t = threading.Thread(target=self._run, name=str(i))
            t.start()
            self._threads.append(t)

    def add_task(self, callback, *args):
        """Queue callback(*args) for execution by some worker thread."""
        self._queue.put((callback, args))
        with self._data_ready:
            self._data_ready.notify()

    def join(self):
        """Block until every queued task has finished, then stop all workers."""
        # Wait for all tasks to be marked done before raising the exit flag,
        # so no queued work is dropped.
        self._queue.join()
        self._exit_flag.set()
        with self._data_ready:
            self._data_ready.notify_all()
        for t in self._threads:
            t.join()

    def _run(self):
        # Worker loop: wait for a task, run it, repeat until shutdown.
        while True:
            with self._data_ready:
                while self._queue.empty() and not self._exit_flag.is_set():
                    self._data_ready.wait()
                if self._exit_flag.is_set():
                    break
                # Safe: the condition lock serializes all consumers, and the
                # emptiness check above happened under the same lock.
                cb, args = self._queue.get_nowait()
            try:
                cb(*args)
            except Exception:
                # A failing task must not kill the worker thread; report and
                # keep serving.
                traceback.print_exc()
            finally:
                # Must run even when cb raises, otherwise self._queue.join()
                # inside join() would block forever.
                self._queue.task_done()

def spr_url(url):
    try:
        body_text=urllib2.urlopen(url).read()
        soup=BeautifulSoup(body_text)
        links=soup.findAll('a')
        for link in links:
            _url=link.get('href').encode('utf-8')
            if re.match('^(javascript|:;|#|mailto)',_url) or _url is None or re.match('.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)$',_url):
                continue
            if re.match('^(http|https)',_url):
                if not re.match('^'+url,_url):
                    continue
                else:
                    if result_list.has_key(_url):
                        continue
                    else:
                        rst=_url.encode('utf-8')
                        print "[*][!] 发现连接:"+rst
                        result_list.update({rst:0})
            else:
                if result_list.has_key(url+_url):
                    continue
                else:
                    rst=url+_url
                    print "[*][!] 发现新连接: "+rst.encode('utf-8')
                    result_list.update({rst.encode('utf-8'):0})
    except Exception,error:
        print error


while True:
    # 查看目标有多少个待爬
    for url in result_list:
        if result_list[url]==0:
            # 每次放入10个任务到任务池,控制数量
            if not spider_list.full():
                spider_list.put(url)

    # 判断队列是否还有任务,如果有则爬,没有则任务结束
    if spider_list.empty():
        print "Spider is Finish!"
        for r_item in result_list:
            print "-------- Sprider Results -----------"
            print "URl: " + r_item
            print "------------------------------------"
        break
    else:
        thr=ThreadPool(10)
        # 取出URL 并且爬虫,结果放入result_list
        for num in range(spider_list.qsize()):
            thr.add_task(spr_url,spider_list.get())
        thr.join()

 

posted @ 2014-04-29 14:56  墨迹哥's  阅读(304)  评论(0编辑  收藏  举报