# Multithreaded web crawling in Python (threading + Queue example)
# Multithreaded crawler example: worker threads pull page numbers from a
# shared queue, fetch each page over HTTP, and push the responses onto a
# data queue for a (planned) parsing stage.
import json
import threading
import time
from queue import Empty, Queue

import requests
from bs4 import BeautifulSoup  # kept for the planned parsing stage (unused here)

# Flags the main thread flips to tell worker threads to shut down.
# NOTE: plain bool reads/writes are atomic enough for this signalling use.
CRAWL_EXIT = False
PARSE_EXIT = False  # reserved for a parser stage that was never wired up


class ThreadCrawl(threading.Thread):
    """Worker thread that downloads pages whose numbers come from page_queue.

    Each fetched response object is placed on data_queue. The thread exits
    when the module-level CRAWL_EXIT flag becomes True.
    """

    def __init__(self, thread_name, page_queue, data_queue):
        # super() is preferred over threading.Thread.__init__(self):
        # it cooperates correctly with multiple inheritance.
        super(ThreadCrawl, self).__init__()
        self.thread_name = thread_name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"}

    def run(self):
        print("启动" + self.thread_name)
        while not CRAWL_EXIT:
            try:
                # block=False: raise queue.Empty immediately when the queue
                # is drained instead of blocking forever, so the exit flag
                # keeps getting re-checked.
                page = self.page_queue.get(False)
            except Empty:
                # Another worker won the race for the last item; loop back
                # and re-test CRAWL_EXIT.
                continue
            url = "https://www.qiushibaike.com/8hr/page/" + str(page) + "/"
            try:
                # timeout prevents a stalled server from hanging the worker
                # forever (the original request had no timeout).
                response = requests.get(url, headers=self.headers, timeout=10)
                self.data_queue.put(response)
            except requests.RequestException:
                # Best-effort crawl: skip pages that fail to download.
                # (Replaces a bare `except: pass` that hid every error.)
                pass
        print("结束" + self.thread_name)


def main():
    """Spin up three crawler threads over pages 1-10, then shut them down."""
    # Bounded queue of page numbers to crawl (capacity 10).
    page_queue = Queue(10)
    for page in range(1, 11):  # pages 1..10, FIFO
        page_queue.put(page)

    # Unbounded queue collecting the fetched responses.
    data_queue = Queue()

    crawl_names = ["采集线程1号", "采集线程2号", "采集线程3号"]
    crawl_threads = []
    for thread_name in crawl_names:
        thread = ThreadCrawl(thread_name, page_queue, data_queue)
        thread.start()
        crawl_threads.append(thread)

    # Wait until every page number has been claimed by a worker. Sleep
    # briefly per check instead of busy-spinning at 100% CPU.
    while not page_queue.empty():
        time.sleep(0.1)

    global CRAWL_EXIT
    CRAWL_EXIT = True
    print("Queue为空")

    for thread in crawl_threads:
        thread.join()
        print("joining...............")


if __name__ == "__main__":
    main()