一、线程回顾
import threading
import time

# --- Part 1: everything runs on the single main thread ----------------------
# def sing():
#     for i in range(1,6):
#         print('come baby 跟我一起 嗨 嗨 嗨 !!!')
#         time.sleep(1)
#
# def dance():
#     for i in range(1,6):
#         print('恰恰 肚皮 钢管舞 哈哈哈哈哈 ...... ')
#         time.sleep(1)
# def main():
#     sing()
#     dance()
# if __name__ == '__main__':
#     main()

# --- Part 2: procedural thread creation: one main thread, two child threads -
# def sing(a):
#     for i in range(1,6):
#         print('当前线程:%s ...come %s 跟我一起 嗨 嗨 嗨 !!!' %(threading.current_thread().name,a))
#         time.sleep(1)
# def dance(a):
#     for i in range(1,6):
#         print('当前线程:%s ... 恰恰 肚皮 钢管舞 %s你要哪一种 ' %(threading.current_thread().name,a))
#         time.sleep(1)
# def main():
#     print('...联欢晚会现在开始...')
#     # create the singing thread
#     a = '悟空'
#     t_sing = threading.Thread(target=sing,name='唱歌',args=(a,))
#
#     # create the dancing thread
#     t_dance = threading.Thread(target=dance, name='跳舞',args=(a,))
#
#     # start the threads
#     t_sing.start()
#     t_dance.start()
#
#     # make the main thread wait for the child threads to finish
#     t_sing.join()
#     t_dance.join()
#
#     print('晚会结束,各回各家')
# if __name__ == '__main__':
#     main()


# --- Part 3: object-oriented thread creation --------------------------------
# Write a class that inherits from threading.Thread and override run().
class SingThread(threading.Thread):
    """Thread that prints its name/argument, then a song line five times.

    Args:
        name: Name assigned to the thread.
        a: Arbitrary argument echoed by ``run``.
    """

    def __init__(self, name, a):
        threading.Thread.__init__(self)
        self.name = name
        self.a = a

    def run(self):
        """Print the header line, then one lyric per second, five times."""
        header = "线程名:%s 参数:%s" % (self.name, self.a)
        print(header)
        for _ in range(5):
            print('爱江山更爱美人...')
            time.sleep(1)


class DanceThread(threading.Thread):
    """Thread that prints its name/argument, then a dance beat five times.

    Args:
        name: Name assigned to the thread.
        a: Arbitrary argument echoed by ``run``.
    """

    def __init__(self, name, a):
        threading.Thread.__init__(self)
        self.name = name
        self.a = a

    def run(self):
        """Print the header line, then one beat per second, five times."""
        header = "线程名:%s 参数:%s" % (self.name, self.a)
        print(header)
        for _ in range(5):
            print('蹦擦擦,蹦擦擦...')
            time.sleep(1)


def main():
    """Run the singing and dancing threads and wait for both to finish."""
    # Create the threads (singer first, dancer second).
    workers = [SingThread('唱', '八戒'), DanceThread('跳', '悟能')]

    # Start every thread before joining any, so they run concurrently.
    for worker in workers:
        worker.start()

    # Make the main thread wait for the child threads to finish.
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    main()
二、队列
from queue import Queue

# Create a queue with room for 5 items.
q = Queue(5)
print(q.empty())  # is the queue empty?

# Enqueue the data.
for player in ('浓眉哥', '勒布朗', '丹尼*格林', '库兹马', '麦基'):
    q.put(player)

print(q.full())   # is the queue full?
print(q.qsize())  # number of items currently in the queue

# q.put('波普', False)    # queue is full: raises queue.Full immediately
# q.put('波普', True, 3)  # queue is full: waits up to 3 s for a free slot, then raises

# Dequeue the data: first in, first out.
for _ in range(5):
    print(q.get())

# NOTE: get() takes no item argument; its signature is get(block=True, timeout=None).
# q.get(False)    # queue is empty: raises queue.Empty immediately
# q.get(True, 3)  # queue is empty: waits up to 3 s for an item, then raises
三、多线程爬虫
import json
import threading
import time
from queue import Empty, Queue

import requests
from lxml import etree

# Holds the crawl (download) threads.
crawl_thread_list = []
# Holds the parse threads.
parse_thread_list = []

# Sentinel put on the data queue to tell a parse thread to stop. A unique
# object is used so it can never be confused with real page content.
_STOP = object()


def create_queue():
    """Create the two work queues.

    Returns:
        A ``(page_queue, data_queue)`` tuple. ``page_queue`` is pre-filled
        with the page numbers 1 through 5; ``data_queue`` starts empty and
        receives the downloaded page bodies.
    """
    # Page-number queue.
    page_queue = Queue()
    for page in range(1, 6):
        page_queue.put(page)
    # Content queue.
    data_queue = Queue()
    return page_queue, data_queue


class CrawlThread(threading.Thread):
    """Worker that downloads pages named by ``page_queue`` into ``data_queue``.

    Args:
        name: Thread name, used in the progress messages.
        page_queue: Queue of page numbers still to download.
        data_queue: Queue that receives each response body (``r.text``).
    """

    # Seconds to wait for a response, so a stalled server cannot hang the
    # thread (and therefore the whole program) forever.
    timeout = 10

    def __init__(self, name, page_queue, data_queue):
        super().__init__()
        self.name = name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.url = 'http://www.fanjian.net/jiantu-{}'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

    def run(self):
        """Download pages until ``page_queue`` is exhausted."""
        print('%s启动......' % self.name)
        while True:
            # get_nowait() instead of empty()+get(): with several crawlers,
            # another thread may take the last page between the two calls,
            # which would leave this thread blocked in get() forever.
            try:
                page = self.page_queue.get_nowait()
            except Empty:
                break
            # Build the page URL.
            url = self.url.format(page)
            # Send the request; a failed page is reported and skipped so the
            # remaining pages are still processed.
            try:
                r = requests.get(url=url, headers=self.headers,
                                 timeout=self.timeout)
            except requests.RequestException as exc:
                print('%s请求失败: %s (%s)' % (self.name, url, exc))
                continue
            # Hand the response body to the parse threads.
            self.data_queue.put(r.text)
        print('%s结束......' % self.name)


class ParseThread(threading.Thread):
    """Worker that parses page bodies from ``data_queue`` and writes results.

    Args:
        name: Thread name, used in the progress messages.
        data_queue: Queue of page bodies; a ``_STOP`` sentinel ends the thread.
        fp: Open text file shared by all parse threads.
        lock: Lock guarding writes to ``fp``.
    """

    def __init__(self, name, data_queue, fp, lock):
        super().__init__()
        self.name = name
        self.data_queue = data_queue
        self.fp = fp
        self.lock = lock

    def parse_content(self, data):
        """Extract title/image-URL pairs from one page and append them to ``fp``.

        Each record is written as one JSON object per line with the keys
        ``标题`` (title) and ``链接`` (link).

        Args:
            data: HTML text of one listing page.
        """
        tree = etree.HTML(data)
        if tree is None:
            # Nothing parseable in this page.
            return
        # Find every <li> first, then look up title and src *inside* that li.
        items = []
        for li in tree.xpath('//ul[@class="cont-list"]/li'):
            # The leading "." makes the query relative to this <li>; a bare
            # "//" would search the whole document and return the first
            # title/image of the page for every entry.
            titles = li.xpath('.//h2/a/text()')
            img_urls = li.xpath('.//div[@class="cont-list-main"]/p/img/@data-src')
            if not titles or not img_urls:
                # Skip entries missing either field instead of raising
                # IndexError and killing the thread.
                continue
            items.append({'标题': titles[0], '链接': img_urls[0]})
        if not items:
            return
        # Serialise outside the lock; hold it only for the actual write.
        lines = [json.dumps(item, ensure_ascii=False) + '\n' for item in items]
        with self.lock:
            self.fp.writelines(lines)

    def run(self):
        """Parse queued pages until the ``_STOP`` sentinel is received."""
        print('%s启动......' % self.name)
        while True:
            # Take one page of data from data_queue (blocks until available).
            data = self.data_queue.get()
            if data is _STOP:
                break
            # Parse the content.
            self.parse_content(data)
        print('%s结束......' % self.name)


def create_crawl_thread(page_queue, data_queue):
    """Create three crawl threads and append them to ``crawl_thread_list``.

    Args:
        page_queue: Queue of page numbers shared by the crawlers.
        data_queue: Queue that receives the downloaded page bodies.
    """
    crawl_name = ['采集1号', '采集2号', '采集3号']
    for name in crawl_name:
        crawl_thread_list.append(CrawlThread(name, page_queue, data_queue))


def create_parse_thread(data_queue, fp, lock):
    """Create three parse threads and append them to ``parse_thread_list``.

    Args:
        data_queue: Queue of page bodies to parse.
        fp: Open text file the parsers write to.
        lock: Lock guarding writes to ``fp``.
    """
    parse_name = ['解析1号', '解析2号', '解析3号']
    for name in parse_name:
        parse_thread_list.append(ParseThread(name, data_queue, fp, lock))


def main():
    """Crawl pages 1-5 with three crawlers and three parsers into jiantu.txt."""
    # Create the queues.
    page_queue, data_queue = create_queue()
    # Create the lock protecting the output file.
    lock = threading.Lock()
    # The with-block closes the file even if a step below raises.
    with open('jiantu.txt', 'a', encoding='utf8') as fp:
        create_crawl_thread(page_queue, data_queue)
        create_parse_thread(data_queue, fp, lock)
        # Start the crawl threads, then the parse threads.
        for t in crawl_thread_list:
            t.start()
        for t in parse_thread_list:
            t.start()
        # Wait until every page has been downloaded.
        for t in crawl_thread_list:
            t.join()
        # No more data will arrive: queue one stop marker per parser so each
        # one drains the remaining pages and then exits.
        for _ in parse_thread_list:
            data_queue.put(_STOP)
        for t in parse_thread_list:
            t.join()
    print('主线程执行完毕!')


if __name__ == '__main__':
    main()