【Crawler】Scraping Qiushibaike with multiple threads and writing the results to a file
'''
Scrape jokes from Qiushibaike: collect each post's content and link,
and write them to a CSV file.
Techniques used: multithreading, locks, queues, XPath, csv.
'''
import csv
import threading
from queue import Queue, Empty

import requests
from lxml import etree


class Creeper(threading.Thread):
    """Producer: fetches list pages and pushes (content, link) tuples."""

    def __init__(self, url_queue, content_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.url_queue = url_queue
        self.content_queue = content_queue

    def run(self):
        while True:
            try:
                # Non-blocking get: a plain get() after an empty() check can
                # hang forever when another crawler thread takes the last URL
                # between the check and the get.
                url = self.url_queue.get(block=False)
            except Empty:
                break
            self.parse_page(url)

    def parse_page(self, url):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/72.0.3626.109 Safari/537.36"
        }
        response = requests.get(url, headers=headers)
        text = etree.HTML(response.text)
        div_elements = text.xpath('//div[contains(@class,"article block")]')
        for div in div_elements:
            # "contentHerf" is the class name as it appears in the site's markup
            content = div.xpath('.//a[@class="contentHerf"]//span[1]//text()')
            new_content = "\n".join(x.replace('\n', '') for x in content)
            a_url = "https://www.qiushibaike.com" + div.xpath('.//a[@class="contentHerf"]/@href')[0]
            self.content_queue.put((new_content, a_url))


class SaveFile(threading.Thread):
    """Consumer: pops (content, link) tuples and appends them to the CSV file."""

    def __init__(self, content_queue, writer, lock, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.content_queue = content_queue
        self.writer = writer
        self.lock = lock

    def run(self):
        while True:
            try:
                # Exit once no new item has arrived within the timeout.
                content, link = self.content_queue.get(timeout=30)
            except Empty:
                break
            # Writes to the shared csv.writer must hold the lock.
            with self.lock:
                self.writer.writerow((content, link))
            print('saved one row')


def main():
    url_queue = Queue(100)
    content_queue = Queue(300)
    base_url = "https://www.qiushibaike.com/text/page/{}/"
    gLock = threading.Lock()
    # utf-8-sig keeps Chinese text readable when the CSV is opened in Excel.
    f = open('糗事百科.csv', 'a', encoding='utf-8-sig', newline='')
    writer = csv.writer(f)
    writer.writerow(['content', 'link'])
    for i in range(1, 13):
        url_queue.put(base_url.format(i))
    threads = []
    for _ in range(2):
        c = Creeper(url_queue, content_queue)
        c.start()
        threads.append(c)
    for _ in range(2):
        s = SaveFile(content_queue, writer, gLock)
        s.start()
        threads.append(s)
    # Wait for all threads so the file can be closed safely.
    for t in threads:
        t.join()
    f.close()


if __name__ == '__main__':
    main()
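The script above shuts down its saver threads by letting Queue.get time out after 30 seconds of inactivity. A common alternative is the sentinel ("poison pill") pattern: once the producers are done, the main thread puts one sentinel per consumer on the queue, and each consumer exits as soon as it sees one. Below is a minimal, self-contained sketch of that pattern; the producer/consumer names and the fake work items are illustrative, not part of the original script.

import threading
from queue import Queue

SENTINEL = None  # marker telling a consumer to stop


def producer(out_queue, items):
    # Put real work items on the queue.
    for item in items:
        out_queue.put(item)


def consumer(in_queue):
    while True:
        item = in_queue.get()
        if item is SENTINEL:
            break  # producer is finished; exit cleanly
        print('processing', item)


if __name__ == '__main__':
    q = Queue()
    workers = [threading.Thread(target=consumer, args=(q,)) for _ in range(2)]
    for w in workers:
        w.start()
    producer(q, range(10))
    for _ in workers:
        q.put(SENTINEL)  # one sentinel per consumer
    for w in workers:
        w.join()

Compared with the timeout approach, sentinels end the consumers immediately after the last item is processed instead of keeping the program alive for the full timeout, at the cost of the main thread needing to know how many consumers there are.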