import json
from queue import Queue, Empty
from threading import Lock, Thread

import requests
from lxml import etree
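# Producer/consumer pipeline: the main thread fills queue_url with page
# numbers, CrawlThread workers download each page and push the raw HTML
# onto queue_html, and ParseThread workers extract each post and append
# it to the output file as one JSON object per line.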
url = 'https://www.qiushibaike.com/text/page/%d/'
queue_url = Queue(13)   # page numbers waiting to be crawled
queue_html = Queue(13)  # (html, page) tuples waiting to be parsed
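# Browser-like request headers; without them the site is more likely to
# reject the request as coming from a bot.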
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.9'}
# Set by the main thread once every page has been fetched and parsed;
# tells the parser threads to exit their polling loop.
exitFlag = False
# Guards the shared output file so that records written by different
# parser threads cannot interleave.
write_lock = Lock()
class CrawlThread(Thread):
def __init__(self, queue, thread_id) -> None:
super().__init__()
self.queue = queue
self.thread_id = thread_id
    def run(self) -> None:
        print('------------- crawl thread %d started -------------' % self.thread_id)
        self.get_html()
        print('------------- crawl thread %d stopped -------------' % self.thread_id)
    def get_html(self):
        while True:
            try:
                # queue_url is filled before the workers start, so an empty
                # queue means there is no more work.
                page = self.queue.get(block=False)
            except Empty:
                break
            try:
                response = requests.get(url=url % page, headers=headers)
                response.encoding = 'utf-8'
                # Hand the raw HTML to the parser threads with its page number.
                queue_html.put((response.text, page))
                print('---------------- crawl thread %d fetched page %d ----------------' % (self.thread_id, page))
            except requests.RequestException as e:
                print('---------------- crawl thread %d failed on page %d: %s ----------------' % (self.thread_id, page, e))
            finally:
                # Mark the task done even on failure so queue_url.join()
                # cannot hang.
                self.queue.task_done()
class ParseThread(Thread):
def __init__(self, queue, thread_id, fp):
super().__init__()
self.queue = queue
self.thread_id = thread_id
self.fp = fp
    def run(self):
        print('------------- parse thread %d started -------------' % self.thread_id)
        self.parse_html()
        print('------------- parse thread %d stopped -------------' % self.thread_id)
    def parse_html(self):
        while True:
            # The main thread sets exitFlag after both queues are fully
            # processed, which is this loop's signal to stop.
            if exitFlag:
                break
            try:
                # A short timeout keeps the loop from busy-spinning while
                # the crawl threads are still filling the queue.
                html, page = self.queue.get(timeout=0.1)
            except Empty:
                continue
            try:
                tree = etree.HTML(html)
                # Every post sits in a div whose id starts with "qiushi_tag_".
                divs = tree.xpath('//div[contains(@id, "qiushi_tag_")]')
                for div in divs:
                    try:
                        content = div.xpath('.//div[@class="content"]/span/text()')[0].strip()
                        # upvote count
                        zan = div.xpath('.//span[@class="stats-vote"]/i/text()')[0].strip()
                        # comment count
                        comment = div.xpath('.//span[@class="stats-comments"]//i/text()')[0].strip()
                        # author name
                        author = div.xpath('.//div[@class="author clearfix"]//h2/text()')[0].strip()
                        item = {'author': author, 'zan': zan, 'comment': comment, 'content': content}
                        # Writes are serialized by the lock so records from
                        # different threads never interleave; one JSON object
                        # per line.
                        with write_lock:
                            json.dump(item, self.fp, ensure_ascii=False)
                            self.fp.write('\n')
                    except Exception:
                        print('---------- parse error on page %d ----------' % page)
                print('------------ parse thread %d parsed page %d ------------' % (self.thread_id, page))
            finally:
                # Mark the task done even if parsing failed, so
                # queue_html.join() cannot hang.
                self.queue.task_done()
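# Note: the XPath expressions in parse_html target the markup
# qiushibaike.com served when this script was written; if the site's
# HTML changes, these selectors will need updating.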
if __name__ == '__main__':
    # Enqueue the 13 page numbers to crawl.
    for i in range(13):
        queue_url.put(i + 1)
    # Start the download workers.
    for i in range(5):
        t = CrawlThread(queue_url, i)
        t.start()
    fp = open('./糗事百科.txt', mode='a', encoding='utf-8')
    # Start the parser workers.
    for i in range(3):
        t = ParseThread(queue_html, i, fp)
        t.start()
    # join() blocks until every queued task has been marked done, i.e.
    # every page has been downloaded and every HTML document parsed.
    queue_url.join()
    queue_html.join()
    exitFlag = True
    fp.close()
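# Expected output: ./糗事百科.txt ends up with one JSON object per line, e.g.
# {"author": "...", "zan": "1024", "comment": "12", "content": "..."}
# (values above are illustrative, not taken from a real run).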