# Multithreaded spider —— qiushi (qiushibaike.com)
import requests
from lxml import etree
import time
import threading
from queue import Queue
import random
class QiushiSpider(object):
    """Multithreaded spider for qiushibaike.com joke pages.

    Pipeline: url_queue -> fetch worker(s) -> response_queue ->
    parse worker -> data_queue -> save worker.  Each queue is joined
    at the end so the main thread blocks until all work is consumed.
    """

    # Proxies rotated per request; hoisted out of the fetch loop so the
    # list is not rebuilt on every iteration.
    PROXY_LIST = [
        {"http": '111.155.116.229:8123'},
        {"http": '61.135.217.7:80'},
    ]

    def __init__(self):
        self.base_url = 'https://www.qiushibaike.com/8hr/page/{}'
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3"}
        # Total number of items scraped across all pages.
        self.count = 0
        # Queue of page URLs waiting to be fetched.
        self.url_queue = Queue()
        # Queue of decoded HTML pages waiting to be parsed.
        self.response_queue = Queue()
        # Queue of parsed items (nicknames) waiting to be saved.
        self.data_queue = Queue()

    def get_url_list(self):
        """Enqueue the URLs for pages 1..13."""
        for page in range(1, 14):
            self.url_queue.put(self.base_url.format(page))

    def send_request(self):
        """Worker: fetch URLs from url_queue, push decoded HTML to response_queue.

        Bug fixes vs. the original:
        - keep the Response object so ``status_code`` can be checked
          (the original decoded to ``str`` first and then accessed
          ``.status_code``, which raises ``AttributeError``);
        - push into ``self.response_queue`` (the original had the typo
          ``self.reponse_queue``);
        - ``task_done`` runs in a ``finally`` so a request exception
          cannot leave ``url_queue.join()`` hanging forever.
        """
        while True:
            time.sleep(1)  # throttle: be polite to the server
            url = self.url_queue.get()
            print(url)
            proxy = random.choice(self.PROXY_LIST)
            try:
                response = requests.get(url, headers=self.headers, proxies=proxy)
                if response.status_code == 200:
                    self.response_queue.put(response.content.decode())
                else:
                    # Fetch failed: re-enqueue the URL for another attempt.
                    self.url_queue.put(url)
            except requests.RequestException:
                # Network/proxy error: retry this URL later.
                self.url_queue.put(url)
            finally:
                # Balance the get() above regardless of outcome.
                self.url_queue.task_done()

    def _parse_page(self, html_text):
        """Extract author nicknames from one HTML page and enqueue them."""
        html_data = etree.HTML(html_text)
        # Each joke lives in a child div of the #content-left container.
        for div in html_data.xpath('//div[@id="content-left"]/div'):
            nick_name = div.xpath('.//h2/text()')[0]
            print(nick_name)
            self.count += 1
            self.data_queue.put(nick_name)

    def anlysis_data(self, data=None):
        """Worker: parse HTML pages and push nicknames to data_queue.

        Bug fix vs. the original: this method is used as an argument-less
        thread target, so it must drain ``response_queue`` itself (the
        original required a ``data`` argument and raised ``TypeError``
        when the thread started).  If *data* (an HTML string) is passed
        explicitly, just that single page is parsed, preserving the old
        direct-call usage.
        """
        if data is not None:
            self._parse_page(data)
            return
        while True:
            html_text = self.response_queue.get()
            self._parse_page(html_text)
            self.response_queue.task_done()

    def save_data(self, data=None):
        """Worker: consume parsed items from data_queue and "save" them.

        *data* is accepted (and ignored) for backward compatibility with
        the original signature; this method always drains the queue.
        """
        while True:
            nick_name = self.data_queue.get()
            print(nick_name)
            self.data_queue.task_done()

    def work_spider(self):
        """Start all worker threads and block until every queue drains."""
        th_list = []
        # URL producer.
        th_list.append(threading.Thread(target=self.get_url_list))
        # Fetch worker(s); range(1, 2) yields exactly one — raise the upper
        # bound for parallel fetching.
        for _ in range(1, 2):
            th_list.append(threading.Thread(target=self.send_request))
        # Parse worker.
        th_list.append(threading.Thread(target=self.anlysis_data))
        # Save worker.
        th_list.append(threading.Thread(target=self.save_data))
        for th in th_list:
            # Daemon threads die with the main thread once the queues join.
            # (setDaemon() is deprecated in favor of the .daemon attribute.)
            th.daemon = True
            th.start()
        # Block the main thread until all queued work is marked done.
        for qu in (self.url_queue, self.response_queue, self.data_queue):
            qu.join()

    def run(self):
        """Run the spider and report elapsed time and item count."""
        start_time = time.time()
        self.work_spider()
        end_time = time.time()
        print('总共耗时:{}'.format(end_time - start_time))
        print('总个数:{}'.format(self.count))
if __name__ == '__main__':
    # Script entry point: build the spider and kick off the full run.
    spider = QiushiSpider()
    spider.run()