# Multi-threaded crawler (multiprocessing.dummy thread pool) using lxml etree and XPath

import threading
import time
# multiprocessing.dummy.Pool exposes the multiprocessing.Pool API on top of threads
from multiprocessing.dummy import Pool
from pprint import pprint
from queue import Empty, Queue

import requests
from lxml import etree

class QuiShi:
    """Crawl joke listing pages concurrently and pretty-print the entries.

    URLs are placed on ``self.query`` by :meth:`get_url_list`. Pool tasks take
    one URL each, download it, extract the entries with XPath and pass them to
    :meth:`save_content_list`. ``req_num`` counts queued URLs and ``resp_num``
    counts URLs that have been handled (successfully or not); :meth:`run`
    returns once the two match.
    """

    def __init__(self):
        self.temp_url = "http://www.lovehhy.net/Joke/Detail/QSBK/{0}"
        self.headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"}
        # 1. Queue of URLs waiting to be fetched.
        self.query = Queue()
        # 2. Worker pool (multiprocessing.dummy.Pool runs tasks in threads).
        self.pool = Pool(10)
        # 3. Termination bookkeeping.
        self.is_run = True
        self.req_num = 0
        self.resp_num = 0
        # resp_num is incremented from several worker threads; "+=" is a
        # read-modify-write, so it is serialised with a lock.
        self._count_lock = threading.Lock()

    def get_url_list(self, start=1, stop=10):
        """Queue the page URLs for page numbers ``start`` .. ``stop - 1``.

        The defaults reproduce the original fixed range (pages 1 to 9).
        """
        for page in range(start, stop):
            self.query.put(self.temp_url.format(page))
            self.req_num += 1

    def parse_url(self, url, timeout=10):
        """Download ``url`` and return the body decoded as GBK text.

        ``timeout`` (seconds) is passed to ``requests.get`` so that a stalled
        server cannot block a worker forever.
        """
        response = requests.get(url, headers=self.headers, timeout=timeout)
        return response.content.decode("gbk")

    def get_content_list(self, html_str):
        """Extract the entries from a page.

        Returns a list of dicts with the keys ``title``, ``title_href`` and
        ``content``; each value is the list produced by the corresponding
        XPath query. Returns an empty list for an empty document.
        """
        # The original called .strip(""), which strips nothing; strip
        # surrounding whitespace as was evidently intended.
        html_str = html_str.replace("<br />", "").strip()
        if not html_str:
            return []
        # etree.HTML builds an element tree from the markup.
        html = etree.HTML(html_str)
        if html is None:
            return []
        content_list = []
        for h3 in html.xpath('//div[@id="footzoon"]/h3'):
            # NOTE(review): following-sibling::div selects *every* later
            # sibling div, not only the one right after this h3. If the page
            # is laid out as h3/div pairs, div[1] is probably intended -
            # confirm against the real markup before changing.
            item = {
                "title": h3.xpath("./a/text()"),
                "title_href": h3.xpath("./a/@href"),
                "content": [
                    text.replace("\u3000", "")
                    for text in h3.xpath('./following-sibling::div/text()')
                ],
            }
            content_list.append(item)
        return content_list

    def save_content_list(self, contents):
        """Output the extracted entries (currently pretty-printed to stdout)."""
        pprint(contents)

    def _url_parse_content_save(self):
        """Pool task: take one URL from the queue, fetch, parse and save it.

        Returns immediately when the queue is already empty, so a surplus
        task can never block a worker thread.
        """
        try:
            url = self.query.get_nowait()
        except Empty:
            return
        try:
            html_str = self.parse_url(url)
            contents = self.get_content_list(html_str)
            self.save_content_list(contents)
        finally:
            # Count the URL as handled even when it failed; otherwise run()
            # would wait forever for a response that will never arrive.
            with self._count_lock:
                self.resp_num += 1

    def _submit(self):
        """Schedule one fetch task on the pool."""
        # Pass the callables themselves. The original passed the *results* of
        # calling them, which ran everything synchronously and deadlocked.
        self.pool.apply_async(
            self._url_parse_content_save,
            callback=self._callback,
            error_callback=self._errback,
        )

    def _callback(self, _result=None):
        """Completion hook: schedule another task while URLs remain.

        ``_result`` is the finished task's return value supplied by the pool;
        it is unused.
        """
        if self.is_run and not self.query.empty():
            self._submit()

    def _errback(self, exc):
        """Failure hook: report the error and keep the task chain going."""
        print("request failed: {!r}".format(exc))
        self._callback()

    def run(self):
        """Queue the URLs, start the workers and block until all are handled."""
        self.is_run = True
        self.get_url_list()
        # Start three task chains; each finished task schedules its successor.
        for _ in range(3):
            self._submit()

        while self.resp_num < self.req_num:
            time.sleep(0.001)
        self.is_run = False

if __name__ == '__main__':
    # Time the whole crawl. perf_counter() is a monotonic, high-resolution
    # clock intended for measuring intervals; time.time() follows the wall
    # clock and can jump if the system time is adjusted mid-run.
    started = time.perf_counter()
    quishi = QuiShi()
    quishi.run()
    print(time.perf_counter() - started)
# Originally posted 2019-11-21 18:44 by 风筝老师