Python Web Crawler (Thread Pool)
Data crawling with the multiprocessing.dummy thread pool
1. Project example (comparing synchronous and asynchronous efficiency)
Create a test server with Flask:
from flask import Flask
from time import sleep

app = Flask(__name__)

@app.route('/bobo')
def index1():
    sleep(2)
    return 'hello bobo!'

@app.route('/jay')
def index2():
    sleep(2)
    return 'hello jay!'

@app.route('/tom')
def index3():
    sleep(2)
    return 'hello tom!'

app.run()
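Note: for the thread-pool test below to finish in about 2 s, the server must be able to handle requests concurrently. Flask's development server has been threaded by default since Flask 1.0; on older versions you can enable it explicitly:

app.run(threaded=True)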
Crawling without a thread pool (synchronous fetching):
import time
import requests

# a simple UA header; any browser User-Agent works here
headers = {
    'User-Agent': 'Mozilla/5.0'
}

start = time.time()
urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/tom',
]
for url in urls:
    page_text = requests.get(url, headers=headers).text
    print(page_text)
print(time.time() - start)
Output (roughly 3 × 2 s ≈ 6 s, since the three requests run one after another):
hello bobo!
hello jay!
hello tom!
6.016878366470337
Using a thread pool (multiprocessing.dummy):
import requests
import time
from multiprocessing.dummy import Pool  # thread pool module

headers = {
    'User-Agent': 'Mozilla/5.0'
}

# the mapped function must take exactly one parameter
def my_requests(url):
    return requests.get(url=url, headers=headers).text

start = time.time()
urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/tom',
]

pool = Pool(3)
# map takes two arguments:
#   arg 1: a custom function that accepts exactly one parameter
#   arg 2: a list (or other iterable)
# map applies the function from arg 1 to every element of arg 2 asynchronously
page_texts = pool.map(my_requests, urls)
print(page_texts)
print(time.time() - start)

# Output:
# ['hello bobo!', 'hello jay!', 'hello tom!']
# 2.0126171112060547
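On Python 3, Pool also supports the context-manager protocol, which takes care of closing the pool. A minimal sketch of the same test (assuming the Flask server above is running):

from multiprocessing.dummy import Pool
import requests

urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/tom',
]

# the with-block closes the pool when the work is done;
# map blocks until every result is ready and preserves input order
with Pool(3) as pool:
    page_texts = pool.map(lambda u: requests.get(u).text, urls)
print(page_texts)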
2. Project example (Pearvideo)
Goal: crawl video data from Pearvideo and measure how long the crawl takes. URL: https://www.pearvideo.com/
2.1 Plain (synchronous) crawl:
import requests
import random
from lxml import etree
import re
from fake_useragent import UserAgent
# install fake-useragent first: pip install fake-useragent
url = 'http://www.pearvideo.com/category_1'

# generate a random UA; if this raises an error, try the parameters below:
# ua = UserAgent(verify_ssl=False, use_cache_server=False).random
# disable the server cache:
# ua = UserAgent(use_cache_server=False)
# do not cache data locally:
# ua = UserAgent(cache=False)
# skip SSL verification:
# ua = UserAgent(verify_ssl=False)
ua = UserAgent().random
headers = {
    'User-Agent': ua
}
# fetch the listing page
page_text = requests.get(url=url, headers=headers).text
# parse the detail-page link for each video out of the listing page
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@id="listvideoList"]/ul/li')
detail_urls = []
for li in li_list:
    detail_url = 'http://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    title = li.xpath('.//div[@class="vervideo-title"]/text()')[0]
    detail_urls.append(detail_url)
# fetch each detail page and download the video it embeds
for url in detail_urls:
    page_text = requests.get(url=url, headers=headers).text
    video_url = re.findall('srcUrl="(.*?)"', page_text, re.S)[0]
    data = requests.get(url=video_url, headers=headers).content
    fileName = str(random.randint(1, 10000)) + '.mp4'  # random video file name
    with open(fileName, 'wb') as fp:
        fp.write(data)
    print(fileName + ' is over')
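Note that re.findall(...)[0] raises IndexError whenever the page layout changes and srcUrl is absent. A defensive variant (extract_src_url is my own helper, not part of the original code):

import re

def extract_src_url(page_text):
    # return None instead of raising when srcUrl is missing
    match = re.search('srcUrl="(.*?)"', page_text, re.S)
    return match.group(1) if match else None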
2.2 Thread-pool crawl:
import requests
import random
from lxml import etree
import re
from fake_useragent import UserAgent  # install with: pip install fake-useragent
# import the thread pool module
from multiprocessing.dummy import Pool

# instantiate a thread pool object
pool = Pool()

url = 'http://www.pearvideo.com/category_1'

# generate a random UA
ua = UserAgent().random
headers = {
    'User-Agent': ua
}

# fetch the listing page
page_text = requests.get(url=url, headers=headers).text

# parse the detail-page link for each video out of the listing page
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@id="listvideoList"]/ul/li')

detail_urls = []  # holds the second-level (detail) page URLs
for li in li_list:
    detail_url = 'http://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    title = li.xpath('.//div[@class="vervideo-title"]/text()')[0]
    detail_urls.append(detail_url)

video_urls = []  # holds the video URLs
for url in detail_urls:
    page_text = requests.get(url=url, headers=headers).text
    video_url = re.findall('srcUrl="(.*?)"', page_text, re.S)[0]
    video_urls.append(video_url)

# the save function must be defined before the pool maps over it
def save(data):
    fileName = str(random.randint(1, 10000)) + '.mp4'
    with open(fileName, 'wb') as fp:
        fp.write(data)
    print(fileName + ' saved')

# download the video data with the thread pool
func_request = lambda link: requests.get(url=link, headers=headers).content
video_data_list = pool.map(func_request, video_urls)

# save the video data with the thread pool
func_saveData = lambda data: save(data)
pool.map(func_saveData, video_data_list)

pool.close()
pool.join()
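A possible refactor (a sketch, not from the original): mapping a single download-and-save worker over the video URLs writes each video to disk as soon as it arrives, so the script never holds every video's bytes in memory at once. It reuses pool, headers, and video_urls from the script above:

def download_and_save(video_url):
    # download one video and write it to disk immediately
    data = requests.get(url=video_url, headers=headers).content
    fileName = str(random.randint(1, 10000)) + '.mp4'
    with open(fileName, 'wb') as fp:
        fp.write(data)
    print(fileName + ' saved')

pool.map(download_and_save, video_urls)
pool.close()
pool.join()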
3. Project example (Pearvideo with xpath and re)
from lxml import etree
import requests
import re
import os
from uuid import uuid4
import random

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3573.0 Safari/537.36"
}

# proxy IP pool
all_ips = []
# 代理精灵 (a proxy provider) API endpoint
api_url = "http://ip.11jsq.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=1&fa=0&fetch_key=&groupid=0&qty=20&time=1&pro=&city=&port=1&format=html&ss=5&css=&dt=1&specialTxt=3&specialJson="
page_text = requests.get(api_url, headers=headers).text
tree = etree.HTML(page_text)
# extract the IP values
ip_list = tree.xpath('//body//text()')
# store each IP in the proxy pool
for ip in ip_list:
    ip_dict = {'https': ip}
    all_ips.append(ip_dict)

# automotive news category URL
url = "https://www.pearvideo.com/category_31"
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath("//div[@class='category-top']/div/ul/li | //*[@id='categoryList']/li")
for li in li_list:
    # video detail page
    img_url = "https://www.pearvideo.com/" + li.xpath("./div/a/@href")[0]
    # print(img_url)
    video_page_text = requests.get(img_url, headers=headers).text
    # extract the video title
    ex_title = 'class="video-tt-box".*?video-tt">(.*?)</h1>'
    pa = re.compile(ex_title, re.S)
    video_title = pa.findall(video_page_text)[0]
    # regex out the direct video URL
    ex = ',srcUrl="(.*?)",'
    video_src = re.findall(ex, video_page_text, re.S)[0]
    print(video_src)
    if not os.path.exists("lishipin"):
        os.mkdir("lishipin")
    # video save path
    filename = f"{uuid4()}.mp4"
    file_path = "lishipin/" + filename
    video_content = requests.get(url=video_src, headers=headers, proxies=random.choice(all_ips)).content
    # print(video_content)  # debug only: dumps raw bytes, left disabled
    with open(file_path, "wb") as fp:
        fp.write(video_content)
    print(video_title, " downloaded!")
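The script uses whatever proxies the API returns, but in practice some of them will be dead. A minimal sketch that filters the pool first (check_proxy is a hypothetical helper, not part of the original):

def check_proxy(proxy_dict):
    # hypothetical helper: keep only proxies that answer within 5 seconds
    try:
        requests.get("https://www.pearvideo.com", headers=headers,
                     proxies=proxy_dict, timeout=5)
        return True
    except requests.exceptions.RequestException:
        return False

all_ips = [p for p in all_ips if check_proxy(p)]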