# 爬虫基础 - 爬取某神坛图片 (Crawler basics: scrape images from a forum)
# 线程池方式 (thread-pool version)
import requests
from lxml import etree
from bs4 import BeautifulSoup
from lxml.html import tostring
import time
import os
import re
from multiprocessing.dummy import Pool
def get_title_list(Max_page_Num):
    """Collect thread metadata from the forum index pages.

    Iterates pages 1 .. Max_page_Num-1 (the upper bound is exclusive,
    matching the original behavior), parses each thread row and returns
    a list of dicts with keys 'name', 'dir', 'src' and 'pageNum'.

    Relies on module-level globals: page_url, server, Download_dir, headers.
    Returns an empty list when every page request fails.
    """
    list_title = []
    for pageNum in range(1, Max_page_Num):
        url = page_url.format(Num=pageNum)
        try:
            # FIX: a finite timeout makes the "页面请求超时" branch actually
            # reachable — without it requests.get can block forever.
            response = requests.get(url=url, headers=headers, timeout=10)
            if response.status_code == 200:
                print('<第', pageNum, '页>获取页面信息完成')
                response.encoding = 'gbk'
                tree = etree.HTML(response.text)
                # Page 1 carries 9 pinned/announcement rows at the top; skip them.
                if pageNum == 1:
                    title_list = tree.xpath('//div[@id="main"]//table[@id="ajaxtable"]//tr[@class="tr3 t_one tac"][position()>9]')
                else:
                    title_list = tree.xpath('//div[@id="main"]//table[@id="ajaxtable"]//tr[@class="tr3 t_one tac"]')
                for tr in title_list:
                    # Category label with surrounding whitespace collapsed.
                    f_name = tr.xpath('normalize-space(./td[@class="tal"]/text())')
                    # Thread title text.
                    e_name = tr.xpath('./td[@class="tal"]/h3//text()')[0]
                    # Combined like: [真心话系列]我爱你中国
                    title_name = f_name + e_name
                    # Absolute URL of the thread page.
                    title_src = server + tr.xpath('./td[@class="tal"]//a/@href[1]')[0]
                    title_dir = Download_dir + title_name
                    dic_title = {
                        'name': title_name,
                        'dir': title_dir,
                        'src': title_src,
                        'pageNum': pageNum
                    }
                    list_title.append(dic_title)
        except Exception as re_index:
            print(re_index, '页面请求超时', '链接地址: ', url)
    return list_title
def get_img_list(dic_title):
    """Parse one thread page and return a list of image-info dicts.

    dic_title: dict with keys 'name', 'dir', 'src', 'pageNum'
               (as produced by get_title_list).
    Returns a list of {'name', 'src', 'path'} dicts — ALWAYS a list,
    never None, so callers can pass the result straight to pool.map.
    Creates the per-thread download directory as a side effect.
    Relies on the module-level global: headers.
    """
    title_name = dic_title['name']
    title_dir = dic_title['dir']
    title_src = dic_title['src']
    pageNum = dic_title['pageNum']
    if not os.path.exists(title_dir):
        os.makedirs(title_dir)
    print('获取标题完成: ', '<第', pageNum, '页>', title_name, '链接地址: ', title_src)
    # List of per-image info dicts built below.
    list_img = []
    try:
        # FIX: finite timeout so the except branch is reachable on a hang.
        parse_title_url = requests.get(url=title_src, headers=headers, timeout=10)
        parse_title_url.encoding = 'gbk'
        title_tree = etree.HTML(parse_title_url.text)
        img_list = title_tree.xpath('//div[@class="t t2"][1]//td[@valign="top"]//div[@class="tpc_content do_not_catch"]/img')
        for img in img_list:
            # Lazy-loaded image URL lives in the ess-data attribute.
            img_src = img.xpath('./@ess-data')[0]
            img_name = img_src.split('/')[-1]
            img_path = title_dir + '/' + img_name
            list_img.append({
                'name': img_name,
                'src': img_src,
                'path': img_path
            })
        if list_img:
            print(title_name, '图片信息解析完成')
    except Exception as re_title:
        print(re_title, '<', title_name, '>', '标题链接请求超时', '链接地址: ', title_src)
    # FIX: previously an exception path implicitly returned None, which
    # later crashed pool.map(download_img, None) in the caller.
    return list_img
def download_img(dic_img):
    """Download a single image to disk, skipping files that already exist.

    dic_img: dict with keys 'name', 'src', 'path' (from get_img_list).
    Network/IO errors are logged and swallowed so one bad image does not
    abort the whole pool.map batch. Relies on the module-level global: headers.
    """
    img_name = dic_img['name']
    img_src = dic_img['src']
    img_path = dic_img['path']
    if not os.path.exists(img_path):
        try:
            print(img_name, '开始下载')
            # FIX: finite timeout so a stalled connection cannot hang a worker.
            img_response = requests.get(url=img_src, headers=headers, timeout=30).content
            # FIX: context manager guarantees the file handle is closed
            # even if the write raises.
            with open(img_path, 'wb') as fp:
                fp.write(img_response)
            print(img_src, '下载完成', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        except Exception as re_img:
            print(re_img, img_src, '请求失败')
    else:
        print(img_name, '已存在,跳过下载', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
if __name__ == "__main__":
page_url = 'https://cl.fs55.xyz/thread0806.php?fid=16&page={Num}'
# page_url = 'https://cl.fs55.xyz/thread0806.php?fid=16&search=digest&page={Num}'
server = 'https://cl.fs55.xyz/'
Max_page_Num = 3
Download_dir = 'K:/Download/CL_Images/'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
title_list = get_title_list(Max_page_Num)
for dic_title in title_list:
img_list = get_img_list(dic_title)
# 多线程模式
pool = Pool(5)
pool.map(download_img, img_list)
pool.close()
pool.join()
# 协程方式 (coroutine / asyncio version)
import requests
from lxml import etree
from bs4 import BeautifulSoup
from lxml.html import tostring
import time
import os
import re
import asyncio
import aiohttp
def get_title_list(Max_page_Num):
    """Collect thread metadata from the forum index pages (asyncio variant's
    synchronous crawl step).

    Iterates pages 1 .. Max_page_Num-1 (upper bound exclusive, matching the
    original behavior), sleeping 2 s between pages to throttle requests.
    Returns a list of dicts with keys 'name', 'dir', 'src', 'pageNum'.

    Relies on module-level globals: page_url, server, Download_dir, headers.
    Returns an empty list when every page request fails.
    """
    list_title = []
    for pageNum in range(1, Max_page_Num):
        url = page_url.format(Num=pageNum)
        try:
            # FIX: a finite timeout makes the "页面请求超时" branch actually
            # reachable — without it requests.get can block forever.
            response = requests.get(url=url, headers=headers, timeout=10)
            if response.status_code == 200:
                print('<第', pageNum, '页>获取页面信息完成')
                response.encoding = 'gbk'
                tree = etree.HTML(response.text)
                # Page 1 carries 9 pinned/announcement rows at the top; skip them.
                if pageNum == 1:
                    title_list = tree.xpath('//div[@id="main"]//table[@id="ajaxtable"]//tr[@class="tr3 t_one tac"][position()>9]')
                else:
                    title_list = tree.xpath('//div[@id="main"]//table[@id="ajaxtable"]//tr[@class="tr3 t_one tac"]')
                for tr in title_list:
                    # Category label with surrounding whitespace collapsed.
                    f_name = tr.xpath('normalize-space(./td[@class="tal"]/text())')
                    # Thread title text.
                    e_name = tr.xpath('./td[@class="tal"]/h3//text()')[0]
                    # Combined like: [真心话系列]我爱你中国
                    title_name = f_name + e_name
                    # Absolute URL of the thread page.
                    title_src = server + tr.xpath('./td[@class="tal"]//a/@href[1]')[0]
                    title_dir = Download_dir + title_name
                    dic_title = {
                        'name': title_name,
                        'dir': title_dir,
                        'src': title_src,
                        'pageNum': pageNum
                    }
                    list_title.append(dic_title)
            # Throttle between index pages.
            time.sleep(2)
        except Exception as re_index:
            print(re_index, '页面请求超时', '链接地址: ', url)
    # FIX: previously `if list_title: return list_title` returned None for an
    # empty result, which crashed the caller's `for dic_title in title_list`.
    return list_title
def get_img_list(dic_title):
    """Parse one thread page and return a list of image-info dicts.

    dic_title: dict with keys 'name', 'dir', 'src', 'pageNum'
               (as produced by get_title_list).
    Returns a list of {'name', 'src', 'path'} dicts — ALWAYS a list,
    never None, so the caller's `if not img_list` check works uniformly.
    Creates the per-thread download directory as a side effect.
    Relies on the module-level global: headers.
    """
    title_name = dic_title['name']
    title_dir = dic_title['dir']
    title_src = dic_title['src']
    pageNum = dic_title['pageNum']
    if not os.path.exists(title_dir):
        os.makedirs(title_dir)
    print('标题信息: ', '<第', pageNum, '页>', title_name, '链接地址: ', title_src)
    # List of per-image info dicts built below.
    list_img = []
    try:
        # FIX: finite timeout so the except branch is reachable on a hang.
        parse_title_url = requests.get(url=title_src, headers=headers, timeout=10)
        parse_title_url.encoding = 'gbk'
        title_tree = etree.HTML(parse_title_url.text)
        img_list = title_tree.xpath('//div[@class="t t2"][1]//td[@valign="top"]//div[@class="tpc_content do_not_catch"]/img')
        for img in img_list:
            # Lazy-loaded image URL lives in the ess-data attribute.
            img_src = img.xpath('./@ess-data')[0]
            img_name = img_src.split('/')[-1]
            img_path = title_dir + '/' + img_name
            list_img.append({
                'name': img_name,
                'src': img_src,
                'path': img_path
            })
        if list_img:
            print(title_name, '图片信息解析完成')
    except Exception as re_title:
        print(re_title, '<', title_name, '>', '标题链接请求超时', '链接地址: ', title_src)
    # FIX: exception path previously returned None implicitly.
    return list_img
async def download_img(dic_img):
    """Asynchronously download a single image, skipping existing files.

    dic_img: dict with keys 'name', 'src', 'path' (from get_img_list).
    Errors are logged and swallowed — consistent with the thread-pool
    variant — so one failed download does not surface as an unhandled
    task exception and kill the batch.
    Relies on the module-level global: headers.
    """
    img_name = dic_img['name']
    img_src = dic_img['src']
    img_path = dic_img['path']
    if os.path.exists(img_path):
        print(img_name, '已存在,跳过下载', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        return
    print(img_name, '开始下载')
    try:
        async with aiohttp.ClientSession() as session:
            # FIX: session.get() is itself an async context manager —
            # the extra `await` before `async with` was redundant.
            async with session.get(url=img_src, headers=headers) as response:
                img_response = await response.read()
        # FIX: context manager guarantees the file handle is closed
        # even if the write raises.
        with open(img_path, 'wb') as fp:
            fp.write(img_response)
        print(img_src, '下载完成', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    except Exception as re_img:
        print(re_img, img_src, '请求失败')
if __name__ == "__main__":
# page_url = 'https://cl.fs55.xyz/thread0806.php?fid=16&page=%d'
page_url = 'https://cl.fs55.xyz/thread0806.php?fid=16&search=digest&page={Num}'
server = 'https://cl.fs55.xyz/'
Max_page_Num = 50
Download_dir = 'K:/Download/CL_Images/'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
title_list = get_title_list(Max_page_Num)
for dic_title in title_list:
img_list = get_img_list(dic_title)
if not img_list:
print('任务列表为空,跳过任务')
continue
# 协程模式
tasks = []
for dic_img in img_list:
c = download_img(dic_img)
task = asyncio.ensure_future(c)
tasks.append(task)
# 创建一个事件循环对象
loop = asyncio.get_event_loop()
# 将协程对象注册到loop中,然后启动loop-需要将任务列表封装到wait中
loop.run_until_complete(asyncio.wait(tasks))