Crawler Basics: Scraping Novels from a Certain Forum
Thread pool approach
import requests
from lxml import etree
from bs4 import BeautifulSoup
import time
import os
from multiprocessing.dummy import Pool
def get_book_list(pageNum):
    url = page_url.format(Num=pageNum)
    try:
        response = requests.get(url=url, headers=headers)
        response.encoding = 'gbk'
        tree = etree.HTML(response.text)
        book_list = []
        # On the first page the first 6 tr rows are board rules; the actual novels start from the 7th row
        if pageNum == 1:
            tr_list = tree.xpath('//div[@id="main"]/div[@class="t"][2]//tr[@class="tr3 t_one tac"][position()>6]')
        else:
            tr_list = tree.xpath('//div[@id="main"]/div[@class="t"][2]//tr[@class="tr3 t_one tac"]')
        for tr in tr_list:
            # Extract the category label and strip the surrounding whitespace
            f_name = tr.xpath('normalize-space(./td[@class="tal"]/text())')
            # Extract the novel title
            e_name = tr.xpath('./td[@class="tal"]/h3//text()')[0]
            # Combined result looks like: [真心话系列]我爱你中国.txt
            title_name = f_name + e_name + '.txt'
            # Grab the link to the novel's thread
            title_src = server + tr.xpath('./td[@class="tal"]//a/@href')[0]
            title_dic = {
                'name': title_name,
                'src': title_src
            }
            book_list.append(title_dic)
        return book_list
    except Exception as re_page:
        print(re_page, 'failed to fetch the listing page', url)
        # Return an empty list so pool.map() never receives None
        return []
def download_ebook(book_dic):
    src = book_dic['src']
    name = book_dic['name']
    # Fetch and parse the novel's thread page, then save the text
    try:
        detail_url = requests.get(url=src, headers=headers)
        detail_url.encoding = 'gbk'
        # BeautifulSoup (bs4) approach
        soup = BeautifulSoup(detail_url.text, 'lxml')
        # Select the div whose class is "tpc_content do_not_catch"; in a CSS selector the space between class names becomes a dot
        div = soup.select('div.tpc_content.do_not_catch')[0]
        # Turn <br/> tags into newlines: cast the tag to a string first, then replace
        content = str(div).replace('<br/>', '\n')
        book = BeautifulSoup(content, 'lxml').text
        ### XPath approach: works, but the extracted text all ends up on a single line
        # tree = etree.HTML(detail_url.text)
        # book = tree.xpath('//div[@class="tpc_content do_not_catch"]')[0].xpath('string(.)')
        book_path = Download_dir + name
        # pageNum is the global loop variable from __main__; pool.map() finishes before it changes
        if os.path.exists(book_path):
            print('<page', pageNum, '>', name, 'already exists, skipping', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        else:
            fp = open(book_path, 'w', encoding='utf-8')
            fp.write(book)
            fp.close()
            print('<page', pageNum, '>', name, 'downloaded', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            # Throttle: pause 5 minutes after each actual download
            time.sleep(300)
    except Exception as re_book:
        print(re_book, src, 'request failed')
if __name__ == "__main__":
    page_url = 'https://cl.fs55.xyz/thread0806.php?fid=20&page={Num}'
    server = 'https://cl.fs55.xyz/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    Max_page_Num = 20
    Download_dir = 'K:/Download/CL_Ebook/'
    if not os.path.exists(Download_dir):
        os.makedirs(Download_dir)
    for pageNum in range(1, Max_page_Num):
        book_list = get_book_list(pageNum)
        # Thread-pool mode: 5 worker threads download this page's novels concurrently
        pool = Pool(5)
        pool.map(download_ebook, book_list)
        pool.close()
        pool.join()
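The commented-out XPath variant above flattens the whole post onto one line because string(.) discards the breaks that the <br/> tags represent. Below is a minimal sketch of one way to keep the line breaks with lxml alone, assuming the same "tpc_content do_not_catch" div structure; the extract_text_lxml helper name is made up for illustration.

from lxml import etree

def extract_text_lxml(html_text):
    # Hypothetical helper: extract the post body with lxml only, keeping line breaks
    tree = etree.HTML(html_text)
    div = tree.xpath('//div[@class="tpc_content do_not_catch"]')[0]
    # itertext() yields the text nodes in document order; <br/> itself carries no text,
    # so joining the non-empty pieces with '\n' restores the paragraph breaks
    lines = [t.strip() for t in div.itertext() if t.strip()]
    return '\n'.join(lines)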
Coroutine approach (easy to get your IP blocked)
import requests
from lxml import etree
from bs4 import BeautifulSoup
import asyncio
import time
import aiohttp
import os
def get_book_list(pageNum):
    url = page_url.format(Num=pageNum)
    try:
        response = requests.get(url=url, headers=headers)
        response.encoding = 'gbk'
        tree = etree.HTML(response.text)
        title_list = []
        # On the first page the first 6 tr rows are board rules; the actual novels start from the 7th row
        if pageNum == 1:
            tr_list = tree.xpath('//div[@id="main"]/div[@class="t"][2]//tr[@class="tr3 t_one tac"][position()>6]')
        else:
            tr_list = tree.xpath('//div[@id="main"]/div[@class="t"][2]//tr[@class="tr3 t_one tac"]')
        for tr in tr_list:
            # Extract the category label and strip the surrounding whitespace
            f_name = tr.xpath('normalize-space(./td[@class="tal"]/text())')
            # Extract the novel title
            e_name = tr.xpath('./td[@class="tal"]/h3//text()')[0]
            # Combined result looks like: [真心话系列]我爱你中国.txt
            title_name = f_name + e_name + '.txt'
            # Grab the link to the novel's thread
            title_src = server + tr.xpath('./td[@class="tal"]//a/@href')[0]
            title_dic = {
                'name': title_name,
                'src': title_src
            }
            title_list.append(title_dic)
        return title_list
    except Exception as re_page:
        print(re_page, 'failed to fetch the listing page', url)
        # Return an empty list so the download coroutine gets an iterable, not None
        return []
async def download_ebook(pageNum, book_list):
    # pageNum is passed in explicitly: by the time the tasks actually run,
    # the global loop variable would already hold its final value
    for book_dic in book_list:
        src = book_dic['src']
        name = book_dic['name']
        # Fetch and parse the novel's thread page, then save the text
        async with aiohttp.ClientSession() as session:
            async with session.get(url=src, headers=headers) as response:
                detail_url = await response.text(encoding='gbk')
                # BeautifulSoup (bs4) approach
                soup = BeautifulSoup(detail_url, 'lxml')
                # Select the div whose class is "tpc_content do_not_catch"; in a CSS selector the space between class names becomes a dot
                div = soup.select('div.tpc_content.do_not_catch')[0]
                # Turn <br/> tags into newlines: cast the tag to a string first, then replace
                content = str(div).replace('<br/>', '\n')
                book = BeautifulSoup(content, 'lxml').text
                ### XPath approach: works, but the extracted text all ends up on a single line
                # tree = etree.HTML(detail_url)
                # book = tree.xpath('//div[@class="tpc_content do_not_catch"]')[0].xpath('string(.)')
                book_path = './download/CL_Ebook/' + name
                if os.path.exists(book_path):
                    print('<page', pageNum, '>', name, 'already exists, skipping', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
                else:
                    fp = open(book_path, 'w', encoding='utf-8')
                    fp.write(book)
                    fp.close()
                    print('<page', pageNum, '>', name, 'downloaded', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
                # Take it slow; haste makes waste
                await asyncio.sleep(20)
if __name__ == "__main__":
    page_url = 'https://cl.fs55.xyz/thread0806.php?fid=20&page={Num}'
    server = 'https://cl.fs55.xyz/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    Max_page_Num = 20
    if not os.path.exists('./download/CL_Ebook'):
        os.makedirs('./download/CL_Ebook')
    tasks = []
    for pageNum in range(1, Max_page_Num):
        book_list = get_book_list(pageNum)
        # Create a coroutine object for this page's downloads
        c = download_ebook(pageNum, book_list)
        # Wrap the coroutine object in a Task
        task = asyncio.ensure_future(c)
        # Add the Task to the task list
        tasks.append(task)
    # Create an event loop, register the tasks on it, and run it;
    # the task list has to be wrapped in asyncio.wait()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))
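On Python 3.7+ the explicit get_event_loop / run_until_complete boilerplate can be replaced with asyncio.run and asyncio.gather. A minimal sketch, assuming page_url, server, headers and Max_page_Num are moved to module level; the main coroutine name is just for illustration, and gather also re-raises task exceptions instead of leaving them inside the done set the way asyncio.wait does.

async def main():
    # One coroutine per listing page; the blocking get_book_list calls happen
    # while building the list, before the event loop starts running the downloads
    coros = [download_ebook(pageNum, get_book_list(pageNum))
             for pageNum in range(1, Max_page_Num)]
    await asyncio.gather(*coros)

if __name__ == "__main__":
    asyncio.run(main())

If the per-book asyncio.sleep(20) is ever shortened or removed, an asyncio.Semaphore around the session.get call is the usual way to cap the number of simultaneous requests and reduce the risk of being blocked.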