Web Scraping in Practice (6): Scraping Biquge
I. Website Analysis

1. Page Analysis

Packet-capture analysis shows that the site is static: everything we need is embedded in the page source, so the information can be extracted directly from the HTML.
On the table-of-contents page, all the chapter links live inside a single <dl> tag.

On the search results page, the matches are rendered as a list, so the list's length tells us whether anything was found. When there are no results the list is not rendered at all, leaving an empty <div class="novelslistss"></div>, and its presence tells us whether the search succeeded.
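A minimal sketch of that check, assuming the empty container appears verbatim in the source when nothing matches (the exact markup is an assumption based on the analysis above):

import re

def search_succeeded(page_html: str) -> bool:
    # An empty results container means the search found nothing
    return re.search(r'<div class="novelslistss">\s*</div>', page_html) is None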
2. Source Analysis

Inspecting the source confirms this: the table of contents is wrapped in <dl></dl>, and each chapter's text is wrapped in a <div id="content"></div>.
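To make the two anchors concrete, here is an invented fragment run through the same kind of patterns the scraper uses later:

import re

page = ('<dl><dd><a href="1.html">Chapter 1</a></dd></dl>'
        '<div id="content" name="content">body text</div>')

print(re.findall(r'<a href="(.*?)">(.*?)</a>', page))  # [('1.html', 'Chapter 1')]
print(re.search(r'<div id="content" name="content">(.*?)</div>', page, re.S).group(1))  # body text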
3. URL Analysis

The search endpoint takes the keyword as a GET parameter, and the keyword must be GBK-encoded before percent-encoding:

from urllib import parse

# Search endpoint; %s is replaced with the GBK percent-encoded keyword
url = "https://www.bqg.org/modules/article/search.php?searchkey=%s"

print(parse.quote("绝世武神".encode("gbk")))  # percent-encodes the GBK bytes of the title

name = input("Enter a book title: ")
print(url % parse.quote(name.encode("gbk")))  # the final search URL
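Since GBK rather than UTF-8 is used here, a quick round-trip check confirms the encoding and decoding agree:

from urllib import parse

encoded = parse.quote("绝世武神".encode("gbk"))
print(parse.unquote(encoded, encoding="gbk"))  # -> 绝世武神, confirming the round trip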
II. Writing the Code

1. Fetching the Table of Contents
__author__ = "A.L.Kun"
__file__ = "biquge.py"
__time__ = "2022/7/6 14:03"

import requests, re
from urllib import parse
from fake_useragent import UserAgent

url = "https://www.bqg.org/53_53985/"
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate',  # omit 'br': requests cannot decode Brotli without the brotli package
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Cookie': 'jieqiVisitId=article_articleviews%3D53982; jieqiVisitTime=jieqiArticlesearchTime%3D1657088500; clickbids=53982',
    'Host': 'www.bqg.org',
    'Pragma': 'no-cache',
    'Referer': 'https://www.bqg.org/',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': UserAgent().random
}

def get_menu(url):
    """Fetch the table of contents and return (chapter URLs, chapter titles)."""
    headers.update({'User-Agent': UserAgent().random})  # rotate the UA on every request
    resp = requests.get(url, headers=headers)  # headers must be a keyword argument; passed positionally it becomes params
    resp.encoding = resp.apparent_encoding  # let requests detect the site's GBK encoding
    # The chapter list sits between the "正文" (main text) marker and the closing </dl>;
    # re.S lets the match span multiple lines
    temp = re.search("正文.*?</dl>", resp.text, re.S).group()
    lis = re.findall(r'<a href="(.*?)">(.*?)</a>', temp)
    url_de = []
    chapters = []
    for i in lis:
        url_de.append(f"{url}{i[0]}")  # hrefs are relative, so prepend the book's base URL
        chapters.append(i[1])

    return url_de, chapters

get_menu(url)
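The trailing get_menu(url) call above discards its result; for a quick smoke test you can capture and print the first few pairs instead (the exact output depends on the live page):

urls, chapters = get_menu(url)
for u, title in list(zip(urls, chapters))[:3]:
    print(title, "->", u)  # chapter title followed by its absolute URL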
2. Fetching Chapter Content
def get_content(url):
    """Download one chapter page and return its cleaned text."""
    headers.update({'User-Agent': UserAgent().random})
    resp = requests.get(url, headers=headers)
    resp.encoding = resp.apparent_encoding
    # re.S lets '.' match newlines so the pattern can span the whole chapter body
    content = re.search('<div id="content" name="content">(?P<content>.*?)</div>', resp.text, re.S).group("content")
    content = re.sub(r'<br.*?>', "\n", content)  # <br> tags become newlines
    content = re.sub(r'&nbsp;', " ", content)    # entities become plain spaces
    return content


get_content("https://www.bqg.org/53_53985/38645272.html")
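To sanity-check the cleaning steps without hitting the network, run the two re.sub calls on an invented fragment:

import re

sample = "&nbsp;&nbsp;First line<br />&nbsp;&nbsp;Second line"  # invented chapter fragment
text = re.sub(r'<br.*?>', "\n", sample)  # <br> tags become newlines
text = re.sub(r'&nbsp;', " ", text)      # entities become plain spaces
print(text)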
3. Downloading the Data
def save_data(url_):
    with open("./风起龙城.txt", "w", encoding="utf-8") as f:
        urls, menus = get_menu(url_)
        for index, url in enumerate(urls):
            f.write(f"==============={menus[index]}===============\n")  # chapter separator
            print(f"Downloading: {menus[index]}")
            content = get_content(url)
            f.write(content + "\n")
            print(f"{menus[index]} downloaded")

save_data("https://www.bqg.org/53_53985/")
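Downloading every chapter back-to-back may trip the site's rate limiting. Here is a hedged variant that pauses between requests; the function name save_data_throttled and the one-second default are my own choices, not something the site documents:

from time import sleep

def save_data_throttled(url_, delay=1.0):
    """Same as save_data, but sleeps between chapter requests to stay polite."""
    urls, menus = get_menu(url_)
    with open("./风起龙城.txt", "w", encoding="utf-8") as f:
        for index, url in enumerate(urls):
            f.write(f"==============={menus[index]}===============\n")
            f.write(get_content(url) + "\n")
            sleep(delay)  # arbitrary pause so the crawl does not hammer the server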
4. Search Feature
import sys
from time import time
from lxml import etree
from prettytable import PrettyTable

url = "https://www.bqg.org/modules/article/search.php?searchkey=%s"


def have_content(text):
    """Parse the search results page, show a selection table, and return the chosen book's URL."""
    html = etree.HTML(text)
    li_s = html.xpath("//*[@id='main']/div[1]/li")
    table = PrettyTable(['No.', 'Type', 'Title', 'Author'])
    lis_ = []
    index = 0
    for index, li in enumerate(li_s):
        type_ = li.xpath("./span[@class='s1']/text()")[0]
        name = li.xpath("./span[@class='s2']/a/text()")[0]
        url = li.xpath("./span[@class='s2']/a/@href")[0]
        lis_.append(url)
        author = li.xpath("./span[@class='s4']/text()")[0]
        table.add_row([index + 1, type_, name, author])

    print(table)
    i = int(input("Enter the number of the book to download: ")) - 1
    if 0 <= i <= index:  # 0 <= i, not 0 < i, so the first entry is selectable
        return lis_[i]
    print("Please enter a number from the table!")
    sys.exit(0)


def search(n):
    arg = parse.quote(n.encode("gbk"))  # the site expects a GBK-encoded keyword
    headers.update({
        'User-Agent': UserAgent().random,
        # refresh the search timestamp ('%3D' is the URL-encoded '=') so the
        # site's 30-second search interval check passes
        "Cookie": f"jieqiVisitTime=jieqiArticlesearchTime%3D{int(time())}"
    })
    url_ = url % arg
    resp = requests.get(url_, headers=headers)
    resp.encoding = resp.apparent_encoding
    tet = resp.text
    # these patterns match the site's own Chinese error messages, so they stay in Chinese
    if re.search("错误原因:对不起,没有搜索.*?文章!", tet):
        print("Sorry, no matching books were found!")
        return
    if re.search("错误原因:对不起,两次搜索.*?秒", tet):
        print("Sorry, searches must be at least 30 seconds apart")
        return

    return have_content(tet)


name = input("Enter the novel to download: ")
search(name)
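One caveat: the two error-message regexes match the site's own Chinese page text, so they must stay in Chinese even though the script's prompts are rendered in English here. The refreshed jieqiVisitTime cookie works around the 30-second search interval because the site evidently trusts the timestamp stored client-side.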
III. Complete Code
__author__ = "A.L.Kun"
__file__ = "biquge.py"
__time__ = "2022/7/6 14:03"

import requests, re, sys
from urllib import parse
from fake_useragent import UserAgent
from time import time
from lxml import etree
from prettytable import PrettyTable

url = "https://www.bqg.org/modules/article/search.php?searchkey=%s"
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate',  # omit 'br': requests cannot decode Brotli without the brotli package
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Cookie': 'jieqiVisitId=article_articleviews%3D53982; jieqiVisitTime=jieqiArticlesearchTime%3D1657088500; clickbids=53982',
    'Host': 'www.bqg.org',
    'Pragma': 'no-cache',
    'Referer': 'https://www.bqg.org/',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': UserAgent().random
}


def have_content(text):
    """Parse the search results page, show a selection table, and return the chosen book's URL."""
    html = etree.HTML(text)
    li_s = html.xpath("//*[@id='main']/div[1]/li")
    table = PrettyTable(['No.', 'Type', 'Title', 'Author'])
    lis_ = []
    index = 0
    for index, li in enumerate(li_s):
        type_ = li.xpath("./span[@class='s1']/text()")[0]
        name = li.xpath("./span[@class='s2']/a/text()")[0]
        url = li.xpath("./span[@class='s2']/a/@href")[0]
        lis_.append(url)
        author = li.xpath("./span[@class='s4']/text()")[0]
        table.add_row([index + 1, type_, name, author])

    print(table)
    i = int(input("Enter the number of the book to download: ")) - 1
    if 0 <= i <= index:  # 0 <= i, not 0 < i, so the first entry is selectable
        return lis_[i]
    print("Please enter a number from the table!")
    sys.exit(0)


def search(n):
    """Search the site for a title and return the chosen book's URL, or None on failure."""
    arg = parse.quote(n.encode("gbk"))  # the site expects a GBK-encoded keyword
    headers.update({
        'User-Agent': UserAgent().random,
        # refresh the search timestamp ('%3D' is the URL-encoded '=') so the
        # site's 30-second search interval check passes
        "Cookie": f"jieqiVisitTime=jieqiArticlesearchTime%3D{int(time())}"
    })
    url_ = url % arg
    resp = requests.get(url_, headers=headers)
    resp.encoding = resp.apparent_encoding
    tet = resp.text
    # these patterns match the site's own Chinese error messages, so they stay in Chinese
    if re.search("错误原因:对不起,没有搜索.*?文章!", tet):
        print("Sorry, no matching books were found!")
        return
    if re.search("错误原因:对不起,两次搜索.*?秒", tet):
        print("Sorry, searches must be at least 30 seconds apart")
        return

    return have_content(tet)


def get_menu(url):
    """Fetch the table of contents and return (chapter URLs, chapter titles)."""
    headers.update({'User-Agent': UserAgent().random})
    resp = requests.get(url, headers=headers)  # headers must be passed as a keyword argument
    resp.encoding = resp.apparent_encoding
    # the chapter list sits between the "正文" (main text) marker and the closing </dl>
    temp = re.search("正文.*?</dl>", resp.text, re.S).group()
    lis = re.findall(r'<a href="(.*?)">(.*?)</a>', temp)
    url_de = []
    chapters = []
    for i in lis:
        url_de.append(f"{url}{i[0]}")  # hrefs are relative, so prepend the book's base URL
        chapters.append(i[1])

    return url_de, chapters


def get_content(url):
    """Download one chapter page and return its cleaned text."""
    headers.update({'User-Agent': UserAgent().random})
    resp = requests.get(url, headers=headers)
    resp.encoding = resp.apparent_encoding
    content = re.search('<div id="content" name="content">(?P<content>.*?)</div>', resp.text, re.S).group("content")
    content = re.sub(r'<br.*?>', "\n", content)  # <br> tags become newlines
    content = re.sub(r'&nbsp;', " ", content)    # entities become plain spaces
    return content


def save_data(url_, name):
    with open(f"./{name}.txt", "w", encoding="utf-8") as f:
        urls, menus = get_menu(url_)
        for index, url in enumerate(urls):
            f.write(f"==============={menus[index]}===============\n")
            print(f"Downloading: {menus[index]}")
            content = get_content(url)
            f.write(content + "\n")
            print(f"{menus[index]} downloaded")
            print("--------------------------")


def main():
    name = input("Enter the name of the novel to download: ")
    book_url = search(name)
    if book_url:  # search() returns None when the site reports an error
        save_data(book_url, name)
        print("Download complete")


if __name__ == "__main__":
    main()