Scraping novel comments from Jinjiang Literature City (晋江文学城) with Python
A while back I saw an author who wanted to back up their comments, so I had a look at the Jinjiang pages: there are no anti-scraping measures, and the comment content is written straight into the HTML, so plain regular expressions are enough.
Some chapters have too many comments to lump together, so each CSV file holds one chapter's comments (not that this is particularly useful, admittedly).
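You can check this yourself before running the full script. A minimal sketch, assuming the comment-page URL shape used below ("№" is the floor marker printed next to every comment):

import requests

resp = requests.get(
    "https://www.jjwxc.net/comment.php?novelid=2697774&chapterid=1&page=1",
    headers={"User-Agent": "Mozilla/5.0"},
)
resp.encoding = "gb18030"  # the site serves GB-encoded pages
print("№" in resp.text)    # True: the comments are plain HTML, no JS rendering needed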
"""
根据晋江小说的novelid和需要爬取的起始和终止章节,保存这些章节的评论信息
评论信息包括:评论id、评论楼层、评论用户名、发表时间、章节id、评论内容
访问晋江评论库不需要购买章节,可以随机找一本
以该链接为例, http://www.jjwxc.net/onebook.php?novelid=2697774
novelid = 2697774
获取 1-88章评论
"""
import requests
import re
import csv
# http://www.jjwxc.net/onebook.php?novelid=2697774
novelid = int(input("Enter novelid: "))              # e.g. 2697774
chapter_start = int(input("Enter start chapter: "))  # e.g. 1
chapter_end = int(input("Enter end chapter: "))      # e.g. 88
# A plain browser User-Agent is all the site needs; there are no other
# anti-scraping checks to work around.
headers_dict = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}
# Fetch a page of HTML; Jinjiang serves GB-encoded pages.
def getHTMLpage(url):
    page = requests.get(url, headers=headers_dict)
    page.close()  # the body is already downloaded; release the connection
    page.encoding = "gb18030"  # decode as GB18030 instead of the guessed charset
    return page
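# Example (hypothetical call; the URL shape matches the comment pages
# fetched in the main loop below):
#   page = getHTMLpage("https://www.jjwxc.net/comment.php?novelid=2697774&chapterid=1&page=1")
#   page.text now holds the decoded HTML of that comment page.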
# Get the number of comment pages for one chapter from the summary line.
def get_summary(page):
    pattern1 = re.compile(r"共有<span class='redtext'>(?P<comment_count>\d+)</span>条评论,"
                          r"分<span class='redtext'>(?P<page_count>\d+)</span>页", re.S)
    result = pattern1.search(page.text)
    return int(result.group("page_count"))
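# For reference, the summary line pattern1 targets looks like this in the
# page source (a hand-written sample consistent with the pattern, not
# copied from a live page):
#   共有<span class='redtext'>1234</span>条评论,分<span class='redtext'>62</span>页
# from which get_summary returns 62.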
# Extract the fields of every comment on one page.
def get_re_result(page):
    pattern2 = re.compile(r'data-commentid="(?P<comment_id>.*?)"'
                          r'.*?<span class="coltext">.*?№(?P<comment_floor>\d+).*?网友'
                          r'.*?target="_blank">(?P<user_name>.*?)</a></span>'
                          r'.*?发表时间:(?P<comment_time>.*?) '
                          r'.*?所评章节:.*?data-chapterid="(?P<chapter_id>.*?)">'
                          r'.*?mormalcomment_.*?>(?P<content>.*?)</span>', re.S)
    result = pattern2.finditer(page.text)
    return result
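# Each match covers one comment block in the HTML; the named groups map to:
#   comment_id    <- the data-commentid attribute
#   comment_floor <- the number after the № floor marker
#   user_name     <- the link text of the commenter's profile link
#   comment_time  <- the text after 发表时间: (post time)
#   chapter_id    <- the data-chapterid attribute
#   content       <- the comment body span ("mormalcomment_" is apparently
#                    the site's own spelling, so it is matched verbatim)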
chapter_id = chapter_start
while chapter_id <= chapter_end:
    page_id = 1
    url_chap = f"https://www.jjwxc.net/comment.php?novelid={novelid}&chapterid={chapter_id}&page={page_id}"
    page1 = getHTMLpage(url_chap)
    # Skip locked chapters: their pages have no summary line, so
    # pattern1.search() returns None and .group() raises AttributeError.
    try:
        page_count = get_summary(page1)
    except AttributeError:
        chapter_id += 1
        continue
    with open(f'{novelid}_chapter{chapter_id:03}_comments.csv', 'w', encoding="utf-8", newline='') as f:
        fieldnames = ["comment_id", "comment_floor", "user_name", "comment_time", "chapter_id", "content"]
        csv_writer = csv.DictWriter(f, fieldnames=fieldnames)
        csv_writer.writeheader()
        # Walk every comment page of this chapter, one CSV row per comment.
        while page_id <= page_count:
            page_url = f"https://www.jjwxc.net/comment.php?novelid={novelid}&chapterid={chapter_id}&page={page_id}"
            page2 = getHTMLpage(page_url)
            comments = get_re_result(page2)
            for comment in comments:
                dic = comment.groupdict()
                csv_writer.writerow(dic)
            page_id += 1
    print(f'chapter {chapter_id:03} has been saved.')
    chapter_id += 1
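With the docstring's example inputs (novelid 2697774, chapters 1 through 88), the run should produce files named 2697774_chapter001_comments.csv up to 2697774_chapter088_comments.csv, minus any locked chapters that get skipped.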