python3抓取晋江文学城免费章节小说
看了别人写的抓取晋江小说的爬虫后,自己动手写了一版简单的。
记录下。
【执行脚本时只需输入想下载的文章ID即可】
# -*- coding:utf8 -*-
# Crawler for JinJiang Literature City (jjwxc): downloads a novel's free
# (non-VIP) chapters into a single text file. Run the script and enter the
# novel's numeric id when prompted.
import requests
import lxml.html


def jj_Download(chapters_url, chapters_title, novel_name):
    """Fetch every free chapter and append it to the file *novel_name*.

    chapters_url   -- list of chapter page URLs
    chapters_title -- list of chapter titles, parallel to chapters_url
    novel_name     -- output .txt filename (opened in append mode)
    """
    # zip pairs each URL with its title directly. The original used
    # itertools.product plus a manual counter and an early return, which
    # only worked by accident (it still indexed chapters_url[i-1]).
    for i, (chapter_url, title) in enumerate(zip(chapters_url, chapters_title), start=1):
        print(title + " 下载ing......")
        html = requests.get(chapter_url).content
        selector = lxml.html.fromstring(html)
        # Chapter body text. '//div' — the original '///div' was an XPath typo.
        content_text = selector.xpath('//div[@class="noveltext"]/text()')
        name = "第" + str(i) + "章 " + title  # 章节 (chapter heading)
        content = '\n' + name + '\n' + '\n'.join(content_text)
        # Append mode so successive chapters accumulate in one file; the
        # with-statement closes the handle (the original's f.close() was redundant).
        with open(novel_name, 'a', encoding="utf-8") as f:
            f.write(content)
            f.write('\n')


def main():
    """Prompt for a novel id, scrape its index page, and download all free chapters."""
    # 晋江小说ID — renamed from `id` to avoid shadowing the builtin.
    novel_id = input("请输入小说novelid:")
    # The original referenced an undefined `url`; build the index-page URL
    # from the novel id so the script actually runs.
    url = "http://www.jjwxc.net/onebook.php?novelid=" + novel_id
    res = requests.get(url).content
    tree = lxml.html.fromstring(res)
    # 获取非vip章节链接 (free-chapter links)
    chapters_url = tree.xpath('//tr[@itemprop="chapter"]//a/@href')
    # 获取全部章节标题 (all chapter titles)
    chapters_title = tree.xpath('//tr[@itemprop="chapter"]//a/text()')
    # 获取小说名 (novel title)
    novel = tree.xpath('//span[@itemprop="articleSection"]/text()')[0]
    # 获取小说作者 (author)
    author = tree.xpath('//span[@itemprop="author"]/text()')[0]
    novel_name = novel + " 作者:" + author + ".txt"
    jj_Download(chapters_url, chapters_title, novel_name)


if __name__ == "__main__":
    main()
|