Python爬虫之全书网小说爬取
"""Download every chapter of a novel from xs4.cc and save each one as a text file."""

import os
import random
import re
import time
from urllib.parse import urljoin

import requests

# Index page that lists every chapter of the novel.
url = "http://www.xs4.cc/book/1/4460/"
# Directory the chapter text files are written to.
OUTPUT_DIR = r'C:\Users\Administrator\Desktop\345'
# Seconds to wait for the server before abandoning a request.
REQUEST_TIMEOUT = 10

# Captures (href, chapter title) for each chapter link on the index page.
_CHAPTER_LINK_RE = re.compile(r'<li><a href="(.*?)" title=".*?">(.*?)</a></li>')
# Captures the chapter body; re.S lets '.' span the newlines inside it.
_CHAPTER_BODY_RE = re.compile(r' id="content">(.*?)<!--<div style=', re.S)
# Spaces and <br /> tags that are stripped from the chapter body.
_MARKUP_RE = re.compile(r' |<br />')
# Characters Windows does not allow in a file name.
_ILLEGAL_FILENAME_CHARS_RE = re.compile(r'[\\/:*?"<>|]')


def _fetch_html(page_url):
    """Return the body of ``page_url`` decoded as UTF-8.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = 'utf-8'
    return response.text


def getNovelContent(index_url=None, output_dir=None):
    """Scrape all chapters linked from the index page into text files.

    Each chapter is written to ``<output_dir>/<chapter title>.txt`` as UTF-8.
    A chapter that cannot be downloaded, or whose page does not contain the
    expected content marker, is reported and skipped so that one bad page
    does not abort the whole run.

    Args:
        index_url: Chapter index page; defaults to the module-level ``url``.
        output_dir: Destination directory; defaults to ``OUTPUT_DIR``. It is
            created if it does not exist.

    Raises:
        requests.RequestException: if the index page itself cannot be fetched.
    """
    if index_url is None:
        index_url = url
    if output_dir is None:
        output_dir = OUTPUT_DIR
    os.makedirs(output_dir, exist_ok=True)

    index_html = _fetch_html(index_url)
    for href, novel_title in _CHAPTER_LINK_RE.findall(index_html):
        # Random pause before every chapter request to avoid hammering the site.
        time.sleep(random.randint(1, 3))

        # urljoin resolves both root-relative and page-relative hrefs.
        chapter_url = urljoin(index_url, href)
        try:
            chapter_html = _fetch_html(chapter_url)
        except requests.RequestException as exc:
            print('Skipping %s (%s): %s' % (novel_title, chapter_url, exc))
            continue

        body = _CHAPTER_BODY_RE.search(chapter_html)
        if body is None:
            print('Skipping %s (%s): content not found' % (novel_title, chapter_url))
            continue
        chapter_text = _MARKUP_RE.sub('', body.group(1)).strip()

        print('正在保存 %s' % novel_title)
        # Titles may contain characters that are invalid in file names.
        file_name = _ILLEGAL_FILENAME_CHARS_RE.sub('_', novel_title) + '.txt'
        file_path = os.path.join(output_dir, file_name)
        # Explicit UTF-8: the platform default codec may not cover the text.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(chapter_text)


if __name__ == '__main__':
    getNovelContent()
抓取结果如下: