Python爬虫之——全书网小说爬取

 1 import requests
 2 import re
 3 import time
 4 import random
# Index page of the novel to scrape; getNovelContent() reads this module-level value.
url = "http://www.xs4.cc/book/1/4460/"
# Pause for a random 1-3 seconds once, when the module is executed, before any request is sent.
time.sleep(random.randint(1,3))  # random delay
 7 def getNovelContent():
 8     response = requests.get(url)  #请求网址
 9     response.encoding = 'utf-8'  #编码格式
10     html = response.text  #获取网页源代码
11     url_1 = re.findall(r'<li><a href="(.*?)" title=".*?">(.*?)</a></li>',html) #正则表达式
12     for n in url_1:
13         w = 'http://www.xs4.cc' 
14         novel_n = str(w) + str(n[0])  #拼接网址
15         nocel_title = n[1]
16         chapt = requests.get(novel_n)
17         chapt.encoding = 'utf-8'
18         chapt_html = chapt.text
19         html_2 = re.findall(r' id="content">(.*?)<!--<div style=',chapt_html,re.S)
20         html_3 = re.sub(r'&nbsp;|<br />','',html_2[0]).strip()  #去掉特殊符号、空格和换行符
21         print('正在保存 %s' %nocel_title)
22         with open(r'C:\Users\Administrator\Desktop\\345\\{}.txt'.format(nocel_title),'w')as f:
23             f.write(html_3)  #写入文件
# Run the scraper as soon as the module is executed.
getNovelContent()

抓取结果如下:

 

posted @ 2018-06-12 08:22  叫我大表哥  阅读(315)  评论(0)  编辑  收藏  举报