python爬虫下载小说
1.
from urllib.request import urlopen
from urllib import request
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
from selenium import webdriver
import socket
import time,re
from docx import Document
from docx.oxml.ns import qn
def tackle(text):
#print(text)
for i in range(len(text)):
if(text[i:i+22] == '<div class="bookname">'):
for j in range(i+39,len(text)):
if (text[j] == '<'):
name = (text[i+39:j])
break
print(name)
break
for i in range(len(text)):
if(text[i:i+18] == '<div id="content">'):
text = text[i+18:]
break
for i in range(len(text)):
if(text[i:i+6] == '</div>'):
text = text[:i]
break
text = text.replace('…','')
text = text.replace('」','')
text = text.replace('「','')
text = text.replace('<br/><br/>','')
text = re.sub(r"\s+", "", text)#正则匹配去掉空格
save(name,text)
def save(name,text):
doc = Document()
doc.styles['Normal'].font.name = u'宋体'
doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
doc.add_paragraph(text)
#保存.docx文档
doc.save(name + '.docx')
def download(url):#下载网页
#获取HTML
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
req = request.Request(url, headers=headers)
html = request.urlopen(req).read()
#保存HTML
file_name = 'text'
with open (file_name.replace('/','_')+".html","wb") as f:
f.write(html)
if __name__ == "__main__":
#url = ''
download(url)
with open('text'+str(i)+'.html', 'rb') as f:
Soup = str(BeautifulSoup(f.read(), 'html.parser'))
tackle(Soup)
2.
from urllib import request
from bs4 import BeautifulSoup
import re,codecs
def download(url,i=0):#下载网页
#获取HTML
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
req = request.Request(url, headers=headers)
html = request.urlopen(req).read()
#保存HTML
file_name = i
with open ("{}.html".format(i),"wb") as f:
f.write(html)
with open('{}.html'.format(i), 'rb') as f:
Soup = str(BeautifulSoup(f.read(), 'html.parser')) #把html转化为string
return Soup
def save(name,txt): # 得到标题和正文之后,保存为txt
f = codecs.open("{}.txt".format(name),'w','utf-8')
f.write(txt)
def tackle(url,i):
Soup = download(url,i) # 获取字符串,字符串内容为整个网页
pattern_title = '
mp = re.search(pattern_title,Soup) #正则搜索
title = mp.group()[12:-6] # group方法返回搜索结果的字符串形式,并把字符串中
start = Soup.find('
end = Soup.find('</p>
pattern_body = Soup[start+34:end] #标记正文位置
save(title+'.txt',pattern_body)
if name == "main":
Soup = download('path') # 小说目录网址
place = [substr.start() for substr in re.finditer('http://www.ncwxw.cc/0/298/8',Soup)] # 字符串匹配,确定所有章节的网址位置
place = place[13:] #预处理
website = [] #存储每一章的网址
for chapter in place:
website.append(Soup[chapter:chapter+36])
'''以上适用于每一章节网址无规律的情况,若有规律则可直接变址寻址'''
for i in range(1,1979):
tackle(website[i],i)
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 无需6万激活码!GitHub神秘组织3小时极速复刻Manus,手把手教你使用OpenManus搭建本
· C#/.NET/.NET Core优秀项目和框架2025年2月简报
· Manus爆火,是硬核还是营销?
· 终于写完轮子一部分:tcp代理 了,记录一下
· 【杭电多校比赛记录】2025“钉耙编程”中国大学生算法设计春季联赛(1)