Downloading novels with a Python crawler

1.

from urllib import request
from bs4 import BeautifulSoup
import re
from docx import Document
from docx.oxml.ns import qn

def tackle(text):
    # locate the chapter title inside <div class="bookname">
    for i in range(len(text)):
        if text[i:i+22] == '<div class="bookname">':
            for j in range(i+39, len(text)):  # site-specific offset: 39 skips the markup before the title text
                if text[j] == '<':
                    name = text[i+39:j]
                    break
            print(name)
            break

    # keep only the text inside <div id="content">
    for i in range(len(text)):
        if text[i:i+18] == '<div id="content">':
            text = text[i+18:]
            break
    for i in range(len(text)):
        if text[i:i+6] == '</div>':
            text = text[:i]
            break

    # strip ellipses, quotation marks and line-break tags
    text = text.replace('…', '')
    text = text.replace('」', '')
    text = text.replace('「', '')
    text = text.replace('<br/><br/>', '')
    text = re.sub(r"\s+", "", text)  # regex: remove all whitespace
    save(name, text)
    
def save(name, text):
    doc = Document()
    # use SimSun (宋体) for the Normal style so Chinese text renders correctly
    doc.styles['Normal'].font.name = u'宋体'
    doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
    doc.add_paragraph(text)
    # save the .docx document
    doc.save(name + '.docx')

def download(url):  # download the web page
    # fetch the HTML
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = request.Request(url, headers=headers)
    html = request.urlopen(req).read()
    # save the HTML to disk
    file_name = 'text'
    with open(file_name.replace('/', '_') + ".html", "wb") as f:
        f.write(html)

if __name__ == "__main__":
    url = ''  # chapter URL goes here
    download(url)
    with open('text.html', 'rb') as f:
        Soup = str(BeautifulSoup(f.read(), 'html.parser'))
        tackle(Soup)
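
For comparison, the same extraction can be done with BeautifulSoup selectors instead of manual string scanning. A minimal sketch, assuming the page keeps the <div class="bookname"> and <div id="content"> markers used in tackle() above; it reuses the imports and the save() helper from this script:

def tackle_bs(html_bytes):
    soup = BeautifulSoup(html_bytes, 'html.parser')
    # assumption: the chapter title sits inside <div class="bookname">, as in tackle() above
    name = soup.find('div', class_='bookname').get_text(strip=True)
    # assumption: the chapter body sits inside <div id="content">
    body = soup.find('div', id='content').get_text()
    body = re.sub(r'\s+', '', body.replace('…', '').replace('「', '').replace('」', ''))
    save(name, body)  # save() is the .docx helper defined above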

2.

from urllib import request
from bs4 import BeautifulSoup
import re, codecs

def download(url, i=0):  # download the web page
    # fetch the HTML
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = request.Request(url, headers=headers)
    html = request.urlopen(req).read()
    # save the HTML to disk
    with open("{}.html".format(i), "wb") as f:
        f.write(html)
    with open('{}.html'.format(i), 'rb') as f:
        Soup = str(BeautifulSoup(f.read(), 'html.parser'))  # convert the parsed HTML to a string
    return Soup

def save(name, txt):  # once we have the title and body, save them as a .txt file
    with codecs.open("{}.txt".format(name), 'w', 'utf-8') as f:
        f.write(txt)

def tackle(url, i):
    Soup = download(url, i)  # fetch the whole page as one string
    # NOTE: the HTML markers in the original post were stripped when it was published;
    # the <h1> and <div id="content"> markers below are assumptions based on the first script.
    pattern_title = r'<h1>(.*)</h1>'  # '.' matches any character in between (except newlines)
    mp = re.search(pattern_title, Soup)  # regex search for the chapter title
    title = mp.group(1)  # group() returns the matched string; group(1) drops the surrounding tags
    start = Soup.find('<div id="content">')  # assumed start marker of the body
    end = Soup.find('</p>')                  # the original end marker began with </p>; the rest was stripped
    pattern_body = Soup[start + len('<div id="content">'):end]  # slice out the body between the two markers
    save(title, pattern_body)

if __name__ == "__main__":
    Soup = download('path')  # URL of the novel's table of contents
    # string matching: locate every chapter link on the contents page
    place = [substr.start() for substr in re.finditer('http://www.ncwxw.cc/0/298/8', Soup)]
    place = place[13:]  # preprocessing: drop the non-chapter matches near the top of the page
    website = []  # the URL of every chapter
    for chapter in place:
        website.append(Soup[chapter:chapter+36])
    # the above works when chapter URLs follow no pattern; if they do, the URLs can be
    # computed directly instead (see the sketch below)
    for i in range(1, 1979):
        tackle(website[i], i)
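
If the chapter URLs are regular, they can be generated directly instead of being scraped from the contents page. A minimal sketch, assuming (hypothetically) that the chapter pages are numbered sequentially under http://www.ncwxw.cc/0/298/ and end in .html; the actual numbering on the site may differ:

base = 'http://www.ncwxw.cc/0/298/'  # novel directory, taken from the matching string above
first_page = 8000001                 # hypothetical page number of chapter 1
for i in range(1, 1979):
    url = '{}{}.html'.format(base, first_page + i - 1)
    tackle(url, i)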
