Python crawler for downloading novels

1. Save each chapter as a .docx file

from urllib.request import urlopen
from urllib import request
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
from selenium import webdriver
import socket
import time,re
from docx import Document
from docx.oxml.ns import qn

def tackle(text):
    #print(text)
    # Extract the chapter title: find the bookname div, then read up to the next '<'
    for i in range(len(text)):
        if(text[i:i+22] == '<div class="bookname">'):
            for j in range(i+39,len(text)):  # i+39 skips the markup between the div and the title text
                if (text[j] == '<'):
                    name = (text[i+39:j])
                    break
            print(name)
            break

    # Keep only the text between <div id="content"> and the next </div>
    for i in range(len(text)):
        if(text[i:i+18] == '<div id="content">'):
            text = text[i+18:]
            break
    for i in range(len(text)):
        if(text[i:i+6] == '</div>'):
            text = text[:i]
            break

    # Strip punctuation, <br/> tags, and whitespace
    text = text.replace('…','')
    text = text.replace('」','')
    text = text.replace('「','')
    text = text.replace('<br/><br/>','')
    text = re.sub(r"\s+", "", text)  # regex: remove all whitespace
    save(name,text)
    
def save(name,text):
    doc = Document()
    doc.styles['Normal'].font.name = u'宋体'
    # also set the East Asian font, otherwise Chinese text ignores the font name above
    doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
    doc.add_paragraph(text)
    # save the .docx document
    doc.save(name + '.docx')

def download(url):  # download the web page
    # fetch the HTML
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = request.Request(url, headers=headers)
    html = request.urlopen(req).read()
    # save the HTML
    file_name = 'text'
    with open(file_name.replace('/','_')+".html","wb") as f:
        f.write(html)
if __name__ == "__main__":
    url = ''  # fill in the chapter URL to download
    download(url)
    with open('text.html', 'rb') as f:  # download() saved the page as text.html
        Soup = str(BeautifulSoup(f.read(), 'html.parser'))
        tackle(Soup)
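
Script 1 only handles a single chapter page. A minimal sketch of driving the same download/tackle pair over several chapters (the URLs below are placeholders, not real addresses):

    # Sketch only: reuses download() and tackle() from script 1; the chapter URLs are placeholders.
    chapter_urls = [
        'http://example.com/novel/chapter1.html',
        'http://example.com/novel/chapter2.html',
    ]
    for url in chapter_urls:
        download(url)  # writes text.html next to the script
        with open('text.html', 'rb') as f:
            page = str(BeautifulSoup(f.read(), 'html.parser'))
        tackle(page)   # extracts the chapter and saves <title>.docx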

2. Save each chapter as a .txt file

from urllib import request
from bs4 import BeautifulSoup
import re,codecs

def download(url,i=0):  # download the web page
    # fetch the HTML
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = request.Request(url, headers=headers)
    html = request.urlopen(req).read()
    # save the HTML
    with open("{}.html".format(i),"wb") as f:
        f.write(html)
    with open('{}.html'.format(i), 'rb') as f:
        Soup = str(BeautifulSoup(f.read(), 'html.parser'))  # convert the parsed HTML to a string
    return Soup

def save(name,txt):  # once the title and body are extracted, save them as a .txt file
    f = codecs.open("{}.txt".format(name),'w','utf-8')
    f.write(txt)
    f.close()

def tackle(url,i):
    Soup = download(url,i)  # fetch the whole page as one string
    # The tag markers below are assumptions based on the chapter-page layout used in
    # script 1; adjust them to the target site's actual HTML.
    pattern_title = '<h1>.*</h1>'  # regex; . matches any character in between (except newlines)
    mp = re.search(pattern_title,Soup)  # regex search
    title = mp.group()[4:-5]  # group() returns the matched string; the slice strips the <h1> and </h1> tags
    start = Soup.find('<div id="content">')  # marks the start of the chapter body
    end = Soup.find('</div>',start)  # marks the end of the chapter body
    pattern_body = Soup[start+len('<div id="content">'):end]  # slice out the body text
    save(title,pattern_body)  # save() appends the .txt extension itself

if __name__ == "__main__":
    Soup = download('path')  # URL of the novel's table of contents
    place = [substr.start() for substr in re.finditer('http://www.ncwxw.cc/0/298/8',Soup)]  # string matching: locate every chapter URL in the page
    place = place[13:]  # preprocessing: skip the first 13 matches
    website = []  # the URL of every chapter
    for chapter in place:
        website.append(Soup[chapter:chapter+36])
    '''The above handles chapter URLs that follow no pattern; if they are numbered
    regularly, the addresses can be generated directly (see the sketch below).'''
    for i in range(1,1979):
        tackle(website[i],i)
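
As the note in the script says, when the chapter pages are numbered consecutively the URL list can be generated instead of scraped. A minimal sketch, assuming a hypothetical numbering scheme under the same base path (the real site's numbering may differ):

    # Sketch only: the base path comes from the search string above, but the
    # chapter numbering (start id 8, consecutive ids) is an assumption.
    base = 'http://www.ncwxw.cc/0/298/'
    first_id = 8                      # hypothetical id of chapter 1
    for i in range(1, 1979):          # 1978 chapters, matching the loop above
        url = '{}{}.html'.format(base, first_id + i - 1)
        tackle(url, i)                # tackle() from script 2 downloads and saves the chapter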
