Downloading a novel with a Python crawler
1.
from urllib import request
from bs4 import BeautifulSoup
import re
from docx import Document
from docx.oxml.ns import qn
def tackle(text):
    # Extract the chapter title from the <div class="bookname"> block.
    for i in range(len(text)):
        if text[i:i+22] == '<div class="bookname">':
            # Offset 39 skips the opening tag plus the markup before the title text (site-specific).
            for j in range(i+39, len(text)):
                if text[j] == '<':
                    name = text[i+39:j]
                    break
            print(name)
            break
    # Keep only what follows the opening <div id="content"> tag.
    for i in range(len(text)):
        if text[i:i+18] == '<div id="content">':
            text = text[i+18:]
            break
    # Cut everything from the first closing </div> onwards.
    for i in range(len(text)):
        if text[i:i+6] == '</div>':
            text = text[:i]
            break
    # Remove punctuation marks, line-break tags and whitespace.
    text = text.replace('…', '')
    text = text.replace('」', '')
    text = text.replace('「', '')
    text = text.replace('<br/><br/>', '')
    text = re.sub(r"\s+", "", text)  # regex: strip all whitespace
    save(name, text)
def save(name, text):
    doc = Document()
    # Use SimSun (宋体) for both the default and the East Asian font.
    doc.styles['Normal'].font.name = u'宋体'
    doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
    doc.add_paragraph(text)
    # Save as a .docx document.
    doc.save(name + '.docx')
def download(url):  # download one web page
    # Fetch the HTML.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = request.Request(url, headers=headers)
    html = request.urlopen(req).read()
    # Save the HTML.
    file_name = 'text'
    with open(file_name.replace('/', '_') + ".html", "wb") as f:
        f.write(html)
if __name__ == "__main__":
    url = ''  # URL of the chapter page to download
    download(url)
    with open('text.html', 'rb') as f:
        Soup = str(BeautifulSoup(f.read(), 'html.parser'))
    tackle(Soup)
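Script 1 processes a single chapter per run. Below is a minimal driver sketch for a whole book; the chapter_urls list and the per-chapter file names are assumptions (not from the original), and it reuses the tackle() defined above.

from urllib import request
from bs4 import BeautifulSoup

def crawl_all(chapter_urls):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    for i, url in enumerate(chapter_urls):
        # Fetch one chapter page.
        req = request.Request(url, headers=headers)
        html = request.urlopen(req).read()
        # Keep a per-chapter copy on disk, mirroring download() but with an index in the name.
        with open('text{}.html'.format(i), 'wb') as f:
            f.write(html)
        # Parse and save the chapter as .docx via the tackle()/save() pair above.
        tackle(str(BeautifulSoup(html, 'html.parser')))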
2.
from urllib import request
from bs4 import BeautifulSoup
import re,codecs
def download(url, i=0):  # download one web page
    # Fetch the HTML.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = request.Request(url, headers=headers)
    html = request.urlopen(req).read()
    # Save the HTML.
    with open("{}.html".format(i), "wb") as f:
        f.write(html)
    with open('{}.html'.format(i), 'rb') as f:
        Soup = str(BeautifulSoup(f.read(), 'html.parser'))  # convert the parsed HTML to a string
    return Soup
def save(name, txt):  # once the title and body are extracted, save them as .txt
    with codecs.open("{}.txt".format(name), 'w', 'utf-8') as f:
        f.write(txt)
def tackle(url, i):
    Soup = download(url, i)  # the whole page as one string
    # The original title/body markers were truncated in the source; the patterns
    # below are assumptions and must be adjusted to the target site's markup.
    mp = re.search(r'<h1>(.*?)</h1>', Soup)  # regex search for the chapter title
    title = mp.group(1)  # group(1) is the title text without the surrounding tags
    content_tag = '<div id="content">'  # assumed opening tag of the chapter body
    start = Soup.find(content_tag)
    end = Soup.find('</p>')
    pattern_body = Soup[start + len(content_tag):end]  # slice out the chapter body
    save(title, pattern_body)  # save() appends the .txt extension itself
if __name__ == "__main__":
    Soup = download('path')  # URL of the novel's table of contents
    # String matching: locate every chapter link on the index page.
    place = [substr.start() for substr in re.finditer('http://www.ncwxw.cc/0/298/8', Soup)]
    place = place[13:]  # preprocessing: skip the first 13 matches
    website = []  # URL of each chapter
    for chapter in place:
        website.append(Soup[chapter:chapter+36])  # each chapter URL is 36 characters long
    '''The above handles chapter URLs with no regular pattern; if the URLs are regular
    they can be generated directly instead (see the sketch after this listing).'''
    for i in range(1, 1979):  # chapter count is hard-coded for this novel
        tackle(website[i], i)
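As the docstring in the main block notes, when chapter URLs do follow a regular pattern they can be generated directly instead of scraped from the index page. A minimal sketch, where the base URL, the numbering scheme, and build_chapter_urls itself are all hypothetical and must be adapted to the real site:

def build_chapter_urls(base, first_id, count):
    # Generate URLs of the form <base>/<id>.html for consecutively numbered chapters.
    # Both the URL shape and the id range are assumptions, not taken from the original site.
    return ['{}/{}.html'.format(base, first_id + k) for k in range(count)]

# Usage example (hypothetical values):
# website = build_chapter_urls('http://example.com/novel', 1001, 1978)
# for i, url in enumerate(website, start=1):
#     tackle(url, i)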