Scraping a Qidian Novel with Python
```python
import re
import time
import urllib.request
from bs4 import BeautifulSoup

url = input("URL of the first chapter: ")

def gethtml(url):
    # Fetch the page source and parse it into a BeautifulSoup object
    page = urllib.request.urlopen(url)
    html = page.read().decode('utf-8')  # html is the page source as a string
    return BeautifulSoup(html, 'html.parser')

def getcontent(soup, load):
    # Locate the reader div that holds the chapter body
    content = soup.find_all("div", {"class": "read-content j_readContent"})
    # Capture the text of each <p> paragraph
    paragraphs = re.findall(r'<p>([\s\S]*?)</p>', str(content))
    # Strip any remaining HTML tags
    text = re.sub(r'</?\w+[^>]*>', '', '\n'.join(paragraphs))
    # After every full stop insert two newlines and a three-space indent,
    # which makes paragraphs readable in the output file; at this point
    # the chapter body is fully extracted
    text = text.replace('。', '。\n\n\u3000\u3000\u3000')
    # Grab the chapter title
    title = re.findall(r'<h3 class="j_chapterName">(.*?)</h3>', str(soup))
    book = "-" * 64 + title[0] + "-" * 60 + "\n\n\n" + text
    with open(load, 'a', encoding='utf-8') as f:
        f.write(book)

def nextcontent(soup):
    # Read the next-chapter link from the chapter-control bar
    content = soup.find_all("div", {"class": "chapter-control dib-wrap"})
    links = re.findall(r'<a data-eid="qd_R109" href="(.*?)" id="j_chapterNext">', str(content))
    if not links:
        # The last chapter's link carries a different data-eid,
        # so fall back to that (special-case) pattern
        links = re.findall(r'<a data-eid="qd_R118" href="(.*?)" id="j_chapterNext">', str(content))
    return "http:" + links[0]

def panduan(soup):
    # Return the regular next-chapter matches; an empty list means
    # the current page is the last chapter
    content = soup.find_all("div", {"class": "chapter-control dib-wrap"})
    return re.findall(r'<a data-eid="qd_R109" href="(.*?)" id="j_chapterNext">', str(content))

# -------------------------------------------------------------------------

soup = gethtml(url)
bookname = re.findall(r'<h1>(.*?)</h1>', str(soup))  # match the book title

load = "d:/88/%s.txt" % bookname[0]  # output path; the directory must already exist
i = 0
while True:
    soup = gethtml(url)
    getcontent(soup, load)
    url = nextcontent(soup)
    chapters_left = panduan(soup)  # match the next-chapter url on this page; [] means there is no next chapter
    i += 1
    print("Chapter %d downloaded" % i)
    if not chapters_left:
        break
    time.sleep(0.2)  # brief pause between requests
```
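One fragile spot worth flagging: `gethtml` issues a bare `urllib.request.urlopen` call with no headers, and many sites (Qidian plausibly included) reject requests that carry no browser-like `User-Agent`, returning an error page instead of the chapter. Below is a minimal hardened sketch of `gethtml`, assuming only the standard library; the User-Agent string, timeout, and retry count are illustrative choices, not values Qidian is known to require:

```python
import time
import urllib.error
import urllib.request
from bs4 import BeautifulSoup

# Illustrative browser-style header; any common User-Agent string works
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

def gethtml(url, retries=3):
    # Fetch a page with a User-Agent header, retrying on transient errors
    for attempt in range(retries):
        try:
            req = urllib.request.Request(url, headers=HEADERS)
            with urllib.request.urlopen(req, timeout=10) as page:
                html = page.read().decode('utf-8', errors='replace')
            return BeautifulSoup(html, 'html.parser')
        except urllib.error.URLError:
            if attempt == retries - 1:
                raise
            time.sleep(1.0 * (attempt + 1))  # simple linear backoff
```

Since this variant returns the same BeautifulSoup object, it can be swapped in without changing any other part of the script.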
In the next post, I'll build on this one to write a crawler that downloads every novel listed on a given page.
(This article is for technical reference only; please do not use it for illegal purposes.)