Python: scraping all the novels on one qidian page
This post is purely a technical exercise; please do not use it for anything illegal.
```python
import re
import time
import urllib.request

from bs4 import BeautifulSoup

# URL of the listing page to scrape (the "first page" of a qidian category).
pageurl = input("First page URL: ")


def gethtml(url):
    """Fetch a page and return it as parsed BeautifulSoup."""
    page = urllib.request.urlopen(url)
    html = page.read().decode('utf-8')          # html is a str here
    soup = BeautifulSoup(html, 'html.parser')
    return soup


def getbookurl(soup):
    """Collect the first-chapter URL of every book listed on the page."""
    firsturl2 = []
    try:
        bookurl = soup.find_all("h4")
        bookurl1 = re.findall(
            r'<h4><a data-bid=".*?" data-eid=".*?" href="(.*?)" target="_blank"',
            str(bookurl))
        for i in range(len(bookurl1)):
            bookurl = "http:" + bookurl1[i]
            soup1 = gethtml(bookurl)            # open the book's detail page
            time.sleep(0.2)
            firsturl = soup1.find_all("a", {"class": "red-btn J-getJumpUrl "})
            firsturl1 = re.findall(
                r'data-firstchapterjumpurl=".*?" href="(.*?)" id="readBtn">',
                str(firsturl))
            # Due to qidian restrictions, some books expose an empty link
            # instead of a first-chapter URL; skip those.
            if firsturl1[0] == '':
                continue
            firsturl2.append(firsturl1[0])
        print(firsturl2)
        return firsturl2
    except Exception:
        return firsturl2


def getcontent(soup, load):
    """Extract one chapter's text and append it to the file at `load`."""
    content = soup.find_all("div", {"class": "read-content j_readContent"})
    content1 = re.compile(r'<p>([\s\S]*?)</p>')
    content2 = content1.findall(str(content))
    content3 = re.sub(r'</?\w+[^>]*>', '', content2[0])    # strip leftover tags
    content4 = content3.replace('。', '。\n\n\0\0\0')        # re-paragraph the text

    contentname = re.compile(r'<h3 class="j_chapterName">(.*?)</h3>')
    contentname1 = contentname.findall(str(soup))           # chapter title

    book = "-" * 64 + contentname1[0] + "-" * 60 + "\n\n\n" + content4
    # gb18030 is a superset of GBK, so it can encode characters GBK cannot.
    with open(load, 'a', encoding='gb18030') as f:
        f.write(book)


def nextcontent(soup):
    """Return the next chapter's URL, or None if there is no next chapter."""
    content = soup.find_all("div", {"class": "chapter-control dib-wrap"})
    # The "next chapter" link carries different data-eid values on different pages.
    for eid in ("qd_R109", "qd_R118"):
        found = re.findall(
            r'<a data-eid="%s" href="(.*?)" id="j_chapterNext">' % eid,
            str(content))
        if found:
            return "http:" + found[0]
    return None


def panduan(soup):
    """Check whether the free "next chapter" link (data-eid qd_R109) is present."""
    content = soup.find_all("div", {"class": "chapter-control dib-wrap"})
    step = re.compile(r'<a data-eid="qd_R109" href="(.*?)" id="j_chapterNext">')
    return step.findall(str(content))


# -------------------------------------------------------------------------

soup2 = gethtml(pageurl)
firsturl2 = getbookurl(soup2)

for j in range(len(firsturl2)):
    url = "http:" + firsturl2[j]
    soup1 = gethtml(url)
    bookname = re.findall(r'<h1>(.*?)</h1>', str(soup1))
    load = "d:/88/%s.txt" % bookname[0]
    i = 0
    while True:
        soup = gethtml(url)
        getcontent(soup, load)
        url = nextcontent(soup)
        content1 = panduan(soup)
        i += 1
        print("Chapter %d downloaded" % i)
        if content1 == [] or url is None:
            break
        time.sleep(0.2)
    print("------- Book %d downloaded -------" % (j + 1))
```
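Many sites reject requests that carry Python's default `urllib` User-Agent, and qidian may do the same. If the fetches come back empty or with an error page, `gethtml` can be extended to send browser-like headers. A minimal sketch, keeping the same signature as above; the header string is just an example, not qidian's required value:

```python
import urllib.request
from bs4 import BeautifulSoup

HEADERS = {
    # Example browser User-Agent; any reasonably current one works.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
}

def gethtml(url):
    # Same behaviour as the version above, but with explicit request headers.
    req = urllib.request.Request(url, headers=HEADERS)
    with urllib.request.urlopen(req) as page:
        html = page.read().decode('utf-8')
    return BeautifulSoup(html, 'html.parser')
```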
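One more practical note: the script appends to `d:/88/<book name>.txt`, so that folder must already exist, and Windows forbids the characters `\ / : * ? " < > |` in file names, which a book title can easily contain. A small helper along these lines would avoid both failure modes (the name `safe_path` and the default folder are my own, not part of the original script):

```python
import os
import re

def safe_path(bookname, folder="d:/88"):
    # Create the output folder if needed, then replace characters
    # that are illegal in Windows file names.
    os.makedirs(folder, exist_ok=True)
    name = re.sub(r'[\\/:*?"<>|]', '_', bookname)
    return "%s/%s.txt" % (folder, name)

# usage in the main loop: load = safe_path(bookname[0])
```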
Result screenshot:
Still learning!!! Keep going!