Python 爬取 qidian 某一页的全部小说

 

 本文纯粹用于技术练习,请勿用于非法用途。

 
import os
import re
import time
import urllib.request

from bs4 import BeautifulSoup

  6 url=input("第一页网址:")
  7 
  8 def gethtml(url):
  9                                       #获取页面源代码html
 10     page=urllib.request.urlopen(url)
 11     html=page.read().decode('utf-8')  #html是一个列表
 12     soup=BeautifulSoup(html,'html.parser')
 13     
 14     return soup
 15 
 16 
 17 def getbookurl(soup):                   #获取该页所有书本的链接地址
 18     try:
 19         
 20         firsturl2=[]
 21         bookurl=soup.find_all("h4")
 22         bookurl1=re.findall(r'<h4><a data-bid=".*?" data-eid=".*?" href="(.*?)" target="_blank"',str(bookurl))
 23         #print(bookurl1)
 24         for i in range(0,len(bookurl1)):
 25             bookurl="http:"+bookurl1[i]
 26         
 27         
 28             soup1=gethtml(bookurl)          #获取每本书第一章 的url
 29             time.sleep(0.2)
 30             firsturl=soup1.find_all("a",{"class":"red-btn J-getJumpUrl "})
 31             firsturl1=re.findall(r'data-firstchapterjumpurl=".*?" href="(.*?)" id="readBtn">',str(firsturl))
 32             if firsturl1[0]=='':            #由于起点限制,某些链接无法爬取,显示的是一个空列表,这里要进行判断
 33                 continue
 34             firsturl2.append(firsturl1[0])
 35             print(firsturl2)
 36         return firsturl2
 37     except:
 38         return firsturl2
 39     
 40 
 41 
 42 
 43 def getcontent(soup,load):
 44     
 45     content=soup.find_all("div",{"class":"read-content j_readContent"})
 46     
 47     content1=re.compile(r'<p>([\s\S]*?)</p>')
 48     
 49     content2=content1.findall(str(content))
 50    
 51     content3=re.sub("</?\w+[^>]*>",'',content2[0])
 52     
 53     content4=content3.replace('','。\n\n\0\0\0')  #到此,将章节内容获取完毕
 54 
 55     contentname=re.compile(r'<h3 class="j_chapterName">(.*?)</h3>')
 56     
 57     contentname1=contentname.findall(str(soup))     #获取章节名称
 58 
 59     book="----------------------------------------------------------------"+contentname1[0]+"------------------------------------------------------------\n\n\n"+content4   
 60 
 61     with open(load, 'a',encoding='gb18030') as f:       #这里的gb18030是GBK的父集,所以能兼容GBK不能编码的字符。
 62 
 63         f.write(book)
 64 
 65     
 66 
 67 def nextcontent(soup):
 68 
 69     content=soup.find_all("div",{"class":"chapter-control dib-wrap"})
 70     
 71     #print(str(content))
 72     
 73     step=re.compile(r'<a data-eid="qd_R109" href="(.*?)" id="j_chapterNext">')
 74 
 75     content1=step.findall(str(content))
 76 
 77     if content1 == []:
 78 
 79         step1=re.compile(r'<a data-eid="qd_R118" href="(.*?)" id="j_chapterNext">')
 80 
 81         content2=step1.findall(str(content))
 82 
 83         url="http:"+content2[0]
 84 
 85         return url
 86     else:
 87         url="http:"+content1[0]
 88 
 89         return url
 90 
 91 def panduan(soup):
 92     
 93     content=soup.find_all("div",{"class":"chapter-control dib-wrap"})
 94     
 95     #print(str(content))
 96     
 97     step=re.compile(r'<a data-eid="qd_R109" href="(.*?)" id="j_chapterNext">')
 98     
 99     content1=step.findall(str(content))
100     
101     return content1
# -------------------------------------------------------------------------
# Script body: download every book linked from the listing page in `url`.
# -------------------------------------------------------------------------

OUTPUT_DIR = "d:/88"
# open(..., 'a') creates the file but not its directory, so make sure it exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# The listing page is processed exactly once.  The original wrapped this in an
# endless loop that re-read `url` after the chapter loop had overwritten it.
soup2 = gethtml(url)
firsturl2 = getbookurl(soup2)

for j, first_chapter in enumerate(firsturl2):
    # The first chapter page also carries the book title in its <h1>,
    # so it is fetched once and reused for both purposes.
    soup = gethtml("http:" + first_chapter)
    bookname = re.findall(r'<h1>(.*?)</h1>', str(soup))
    title = bookname[0] if bookname else "book_%d" % (j + 1)
    # Replace characters that are not allowed in Windows file names.
    title = re.sub(r'[\\/:*?"<>|]', '_', title)
    load = "%s/%s.txt" % (OUTPUT_DIR, title)

    i = 0
    while True:
        getcontent(soup, load)
        i += 1
        print("第%d章下载完成" % i)

        # An empty result means there is no ordinary next-chapter link:
        # this was the last chapter to download.
        if not panduan(soup):
            break

        time.sleep(0.2)  # small pause between requests
        soup = gethtml(nextcontent(soup))
    print("-------------第%d本书下载完成---------" % (j + 1))


 

 

结果图:

 

 

学习ing!!!  加油

posted @ 2017-05-25 12:21  金牛小子  阅读(1050)  评论(0)  编辑  收藏  举报