爬取笔下文学小说
# Crawl novels from bxwx9.org (笔下文学) day-visit ranking and append each
# book's chapters to a per-book UTF-8 text file under d:/77/.
import urllib.request
from bs4 import BeautifulSoup
import re


def gethtml(url):
    """Fetch *url* and return it parsed as a BeautifulSoup document.

    The site serves GBK-encoded pages, so the response bytes are decoded
    as GBK before parsing.  The HTTP response is closed deterministically
    (the original leaked it).
    """
    with urllib.request.urlopen(url) as page:
        html = page.read().decode('gbk')
    return BeautifulSoup(html, "html.parser")


def getcontent(soup, load):
    """Extract one chapter's text and title from *soup*, append to file *load*.

    *load* is the path of the book's output .txt file; the chapter is
    appended with a decorated title line followed by the body text.
    """
    # Chapter body sits in <div id="content"> right after an inner ad div.
    parts = re.findall(r'<div id="content"><div id="adright"></div>(.*?)</div>',
                       str(soup))
    # join() instead of the original quadratic += loop.
    body = "".join(parts)
    # Strip any HTML tags left inside the captured body.
    body = re.sub(r"</?\w+[^>]*>", "", body)
    # Break into indented paragraphs after every Chinese full stop.
    body = body.replace('。', '。\n\n\0\0\0\0\0\0')
    # Chapter title lives in <div id="title">.
    zjname = re.findall(r'<div id="title">(.*?)</div>', str(soup))
    with open(load, 'a', encoding='utf-8') as f:
        f.write("\0\0\0\0\0\0-----------------------------------------------------------"
                + zjname[0]
                + "------------------------------------------------------\n\n"
                + body)


def book(soup):
    """Download every book listed on the ranking page *soup*.

    For each book: follow its info page, locate the "click to read" link,
    resolve the first chapter URL, then walk the 下一页 (next page) links
    chapter by chapter until no next link exists.
    """
    # Every book link on the ranking table.
    bookurl = re.findall(r'<td class="odd"><a href="(.*?)">', str(soup))
    for i, info_url in enumerate(bookurl):
        print(info_url)
        soup1 = gethtml(info_url)
        # The "click to read" anchor follows four non-breaking spaces.
        allcontent = re.findall(r'</a>\xa0\xa0\xa0\xa0<a href="(.*?)">', str(soup1))
        bookname = re.findall(r'<strong>(.*?)全集下载</strong>', str(soup1))
        soup2 = gethtml(allcontent[0])
        # First chapter is the first <dd> link on the table-of-contents page.
        firsturl1 = re.findall(r'<dd><a href="(.*?)">.*?</a></dd>', str(soup2))
        # Reading pages live under .../b/... instead of .../binfo/...;
        # drop the ".htm"-style 4-char suffix to get the directory URL.
        headurl = info_url[0:-4].replace("binfo", "b")
        firsturl2 = headurl + "/" + firsturl1[0]
        print(firsturl2)
        soup3 = gethtml(firsturl2)
        k = 0
        load = "d:/77/%s.txt" % bookname[0]
        try:
            while True:
                nexturl = re.findall(r'<li><a href="(.*?)">下一页', str(soup3))
                getcontent(soup3, load)
                # On the last chapter nexturl is empty, so nexturl[0]
                # raises IndexError and ends this book's loop.
                soup3 = gethtml(headurl + "/" + nexturl[0])
                k += 1
                print("第%d章下载完成" % k)
        except Exception:
            # Narrowed from a bare except so Ctrl-C still aborts the crawl;
            # any per-book failure just moves on to the next book.
            print("-------------------第%d本书下载完成---------------" % (i + 1))


if __name__ == "__main__":
    # Guarded so importing this module no longer triggers network I/O.
    url = "http://www.bxwx9.org/modules/article/toplist.php?sort=dayvisit"
    soup = gethtml(url)
    book(soup)  # book() returns None; the original bound it to `load` uselessly
保存到txt文件中
2017-05-28
22:58:35