python 爬取起点中文网图书目录（书名、作者、简介、字数）
# Goal: scrape book title, author, intro and word count from the Qidian
# finished-books listing pages and append each entry to a text file.
import urllib.request
import re
import random
import time

from bs4 import BeautifulSoup


def gethtml(url):
    """Fetch *url* and return the parsed page as a BeautifulSoup tree.

    Assumes the page is UTF-8 encoded (true for qidian.com listing pages).
    """
    page = urllib.request.urlopen(url)
    html = page.read().decode('utf-8')
    return BeautifulSoup(html, 'html.parser')


def getbook(soup, load):
    """Extract up to 20 book entries from *soup* and append them to file *load*.

    Each listing page carries <li data-rid="1"> .. <li data-rid="20">.
    For every entry the title, author, intro and word count are written,
    followed by a dashed separator line.
    """
    for rid in range(1, 21):
        items = soup.find_all("li", {"data-rid": str(rid)})
        if not items:
            # Fewer than 20 books on this page (e.g. the last page) -- skip.
            continue
        fragment = str(items)

        titles = re.findall(r'<h4><a .*?>(.*?)</a></h4>', fragment)       # book title
        authors = re.findall(r'<a class="name" .*?>(.*?)</a>', fragment)  # author name
        intros = re.findall(r'<p class="intro">([\s\S]*?)</p>', fragment) # synopsis
        spans = re.findall(r'<span>(.*?)</span>', fragment)               # 2nd span = word count
        if not (titles and authors and intros and len(spans) > 1):
            # Malformed or partial entry -- do not crash the whole crawl.
            continue

        content = ["《" + titles[0] + "》", authors[0], intros[0], spans[1]]

        # Open once per entry; explicit UTF-8 so Chinese text is written
        # correctly regardless of the platform's default encoding.
        with open(load, 'a', encoding='utf-8') as f:
            for field in content[:-1]:
                f.write(field + "\n")
            f.write(content[-1])
            f.write("\n\n----------------------------------------------------------------------\n\n")


def geturl(num):
    """Crawl *num* listing pages (num may be an int or numeric string).

    Writes results via getbook() to the module-level path ``load`` and
    sleeps 2.5 s between pages to be polite to the server.
    """
    for page in range(1, int(num) + 1):
        url = ("http://fin.qidian.com/?size=-1&sign=-1&tag=-1&chanId=-1"
               "&subCateId=-1&orderId=&update=-1&page=%d&month=-1&style=1"
               "&vip=0" % page)
        soup = gethtml(url)
        getbook(soup, load)
        time.sleep(2.5)


if __name__ == "__main__":
    # Prompt only when run as a script, not on import.
    load = input("路径:")
    num = input("输入页数:")
    geturl(num)
实现代码如上。