python 爬起点目录

 1 #目标:书名,简介,作者,字数
 2 #首先确定源代码的列表
 3 import urllib.request
 4 import re
 5 from bs4 import BeautifulSoup
 6 import random
 7 import time
 8 
 9 load=input("路径:")  # output file path typed by the user; getbook() appends the scraped records to this file
10 num=input("输入页数:")  # number of listing pages to crawl, kept as a string here; geturl() converts it with int()
11 
12 
13 
14 
def gethtml(url):
    """Download *url* and return the page parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.

    Returns:
        A BeautifulSoup object built from the page text with the
        built-in 'html.parser' backend.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # The context manager closes the HTTP response even when read() or
    # decode() raises, so a failed page does not leak the connection.
    with urllib.request.urlopen(url) as page:
        html = page.read().decode('utf-8')  # decoded page text (a str)
    return BeautifulSoup(html, 'html.parser')
20 
# Field patterns, compiled once instead of once per book.
_TITLE_RE = re.compile(r'<h4><a .*?>(.*?)</a></h4>')        # book title
_AUTHOR_RE = re.compile(r'<a class="name" .*?>(.*?)</a>')   # author name
_INTRO_RE = re.compile(r'<p class="intro">([\s\S]*?)</p>')  # introduction
_SPAN_RE = re.compile(r'<span>(.*?)</span>')                # generic <span> text
_RECORD_SEPARATOR = "\n\n----------------------------------------------------------------------\n\n"


def getbook(soup, load, per_page=20):
    """Append the books found on one listing page to the file *load*.

    For every index 1..per_page the function looks up the
    ``<li data-rid="<index>">`` element(s) in *soup*, extracts the title,
    author, introduction and the text of the second ``<span>``
    (presumably the word count -- confirm against the live markup) with
    regular expressions, and writes them one per line followed by a
    separator line.

    Entries that are absent from the page (for example a last page with
    fewer than *per_page* books) or that lack one of the four fields are
    skipped instead of raising IndexError.

    Args:
        soup: Parsed page; only ``soup.find_all(name, attrs)`` is used.
        load: Path of the output file, opened in append mode.
        per_page: Highest ``data-rid`` index to look for (default 20).
    """
    # Open the file once per page rather than five times per book, and
    # write UTF-8 explicitly so the result does not depend on the locale
    # (gethtml decodes the page as UTF-8).
    with open(load, 'a', encoding='utf-8') as f:
        for rid in range(1, per_page + 1):
            items = soup.find_all("li", {"data-rid": str(rid)})
            if not items:
                continue  # no entry with this index on the page

            markup = str(items)
            titles = _TITLE_RE.findall(markup)
            authors = _AUTHOR_RE.findall(markup)
            intros = _INTRO_RE.findall(markup)
            spans = _SPAN_RE.findall(markup)
            # The fourth field is the *second* <span>, so two are needed.
            if not (titles and authors and intros) or len(spans) < 2:
                continue

            f.write("\n".join((titles[0], authors[0], intros[0], spans[1])))
            f.write(_RECORD_SEPARATOR)
def geturl(num):
    """Crawl listing pages 1..num and store every page's books.

    Each page is fetched with gethtml() and handed to getbook(), which
    appends to the module-level output path ``load``. The function pauses
    2.5 seconds after every page.

    Args:
        num: Number of pages to crawl; anything int() accepts.
    """
    url_template = "http://fin.qidian.com/?size=-1&sign=-1&tag=-1&chanId=-1&subCateId=-1&orderId=&update=-1&page=%d&month=-1&style=1&vip=0"
    page_count = int(num)

    for offset in range(page_count):
        # Page numbers in the URL are 1-based.
        listing = gethtml(url_template % (offset + 1))
        getbook(listing, load)
        time.sleep(2.5)
62 geturl(num)  # script entry point: crawl pages 1..num, appending every book found to the file named by load

实现

posted @ 2017-05-23 22:37  金牛小子  阅读(360)  评论(0)  编辑  收藏  举报