#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape ticket listings from nj.58.com/piao and save each page as a CSV file."""
import csv
import os
import re
import sys
import urllib.request as request

from bs4 import BeautifulSoup as bs

try:
    # ``imp`` was removed in Python 3.12; importlib provides the same function.
    from importlib import reload
except ImportError:
    # Fallback for interpreters whose importlib has no ``reload``.
    from imp import reload

# Legacy Python 2 idiom, kept so the module's import-time behaviour is unchanged.
reload(sys)
11  
def GetAllLink():
    """Prompt for a page count and scrape that many listing pages.

    Reads the number of pages from standard input, makes sure the ``./data/``
    output directory exists, then calls ``GetPage`` once per page with the
    page URL and the page's zero-based index.

    Raises:
        ValueError: if the text entered at the prompt is not an integer.
    """
    num = int(input("爬取多少页:>"))
    # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
    os.makedirs('./data/', exist_ok=True)

    for i in range(num):
        page_no = i + 1
        # The first page has no "pn<N>" path segment; every later page does.
        if page_no == 1:
            url = 'http://nj.58.com/piao/'
        else:
            url = 'http://nj.58.com/piao/pn%s/' % page_no
        GetPage(url, i)
24  
25  
def GetPage(url, num):
    """Download one listing page and write its ticket rows to a CSV file.

    The page is fetched with a browser-like User-Agent, decoded as UTF-8 and
    parsed with BeautifulSoup. From every ``<tr>`` of the first ``<table>``
    the function collects the title and link (``<a class="t">``), the price
    (``<b class="pri">``), the original price (``<del>``) and the show date
    (``<span class="pr25">``), then writes one CSV row per title to
    ``./data/ticket_<num>.csv``.

    Args:
        url: address of the listing page to download.
        num: zero-based page index; used in the output file name and,
            plus one, in the status message printed at the end.
    """
    missing = '---'  # placeholder written when a value is absent

    user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) Gecko/20100101 Firefox/32.0'
    req = request.Request(url, headers={'User-Agent': user_agent})
    # Close the HTTP response deterministically once the body has been read.
    with request.urlopen(req) as response:
        page = response.read().decode('utf-8')

    soup = bs(page, "html.parser")
    table = soup.table
    # A page without any <table> yields no rows instead of an AttributeError.
    rows = table.find_all('tr') if table is not None else []

    titles = []
    hrefs = []
    prices = []
    fixed_prices = []
    dates = []
    for row in rows:
        for link in row.find_all('a', 't'):          # title and url
            titles.append(link.get_text())
            # An anchor without href would otherwise yield None.
            hrefs.append(link.get('href', missing))
        prices.extend(tag.get_text() for tag in row.find_all('b', 'pri'))
        fixed_prices.extend(tag.get_text() for tag in row.find_all('del'))
        dates.extend(tag.get_text() for tag in row.find_all('span', 'pr25'))

    # Every listing has a title, but price, original price and date may be
    # absent. Pad each column independently up to the number of titles so that
    # building the rows can never run past the end of a shorter column.
    # NOTE(review): placeholders are appended at the END of a column, so a
    # listing that lacks a value in the middle of the page shifts the later
    # values up by one row. Confirm whether per-listing extraction is wanted.
    for column in (prices, fixed_prices, dates):
        column.extend([missing] * (len(titles) - len(column)))

    # newline='' is required by the csv module to avoid blank lines on
    # Windows; an explicit encoding keeps the Chinese header and titles
    # writable regardless of the system locale.
    with open('./data/ticket_%s.csv' % num, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['标题','url','售价','原价','演出时间'])
        # Pass the fields straight to the writer: joining on '|' and splitting
        # again would break any field that itself contains '|'.
        writer.writerows(zip(titles, hrefs, prices, fixed_prices, dates))

    print("[Result]:> 页面 %s 信息保存完毕!" % (num + 1))
82  
83  
# Script entry point: start the crawl only when run directly, not on import.
if __name__ == "__main__":
    GetAllLink()

参考地址

posted on 2017-08-04 17:08  风又奈何  阅读(319)  评论(0)  编辑  收藏  举报