#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape ticket listings from nj.58.com and save one CSV file per page.

Run as a script: it asks how many listing pages to fetch, downloads each
page and writes ``./data/ticket_<index>.csv`` with the title, URL, price,
original price and show date of every listing found.
"""
import csv
import os
import re
import sys
import urllib.request as request
# ``imp`` was removed in Python 3.12; ``importlib.reload`` is the replacement.
from importlib import reload

from bs4 import BeautifulSoup as bs

# NOTE(review): looks like a leftover of the Python 2 default-encoding idiom;
# kept so the module-level behaviour is unchanged -- confirm it can be dropped.
reload(sys)

_DATA_DIR = './data/'
_FIRST_PAGE_URL = 'http://nj.58.com/piao/'
_USER_AGENT = ('Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) '
               'Gecko/20100101 Firefox/32.0')
_REQUEST_TIMEOUT = 30  # seconds; avoids hanging forever on a stalled server
_PLACEHOLDER = '---'   # written for a field that is missing on the page
_CSV_HEADER = ['标题', 'url', '售价', '原价', '演出时间']


def GetAllLink():
    """Ask for a page count and scrape that many listing pages.

    The first page lives at the bare listing URL; page ``n`` (n >= 2) lives
    under ``pn<n>/``. Each page is handed to :func:`GetPage` together with
    its zero-based index.

    Raises:
        ValueError: if the text typed at the prompt is not an integer.
    """
    num = int(input("爬取多少页:>"))
    os.makedirs(_DATA_DIR, exist_ok=True)

    for i in range(num):
        if i == 0:
            url = _FIRST_PAGE_URL
        else:
            url = 'http://nj.58.com/piao/pn%s/' % (i + 1)
        GetPage(url, i)


def _fetch_page(url):
    """Download *url* with a browser User-Agent and return it as UTF-8 text."""
    req = request.Request(url, headers={'User-Agent': _USER_AGENT})
    # The context manager closes the HTTP response even if read/decode fails.
    with request.urlopen(req, timeout=_REQUEST_TIMEOUT) as response:
        return response.read().decode('utf-8')


def _pad(values, length):
    """Return *values* extended with placeholders up to *length* items."""
    # A negative difference multiplies to an empty list, so longer lists
    # are returned unchanged.
    return values + [_PLACEHOLDER] * (length - len(values))


def _parse_listings(page):
    """Extract listing rows from the HTML text of one page.

    Returns:
        A list of ``[title, href, price, original_price, date]`` lists, one
        per title link found in the first ``<table>`` of the page. The list
        is empty when the page contains no table.
    """
    table = bs(page, "html.parser").table
    if table is None:
        return []

    # Re-parse only the table rows so the searches below ignore everything
    # else on the page.
    rows_soup = bs(str(table.find_all('tr')), "html.parser")

    title_links = rows_soup.find_all('a', 't')               # title and URL
    titles = [link.get_text() for link in title_links]
    hrefs = [link.get('href', '') for link in title_links]
    prices = [tag.get_text() for tag in rows_soup.find_all('b', 'pri')]
    fixed_prices = [tag.get_text() for tag in rows_soup.find_all('del')]
    dates = [tag.get_text() for tag in rows_soup.find_all('span', 'pr25')]

    # Every listing has a title, but price / original price / date may be
    # absent; pad each column to the number of titles.
    # NOTE(review): placeholders are appended at the tail, so a value missing
    # from a middle row shifts later values up by one row -- confirm whether
    # per-row extraction is needed.
    count = len(titles)
    columns = (
        titles,
        hrefs,
        _pad(prices, count),
        _pad(fixed_prices, count),
        _pad(dates, count),
    )
    # zip stops at the shortest column (the titles), dropping any surplus.
    return [list(row) for row in zip(*columns)]


def GetPage(url, num):
    """Scrape one listing page and save it as ``./data/ticket_<num>.csv``.

    Args:
        url: Address of the listing page to download.
        num: Zero-based page index; used in the output file name and, plus
            one, in the progress message.
    """
    rows = _parse_listings(_fetch_page(url))
    if not rows:
        print("[Warning]:> 页面 %s 未解析到任何信息" % (num + 1), file=sys.stderr)

    os.makedirs(_DATA_DIR, exist_ok=True)
    path = './data/ticket_%s.csv' % num
    # newline='' is required by the csv module to avoid blank lines on
    # Windows; an explicit encoding keeps the Chinese text writable
    # regardless of the locale.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(_CSV_HEADER)
        # Fields go to the writer directly so a '|' inside a title cannot
        # split it into extra columns.
        writer.writerows(rows)
    print("[Result]:> 页面 %s 信息保存完毕!" % (num + 1))


if __name__ == '__main__':
    GetAllLink()