[Python3 Crawler from Beginner to Mastery] 2. Taobao Product Information Targeted Crawler: Example Analysis

import requests
import re


def getHTMLText(url):
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    }
    """
    try:
        #print(url)
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # I accidentally typed r.raise_forstatus and got no results for ages; I thought headers were needed, how embarrassing
        #print(r.status_code)
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""


def parsePage(ilt, html):
    try:
        #plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"', html)
        #tlt = re.findall(r'\"raw_title\"\:\".*?\"', html)
        plt = re.findall(r'"view_price":"[\d\.]*"', html)
        # \d   digit: [0-9]
        # *    match the preceding character zero or more times
        # []   match any single character from the character set
        tlt = re.findall(r'"raw_title":".*?"', html)
        # .    match any character except the newline "\n"
        # *?   a quantifier followed by ? becomes non-greedy, so it matches minimally inside the quotes;
        #      drop the trailing quote and the match collapses to nothing after "view_price":"
        for i in range(len(plt)):
            # eval() turns the quoted match into a Python object, i.e. it strips the surrounding quotes here
            price = eval(plt[i].split(':')[1])
            title = eval(tlt[i].split(':')[1])
            ilt.append([price, title])
    except:
        print('')


def printGoodsList(ilt):
    tplt = '{:4}\t{:8}\t{:16}'  # formatted output template
    print(tplt.format('序号', '价格', '商品名称'))
    count = 0
    for g in ilt:
        count = count + 1
        print(tplt.format(count, g[0], g[1]))


def main():
    goods = '书包'  # search keyword
    depth = 2  # how many result pages to crawl
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for i in range(depth):
        try:
            url = start_url + '&s=' + str(44 * i)  # each result page holds 44 items, so s is the offset of the first item
            html = getHTMLText(url)
            parsePage(infoList, html)
        except:
            continue
    printGoodsList(infoList)


main()
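
To make the extraction step easier to follow, here is a minimal, self-contained sketch of what parsePage does. The sample string below is made up, but it imitates the "view_price" and "raw_title" fields embedded in the JSON-like data of the search-result page source:

import re

# hypothetical fragment imitating the data embedded in the Taobao search-result page
sample = '"raw_title":"卡通双肩书包","view_price":"59.00","nick":"some_shop"'

plt = re.findall(r'"view_price":"[\d\.]*"', sample)   # ['"view_price":"59.00"']
tlt = re.findall(r'"raw_title":".*?"', sample)        # ['"raw_title":"卡通双肩书包"']

price = eval(plt[0].split(':')[1])   # eval() strips the quotes, leaving '59.00'
title = eval(tlt[0].split(':')[1])   # leaving '卡通双肩书包'
print(price, title)

Note that eval() is only used to strip the surrounding double quotes from each match; ast.literal_eval would be a safer choice for untrusted page content.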

Finally posted the next article after 50 days. Keep up the effort!
