爬取电商售卖信息

 1 #! /usr/bin/env python
 2 # -*- coding: utf-8 -*-
 3 # Filename:spider_58center_sth.py
 4 
 5 from bs4 import BeautifulSoup
 6 import time
 7 import requests
 8 
# Listing page that get_url_info() hands to get_url_list() as the crawl entry point.
url_58 = 'http://nj.58.com/?PGTID=0d000000-0000-0c5c-ffba-71f8f3f7039e&ClickID=1'

# The bare string below is a module-level description written in Chinese:
# "Scrapes e-commerce sale listings; the example is 58.com computer listings."
'''
用于爬取电商售卖信息:例为58同城电脑售卖信息'''
13 
14 
def get_url_list(url):
    """Collect item-detail links from a listing page.

    Downloads ``url``, selects every ``td.t > a[class="t"]`` anchor and keeps
    the ``href`` values that contain neither ``'zhuanzhuan'`` nor ``'jump'``.
    The resulting string is also printed to stdout.

    Args:
        url: Address of the listing page to download.

    Returns:
        One string in which each kept link is prefixed with a newline
        character; the empty string when nothing matched. Callers recover
        the individual links with ``str.split()``.

    Raises:
        requests.RequestException: If the page cannot be fetched, including
            when the server does not answer within the timeout.
    """
    # A timeout keeps a stalled server from blocking the crawl forever.
    web_data = requests.get(url, timeout=10)
    soup = BeautifulSoup(web_data.text, 'lxml')

    kept = []
    for anchor in soup.select('td.t > a[class="t"]'):
        href = anchor.get('href')
        # Tag.get() yields None for an anchor without an href attribute;
        # the substring tests below would raise TypeError on it.
        if href is None:
            continue
        if 'zhuanzhuan' in href or 'jump' in href:
            continue
        kept.append(href)

    # Same layout as before: every link is preceded by '\n'.
    url_list = ''.join('\n' + href for href in kept)

    print('url_list: %s' % url_list)
    return url_list
32 
33 
34 # 分类获取目标信息
def get_url_info():
    """Fetch every detail page found by ``get_url_list`` and print its fields.

    For each link returned for ``url_58`` the page is downloaded and six
    groups of elements are selected (category, title, date, price, fineness,
    area). The groups are zipped together and one dict per tuple is printed.

    Returns:
        None. Results are written to stdout only.

    Raises:
        requests.RequestException: If a detail page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    url_list = get_url_list(url_58)

    for url in url_list.split():
        # One-second pause before each request (presumably to avoid
        # hammering the site -- confirm the intended rate).
        time.sleep(1)
        web_datas = requests.get(url, timeout=10)
        soup = BeautifulSoup(web_datas.text, 'lxml')

        # Named "categories" rather than "type" so the builtin is not shadowed.
        categories = soup.select('#head > div.breadCrumb.f12 > span:nth-of-type(3) > a')
        titles = soup.select(' div.col_sub.mainTitle > h1')
        dates = soup.select('li.time')
        prices = soup.select('div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.summary > ul > '
                             'li:nth-of-type(1) > div.su_con > span.price.c_f50')
        # These two selectors previously said "u1" (digit one) instead of
        # "ul", so they never matched and the zip below was always empty.
        finenesses = soup.select('div.col_sub.summary > ul > li:nth-of-type(2) > div.su_con > span')
        areas = soup.select('div.col_sub.summary > ul > li:nth-of-type(3) > div.su_con > span')

        # zip() stops at the shortest group, so a page missing any one of
        # the six fields produces no output at all.
        for category, title, date, price, fineness, area in zip(
                categories, titles, dates, prices, finenesses, areas):
            # Build the record for this listing.
            data = {
                'type': category.get_text(),
                'title': title.get_text(),
                'date': date.get_text(),
                'price': price.get_text(),
                'fineness': fineness.get_text().strip(),
                'area': list(area.stripped_strings),
            }
            print(data)


# The entry call used to sit inside the function body, which made
# get_url_info() recurse without end and never start when run as a script.
if __name__ == '__main__':
    get_url_info()

爬取商城商品售卖信息

posted on 2017-10-13 13:20 落羽生 阅读(452) 评论(0) 编辑 收藏 举报

刷新页面 返回顶部

止静

python/python web (bash shell) | 嵌入式

爬取电商售卖信息

导航

公告