爬取电商售卖信息
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Filename: spider_58center_sth.py
"""Scrape second-hand sale listings (e.g. computers) from 58.com Nanjing.

Fetches the listing index page, filters out third-party (zhuanzhuan) and
redirect links, then visits each detail page and prints a dict with the
listing's category, title, date, price, condition and area.
"""

from bs4 import BeautifulSoup
import time
import requests

url_58 = 'http://nj.58.com/?PGTID=0d000000-0000-0c5c-ffba-71f8f3f7039e&ClickID=1'


def get_url_list(url):
    """Fetch the index page at *url* and return the detail-page URLs.

    Args:
        url: Index page URL to scrape for listing links.

    Returns:
        A newline-joined string of detail URLs (with a leading '\n',
        preserved for backward compatibility with callers that
        ``.split()`` the result).
    """
    # timeout prevents the script from hanging forever on a dead server
    web_data = requests.get(url, timeout=10)
    soup = BeautifulSoup(web_data.text, 'lxml')

    links = []
    for anchor in soup.select('td.t > a[class="t"]'):
        href = anchor.get('href')
        # href may be None; also skip zhuanzhuan (third-party resale
        # platform) pages and 'jump' redirect links.
        if not href or 'zhuanzhuan' in href or 'jump' in href:
            continue
        links.append(href)

    # join instead of repeated += (avoids quadratic string building);
    # the leading '\n' matches the original output format exactly
    url_list = ''.join('\n' + link for link in links)
    print('url_list: %s' % url_list)
    return url_list


def get_url_info():
    """Visit every detail page found on the index and print its fields."""
    url_list = get_url_list(url_58)

    for url in url_list.split():
        time.sleep(1)  # be polite to the server between requests
        web_data = requests.get(url, timeout=10)
        soup = BeautifulSoup(web_data.text, 'lxml')

        # 'categories' etc. avoid shadowing the builtin 'type'
        categories = soup.select('#head > div.breadCrumb.f12 > span:nth-of-type(3) > a')
        titles = soup.select(' div.col_sub.mainTitle > h1')
        dates = soup.select('li.time')
        prices = soup.select('div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.summary > ul > '
                             'li:nth-of-type(1) > div.su_con > span.price.c_f50')
        # NOTE(review): the original selectors used 'u1' (letter u, digit
        # one) here while the price selector above uses 'ul' — corrected
        # to 'ul' for consistency; confirm against the live page markup.
        finenesses = soup.select('div.col_sub.summary > ul > li:nth-of-type(2) > div.su_con > span')
        areas = soup.select('div.col_sub.summary > ul > li:nth-of-type(3) > div.su_con > span')

        for categoryi, titlei, datei, pricei, finenessi, areai in zip(
                categories, titles, dates, prices, finenesses, areas):
            # assemble one listing record; keys match the original output
            data = {
                'type': categoryi.get_text(),
                'title': titlei.get_text(),
                'date': datei.get_text(),
                'price': pricei.get_text(),
                'fineness': (finenessi.get_text()).strip(),
                'area': list(areai.stripped_strings)
            }
            print(data)


if __name__ == '__main__':
    # guard so importing this module no longer triggers the full scrape
    get_url_info()
爬取商城商品售卖信息