爬取58二手数据.py (crawling 58.com second-hand listing data)
# Module 1: crawl all channel links (channel_extact.py)
from bs4 import BeautifulSoup
import requests

start_url = 'http://bj.58.com/sale.shtml'
url_host = 'http://bj.58.com'

def get_index_url(url):
    # parse the second-hand category page and print the full URL of every channel
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('ul.ym-submnu > li > b > a')
    for link in links:
        page_url = url_host + link.get('href')
        print(page_url)

get_index_url(start_url)


# Module 2: crawl all item links and detail data (pages_parsing.py)
from bs4 import BeautifulSoup
import requests
import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
ceshi = client['ceshi']
url_list = ceshi['url_list4']    # the name on the left is the Python object; the string is the collection name in the database
item_info = ceshi['item_info4']

# spider 1
def get_links_from(channel, pages):
    # a listing page without a 'td.t' cell means we have run past the last page, so stop
    list_view = '{}/pn{}/'.format(channel, str(pages))
    wb_data = requests.get(list_view)
    time.sleep(1)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if soup.find('td', 't'):
        for link in soup.select('td.t a.t'):
            item_link = link.get('href').split('?')[0]
            if item_link != 'http://jump.zhineng.58.com/jump':
                url_list.insert_one({'url': item_link})
                print(item_link)
        # return urls
    else:
        # It's the last page!
        pass

# spider 2
def get_item_info(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # zhuanzhuan.58.com detail pages use a different layout from regular 58.com pages
    if url.startswith('http://zhuanzhuan.58.com/'):
        data = {
            'title': soup.title.text,
            'price': soup.select('span.price_now')[0].text,
            'area': soup.select('div.palce_li > span > i')[0].text,
            'url': url
        }
        item_info.insert_one(data)
    else:
        data = {
            'title': soup.title.text,
            'price': soup.select('span.price.c_f50')[0].text,
            'area': soup.select('div.su_con > a')[0].get_text(),
            'sale_man': soup.select('ul.vcard > li > a')[0].text,
            'url': url
        }
        item_info.insert_one(data)


# Module 3: main file, run to start crawling
from multiprocessing import Pool
from pages_parsing import get_item_info, url_list, item_info, get_links_from
from channel_extact import channel_list    # see the assumed channel_list sketch at the end of this file

# resume support: only crawl detail pages whose URL has been collected but not yet parsed
item_url = (item['url'] for item in url_list.find())
index_urls0 = (item['url'] for item in item_info.find())
x = set(item_url)
y = set(index_urls0)
rest_of_urls = x - y

def get_all_links_from(channel):
    for i in range(1, 100):
        get_links_from(channel, i)
    return rest_of_urls

if __name__ == '__main__':
    pool = Pool()
    # pool = Pool(processes=6)
    # pool.map(get_all_links_from, channel_list.split())
    pool.map(get_item_info, rest_of_urls)
    # count = 0
    # for url in rest_of_urls:
    #     print(url)
    #     count += 1
    # print(count)


# Module 4: watch the data flow
import time
from pages_parsing import url_list

while True:
    print(url_list.count_documents({}))    # url_list.find().count() on pymongo < 3.7
    time.sleep(5)
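# Note: Module 3 imports channel_list from channel_extact, but Module 1 only prints the
# channel URLs. Below is a minimal sketch of what that variable could look like, assuming
# the printed URLs are pasted into one whitespace-separated string so that
# channel_list.split() in Module 3 yields one channel URL per pool task.
# The two URLs are illustrative examples only, not the full channel list.
channel_list = '''
    http://bj.58.com/shouji/
    http://bj.58.com/diannao/
'''
# With channel_list filled in, the commented-out pool.map(get_all_links_from, channel_list.split())
# line in Module 3 can be enabled first to populate url_list, before running get_item_info.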