赶集网二手数据.py (Ganji.com second-hand listings scraper)
# Part 1: get all second-hand channel links
# (Part 2 imports page_url from this script, so it is saved as channel_exciting.py)
import requests
from bs4 import BeautifulSoup

start_url = 'http://bj.ganji.com/wu/'
url_host = 'http://bj.ganji.com'
page_url = []   # channel index pages, e.g. http://bj.ganji.com/jiaju/

def get_index_url(url):
    wb_data = requests.get(url)
    if wb_data.status_code != 200:
        return page_url
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('dl.fenlei > dt > a')   # category entries on the index page
    for link in links:
        page_url.append(url_host + link.get('href'))
    return page_url

get_index_url(start_url)


# Part 2: get all item links inside every channel
# (Part 3 imports url_list from this script, so it is saved as page_parsing.py)
from multiprocessing import Pool

import pymongo
import requests
from bs4 import BeautifulSoup

from channel_exciting import page_url   # channel links collected in Part 1

client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
url_list = ganji['url_list']     # item-detail URLs
item_info = ganji['item_info']   # parsed item data, filled in Part 3

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36',
    'Connection': 'keep-alive'
}

def get_links_from(channel, page):
    # list pages look like http://bj.ganji.com/jiaju/o3/
    list_url = '{}o{}'.format(channel, page)
    wb_data = requests.get(list_url, headers=headers)
    if wb_data.status_code != 200:
        return
    soup = BeautifulSoup(wb_data.text, 'lxml')
    for link in soup.select('td a.t'):   # item links in the listing table ('td a,t' in the original looks like a typo)
        item_link = link.get('href').split('?')[0]
        url_list.insert_one({'url': item_link})
        print(item_link)

def get_all_links(channel):
    for i in range(1, 100):
        get_links_from(channel, i)

if __name__ == '__main__':
    pool = Pool()
    pool.map(get_all_links, page_url)


# Part 3: get the data behind every item link
from multiprocessing import Pool

import time

import pymongo
import requests
from bs4 import BeautifulSoup

from page_parsing import url_list   # item links collected in Part 2

client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
item_info = ganji['item_info']

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36',
    'Connection': 'keep-alive'
}

def get_items_info(url):
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    try:
        data = {
            'title': soup.title.text,
            'price': soup.select('span.price_now > i')[0].text,
            'area': soup.select('div.palce_li > span > i')[0].text,   # class name as it appears in the page source
            'url': url
        }
        item_info.insert_one(data)
    except IndexError:
        # detail page has a different layout (e.g. the item was taken down), skip it
        pass
    else:
        print(data)
    time.sleep(2)   # pause between requests to stay polite

if __name__ == '__main__':
    links = [item['url'] for item in url_list.find()]
    pool = Pool()
    pool.map(get_items_info, links)
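

# Optional progress check: a minimal sketch, not part of the original scripts.
# It assumes the same MongoDB database and collection names as above and simply
# reports how many item URLs and parsed items have been stored so far while the
# crawlers run.
import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']

while True:
    print('urls collected: {}  items parsed: {}'.format(
        ganji['url_list'].count_documents({}),
        ganji['item_info'].count_documents({})))
    time.sleep(5)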