爬取58二手数据.py (scraping 58.com second-hand item data)

# Module 1 (channel_extact.py): grab all channel links
from bs4 import BeautifulSoup
import requests


start_url = 'http://bj.58.com/sale.shtml'
url_host = 'http://bj.58.com'

def get_index_url(url):
    # Parse the second-hand sale index page and print the absolute URL of
    # every sub-channel (the hrefs in the menu are relative to the site root).
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('ul.ym-submnu > li > b > a')
    for link in links:
        page_url = url_host + link.get('href')
        print(page_url)

get_index_url(start_url)
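
# The main module (module 3) runs `from channel_extact import channel_list` and
# later calls channel_list.split(), so this file is also expected to define
# channel_list as a whitespace-separated string of the channel URLs printed
# above. A minimal sketch -- the two URLs below are placeholders; fill in
# whatever get_index_url(start_url) actually prints:
channel_list = '''
    http://bj.58.com/shouji/
    http://bj.58.com/diannao/
'''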


# Module 2 (pages_parsing.py): grab all item links and item detail data

from bs4 import BeautifulSoup
import requests
import time
import pymongo

# Connect to a local MongoDB instance. The name on the left is the Python-side
# variable; the string in brackets is the database / collection name in MongoDB.
client = pymongo.MongoClient('localhost', 27017)
ceshi = client['ceshi']            # database 'ceshi'
url_list = ceshi['url_list4']      # collection of item URLs found by spider 1
item_info = ceshi['item_info4']    # collection of item details found by spider 2

# spider 1
def get_links_from(channel, pages):
    # Fetch one listing page of a channel and store every item URL in MongoDB.
    # Listing pages contain td.t cells; when td.t is absent we have run past
    # the last page of the channel, so there is nothing left to collect.
    list_view = '{}/pn{}/'.format(channel, pages)
    wb_data = requests.get(list_view)
    time.sleep(1)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if soup.find('td', 't'):
        for link in soup.select('td.t a.t'):
            item_link = link.get('href').split('?')[0]
            if item_link != 'http://jump.zhineng.58.com/jump':   # skip ad redirects
                url_list.insert_one({'url': item_link})
                print(item_link)
    else:
        # No td.t on this page: it is past the last page of the channel.
        pass
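
# A quick manual test of spider 1 -- the channel URL here is only an example of
# the kind of link module 1 prints:
# get_links_from('http://bj.58.com/shouji/', 1)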

# spider 2
def get_item_info(url):
    # Fetch one item detail page and store title / price / area in MongoDB.
    # zhuanzhuan.58.com items use a different page layout than ordinary 58
    # listings, so the two branches need different CSS selectors.
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if url.startswith('http://zhuanzhuan.58.com/'):
        data = {
            'title': soup.title.text,
            'price': soup.select('span.price_now')[0].text,
            'area': soup.select('div.palce_li > span > i')[0].text,
            'url': url
            }
    else:
        data = {
            'title': soup.title.text,
            'price': soup.select('span.price.c_f50')[0].text,
            'area': soup.select('div.su_con > a')[0].get_text(),
            'sale_man': soup.select('ul.vcard > li > a')[0].text,
            'url': url
            }
    item_info.insert_one(data)
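
# Each select(...)[0] above raises IndexError when a page is missing the expected
# element (for example a removed listing). A small defensive helper -- a sketch,
# not part of the original code -- could be dropped in instead:
def safe_first_text(soup, selector, default=''):
    # Return the text of the first element matching `selector`, or `default`
    # when the selector matches nothing.
    hits = soup.select(selector)
    return hits[0].get_text() if hits else default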


# Module 3 (the main file): run this to start scraping
from multiprocessing import Pool
from pages_parsing import get_item_info,url_list,item_info,get_links_from
from channel_extact import channel_list


# Simple resume-from-breakpoint: subtract the URLs whose details are already in
# item_info from all URLs collected in url_list, and only crawl the remainder.
item_url = (item['url'] for item in url_list.find())
index_urls0 = (item['url'] for item in item_info.find())
x = set(item_url)
y = set(index_urls0)
rest_of_urls = x - y
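# For example (made-up values): if url_list held {'u1', 'u2', 'u3'} and item_info
# already held {'u2'}, rest_of_urls would be {'u1', 'u3'} -- only those two
# detail pages get fetched on this run.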

def get_all_links_from(channel):
    # Walk listing pages 1..99 of one channel; get_links_from stops storing
    # anything once a page has no more listings.
    for i in range(1, 100):
        get_links_from(channel, i)

if __name__ == '__main__':
    pool = Pool()
    # pool = Pool(processes=6)
    # Stage 1: uncomment the next line (and comment out the last one) to collect
    # item URLs from every channel into url_list first.
    # pool.map(get_all_links_from, channel_list.split())
    # Stage 2: scrape the detail pages that are not in item_info yet.
    pool.map(get_item_info, rest_of_urls)
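
# A hypothetical combined runner -- a sketch assuming the imports above, rather
# than toggling the two pool.map calls by hand:
def run_both_stages():
    with Pool() as pool:
        # Stage 1: collect item URLs from every channel.
        pool.map(get_all_links_from, channel_list.split())
        # Recompute what is still missing, then stage 2: scrape the details.
        remaining = ({i['url'] for i in url_list.find()}
                     - {i['url'] for i in item_info.find()})
        pool.map(get_item_info, remaining)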

# count = 0
# for url in rest_of_urls:
#     print(url)
#     count += 1
# print(count)

# Module 4: watch the data flow (a simple progress monitor)
import time
from pages_parsing import url_list

# Run this in a separate process while the crawler works: it prints how many
# item URLs have been collected so far, once every five seconds.
while True:
    print(url_list.count_documents({}))
    time.sleep(5)

 
