Large-Scale Data Crawling -- Python

A crawler written in Python whose goal is to scrape all individual and merchant product listings together with their detail pages, and then categorize and analyze the collected data.

Overall workflow diagram: (the original flow-chart image is not reproduced here)

Step 1: Automatically collect all channel links from the front-end page

from bs4 import BeautifulSoup
import requests

# 1. Find the links of all channels in the left sidebar
start_url = 'http://hz.58.com/sale.shtml'
url_host = 'http://hz.58.com'

def get_channel_urls(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # Each channel entry sits under ul.ym-mainmnu; keep only <a> tags that carry an href
    links = soup.select('ul.ym-mainmnu > li > span > a[href]')
    for link in links:
        # The hrefs are relative, so prepend the host to get the full channel URL
        page_url = url_host + link.get('href')
        print(page_url)

# Print every channel URL once; the output was then pasted into channel_list below
get_channel_urls(start_url)

channel_list = '''
    http://hz.58.com/shouji/
    http://hz.58.com/tongxunyw/
    http://hz.58.com/danche/
    http://hz.58.com/diandongche/
    http://hz.58.com/diannao/
    http://hz.58.com/shuma/
    http://hz.58.com/jiadian/
    http://hz.58.com/ershoujiaju/
    http://hz.58.com/yingyou/
    http://hz.58.com/fushi/
    http://hz.58.com/meirong/
    http://hz.58.com/yishu/
    http://hz.58.com/tushu/
    http://hz.58.com/wenti/
    http://hz.58.com/bangong/
    http://hz.58.com/shebei.shtml
    http://hz.58.com/chengren/
'''
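The channels are kept as one triple-quoted string rather than a Python list; Step 3 below simply calls channel_list.split() to turn it into a list of URLs. A minimal sketch of that conversion (the channels variable name is only illustrative):

# channel_list is the triple-quoted string defined above
channels = channel_list.split()   # split() with no argument splits on any whitespace
print(len(channels))              # 17 channel URLs
print(channels[0])                # 'http://hz.58.com/shouji/'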

 

Step 2: Use the channels collected in Step 1 to gather every listing URL, store them in the url_list collection, and fetch the detail information for each item

from bs4 import BeautifulSoup
import requests
import time
import pymongo

# Connect to a local MongoDB instance: one collection for listing URLs, one for item details
client = pymongo.MongoClient('localhost', 27017)
ceshi = client['ceshi']
url_list = ceshi['url_list']
item_info = ceshi['item_info']


def get_links_from(channel, pages, who_sells=0):
    # List pages look like http://hz.58.com/shouji/0/pn7/
    list_view = '{}{}/pn{}/'.format(channel, str(who_sells), str(pages))
    wb_data = requests.get(list_view)
    time.sleep(1)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('td.t > a[onclick]')
    # Only pages that actually contain listings have td.t cells; otherwise skip the page
    if soup.find('td', 't'):
        for link in links:
            # Drop the query string so the same item always yields the same URL
            item_link = link.get('href').split('?')[0]
            url_list.insert_one({'url': item_link})
            print(item_link)
    else:
        pass


def get_item_info(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # Items that have been taken down show "商品已下架" in the page text
    no_longer_exist = '商品已下架' in soup.get_text()
    if no_longer_exist:
        pass
    else:
        title = soup.title.text
        price = soup.select('span.price_now > i')[0].text
        area = soup.select('div.palce_li > span > i')[0].text
        # item_info.insert_one({'title': title, 'price': price, 'area': area})
        print({'title': title, 'price': price, 'area': area})

# get_links_from('http://hz.58.com/pbdn/', 7)
# get_item_info('http://zhuanzhuan.58.com/detail/840577950118920199z.shtml')
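One practical concern: if the crawl is restarted, get_links_from will insert the same listing URL a second time. A minimal safeguard, assuming the same ceshi database and url_list collection as above (this is an addition, not part of the original code), is a unique index on the url field plus an upsert-style write:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
url_list = client['ceshi']['url_list']

# Enforce uniqueness on the url field so re-runs cannot create duplicate documents
url_list.create_index('url', unique=True)

def save_link(item_link):
    # update_one with upsert=True inserts the URL only if it is not already stored
    url_list.update_one({'url': item_link}, {'$set': {'url': item_link}}, upsert=True)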

 

Step 3: The main entry point, driving the crawl with a pool of worker processes

from multiprocessing import Pool
from channel_extract import channel_list
from page_parsing import get_links_from

# Crawl list pages 1-30 of a single channel
def get_all_links_from(channel):
    for num in range(1, 31):
        get_links_from(channel, num)

if __name__ == '__main__':
    # One worker per CPU core by default; each worker handles one channel at a time
    pool = Pool()
    pool.map(get_all_links_from, channel_list.split())
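If the run is interrupted, the detail pages already parsed do not need to be crawled again. A small resume sketch (it assumes get_item_info also stores a url field into item_info, which the commented-out insert above does not yet do):

from page_parsing import url_list, item_info, get_item_info

# URLs already collected from the list pages
db_urls = [item['url'] for item in url_list.find()]
# URLs whose detail page has already been parsed (assumes item_info documents carry a 'url' field)
index_urls = [item['url'] for item in item_info.find()]

# Crawl only the detail pages that have not been handled yet
rest_of_urls = set(db_urls) - set(index_urls)
for url in rest_of_urls:
    get_item_info(url)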

 

Step 4: Monitor the collected data in real time

from time import sleep
from page_parsing import url_list

# Print the number of collected listing URLs every five seconds while the crawler runs
while True:
    print(url_list.count_documents({}))
    sleep(5)
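This monitor is meant to run in its own terminal, in parallel with the Step 3 crawler; since the two scripts only share the MongoDB collection, they do not interfere with each other.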

 

 

Sample run output: (the original screenshot is not reproduced here)
