# Scrape ~160,000 company records from 顺企网 (11467.com) and save them to MongoDB.

import requests
from bs4 import BeautifulSoup
import pymongo
from multiprocessing.dummy import Pool as ThreadPool

# Mobile Safari User-Agent so the site serves the lightweight m.11467.com pages.
headers = {'User-Agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'}

# MongoDB connection (local default port). NOTE(review): "conpany" is a typo
# for "company", but it is baked into the database name on disk — renaming it
# would point the scraper at a different (empty) database, so it is kept.
client = pymongo.MongoClient('localhost',27017)
conpany_info = client['conpany_info']  # database holding the scraped records
sheet_table = conpany_info['sheet_table']  # collection that receives one doc per company

def jiexi(url):
    """Fetch one company-detail page and parse its field table.

    The page holds company attributes in a ``.codl`` definition list:
    ``<dt>`` elements are the field labels, ``<dd>`` elements the values.

    :param url: full URL of a company-detail page.
    :return: dict mapping label -> value, or None when the page 404s
        (an empty dict is returned for a live page with no fields).
    """
    info = {}
    res = requests.get(url, headers=headers)
    # NOTE(review): only 404 is treated as "missing"; other error pages
    # (403/500) still get parsed and typically yield an empty dict.
    if res.status_code != 404:
        soup = BeautifulSoup(res.text, 'lxml')
        labels = [dt.text for dt in soup.select('.codl dt')]
        values = [dd.text for dd in soup.select('.codl dd')]
        for label, value in zip(labels, values):
            # BUG FIX: the original used label.strip('') which strips
            # nothing; strip() actually removes surrounding whitespace.
            info[label.strip()] = value
        return info

# Lazy stream of every company-detail URL (numeric ids 2 .. 160997).
urls = ('https://m.11467.com/jinan/co/%d.htm' % page_id for page_id in range(2, 160998))

def get_all_data(url):
    """Parse one company page and persist the result to MongoDB.

    Best-effort worker for the thread pool: any exception is printed
    together with the offending URL and swallowed so the pool keeps going.

    :param url: company-detail page URL passed in by the pool.
    """
    try:
        result = jiexi(url)
        if result:  # skip 404 pages (None) and pages with no fields ({})
            # insert() was removed from pymongo; insert_one() is the
            # modern single-document equivalent.
            sheet_table.insert_one(result)
            # count_documents({}) replaces the deprecated find().count().
            # NOTE(review): this runs a full collection count per insert —
            # expensive over 160K documents; a local counter would be cheaper.
            print('获取了 ' + str(sheet_table.count_documents({})) + '条数据')
    except Exception as e:
        # Log and continue: one bad page must not kill the whole crawl.
        print(e, url)
if __name__ == "__main__":
    # Four worker threads (thread-based Pool) — plenty for this I/O-bound
    # scrape; map() consumes the urls generator and blocks until done.
    worker_pool = ThreadPool(4)
    results = worker_pool.map(get_all_data, urls)
    worker_pool.close()
    worker_pool.join()

# (blog-scrape residue) posted @ 2017-06-16 16:13  Erick-LONG  阅读(706)  评论(0编辑  收藏  举报