# 顺企网 (m.11467.com): scrape ~160K company records and save them to MongoDB
"""Scrape company detail pages from m.11467.com (顺企网) into MongoDB."""
import requests
from bs4 import BeautifulSoup
import pymongo
from multiprocessing.dummy import Pool as ThreadPool

# Mobile User-Agent so the site serves the lightweight m.* pages.
headers = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'}

# Database setup. NOTE(review): "conpany_info" looks like a typo for
# "company_info", but it is kept unchanged so existing data stays reachable.
client = pymongo.MongoClient('localhost', 27017)
conpany_info = client['conpany_info']   # database
sheet_table = conpany_info['sheet_table']  # collection


def jiexi(url):
    """Fetch one company page and parse its ``.codl`` definition list.

    Returns a dict mapping field label (``dt`` text, trailing colon
    stripped) to field value (``dd`` text). Returns an empty dict when the
    response is not HTTP 200 or the page has no ``.codl`` entries.
    """
    info = {}
    # Bug fix: a timeout keeps a worker thread from hanging forever on a
    # stalled connection.
    res = requests.get(url, headers=headers, timeout=10)
    # Bug fix: only parse successful responses; the original accepted every
    # status except 404 (including 500/403 error pages).
    if res.status_code == 200:
        soup = BeautifulSoup(res.text, 'lxml')
        # In the page's <dl class="codl">, <dt> is the field label and <dd>
        # the field value. The original bound these to swapped names
        # ("key"/"value"); the zip pairing itself was correct.
        labels = [dt.text for dt in soup.select('.codl dt')]
        values = [dd.text for dd in soup.select('.codl dd')]
        for label, value in zip(labels, values):
            info[label.strip(':')] = value
    return info


# Lazy generator over every candidate company-page URL (ids 2..160997).
urls = ('https://m.11467.com/jinan/co/{}.htm'.format(i) for i in range(2, 160998))


def get_all_data(url):
    """Scrape one URL and insert the parsed record; log and continue on error."""
    try:
        result = jiexi(url)
        if result:
            # Bug fix: Collection.insert was deprecated in pymongo 3 and
            # removed in pymongo 4; insert_one is the supported API.
            sheet_table.insert_one(result)
            # Bug fix: find().count() was removed in pymongo 4 and scanned
            # the cursor on every insert; estimated_document_count() is the
            # cheap metadata-based replacement.
            print('获取了 ' + str(sheet_table.estimated_document_count()) + '条数据')
    except Exception as e:
        # Best-effort crawl: report the failing URL and keep going.
        print(e, url)


if __name__ == "__main__":
    pool = ThreadPool(4)
    pool.map(get_all_data, urls)
    pool.close()
    pool.join()