Crawling All Douyu Streamer Info with Beautiful Soup and Multiprocessing (Improved Version)
I spent a little time improving the code. Here it is:
```python
import requests
from bs4 import BeautifulSoup
import pymongo
import lxml
import time, datetime


class douyu_host_info():
    def __init__(self):
        self.url_host = 'https://www.douyu.com'
        self.date_time = datetime.datetime.now().strftime('%Y%m%d_%H%M')
        self.url_list = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        }
        # List of category URL suffixes; they never change, so they were
        # grabbed once on the first crawl and saved here directly.
        self.categorys_list = [
            '/g_LOL', '/g_blzy', '/g_DOTA2', '/g_qipai', '/g_DNF', '/g_CF',
            '/g_mszb', '/g_CSGO', '/g_How', '/g_DOTA', '/g_WOW', '/g_nsh',
            '/g_Overwatch', '/g_wxy', '/directory/category/PCgame', '/g_jdqs',
            '/g_TVgame', '/g_gwlrsj', '/g_FTG', '/g_xyqx', '/g_NBA2K', '/g_BF',
            '/g_DG', '/directory/category/djry', '/g_wzry', '/g_jdqscjzc',
            '/g_jdqsqjcj', '/g_qqfcsy', '/g_hyrz', '/g_xyzx', '/g_HLMJ',
            '/g_phone', '/g_LRSZQ', '/g_mhmnz', '/g_CFSY',
            '/directory/category/syxx', '/g_yz', '/g_xingyu', '/g_ecy',
            '/g_yqk', '/g_HW', '/g_ms', '/g_music', '/g_ip',
            '/directory/category/yl', '/g_smkj', '/g_yj', '/g_Finance',
            '/g_kepu', '/g_js', '/g_car', '/g_jlp', '/g_tkx',
            '/directory/sport/cate', '/g_FM233', '/g_yydt',
            '/g_lianmaihudong', '/g_qinggan', '/directory/category/voice',
            '/g_znl'
        ]

    def Mongodb_set(self, sheet_name, r_data):
        # Insert one record into the named collection of the 'douyu' database.
        client = pymongo.MongoClient('localhost', 27017)
        douyu = client['douyu']
        sheet = douyu[sheet_name]
        print(r_data)
        sheet.insert_one(r_data)

    def get_url_list(self):
        # Build the full category URLs and store each one in MongoDB as well.
        for category in self.categorys_list:
            category_url = self.url_host + category
            self.url_list.append(category_url)
            self.Mongodb_set(sheet_name='url_list', r_data={'url': category_url})
        return self.url_list

    def get_host_info(self, url):
        time.sleep(0.2)
        print('Now start open {}'.format(url))
        wb_data = None
        for i in range(3):
            try:
                wb_data = requests.get(url, headers=self.headers)
                break
            except requests.RequestException:
                print('network error! will retry up to 3 times')
        if wb_data is None:
            return  # all three attempts failed; skip this category
        soup = BeautifulSoup(wb_data.text, 'lxml')
        print('start analyzing url')
        try:
            category = soup.select('h1')[0].get_text()
        except IndexError:
            category = '未定義類別'  # fallback when the page has no <h1>
        names = soup.select('.ellipsis.fl')
        nums = soup.select('.dy-num.fr')
        titles = soup.select('.mes h3')
        hrefs = soup.select('#live-list-contentbox li a')
        for name, num, href, title in zip(names, nums, hrefs, titles):
            data = {
                '類別': category,
                '主播': name.get_text(),
                '标题': title.get_text().split('\n')[-1].strip(),
                '链接': self.url_host + href.get('href'),
                # Normalize popularity to units of 万 (10,000):
                # '12.3万' -> 12.3, plain '9876' -> 0.9876
                '人氣指數': float(num.get_text()[:-1]) if '万' in num.get_text() else float(num.get_text()) / 10000,
                '當前時間': self.date_time
            }
            # Only keep streamers with more than 20,000 viewers.
            if data['人氣指數'] > 2:
                self.Mongodb_set(sheet_name='host_info_{}'.format(self.date_time), r_data=data)

    def db_check(self, sheetname, key_word):
        # Query the given collection and print every matching record.
        client = pymongo.MongoClient('localhost', 27017)
        douyu = client['douyu']
        sheet = douyu[sheetname]
        for data in sheet.find(key_word):
            print(data)
```
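One detail worth spelling out is the popularity normalization: Douyu renders viewer counts either as '12.3万' or as a plain number, and the code converts both into units of 万 (ten thousand), so the `> 2` filter keeps streamers with more than 20,000 viewers. A minimal standalone sketch of that conversion (the sample values are made up):

```python
def to_wan(text):
    # Convert Douyu's display count to units of 万 (10,000):
    # '12.3万' -> 12.3 ; '9876' -> 0.9876
    if '万' in text:
        return float(text[:-1])  # strip the trailing '万'
    return float(text) / 10000

# Hypothetical sample values, for illustration only:
for sample in ['12.3万', '9876']:
    print(sample, '->', to_wan(sample))
```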
```python
from multiprocessing import Pool
from douyu_host_2 import douyu_host_info

douyu = douyu_host_info()


def data_check():
    # Example query filters:
    # {u'當前時間': '20180901 10:58', u'人氣指數': {'$gte': 30}}
    # {'主播':'
    # sheetname = input('Which sheet do you want to check')
    sheetname = 'host_info_20180901_1530'
    # key_word = input('Do you want to check with?')
    key_word = {'類別': 'DOTA2'}
    douyu.db_check(sheetname=sheetname, key_word=key_word)


def w_to_db():
    # Fan the category URLs out to a pool of worker processes.
    pool = Pool()
    url_list = douyu.get_url_list()
    pool.map(douyu.get_host_info, url_list)


if __name__ == '__main__':
    w_to_db()
    data_check()
```
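A note on the `Pool` usage: `Pool()` defaults to `os.cpu_count()` worker processes, and `pool.map` blocks until every category page has been handled, so `data_check()` only runs after the crawl finishes. If you prefer to cap the worker count and have the pool cleaned up automatically, a context-manager variant of `w_to_db` could look like this (a sketch; the worker count of 4 is an arbitrary choice):

```python
from multiprocessing import Pool

def w_to_db():
    url_list = douyu.get_url_list()
    # 4 workers is an assumed value; Pool() defaults to os.cpu_count().
    with Pool(processes=4) as pool:
        pool.map(douyu.get_host_info, url_list)  # blocks until all URLs are done
```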
This crawler does not handle pagination; it only scrapes the first page of each category. For a version that pages through all streamers, refer to this script:
https://www.cnblogs.com/lkd8477604/p/9848958.html
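For a rough idea of the shape such a page loop takes, here is a hypothetical wrapper around `get_host_info`; the `?page=N` query parameter is purely an assumption for illustration, so check the linked script for the selectors and URLs Douyu actually uses:

```python
def get_host_info_all_pages(self, url, max_pages=10):
    # Hypothetical pagination loop: assumes each category page is
    # addressed as url + '?page=N', which may not match Douyu's real
    # URL scheme. See the linked script for a working version.
    for page in range(1, max_pages + 1):
        self.get_host_info('{}?page={}'.format(url, page))
```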