Crawling All Douyu Streamer Info with Beautiful Soup and Multithreading (Improved Version)

I spent some time improving the code. Here it is:
```python
import requests
from bs4 import BeautifulSoup
import pymongo
import time, datetime


class douyu_host_info():
    def __init__(self):
        self.url_host = 'https://www.douyu.com'
        self.date_time = datetime.datetime.now().strftime('%Y%m%d_%H%M')
        self.url_list = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        }
        # Category URL suffixes. These are fixed, so they were captured on the
        # first crawl and hard-coded here instead of being re-scraped each run.
        self.categorys_list = [
            '/g_LOL', '/g_blzy', '/g_DOTA2', '/g_qipai', '/g_DNF', '/g_CF',
            '/g_mszb', '/g_CSGO', '/g_How', '/g_DOTA', '/g_WOW', '/g_nsh',
            '/g_Overwatch', '/g_wxy', '/directory/category/PCgame', '/g_jdqs',
            '/g_TVgame', '/g_gwlrsj', '/g_FTG', '/g_xyqx', '/g_NBA2K', '/g_BF',
            '/g_DG', '/directory/category/djry', '/g_wzry', '/g_jdqscjzc',
            '/g_jdqsqjcj', '/g_qqfcsy', '/g_hyrz', '/g_xyzx', '/g_HLMJ',
            '/g_phone', '/g_LRSZQ', '/g_mhmnz', '/g_CFSY',
            '/directory/category/syxx', '/g_yz', '/g_xingyu', '/g_ecy',
            '/g_yqk', '/g_HW', '/g_ms', '/g_music', '/g_ip',
            '/directory/category/yl', '/g_smkj', '/g_yj', '/g_Finance',
            '/g_kepu', '/g_js', '/g_car', '/g_jlp', '/g_tkx',
            '/directory/sport/cate', '/g_FM233', '/g_yydt', '/g_lianmaihudong',
            '/g_qinggan', '/directory/category/voice', '/g_znl'
        ]

    def Mongodb_set(self, sheet_name, r_data):
        # Write one record into the given collection of the local 'douyu' database
        client = pymongo.MongoClient('localhost', 27017)
        douyu = client['douyu']
        sheet = douyu[sheet_name]
        print(r_data)
        sheet.insert_one(r_data)

    def get_url_list(self):
        # Build the full category URLs and archive them in MongoDB
        for category in self.categorys_list:
            category_url = self.url_host + category
            self.url_list.append(category_url)
            self.Mongodb_set(sheet_name='url_list', r_data={'url': category_url})
        return self.url_list

    def get_host_info(self, url):
        time.sleep(0.2)  # throttle a little to be polite to the server
        print('Now start open {}'.format(url))
        wb_data = None
        for i in range(3):
            try:
                wb_data = requests.get(url, headers=self.headers)
                break
            except requests.RequestException:
                print('network error! will retry up to 3 times')
        if wb_data is None:  # all three attempts failed; skip this category
            return
        soup = BeautifulSoup(wb_data.text, 'lxml')
        print('start to analyze url')
        try:
            category = soup.select('h1')[0].get_text()
        except IndexError:
            category = '未定義類別'  # no <h1> found: category undefined
        names = soup.select('.ellipsis.fl')
        nums = soup.select('.dy-num.fr')
        titles = soup.select('.mes h3')
        hrefs = soup.select('#live-list-contentbox li a')
        for name, num, href, title in zip(names, nums, hrefs, titles):
            data = {
                '類別': category,                                      # category
                '主播': name.get_text(),                               # streamer name
                '标题': title.get_text().split('\n')[-1].strip(),      # room title
                '链接': self.url_host + href.get('href'),              # room link
                # popularity, normalized to units of 万 (10,000)
                '人氣指數': float(num.get_text()[:-1]) if '万' in num.get_text()
                            else float(num.get_text()) / 10000,
                '當前時間': self.date_time                             # crawl timestamp
            }
            if data['人氣指數'] > 2:  # only keep rooms above 20,000 viewers
                self.Mongodb_set(sheet_name='host_info_{}'.format(self.date_time),
                                 r_data=data)

    def db_check(self, sheetname, key_word):
        # Query a collection and print every matching record
        client = pymongo.MongoClient('localhost', 27017)
        douyu = client['douyu']
        sheet = douyu[sheetname]
        for data in sheet.find(key_word):
            print(data)
```
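One detail worth calling out: the 人氣指數 (popularity) field is normalized to units of 万 (10,000). Strings Douyu already displays with a 万 suffix keep their numeric part, while plain viewer counts are divided by 10,000, so the `> 2` filter keeps only rooms above 20,000 viewers. A minimal illustration of that conversion, with a hypothetical helper name:

```python
def to_wan(text):
    # Mirrors the conditional in get_host_info: '万'-suffixed strings keep
    # their numeric part; raw counts are divided by 10,000.
    return float(text[:-1]) if '万' in text else float(text) / 10000

print(to_wan('2.5万'))  # 2.5  (25,000 viewers)
print(to_wan('8600'))   # 0.86 (8,600 viewers)
```

The runner script drives the crawl with a worker pool and then spot-checks the stored records: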
```python
from multiprocessing import Pool
from douyu_host_2 import douyu_host_info

douyu = douyu_host_info()


def data_check():
    # Example query filters:
    # {u'當前時間': '20180901 10:58', u'人氣指數': {'$gte': 30}}
    # {'主播': '
    # sheetname = input('Which sheet do you want to check')
    sheetname = 'host_info_20180901_1530'
    # key_word = input('Do you want to check with?')
    key_word = {'類別': 'DOTA2'}
    douyu.db_check(sheetname=sheetname, key_word=key_word)


def w_to_db():
    # Crawl every category page in parallel and write the results to MongoDB
    pool = Pool()
    url_list = douyu.get_url_list()
    pool.map(douyu.get_host_info, url_list)


if __name__ == '__main__':
    w_to_db()
    data_check()
```
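Note that despite "multithreaded" in the title, `Pool()` here spawns worker processes. The standard library also ships a thread-backed pool with the identical `map` API under `multiprocessing.dummy`, which skips process start-up cost for I/O-bound work like this. A minimal sketch, assuming the same `douyu_host_2` module:

```python
from multiprocessing.dummy import Pool  # thread-backed pool, same API as multiprocessing.Pool
from douyu_host_2 import douyu_host_info

douyu = douyu_host_info()

if __name__ == '__main__':
    pool = Pool(8)  # 8 worker threads; tune to your bandwidth
    pool.map(douyu.get_host_info, douyu.get_url_list())
    pool.close()
    pool.join()
```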
This crawler does not handle pagination; it only scrapes the first page of each category. To page through every streamer, refer to this script (a rough sketch of the general shape follows the link):
https://www.cnblogs.com/lkd8477604/p/9848958.html
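For orientation only, here is a hedged sketch of what such a pagination loop generally looks like. The `?page=N` query pattern is an assumption for illustration, not Douyu's actual paging scheme; the linked script has the working version.

```python
import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0'}

def iter_room_links(category_url, max_pages=50):
    # Hypothetical pagination: '?page=N' is an assumed URL pattern; stop as
    # soon as a page yields no room links (i.e. we ran past the last page).
    for page in range(1, max_pages + 1):
        resp = requests.get('{}?page={}'.format(category_url, page), headers=HEADERS)
        soup = BeautifulSoup(resp.text, 'lxml')
        rooms = soup.select('#live-list-contentbox li a')
        if not rooms:
            break
        for room in rooms:
            yield room.get('href')
```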