Scraping All Douyu Streamer Info with Beautiful Soup and a Process Pool (Improved Version)

I spent a little time improving the code. Here it is:

import requests
from bs4 import BeautifulSoup
import pymongo
import lxml  # not called directly; BeautifulSoup's 'lxml' parser needs this package installed
import time, datetime

class douyu_host_info():
    def __init__(self):
        self.url_host = 'https://www.douyu.com'
        self.date_time = datetime.datetime.now().strftime('%Y%m%d_%H%M')
        self.url_list = []
        self.headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        }
        # List of category URL suffixes. Since it rarely changes, it was scraped once
        # and hard-coded here (a sketch for rebuilding it dynamically appears after the runner script below).
        self.categorys_list =[
             '/g_LOL', '/g_blzy', '/g_DOTA2', '/g_qipai', '/g_DNF', '/g_CF', '/g_mszb', '/g_CSGO', '/g_How', '/g_DOTA',
             '/g_WOW', '/g_nsh', '/g_Overwatch', '/g_wxy', '/directory/category/PCgame', '/g_jdqs', '/g_TVgame',
             '/g_gwlrsj', '/g_FTG', '/g_xyqx', '/g_NBA2K', '/g_BF', '/g_DG', '/directory/category/djry', '/g_wzry',
             '/g_jdqscjzc', '/g_jdqsqjcj', '/g_qqfcsy', '/g_hyrz', '/g_xyzx', '/g_HLMJ', '/g_phone', '/g_LRSZQ',
             '/g_mhmnz', '/g_CFSY', '/directory/category/syxx', '/g_yz', '/g_xingyu', '/g_ecy', '/g_yqk', '/g_HW',
             '/g_ms', '/g_music', '/g_ip', '/directory/category/yl', '/g_smkj', '/g_yj', '/g_Finance', '/g_kepu',
             '/g_js', '/g_car', '/g_jlp', '/g_tkx', '/directory/sport/cate', '/g_FM233', '/g_yydt', '/g_lianmaihudong',
             '/g_qinggan', '/directory/category/voice', '/g_znl'
        ]

    def Mongodb_set(self, sheet_name, r_data):
        # A new client is opened per call, so each pool worker process gets its own
        # connection (pymongo clients should not be shared across forked processes).
        client = pymongo.MongoClient('localhost', 27017)
        douyu = client['douyu']
        collection = douyu[sheet_name]
        print(r_data)
        collection.insert_one(r_data)

    def get_url_list(self):
        for category in self.categorys_list:
            category_url = self.url_host + category
            self.url_list.append(category_url)
            self.Mongodb_set(sheet_name='url_list', r_data={'url': category_url})
        return self.url_list

    def get_host_info(self, url):
        time.sleep(0.2)  # small delay so the pool does not hammer the site
        print('Now start to open {}'.format(url))
        wb_data = None
        for i in range(3):
            try:
                wb_data = requests.get(url, headers=self.headers, timeout=10)
                break
            except requests.RequestException:
                print('network error, retry {} of 3'.format(i + 1))
        if wb_data is None:
            print('giving up on {} after 3 failed attempts'.format(url))
            return

        soup = BeautifulSoup(wb_data.text, 'lxml')
        print('start to analyze {}'.format(url))
        try:
            category = soup.select('h1')[0].get_text()
        except IndexError:
            category = '未定義類別'
        names = soup.select('.ellipsis.fl')
        nums = soup.select('.dy-num.fr')
        titles = soup.select('.mes h3')
        hrefs = soup.select('#live-list-contentbox li a')
        for name, num, href, title in zip(names, nums, hrefs, titles):
            num_text = num.get_text()
            data = {
                '類別': category,
                '主播': name.get_text(),
                '标题': title.get_text().split('\n')[-1].strip(),
                '链接': self.url_host + href.get('href'),
                # popularity is normalized to units of 万 (10,000 viewers)
                '人氣指數': float(num_text[:-1]) if '万' in num_text else float(num_text) / 10000,
                '當前時間': self.date_time
            }
            # only store rooms whose popularity exceeds 2万
            if data['人氣指數'] > 2:
                self.Mongodb_set(sheet_name='host_info_{}'.format(self.date_time), r_data=data)

    def db_check(self, sheetname, key_word):
        client = pymongo.MongoClient('localhost', 27017)
        douyu = client['douyu']
        collection = douyu[sheetname]
        for data in collection.find(key_word):
            print(data)


Save the class above as douyu_host_2.py. The short runner below imports it and hands the category URLs to a process pool:

from multiprocessing import Pool
from douyu_host_2 import douyu_host_info

douyu = douyu_host_info()

def data_check():
    # example query filters:
    # {u'當前時間': '20180901 10:58', u'人氣指數': {'$gte': 30}}
    # {'主播': '
    # sheetname = input('Which sheet do you want to check? ')
    sheetname = 'host_info_20180901_1530'
    # key_word = input('What do you want to filter by? ')
    key_word = {'類別': 'DOTA2'}
    douyu.db_check(sheetname=sheetname, key_word=key_word)


def w_to_db():
    pool = Pool()  # one worker process per CPU core by default
    url_list = douyu.get_url_list()
    pool.map(douyu.get_host_info, url_list)  # blocks until every category page has been processed
    pool.close()
    pool.join()


if __name__ == '__main__':

    w_to_db()
    data_check()
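
Since the category suffixes are hard-coded, they will go stale if Douyu adds or removes categories. Below is a minimal sketch of how the list could be rebuilt from the site instead; it assumes category links can be recognized by the '/g_' and '/directory/' href prefixes seen in the hard-coded list, and the directory URL itself (https://www.douyu.com/directory) is an unverified assumption.

# Sketch only: rebuild the category list from the live site instead of hard-coding it.
# Assumptions (not verified against the current page): the directory lives at
# https://www.douyu.com/directory and category links use '/g_' or '/directory/' hrefs.
import requests
from bs4 import BeautifulSoup

def fetch_category_suffixes():
    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get('https://www.douyu.com/directory', headers=headers, timeout=10)
    soup = BeautifulSoup(resp.text, 'lxml')
    suffixes = []
    for a in soup.select('a[href]'):
        href = a.get('href', '')
        if (href.startswith('/g_') or href.startswith('/directory/')) and href not in suffixes:
            suffixes.append(href)
    return suffixes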

This crawler does not handle pagination; it only scrapes the first page of each category. For a version that pages through every streamer, see this script:

https://www.cnblogs.com/lkd8477604/p/9848958.html
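
For a rough picture of what pagination amounts to, here is a minimal sketch that simply feeds successive page URLs of one category into get_host_info(). The '?page=N' query string is an illustrative assumption, not Douyu's actual paging mechanism; the linked script handles the real thing.

# Sketch only: drive get_host_info() across multiple pages of one category.
# The '?page=N' parameter is a placeholder assumption, not Douyu's real URL scheme.
def crawl_category_pages(spider, category_url, max_pages=10):
    for page in range(1, max_pages + 1):
        page_url = '{}?page={}'.format(category_url, page)  # hypothetical URL scheme
        spider.get_host_info(page_url)

# usage:
# crawl_category_pages(douyu_host_info(), 'https://www.douyu.com/g_LOL')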
