h5端微博下载

#!/usr/bin/env python  
# encoding: utf-8
from requests_html import HTMLSession
from json import JSONDecodeError
from glom import glom
from retrying import retry
import datetime
import time
from lxml import html
from dbs.db import SaveData
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import requests
import time
import traceback
from decorators.decorators import decorator
from logger.log import get_logger
from fake_useragent import UserAgent
from config.config import *
# 禁用安全请求警告
class WeiboCN():
    def __init__(self, i, _loop=None):
        """Set up the crawler state for a single Weibo account.

        Args:
            i: Mapping describing the account; the keys ``"user_id"`` and
                ``"client_id"`` are read with ``.get``.
            _loop: Accepted but not used by this constructor.
        """
        self.uid = i.get("user_id")
        self.client_id = i.get("client_id")
        # Format the uid instead of concatenating it so that a non-string
        # user id (for example an int) does not raise TypeError.
        self.get_container_url = f"https://m.weibo.cn/api/container/getIndex?type=uid&value={self.uid}"
        # Template with two positional slots: container id, then page number.
        self.page_url = "https://m.weibo.cn/api/container/getIndex?containerid={}&page={}"
        # use_cache_server=False: the server cache can be disabled.
        self.headers = {"User-Agent": UserAgent(use_cache_server=False).chrome}
        self.session = HTMLSession()
        # Column-oriented buffers: each key maps to a list of values, one
        # entry per collected record.
        self.wbinfodatas = {}
        self.wb_contentdatas = {}
        self.first_comment_dict = {}
        self.second_comment_dict = {}
        # Requests in this class are sent with verify=False, so silence the
        # resulting InsecureRequestWarning.
        requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
        # All records of one account share the same log date.
        self.LOG_DATE = datetime.datetime.now()
        # Look-back window in days, consumed by wb_send_times.
        self.need_days = 31
        self.proxy = chrome_args.get("proxy")
        # NOTE(review): this mapping is built here but none of the request
        # calls visible in this class pass it on - confirm whether the proxy
        # is meant to be applied.
        self.proxies = {"http": self.proxy, "https": self.proxy}
    @property
    @retry(stop_max_attempt_number =3,stop_max_delay =10000,retry_on_exception=lambda x: isinstance(x, JSONDecodeError))
    def get_containerid(self):
        """Return the container id of the account's ``weibo`` tab.

        Requests ``self.get_container_url`` and scans ``data.tabsInfo.tabs``
        for entries whose ``tab_type`` is ``'weibo'``. If several entries
        match, the last one wins.

        The retry decorator only retries when the body cannot be decoded as
        JSON (``JSONDecodeError``); other errors propagate immediately.

        Returns:
            The ``containerid`` value of the matching tab.

        Raises:
            ValueError: If no ``weibo`` tab with a container id is present.
        """
        payload = self.session.get(
            self.get_container_url, headers=self.headers, timeout=60, verify=False
        ).json()

        containerid = None
        for tab in glom(payload, 'data.tabsInfo.tabs'):
            if tab.get('tab_type') == 'weibo':
                containerid = tab.get('containerid')
        if containerid is None:
            # Without this check a missing tab surfaced as an opaque
            # UnboundLocalError on the return statement.
            raise ValueError(f"no 'weibo' tab with a containerid found for uid {self.uid}")
        return containerid

    @retry(stop_max_attempt_number=3, stop_max_delay=600000,wait_fixed=60000)
    def get(self,url):
        """Fetch ``url`` with the shared session and return the response.

        Any failure, including a status code other than 200, is raised so
        that the ``@retry`` decorator can apply its policy (up to 3 attempts
        with a fixed wait between them).

        Args:
            url: Address to request.

        Returns:
            The response object; its status code is always 200.

        Raises:
            requests.exceptions.HTTPError: If the status code is not 200.
            Exception: Whatever the underlying request raises.
        """
        try:
            response = self.session.get(url, headers=self.headers, timeout=60, verify=False)
            if response.status_code != 200:
                # Previously a non-200 answer fell through and returned None,
                # which skipped the retry and made callers fail later on
                # ``None.json()``.
                raise requests.exceptions.HTTPError(
                    f"unexpected HTTP status {response.status_code} for {url}",
                    response=response,
                )
        except Exception:
            print("出现错误了休眠1分钟")
            raise
        return response
    @decorator
    def get_feed_info(self,result):
        """Parse one feed page, store its posts and report whether to stop.

        For every card that carries an ``mblog`` entry this records the
        counters in ``self.wbinfodatas`` and the content row in
        ``self.wb_contentdatas``, then fetches the post's comments. At the
        end of the page both buffers are passed to
        ``SaveData().save_content_data`` and cleared.

        Args:
            result: Response object whose ``.json()`` yields the feed page.

        Returns:
            bool: True when paging should stop, either because the page has
            no cards or because the last recorded post is older than the
            look-back window checked by ``wb_send_times``. False otherwise.
        """
        stop_flag = False
        payload = result.json()
        Updatedate = datetime.datetime.now()
        cards = glom(payload, 'data.cards', default=None)
        if not cards:
            # Nothing on this page: report "stop" instead of failing on an
            # empty buffer below.
            return True
        for card in cards:
            mblog = glom(card, 'mblog', default=None)
            if not mblog:
                # Cards without an mblog entry carry no post to record.
                continue
            url = glom(card, 'scheme')
            infostr = glom(mblog, 'text')
            mid = glom(mblog, 'mid')
            # Start from the card's own timestamp so that a failed detail
            # lookup can never reuse the date of the previous card.
            create_date = glom(mblog, 'created_at', default=None)
            longtextwithemojiurl = self.getlongtext(mid, self.session)
            if longtextwithemojiurl:
                create_date, infostr = longtextwithemojiurl
            try:
                created_time = datetime.datetime.strptime(create_date, "%a %b %d %H:%M:%S +0800 %Y")
            except (TypeError, ValueError):
                get_logger().error(
                    f"get_feed_info skipped weibo {mid}: unusable created_at {create_date!r}")
                continue
            source = glom(mblog, 'source')
            reposts_count = glom(mblog, 'reposts_count')
            comments_count = glom(mblog, 'comments_count')
            attitudes_count = glom(mblog, 'attitudes_count')
            # Extract the plain text before touching the buffers so that a
            # failure cannot leave a half-written row behind.
            text_info = ""
            try:
                text_info = self.getinfo_emoji(infostr)
            except Exception:
                get_logger().error(f"get_feed_info is error,here is detail {traceback.format_exc()}")

            for key, value in (
                ("uid", self.uid),
                ("read_num", "0"),
                ("share_num", str(reposts_count)),
                ("comment_num", str(comments_count)),
                ("like_num", str(attitudes_count)),
                ("wb_id", mid),
            ):
                self.wbinfodatas.setdefault(key, []).append(value)

            # Rebuild the data for insertion into sns_object; every key maps
            # to a list of values.
            for key, value in (
                ("MEDIA_TYPE", "WB"),
                ("OBJ_TYPE", "FEED"),
                ("BIZ_ID", "{}_{}".format(self.uid, mid)),
                ("STATUS", "0"),
                ("SOURCE", source),
                ("BIZ_PARENT_ID", self.uid),
                ("BIZ_ID_PATH", "{}/{}_{}".format(self.uid, self.uid, mid)),
                ("BIZ_CREATE_TIME", created_time),
                ("URL", url),
                ("FORM_ID", ""),
                ("FORM_NAME", ""),
                ("CONTENT", infostr),
                ("CONTENT_TEXT", text_info),
                ("CLIENT_ID", self.client_id),
                ("UPD_DATE", Updatedate),
            ):
                self.wb_contentdatas.setdefault(key, []).append(value)

            # Once a post is recorded, fetch its comments.
            self.get_first_comment(mid)

        recorded_times = self.wb_contentdatas.get("BIZ_CREATE_TIME")
        if not recorded_times:
            # Every card on this page was skipped; there is nothing to save.
            return stop_flag
        if self.wb_send_times(recorded_times[-1]):
            stop_flag = True
        SaveData().save_content_data(self.wbinfodatas, self.wb_contentdatas, self.LOG_DATE)
        self.wbinfodatas.clear()
        self.wb_contentdatas.clear()
        return stop_flag

    @decorator
    def wb_send_times(self,last_date):
        """Tell whether ``last_date`` falls before the look-back window.

        The window begins ``self.need_days`` days before the current local
        time; posts older than that are not collected.

        Args:
            last_date: ``datetime`` to compare against the window start.

        Returns:
            bool: True if ``last_date`` is strictly earlier than the window
            start, False otherwise.
        """
        window = datetime.timedelta(days=self.need_days)
        oldest_allowed = datetime.datetime.fromtimestamp(time.time()) - window
        return oldest_allowed > last_date

    @decorator
    def getinfo_emoji(self, htmlstr):
        """Flatten an HTML fragment into plain text.

        Collects every text node and every ``alt`` attribute value in
        document order, strips newlines, spaces and zero-width spaces from
        each piece, and concatenates the result.

        Args:
            htmlstr: HTML markup to clean.

        Returns:
            str: The concatenated, whitespace-free text.
        """
        tree = html.fromstring(htmlstr)
        pieces = []
        for fragment in tree.xpath(".//text()|.//@alt"):
            cleaned = fragment.replace('\n', '')
            cleaned = cleaned.replace(" ", "")
            cleaned = cleaned.replace("\u200b", "")
            pieces.append(cleaned)
        return ''.join(pieces)

    @decorator
    def run(self):
        """Walk the account's feed page by page until told to stop.

        Resolves the container id once, then requests consecutive pages
        starting at 1. Each page is handed to ``get_feed_info``, which stores
        the content and reports whether the look-back limit was reached. A
        half-second pause follows every page.
        """
        container_id = self.get_containerid
        page_number = 1
        while True:
            page_response = self.get(self.page_url.format(container_id, page_number))
            # Stores the page content; a truthy result ends the crawl.
            finished = self.get_feed_info(page_response)
            page_number += 1
            time.sleep(0.5)
            if finished:
                break
        print(self.uid,"取完了")

    @decorator
    def getlongtext(self, mid: str, req):
        """Fetch the full text of a post, with its pictures appended.

        Requests ``https://m.weibo.cn/statuses/show?id=<mid>``. When the
        answer's ``ok`` field equals 1, the post text is returned with one
        ``<img src="..."/>`` tag appended per picture that has a ``url``.

        Args:
            mid: Id of the post.
            req: Not used by this method; kept for interface compatibility.

        Returns:
            tuple | None: ``(created_at, text)`` on success, or None when the
            API does not report ``ok == 1``.
        """
        getlongtexturl = f"https://m.weibo.cn/statuses/show?id={mid}"
        resp = self.get(getlongtexturl).json()
        if resp.get("ok") != 1:
            # Return None so callers can test the result for truthiness;
            # previously this path failed on an unbound ``created_at``.
            get_logger().error(f"get long text is error for mid {mid},here are detail:{resp}")
            return None

        data = resp["data"]
        created_at = data["created_at"]
        allinfo = data["text"]
        # Posts without pictures have no usable "pics" entry.
        pics = data.get("pics") or []
        imgurl = [
            f'<img src="{pic["url"]}"/>'
            for pic in pics
            if isinstance(pic, dict) and pic.get("url")
        ]
        if imgurl:
            allinfo = allinfo + ''.join(imgurl)
        return created_at, allinfo

    @decorator
    def get_first_comment(self,mid):
        """Fetch and store all first-level comments of a post.

        Pages through the ``hotflow`` endpoint: the first request carries no
        ``max_id``; each later request uses the ``data.max_id`` returned by
        the previous page. Paging ends when a response is not ok or
        ``max_id`` is falsy. Every page is handed to ``save_first_comment``.

        Errors are logged and not propagated, so a failure here does not
        interrupt the processing of the feed.

        Args:
            mid: Id of the post whose comments are fetched.
        """
        url = f"https://m.weibo.cn/comments/hotflow?id={mid}&mid={mid}&max_id_type=0"
        try:
            while True:
                payload = self.get(url).json()
                if not glom(payload, 'ok'):
                    break
                max_id = glom(payload, 'data.max_id')
                self.save_first_comment(payload, mid, url)
                if not max_id:
                    break
                # Continue with the next page of comments.
                url = f"https://m.weibo.cn/comments/hotflow?id={mid}&mid={mid}&max_id={max_id}&max_id_type=0"
                time.sleep(0.05)
        except Exception:
            # ``Exception`` rather than a bare except, so KeyboardInterrupt
            # and SystemExit are no longer swallowed.
            get_logger().error(f"get_first_comment is error,here is detail {traceback.format_exc()}")

    @decorator
    def save_first_comment(self,json,mid,url):
        """Record one page of first-level comments and fetch their replies.

        Each item under ``data.data`` becomes one row in
        ``self.first_comment_dict``. Replies are then collected: when an item
        reports fewer than 3 replies they are read from its own ``comments``
        field, otherwise they are fetched through ``get_second_comment``.
        After the page is processed the buffer is passed to
        ``SaveData().save_object_data`` and cleared.

        Args:
            json: Decoded response of the first-level comment endpoint.
            mid: Id of the post the comments belong to.
            url: Address the page was fetched from; stored with every row.
        """
        Updatedate = datetime.datetime.now()
        for item in glom(json, 'data.data'):
            create_date = glom(item, 'created_at')
            create_time = datetime.datetime.strptime(create_date, "%a %b %d %H:%M:%S +0800 %Y")
            first_cid = glom(item, 'id')
            text = glom(item, 'text')
            user_name = glom(item, 'user.screen_name')
            user_id = glom(item, 'user.id')
            total_number = glom(item, 'total_number')
            # Fall back to the raw markup when plain-text extraction fails,
            # but log the failure instead of hiding it.
            content_text = text
            try:
                content_text = self.getinfo_emoji(text)
            except Exception:
                get_logger().error(f"save_first_comment is error,here is detail {traceback.format_exc()}")

            for key, value in (
                ("MEDIA_TYPE", "WB"),
                ("OBJ_TYPE", "COMMENT"),
                ("BIZ_ID", "{}_{}".format(mid, first_cid)),
                ("STATUS", "0"),
                ("SOURCE", ""),
                ("BIZ_PARENT_ID", "{}_{}".format(self.uid, mid)),
                ("BIZ_ID_PATH", "{}/{}_{}/{}_{}".format(self.uid, self.uid, mid, mid, first_cid)),
                ("BIZ_CREATE_TIME", create_time),
                ("FORM_ID", user_id),
                ("FORM_NAME", user_name),
                ("URL", url),
                ("CONTENT", text),
                ("CONTENT_TEXT", content_text),
                ("CLIENT_ID", self.client_id),
                ("UPD_DATE", Updatedate),
            ):
                self.first_comment_dict.setdefault(key, []).append(value)

            # Collect second-level comments. With fewer than 3 replies they
            # are taken directly from the item's "comments" field.
            if total_number:
                if total_number < 3:
                    self.save_second_comment(item, mid, first_cid, url, key='comments')
                else:
                    self.get_second_comment(mid, first_cid)

        if self.first_comment_dict:
            SaveData().save_object_data(self.first_comment_dict)
            self.first_comment_dict.clear()

    @decorator
    def get_second_comment(self,mid,first_cid):
        """Fetch and store all second-level comments of one comment.

        Pages through the ``hotFlowChild`` endpoint, starting with
        ``max_id=0`` and continuing with the ``max_id`` returned by each
        page. Paging ends when a response is not ok or ``max_id`` is falsy.
        Every page is handed to ``save_second_comment``.

        Args:
            mid: Id of the post the comment thread belongs to.
            first_cid: Id of the first-level comment whose replies are
                fetched.
        """
        url = f"https://m.weibo.cn/comments/hotFlowChild?cid={first_cid}&max_id=0&max_id_type=0"
        while True:
            payload = self.get(url).json()
            if not glom(payload, 'ok'):
                # Stop on a failed page. The earlier loop had no exit here
                # and kept requesting the same address.
                break
            max_id = glom(payload, 'max_id')
            self.save_second_comment(payload, mid, first_cid, url)
            if not max_id:
                break
            # Continue with the next page of replies.
            url = f"https://m.weibo.cn/comments/hotFlowChild?cid={first_cid}&max_id={max_id}&max_id_type=0"
            time.sleep(0.05)

    @decorator
    def save_second_comment(self, json, mid,first_cid, url,key='data'):
        """Record a batch of second-level comments.

        Each item found under ``key`` becomes one row in
        ``self.second_comment_dict``; the buffer is then passed to
        ``SaveData().save_object_data`` and cleared.

        Args:
            json: Mapping that holds the replies, either a decoded reply-page
                response or a first-level comment item.
            mid: Id of the post the thread belongs to.
            first_cid: Id of the parent first-level comment.
            url: Address the data was fetched from; stored with every row.
            key: glom path to the list of replies inside ``json``.

        Returns:
            None. Returns early when ``key`` is absent or holds no replies.
        """
        Updatedate = datetime.datetime.now()
        # Look the replies up once; a missing key is treated like an empty
        # list instead of raising.
        replies = glom(json, key, default=None)
        if not replies:
            return None
        for item in replies:
            create_date = glom(item, 'created_at')
            create_time = datetime.datetime.strptime(create_date, "%a %b %d %H:%M:%S +0800 %Y")
            second_cid = glom(item, 'id')
            text = glom(item, 'text')
            user_name = glom(item, 'user.screen_name')
            user_id = glom(item, 'user.id')
            # Fall back to the raw markup when plain-text extraction fails,
            # but log the failure instead of hiding it.
            content_text = text
            try:
                content_text = self.getinfo_emoji(text)
            except Exception:
                get_logger().error(f"save_second_comment is error,here is detail {traceback.format_exc()}")

            for column, value in (
                ("MEDIA_TYPE", "WB"),
                ("OBJ_TYPE", "COMMENT"),
                ("BIZ_ID", "{}_{}".format(first_cid, second_cid)),
                ("STATUS", "0"),
                ("SOURCE", ""),
                ("BIZ_PARENT_ID", "{}_{}".format(mid, first_cid)),
                ("BIZ_ID_PATH", "{}/{}_{}/{}_{}/{}_{}".format(
                    self.uid, self.uid, mid, mid, first_cid, first_cid, second_cid)),
                ("BIZ_CREATE_TIME", create_time),
                ("FORM_ID", user_id),
                ("FORM_NAME", user_name),
                ("CONTENT", text),
                ("CONTENT_TEXT", content_text),
                ("URL", url),
                ("CLIENT_ID", self.client_id),
                ("UPD_DATE", Updatedate),
            ):
                self.second_comment_dict.setdefault(column, []).append(value)
        if self.second_comment_dict:
            SaveData().save_object_data(self.second_comment_dict)
            self.second_comment_dict.clear()
        return None

posted @ 2018-07-23 16:33  公众号python学习开发  阅读(781)  评论(0编辑  收藏  举报