# 爬微博 (Weibo crawler)

# -*- coding:utf-8 -*-
import time
import json
import uuid
import traceback
import requests
import datetime
import os
import random
from utils.qiniu_util import QiNiu
from bs4 import BeautifulSoup
# Version string of this crawler module.
__version__ = '1.0.0.0'
# NOTE: the bare string below is evaluated and discarded at import time.
# Because it is not the first statement of the module it does not become
# the module docstring; it only carries author/date metadata.
"""
@brief : 简介
@details: 详细信息
@author : zhphuang
@date : 2018-11-29
"""


class SpiderWeibo(object):
    """Crawler for the hot-feed categories of the mobile Weibo site (m.weibo.cn).

    For every configured category the crawler downloads one page of feed
    cards, hands avatar / image / video URLs to ``QiNiu.fetch`` (storing
    whatever that call returns), collects the first page of comments for each
    post, and dumps the resulting records to a per-day JSON file.
    """

    # Seconds to wait for a single HTTP request before giving up. Without a
    # timeout ``requests`` can block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, window=None):
        """Set up the category list, request credentials and user agents.

        Category ids noted by the original author (their use is not visible
        in this file):
        "Following": 25, "Food": 33, "Reading": 37, "Design": 39,
        "Fashion": 41, "Anime": 43, "Pets": 45, "Variety shows": 10,
        "Movies": 12, "Sports & fitness": 16, "Travel": 30, "Horoscope": 36,
        "Campus": 32, "Art": 40

        :param window: optional object exposing ``write_log_to_text(msg)``;
            when given, ``log`` forwards messages to it.
        """
        self.qiniu = QiNiu()
        # One entry per feed category: display name plus the JSON endpoint.
        self.url = [
            {"sub_type": "热门", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803&openApp=0"},
            {"sub_type": "新鲜事", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_7978_-_ctg1_7978&openApp=0"},
            {"sub_type": "搞笑", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_4388_-_ctg1_4388&openApp=0"},
            {"sub_type": "情感", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_1988_-_ctg1_1988&openApp=0"},
            {"sub_type": "明星", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_4288_-_ctg1_4288&openApp=0"},
            {"sub_type": "社会", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_4188_-_ctg1_4188&openApp=0"},
            {"sub_type": "数码", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_5088_-_ctg1_5088&openApp=0"},
            {"sub_type": "体育", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_1388_-_ctg1_1388&openApp=0"},
            {"sub_type": "汽车", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_5188_-_ctg1_5188&openApp=0"},
            # NOTE(review): this entry reuses the container id of the previous
            # category (5188); it looks like a copy/paste slip. TODO confirm
            # the correct id for this category before relying on its data.
            {"sub_type": "电影", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_5188_-_ctg1_5188&openApp=0"},
            {"sub_type": "游戏", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_4888_-_ctg1_4888&openApp=0"},
            {"sub_type": "美食", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_2688_-_ctg1_2688&openApp=0"},
            {"sub_type": "读书", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_4588_-_ctg1_4588&openApp=0"},
            {"sub_type": "设计", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_5388_-_ctg1_5388&openApp=0"},
            {"sub_type": "时尚", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_4488_-_ctg1_4488&openApp=0"},
            {"sub_type": "动漫", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_2388_-_ctg1_2388&openApp=0"},
            {"sub_type": "萌宠", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_2788_-_ctg1_2788&openApp=0"},
            {"sub_type": "综艺", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_4688_-_ctg1_4688&openApp=0"},
            {"sub_type": "旅游", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_2588_-_ctg1_2588&openApp=0"},
            {"sub_type": "星座", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_1688_-_ctg1_1688&openApp=0"},
            {"sub_type": "校园", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_1488_-_ctg1_1488&openApp=0"},
            {"sub_type": "艺术", "url": "https://m.weibo.cn/api/container/getIndex?containerid=102803_ctg1_5488_-_ctg1_5488&openApp=0"}
        ]
        # NOTE(review): a logged-in session cookie is hard-coded here; it is
        # a credential and will expire. Consider loading it from configuration.
        self.cookies = (
            "_T_WM=6e170ca7910c8a0400cc34f8812ee08a; "
            "ALF=1548469366; SCF=AgozafmBO6saBbFys4DjAOQFlYFxRK6CuW_YwqYMgRKRkIh2Or_PEsE7BKZwQNDSRBoJu9EbU9DiOGNoTiRp7As.; "
            "SUB=_2A25xIEcnDeRhGeNM7FMW8ybEzT6IHXVS62lvrDV6PUNbktAKLRDWkW1NSetNTi5Tt63QYZkk2y4kc76CgKb4VoPf; "
            "SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5MoCM8JudP_s60h7KX4dpy5JpX5KMhUgL.Fo-ES02Ne0nRSoz2dJLoI7_VIPHVIPHoPN9DM5tt; "
            "SUHB=0z1wYnUQHMLUAy; SSOLoginState=1545877367; MLOGIN=1; WEIBOCN_FROM=1110006030; "
            "M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D102803_ctg1_3288_-_ctg1_3288%26uicode%3D20000174%26fid%3D102803"
        )
        # Records of the category currently being crawled (reset per category).
        self.records = None
        # Pool of user-agent strings; one is picked at random per request.
        self.agents = [
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
            "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
            "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
        ]
        self.window = window

    def log(self, msg):
        """Forward *msg* to the attached window's log output, if a window is set."""
        if self.window:
            self.window.write_log_to_text(msg)

    def save_to_csv(self, key, base_dir=None):
        """Dump ``self.records`` as JSON for category *key*.

        Despite the historical name, the output is a JSON file located at
        ``<base_dir>/data/weibo/<today>/result_<key>_<today>.json``.

        :param key: category name used in the file name.
        :param base_dir: root directory for the ``data`` tree; defaults to the
            directory containing this module. The directory and the file are
            now resolved from the same root (previously the directory was
            created next to the module but the file was opened relative to
            the current working directory).
        :return: path of the file that was written.
        :raises OSError: if the directory or file cannot be created.
        """
        today = datetime.date.today()
        if base_dir is None:
            base_dir = os.path.dirname(os.path.abspath(__file__))
        out_dir = os.path.join(base_dir, "data", "weibo", "%s" % today)
        # makedirs also creates the missing "data/weibo" parents and is a
        # no-op when the directory already exists.
        os.makedirs(out_dir, exist_ok=True)
        out_path = os.path.join(out_dir, "result_%s_%s.json" % (key, today))
        with open(out_path, "w", encoding='utf-8') as json_file:
            json.dump(self.records, json_file, ensure_ascii=False)
        return out_path

    def get_random_header(self, path):
        """Build request headers for a feed request with a random user agent.

        :param path: request path (URL without scheme and host) placed in the
            ``path`` header entry.
        :return: dict of header names to values; the cookie comes from
            ``self.cookies`` so the session is defined in one place only.
        """
        return {
            "authority": "m.weibo.cn",
            "method": "GET",
            "path": path,
            "scheme": "https",
            "accept": "application/json, text/plain, */*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cookie": self.cookies,
            "mweibo-pwa": "1",
            "referer": "https://m.weibo.cn/",
            "x-requested-with": "XMLHttpRequest",
            "user-agent": random.choice(self.agents)
        }

    def get_comments(self, mid):
        """Fetch the first page of comments for the post with id *mid*.

        This is best-effort: request or parsing failures are printed and the
        comments collected so far (possibly none) are returned.

        :param mid: id of the post whose comments are requested.
        :return: list of dicts with keys ``nickname``, ``publish_time`` and
            ``content``.
        """
        comment_list = []
        comments_url = "https://m.weibo.cn/api/comments/show?id=%s&page=%s" % (mid, 1)
        header = {
            'User-agent': random.choice(self.agents),
            'Host': 'm.weibo.cn',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch, br',
            'Referer': "https://m.weibo.cn/u/%s" % mid,
            'Connection': 'keep-alive',
            "cookie": self.cookies,
        }
        try:
            # Random pause between requests to avoid hammering the server.
            time.sleep(random.randint(1, 3))
            res = requests.get(comments_url, headers=header, timeout=self.REQUEST_TIMEOUT)
            payload = res.json()  # parse the body once and reuse it
            if payload.get("ok") != 1:
                return comment_list
            items = (payload.get("data") or {}).get("data") or []
        except Exception:
            print(traceback.format_exc())
            return comment_list

        for item in items:
            try:
                soup = BeautifulSoup(item["text"], "html.parser")
                for anchor in soup('a'):  # drop <a> tags
                    anchor.extract()
                comment_content = soup.text.replace("回复:", "")
                if comment_content.strip():
                    # NOTE: the timestamp is synthetic (now plus a random
                    # offset of up to one day), not the comment's real time.
                    comment_time = int(time.time()) + random.randint(0, 3600 * 24)
                    comment_list.append({"nickname": item["user"]["screen_name"],
                                         "publish_time": comment_time,
                                         "content": comment_content})
            except Exception:
                # A malformed comment must not discard the remaining ones.
                print(traceback.format_exc())
        return comment_list

    def _build_record(self, card, sub_type):
        """Convert one feed card containing an ``mblog`` entry into a record dict."""
        mblog = card["mblog"]
        user = mblog["user"]

        record = dict()
        record['_type'] = 0  # 0 marks a Weibo post
        record['sub_type'] = sub_type  # category
        record['uid'] = mblog["id"]
        record['username'] = user["screen_name"]
        record["signature"] = user["description"]
        record['user_photo'] = self.qiniu.fetch(user["avatar_hd"], record['uid'] + "_avatar.jpg")
        record['sex'] = 1 if user["gender"] == "m" else 0
        record["publish_time"] = int(time.time())

        bs = BeautifulSoup(mblog["text"], "html.parser")
        # Collect the label spans before stripping anchors: spans nested in
        # <a> tags would otherwise be removed and the list would miss them.
        label_list = [label_div.text for label_div in bs.select("span.surl-text")]
        for anchor in bs('a'):  # drop <a> tags
            anchor.extract()
        record["content"] = bs.text
        record["face_list"] = []
        record["src_list"] = [image["url"] for image in mblog.get("pics", [])]
        record["src_list"] = [self.qiniu.fetch(i, str(uuid.uuid1()) + "weibo_image_%s.jpg" % index)
                              for index, i in enumerate(record["src_list"])]
        record['label_list'] = label_list
        record["zhuanfa_count"] = mblog["reposts_count"]
        record["comment_count"] = mblog["comments_count"]
        record["dianzan_count"] = mblog["attitudes_count"]
        record["comment_list"] = self.get_comments(record['uid'])
        print(len(record["comment_list"]))
        record['link'] = card["scheme"]
        record["video_url"] = {}

        page_info = mblog.get("page_info")
        if page_info and "media_info" in page_info and page_info.get("type") == "video":
            media_info = page_info["media_info"]
            # Prefer the SD stream and fall back to HD when SD is missing.
            video_url = media_info.get("mp4_sd_url") or media_info.get("mp4_hd_url")
            if video_url:
                video_url = self.qiniu.fetch(video_url, str(uuid.uuid1()) + "_weibo_video.mp4")
                page_pic = self.qiniu.fetch(page_info["page_pic"]["url"],
                                            str(uuid.uuid1()) + "_weibo_pre_video_image.jpg")
                record["video_url"] = {"pre_image": page_pic, "video_url": video_url}
        return record

    def get_data(self):
        """Crawl every configured category and save each one to its own JSON file.

        A failed request or an unparsable response for one category is
        printed and that category is skipped, so the remaining categories
        are still crawled. A card that cannot be converted is printed and
        skipped without affecting the other cards.
        """
        for url in self.url:
            self.records = []
            header = self.get_random_header(url.get("url").replace("https://m.weibo.cn", ""))
            # Random pause between category requests.
            time.sleep(random.randint(5, 20))
            try:
                res = requests.get(url.get("url"), headers=header,
                                   timeout=self.REQUEST_TIMEOUT).json()
            except Exception:
                print(traceback.format_exc())
                continue

            if res.get("ok") == 1:
                cards = (res.get("data") or {}).get("cards") or []
                for card in cards:
                    if 'mblog' not in card:
                        continue
                    try:
                        record = self._build_record(card, url["sub_type"])
                    except Exception:
                        print(traceback.format_exc())
                        continue
                    print(record)
                    self.records.append(record)
            self.save_to_csv(url["sub_type"])


if __name__ == '__main__':
    # Script entry point: build a crawler without a log window and crawl
    # every configured category once.
    SpiderWeibo().get_data()
# posted @ 2019-03-17 22:50  牛牛码代码  阅读(433)  评论(0)  编辑  收藏  举报