遇事不决,可问春风,春风不语,谨遵本心|

布都御魂

园龄:3年9个月粉丝:2关注:1

微博搜索

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""
author:张鑫
date:2021/12/3 10:01
https://weibo.com/ajax/statuses/longtext?id=KDJGenW1X
https://weibo.com/1281382091/KDJGenW1X?refer_flag=1001030103_
"""
import random
import re
import time
from urllib.parse import quote
 
import pymongo
import requests
from lxml import etree
 
 
# Matches a single HTML tag such as <br />, <a href="..."> or </span>.
# Requiring a letter after "<" (or "</") keeps a bare "<" in ordinary text
# (e.g. "a < b") from being treated as the start of a tag.
_TAG_PATTERN = re.compile(r'</?[A-Za-z][^<>]*>')


def remove_label(content):
    """Return *content* with HTML tags and ``&nbsp`` entities removed.

    Tags are deleted in place, so text that appears before the first tag,
    after the last tag, or on several lines between tags is preserved.
    Both the ``&nbsp;`` and the semicolon-less ``&nbsp`` spellings of the
    entity are removed.

    Args:
        content: The raw text, possibly containing HTML markup.

    Returns:
        The text with tags and non-breaking-space entities stripped.
    """
    text = _TAG_PATTERN.sub('', content)
    # Remove the full entity first so no stray ";" is left behind.
    return text.replace('&nbsp;', '').replace('&nbsp', '')
 
 
# MongoDB target: database "weibo", collection "search_list" on localhost.
database = pymongo.MongoClient('localhost', 27017)
client = database['weibo']
search_list = client['search_list']

# URL-encoded search keyword used in the Weibo search query string.
q = quote('新婚姻法')

# Seconds to wait for any single HTTP request before giving up, so that a
# stalled connection cannot hang the crawl forever.
REQUEST_TIMEOUT = 15

# NOTE(review): this is a logged-in session cookie committed to source.
# It should be loaded from configuration or the environment instead.
headers = {
    'cookie': 'SINAGLOBAL=209674443713.62775.1637812588940; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFrm3zGJwUUhLB_Zq91EPT85JpX5KMhUgL.Fo-4ehn7SoeXehe2dJLoI05LxK-L12zLBKBLxK-LBK-L12zLxKML1-2L1hBLxK-L12zL1hMLxKqLBo5L1KB4e0Mt; UOR=,,login.sina.com.cn; ALF=1670033602; SSOLoginState=1638497603; SCF=AvfZc65wQjQdiV7RbqiIW2ty9XKEfdXFF4Sj9KtoCva0Pqi5xTUK1Jc5QCmWvvSik408olEIiaU8s4J6hmSiJj4.; SUB=_2A25MrQ0TDeRhGeNH61oR9i3Iyz-IHXVv23nbrDV8PUNbmtAKLWvDkW9NSvWJkV4-FW9DdWOqkOlW-djeqAeQHm3n; _s_tentry=login.sina.com.cn; Apache=2332814448343.1055.1638497606244; ULV=1638497606602:20:1:5:2332814448343.1055.1638497606244:1638173526637'
}

# One session is shared by every request. Retries are configured by mounting
# an HTTPAdapter: assigning requests.adapters.DEFAULT_RETRIES after import has
# no effect, because HTTPAdapter captured its default when it was defined.
s = requests.session()
retry_adapter = requests.adapters.HTTPAdapter(max_retries=5)
s.mount('https://', retry_adapter)
s.mount('http://', retry_adapter)

# Pages 19..100 of the search results are crawled.
for page in range(19, 101):
    print(f'*************第{page}页***************')
    # Random pause between result pages to keep the request rate low.
    time.sleep(random.randint(3, 5))
    url = f'https://s.weibo.com/weibo?q={q}&Refer=realtime_weibo&page={page}'
    print(url)
    try:
        # NOTE(review): verify=False disables TLS certificate verification.
        html = s.get(url=url, headers=headers, verify=False,
                     timeout=REQUEST_TIMEOUT).content.decode()
    except (requests.RequestException, UnicodeDecodeError) as exc:
        # A failed page should not abort the remaining pages.
        print(f'第{page}页请求失败: {exc!r}')
        continue
    tree = etree.HTML(html)

    # Probe result slots 1..22 on the page for a link to the post detail page.
    for i in range(1, 23):
        detail_url = tree.xpath(f'//div[{i}]/div/div[1]/div[2]/p[1]/a[1]//@href')
        if not detail_url:
            continue
        # Pause only when a request is actually going to be made.
        time.sleep(random.randint(3, 5))
        try:
            detail_href = ''.join(detail_url)
            # The post id is the last path segment; drop any query string.
            weibo_id = detail_href.split('/')[-1].split('?', 1)[0]

            second_url = 'https://weibo.com/ajax/statuses/show?id=' + weibo_id
            html2 = s.get(url=second_url, headers=headers, verify=False,
                          timeout=REQUEST_TIMEOUT).json()

            search = {
                '详情页连接': 'https:' + detail_href,
                '用户名': html2['user']['screen_name'],
                '发布时间': html2['created_at'],
                '来源': html2['source'],
                '分享': html2['reposts_count'],
                '评论': html2['comments_count'],
                '点赞': html2['attitudes_count'],
            }

            data_url = 'https://weibo.com/ajax/statuses/longtext?id=' + weibo_id
            html3 = s.get(url=data_url, headers=headers, verify=False,
                          timeout=REQUEST_TIMEOUT).json()

            # Prefer the long-text body; fall back to the raw text of the
            # status when the long-text response does not carry one.
            long_text = None
            if isinstance(html3, dict) and isinstance(html3.get('data'), dict):
                long_text = html3['data'].get('longTextContent')
            if not isinstance(long_text, str):
                long_text = html2['text_raw']
            search['文章内容'] = (
                remove_label(long_text)
                .replace('\n', '')
                .replace(' ', '')
                .replace('\u200b', '')
            )

            # NOTE(review): de-duplication is keyed on the user name, so at
            # most one post per user is stored — confirm this is intended.
            count = search_list.count_documents({'用户名': search["用户名"]})
            if count == 0:
                search_list.insert_one(search)
                print('******************************')
                print(search)
                print('入库成功')
                print('******************************')

                print('\n')
            else:
                print(search)
                print('数据已存在')
        except Exception as exc:
            # Best-effort crawl: skip this post but report why, and let
            # KeyboardInterrupt / SystemExit propagate so the run can be stopped.
            print(f'跳过 {detail_url}: {exc!r}')

  

本文作者:布都御魂

本文链接:https://www.cnblogs.com/wolvies/p/15638487.html

版权声明:本作品采用知识共享署名-非商业性使用-禁止演绎 2.5 中国大陆许可协议进行许可。

posted @   布都御魂  阅读(749)  评论(0)  编辑  收藏  举报
   
点击右上角即可分享
微信分享提示
评论
收藏
关注
推荐
深色
回顶
收起