微博搜索
"""Collect Weibo keyword-search results and store them in MongoDB.

For each search-result page of ``SEARCH_KEYWORD`` the script follows every
post link it finds, queries the ``ajax/statuses/show`` and
``ajax/statuses/longtext`` endpoints for that post, and inserts the combined
record into the ``search_list`` collection of the local ``weibo`` database.

author: Zhang Xin
date: 2021/12/3 10:01

URLs noted by the author:
https://weibo.com/ajax/statuses/longtext?id=KDJGenW1X
https://weibo.com/1281382091/KDJGenW1X?refer_flag=1001030103_
"""
import os
import random
import re
import time
from urllib.parse import quote

import pymongo
import requests
from lxml import etree

SEARCH_KEYWORD = '新婚姻法'
FIRST_PAGE = 19          # inclusive; the original loop was range(19, 101)
LAST_PAGE = 100          # inclusive
RESULT_SLOTS = 22        # div positions probed on every result page
MIN_DELAY_SECONDS = 3
MAX_DELAY_SECONDS = 5
MAX_RETRIES = 5          # connection-level retries per request
REQUEST_TIMEOUT = 30     # seconds; without it a stalled socket blocks forever

# NOTE(review): certificate verification is disabled, exactly as in the
# original script, while a login cookie is being sent. Set this to True
# unless there is a known reason for it to stay off.
VERIFY_TLS = False

STATUS_API = 'https://weibo.com/ajax/statuses/show?id='
LONGTEXT_API = 'https://weibo.com/ajax/statuses/longtext?id='

# NOTE(review): a session cookie committed to source is a credential leak and
# goes stale. It is kept only as a fallback; export WEIBO_COOKIE to override.
_DEFAULT_COOKIE = (
    'SINAGLOBAL=209674443713.62775.1637812588940; '
    'SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFrm3zGJwUUhLB_Zq91EPT85JpX5KMhUgL.Fo-4ehn7SoeXehe2dJLoI05LxK-L12zLBKBLxK-LBK-L12zLxKML1-2L1hBLxK-L12zL1hMLxKqLBo5L1KB4e0Mt; '
    'UOR=,,login.sina.com.cn; '
    'ALF=1670033602; '
    'SSOLoginState=1638497603; '
    'SCF=AvfZc65wQjQdiV7RbqiIW2ty9XKEfdXFF4Sj9KtoCva0Pqi5xTUK1Jc5QCmWvvSik408olEIiaU8s4J6hmSiJj4.; '
    'SUB=_2A25MrQ0TDeRhGeNH61oR9i3Iyz-IHXVv23nbrDV8PUNbmtAKLWvDkW9NSvWJkV4-FW9DdWOqkOlW-djeqAeQHm3n; '
    '_s_tentry=login.sina.com.cn; '
    'Apache=2332814448343.1055.1638497606244; '
    'ULV=1638497606602:20:1:5:2332814448343.1055.1638497606244:1638173526637'
)

_TAG_RE = re.compile(r'<[^>]*>')
# Characters removed from the stored post text: newline, space, zero-width space.
_NOISE_TABLE = str.maketrans('', '', '\n \u200b')

# Failures that cause a single post to be skipped instead of aborting the run.
_SKIPPABLE_ERRORS = (
    requests.RequestException,
    pymongo.errors.PyMongoError,
    ValueError,        # response body was not valid JSON
    KeyError,          # expected field missing from the JSON payload
    TypeError,
    AttributeError,
    IndexError,
)


def remove_label(content):
    """Return *content* with spaces and HTML tags removed.

    The previous implementation joined the matches of ``>(.*?)<``, which
    discarded any text located before the first tag or after the last one.
    Tags are now deleted and all surrounding text is kept.

    Args:
        content: Text that may contain HTML markup.

    Returns:
        The text without space characters and without ``<...>`` tags.
    """
    content = content.replace(' ', '')
    if '<' not in content:
        return content
    return _TAG_RE.sub('', content)


def _clean_text(raw):
    """Strip markup from *raw* and drop newlines, spaces and zero-width spaces."""
    return remove_label(raw).translate(_NOISE_TABLE)


def _pause():
    """Sleep for a random whole number of seconds between the configured bounds."""
    time.sleep(random.randint(MIN_DELAY_SECONDS, MAX_DELAY_SECONDS))


def _fetch(url, headers):
    """GET *url* through a short-lived session that retries failed connections.

    A fresh session is used per request (as the original script did) so no
    cookies from earlier responses can replace the explicit ``cookie`` header.
    Assigning ``requests.adapters.DEFAULT_RETRIES`` has no effect on adapters,
    so the retry count is applied by mounting an ``HTTPAdapter`` instead, and
    the session is closed on exit so its connections are released.

    Returns:
        The ``requests.Response``; the body is already fully read.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    with requests.Session() as session:
        adapter = requests.adapters.HTTPAdapter(max_retries=MAX_RETRIES)
        session.mount('https://', adapter)
        session.mount('http://', adapter)
        return session.get(
            url,
            headers=headers,
            verify=VERIFY_TLS,
            timeout=REQUEST_TIMEOUT,
        )


def _scrape_post(detail_url, headers):
    """Build the record for the post whose link(s) XPath returned in *detail_url*.

    Args:
        detail_url: Non-empty list of href strings from the result page.
        headers: Request headers (carrying the login cookie).

    Returns:
        A dict ready to be inserted into MongoDB.

    Raises:
        Any of ``_SKIPPABLE_ERRORS`` when a request fails or a field is missing.
    """
    link = ''.join(detail_url)
    # The post id is the last path segment of the link, minus any query string.
    post_id = link.split('/')[-1].partition('?')[0]

    status = _fetch(STATUS_API + post_id, headers).json()
    record = {
        '详情页连接': 'https:' + link,
        '用户名': status['user']['screen_name'],
        '发布时间': status['created_at'],
        '来源': status['source'],
        '分享': status['reposts_count'],
        '评论': status['comments_count'],
        '点赞': status['attitudes_count'],
    }

    long_text = _fetch(LONGTEXT_API + post_id, headers).json()
    try:
        record['文章内容'] = _clean_text(long_text['data']['longTextContent'])
    except (KeyError, TypeError, AttributeError):
        # No usable long-text payload: fall back to the text of the status.
        record['文章内容'] = _clean_text(status['text_raw'])
    return record


def _store(collection, record):
    """Insert *record* unless a document with the same user name already exists."""
    # NOTE(review): duplicates are detected by user name, so only one post per
    # user is ever stored. If the intent is one document per post, key this on
    # '详情页连接' instead — confirm before changing.
    if collection.count_documents({'用户名': record['用户名']}) == 0:
        collection.insert_one(record)
        print('******************************')
        print(record)
        print('入库成功')
        print('******************************')
        print('\n')
    else:
        print(record)
        print('数据已存在')


def main():
    """Walk the configured result pages and store every post that can be read."""
    headers = {'cookie': os.environ.get('WEIBO_COOKIE', _DEFAULT_COOKIE)}
    keyword = quote(SEARCH_KEYWORD)

    mongo = pymongo.MongoClient('localhost', 27017)
    try:
        search_list = mongo['weibo']['search_list']

        for page in range(FIRST_PAGE, LAST_PAGE + 1):
            print(f'*************第{page}页***************')
            _pause()
            url = f'https://s.weibo.com/weibo?q={keyword}&Refer=realtime_weibo&page={page}'
            print(url)

            try:
                html = _fetch(url, headers).content.decode()
            except (requests.RequestException, UnicodeDecodeError) as exc:
                # One unreachable page should not end the whole crawl.
                print(f'search page {page} could not be loaded: {exc!r}')
                continue

            tree = etree.HTML(html)
            if tree is None:
                # Nothing parseable came back for this page.
                continue

            for slot in range(1, RESULT_SLOTS + 1):
                # Link to the post's detail page.
                detail_url = tree.xpath(f'//div[{slot}]/div/div[1]/div[2]/p[1]/a[1]//@href')
                if not detail_url:
                    continue

                # Throttle only when requests are actually about to be sent.
                _pause()
                try:
                    _store(search_list, _scrape_post(detail_url, headers))
                except _SKIPPABLE_ERRORS as exc:
                    print(f'skipped {"".join(detail_url)}: {exc!r}')
    finally:
        mongo.close()


if __name__ == '__main__':
    main()
本文作者:布都御魂
本文链接:https://www.cnblogs.com/wolvies/p/15638487.html
版权声明:本作品采用知识共享署名-非商业性使用-禁止演绎 2.5 中国大陆许可协议进行许可。
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步