A Weibo crawler that searches users by keyword
1. Log in and obtain cookies: sign in to weibo.com in a browser, then copy the Cookie request header from the developer tools' Network panel.
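A quick way to check that the copied cookie string still represents a logged-in session is a sketch like the following. The redirect check against passport.weibo.com is an assumption based on observed behavior, not a documented contract:

import requests

# Hedged sketch: Weibo's user search requires login; a logged-out request is
# usually redirected to passport.weibo.com (observed behavior, an assumption).
cookies_str = "SINAGLOBAL=...; SUB=...; ..."  # paste the copied Cookie header here
resp = requests.get('https://s.weibo.com/user?q=test',
                    headers={'Cookie': cookies_str,
                             'user-agent': 'Mozilla/5.0'})
print('logged in' if 'passport.weibo.com' not in resp.url else 'cookies expired')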
2. Convert the cookie string into a cookies dict
# -*- coding: utf-8 -*-
# TODO convert cookies_str to cookies_dic
# @Date : 2022/4/22 9:38
# @Author : layman
cookies_str = "SINAGLOBAL=462092313429110.737.1648189947190; login_sid_t=799d349cdfsd25759903d131ca6fd0ad0; cross_origin_proto=SSL; _s_tentry=weibo.com; Apache=8348613412866.332.1650589816565; ULV=1650589816569:2:1:1:8348613412866.332.1650589816565:1648189947200; SUB=_2A25PZnDJDeRhGeFN6VUW-S_Kyj6IHXVsEuUBrDV8PUNbmtAKLUL6kW9NQFh55mlCd6g7TuU659NR2F5DNWShYC_i; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WF4kv-4n5KEAdq3XeiQfdqc5JpX5KzhUgL.FoM0eoMN1K2ceKz2dJLoI7LbIgUjqPL_qgRt; ALF=1682125848; SSOLoginState=1650589849; wvr=6; webim_unReadCount=%7B%22time%22%3A1650589853165%2C%22dm_pub_total%22%3A9%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A32%2C%22msgbox%22%3A0%7D; PC_TOKEN=0d19237494; WBStorage=4d96c54e|undefined"
cookies_dic = {}
for cookie in cookies_str.split('; '):
    # split on the first '=' only, so values that themselves contain '=' stay intact
    key, _, value = cookie.partition('=')
    cookies_dic[key] = value
print(cookies_dic)
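The same conversion can also be done with the standard library's cookie parser; a minimal alternative sketch, assuming the browser-copied string is well-formed:

from http.cookies import SimpleCookie

# Parse the browser-copied string with the stdlib parser instead of split().
simple = SimpleCookie()
simple.load(cookies_str)
cookies_dic = {key: morsel.value for key, morsel in simple.items()}
print(cookies_dic)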
3. Crawl and store the results
# -*- coding: utf-8 -*-
# TODO Weibo user search
# @Date : 2022/4/22 9:12
# @Author : layman
import time

import pymysql
import requests
from lxml import etree
headers = {
    'referer': 'https://s.weibo.com/user?q=%E5%AE%9C%E6%98%8C&Refer=weibo_user',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36',
}
cookies = {'SINAGLOBAL': '462092384310.737.1648189947190', 'login_sid_t': '799d349cf324w903d131ca6fd0ad0',
           'cross_origin_proto': 'SSL', 'PC_TOKEN': 'c797273222', '_s_tentry': 'weibo.com',
           'Apache': '8348613412866.332.1650589816565',
           'ULV': '1650589816569:2:1:1:8348613412866.332.1650589816565:1648189947200',
           'SUB': '_2A25PZnDJDeRhewrN6VUW-S_Kyj6IHXVsEuUBrDV8PUNbmtAKLUL6kW9NQFh55mlCd6g7TuU659NR2F5DNWShYC_i',
           'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9WF4kv-4n5KEAdq3XeiQfdqc5JpX5KzhUgL.FoM0eoMN1K2ceKz2dJLoI7LbIgUjqPL_qgRt',
           'ALF': '1682125848', 'SSOLoginState': '1650589849', 'wvr': '6',
           'webim_unReadCount': '%7B%22time%22%3A1650589853165%2C%22dm_pub_total%22%3A9%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A32%2C%22msgbox%22%3A0%7D',
           'WBStorage': '4d96c54e|undefined'}
# utf8mb4 instead of utf8 so user names containing emoji can be stored
db = pymysql.connect(host='localhost', port=3306,
                     user='root', passwd='root', db='wxb', charset='utf8mb4')
cursor = db.cursor()
for page in range(1, 51):
    # q=%E5%AE%9C%E6%98%8C is the URL-encoded search keyword ("宜昌")
    resp = requests.get(url=f'https://s.weibo.com/user?q=%E5%AE%9C%E6%98%8C&Refer=weibo_user&page={page}',
                        headers=headers, cookies=cookies)
    time.sleep(1)  # throttle requests to lower the chance of being rate-limited
    html = etree.HTML(resp.text)
    try:
        user_list = html.xpath('//*[@id="pl_user_feedList"]')[0]
        # The three XPath lists are assumed to line up one entry per user card;
        # zip() silently drops trailing entries if any list is shorter.
        for user_name, official, user_fans in zip(user_list.xpath('./div[*]/div[2]/div/a[1]/text()'),
                                                  user_list.xpath('./div[*]/div[2]/p[2]/text()'),
                                                  user_list.xpath('./div[*]/div[2]/p[3]/span[2]/a/text()')):
            print(official)
            if official is None or not str(official).strip():
                official = '非官微'  # "not an official account"
            values = (user_name, official, user_fans)
            try:
                sql = "INSERT INTO weibo(user_name, official, user_fans) VALUES (%s,%s,%s)"
                cursor.execute(sql, values)
                db.commit()
            except pymysql.MySQLError:
                db.rollback()  # skip rows that fail to insert
    except IndexError:
        # no result list on this page (blocked, logged out, or past the last page)
        pass
db.close()
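The INSERT above assumes a weibo table already exists in the wxb database. A minimal schema matching the three columns might look like the following sketch; column types and sizes are assumptions:

import pymysql

db = pymysql.connect(host='localhost', port=3306,
                     user='root', passwd='root', db='wxb', charset='utf8mb4')
cursor = db.cursor()
# Column sizes are assumptions; adjust as needed.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS weibo (
        id INT AUTO_INCREMENT PRIMARY KEY,
        user_name VARCHAR(255),
        official VARCHAR(255),
        user_fans VARCHAR(64)
    ) DEFAULT CHARSET=utf8mb4
""")
db.close()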