微博根据关键字搜索爬虫

1.登录获取cookies
2.cookie字符串转cookies字典

# -*- coding: utf-8 -*-
# Convert a raw browser "Cookie" header string (cookies_str) into a dict
# (cookies_dic) that can be passed to requests via the `cookies=` argument.
# @Date    : 2022/4/22 9:38
# @Author  : layman
# NOTE(review): this is a real-looking session cookie pasted into source;
# it should normally be loaded from a file or environment variable instead.
cookies_str = "SINAGLOBAL=462092313429110.737.1648189947190; login_sid_t=799d349cdfsd25759903d131ca6fd0ad0; cross_origin_proto=SSL; _s_tentry=weibo.com; Apache=8348613412866.332.1650589816565; ULV=1650589816569:2:1:1:8348613412866.332.1650589816565:1648189947200; SUB=_2A25PZnDJDeRhGeFN6VUW-S_Kyj6IHXVsEuUBrDV8PUNbmtAKLUL6kW9NQFh55mlCd6g7TuU659NR2F5DNWShYC_i; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WF4kv-4n5KEAdq3XeiQfdqc5JpX5KzhUgL.FoM0eoMN1K2ceKz2dJLoI7LbIgUjqPL_qgRt; ALF=1682125848; SSOLoginState=1650589849; wvr=6; webim_unReadCount=%7B%22time%22%3A1650589853165%2C%22dm_pub_total%22%3A9%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A32%2C%22msgbox%22%3A0%7D; PC_TOKEN=0d19237494; WBStorage=4d96c54e|undefined"


def parse_cookie_string(raw):
    """Parse a ``name=value; name2=value2`` cookie string into a dict.

    Args:
        raw: Cookie header text with pairs separated by ``;``.

    Returns:
        A dict mapping each cookie name to its value. Only the FIRST ``=``
        in a pair separates name from value, so values that themselves
        contain ``=`` are kept intact. A pair without ``=`` maps to ``''``.
        Empty segments (e.g. a trailing ``;``) are skipped.
    """
    parsed = {}
    for pair in raw.split(';'):
        pair = pair.strip()
        if not pair:
            continue
        # partition() splits on the first '=' only; split('=')[-1] would
        # drop everything before the last '=' inside the value.
        name, _, value = pair.partition('=')
        parsed[name] = value
    return parsed


cookies_dic = parse_cookie_string(cookies_str)

print(cookies_dic)

3.爬取收集

# -*- coding: utf-8 -*-
# TODO Weibo user search by keyword
# @Date    : 2022/4/22 9:12
# @Author  : layman
import json
import time

import pandas as pd
import pymysql
import requests
from lxml import etree

# Scrape Weibo user-search result pages 1..50 for a fixed, URL-encoded keyword
# and insert (user_name, official, user_fans) rows into the MySQL table `weibo`.
headers = {
    'referer': 'https://s.weibo.com/user?q=%E5%AE%9C%E6%98%8C&Refer=weibo_user',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36',
}
# Session cookies copied from a logged-in browser.
# NOTE(review): hard-coded credentials (cookies and the DB password below)
# should be moved to configuration before this is shared or deployed.
cookies = {'SINAGLOBAL': '462092384310.737.1648189947190', 'login_sid_t': '799d349cf324w903d131ca6fd0ad0',
           'cross_origin_proto': 'SSL', 'PC_TOKEN': 'c797273222', '_s_tentry': 'weibo.com',
           'Apache': '8348613412866.332.1650589816565',
           'ULV': '1650589816569:2:1:1:8348613412866.332.1650589816565:1648189947200',
           'SUB': '_2A25PZnDJDeRhewrN6VUW-S_Kyj6IHXVsEuUBrDV8PUNbmtAKLUL6kW9NQFh55mlCd6g7TuU659NR2F5DNWShYC_i',
           'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9WF4kv-4n5KEAdq3XeiQfdqc5JpX5KzhUgL.FoM0eoMN1K2ceKz2dJLoI7LbIgUjqPL_qgRt',
           'ALF': '1682125848', 'SSOLoginState': '1650589849', 'wvr': '6',
           'webim_unReadCount': '%7B%22time%22%3A1650589853165%2C%22dm_pub_total%22%3A9%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A32%2C%22msgbox%22%3A0%7D',
           'WBStorage': '4d96c54e|undefined'}
# NOTE(review): charset 'utf8' in MySQL is 3-byte only; names containing emoji
# may fail to insert unless the table/connection use utf8mb4 -- confirm.
db = pymysql.connect(host='localhost', port=3306,
                     user='root', passwd='root', db='wxb', charset='utf8')

cursor = db.cursor()
# Parameterized statement, built once instead of on every row.
insert_sql = "INSERT INTO weibo(user_name, official, user_fans) VALUES (%s,%s,%s)"
try:
    for page in range(1, 51):
        try:
            # The headers dict was previously defined but never sent; pass it
            # so the request carries the browser user-agent and referer.
            # A timeout prevents one stalled connection from hanging the run.
            resp = requests.get(url=f'https://s.weibo.com/user?q=%E5%AE%9C%E6%98%8C&Refer=weibo_user&page={page}',
                                headers=headers, cookies=cookies, timeout=10)
            resp.raise_for_status()
        except requests.RequestException as exc:
            print(f'page {page}: request failed: {exc}')
            continue
        finally:
            # Pause between requests whether or not this one succeeded.
            time.sleep(1)

        try:
            html = etree.HTML(resp.text)
        except etree.LxmlError as exc:
            print(f'page {page}: could not parse HTML: {exc}')
            continue
        if html is None:
            print(f'page {page}: empty HTML document')
            continue

        containers = html.xpath('//*[@id="pl_user_feedList"]')
        if not containers:
            # No result list on this page (e.g. past the last page, or the
            # response was not a search-result page); skip it.
            print(f'page {page}: user list not found')
            continue
        user_list = containers[0]

        # NOTE(review): the three XPath queries return flat lists that are
        # paired positionally by zip(); if a result card lacks one of the
        # fields the columns can shift relative to each other -- verify
        # against the live page structure.
        for user_name, official, user_fans in zip(user_list.xpath('./div[*]/div[2]/div/a[1]/text()'),
                                                  user_list.xpath('./div[*]/div[2]/p[2]/text()'),
                                                  user_list.xpath('./div[*]/div[2]/p[3]/span[2]/a/text()')):
            print(official)
            # Blank description text is stored as the '非官微' marker.
            if not str(official).strip():
                official = '非官微'
            values = (user_name, official, user_fans)
            try:
                cursor.execute(insert_sql, values)
                db.commit()
            except pymysql.MySQLError as exc:
                # Keep going on a bad row, but undo the failed statement and
                # report it instead of silently discarding the error.
                db.rollback()
                print(f'page {page}: insert failed for {values!r}: {exc}')
finally:
    # Always release the cursor and the connection, even on error or Ctrl-C.
    cursor.close()
    db.close()

posted @ 2022-04-24 15:52  biglayman  阅读(214)  评论(0)  编辑  收藏  举报