Python WeChat Official Account Crawler Script (Collects Article Titles, Links, Like Counts, Read Counts, etc.)

I. Preparation

1. A WeChat Official Account (log in to the platform backend to obtain every article's title and link for a target account)

Tutorial on registering a personal WeChat Official Account: https://kf.qq.com/faq/120911VrYVrA151009eIrYvy.html

2. Fiddler plus the desktop (PC) WeChat client (packet capture to obtain like and read counts)

Fiddler WeChat packet-capture tutorial: https://www.cnblogs.com/du-hong/p/16821199.html

II. Fetching All Articles of a Target Official Account

1. Obtain the cookie for the Official Account platform backend, then collect every article's title and link

(Screenshots: obtaining the token, cookie, and fakeid from the logged-in mp.weixin.qq.com backend.)

2. Fill the token, cookie, and fakeid obtained in the steps above into the script's configuration
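
If you still need the fakeid, it can also be looked up by account name with the same token and cookie. A minimal sketch, assuming the searchbiz endpoint and parameter names commonly seen when capturing the backend's account-search request (verify them against your own session):

import requests

def find_fakeid(name, token, cookie):
    """Search the backend for an account by name and print candidate fakeids."""
    url = "https://mp.weixin.qq.com/cgi-bin/searchbiz"
    params = {
        "action": "search_biz",
        "query": name,    # account name (or part of it)
        "begin": "0",
        "count": "5",
        "token": token,   # same token as in GzhSpider
        "lang": "zh_CN",
        "f": "json",
        "ajax": "1",
    }
    headers = {"user-agent": "Mozilla/5.0", "cookie": cookie}
    data = requests.get(url, headers=headers, params=params).json()
    # Assumed response shape: {"list": [{"fakeid": "...", "nickname": "..."}, ...]}
    for item in data.get("list", []):
        print(item.get("nickname"), item.get("fakeid"))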

#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
@file:GzhSpider.py
@time:2022/12/28
"""
import time
from time import sleep

import requests
import pandas as pd
import json


class GzhSpider(object):
    def __init__(self):
        # All three values come from the logged-in mp.weixin.qq.com backend (see step 1)
        self.token = "25351****"
        self.fakeid = "MzA4MzYwNTA0Mg=="
        self.cookie = ""

    def get_html(self, page):
        """
        Fetch one page of the article list from the Official Account backend.
        :param page: page number (0-based)
        :return: raw JSON text of the response
        """
        params = {
            "action": "list_ex",
            "fakeid": self.fakeid,
            "query": "",
            "begin": str(page * 4),
            "count": "4",
            "type": "9",
            "need_author_name": "1",
            "token": self.token,
            "lang": "zh_CN",
            "f": "json",
            "ajax": "1"
        }
        url = "https://mp.weixin.qq.com/cgi-bin/appmsg"
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54',
            "cookie": self.cookie
        }
        response = requests.get(url, headers=headers, params=params)
        return response.text

    def parse_data(self, items):
        results = []
        items = json.loads(items)
        if "app_msg_list" not in items:
            # No article list in the response (expired cookie/token or rate limiting)
            return []
        for item in items["app_msg_list"]:
            create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(item["create_time"]))
            # Placeholders for now; filled in by the read/like-count script in part III
            readNum, likeNum, old_like_num = (0, 0, 0)
            results.append({
                "title": item['title'],
                "url": item['link'],
                "create_time": create_time,
                "author_name": item["author_name"],
                "readNum": readNum,
                "likeNum": likeNum,
                "old_like_num": old_like_num
            })
        print(json.dumps(results, indent=4, ensure_ascii=False))  # ensure_ascii=False keeps Chinese titles readable
        return results

    def save(self, results):
        data = pd.DataFrame(results)
        data.to_csv("data.csv", index=False)  # index=False avoids a stray index column when the file is re-read later

    def run(self):
        results = []
        for i in range(25):  # crawl 25 pages (4 articles per page)
            html = self.get_html(i)
            result = self.parse_data(html)
            if not result:
                break  # stop early when a page comes back empty or fails
            results.extend(result)
            sleep(5)  # throttle to reduce the chance of rate limiting
        self.save(results)


if __name__ == '__main__':
    GzhSpider().run()
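
One caveat the script above does not handle: the appmsg endpoint rate-limits aggressively, and when it does the JSON carries an error object instead of app_msg_list. A minimal guard, assuming the base_resp/ret error shape commonly observed for this endpoint (verify against your own captures):

import json

def is_rate_limited(raw_json):
    """Return True if an appmsg response looks like a rate-limit error."""
    items = json.loads(raw_json)
    # Assumed error shape: {"base_resp": {"ret": 200013, "err_msg": "freq control"}};
    # treat any nonzero ret as a failure and back off before retrying.
    return items.get("base_resp", {}).get("ret", 0) != 0

When this returns True, pausing for a long while (or refreshing the cookie) tends to work better than retrying immediately.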

III. Collecting Like and Read Counts


1. Open an article's detail page in the desktop WeChat client and refresh it, then use Fiddler to capture the PC WeChat cookie, User-Agent, uin, key, pass_ticket, and appmsg_token

(Screenshots: the captured request in Fiddler and the fields to copy.)

2. Fill in the script's configuration and run it; the results are exported to a data1.csv file

#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
@file:test4.py
@time:2022/12/28
"""
import time

import requests
import pandas as pd


def getMoreInfo(link):
    # Extract mid, __biz, idx, sn from the article link
    # (assumes the canonical parameter order: ...?__biz=..&mid=..&idx=..&sn=..)
    mid = link.split("&")[1].split("=")[1]
    idx = link.split("&")[2].split("=")[1]
    sn = link.split("&")[3].split("=")[1]
    _biz = link.split("&")[0].split("_biz=")[1]

    # Session-constant values captured with Fiddler
    # req_id = "0614ymV0y86FlTVXB02AXd8p"
    pass_ticket = ""  # copy from Fiddler
    appmsg_token = ""  # copy from Fiddler
    uin = ""  # copy from Fiddler
    key = ""  # copy from Fiddler

    # Endpoint that returns an article's stats (read/like counts)
    url = "http://mp.weixin.qq.com/mp/getappmsgext"
    # The cookie avoids a login flow; the User-Agent should match the WeChat built-in browser
    phoneCookie = ""  # copy from Fiddler
    headers = {
        "Cookie": phoneCookie,
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63070517)"
    }
    # POST body; these fields can be copied verbatim from the Fiddler capture
    data = {
        "is_only_read": "1",
        "is_temp_url": "0",
        "appmsg_type": "9",
        'reward_uin_count': '0'
    }
    """
    Query-string parameters:
    __biz identifies the Official Account (unique per account)
    mid, sn, idx identify the article; extracted from its URL above
    key and appmsg_token are copied from Fiddler
    pass_ticket also comes from Fiddler
    """
    params = {
        "__biz": _biz,
        "mid": mid,
        "sn": sn,
        "idx": idx,
        "key": key,
        "pass_ticket": pass_ticket,
        "appmsg_token": appmsg_token,
        "uin": uin,
        "wxtoken": "777",
    }

    content = requests.post(url, headers=headers, data=data, params=params).json()
    # Extract the read and like counts; a missing key (e.g. expired credentials) defaults to 0
    try:
        readNum = content["appmsgstat"]["read_num"]
        print("Read count: " + str(readNum))
    except KeyError:
        readNum = 0
    try:
        likeNum = content["appmsgstat"]["like_num"]
        print("Like count: " + str(likeNum))
    except KeyError:
        likeNum = 0
    try:
        old_like_num = content["appmsgstat"]["old_like_num"]
        print("Old like count: " + str(old_like_num))
    except KeyError:
        old_like_num = 0

    return readNum, likeNum, old_like_num


df = pd.read_csv("data.csv")
try:
    for index, row in enumerate(df.itertuples()):
        readNum, likeNum, old_like_num = getMoreInfo(row.url)
        df.loc[index, 'likeNum'] = likeNum
        df.loc[index, 'readNum'] = readNum
        df.loc[index, 'old_like_num'] = old_like_num
        print(row.url)
        # Sleep 3s between requests to avoid being blocked
        time.sleep(3)
except Exception as e:
    # On failure (e.g. an expired key), keep whatever rows were collected so far
    print(e)
df.to_csv("data1.csv", index=False)
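
The split-based extraction in getMoreInfo assumes the link's query parameters always appear in the order __biz, mid, idx, sn. A more defensive variant using only the standard library, offered as an optional drop-in:

from urllib.parse import urlparse, parse_qs

def parse_link(link):
    """Extract __biz, mid, idx, sn from an article URL, regardless of parameter order."""
    qs = parse_qs(urlparse(link).query)
    # parse_qs also URL-decodes values (e.g. %3D%3D back to the trailing == in __biz)
    return qs["__biz"][0], qs["mid"][0], qs["idx"][0], qs["sn"][0]

With this helper, the first four lines of getMoreInfo reduce to _biz, mid, idx, sn = parse_link(link).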

IV. Collected Data Preview

(Screenshot: a sample of the exported CSV data.)
