Python WeChat Official Account Crawler Script (collects article titles, links, like counts, read counts, etc.)
I. Preparation
1. A WeChat Official Account platform account (used to collect every article's title and link for the target account)
Tutorial for registering a personal WeChat Official Account: https://kf.qq.com/faq/120911VrYVrA151009eIrYvy.html
2. Fiddler + WeChat for PC (packet capture to obtain like and read counts)
Fiddler WeChat packet-capture tutorial: https://www.cnblogs.com/du-hong/p/16821199.html
II. Fetching All Articles of the Target Official Account
1. Log in to the Official Account platform, obtain its cookie, and collect every article's title and link.
2. Fill the token, cookie and fakeid obtained in the step above into the configuration of the script below; the results are saved to data.csv.
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
@file: GzhSpider.py
@time: 2022/12/28
"""
import json
import time
from time import sleep

import pandas as pd
import requests


class GzhSpider(object):
    def __init__(self):
        # Fill these in with the values captured from the mp.weixin.qq.com backend.
        self.token = "25351****"
        self.fakeid = "MzA4MzYwNTA0Mg=="
        self.cookie = ""

    def get_html(self, page):
        """
        Fetch one page of the article list from the Official Account backend.
        :param page: page number (0-based)
        :return: raw JSON text of the response
        """
        params = {
            "action": "list_ex",
            "fakeid": self.fakeid,
            "query": "",
            "begin": str(page * 4),   # 4 articles per page
            "count": "4",
            "type": "9",
            "need_author_name": "1",
            "token": self.token,
            "lang": "zh_CN",
            "f": "json",
            "ajax": "1"
        }
        url = "https://mp.weixin.qq.com/cgi-bin/appmsg"
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54",
            "cookie": self.cookie
        }
        response = requests.get(url, headers=headers, params=params)
        return response.text

    def parse_data(self, items):
        """Extract title, link, publish time and author from one page of results."""
        results = []
        items = json.loads(items)
        if "app_msg_list" not in items:
            # Invalid token/cookie or frequency control: nothing to parse.
            return results
        for item in items["app_msg_list"]:
            create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(item["create_time"]))
            # Read/like counts are filled in later by the Fiddler-based script.
            readNum, likeNum, old_like_num = (0, 0, 0)
            results.append({
                "title": item["title"],
                "url": item["link"],
                "create_time": create_time,
                "author_name": item["author_name"],
                "readNum": readNum,
                "likeNum": likeNum,
                "old_like_num": old_like_num
            })
        print(json.dumps(results, indent=4, ensure_ascii=False))
        return results

    def save(self, results):
        data = pd.DataFrame(results)
        data.to_csv("data.csv")

    def run(self):
        results = []
        for i in range(25):  # crawl 25 pages
            html = self.get_html(i)
            results.extend(self.parse_data(html))
            sleep(5)  # pause between pages to avoid being blocked
        self.save(results)


if __name__ == '__main__':
    GzhSpider().run()
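The run loop above always requests a fixed 25 pages. If the account has a different number of articles, one option is to keep paging until a request comes back without articles. Below is a minimal sketch of that variant, reusing get_html and parse_data from the GzhSpider class above; treating an empty page as the end of the list is an assumption, since it can also mean the token or cookie has expired.

from time import sleep

def run_until_empty(spider, max_pages=200):
    # Page through the article list until a page returns no articles.
    results = []
    for page in range(max_pages):
        items = spider.parse_data(spider.get_html(page))
        if not items:
            # Empty page: either past the last article or the credentials expired.
            break
        results.extend(items)
        sleep(5)  # same pacing as the original run()
    return results

# Usage:
# spider = GzhSpider()
# spider.save(run_until_empty(spider))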
III. Collecting Like and Read Counts
1. Open an article detail page in WeChat for PC and refresh it, then use Fiddler to capture the PC WeChat cookie, User-Agent, uin, key, pass_ticket and appmsg_token.
2. Fill those values into the configuration of the script below and run it; when it finishes, it exports a data1.csv file.
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
@file: test4.py
@time: 2022/12/28
"""
import time

import pandas as pd
import requests


def getMoreInfo(link):
    # Extract mid, idx, sn and __biz from the article link.
    mid = link.split("&")[1].split("=")[1]
    idx = link.split("&")[2].split("=")[1]
    sn = link.split("&")[3].split("=")[1]
    _biz = link.split("&")[0].split("_biz=")[1]

    # Values below stay constant for one capture session; copy them from Fiddler.
    # req_id = "0614ymV0y86FlTVXB02AXd8p"
    pass_ticket = ""     # from Fiddler
    appmsg_token = ""    # from Fiddler
    uin = ""             # from Fiddler
    key = ""             # from Fiddler

    # Endpoint that returns the article's statistics.
    url = "http://mp.weixin.qq.com/mp/getappmsgext"

    # The cookie avoids a login step; the User-Agent should identify the WeChat built-in browser.
    phoneCookie = ""     # from Fiddler
    headers = {
        "Cookie": phoneCookie,
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36 NetType/WIFI MicroMessenger/7.0.20.1781(0x6700143B) WindowsWechat(0x63070517)"
    }

    # POST body; the fixed fields can be copied from Fiddler as well.
    data = {
        "is_only_read": "1",
        "is_temp_url": "0",
        "appmsg_type": "9",
        "reward_uin_count": "0"
    }

    # Request parameters:
    #   __biz identifies the Official Account and is the same for all of its articles;
    #   mid, sn, idx identify a single article and are extracted from its URL;
    #   key, appmsg_token and pass_ticket are copied from Fiddler.
    params = {
        "__biz": _biz,
        "mid": mid,
        "sn": sn,
        "idx": idx,
        "key": key,
        "pass_ticket": pass_ticket,
        "appmsg_token": appmsg_token,
        "uin": uin,
        "wxtoken": "777",
    }

    content = requests.post(url, headers=headers, data=data, params=params).json()

    # Pull the read, like and old-like counts out of the response.
    try:
        readNum = content["appmsgstat"]["read_num"]
        print("read count: " + str(readNum))
    except KeyError:
        readNum = 0
    try:
        likeNum = content["appmsgstat"]["like_num"]
        print("like count: " + str(likeNum))
    except KeyError:
        likeNum = 0
    try:
        old_like_num = content["appmsgstat"]["old_like_num"]
        print("old like count: " + str(old_like_num))
    except KeyError:
        old_like_num = 0
    return readNum, likeNum, old_like_num


df = pd.read_csv("data.csv")
try:
    for index, row in enumerate(df.itertuples()):
        readNum, likeNum, old_like_num = getMoreInfo(row.url)
        df.loc[index, 'likeNum'] = likeNum
        df.loc[index, 'readNum'] = readNum
        df.loc[index, 'old_like_num'] = old_like_num
        print(row.url)
        # Pause 3 seconds between requests to avoid being blocked.
        time.sleep(3)
except Exception as e:
    print(e)
df.to_csv("data1.csv")
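The positional split("&") calls above assume __biz, mid, idx and sn always appear in the same order in the article URL. A more defensive alternative is to parse the query string with the standard library; the sketch below is only an illustration, and the helper name extract_article_params is made up here.

from urllib.parse import urlparse, parse_qs

def extract_article_params(link):
    # Parse the query string rather than relying on parameter order.
    query = parse_qs(urlparse(link).query)
    return (
        query["__biz"][0],   # _biz
        query["mid"][0],     # mid
        query["idx"][0],     # idx
        query["sn"][0],      # sn
    )

# Usage inside getMoreInfo:
# _biz, mid, idx, sn = extract_article_params(link)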
IV. Collected Data Preview
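A minimal sketch for inspecting the exported data1.csv, assuming the column names produced by the scripts above (title, create_time, readNum, likeNum, old_like_num):

import pandas as pd

# Load the merged results and show the ten most-read articles.
df = pd.read_csv("data1.csv")
columns = ["title", "create_time", "readNum", "likeNum", "old_like_num"]
print(df[columns].sort_values("readNum", ascending=False).head(10).to_string(index=False))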