WeChat Official Account Crawler -- History Articles: Home Page
In the previous crawler post (https://www.cnblogs.com/jueshilaozhongyi/p/11656435.html) we only scraped the "load more" data of an account's history articles; this time we scrape the data shown on the history page itself, i.e. its home page.
The home-page data comes back embedded in the HTML, or more precisely, inside a JavaScript variable in the page.
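Concretely, the page embeds the article list as HTML-escaped JSON inside a JavaScript variable named msgList (and the account name in a variable named nickname). Here is a minimal sketch of the extraction idea; the sample fragment below is my approximation of what a capture looks like, not a guaranteed format:
import re

# A made-up approximation of the relevant fragment of the page source:
sample_html = """
<script>
var msgList = '{&quot;list&quot;:[{&quot;comm_msg_info&quot;:{&quot;id&quot;:1}}]}';
var nickname = "SomeAccount" || "";
</script>
"""

# The article list comes out as HTML-escaped JSON, to be unescaped and parsed later.
raw = re.findall(r"msgList = '(.*)'", sample_html)[0]
print(raw)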
Shortcomings of this version: 1. It is still not very smart; you need a packet-capture tool to get the home-page URL.
2. Some official accounts have no history-articles page; this approach does not work for them.
3. Some accounts organize their history articles by category; those do not work either (how to handle them will be shared next time).
OK, let's look at the home-page URL first:
action=home (versus action=getmsg, the "load more" endpoint from last time)
Comparing the two addresses, the request path is identical and so are all the later parameters; the only difference is the action value, which is home this time instead of getmsg.
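Since only the action value differs, a captured "load more" URL can be turned into the home-page URL mechanically. A small standard-library sketch, assuming the captured getmsg URL really carries all the parameters the home page needs (which is what the comparison above suggests); the URL in the example is a placeholder:
from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode

def to_home_url(getmsg_url):
    """Swap action=getmsg for action=home, keeping every other parameter."""
    parts = urlsplit(getmsg_url)
    query = parse_qs(parts.query)  # note: parse_qs drops empty-valued params
    query['action'] = ['home']
    return urlunsplit(parts._replace(query=urlencode(query, doseq=True)))

print(to_home_url("https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=XXX&uin=YYY"))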
Now for the code:
# Previously the account name had to be typed in by hand; since the home page
# carries it, we can now grab it with a regular expression, so a step that
# fetches the account name has been added
import requests
import re, os
import time
# the database helpers we wrapped up in the earlier post can be reused directly
from conn.connect_mysql import insert_wechat_content,select_wechat_content
path = os.getcwd()
print(path)
file_path = os.path.join(path, 'content_file')
def get_content_text(url):
    """
    Request the page data.
    :param url: history home-page URL
    :return: response body text
    """
    wechat_home_url = url
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) MicroMessenger/2.3.27(0x12031b13) MacWechat Chrome/39.0.2171.95 Safari/537.36 NetType/WIFI WindowsWechat",
        "cookie": "devicetype=android-28; lang=zh_CN; pass_ticket=sZNf5AG/C0AvageD87nRhK3W3AuVgYP3dYTvz3i57WFq718hIiDmMmA/ICUWA3W; version=2700073a; wap_sid2=CILAnPMFElxnMzRMNjdKbGpLdXYxZ0xzN2JfeldZX25JaGQ1a0EyLTNGUmE5SHZxNGRqTERPX1kybnd6a0Nwd2pONkJiLUxRbW84OU9kdkxjcHJjMHVZRXRxQUVDd2dFQUFBfjCMwsvtBTgNQJVO; wxuin=1583816706"
    }
    result = requests.get(url=wechat_home_url, headers=headers, verify=False)
    r = result.text
    return r
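# Note: verify=False makes requests emit an InsecureRequestWarning on every call.
# If the noise bothers you, urllib3 can silence it (optional, at module level):
# import urllib3
# urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)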
def write_content_file(url):
    """
    Write the fetched page data to a local file.
    :param url: history home-page URL
    :return:
    """
    data = get_content_text(url)
    with open(file_path, 'w+', encoding='utf-8') as f:
        f.write(data)
def read_content_file():
    """
    Read the cached file data back.
    :return: text
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    return text
def find_msg():
    """
    Pull the msgList payload out of the page with a regular expression.
    :return: str
    """
    r = read_content_file()
    msgList = re.findall(r"msgList = \'(.*)\'", str(r))
    return str(msgList[0])
def msg_replace():
    """
    Replace the HTML entity &quot; with a single quote so the payload can be parsed.
    :return: str
    """
    msg = find_msg()
    msg_replace = msg.replace("&quot;", "'")
    return msg_replace
def msg_json():
    """
    Parse the payload into JSON.
    :return: json
    """
    import demjson
    msg = msg_replace()
    msg_json = demjson.decode(msg)
    return msg_json['list']
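# Note: demjson is used because, after msg_replace(), the payload is single-quoted
# and strict json.loads() would reject it. An alternative (my assumption, not tested
# against every page variant) is to skip the replace and unescape the entities instead:
# import json, html
# data = json.loads(html.unescape(find_msg()))['list']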
def get_wechat_name():
    """
    Get the official account's name from the page.
    :return:
    """
    r = read_content_file()
    wechat_name = re.findall(r"nickname = \"(.*)\" \|\| \"\";", str(r))
    # wechat_name = "'{}'".format(wechat_name)
    # print(wechat_name)
    return wechat_name[0]
def format_data():
    """
    Format and save the scraped data.
    :return:
    """
    msg = msg_json()
    wechat_name = get_wechat_name()
    wechat_name = "'{}'".format(wechat_name)
    for i in msg:
        # article title
        title = i['app_msg_ext_info']['title']
        title = "'{}'".format(title)
        # article URL
        content_url = i['app_msg_ext_info']['content_url']
        content_url = "'{}'".format(content_url)
        # cover image
        cover = i['app_msg_ext_info']['cover']
        cover = "'{}'".format(cover)
        # source URL (for reposted articles)
        source_url = i['app_msg_ext_info']['source_url']
        source_url = "'{}'".format(source_url)
        # account the article was reposted from
        source_name = i['app_msg_ext_info']['author']
        source_name = "'{}'".format(source_name)
        # publish time (unix timestamp -> readable string)
        datetime = i['comm_msg_info']['datetime']
        datetime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(datetime))
        datetime = "'{}'".format(datetime)
        print(title, content_url, cover, source_url, source_name, datetime)
        if select_wechat_content(title) == 1:
            print("record already exists")
        else:
            insert_wechat_content(wechat_name, title, content_url, cover, source_url, source_name, datetime)
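# For reference, each element returned by msg_json() looks roughly like this,
# judging from the fields accessed above (the sample values are invented):
# {
#     "comm_msg_info": {"datetime": 1571212800, ...},
#     "app_msg_ext_info": {"title": "...", "content_url": "...", "cover": "...",
#                          "source_url": "...", "author": "...", ...}
# }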
def run(url):
    write_content_file(url)
    format_data()

if __name__ == "__main__":
    url = "https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MzI5NDY1MjQzNA==&uin=MTU4MzgxNjcwNg%3D%3D&key=b1719993cc296ec41a4aad024aa262db236a1b7242d12dd98e5d02bf751cb5e705f8ef8ef6cda9e235519a360bab4c42b4ab301a460e39a67ca76f0945e49ddf2cbaaf03553a73e079426924bbbe17ce&devicetype=iMac+MacBookPro15%2C1+OSX+OSX+10.14.5+build(18F203)&version=12031b13&lang=zh_CN&nettype=WIFI&a8scene=0&fontScale=100&pass_ticket=CSP6SWxOUwP4xAvrB01DuLNCJIO%2FR65vUpx4MFOWrJCce3JldcoyR1VZK4%2BQfXzn"
    run(url)
And that's it. Over!
The greatest joy of learning lies in sharing. I am 绝世老中医; feel free to drop by for a consultation.
Personal QQ: 1978529954