# 【笔记整理】[案例]使用正则表达式来提取36Kr新闻
import datetime
import json
import re
import requests
class Kr36(object):
    """Scrape the 36Kr web-news listing and dump the articles to ``36kr.json``.

    The listing page embeds its data as JSON assigned to
    ``window.initialState`` inside a ``<script>`` tag; that JSON is pulled out
    with a regular expression instead of parsing the HTML.
    """

    # Captures the JSON assigned to window.initialState. The dot is escaped so
    # it only matches a literal ".", and DOTALL lets the payload span lines.
    _STATE_RE = re.compile(
        r'<script>window\.initialState=(.*?)</script>', re.DOTALL
    )

    def __init__(self):
        """Set the target URL and request headers and open the output file.

        ``36kr.json`` is opened in write mode here, so it is truncated as soon
        as the instance is created.
        """
        self.url = "https://36kr.com/information/web_news/"
        self.headers = {
            "Host": "36kr.com",
            "referer": "https://36kr.com/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
        }
        self.file = open("36kr.json", "w", encoding="utf-8")

    def parse_data(self, data):
        """Extract the article entries from the listing page HTML.

        Args:
            data: Decoded HTML text of the listing page.

        Returns:
            A list of dicts with the keys ``title``, ``url``, ``img_url`` and
            ``publish_time``. ``publish_time`` is an ISO-8601 string in the
            machine's local time zone, converted from the millisecond
            timestamp found in the page data.

        Raises:
            ValueError: If the page has no ``window.initialState`` script, or
                its payload is not valid JSON (``json.JSONDecodeError`` is a
                ``ValueError`` subclass).
            KeyError: If the JSON lacks one of the expected keys.
        """
        match = self._STATE_RE.search(data)
        if match is None:
            # Without this guard the failure surfaces as an opaque
            # "'NoneType' object has no attribute 'group'".
            raise ValueError(
                "window.initialState script not found in the page"
            )

        # Convert the captured JSON text into Python objects.
        state = json.loads(match.group(1))

        results_list = []
        for item in state["information"]["informationList"]["itemList"]:
            material = item["templateMaterial"]
            # publishTime is divided by 1000 because fromtimestamp() expects
            # seconds.
            published = datetime.datetime.fromtimestamp(
                material["publishTime"] / 1000
            )
            results_list.append({
                "title": material["widgetTitle"],
                "url": f"https://36kr.com/p/{material['itemId']}",
                "img_url": material["widgetImage"],
                "publish_time": published.isoformat(),
            })
        return results_list

    def get_data(self, timeout=10) -> bytes:
        """Send the request and return the raw response body.

        Args:
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` may block indefinitely.

        Returns:
            The undecoded response body; decoding is left to the caller.

        Raises:
            requests.RequestException: On connection problems, timeouts, or a
                4xx/5xx response status.
        """
        resp = requests.get(self.url, headers=self.headers, timeout=timeout)
        # Fail here on an error status instead of handing an error page to
        # the parser.
        resp.raise_for_status()
        return resp.content

    def save_data(self, data):
        """Write ``data`` to the output file as indented, non-ASCII-escaped JSON.

        Args:
            data: Any JSON-serialisable object.
        """
        self.file.write(json.dumps(data, ensure_ascii=False, indent=4))
        # Flush right away so the output does not depend on the finalizer
        # running.
        self.file.flush()

    def close(self):
        """Close the output file; safe to call more than once."""
        # getattr: __init__ may have failed before self.file was assigned.
        file = getattr(self, "file", None)
        if file is not None and not file.closed:
            file.close()  # close() also flushes buffered data

    def __del__(self):
        """Close the output file when the instance is finalized."""
        self.close()

    def run(self):
        """Fetch the listing page, parse it and save the result."""
        resp = self.get_data()
        data = self.parse_data(resp.decode())
        self.save_data(data)
if __name__ == "__main__":
    # Script entry point: run one fetch/parse/save cycle. The instance is a
    # temporary, so it is finalized as soon as run() returns.
    Kr36().run()
# 本文来自博客园,作者:蕝戀,转载请注明原文链接:https://www.cnblogs.com/juelian/p/17559489.html