思路
在聚合数据申请账号(https://www.juhe.cn/)
通过聚合数据提供的“微信精选”API 获取文章列表(包含每篇文章的标题和链接)
通过 newspaper 库提取相应文章的文本内容;关于 newspaper 库的使用方法可以参考这里
代码
# -*- coding: utf-8 -*-
# !/usr/bin/env python
# Time: 2019/5/9 18:57
# Author: sty
# File: get_data.py
import json, urllib
from urllib.parse import urlencode
import requests
import json
import re
from newspaper import Article
# Raw string: the escapes below are regex escapes, not Python string escapes.
# Compiled once at import time so repeated calls reuse the same pattern object.
_PUNCTUATION_RE = re.compile(
    r"[\s+\.\!\/<>“”,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+"
)


def remove_punctuation(strs):
    """Return ``strs`` with whitespace and punctuation removed.

    Every run of characters matched by ``_PUNCTUATION_RE`` is deleted. The
    pattern covers all whitespace, common ASCII symbols, curly quotes and
    CJK marks such as the ideographic full stop and enumeration comma.

    :param strs: text to clean
    :return: the cleaned text (an empty string stays empty)
    """
    return _PUNCTUATION_RE.sub("", strs.strip())
# Raw string: the escapes below are regex escapes, not Python string escapes.
# Compiled once at import time so repeated calls reuse the same pattern object.
_UNUSUAL_PUNCTUATION_RE = re.compile(
    r"[\s+\.\!\/<>“”,$%^*(+\"\']+|[+——、~@#¥%……&*()]+"
)


def remove_unusual_upunctuation(strs):
    """Return ``strs`` with whitespace and most symbols removed.

    Unlike ``remove_punctuation``, the pattern used here does not contain
    the ideographic full stop or the question mark, so those two characters
    survive in the result.

    NOTE(review): ASCII '.', '!' and ',' are still removed by the first
    character class - confirm whether sentence punctuation was meant to be
    kept in full.

    :param strs: text to clean
    :return: the cleaned text (an empty string stays empty)
    """
    return _UNUSUAL_PUNCTUATION_RE.sub("", strs.strip())
# Fetch the current article list and print the details of each article.
def request(key="_______", pno=1, ps=50, timeout=10):
    """Query the juhe.cn WeChat article API and print every article's text.

    For each entry of ``result.list`` in the JSON response, the article at
    the entry's ``url`` is downloaded and parsed with newspaper's
    ``Article``; the entry's title and the cleaned article text are printed.

    :param key: API key generated by juhe.cn when applying for this API;
        replace the placeholder default with your own key
    :param pno: value sent as the ``pno`` query parameter
    :param ps: value sent as the ``ps`` query parameter
    :param timeout: seconds to wait for the API before giving up
    :raises requests.HTTPError: if the API answers with an HTTP error status
    :raises RuntimeError: if the response carries no ``result.list``
    """
    api_url = "http://v.juhe.cn/weixin/query"
    payload = {
        "pno": pno,
        "ps": ps,
        "dtype": "json",
        "key": key,
    }
    # Without a timeout a stalled connection would block forever.
    response = requests.get(api_url, params=payload, timeout=timeout)
    response.raise_for_status()
    res = response.json()

    # Fail with a readable message instead of a bare TypeError/KeyError when
    # the payload does not contain an article list.
    result = res.get("result") if isinstance(res, dict) else None
    articles = result.get("list") if isinstance(result, dict) else None
    if articles is None:
        raise RuntimeError(
            "Unexpected response from {}: {!r}".format(api_url, res)
        )

    for detail in articles:
        article = Article(detail["url"], language="zh")  # Chinese
        article.download()
        article.parse()
        text_res = article.text.strip()
        print("Title is :", detail["title"])
        # Collapse blank lines before stripping the remaining symbols.
        text_res = text_res.replace("\n\n", "")
        print(remove_unusual_upunctuation(text_res))
# Run the fetch only when executed as a script, not when imported.
if __name__ == '__main__':
    request()
www.juhe.cn