人民日报 自动下载脚本
功能:获取当天的报刊电子版(别的日期也行)。如果把此脚本设置为每日自动执行,效果就是报纸每天会自动呈现在你的电脑桌面上。
因为近期有某些原因需要下载这个报纸,于是写了这个脚本,正好熟悉一下python的pdf操作。当然期间也解决了一些小问题,例如正则表达式和中文乱码的问题。
代码放到github了,https://github.com/raddyfiy/The-Peoples-Daily-download
安装
安装所需模块:pip3 install requests PyPDF2
用法:
下载当天报纸: python3 peoples_daily_download.py
下载指定日期报纸:python3 peoples_daily_download.py -date 20221010
权限:运行脚本的当前目录需要有读写权限(脚本会在其中创建 log 文件、临时文件夹 part 和保存报纸的 newspaper 文件夹)。
最佳使用方式:可以在 Windows 建立一个计划任务,每天定时执行此脚本。报纸保存在运行目录下的 newspaper 文件夹中,所以把计划任务的起始目录设为桌面,报纸就会自动出现在你的桌面。
注意:有时候官网会忘记上传某一页,默认重试五次,依然失败的页会丢弃,继续向后下载。
# -*- coding: UTF-8 -*-
"""Download one issue of the People's Daily e-paper and merge it into a PDF.

Each page of the issue is fetched as a separate PDF into a temporary
directory, the pages are merged into ``People's.Daily.YYYYMMDD.pdf`` inside
the newspaper directory, and the temporary directory is removed afterwards.

Usage:
    python3 peoples_daily_download.py                  # today's issue
    python3 peoples_daily_download.py -date 20221010   # a specific date
"""
import argparse
import datetime
import os
import re
import shutil
import sys
import warnings

import PyPDF2
import requests

warnings.filterwarnings("ignore")

# Everything written to stderr (tracebacks, library warnings, argparse
# errors) is redirected to ./log for the lifetime of the process.
fsock = open('./log', 'w')
sys.stderr = fsock

# Local debugging proxy. It is kept for manual troubleshooting but is NOT
# passed to any request: routing downloads through 127.0.0.1:8080 fails on
# every machine that does not run such a proxy.
proxies = {
    'http': '127.0.0.1:8080',
    'https': '127.0.0.1:8080'
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36"
}

REQUEST_TIMEOUT = 30   # seconds; without a timeout a stalled request hangs forever
MAX_RETRIES = 5        # attempts per page before the page is given up
MIN_PDF_BYTES = 1000   # smaller responses are treated as failed downloads


def _get(url):
    """Send a GET request with the shared headers and a timeout."""
    return requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)


def _old_pdf_url(today, page):
    """Build the direct PDF URL used by the old site layout."""
    compact = today[0:4] + today[5:7] + today[8:10]
    formatpage = "{0:0>2}".format(page)
    downtplurl = "http://paper.people.com.cn/rmrb/images/{0}/{2}/rmrb{1}{2}.pdf"
    return downtplurl.format(today, compact, formatpage)


def _new_pdf_url(today, page):
    """Look up the PDF URL on the page's layout HTML (new site layout).

    Returns None when the layout page contains no PDF attachment link.
    """
    currentPageUrl = "http://paper.people.com.cn/rmrb/pc/layout/{0}/node_{1:0>2}.html".format(
        today.replace("-", ""), page)
    print(currentPageUrl)
    response = _get(currentPageUrl)
    found = re.search(r'''attachement.*?\.pdf''', response.text)
    if found is None:
        return None
    return "http://paper.people.com.cn/rmrb/pc/" + found.group(0)


def _fetch_pdf(resolve_url, today, page):
    """Download one page, retrying up to MAX_RETRIES times.

    ``resolve_url(today, page)`` is re-evaluated on every attempt because
    resolving may itself need a network request. Returns the PDF bytes, or
    None when every attempt failed or returned too little data.
    """
    for _ in range(MAX_RETRIES):
        try:
            url = resolve_url(today, page)
            if url is None:
                continue
            print(url)
            content = _get(url).content
        except requests.RequestException:
            # Network hiccup: count it as a failed attempt and retry.
            continue
        print(len(content))
        if len(content) > MIN_PDF_BYTES:
            return content
    return None


def _download_pages(today, pagenum, partpath, resolve_url):
    """Fetch pages 1..pagenum and store each successful one in partpath."""
    compact = today[0:4] + today[5:7] + today[8:10]
    for page in range(1, pagenum + 1):
        print("第{0}页下载中……".format(page))
        content = _fetch_pdf(resolve_url, today, page)
        if content is None:
            # Only real PDF data is written, so the merge step never sees
            # an error page saved under a .pdf name.
            print("第{0}页下载失败,已跳过".format(page))
            continue
        filename = 'rmrb{}{:0>2}.pdf'.format(compact, page)
        target = os.path.join(partpath, filename)
        print(target)
        with open(target, "wb") as fn:
            fn.write(content)


def download(today, partpath, newspaperpatch):
    """Download every page of one issue into ``partpath``.

    Args:
        today: Issue date formatted as ``YYYY-MM/DD`` (e.g. ``2024-12/05``).
        partpath: Temporary directory for the per-page PDFs; it is emptied
            and recreated on every run.
        newspaperpatch: Directory holding the merged newspapers; created if
            missing.

    Exits the process with status 0 when the issue was already downloaded,
    or when the old-layout site answers 403 for the requested date.
    """
    today2 = today[0:4] + today[5:7] + today[8:10]
    today3 = today.replace("-", "")

    os.makedirs(newspaperpatch, exist_ok=True)
    if "People's.Daily.{}.pdf".format(today2) in os.listdir(newspaperpatch):
        print("该日期已经下载过了!")
        print("You alreay download this newspaper!")
        sys.exit(0)

    # Start from an empty temporary directory so pages left over from an
    # earlier run cannot end up in this issue.
    if os.path.isdir(partpath):
        shutil.rmtree(partpath)
    os.makedirs(partpath)

    coverurl = "http://paper.people.com.cn/rmrb/html/{}/nbs.D110000renmrb_01.htm".format(today)
    response = _get(coverurl)
    pagenum = len(re.findall("nbs", response.text))  # get page number
    print(pagenum)
    if pagenum != 0:  # old date process
        if response.status_code == 403:
            print("你选择的日期太久远,网站不提供。只有两年之内的。")
            sys.exit(0)
        print("下载中……")
        _download_pages(today, pagenum, partpath, _old_pdf_url)
    else:  # new rules after 2024.12.01
        coverurl = "http://paper.people.com.cn/rmrb/pc/layout/{0}/node_01.html".format(today3)
        print(coverurl)
        response = _get(coverurl)
        pagenum = len(re.findall("pageLink", response.text))  # get page number
        _download_pages(today, pagenum, partpath, _new_pdf_url)


def merge(partpath, newspaperpatch):
    """Merge the per-page PDFs in ``partpath`` into one file.

    Pages are merged in file-name order. Files smaller than 10 bytes are
    skipped. The result is written to
    ``<newspaperpatch>/People's.Daily.<YYYYMMDD>.pdf``, where the date is
    taken from the first page's file name.

    Returns:
        The path of the merged PDF, or None when there was nothing to merge.
    """
    print("合并中……")
    filelist = sorted(os.listdir(partpath))
    if not filelist:
        print("没有可合并的页面。")
        return None

    # PyPDF2 renamed PdfFileMerger to PdfMerger; use whichever name the
    # installed version provides, preferring the newer one.
    merger_cls = getattr(PyPDF2, "PdfMerger", None) or PyPDF2.PdfFileMerger
    pdfFM = merger_cls(strict=False)
    outpath = newspaperpatch + "/People's.Daily." + filelist[0][4:12] + ".pdf"
    appended = 0
    try:
        for file in filelist:
            fullpath = partpath + '/' + file
            # Some pages cannot be downloaded from the site; skip files
            # that are effectively empty.
            if os.path.getsize(fullpath) < 10:
                print("第{}页网站不支持下载,已跳过".format(file[-6:-4]))
                continue
            pdfFM.append(fullpath)
            appended += 1
        if appended == 0:
            print("没有可合并的页面。")
            return None
        pdfFM.write(outpath)  # save the merged file under newspaperpatch
    finally:
        pdfFM.close()
    return outpath


def delete(partpath):
    """Remove the temporary per-page directory."""
    shutil.rmtree(partpath)


def menu():
    """Placeholder for an interactive menu; currently does nothing."""
    pass


if __name__ == '__main__':
    menu()
    today = datetime.date.today().strftime("%Y-%m/%d")

    parser = argparse.ArgumentParser(description='ArgUtils')
    parser.add_argument('-date', '--date', type=str, default=None, help="data date")
    args = parser.parse_args()
    if args.date is not None:
        try:
            if len(args.date) != 8 or not args.date.isdigit():
                raise ValueError(args.date)
            parsed = datetime.datetime.strptime(args.date, "%Y%m%d")
        except ValueError:
            # stderr goes to ./log, so report the problem on stdout.
            print("日期格式错误,应为 YYYYMMDD,例如 20221010")
            sys.exit(2)
        today = parsed.strftime("%Y-%m/%d")

    partpath = "./part"  # temporary folder for the single pages; created and removed on every run
    newspaperpatch = './newspaper'  # where merged newspapers are kept; created when missing
    # The current day is downloaded by default; pass -date on the command
    # line to choose another day.
    print("Date: " + today)
    download(today, partpath, newspaperpatch)  # fetch the single pages
    merged = merge(partpath, newspaperpatch)  # merge them into one PDF
    delete(partpath)  # remove the temporary folder
    if merged is None:
        print("下载失败:没有成功下载任何页面。")
        sys.exit(1)
    print("下载成功! 文件在newspaper里。")