requests 爬取 台州市 履行中状态的合同附件
code
# coding=utf-8
"""Crawl the attachments of Taizhou contracts that are currently in progress.

The script logs in to obtain a token, pages through the contract list,
downloads every contract that exposes a ``contractUrl`` into a ``files``
directory next to this script, and appends

* the numbers of contracts without a ``contractUrl`` to ``records.txt``
* the numbers of contracts whose download failed to ``error.txt``

before printing a summary.
"""
import os
import time

import requests

BASE_URL = "http://baidu.com.com"
LOGIN_URL = BASE_URL + "/login/pc"
CONTRACTS_URL = BASE_URL + "/contracts"

# NOTE(security): the credentials are hard-coded and are sent as query
# parameters of a GET request, exactly as the original script did. Move them
# to configuration / use POST if the server supports it -- TODO confirm.
LOGIN_PAYLOAD = {"userName": "abc", "passWord": "icloudeep123", "kaptcha": "12345"}

# Headers sent with every contract-list request; the "token" header is added
# after login instead of shipping a stale token in the source.
BASE_HEADERS = {
    'Accept': 'application/json, text/plain, */*',
    'client': 'Web',
    'Content-Type': 'application/json;charset=UTF-8',
    'Origin': BASE_URL,
    'Referer': BASE_URL,
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
}

# List filter: region 331000 with status 2 (Taizhou, contracts in progress,
# according to the original author's comment).
REGION_CODE = "331000"
CONTRACT_STATUS = "2"
PAGE_SIZE = 40
LAST_PAGE = 59  # pages 1..59 are requested, as in the original range(1, 60)

REQUEST_TIMEOUT = 30    # seconds; without a timeout a stalled server hangs the run
CHUNK_SIZE = 8192       # bytes read per iteration while streaming a download
PROGRESS_INTERVAL = 2   # minimum seconds between two progress lines

# Characters that cannot appear in a file name on common file systems.
_UNSAFE_FILENAME_CHARS = '\\/:*?"<>|'


def formatFloat(num):
    """Return *num* formatted as a string with exactly two decimal places."""
    return '{:.2f}'.format(num)


def _content_length(response):
    """Return the declared body size in bytes, or None if missing/unusable."""
    try:
        length = float(response.headers['content-length'])
    except (KeyError, ValueError):
        # Chunked responses carry no Content-Length header.
        return None
    return length if length > 0 else None


def _safe_filename(text):
    """Return *text* with path separators and other unsafe characters replaced by '_'."""
    return ''.join(
        '_' if ch in _UNSAFE_FILENAME_CHARS or ch < ' ' else ch for ch in text
    )


# Download a file
def downloadFile(name, url, timeout=REQUEST_TIMEOUT):
    """Stream *url* into the file *name*, printing progress while downloading.

    Args:
        name: Path of the file to write (overwritten if it exists).
        url: Address to download from.
        timeout: Seconds to wait for the connection and for each read.

    Raises:
        requests.RequestException: On connection errors, timeouts or an HTTP
            error status (so an error page is never saved as the attachment).
        OSError: If the target file cannot be written.
    """
    headers = {'Proxy-Connection': 'keep-alive'}
    response = requests.get(url, stream=True, headers=headers, timeout=timeout)
    completed = False
    try:
        response.raise_for_status()
        total = _content_length(response)
        received = 0
        reported = 0
        last_report = time.time()
        with open(name, 'wb') as out:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                if not chunk:
                    continue
                out.write(chunk)
                received += len(chunk)
                elapsed = time.time() - last_report
                if elapsed > PROGRESS_INTERVAL:
                    # Divide by the real elapsed time, not a fixed 2 seconds.
                    speed = (received - reported) / 1024 / 1024 / elapsed
                    if total is None:
                        # Size unknown: report megabytes received so far.
                        progress = formatFloat(received / 1024 / 1024) + 'M'
                    else:
                        progress = formatFloat(received / total * 100) + '%'
                    print(name + ': ' + progress + ' Speed: ' + formatFloat(speed) + 'M/S')
                    reported = received
                    last_report = time.time()
        completed = True
    finally:
        response.close()
        # Do not leave a truncated attachment behind after a failure.
        if not completed and os.path.exists(name):
            os.remove(name)


def _login():
    """Log in and return the token found at ``data.token`` of the JSON reply."""
    response = requests.get(LOGIN_URL, params=LOGIN_PAYLOAD, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.json()['data']['token']


def _fetch_contract_page(page, headers):
    """Return the ``data.content`` list of contract-list page *page*."""
    params = {
        "page": page,
        "size": PAGE_SIZE,
        "contractNo": "",
        # The original URL had "&regionCode" mangled into "®ionCode", which
        # silently dropped the region filter.
        "regionCode": REGION_CODE,
        "status": CONTRACT_STATUS,
    }
    response = requests.get(
        CONTRACTS_URL, params=params, headers=headers, timeout=REQUEST_TIMEOUT
    )
    response.raise_for_status()
    return response.json()['data']['content']


def _append_lines(path, lines):
    """Append *lines* (each already newline-terminated) to the file at *path*."""
    with open(path, "a", encoding="utf-8") as f:
        f.writelines(lines)


def main():
    """Run the crawl: log in, download attachments, write the record files."""
    script_dir = os.path.dirname(os.path.abspath(__file__))

    # Directory the attachments are saved to
    file_dir = os.path.join(script_dir, "files")
    os.makedirs(file_dir, exist_ok=True)

    # Obtain the token and attach it to the list-request headers
    token = _login()
    print(token)
    headers = dict(BASE_HEADERS, token=token)

    html_c_list = []   # contracts without an attachment URL
    error_list = []    # contracts whose download failed
    count = 0          # successfully downloaded attachments
    c = 0              # running number of contracts seen across all pages

    for i in range(1, LAST_PAGE + 1):
        for j in _fetch_contract_page(i, headers):
            c += 1
            print("第{}页,第{}个".format(i, c))
            contractUrl = j["contractUrl"]
            contractNo_tmp = j["contractNo"] + "\n"
            if contractUrl is None:
                html_c_list.append(contractNo_tmp)
                continue
            file_name = _safe_filename(
                "{}_{}.pdf".format(j["companyName"], j["contractNo"])
            )
            try:
                downloadFile(os.path.join(file_dir, file_name), contractUrl)
            except Exception as e:
                # Best effort per contract: record the failure and carry on.
                error_list.append(contractNo_tmp)
                print(e)
            else:
                count += 1

    # Contracts that had no attachment to download
    _append_lines(os.path.join(script_dir, "records.txt"), html_c_list)
    # Contracts whose download failed
    _append_lines(os.path.join(script_dir, "error.txt"), error_list)

    print("已下载:{}".format(count))
    print("下载失败:{}".format(len(error_list)))
    print("未下载:{}".format(len(html_c_list)))
    print("总数:{}".format(count + len(html_c_list) + len(error_list)))


if __name__ == "__main__":
    main()