python3 爬取合同附件

code

code
#coding=utf-8
import requests
import time
import os

def formatFloat(num):
    """Return ``num`` rendered as a string with exactly two decimal places."""
    two_decimals = '.2f'
    return format(num, two_decimals)

# Download a file
def downloadFile(name, url, chunk_size=8192, timeout=30):
    """Stream ``url`` into the local file ``name``, reporting progress.

    The body is fetched with a streaming HTTP GET and written chunk by chunk.
    Roughly every two seconds a line with the progress and the transfer speed
    since the previous report is printed.

    Args:
        name: Destination path; opened in binary write mode, so an existing
            file is truncated.
        url: Address to download.
        chunk_size: Number of bytes requested from the response per read.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or when
            the server answers with a 4xx/5xx status.
        OSError: If the destination file cannot be opened or written.

    Note:
        If the transfer fails midway, the partially written file is left on
        disk; callers that care should remove it.
    """
    headers = {'Proxy-Connection': 'keep-alive'}
    # The context managers guarantee the connection and the file handle are
    # released even when the transfer raises.
    with requests.get(url, stream=True, headers=headers, timeout=timeout) as r:
        # Without this an HTTP error page would be saved under the target name.
        r.raise_for_status()

        # Content-Length is optional (e.g. chunked responses); fall back to
        # reporting the downloaded size instead of a percentage.
        raw_length = r.headers.get('content-length')
        try:
            length = float(raw_length) if raw_length is not None else 0.0
        except ValueError:
            length = 0.0

        count = 0        # bytes written so far
        count_tmp = 0    # bytes written at the time of the previous report
        time1 = time.monotonic()
        with open(name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                if not chunk:
                    continue
                f.write(chunk)
                count += len(chunk)
                elapsed = time.monotonic() - time1
                if elapsed > 2:
                    # Divide by the real interval, which is always a bit
                    # longer than the 2-second threshold.
                    speed = (count - count_tmp) / 1024 / 1024 / elapsed
                    if length > 0:
                        progress = formatFloat(count / length * 100) + '%'
                    else:
                        progress = formatFloat(count / 1024 / 1024) + 'M'
                    print(name + ': ' + progress + ' Speed: ' + formatFloat(speed) + 'M/S')
                    count_tmp = count
                    time1 = time.monotonic()
    
dic = {}
# Headers sent with every contract-list request. The 'token' value here is only
# a placeholder: it is replaced below by the token returned from the login call.
# NOTE(review): this token and the login credentials further down are hard-coded
# in the source; consider loading them from the environment or a config file.
headers = {
    'Accept': 'application/json, text/plain, */*',
    'client': 'Web',
    'Content-Type': 'application/json;charset=UTF-8',
    'Origin': "http://baidu.com.com",
    'Referer': "http://baidu.com.com",
    'token': 'NGKPaLge8urbOlOAuHasURwYP4AKQIo8O1zad5F3vLA=',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
}

# Directory that receives the downloaded files (next to this script)
base_dir = os.path.dirname(os.path.abspath(__file__))
file_dir = os.path.join(base_dir, "files")
# exist_ok avoids the race between checking for the directory and creating it
os.makedirs(file_dir, exist_ok=True)

# Login endpoint
# NOTE(review): the credentials travel as GET query parameters, which tend to
# end up in server and proxy logs — confirm whether the endpoint accepts POST.
url = "http://baidu.com.com/login/pc"
payload = {"userName": "abc", "passWord": "icloudeep123", "kaptcha": "12345"}
r = requests.get(url, params=payload, timeout=30)
# Fail with a clear HTTP error instead of a confusing JSON/KeyError below
r.raise_for_status()

# Fetch the session token and use it for all following requests
dic["token"] = r.json()['data']['token']
headers["token"] = dic["token"]

print(dic["token"])

# Contract numbers of entries that have no attachment URL (HTML contracts)
html_c_list = []

# count: successful downloads; c: running item number across all pages
count = 0
c = 0

# Contract numbers whose download raised an exception
error_list = []

# Pages 1 through 59 of the list
for i in range(1, 60):
    # Contracts in progress (status=2) for Taizhou (regionCode=331000)
    url = "http://baidu.com.com/contracts?page={}&size=40&contractNo=&regionCode=331000&status=2".format(i)
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()
    contract_list = r.json()['data']['content']
    for j in contract_list:
        c += 1
        print("第{}页,第{}个".format(i, c))
        contractUrl = j["contractUrl"]
        contractNo = j["contractNo"]
        contractNo_tmp = contractNo + "\n"
        if contractUrl is None:
            # No attachment to download; remember the contract number only
            html_c_list.append(contractNo_tmp)
            continue

        companyName = j["companyName"]
        file_name = "{}_{}.pdf".format(companyName, contractNo)
        # A path separator inside the company name or contract number would
        # point the download outside file_dir, so neutralise it.
        for sep in (os.sep, os.altsep):
            if sep:
                file_name = file_name.replace(sep, "_")
        try:
            downloadFile(os.path.join(file_dir, file_name), contractUrl)
        except Exception as e:
            # Best effort: record the failure and carry on with the next item
            error_list.append(contractNo_tmp)
            print(e)
        else:
            count += 1

# Append the contracts that had no attachment URL to records.txt
record = os.path.join(base_dir, "records.txt")
with open(record, "a+") as f:
    f.writelines(html_c_list)

# Append the contracts whose download failed to error.txt
error = os.path.join(base_dir, "error.txt")
with open(error, "a+") as f:
    f.writelines(error_list)

print("已下载:{}".format(count))
print("下载失败:{}".format(len(error_list)))
print("未下载:{}".format(len(html_c_list)))

print("总数:{}".format(count + len(html_c_list) + len(error_list)))

 

 

 

 

 

 

 

 

posted @ 2020-12-28 11:35  anobscureretreat  阅读(267)  评论(0)  编辑  收藏  举报