使用 Selenium 爬取详情页中的附件链接,并下载附件
code1:使用 Selenium 登录后,从合同详情页取得附件链接并下载
# -*- coding: utf-8 -*-
"""Log in with Selenium, open each contract's detail page and download its attachment.

Contract numbers are read, one per line, from ``error.txt`` located next to
this script.  Each attachment is saved as ``files/<contractNo>.pdf``.
"""
import os
import sys
import time

_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# The ``lib`` directory one level above this script must be on sys.path
# *before* the third-party imports below, as in the original script.
sys.path.append(os.path.join(os.path.dirname(_SCRIPT_DIR), 'lib'))

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys  # noqa: F401  (unused here; kept from the original)

LOGIN_URL = "http://www.baidu.com/#/login"
DETAIL_URL = "http://www.baidu.com/#/contractDetail?contractNum={}"

# Directory the downloaded files are saved to.
file_dir = os.path.join(_SCRIPT_DIR, "files")

# Text file listing the contract numbers to download, one per line
# (the original comments describe it as the contracts that failed to download).
error = os.path.join(_SCRIPT_DIR, "error.txt")


def formatFloat(num):
    """Return ``num`` formatted as a string with two decimal places."""
    return '{:.2f}'.format(num)


def downloadFile(name, url, timeout=30):
    """Stream ``url`` into the local file ``name``, printing progress roughly every 2 s.

    Args:
        name: Destination path; an existing file is overwritten.
        url: Address fetched with an HTTP GET.
        timeout: Seconds ``requests`` waits for the server to connect or to
            send more data before giving up.

    Raises:
        requests.exceptions.RequestException: Connection failure, timeout,
            or a 4xx/5xx response.
        OSError: The destination file cannot be opened or written.
    """
    headers = {'Proxy-Connection': 'keep-alive'}
    # ``with`` guarantees the connection and the file are released even when
    # the transfer fails half-way.
    with requests.get(url, stream=True, headers=headers, timeout=timeout) as r:
        # Fail early instead of saving an HTML error page as the attachment.
        r.raise_for_status()

        # Content-Length is optional (e.g. chunked responses); 0 means unknown.
        try:
            length = float(r.headers.get('content-length', 0))
        except ValueError:
            length = 0

        count = 0        # bytes written so far
        count_tmp = 0    # bytes written at the previous progress report
        time1 = time.time()
        with open(name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=512):
                if not chunk:
                    continue
                f.write(chunk)
                count += len(chunk)

                elapsed = time.time() - time1
                if elapsed > 2:
                    # Use the real elapsed time rather than assuming exactly 2 s.
                    speed = (count - count_tmp) / 1024 / 1024 / elapsed
                    if length > 0:
                        progress = formatFloat(count / length * 100) + '%'
                    else:
                        # Total size unknown: report megabytes received instead.
                        progress = formatFloat(count / 1024 / 1024) + 'M'
                    print(name + ': ' + progress + ' Speed: ' + formatFloat(speed) + 'M/S')
                    count_tmp = count
                    time1 = time.time()


def asleep(driver):
    """Set a 3.5 s implicit wait on ``driver`` and then sleep for 2 s."""
    driver.implicitly_wait(3.5)
    time.sleep(2)


def _login(driver):
    """Open the login page, fill the three input fields and click the submit element."""
    driver.get(LOGIN_URL)
    driver.maximize_window()
    # NOTE(review): the values typed below are hard-coded in the script;
    # consider reading them from the environment instead.
    driver.find_element(By.XPATH, ".//div[@class='login-container-r']/div[2]/input").send_keys("abc")
    driver.find_element(By.XPATH, ".//div[@class='login-container-r']/div[3]/input").send_keys("icloudeep123")
    driver.find_element(By.XPATH, ".//div[@class='login-container-r']/div[4]/input").send_keys("12345")
    driver.find_element(By.XPATH, ".//div[@class='login-container-r']/div[6]").click()


def _download_contract(driver, contractNo):
    """Open the detail page of ``contractNo`` and download the linked attachment."""
    driver.get(DETAIL_URL.format(contractNo))
    asleep(driver)
    contractUrl = driver.find_element(
        By.XPATH, ".//div[@class='float-left column list-r']/div[4]/div[2]/a"
    ).get_attribute("href")
    # NOTE(review): the download uses a plain ``requests`` call, so it does not
    # carry the browser's login cookies — confirm the link is publicly reachable.
    downloadFile(os.path.join(file_dir, "{}.pdf".format(contractNo)), contractUrl)


def main():
    """Log in once, then download the attachment of every contract listed in ``error.txt``."""
    os.makedirs(file_dir, exist_ok=True)

    # To run without a visible browser window, build the driver like this instead:
    #     chrome_options = webdriver.ChromeOptions()
    #     chrome_options.add_argument('headless')
    #     driver = webdriver.Chrome(options=chrome_options)
    driver = webdriver.Chrome()
    try:
        asleep(driver)
        _login(driver)
        asleep(driver)

        with open(error, "r") as f:
            for line in f:
                contractNo = line.strip()
                if not contractNo:
                    # Skip blank lines instead of requesting an empty contract number.
                    continue
                try:
                    _download_contract(driver, contractNo)
                    print("下载成功!")
                except Exception as e:
                    # Best effort: report the failing contract and carry on with
                    # the rest, including when the link is missing on the page.
                    print(e)
                    print(contractNo)
    finally:
        # Always shut the browser down so no Chrome/chromedriver process lingers.
        driver.quit()


if __name__ == "__main__":
    main()
code2:不使用 Selenium,直接按合同编号拼接下载地址并下载
# -*- coding: utf-8 -*-
"""Download contract PDFs directly by contract number.

Contract numbers are read, one per line, from ``error.txt`` located next to
this script.  Each file is fetched from the contract view URL and saved as
``files/<contractNo>.pdf``.
"""
import os
import time

import requests

_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

CONTRACT_URL = "http://www.baidu.com/contract/view?contractNo={}"

# Directory the downloaded files are saved to.
file_dir = os.path.join(_SCRIPT_DIR, "files")

# Records file; the original comment labels it "HTML contracts".
record = os.path.join(_SCRIPT_DIR, "records.txt")

# Text file listing the contract numbers to download, one per line
# (the original comments describe it as the contracts that failed to download).
error = os.path.join(_SCRIPT_DIR, "error.txt")


def formatFloat(num):
    """Return ``num`` formatted as a string with two decimal places."""
    return '{:.2f}'.format(num)


def downloadFile(name, url, timeout=30):
    """Stream ``url`` into the local file ``name``, printing progress roughly every 2 s.

    Args:
        name: Destination path; an existing file is overwritten.
        url: Address fetched with an HTTP GET.
        timeout: Seconds ``requests`` waits for the server to connect or to
            send more data before giving up.

    Raises:
        requests.exceptions.RequestException: Connection failure, timeout,
            or a 4xx/5xx response.
        OSError: The destination file cannot be opened or written.
    """
    headers = {'Proxy-Connection': 'keep-alive'}
    # ``with`` guarantees the connection and the file are released even when
    # the transfer fails half-way.
    with requests.get(url, stream=True, headers=headers, timeout=timeout) as r:
        # Fail early instead of saving an HTML error page as the PDF.
        r.raise_for_status()

        # Content-Length is optional (e.g. chunked responses); 0 means unknown.
        try:
            length = float(r.headers.get('content-length', 0))
        except ValueError:
            length = 0

        count = 0        # bytes written so far
        count_tmp = 0    # bytes written at the previous progress report
        time1 = time.time()
        with open(name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=512):
                if not chunk:
                    continue
                f.write(chunk)
                count += len(chunk)

                elapsed = time.time() - time1
                if elapsed > 2:
                    # Use the real elapsed time rather than assuming exactly 2 s.
                    speed = (count - count_tmp) / 1024 / 1024 / elapsed
                    if length > 0:
                        progress = formatFloat(count / length * 100) + '%'
                    else:
                        # Total size unknown: report megabytes received instead.
                        progress = formatFloat(count / 1024 / 1024) + 'M'
                    print(name + ': ' + progress + ' Speed: ' + formatFloat(speed) + 'M/S')
                    count_tmp = count
                    time1 = time.time()


def main():
    """Print the contents of ``records.txt``, then download every contract listed in ``error.txt``."""
    os.makedirs(file_dir, exist_ok=True)

    # Show the recorded lines, as the original script did.
    with open(record, "r") as f:
        print(f.readlines())

    with open(error, "r") as f:
        for line in f:
            contractNo = line.strip()
            if not contractNo:
                # Skip blank lines instead of requesting an empty contract number.
                continue
            contractUrl = CONTRACT_URL.format(contractNo)
            try:
                downloadFile(os.path.join(file_dir, "{}.pdf".format(contractNo)), contractUrl)
            except Exception as e:
                # Best effort: report the failing contract and carry on with the rest.
                print(e)
                print(contractNo)


if __name__ == "__main__":
    main()
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 10年+ .NET Coder 心语,封装的思维:从隐藏、稳定开始理解其本质意义
· .NET Core 中如何实现缓存的预热?
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· 10年+ .NET Coder 心语 ── 封装的思维:从隐藏、稳定开始理解其本质意义
· 地球OL攻略 —— 某应届生求职总结
· 提示词工程——AI应用必不可少的技术
· Open-Sora 2.0 重磅开源!
· 周边上新:园子的第一款马克杯温暖上架
2018-12-28 python 断言
2018-12-28 wireshark 下载 安装使用
2018-12-28 python 连加
2018-12-28 python 阶乘
2018-12-28 python input输入元素相加
2018-12-28 python input选择
2018-12-28 python 数据交换