亲测,完全有效,使用webdriver,自写,裁判文书网,批量全部下载
直接上代码(注意改用户名,密码)
# -*- coding: utf-8 -*-
# Author: zhou bo
# Datetime : 2020
# software: PyCharm
"""Bulk-download judgment documents from wenshu.court.gov.cn with Selenium.

The script logs in through the site's login iframe, opens the result list
for the keyword "合同纠纷" and then, page after page, clicks the batch
download controls.
"""
import logging
import math
import os
import random
import time
from datetime import datetime

import pyautogui
from retrying import retry
from selenium import webdriver
from selenium.common.exceptions import *
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

from crawler_tools import user_agent as u

# Result list for the keyword "合同纠纷" (URL-encoded in the ``s21`` parameter).
_RESULT_LIST_URL = (
    "https://wenshu.court.gov.cn/website/wenshu/181217BMTKHNT2W0/index.html"
    "?pageId=b67ff15b548ff825d1e09dc899ecf778"
    "&s21=%E5%90%88%E5%90%8C%E7%BA%A0%E7%BA%B7"
)


def login(driver, phone="手机号", password="密码"):
    """Log in, open the result list and download every page of results.

    Args:
        driver: A Selenium WebDriver that already has the site loaded.
        phone: Phone number typed into the login form. The default is the
            placeholder from the original script and must be replaced with
            a real account.
        password: Password typed into the login form. The default is a
            placeholder as well.

    The function returns once ``next_page`` times out, i.e. when no
    next-page link becomes clickable within its wait. Any other failure
    (login, page-size switch, download buttons) still raises.
    """
    wait = WebDriverWait(driver, 20)
    driver.refresh()

    # The login form is rendered inside an iframe, so switch into it first.
    login_frame = wait.until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="contentIframe"]'))
    )
    driver.switch_to.frame(login_frame)

    phone_input = wait.until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="phoneNumber"]'))
    )
    phone_input.send_keys(phone)
    time.sleep(1)

    password_input = wait.until(
        EC.presence_of_element_located(
            (By.XPATH, '/html/body/app-root/div/app-login/div/div/form/div/div[2]/input')
        )
    )
    password_input.send_keys(password)
    # Original author's note: one second works best; shorter pauses lead to
    # a "network error" from the site.
    time.sleep(1)

    submit_button = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, '.custom-button'))
    )
    submit_button.click()

    # Leave the iframe again, otherwise elements of the main page cannot be
    # located afterwards.
    driver.switch_to.default_content()

    # Type the keyword into the search box of the main page. Waiting for the
    # box also gives the page time to come back after the login submit.
    search_box = wait.until(
        EC.presence_of_element_located(
            (By.XPATH, '//*[@id="_view_1540966814000"]/div/div[1]/div[2]/input')
        )
    )
    search_box.send_keys("合同纠纷")
    time.sleep(2)  # same pacing note as above: shorter pauses cause errors

    # Jump straight to the result list for the keyword.
    driver.get(_RESULT_LIST_URL)

    five_to_15(driver)
    down_load(driver)
    while True:
        try:
            next_page(driver)
        except TimeoutException:
            # No clickable next-page link showed up within the wait:
            # treat it as the end of the result list instead of crashing.
            logging.warning("No further result page became clickable; stopping.")
            break
        time.sleep(2)
        down_load(driver)
def five_to_15(driver):
    """Pick the third option of the drop-down in the result view.

    Judging by the function name this presumably switches the page size
    from 5 to 15 results per page (TODO confirm against the live page).

    Args:
        driver: Selenium WebDriver showing the result list.
    """
    wait = WebDriverWait(driver, 20)
    page_size_select = wait.until(
        EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="_view_1545184311000"]/div[8]/div/select')
        )
    )
    page_size_select.click()
    third_option = wait.until(
        EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="_view_1545184311000"]/div[8]/div/select/option[3]')
        )
    )
    third_option.click()
    time.sleep(1)


def down_load(driver):
    """Click the first and then the third link of the result toolbar.

    The original variable names suggest these are the "select all" and
    "batch download" controls (TODO confirm against the live page).

    Args:
        driver: Selenium WebDriver showing the result list.
    """
    wait = WebDriverWait(driver, 20)
    select_link = wait.until(
        EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="_view_1545184311000"]/div[2]/div[4]/a[1]')
        )
    )
    select_link.click()
    # Original author's note: the pause is needed; shorter waits lead to a
    # "network error" from the site.
    time.sleep(2)
    download_link = wait.until(
        EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="_view_1545184311000"]/div[2]/div[4]/a[3]')
        )
    )
    download_link.click()


def next_page(driver):
    """Click the last link inside the ``left_7_3`` pager container.

    Raises whatever ``WebDriverWait.until`` raises when that link does not
    become clickable within 20 seconds.

    Args:
        driver: Selenium WebDriver showing the result list.
    """
    wait = WebDriverWait(driver, 20)
    last_pager_link = wait.until(
        EC.element_to_be_clickable((By.XPATH, '//div[@class="left_7_3"]/a[last()]'))
    )
    time.sleep(2)
    last_pager_link.click()


if __name__ == "__main__":
    # Raw string: the Windows path contains backslashes that would otherwise
    # be parsed as (invalid) escape sequences.
    # NOTE(review): passing the driver path positionally matches older
    # Selenium releases; newer ones expect a Service object - confirm the
    # installed version.
    driver = webdriver.Chrome(r'E:\Google\Driver\chromedriver.exe')
    try:
        driver.get("https://wenshu.court.gov.cn/website/wenshu/181217BMTKHNT2W0/index.html?pageId=b67ff15b548ff825d1e09dc899ecf778&s21=%E5%90%88%E5%90%8C%E7%BA%A0%E7%BA%B7")
        time.sleep(5)  # give the page time to load before logging in
        login(driver)
    finally:
        # Always shut down the browser and the chromedriver process, even
        # when the crawl stops with an exception.
        driver.quit()