IEEE Xplore批量下载2018,selenium右键另存为
语言:python
需要安装的包: selenium, pywin32, requests, beautifulsoup4, lxml;另外还需要与本机 Chrome 版本匹配的 chromedriver(浏览器驱动程序,不是 Python 包)
需要安装的软件:Chrome
下载频率请不要过于频繁。
替换url可以下载当前链接页面中所有的文章。
右键菜单中的“另存为”是通过 win32api 模拟键盘按键来完成的,因此脚本只能在 Windows 上运行,运行期间请不要操作键盘或切换窗口。
'''
Batch-download every article PDF listed on one IEEE Xplore issue page.

The issue page is fetched with requests and parsed with BeautifulSoup to
collect one PDF link per article title. Each link is then opened in Chrome
through selenium and stored with the context menu's "Save as" entry, which is
driven by keystrokes synthesized with win32api (so this only runs on Windows,
and the Chrome window must keep keyboard focus while the script is running).

@author:Gawen
'''
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import win32api
import win32con

# Replace url to download all the articles listed on that page.
url = 'https://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=8240062&punumber=8240062&filter=issueId%20EQ%20%228252286%22&rowsPerPage=75&pageNumber=2&rowsPerPage=75'
fore = 'https://ieeexplore.ieee.org'
# Prefix of the aria-label carried by each article's PDF anchor.
s = 'Download or View the PDF:'
sites = []      # absolute PDF URLs to download
errtitle = []   # titles for which no PDF link was found on the page

# Virtual-key codes for the synthesized keystrokes.
VK_A = 0x41  # 65, 'A'
VK_S = 0x53  # 83, 'S'


def _key_down(vk):
    """Synthesize a key-press event for virtual-key code *vk*."""
    win32api.keybd_event(vk, 0, 0, 0)


def _key_up(vk):
    """Synthesize a key-release event for virtual-key code *vk*."""
    win32api.keybd_event(vk, 0, win32con.KEYEVENTF_KEYUP, 0)


def _tap_key(vk):
    """Press and release *vk* so the key is never left stuck down."""
    _key_down(vk)
    _key_up(vk)


# --- Collect the PDF links from the issue page -----------------------------
r = requests.get(url, timeout=30)  # do not hang forever on a stalled server
r.raise_for_status()               # fail early instead of parsing an error page
html = r.content.decode('utf-8')
soup = BeautifulSoup(html, 'lxml')

results = soup.find('div', class_='cf jrnl-results-filter')
if results is None:
    raise RuntimeError(
        'results container "cf jrnl-results-filter" not found in ' + url
    )

# Article titles are the <h3> elements inside the results container.
h3text = [h.text.strip() for h in results.find_all('h3')]
print(h3text)

for title in h3text:
    # The PDF anchor is located through its aria-label: "<prefix> <title>".
    link = soup.find('a', attrs={'aria-label': s + ' ' + title})
    if link is None:
        errtitle.append(title)
        continue
    pdf = link['href']
    print(fore + pdf)
    sites.append(fore + pdf)

# --- Open every PDF in Chrome and save it through the context menu ---------
driver = webdriver.Chrome()
try:
    driver.maximize_window()
    # Configure the implicit wait once, before the first element lookup.
    driver.implicitly_wait(20)
    for site in sites:
        driver.get(site)
        element = driver.find_element(By.CSS_SELECTOR, 'body')
        time.sleep(3)  # give the PDF viewer time to render
        ActionChains(driver).context_click(element).perform()
        # 'A' is presumably the accelerator of the "Save as" entry in the
        # author's Chrome locale -- adjust if your context menu differs.
        _tap_key(VK_A)
        time.sleep(1)  # wait for the save dialog to open
        # Alt+S: presumably the accelerator of the dialog's Save button.
        _key_down(win32con.VK_MENU)
        _tap_key(VK_S)
        _key_up(win32con.VK_MENU)
        time.sleep(20)  # throttle: let the download finish, spare the server
finally:
    # quit() ends the chromedriver process as well, even when a step fails.
    driver.quit()

print(str(len(errtitle))+'篇文章下载失败,分别为:')
for title in errtitle:
    print(title+'\n')