IEEE Xplore批量下载2018,selenium右键另存为

语言:python

需要安装的包:selenium、pywin32、requests、beautifulsoup4、lxml,以及与本机 Chrome 版本匹配的 chromedriver

需要安装的软件:Chrome

下载频率请不要过于频繁。

将代码中的 url 替换为目标期刊/会议目录页的链接,即可下载该页面中列出的所有文章。

右键另存为是通过 win32api 模拟键盘按键实现的,因此脚本只能在 Windows 上运行,且运行期间需保持 Chrome 窗口处于前台。

 1 '''
 2 @author:Gawen
 3 '''
 4 import requests
 5 from bs4 import BeautifulSoup
 6 from selenium import webdriver
 7 from selenium.webdriver.common.action_chains import ActionChains
 8 from selenium.webdriver.common.keys import Keys
 9 import time
10 import win32api
11 import win32con
12 
# Batch-download every article PDF listed on one IEEE Xplore issue page.
#
# Step 1 scrapes the listing page with requests + BeautifulSoup and collects
# the PDF viewer link of each article title. Step 2 opens every link in
# Chrome and saves it through the browser's native "Save as" flow, which is
# driven with synthetic key presses (win32api), so the Chrome window must
# keep keyboard focus while the script runs.

# Replace `url` to download all articles listed on a different page.
url = 'https://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=8240062&punumber=8240062&filter=issueId%20EQ%20%228252286%22&rowsPerPage=75&pageNumber=2&rowsPerPage=75'
fore = 'https://ieeexplore.ieee.org'
s = 'Download or View the PDF:'
sites = []      # absolute PDF links that were found
errtitle = []   # titles for which no PDF link was found on the page


def _tap_key(vk_code):
    """Press and release one virtual key through win32api.keybd_event."""
    win32api.keybd_event(vk_code, 0, 0, 0)
    win32api.keybd_event(vk_code, 0, win32con.KEYEVENTF_KEYUP, 0)


# A timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status stops early instead of parsing an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
html = r.content.decode('utf-8')
soup = BeautifulSoup(html, 'lxml')

results = soup.find('div', class_='cf jrnl-results-filter')
if results is None:
    # Without this container there are no titles to look up; fail with a
    # clear message rather than an AttributeError on None.
    raise SystemExit('Result list not found on the page; check the url.')

h3text = [h.text.strip() for h in results.find_all('h3')]
print(h3text)

for title in h3text:
    # The PDF anchor is located by its aria-label, which is the fixed
    # prefix `s`, two spaces, and the article title.
    link = soup.find('a', attrs={'aria-label': s + '  ' + title})
    if link is None:
        errtitle.append(title)
        continue
    pdf = link['href']
    print(fore + pdf)
    sites.append(fore + pdf)

driver = webdriver.Chrome()
try:
    driver.maximize_window()
    # Set once up front so it applies to every element lookup below.
    driver.implicitly_wait(20)
    for site in sites:
        driver.get(site)
        # The locator-string form works on both Selenium 3 and Selenium 4
        # (find_element_by_css_selector was removed in Selenium 4.3).
        element = driver.find_element('css selector', 'body')
        time.sleep(3)
        ActionChains(driver).context_click(element).perform()
        # NOTE(review): 'A' is presumably the context-menu accelerator for
        # "Save as" in the author's Chrome locale -- confirm for yours.
        _tap_key(ord('A'))
        time.sleep(1)
        # Alt+S in the save dialog (presumably the Save button accelerator).
        win32api.keybd_event(win32con.VK_MENU, 0, 0, 0)
        try:
            _tap_key(ord('S'))
        finally:
            # Always release Alt so the modifier is never left held down.
            win32api.keybd_event(win32con.VK_MENU, 0, win32con.KEYEVENTF_KEYUP, 0)
        # Give the download time to finish before loading the next article.
        time.sleep(20)
finally:
    # quit() ends the whole browser session and the chromedriver process,
    # even when a page load or lookup above raised.
    driver.quit()

print(str(len(errtitle))+'篇文章下载失败,分别为:')
for title in errtitle:
    print(title+'\n')

 

posted @ 2018-10-11 11:19  Gawen_NEU  阅读(1239)  评论(0)    收藏  举报