Python 爬虫-抓取中小企业股份转让系统公司公告的链接并下载
运行环境:macOS
用到的 Python 库及工具:selenium、PhantomJS(无头浏览器)等
由于中小企业股份转让系统的网页使用了 JavaScript 动态生成内容,无法用传统的 requests、BeautifulSoup 库直接获取想要的下载链接,所以这次使用 selenium 和 PhantomJS,这两个工具的安装方法百度搜索就可以找到。本次代码只抓取一页的下载链接并下载 PDF 文档。
抓取源代码如下:
import os
import re
import time  # not used below; kept because other parts of the file may rely on it
from http.client import HTTPException
from urllib.parse import unquote, urlsplit
from urllib.request import urlretrieve

from selenium import webdriver
from selenium.common.exceptions import WebDriverException


class DownloadFiles():
    """Download every PDF linked from one page of the NEEQ announcement list.

    The page is rendered with a PhantomJS-driven browser (via selenium), every
    ``<a>`` element whose ``href`` ends in ``pdf`` is collected, and each
    target is saved into a ``Files`` directory next to this script.
    """

    # Characters that are not allowed in file names on common file systems.
    _UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]+')

    def __init__(self):
        self.url = 'http://www.neeq.com.cn/disclosure/announcement.html'
        # Resolve to an absolute path: makedir() changes the working
        # directory, which would invalidate a relative base path.
        self.basePath = os.path.dirname(os.path.abspath(__file__))

    def makedir(self, name):
        """Ensure ``basePath/name`` exists and make it the working directory.

        Args:
            name: Directory name, relative to ``self.basePath``.

        Returns:
            The full path of the directory.
        """
        path = os.path.join(self.basePath, name)
        if not os.path.exists(path):
            # exist_ok guards against the directory appearing between the
            # existence check and the creation call.
            os.makedirs(path, exist_ok=True)
            print('File has been created.')
        else:
            print('The file is existed.')
        # Switch into the directory (kept for callers relying on it).
        os.chdir(path)
        return path

    def connect(self, url):
        """Start a PhantomJS driver, load ``url`` and return the driver.

        The caller owns the returned driver and must call ``quit()`` on it.
        """
        driver = webdriver.PhantomJS()
        driver.get(url)
        return driver

    @staticmethod
    def _safeFileName(title, link):
        """Build a ``.pdf`` file name from the link text, falling back to the URL."""
        unsafe = DownloadFiles._UNSAFE_CHARS
        name = unsafe.sub('_', title or '').strip(' .')
        if not name:
            # No usable link text: use the last URL path segment instead.
            lastSegment = os.path.basename(unquote(urlsplit(link).path))
            name = unsafe.sub('_', os.path.splitext(lastSegment)[0]).strip(' .')
        return (name or 'download') + '.pdf'

    def getFiles(self):
        """Download all PDF links found on ``self.url`` into ``Files``.

        Links that cannot be read or downloaded are reported on stdout and
        skipped, so one bad link does not abort the remaining downloads.
        """
        driver = self.connect(self.url)
        try:
            targetDir = self.makedir('Files')
            for anchor in driver.find_elements_by_tag_name('a'):
                try:
                    link = anchor.get_attribute('href')
                    title = anchor.text
                except WebDriverException as err:
                    print('Skipping unreadable link: %s' % err)
                    continue

                # Anchors without an href yield None; skip them explicitly.
                if not link or not link.lower().endswith('pdf'):
                    continue

                print(title)
                print(link)
                fileName = self._safeFileName(title, link)
                try:
                    urlretrieve(link, os.path.join(targetDir, fileName))
                except (OSError, ValueError, HTTPException) as err:
                    # OSError covers URLError and file-system errors;
                    # ValueError covers unsupported URL schemes.
                    print('Failed to download %s: %s' % (link, err))
        finally:
            # Always shut the browser process down, even on failure.
            driver.quit()


if __name__ == '__main__':
    obj = DownloadFiles()
    obj.getFiles()