在了解具体情况之前,我们需要先了解什么是 AJAX。AJAX 的英文全称是 Asynchronous JavaScript and XML,即异步的 JavaScript 和 XML。我们可以通过 AJAX 在不刷新整个页面的情况下请求页面数据,本例中它返回的数据是 JSON 格式的。
import csv
import json

from Chapter3 import download


def simpletest():
    '''
    Download one page of AJAX search results and save them to country.csv.

    The JSON response is expected to carry a ``records`` list whose items
    have ``area``, ``country`` and ``capital`` values; every record becomes
    one CSV row underneath a header row.  If the response cannot be parsed
    as JSON the error is printed and only the header row is written.
    :return: None
    '''
    fields = ('area', 'country', 'capital')
    # The context manager guarantees the CSV is flushed and closed, even if
    # the download or a malformed record raises part-way through.
    with open("country.csv", "w") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(fields)
        d = download.Downloader()
        html = d("http://example.webscraping.com/ajax/search.json?page=0&page_size=10&search_term=A")
        try:
            ajax = json.loads(html)
        except (TypeError, ValueError) as e:
            # ValueError: the body is not valid JSON.
            # TypeError: the body is not a string at all (e.g. None).
            print(str(e))
        else:
            for record in ajax['records']:
                writer.writerow([record[field] for field in fields])


if __name__ == "__main__":
    simpletest()
在开始之前,首先要安装 PySide,直接在命令行中运行 pip install pyside 即可。
from PySide.QtWebKit import *
from PySide.QtGui import *
from PySide.QtCore import *
import lxml.html


def simpletest():
    '''
    Render the dynamic example page in a WebKit view and read the text of
    the element whose id is ``result``.
    :return: text content of the first element matching ``#result``
    '''
    # The application object is created first and stays referenced while
    # the view is in use.
    application = QApplication([])
    view = QWebView()
    event_loop = QEventLoop()
    # Leave the local event loop as soon as the view reports the load is done.
    view.loadFinished.connect(event_loop.quit)
    target = QUrl("http://example.webscraping.com/places/default/dynamic")
    view.load(target)
    event_loop.exec_()
    rendered = view.page().mainFrame().toHtml()
    # Parse the rendered markup and pick out the element of interest.
    document = lxml.html.fromstring(rendered)
    matches = document.cssselect('#result')
    return matches[0].text_content()


content = simpletest()
print(content)
def getallcountry():
    '''
    Open the search page in a visible web view, fill in the search form
    (search term ``b``, page size ``10``) and click the search button via
    JavaScript, then hand control to the Qt main loop.
    :return: None (the call blocks inside ``app.exec_()``)
    '''
    app = QApplication([])
    browser = QWebView()
    load_loop = QEventLoop()
    # Stop the local event loop once the page has finished loading.
    browser.loadFinished.connect(load_loop.quit)
    browser.load(QUrl("http://example.webscraping.com/places/default/search"))
    load_loop.exec_()
    # Make the view visible so the interaction can be watched.
    browser.show()
    main_frame = browser.page().mainFrame()
    # Search term: b
    search_box = main_frame.findFirstElement('#search_term')
    search_box.setAttribute('value', 'b')
    # Page size: rewrite the text of the currently selected option to 10
    page_size_option = main_frame.findFirstElement('#page_size option:checked')
    page_size_option.setPlainText('10')
    # Trigger the search by clicking the button from JavaScript.
    search_button = main_frame.findFirstElement('#search')
    search_button.evaluateJavaScript('this.click()')
    app.exec_()
为了让以上几种方法更具通用性,我们可以把它们写在一个类中。这个类包含的功能有:下载、获取 HTML、查找相应的元素、设置属性值、设置文本值、点击,以及轮询页面并等待内容加载完成。
from PySide.QtCore import *
from PySide.QtGui import *
from PySide.QtWebKit import *
import time
import sys


class BrowserRender(QWebView):
    '''
    A QWebView with helpers for scripted browsing: load a page with a
    timeout, read the current HTML, find elements by CSS selector, set
    attributes or text on them, click them, and poll until elements appear.
    '''

    def __init__(self, show=True):
        '''
        Create the web view.
        :param show: if True the web view window is shown
        '''
        # Qt allows only one QApplication per process, so reuse the running
        # one when it exists; this lets several BrowserRender objects coexist.
        self.app = QApplication.instance() or QApplication(sys.argv)
        QWebView.__init__(self)
        if show:
            self.show()

    def download(self, url, timeout=60):
        '''
        Load the url and wait until it finishes loading or the timeout expires.
        :param url: the url to load
        :param timeout: maximum number of seconds to wait
        :return: the page html if loading finished in time, otherwise None
        '''
        loop = QEventLoop()
        timer = QTimer()
        timer.setSingleShot(True)
        # Whichever fires first - load finished or timer expired - ends the loop.
        timer.timeout.connect(loop.quit)
        self.loadFinished.connect(loop.quit)
        self.load(QUrl(url))
        # QTimer takes integer milliseconds; the cast allows fractional timeouts.
        timer.start(int(timeout * 1000))
        loop.exec_()
        if timer.isActive():
            # The timer is still running, so the page finished loading first.
            timer.stop()
            return self.html()
        print("Request time out " + url)
        return None

    def html(self):
        '''
        Shortcut to return the html of the current page.
        :return: html of the main frame
        '''
        return self.page().mainFrame().toHtml()

    def find(self, pattern):
        '''
        Find all elements that match the pattern.
        :param pattern: CSS selector
        :return: the collection of matching elements
        '''
        return self.page().mainFrame().findAllElements(pattern)

    def attr(self, pattern, name, value):
        '''
        Set an attribute on every element matching the pattern.
        :param pattern: CSS selector
        :param name: attribute name
        :param value: attribute value
        :return: None
        '''
        for e in self.find(pattern):
            e.setAttribute(name, value)

    def text(self, pattern, value):
        '''
        Set the plain text of every element matching the pattern.
        :param pattern: CSS selector
        :param value: the new text
        :return: None
        '''
        for e in self.find(pattern):
            e.setPlainText(value)

    def click(self, pattern):
        '''
        Click every element matching the pattern.
        :param pattern: CSS selector
        :return: None
        '''
        for e in self.find(pattern):
            e.evaluateJavaScript("this.click()")

    def wait_load(self, pattern, timeout=60):
        '''
        Wait until the pattern is found and return the matches.
        :param pattern: CSS selector
        :param timeout: maximum number of seconds to wait
        :return: the matching elements, or None if the timeout expired
        '''
        deadline = time.time() + timeout
        while time.time() < deadline:
            # Let Qt process pending events so the page can keep updating.
            self.app.processEvents()
            matches = self.find(pattern)
            if matches:
                return matches
            # Pause briefly instead of spinning the CPU between polls.
            time.sleep(0.01)
        print("wait load timed out")
        return None


br = BrowserRender()
br.download("http://example.webscraping.com/places/default/search")
br.attr('#search_term', 'value', '.')
br.text('#page_size option:checked', '1000')
br.click('#search')
# wait_load returns None on timeout; fall back to an empty list so the
# comprehension below does not fail with a TypeError.
elements = br.wait_load('#results a') or []
countries = [e.toPlainText().strip() for e in elements]
print(countries)
在调用的时候,一定要注意把 pattern 写对。我就曾把 #results a 写成了 #result a,导致一直出现超时(time out)的现象。
Selenium 是一个能够与页面交互的简单接口,它提供了实现浏览器自动化的 API。Selenium 的使用非常简单,它已经把我们需要的功能都封装好了,我们只需要调用相应的函数即可。
from selenium import webdriver


def simpleuse():
    '''
    Drive Chrome through the example search page and print the text of
    every result link.

    The search term is set to ``.``, the text of the second page-size
    option is rewritten to ``1000`` via JavaScript, and the search button
    is clicked.
    :return: None
    '''
    driver = webdriver.Chrome()
    try:
        driver.get("http://example.webscraping.com/places/default/search")
        driver.find_element_by_id("search_term").send_keys('.')
        js = "document.getElementById('page_size').options[1].text='1000'"
        driver.execute_script(js)
        driver.find_element_by_id('search').click()
        # Let the element lookup below wait up to 30 seconds for the results.
        driver.implicitly_wait(30)
        # find_elements (plural) returns a list of every match; the singular
        # form returns one element, which cannot be iterated.
        links = driver.find_elements_by_css_selector("#results a")
        countries = [link.text for link in links]
        print(countries)
    finally:
        # Close the browser window even if one of the steps above fails.
        driver.close()


if __name__ == "__main__":
    simpleuse()
这个问题已经解决了:只需要去官网下载对应版本的 chromedriver.exe,然后把它的绝对路径传给 webdriver.Chrome(绝对路径) 即可。现在的代码如下:
from selenium import webdriver


def simpleuse(driver_path=r'E:\chromedriver\chromedriver.exe'):
    '''
    Drive Chrome through the example search page and print the text of
    every result link.

    The search term is set to ``.``, the text of the second page-size
    option is rewritten to ``1000`` via JavaScript, and the search button
    is clicked.
    :param driver_path: absolute path of the chromedriver executable
    :return: None
    '''
    # The default is a raw string so the backslashes of the Windows path are
    # never interpreted as escape sequences.
    driver = webdriver.Chrome(driver_path)
    try:
        driver.get("http://example.webscraping.com/places/default/search")
        driver.find_element_by_id("search_term").send_keys('.')
        js = "document.getElementById('page_size').options[1].text='1000'"
        driver.execute_script(js)
        driver.find_element_by_id('search').click()
        # Let the element lookup below wait up to 30 seconds for the results.
        driver.implicitly_wait(30)
        links = driver.find_elements_by_css_selector("#results a")
        countries = [link.text for link in links]
        print(countries)
    finally:
        # Close the browser window even if one of the steps above fails.
        driver.close()


if __name__ == "__main__":
    simpleuse()