selenium模拟浏览器对搜狗微信文章进行爬取
在上一篇博客中,我使用 redis 维护的代理池抓取微信文章。程序开始运行良好,但之后运行时总是报 501 错误,而用浏览器却能正常打开同一网页,调试了很多次仍然出错。既然这种方法行不通,那就改用 selenium 模拟浏览器来获取搜狗微信文章的详情页面信息;获取页面信息后,仍然用 pyquery 库进行解析,这样就可以正常获得微信文章的 url,再通过这个 url 获得微信文章的信息。
代码如下:
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from weixin.weixin.weixin_article import WeixinArticle
from requests.exceptions import ConnectionError
from pyquery import PyQuery as pq


class SeleniumWeixinArticle(WeixinArticle):
    """Fetch Sogou WeChat search pages through a Selenium-driven Chrome.

    Subclasses ``WeixinArticle`` and overrides only ``get_html``; the rest of
    the crawl logic is inherited unchanged.
    """

    # Proxy ("host:port") the next request should go through; None means a
    # direct connection.
    proxy = None
    # Location of the chromedriver binary; override in a subclass or on the
    # instance to run on another machine.
    chromedriver_path = "C:/codeapp/seleniumDriver/chrome/chromedriver.exe"
    # Seconds to wait for the result page to render before giving up.
    page_wait_seconds = 10

    def __init__(self):
        """Start the browser and its explicit wait, then run the base initialiser."""
        # Proxy the running browser was launched with; compared against
        # self.proxy to decide when the browser has to be restarted.
        self._browser_proxy = None
        self.browser = self._create_browser()
        self.wait = WebDriverWait(self.browser, self.page_wait_seconds)
        super(SeleniumWeixinArticle, self).__init__()

    def _create_browser(self, proxy=None):
        """Launch a Chrome instance, routed through *proxy* when one is given."""
        chrome_options = webdriver.ChromeOptions()
        if proxy:
            chrome_options.add_argument('--proxy-server=http://' + proxy)
        # NOTE(review): keyword names follow the Selenium 3-style API this
        # file already relies on (executable_path) - confirm against the
        # installed Selenium version.
        return webdriver.Chrome(executable_path=self.chromedriver_path,
                                options=chrome_options)

    def _sync_browser_with_proxy(self):
        """Return a browser that uses ``self.proxy``, restarting Chrome if needed.

        Chrome only reads ``--proxy-server`` at start-up, so changing the proxy
        requires a new browser. The explicit wait is rebuilt as well because it
        is bound to a specific driver instance.
        """
        if self.proxy != self._browser_proxy:
            stale_browser = self.browser
            # Create the replacement first so a launch failure leaves the
            # previous browser in place.
            self.browser = self._create_browser(self.proxy)
            self.wait = WebDriverWait(self.browser, self.page_wait_seconds)
            self._browser_proxy = self.proxy
            stale_browser.quit()
        return self.browser

    def get_html(self, url, count=1):
        """Return the page source of *url*, or None when it cannot be fetched.

        Overrides ``WeixinArticle.get_html``. The page is loaded in the
        browser; if loading fails or the browser ends up on a different URL,
        a new proxy is requested from the pool and the request is retried.

        :param url: address of the Sogou WeChat result page.
        :param count: number of the current attempt; attempts stop once it
            reaches ``self.max_count``.
        :return: HTML of the page, or None if *url* is empty, no replacement
            proxy could be obtained after a redirect, or attempts ran out.
        """
        if not url:
            return None

        # Bounded retry loop; every failed attempt advances ``count`` so the
        # number of tries can never exceed max_count.
        while count < self.max_count:
            print('crowling url ', url)
            print('crowling count ', count)
            try:
                browser = self._sync_browser_with_proxy()
                # get() returns None; the content is read from page_source.
                browser.get(url)
                # The "next page" link marks a fully rendered result page.
                self.wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#sogou_next")))
                if browser.current_url == url:
                    return browser.page_source

                # Landing on another URL means the request was redirected.
                print("must change ip proxy ")
                replacement = self.get_proxy(self.proxy_pool_url)
                if not replacement:
                    print("get proxy is faired ")
                    return None
                self.proxy = replacement
            except (ConnectionError, WebDriverException):
                # WebDriverException also covers the TimeoutException raised
                # by wait.until() when the expected element never appears.
                # A failed lookup leaves proxy as None, i.e. direct connection.
                self.proxy = self.get_proxy(self.proxy_pool_url)
            count += 1

        print("try many count ")
        return None


if __name__ == "__main__":
    weixin_article = SeleniumWeixinArticle()
    try:
        weixin_article.run()
    finally:
        # Always shut Chrome down so no browser/driver process is left behind.
        weixin_article.browser.quit()
程序较为简单,主要是重写 WeixinArticle 中的 get_html 方法,其他的逻辑不变,这也是面向对象编程的好处。
程序结构逻辑如下: