# Selenium + headless-browser paginated crawler for zongheng.com search results.
# (Original header was a bare non-comment line — made it a comment so the file parses.)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# bugfix: TimeoutException was caught below but never imported (NameError in the handler)
from selenium.common.exceptions import TimeoutException
import re  # regex extraction of book fields from each result's HTML
from requests.exceptions import RequestException  # kept: present in original imports
from multiprocessing import Pool  # kept: present in original imports (pool use is commented out)
from pyquery import PyQuery as pq  # HTML parsing
from config import *  # project config; expected to define SERVICE_ARGS (PhantomJS CLI args)

print(SERVICE_ARGS)
# bro = webdriver.Chrome()  # visible-Chrome alternative (requires chromedriver on PATH)
# NOTE(review): PhantomJS support was removed in Selenium 4 — consider headless Chrome/Firefox.
bro = webdriver.PhantomJS(service_args=SERVICE_ARGS)  # headless browser, runs without a window
# bugfix: `wait` was commented out in the original but is used by every function below
wait = WebDriverWait(bro, 10)
bro.set_window_size(1400, 900)

# Hoisted out of the per-item loop in get_pr(): compile the field-extraction regex once.
# Pattern preserved byte-for-byte from the original. Groups:
# 0 cover-image src, 1 book URL, 2 title, 3+4 author, 5 (unused), 6 serial status, 7 intro.
_BOOK_RE = re.compile(
    '.*?src="(.*?)" onerro.*?class="tit"><a href="(.*?)" target.*?">(.*?)</a>.*?}">(.*?)</a>.*?">(.*?)</a>.*?</em><span>(.*?)</span>.*?em><span>(.*?)</span>.*?<p>.*?(.*?)</p>',
    re.S)


def search():
    """Open the search page, query '都市', scrape page 1, and return the total-page text.

    Returns:
        str: the text of the '#totalPage' element (total number of result pages).
    """
    print('正在搜索...')
    try:
        bro.get('http://search.zongheng.com')
        search_box = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#commSearch > div > input.search-text.fl')))
        submit = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#commSearch > div > input.search-btn.fr')))
        search_box.send_keys('都市')  # fixed search keyword ("urban" genre)
        submit.click()
        total = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#totalPage')))
        get_pr()  # scrape the first result page while we're on it
        return total.text
    except TimeoutException:
        # bugfix: original dropped the retry's return value (returned None on retry)
        return search()


def next_page(page_number):
    """Jump to result page *page_number* via the page-number box, then scrape it.

    Args:
        page_number (int): 1-based page index to navigate to.
    """
    print('正在翻页', page_number)
    try:
        page_box = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR,
             'body > div.wrap > div.search-html-box.clearfix > div.search-main.fl > div.search-tab > div.search_d_pagesize > input.search_d_page_value')))
        submit = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR,
             'body > div.wrap > div.search-html-box.clearfix > div.search-main.fl > div.search-tab > div.search_d_pagesize > input.search_d_page_submit')))
        page_box.clear()
        page_box.send_keys(page_number)
        submit.click()
        # Confirm the page actually changed: the highlighted pager link must show our number.
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR,
             'body > div.wrap > div.search-html-box.clearfix > div.search-main.fl > div.search-tab > div.search_d_pagesize > a.active'),
            str(page_number)))
        get_pr()
    except TimeoutException:
        # bugfix: Selenium waits raise TimeoutException, not requests' RequestException,
        # so the original handler could never fire.
        next_page(page_number)


def get_pr():
    """Parse the currently loaded result page and print one dict per book found."""
    wait.until(EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR,
         'body > div.wrap > div.search-html-box.clearfix > div.search-main.fl > div.search-tab')))
    doc = pq(bro.page_source)
    items = doc('body > div.wrap > div.search-html-box.clearfix > div.search-main.fl > div.search-tab').items()
    for item in items:
        for fields in _BOOK_RE.findall(str(item)):
            product = {
                '图片': fields[0],
                '地址': fields[1],
                '书名': fields[2],
                '作者': fields[3] + fields[4],
                '连载': fields[6],
                '介绍': fields[7],
            }
            print(product)


def main():
    """Run the search, then walk every remaining result page, then close the browser."""
    total = search()
    # '#totalPage' text is a bare number — int() directly, as in the original.
    for page in range(2, int(total) + 1):
        next_page(page)
    bro.close()  # release the browser process


if __name__ == '__main__':
    # NOTE(review): entry point was deliberately disabled in the original; left disabled.
    # main()
    pass