# Selenium + browser: paginated crawling script (original blog title)

import re  # regex parsing of result-page HTML
from multiprocessing import Pool  # process pool (used only by the commented variants)

from pyquery import PyQuery as pq  # HTML parsing library
from requests.exceptions import RequestException
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from config import *  # local config.py: SERVICE_ARGS etc.

print(SERVICE_ARGS)
# bro = webdriver.Chrome() # 引用浏览器Chrome 提前要安装chromedriver.exe
bro = webdriver.PhantomJS(service_args=SERVICE_ARGS) #引用无界面 PhantomJS 不在弹框 只在后台运行
# wait = WebDriverWait(bro, 10) # selenium 属性设置

# bro.set_window_size(1400, 900) # 设置 浏览器窗口宽度

def search():
    print('正在搜索...')
    try:
        bro.get('http://search.zongheng.com')
        input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#commSearch > div > input.search-text.fl'))) # 使用css属性定位
        submit = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#commSearch > div > input.search-btn.fr')))# 使用css属性定位

        input.send_keys('都市')#输入文字
        submit.click() #点击按钮
        total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#totalPage')))# 使用css属性定位
        get_pr()
        return total.text # 返回HTML上的文字
    except TimeoutException:
        search()
def next_page(total):
    print('正在翻页',total)
    try:
        input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'body > div.wrap > div.search-html-box.clearfix > div.search-main.fl > div.search-tab > div.search_d_pagesize > input.search_d_page_value')))# 使用css属性定位
        submit = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'body > div.wrap > div.search-html-box.clearfix > div.search-main.fl > div.search-tab > div.search_d_pagesize > input.search_d_page_submit')))# 使用css属性定位
        input.clear()#清除里面的内容
        input.send_keys(total)#输入文字
        submit.click()# 点击
        # 判断当前是否加载完毕
        wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, 'body > div.wrap > div.search-html-box.clearfix > div.search-main.fl > div.search-tab > div.search_d_pagesize > a.active'), str(total)))
        get_pr()
    except RequestException:
        next_page(total)

def get_pr():
    wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'body > div.wrap > div.search-html-box.clearfix > div.search-main.fl > div.search-tab')))# 使用css属性定位
    html = bro.page_source
    doc = pq(html)
    items = doc('body > div.wrap > div.search-html-box.clearfix > div.search-main.fl > div.search-tab').items() #使用css属性定位 获取属性下的html
    for item in items:
        html = re.compile('.*?src="(.*?)" onerro.*?class="tit"><a href="(.*?)" target.*?">(.*?)</a>.*?}">(.*?)</a>.*?">(.*?)</a>.*?</em><span>(.*?)</span>.*?em><span>(.*?)</span>.*?<p>.*?(.*?)</p>',re.S)#正则
        html = re.findall(html,str(item))#比配
        for ii in html:
            # print(ii)
            product = {
                # '图片': item.find('.imgbox img').attr('scr'),
                # '图片': item.find('.se-result-infos .tit').text(),
                '图片': ii[0],
                '地址': ii[1],
                '书名': ii[2],
                '作者': ii[3] + ii[4],
                '连载': ii[6],
                '介绍': ii[7]
            }
            print(product)
        # print(html)
    # print(item)

def main():
    total = search()
    # total = int(re.compile('(\d+)').search(totla).group(1))
    for i in range(2, int(total) + 1):
        next_page(i)
        # break
    # print(total)
    pool = Pool()
    # pool.map([next_page(i) for i in range(2, int(total) + 1)]) #多线程
    bro.close() # 关闭浏览器
if __name__ == '__main__':
    # pool = Pool()
    # pool.map(main)
    # main()
    pass

 

posted on 2019-06-26 15:40  ||子义  阅读(692)  评论(0编辑  收藏  举报

导航