220702 Selenium crawler study notes

1.

Today I continued practicing Selenium. Some testing showed that the target site paginates in two ways: you can type a page number into a jump box, or click a page number directly. The two cases follow different ideas and can be handled with different code. Let's look at the first one. The basics are already in the 20220630 note, so the only thing to check here is how to press Enter after typing the page number, because the target site's jump box has no confirm or next-page button (reference: https://www.gaoyuanqi.cn/python-selenium-send_keys/):

import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
driver.get('https://www.gaoyuanqi.cn/python-html-1/#more')

# wait 2 s
time.sleep(2)

# locate the input box by its name attribute; this returns a WebElement
webelement = driver.find_element_by_name('t2')

# type some text
webelement.send_keys('雨园博客')
time.sleep(2)

# delete the last character
webelement.send_keys(Keys.BACKSPACE)
time.sleep(2)

# Ctrl+A: select all
webelement.send_keys(Keys.CONTROL, 'a')
# Ctrl+X: cut
webelement.send_keys(Keys.CONTROL, 'x')
time.sleep(2)

# type some text
webelement.send_keys('1314')
time.sleep(2)

# press Enter
webelement.send_keys(Keys.ENTER)
time.sleep(2)

# Ctrl+V: paste
webelement.send_keys(Keys.CONTROL, 'v')

time.sleep(5)

# quit the driver and close the browser
driver.quit()
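
A side note: the find_element_by_* helpers used above were deprecated in Selenium 4 and removed in version 4.3, so the snippet only runs on older Selenium releases. On a newer install the same demo can be written with the By locator API; a minimal sketch, assuming the same demo page and the same input named 't2':

import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
driver.get('https://www.gaoyuanqi.cn/python-html-1/#more')
time.sleep(2)

# Selenium 4 style: pass a By strategy plus the locator value
webelement = driver.find_element(By.NAME, 't2')
webelement.send_keys('1314')
webelement.send_keys(Keys.ENTER)

time.sleep(2)
driver.quit()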

What about the second mechanism, paging by clicking directly?

Just locate the paging control with a selector and click it; in the reference code below the "next page" button is found with a CSS selector and clicked. Let's try that implementation first (reference: https://www.jianshu.com/p/fa07cd0b21aa).

# -*- coding: utf-8 -*-
# @Author  : frank_lee

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
import time
from scrapy.selector import Selector


class ZfCaigou():
    """
    """
    def __init__(self):
        self.url = 'http://www.zjzfcg.gov.cn/purchaseNotice/index.html?categoryId=3001'
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 30)  # explicit-wait timeout in seconds
        self.zoom = 1

    def get_info(self):
        self.driver.get(self.url)
        self.driver.maximize_window()
        time.sleep(5)
        i = 0
        while i < 8:  # page count is arbitrary here; you could also define a total_page attribute and use self.total_page
            time.sleep(2)
            data = self.driver.page_source
            response = Selector(text=data)  # must pass text=data; passing data positionally raises "'str' object has no attribute 'text'"
            infodata = response.css(".items p")
            for infoline in infodata:
                city = infoline.css(".warning::text").extract()[0].replace("[", "").replace("·", "").strip()
                issuescate = infoline.css(".warning .limit::text").extract()[0]
                title = infoline.css("a .underline::text").extract()[0].replace("]", "")
                publish_date = infoline.css(".time::text").extract()[0].replace("[", "").replace("]", "")
                print(city + "--" + title + "--" + issuescate + "--" + publish_date)
            self.driver.find_element_by_css_selector(
                'div.paginationjs-pages > ul > li.paginationjs-next.J-paginationjs-next a').click()
            i += 1
            time.sleep(3)
        time.sleep(3)
        self.driver.close()


if __name__ == '__main__':
    z = ZfCaigou()
    z.get_info()
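
One thing I noticed in the reference class: self.wait is created but never used, and the bare find-and-click on the next-page button can fail if the button has not rendered yet. A minimal sketch of how an explicit wait could guard that click (assuming the CSS selector from the code above is still valid for the site; inside get_info() the existing self.wait could be reused the same way):

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# same "next page" button as in the reference code above
NEXT_BTN_CSS = 'div.paginationjs-pages > ul > li.paginationjs-next.J-paginationjs-next a'

def click_next_page(driver, timeout=30):
    # wait until the next-page link is clickable, then click it
    WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, NEXT_BTN_CSS))
    ).click()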

In the evening I tried paging and scraping the target site and found the click approach grabbed nothing, so I needed a different method. Trying send_keys() on the page-jump box did make paging work. My implementation is below (also drawing on this link: https://www.jianshu.com/p/fa07cd0b21aa):

# driver is assumed to be a webdriver already opened on the target site's list page,
# and wait = WebDriverWait(driver, 30) to have been created, as in the snippets above
from time import sleep

import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC

j = 1
while j <= 3:
    sleep(1)
    lst = []
    lst1 = []
    for i in range(1,11):
        Project_name = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[4]/div[2]/table/tbody/tr[%s]/td[1]"%i)))
        Stat_tel = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[3]/table/tbody/tr[%s]/td[2]/div/span"%i)))
        Recent_person = wait.until(EC.presence_of_element_located((By.XPATH,"//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[3]/table/tbody/tr[%s]/td[5]"%i)))
        Last_Updated = wait.until(EC.presence_of_element_located((By.XPATH,"//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[3]/table/tbody/tr[%s]/td[6]/div/span"%i)))
        lst.append(Project_name.text)
        lst1.append(Stat_tel.text)
#        lst2.append(Recent_person.text)
#        lst3.append(Last_Updated.text)
    print(lst)
    fanye = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[2]/div[2]/div/span[2]/div/input")))
    fanye.send_keys(j)
    fanye.send_keys(Keys.ENTER)
    j += 1
    sleep(1)
sleep(1)
merge = pd.DataFrame(data=[lst, lst1], index=['a','b'])
print(lst)
print(merge)

The problem with this version is that in the end the lists still hold only the last page's ten rows; data scraped on earlier pages is not accumulated, because lst and lst1 are re-created at the top of every pass through the while loop and only printed after it ends. I figured that handling lists like this would need the pandas library, the workhorse for data cleaning and processing. After a lot of trial and error I finally found an approach: collect each page's data into an outer list and merge it afterwards. The code follows (reference: https://blog.csdn.net/lucky_shi/article/details/105172283):

j = 1
total = []
while j <= 3:
    sleep(1)
    lst = []
    lst1 = []
    for i in range(1,11):
        Project_name = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[4]/div[2]/table/tbody/tr[%s]/td[1]"%i)))
        Stat_tel = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[3]/table/tbody/tr[%s]/td[2]/div/span"%i)))
        Recent_person = wait.until(EC.presence_of_element_located((By.XPATH,"//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[3]/table/tbody/tr[%s]/td[5]"%i)))
        Last_Updated = wait.until(EC.presence_of_element_located((By.XPATH,"//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[3]/table/tbody/tr[%s]/td[6]/div/span"%i)))
        lst.append(Project_name.text)
        lst1.append(Stat_tel.text)
        ls = [lst,lst1]
    total.append(ls)
#        lst2.append(Recent_person.text)
#        lst3.append(Last_Updated.text)
    fanye = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[2]/div[2]/div/span[2]/div/input")))
    fanye.send_keys(j)
    fanye.send_keys(Keys.ENTER)
    j += 1
    sleep(1)
sleep(1)
print(lst)
print(total)

But the data assembled this way is awkward: it is a list with the per-page lists nested inside it. I later worked out a fix: concatenate the elements at the same index of the two per-page lists into a single comma-joined string per row (keeping the separator consistent) and append those strings to one flat list. That makes the DataFrame step work, and afterwards the single column can be written to CSV and split back into multiple columns with str.split(). (reference: https://zhuanlan.zhihu.com/p/391193380)

j = 1
total = []
while j <= 3:
    sleep(1)
    lst = []
    lst1 = []
    lst2 = []
    lst3 = []
    for i in range(1,11):
        Project_name = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[4]/div[2]/table/tbody/tr[%s]/td[1]"%i)))
        Stat_tel = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[3]/table/tbody/tr[%s]/td[2]/div/span"%i)))
        Recent_person = wait.until(EC.presence_of_element_located((By.XPATH,"//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[3]/table/tbody/tr[%s]/td[5]"%i)))
        Last_Updated = wait.until(EC.presence_of_element_located((By.XPATH,"//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[3]/table/tbody/tr[%s]/td[6]/div/span"%i)))
        lst.append(Project_name.text)
        lst1.append(Stat_tel.text)
    len_a = len(lst)
    for i in range(len_a):
        total.append(lst[i]+","+ lst1[i])
    print(lst)
    fanye = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[2]/div[2]/div/span[2]/div/input")))
    fanye.send_keys(j)
    fanye.send_keys(Keys.ENTER)
    j += 1
    sleep(1)
sleep(1)
df = pd.DataFrame(data=total, columns=['a'])  # one column 'a', one row per joined string
print(df)
# the intended split (expects four comma-joined fields; only two are joined so far):
#df[["电站名称","联系电话","最新时间","最后更新时间"]] = df['a'].str.split(',', expand=True)

import os

def GetDesktopPath():
    # path of the current user's Desktop folder
    return os.path.join(os.path.expanduser("~"), 'Desktop')

path = GetDesktopPath()
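
To round off the step the note stops at, here is a minimal sketch that continues the snippet above: it splits the joined column back into two columns and writes them to a CSV on the Desktop. The English column names and the file name projects.csv are placeholders of my own, and only the two fields actually joined so far (project name and phone) are split out:

# df, total and GetDesktopPath() come from the snippet above
# split each joined "name,tel" string back into two columns;
# n=1 keeps any extra commas inside the second field
df[['project_name', 'contact_tel']] = df['a'].str.split(',', n=1, expand=True)

out_path = os.path.join(GetDesktopPath(), 'projects.csv')  # placeholder file name
df[['project_name', 'contact_tel']].to_csv(out_path, index=False, encoding='utf-8-sig')
print('saved to', out_path)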
 

With that, getting the data out of the crawler is basically solved.
