220702 Selenium web-scraping study notes
1.
Continued practicing Selenium today. After some testing, the target site turns pages in two ways: typing a page number into the jump box, or clicking a page number directly. The two approaches follow different ideas and can be implemented with different code. Let's look at the first one. The 20220630 entry already covers most of it; the only new piece is how to press Enter after typing the page number, since the jump box on the target site has no dedicated next-page button (reference: https://www.gaoyuanqi.cn/python-selenium-send_keys/).
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
driver.get('https://www.gaoyuanqi.cn/python-html-1/#more')

# wait 2 s
time.sleep(2)

# locate the input box by its name attribute; returns a WebElement
webelement = driver.find_element_by_name('t2')

# type some text
webelement.send_keys('雨园博客')
time.sleep(2)

# delete the last character
webelement.send_keys(Keys.BACKSPACE)
time.sleep(2)

# Ctrl+A select all
webelement.send_keys(Keys.CONTROL, 'a')

# Ctrl+X cut
webelement.send_keys(Keys.CONTROL, 'x')
time.sleep(2)

# type some text
webelement.send_keys('1314')
time.sleep(2)

# press Enter
webelement.send_keys(Keys.ENTER)
time.sleep(2)

# Ctrl+V paste
webelement.send_keys(Keys.CONTROL, 'v')
time.sleep(5)

# quit the driver and close the browser
driver.quit()
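A side note: the example above uses the old find_element_by_name() helper, which has been removed in recent Selenium 4 releases. Below is a minimal sketch of the same type-a-value-and-press-Enter flow in the By-locator style; it reuses the URL and the t2 name from the example, and the value typed is just an illustration.

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
driver.get('https://www.gaoyuanqi.cn/python-html-1/#more')
time.sleep(2)

# Selenium 4 style: locate the input box by its name attribute
jump_box = driver.find_element(By.NAME, 't2')

# clear the box, type a page number, then press Enter to jump
jump_box.clear()
jump_box.send_keys('2')
jump_box.send_keys(Keys.ENTER)

time.sleep(2)
driver.quit()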
What about the second approach, paging by clicking directly?
Just locate the page-number or next-page element with an XPath/CSS selector and click it. Let's first try the implementation below (reference: https://www.jianshu.com/p/fa07cd0b21aa).
# -*- coding: utf-8 -*-
# @AuThor : frank_lee
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
import time
from scrapy.selector import Selector


class ZfCaigou():
    """ """
    def __init__(self):
        self.url = 'http://www.zjzfcg.gov.cn/purchaseNotice/index.html?categoryId=3001'
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 30)  # set the timeout
        self.zoom = 1

    def get_info(self):
        self.driver.get(self.url)
        self.driver.maximize_window()
        time.sleep(5)
        i = 0
        while i < 8:  # the page count here is arbitrary; you could also define a total_page attribute and use self.total_page
            time.sleep(2)
            data = self.driver.page_source
            response = Selector(text=data)  # without "text=data", passing data positionally raises: 'str' object has no attribute 'text'
            infodata = response.css(".items p")
            for infoline in infodata:
                city = infoline.css(".warning::text").extract()[0].replace("[", "").replace("·", "").strip()
                issuescate = infoline.css(".warning .limit::text").extract()[0]
                title = infoline.css("a .underline::text").extract()[0].replace("]", "")
                publish_date = infoline.css(".time::text").extract()[0].replace("[", "").replace("]", "")
                print(city + "--" + title + "--" + issuescate + "--" + publish_date)
            self.driver.find_element_by_css_selector(
                'div.paginationjs-pages > ul > li.paginationjs-next.J-paginationjs-next a').click()
            i += 1
            time.sleep(3)
        time.sleep(3)
        self.driver.close()


if __name__ == '__main__':
    z = ZfCaigou()
    z.get_info()
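One general note on the click-based approach: if the next-page button is rendered late by JavaScript, a fixed time.sleep() may fire the click before the button is ready, which is one common reason a page appears not to turn. As a hedged sketch (not the method used in the code above, and assuming a driver has already been created as in that example), an explicit wait for clickability looks roughly like this:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# assumes `driver` already exists, e.g. driver = webdriver.Chrome() as above
wait = WebDriverWait(driver, 30)

# block until the "next page" link is clickable, then click it
next_btn = wait.until(EC.element_to_be_clickable(
    (By.CSS_SELECTOR, 'div.paginationjs-pages > ul > li.paginationjs-next.J-paginationjs-next a')))
next_btn.click()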
In the evening I tried paging and scraping with this approach, but no data came back, so another method was needed. It turned out that paging also works with send_keys(). My implementation is below (again referencing https://www.jianshu.com/p/fa07cd0b21aa):
# assumes driver, wait (a WebDriverWait), By, EC, Keys, sleep and pandas as pd are already set up
j = 1
while j <= 3:
    sleep(1)
    lst = []
    lst1 = []
    for i in range(1, 11):
        Project_name = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[4]/div[2]/table/tbody/tr[%s]/td[1]" % i)))
        Stat_tel = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[3]/table/tbody/tr[%s]/td[2]/div/span" % i)))
        Recent_person = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[3]/table/tbody/tr[%s]/td[5]" % i)))
        Last_Updated = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[3]/table/tbody/tr[%s]/td[6]/div/span" % i)))
        lst.append(Project_name.text)
        lst1.append(Stat_tel.text)
        # lst2.append(Recent_person.text)
        # lst3.append(Last_Updated.text)
    print(lst)
    # jump box: type the page number and press Enter to turn the page
    fanye = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[2]/div[2]/div/span[2]/div/input")))
    fanye.send_keys(j)
    fanye.send_keys(Keys.ENTER)
    j += 1
    sleep(1)
sleep(1)
merge = pd.DataFrame(data=[lst, lst1], index=['a', 'b'])
print(lst)
print(merge)
The problem with this version is that, at the end, the lists still hold only the ten rows from the last page; the data grabbed on each page is not pulled together. My first thought was that this sort of list handling calls for pandas, the workhorse for data cleaning and processing. After a lot of fiddling I finally found a solution: accumulate each page's lists into an outer list, so the data from all the pages is collected together (reference: https://blog.csdn.net/lucky_shi/article/details/105172283):
j = 1
total = []
while j <= 3:
    sleep(1)
    lst = []
    lst1 = []
    for i in range(1, 11):
        Project_name = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[4]/div[2]/table/tbody/tr[%s]/td[1]" % i)))
        Stat_tel = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[3]/table/tbody/tr[%s]/td[2]/div/span" % i)))
        Recent_person = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[3]/table/tbody/tr[%s]/td[5]" % i)))
        Last_Updated = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[3]/table/tbody/tr[%s]/td[6]/div/span" % i)))
        lst.append(Project_name.text)
        lst1.append(Stat_tel.text)
        ls = [lst, lst1]
        total.append(ls)
        # lst2.append(Recent_person.text)
        # lst3.append(Last_Updated.text)
    fanye = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[2]/div[2]/div/span[2]/div/input")))
    fanye.send_keys(j)
    fanye.send_keys(Keys.ENTER)
    j += 1
    sleep(1)
sleep(1)
print(lst)
print(total)
But the data assembled this way is still problematic: it is lists nested inside lists, one layer added on every loop pass. The fix I settled on is to join the elements at the same index of the two lists into one string per row and collect those. The main thing to watch is that the joining delimiter stays consistent, so the result can still be handled as a DataFrame; afterwards the single combined column can be written to csv and split back into separate columns on that delimiter, e.g. with str.split() (reference: https://zhuanlan.zhihu.com/p/391193380).
j = 1
total = []
while j <= 3:
    sleep(1)
    lst = []
    lst1 = []
    lst2 = []
    lst3 = []
    for i in range(1, 11):
        Project_name = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[4]/div[2]/table/tbody/tr[%s]/td[1]" % i)))
        Stat_tel = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[3]/table/tbody/tr[%s]/td[2]/div/span" % i)))
        Recent_person = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[3]/table/tbody/tr[%s]/td[5]" % i)))
        Last_Updated = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[1]/div[3]/table/tbody/tr[%s]/td[6]/div/span" % i)))
        lst.append(Project_name.text)
        lst1.append(Stat_tel.text)
    # after collecting this page's rows, join matching elements of the two lists with a comma and accumulate across pages
    len_a = len(lst)
    for i in range(len_a):
        total.append(lst[i] + "," + lst1[i])
    print(lst)
    fanye = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='main-frame']/div[4]/div/div[1]/div[2]/div[2]/div[2]/div/span[2]/div/input")))
    fanye.send_keys(j)
    fanye.send_keys(Keys.ENTER)
    j += 1
    sleep(1)
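To complete the DataFrame step described above, here is a minimal sketch of turning `total` into separate columns and writing it out. The column names and the output file name are placeholders, and it assumes each entry of `total` is a single comma-joined string as produced by the loop above:

import pandas as pd

# `total` holds one comma-joined string per row, e.g. "project name,telephone"
df = pd.DataFrame(total, columns=['raw'])

# split the combined column back into two named columns on the first comma
df[['project_name', 'stat_tel']] = df['raw'].str.split(',', n=1, expand=True)

# write only the split columns to csv (file name is a placeholder)
df[['project_name', 'stat_tel']].to_csv('result.csv', index=False, encoding='utf-8-sig')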
With that, the data-scraping part of this crawler is basically solved.