selenium配合phantomjs实现爬虫功能,并把抓取的数据写入excel
# -*- coding: UTF-8 -*- ''' Created on 2016年5月13日 @author: csxie ''' import datetime from Base import BasePage import ExcelOperation as excel from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support.expected_conditions import text_to_be_present_in_element import unittest class JobLog(BasePage): url=""#目标url def setUp(self): BasePage.setUP(self) def tearDown(self): BasePage.tearDown(self) def test_querysql(self): """ 查询到符合条件的信息并写入excel """ driver=self.driver driver.get(self.url) ''' 设置参数 start:起始日期 end:截止日期 user:查询人员 ''' end=datetime.date.today() start=end+datetime.timedelta(days=-6) user=None #起始日期 txtStart=driver.find_element_by_id("ctl01_txtSelectDateFrom") txtStart.clear() txtStart.send_keys(start) #截止日期 txtEnd=driver.find_element_by_id("ctl01_txtSelectDateTo") txtEnd.clear() txtEnd.send_keys(end) #查询人员 if(user!=None): txtuser=driver.find_element_by_id("ctl01_txtUser") txtuser.clear() txtuser.send_keys(user) #查询按钮 btnQueryInfo=driver.find_element_by_id("ctl01_btnQueryInfo") btnQueryInfo.click() #总页数 totalPages=int(driver.find_element_by_id("ctl01_uc_CommonPager_lb_TotalPages").text) i=1 while(i<=totalPages): #table的xpath listTable=self.getTableData(".//*[@id='ctl01__wrLogList_gvlist']",colNO=9,loop=i) #当前页数 currentPage=int(driver.find_element_by_id("ctl01_uc_CommonPager_lb_PageNum").text) if(currentPage==1): excel.save_to_excel_newsheet(listTable,excelPath='joblog.xls') else: excel.save_to_excel_oldsheet(listTable,excelPath='joblog.xls') if(i!=totalPages): #下一页按钮 driver.find_element_by_id("ctl01_uc_CommonPager_NextPage").click() WebDriverWait(driver,8).until(text_to_be_present_in_element(('id','ctl01_uc_CommonPager_lb_PageNum'),str(i+1))) i=i+1 def getTableData(self,value,colTag='td',colNO=None,loop=None): ''' 获取表格数据 colNO:存在合并行时,强制指定表格列数 ''' driver=self.driver table=driver.find_element(by="xpath",value=value) rows=table.find_elements(by="tag name",value="tr")#行集合 maxrowCount=len(rows)#表格行数 #如果colNO=None不为空,则用传进来的值作为列数 if(colNO==None): cols = table.find_elements(by="tag name",value="td"); if(cols!='td'): cols = table.find_elements(by="tag name",value="th") maxcolCount=len(cols)/len(rows)#表格列数 else: maxcolCount=colNO lists = [[] for i in range(maxrowCount)] #print "列数"+str(maxcolCount) #print "行数"+str(maxrowCount) i=1 while i<=maxrowCount: j=1;#初始化列 while j<=maxcolCount: try: if(i==1): try: innerText=driver.find_element(by="xpath",value=value+"/tbody/tr["+str(i)+"]/"+colTag+"["+str(j)+"]").text except: colTag="th" innerText=driver.find_element(by="xpath",value=value+"/tbody/tr["+str(i)+"]/"+colTag+"["+str(j)+"]").text else: colTag="td" if(j==3 or j==5): innerText=driver.find_element(by="xpath",value=value+"/tbody/tr["+str(i)+"]/"+colTag+"["+str(j)+"]/a").get_attribute("title") elif(j==8): innerText=driver.find_element(by="xpath",value=value+"/tbody/tr["+str(i)+"]/"+colTag+"["+str(j)+"]").text innerText=float(innerText.replace("h", "")) else: innerText=driver.find_element(by="xpath",value=value+"/tbody/tr["+str(i)+"]/"+colTag+"["+str(j)+"]").text except: innerText="" #print "第"+str(i)+"行,第"+str(j)+"列"+str(innerText) lists[i-1].append(innerText) j+=1 i+=1 if(loop!=1):#不是第一次循环,列头不要 return lists[1:] return lists if __name__ == "__main__": unittest.main()
# -*- coding: UTF-8 -*- ''' Created on 2016年5月18日 @author: csxie ''' import xlwt from xlrd import open_workbook from xlutils.copy import copy import os def save_to_excel_newsheet(listTable,excelPath=r'C:\demo.xls',sheetName='sheet1'): if(isinstance(listTable,list)): rowNO=len(listTable); if(rowNO==0): raise ValueError,u'传入的是list是空的' colNO=len(listTable[0]); wkb = xlwt.Workbook() sheet = wkb.add_sheet(sheetName) for i in range(rowNO): for j in range(colNO): #print listTable[i][j] #print str(i)+'行'+str(j)+'列' sheet.write(i,j,listTable[i][j]) if(os.path.exists(excelPath)): os.remove(excelPath) wkb.save(excelPath) return True else: raise TypeError,u'传入的类型不是list' def save_to_excel_oldsheet(listTable,excelPath=r'C:\demo.xls',sheetName='sheet1'): if(isinstance(listTable,list)): rowNO=len(listTable); if(rowNO==0): raise ValueError,u'传入的是list是空的' colNO=len(listTable[0]); rb = open_workbook(excelPath) sheet_index=rb.sheet_names().index(sheetName) old_rows=rb.sheet_by_name(sheetName).nrows#已存在的excel中数据行数 wkb = copy(rb) sheet = wkb.get_sheet(sheet_index) for i in range(rowNO): for j in range(colNO): sheet.write(i+old_rows,j,listTable[i][j]) #print str(i)+'行'+str(j)+'列' wkb.save(excelPath) return True else: raise TypeError,u'传入的类型不是list'
# -*- coding: UTF-8 -*- from selenium import webdriver import unittest class BasePage(unittest.TestCase): def setUP(self): self.driver= webdriver.PhantomJS(executable_path="phantomjs.exe") #self.driver.set_window_size(1400, 1000) #self.driver.implicitly_wait(1) self.verificationErrors = [] def tearDown(self): self.driver.quit() #self.driver.close() self.assertEqual([], self.verificationErrors)