爬虫
1. multiThread.py
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
# -*- coding:utf-8 -*- import threading import time import Queue SHARE_Q = Queue.Queue() #构造一个不限制大小的的队列 _WORKER_THREAD_NUM = 3 #设置线程的个数 class MyThread(threading.Thread) : """ doc of class Attributess: func: 线程函数逻辑 """ def __init__(self, func) : super(MyThread, self).__init__() #调用父类的构造函数 self.func = func #传入线程函数逻辑 def run(self) : """ 重写基类的run方法 """ self.func() # def worker() : # """ # 主要用来写工作逻辑, 只要队列不空持续处理 # 队列为空时, 检查队列, 由于Queue中已经包含了wait, # notify和锁, 所以不需要在取任务或者放任务的时候加锁解锁 # """ # global SHARE_Q # while not SHARE_Q.empty(): # testCaseName_link_set = SHARE_Q.get() #获得任务 # testCaseName_ratio_set = getPassRation(testCaseName_link_set) # SHARE_Q.task_done() # def main() : # global SHARE_Q # threads = [] # #向队列中放入任务, 真正使用时, 应该设置为可持续的放入任务 # testCaseName_link_list = parseLink() # for item in testCaseName_link_list : # SHARE_Q.put(item) # #开启_WORKER_THREAD_NUM个线程 # for i in xrange(_WORKER_THREAD_NUM) : # thread = MyThread(worker) # thread.start() #线程开始处理任务 # threads.append(thread) # for thread in threads : # thread.join() # #等待所有任务完成 # SHARE_Q.join()
2. parseLink.py
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
# -*- coding:utf-8 -*-
"""Scrape a test-task report page: log in, enter the report iframe, and walk
the per-suite grids collecting test-case links."""
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
import time

# Task links ordered by date; taskLinkList[0] is the most recent task.
taskLinkList = ["http://10.179.142.230:8080/ilog/business.action?BMEBusiness=TaskInfoDetail&targetstep=viewReportByUser&taskId=8a3aa20863af1224016493479e7522c4&W3=",
                "http://10.179.142.230:8080/ilog/business.action?BMEBusiness=TaskInfoDetail&targetstep=viewReportByUser&taskId=8a3aa20863af1224016489b92f142114&W3=",
                "http://10.179.142.230:8080/ilog/business.action?BMEBusiness=TaskInfoDetail&targetstep=viewReportByUser&taskId=8a3aa20863af12240164896246d020f8&W3="]


def getPageSource(baseUrl):
    """Fetch the rendered report page behind the login screen.

    Args:
        baseUrl: URL of the task report page.

    Returns:
        BeautifulSoup object built from the iframe's rendered HTML.

    Raises:
        Whatever selenium raises when the page cannot be fetched; the
        original swallowed the error and then hit `return bsObj` with
        `bsObj` unbound (UnboundLocalError) — re-raising is clearer.
    """
    # Headless Chrome; '--diable-gpu' typo fixed to '--disable-gpu'.
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(chrome_options=chrome_options)
    # Window tall enough that the lazily-rendered grid is fully drawn.
    driver.set_window_size(480, 30000)
    driver.set_script_timeout(30)
    driver.set_page_load_timeout(30)
    try:
        driver.get(baseUrl)
        # login
        driver.find_element_by_name('userName').send_keys('l84102264')
        driver.find_element_by_name('password').send_keys('233*Dgg666')
        driver.find_element_by_class_name('btn-purple').send_keys(Keys.ENTER)
        # switch to target frame
        driver.implicitly_wait(3)
        driver.switch_to.frame("iframe")
        driver.implicitly_wait(3)
        # Must wait a moment for the page's JS rendering to finish.
        time.sleep(5)
        return BeautifulSoup(driver.page_source, 'lxml')
    except Exception:
        print("Failed to get bsObj from: %s" % baseUrl)
        raise
    finally:
        driver.close()


# ---- ad-hoc smoke test: fetch one task page and enumerate its links ----
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')  # typo fixed (was '--diable-gpu')
driver = webdriver.Chrome(chrome_options=chrome_options)
# Window tall enough that the lazily-rendered grid is fully drawn.
driver.set_window_size(480, 8000)
driver.set_script_timeout(30)
driver.set_page_load_timeout(30)

# Was taskLinkList[4]: IndexError — the list only has 3 entries. Use the
# most recent task instead.
driver.get(taskLinkList[0])
# login
userName = driver.find_element_by_name('userName')
userName.send_keys('l84102264')
password = driver.find_element_by_name('password')
password.send_keys('233*Dgg666')
BtmLogin = driver.find_element_by_class_name('btn-purple')
BtmLogin.send_keys(Keys.ENTER)
# switch to target frame
driver.implicitly_wait(8)
driver.switch_to.frame("iframe")
# wait for the page to load
time.sleep(3)
print("before get html")
htm_next = driver.page_source
print("html length: %s" % len(htm_next))
print("get bsObj")
bsObj = BeautifulSoup(htm_next, 'lxml')
print("after get bsObj")
driver.close()

count = 0
data = bsObj.find("div", {"id": "ext-gen91"})
print("data length: %s " % len(data))
body = data.find_all("div", {"class": "x-grid-group-body"})
print("body length: %s" % len(body))
for i in range(len(body)):
    print("loop: %i" % i)
    suit_body = body[i].div.find("a", {"target": "_blank"})
    suit_name = suit_body.get_text()
    links_body = body[i].find_all("div", {"class": 'x-grid3-cell-inner x-grid3-col-2'})
    print("links_body length: %s" % len(links_body))
    for cell in links_body:
        # Links are site-relative ("./..."); graft them onto the host root.
        link = 'http://10.179.142.230:8080/ilog' + cell.a.attrs["href"][1:]
        count += 1
3. ScrapData.py
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
# -*- coding:utf-8 -*-
"""Scraper for test-task report pages: collects per-suite case links and
computes per-case pass ratios."""
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
import time


class ScrapData:

    def getPageSource(self, baseUrl, frame="main1"):
        """Log in, switch into `frame`, and return the rendered page as soup.

        NOTE: the original class declared two methods both named
        `getPageSource`, so the first (which used frame "main1") was silently
        shadowed and `getPassRation`'s one-argument call raised TypeError.
        They are merged here; `frame` defaults to "main1" so existing
        one-argument callers keep their old target frame.

        Args:
            baseUrl: page URL.
            frame: name of the iframe to switch into before reading the page.

        Returns:
            BeautifulSoup object of the rendered frame HTML.

        Raises:
            Whatever selenium raises on failure (the original swallowed the
            error and then returned an unbound `bsObj`).
        """
        # Headless Chrome; '--diable-gpu' typo fixed to '--disable-gpu'.
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(chrome_options=chrome_options)
        # Window tall enough that the lazily-rendered grid is fully drawn.
        driver.set_window_size(480, 30000)
        driver.set_script_timeout(30)
        driver.set_page_load_timeout(30)
        try:
            driver.get(baseUrl)
            # login
            driver.find_element_by_name('userName').send_keys('l84102264')
            driver.find_element_by_name('password').send_keys('233*Dgg666')
            driver.find_element_by_class_name('btn-purple').send_keys(Keys.ENTER)
            # switch to target frame
            driver.implicitly_wait(3)
            driver.switch_to.frame(frame)
            driver.implicitly_wait(3)
            # Must wait a moment for the page's JS rendering to finish.
            time.sleep(5)
            htm_next = driver.page_source
            # Requires the 'lxml' parser; 'html.parser' also works.
            print("parsing .html,yield bsObj")
            return BeautifulSoup(htm_next, 'lxml')
        except Exception:
            print("Failed to get bsObj from: %s" % baseUrl)
            raise
        finally:
            driver.close()

    def getPassRation(self, tcIndex_link_set):
        """Compute the pass ratio for one test case's detail page.

        Args:
            tcIndex_link_set: (tcIndex, link) pair — case index and its
                detail-page URL.

        Returns:
            (tcIndex, ratio): ratio of rows whose Execution Result column
            reads 'Pass', rounded to two decimals; 0.0 if nothing parsed.
        """
        tcIndex = tcIndex_link_set[0]
        link = tcIndex_link_set[1]
        count = 0
        ratio = 0.0
        exceptCount = 0
        bsObj = self.getPageSource(link)
        # Column 5 of each row holds the Execution Result.
        targetResIdPattern = re.compile(r'testcaseDetailInfoViewList_[0-9]*_5')
        try:
            data = bsObj.find('tbody', {'id': 'testcaseDetailInfoViewList_databody'})
            for i in range(len(data)):
                ids = 'testcaseDetailInfoViewList' + '_' + str(i)
                dataItem = bsObj.find('tr', {'id': re.compile(ids)})
                try:
                    target = dataItem.find('td', {'id': targetResIdPattern})
                    if target.get_text().strip() == 'Pass':
                        count = count + 1
                except Exception:
                    # Row missing/unparseable: stop and exclude it from the
                    # denominator.
                    exceptCount = exceptCount + 1
                    break
            denominator = len(data) - exceptCount
            if denominator > 0:  # guard: avoid ZeroDivisionError
                ratio = float(count) / denominator
                ratio = float('%.2f' % ratio)  # keep two decimal places
        except Exception:
            print("bsObj findERRO")
        return (tcIndex, ratio)

    def parseLink(self, taskLinkList):
        """Collect per-suite test-case links across all task pages.

        Only the most recent task's data is kept for each suite name (the
        task list is ordered newest-first); later duplicates are dropped.

        Args:
            taskLinkList: task report URLs, newest first.

        Returns:
            dict mapping suite name -> list of test-case detail links.
        """
        print("parsing link from web ...")
        testSuitLinkTotalDir = {}
        for n in range(len(taskLinkList)):
            print("===============================================================================")
            print("task-%i" % n)
            bsObj = self.getPageSource(taskLinkList[n], "iframe")
            data = bsObj.find("div", {"id": "ext-gen91"})
            body = data.find_all("div", {"class": "x-grid-group-body"})
            testSuitLinkDir = {}
            for i in range(len(body)):
                suit_body = body[i].div.find("a", {"target": "_blank"})
                suit_name = suit_body.get_text()
                print("-----------------------------------------------------------------------------------")
                print("task-suit-%i: task%i-%s " % (i, n, suit_name))
                if suit_name not in testSuitLinkTotalDir:
                    links_body = body[i].find_all("div", {"class": 'x-grid3-cell-inner x-grid3-col-2'})
                    linkList = []
                    # Skip index 0: the first cell is not a case link.
                    for j in range(1, len(links_body)):
                        link = links_body[j].a.attrs["href"]
                        link = 'http://10.179.142.230:8080/ilog' + link[1:]
                        linkList.append(link)
                    if suit_name in testSuitLinkDir:
                        testSuitLinkDir[suit_name].extend(linkList)
                    else:
                        testSuitLinkDir[suit_name] = linkList
                    print("linkList count: %i" % len(linkList))
            print("testSuitLinkDir count: %i" % len(testSuitLinkDir))
            testSuitLinkTotalDir.update(testSuitLinkDir)
        print("testSuitLinkTotalDir count: %i" % len(testSuitLinkTotalDir))
        return testSuitLinkTotalDir

    def getTestCaseData(self, testSuitLinkTotalDir):
        """Compute the pass ratio for every collected test-case link.

        NOTE: the original body was a copy-paste of getPassRation that
        referenced the undefined name `tcIndex_link_set` (NameError at
        runtime); it now delegates to getPassRation per link.

        Args:
            testSuitLinkTotalDir: dict mapping suite name -> list of links,
                as returned by parseLink.

        Returns:
            list of (tcIndex, ratio) tuples, with tcIndex numbered
            consecutively across all suites.
        """
        results = []
        tcIndex = 0
        for suit_name in testSuitLinkTotalDir:
            for link in testSuitLinkTotalDir[suit_name]:
                results.append(self.getPassRation((tcIndex, link)))
                tcIndex += 1
        return results


if __name__ == '__main__':
    # Task links ordered by date; taskLinkList[0] is the most recent task.
    taskLinkList = ['http://10.179.142.230:8080/ilog/business.action?BMEBusiness=TaskInfoDetail&targetstep=viewReportByUser&taskId=8a3aa20863af12240164782eaa191de0&W3=',
                    'http://10.179.142.230:8080/ilog/business.action?BMEBusiness=TaskInfoDetail&targetstep=viewReportByUser&taskId=8a3aa20863af1224016477bd42f01dd3&W3=',
                    'http://10.179.142.230:8080/ilog/business.action?BMEBusiness=TaskInfoDetail&targetstep=viewReportByUser&taskId=8a3aa20863af1224016477bd42f01dd3&W3=']
    scrapData = ScrapData()
    scrapData.parseLink(taskLinkList)
4. write2Excel.py
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
# -*- coding:utf-8 -*-
"""Write pass/fail ratios into the Excel report and highlight top rows."""
import openpyxl
import os
from openpyxl.styles import PatternFill


def writeRatioDate(DATA_SET, hightlightNum, fileUrl='Intelligent_analysis.xlsx'):
    """Write (tcIndex, pass_ratio) pairs into columns U/V and highlight rows.

    Args:
        DATA_SET: iterable of (tcIndex, ratio) pairs; tcIndex is 0-based.
        hightlightNum: how many of the highest-ratio rows to highlight.
        fileUrl: path of the workbook to update.

    Raises:
        FileNotFoundError: if the workbook does not exist (the original used
        `assert`, which is stripped under -O).
    """
    if not os.path.exists(fileUrl):
        raise FileNotFoundError(fileUrl)
    wb = openpyxl.load_workbook(fileUrl)
    # get_active_sheet() is deprecated in openpyxl; `active` is equivalent.
    sheet = wb.active
    for tcIndex_ratio_set in DATA_SET:
        # Sheet rows are 1-based, so shift the 0-based case index by one.
        tcIndex = int(tcIndex_ratio_set[0]) + 1
        ratio = tcIndex_ratio_set[1]
        # Pass_ratio / Fail_ratio live in columns U and V respectively.
        sheet["U%i" % tcIndex] = ratio
        sheet["V%i" % tcIndex] = 1 - ratio
    wb.save(fileUrl)
    print("Write ratioData to \'Intelligent_analysis.xlsx\' successfully!")

    # Highlight the `hightlightNum` rows with the highest ratio (columns A-U).
    DATA_List = sorted(DATA_SET, key=lambda x: x[1], reverse=True)
    fill = PatternFill(fill_type='solid', fgColor="FFB6C1")  # light pink (EE82EE alt.)
    for item in DATA_List[:hightlightNum]:
        index = int(item[0]) + 1
        for i in range(1, 22):
            sheet.cell(row=index, column=i).fill = fill
    wb.save(fileUrl)
    print("Add hightlight successfully!")
    wb.close()


if __name__ == '__main__':
    # Ad-hoc check: paint the first two rows. The original never saved the
    # workbook, so the run had no visible effect — persist it here.
    wb = openpyxl.load_workbook('Intelligent_analysis.xlsx')
    sheet = wb.active
    fill = PatternFill(fill_type='solid', fgColor="EE82EE")
    for j in range(1, 3):
        for i in range(1, 22):
            sheet.cell(row=j, column=i).fill = fill
    wb.save('Intelligent_analysis.xlsx')
    wb.close()