爬虫

1. multiThread.py

# -*- coding:utf-8 -*-
 
import threading
import time
import Queue
 
SHARE_Q = Queue.Queue()  # unbounded shared task queue (Queue does its own wait/notify/locking)
_WORKER_THREAD_NUM = 3  # number of worker threads to spawn
 
class MyThread(threading.Thread):
    """Worker thread that runs a caller-supplied callable.

    Attributes:
        func: zero-argument callable executed when the thread runs.
    """

    def __init__(self, func):
        # Initialize the underlying Thread machinery before storing state.
        threading.Thread.__init__(self)
        self.func = func

    def run(self):
        """Override of Thread.run: invoke the stored callable."""
        self.func()
 
 
# def worker() :
#     """
#     主要用来写工作逻辑, 只要队列不空持续处理
#     队列为空时, 检查队列, 由于Queue中已经包含了wait,
#     notify和锁, 所以不需要在取任务或者放任务的时候加锁解锁
#     """
#     global SHARE_Q
#     while not SHARE_Q.empty(): 
#         testCaseName_link_set = SHARE_Q.get() #获得任务
        
#         testCaseName_ratio_set = getPassRation(testCaseName_link_set)
        
#         SHARE_Q.task_done()

 
# def main() :
    
#     global SHARE_Q
#     threads = []
    
#     #向队列中放入任务, 真正使用时, 应该设置为可持续的放入任务
#     testCaseName_link_list = parseLink() 
#     for item in testCaseName_link_list :   
#         SHARE_Q.put(item)
    
#     #开启_WORKER_THREAD_NUM个线程
#     for i in xrange(_WORKER_THREAD_NUM) :
#         thread = MyThread(worker)
#         thread.start()  #线程开始处理任务
#         threads.append(thread)
#     for thread in threads :
#         thread.join()
#     #等待所有任务完成
#     SHARE_Q.join()
 
View Code

2. parseLink.py

# -*- coding:utf-8 -*-

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
import time


# Task-report links ordered by time; taskLinkList[0] is the most recent task.
taskLinkList = ["http://10.179.142.230:8080/ilog/business.action?BMEBusiness=TaskInfoDetail&targetstep=viewReportByUser&taskId=8a3aa20863af1224016493479e7522c4&W3=",
               "http://10.179.142.230:8080/ilog/business.action?BMEBusiness=TaskInfoDetail&targetstep=viewReportByUser&taskId=8a3aa20863af1224016489b92f142114&W3=",
               "http://10.179.142.230:8080/ilog/business.action?BMEBusiness=TaskInfoDetail&targetstep=viewReportByUser&taskId=8a3aa20863af12240164896246d020f8&W3="]


def getPageSource(baseUrl):
    """Log in to the iLog page at *baseUrl* with headless Chrome and parse it.

    Args:
        baseUrl: URL of the task-report page to fetch.

    Returns:
        BeautifulSoup object of the page rendered inside the "iframe" frame,
        or None when fetching/parsing failed.
    """
    bsObj = None  # BUG FIX: was unbound on failure, so the old `return bsObj` raised UnboundLocalError
    # Drive Chrome in headless (no UI) mode.
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')  # BUG FIX: was misspelled '--diable-gpu'
    driver = webdriver.Chrome(chrome_options=chrome_options)

    # Make the window tall enough that the whole grid is rendered at once.
    driver.set_window_size(480, 30000)
    driver.set_script_timeout(30)
    driver.set_page_load_timeout(30)
    try:
        driver.get(baseUrl)
        # Log in with the hard-coded account.
        # NOTE(review): credentials should live in configuration, not source.
        userName = driver.find_element_by_name('userName')
        userName.send_keys('l84102264')
        password = driver.find_element_by_name('password')
        password.send_keys('233*Dgg666')
        BtmLogin = driver.find_element_by_class_name('btn-purple')
        BtmLogin.send_keys(Keys.ENTER)

        # Switch into the frame that holds the report grid.
        driver.implicitly_wait(3)
        driver.switch_to.frame("iframe")
        driver.implicitly_wait(3)

        # Give the page's JavaScript time to finish rendering.
        time.sleep(5)

        htm_next = driver.page_source
        bsObj = BeautifulSoup(htm_next, 'lxml')
    except Exception as e:
        # BUG FIX: old code printed the Exception *class*, not the instance.
        print(e)
        print("Failed to get bsObj from: %s" % baseUrl)
    finally:
        # BUG FIX: quit() terminates the chromedriver process too;
        # close() only closed the window and leaked the driver process.
        driver.quit()
    return bsObj



# --- Standalone script: fetch one task page and count its test-case links. ---

# Headless Chrome setup.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')  # BUG FIX: was misspelled '--diable-gpu'
driver = webdriver.Chrome(chrome_options=chrome_options)

# Make the window tall enough that the whole grid renders at once.
driver.set_window_size(480, 8000)
driver.set_script_timeout(30)
driver.set_page_load_timeout(30)

# BUG FIX: taskLinkList has only 3 entries, so taskLinkList[4] raised
# IndexError; use index 0, the most recent task.
driver.get(taskLinkList[0])

# Log in with the hard-coded account (should live in config, not source).
userName = driver.find_element_by_name('userName')
userName.send_keys('l84102264')
password = driver.find_element_by_name('password')
password.send_keys('233*Dgg666')
BtmLogin = driver.find_element_by_class_name('btn-purple')
BtmLogin.send_keys(Keys.ENTER)

# Switch into the frame that holds the report grid.
driver.implicitly_wait(8)
driver.switch_to.frame("iframe")
# Give the page's JavaScript time to render.
time.sleep(3)

print("before get html")
htm_next = driver.page_source
print("html length: %s" % len(htm_next))
print("get bsObj")
# 'lxml' parser; 'html.parser' or 'html5lib' also work.
bsObj = BeautifulSoup(htm_next, 'lxml')
print("after get bsObj")

# BUG FIX: quit() also terminates the chromedriver process; close() leaked it.
driver.quit()

# Walk the grid: one x-grid-group-body div per test suite.
count = 0
data = bsObj.find("div", {"id": "ext-gen91"})
print("data length: %s " % len(data))
body = data.find_all("div", {"class": "x-grid-group-body"})
print("body length: %s" % len(body))
for i in range(len(body)):
    print("loop: %i" % i)
    suit_body = body[i].div.find("a", {"target": "_blank"})
    suit_name = suit_body.get_text()
    links_body = body[i].find_all("div", {"class": 'x-grid3-cell-inner x-grid3-col-2'})
    print("links_body length: %s" % len(links_body))
    for j in range(len(links_body)):
        # hrefs are relative ("./..."); prefix the server root.
        link = links_body[j].a.attrs["href"]
        link = 'http://10.179.142.230:8080/ilog' + link[1:]
        count = count + 1
View Code

3. ScrapData.py

# -*- coding:utf-8 -*-

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
import time

class ScrapData:
    """Scrape iLog test-report pages and compute per-test-case pass ratios.

    NOTE(review): the original class defined getPageSource twice; the second
    definition (with a required `frame` argument) silently shadowed the
    first, so getPassRation's one-argument call raised TypeError. The two
    versions are merged below with `frame` as a defaulted parameter.
    """

    def getPageSource(self, baseUrl, frame="main1"):
        """Fetch *baseUrl* with headless Chrome, log in, and parse the page.

        Args:
            baseUrl: URL of the page to fetch.
            frame: frame to switch into before reading the page source
                ("main1" for test-case detail pages, "iframe" for task pages).

        Returns:
            BeautifulSoup object of the rendered page, or None on failure.
        """
        bsObj = None  # BUG FIX: was unbound on failure, breaking the final return
        # Headless (no UI) Chrome.
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')  # BUG FIX: was misspelled '--diable-gpu'
        driver = webdriver.Chrome(chrome_options=chrome_options)

        # Make the window tall enough that the whole grid is rendered at once.
        driver.set_window_size(480, 30000)
        driver.set_script_timeout(30)
        driver.set_page_load_timeout(30)
        try:
            driver.get(baseUrl)
            # Log in with the hard-coded account.
            # NOTE(review): credentials should live in configuration, not source.
            userName = driver.find_element_by_name('userName')
            userName.send_keys('l84102264')
            password = driver.find_element_by_name('password')
            password.send_keys('233*Dgg666')
            BtmLogin = driver.find_element_by_class_name('btn-purple')
            BtmLogin.send_keys(Keys.ENTER)

            # Switch into the frame that contains the report data.
            driver.implicitly_wait(3)
            driver.switch_to.frame(frame)
            driver.implicitly_wait(3)

            # Give the page's JavaScript time to finish rendering.
            time.sleep(5)

            htm_next = driver.page_source
            print("parsing .html,yield bsObj")
            bsObj = BeautifulSoup(htm_next, 'lxml')
        except Exception as e:
            print(e)
            print("Failed to get bsObj from: %s" % baseUrl)
        finally:
            # BUG FIX: quit() terminates the chromedriver process too;
            # close() only closed the window and leaked the process.
            driver.quit()
        return bsObj

    def getPassRation(self, tcIndex_link_set):
        """Compute the pass ratio of one test case.

        Args:
            tcIndex_link_set: (tcIndex, link) pair — the case's row index
                and its detail-page URL.

        Returns:
            (tcIndex, ratio): ratio of rows whose result is 'Pass', rounded
            to two decimals; 0.0 when the page could not be parsed.
        """
        tcIndex = tcIndex_link_set[0]
        link = tcIndex_link_set[1]
        count = 0
        ratio = 0.0
        # Rows whose Execution Result cell is missing are excluded.
        exceptCount = 0
        # BUG FIX: detail pages live in the "main1" frame; the old
        # one-argument call hit the shadowing two-argument method.
        bsObj = self.getPageSource(link, "main1")
        # Column 5 of each row holds the execution result.
        targetResIdPattern = re.compile(r'testcaseDetailInfoViewList_[0-9]*_5')

        try:
            data = bsObj.find('tbody', {'id': 'testcaseDetailInfoViewList_databody'})
            for i in range(len(data)):
                ids = 'testcaseDetailInfoViewList' + '_' + str(i)
                dataItem = bsObj.find('tr', {'id': re.compile(ids)})
                try:
                    target = dataItem.find('td', {'id': targetResIdPattern})
                    if target.get_text().strip() == 'Pass':
                        count = count + 1
                except Exception:
                    exceptCount = exceptCount + 1
                    break

            ratio = float(count) / (len(data) - exceptCount)
            ratio = float('%.2f' % ratio)  # keep two decimal places
        except Exception:
            print("bsObj findERRO")

        return (tcIndex, ratio)

    def parseLink(self, taskLinkList):
        """Collect test-case detail links from task pages, grouped by suite.

        Args:
            taskLinkList: task-report URLs ordered newest first.

        Returns:
            dict mapping suite name -> list of test-case links. When a suite
            appears in several tasks, only the newest task's links are kept
            (older tasks' data is already contained in the newer).
        """
        print("parsing link from web ...")
        testSuitLinkTotalDir = {}
        for n in range(len(taskLinkList)):
            print("===============================================================================")
            print("task-%i" % n)
            # Task pages render the grid inside the "iframe" frame.
            bsObj = self.getPageSource(taskLinkList[n], "iframe")
            data = bsObj.find("div", {"id": "ext-gen91"})
            body = data.find_all("div", {"class": "x-grid-group-body"})
            testSuitLinkDir = {}
            for i in range(len(body)):
                suit_body = body[i].div.find("a", {"target": "_blank"})
                suit_name = suit_body.get_text()
                print("-----------------------------------------------------------------------------------")
                print("task-suit-%i: task%i-%s " % (i, n, suit_name))
                # Skip suites already collected from a more recent task.
                if suit_name not in testSuitLinkTotalDir:
                    links_body = body[i].find_all("div", {"class": 'x-grid3-cell-inner x-grid3-col-2'})
                    linkList = []
                    for j in range(1, len(links_body)):
                        # hrefs are relative ("./..."); prefix the server root.
                        link = links_body[j].a.attrs["href"]
                        link = 'http://10.179.142.230:8080/ilog' + link[1:]
                        linkList.append(link)
                    if suit_name in testSuitLinkDir:
                        testSuitLinkDir[suit_name].extend(linkList)
                    else:
                        testSuitLinkDir[suit_name] = linkList
                    print("linkList count: %i" % len(linkList))

            print("testSuitLinkDir count: %i" % len(testSuitLinkDir))
            testSuitLinkTotalDir.update(testSuitLinkDir)
        print("testSuitLinkTotalDir count: %i" % len(testSuitLinkTotalDir))
        return testSuitLinkTotalDir

    def getTestCaseData(self, testSuitLinkTotalDir):
        """Compute pass ratios for every test-case link of every suite.

        NOTE(review): the original body was a copy-paste of getPassRation
        that referenced the undefined name `tcIndex_link_set` and raised
        NameError on entry; it has been rebuilt to match this docstring.

        Args:
            testSuitLinkTotalDir: dict of suite name -> list of test-case
                links, as produced by parseLink().

        Returns:
            dict of suite name -> list of (tcIndex, ratio) tuples, where
            tcIndex is the link's position within its suite.
        """
        suitRatioDir = {}
        for suit_name in testSuitLinkTotalDir:
            linkList = testSuitLinkTotalDir[suit_name]
            ratioList = []
            for tcIndex in range(len(linkList)):
                ratioList.append(self.getPassRation((tcIndex, linkList[tcIndex])))
            suitRatioDir[suit_name] = ratioList
        return suitRatioDir



if __name__ == '__main__':

    # Task-report pages ordered by time; index 0 is the most recent task.
    taskLinkList = [
        'http://10.179.142.230:8080/ilog/business.action?BMEBusiness=TaskInfoDetail&targetstep=viewReportByUser&taskId=8a3aa20863af12240164782eaa191de0&W3=',
        'http://10.179.142.230:8080/ilog/business.action?BMEBusiness=TaskInfoDetail&targetstep=viewReportByUser&taskId=8a3aa20863af1224016477bd42f01dd3&W3=',
        'http://10.179.142.230:8080/ilog/business.action?BMEBusiness=TaskInfoDetail&targetstep=viewReportByUser&taskId=8a3aa20863af1224016477bd42f01dd3&W3=',
    ]
    # Collect suite -> test-case-link mapping from the task pages.
    scraper = ScrapData()
    scraper.parseLink(taskLinkList)
    
View Code

4. write2Excel.py

# -*- coding:utf-8 -*-

import openpyxl
import os
from openpyxl.styles import PatternFill

def writeRatioDate(DATA_SET, hightlightNum, fileUrl = 'Intelligent_analysis.xlsx'):
    """Write pass/fail ratios into the workbook and highlight top rows.

    Args:
        DATA_SET: iterable of (tcIndex, ratio) pairs; tcIndex is the
            0-based row index, ratio the pass ratio in [0, 1].
        hightlightNum: number of highest-ratio rows to highlight.
        fileUrl: path of an existing .xlsx workbook to update.
    """
    # NOTE(review): assert is stripped under `python -O`; kept so the
    # exception type callers may catch stays the same.
    assert os.path.exists(fileUrl)
    wb = openpyxl.load_workbook(fileUrl)
    # wb.active replaces the deprecated/removed get_active_sheet().
    sheet = wb.active

    for tcIndex_ratio_set in DATA_SET:
        # Sheet rows are 1-based while tcIndex is 0-based.
        tcIndex = int(tcIndex_ratio_set[0]) + 1
        ratio = tcIndex_ratio_set[1]
        # Pass ratio goes to column U, fail ratio to column V.
        sheet["U%i" % tcIndex] = ratio
        sheet["V%i" % tcIndex] = 1-ratio
    # PERF FIX: save once after the loop; the original saved the whole
    # workbook on every iteration (one full-file write per row).
    wb.save(fileUrl)
    print("Write ratioData to \'Intelligent_analysis.xlsx\' successfully!")

    # Highlight the hightlightNum rows with the highest pass ratios.
    DATA_List = list(DATA_SET)
    DATA_List.sort(key=lambda x: x[1], reverse=True)
    fill = PatternFill(fill_type='solid', fgColor="FFB6C1") # light pink (EE82EE also works)
    for item in DATA_List[:hightlightNum]:
        index = int(item[0]) + 1
        # Paint columns A..U (1..21) of the row.
        for i in range(1,22):
            sheet.cell(row=index, column=i).fill = fill
    wb.save(fileUrl)
    print("Add hightlight successfully!")

    wb.close()


if __name__ == '__main__':

    # Quick manual check of the highlight logic: paint rows 1-2,
    # columns A-U, then persist the change.
    wb = openpyxl.load_workbook('Intelligent_analysis.xlsx')
    # wb.active replaces the deprecated/removed get_active_sheet().
    sheet = wb.active
    fill = PatternFill(fill_type='solid', fgColor="EE82EE")
    for j in range(1,3):
        for i in range(1,22):
            sheet.cell(row=j, column=i).fill = fill
    # BUG FIX: the original never saved the workbook, so the fills
    # were silently discarded.
    wb.save('Intelligent_analysis.xlsx')
    wb.close()
    
    
View Code
posted @ 2018-08-27 09:14  Charlie-OwO  阅读(187)  评论(0编辑  收藏  举报