selenium遇到翻页xpath多变怎么解决?

以企查查的翻页为例:如果只靠一个固定的xpath用selenium定位“下一页”按钮,你会发现这个按钮的xpath会随当前页码变化而有很多种,同一个xpath在某些页面上指向的甚至不是“下一页”。这个问题可以通过正则匹配加字符串切割来解决。

首先用正则从页面源码中匹配出分页栏(class="pagination")的HTML,然后按</li>切割,判断“下一页”(>)标签在其中的索引,最后用这个索引组合出对应的xpath。

# Scrape the patent table of the "zhuanlilist" section page by page, appending
# one dict per table row to patent_list, then click the "next page" link.
#
# NOTE: this is a fragment of a larger script; `driver` (a Selenium WebDriver),
# `re` and `patent_list` must already be defined by the surrounding code.
# NOTE(review): `find_element_by_xpath` was removed in Selenium 4.3; on newer
# Selenium use `driver.find_element(By.XPATH, ...)` instead.

# (dictionary key, xpath suffix below the row's <tr>), in table-column order.
patent_columns = (
    ('发明名称', 'td[2]/a'),
    ('专利类型', 'td[3]'),
    ('法律状态', 'td[4]/span'),
    ('申请号', 'td[5]'),
    ('申请日', 'td[6]'),
    ('公开号', 'td[7]'),
    ('公开日期', 'td[8]'),
)

# Presumably the total patent count shown in the section header.
# NOTE(review): "aspan" is not an HTML element name; this xpath may be a typo
# in the original post -- confirm against the live page.
patent_number = driver.find_element_by_xpath('//*[@id="zhuanlilist"]/div[1]/aspan/span[1]').text
patent_page = int(patent_number) // 10   # number of full 10-row pages
patent_pages = int(patent_number) % 10   # rows on the trailing partial page

for page in range(patent_page + 1):  # patent information
    # Every page but the last holds 10 rows; the last holds the remainder.
    row_count = 10 if page != patent_page else patent_pages
    for g in range(2, row_count + 2):  # rows are addressed as tr[2] onward
        patent_dict = {}
        for key, cell in patent_columns:
            cell_xpath = f'//*[@id="zhuanlilist"]/table/tbody/tr[{g}]/{cell}'
            patent_dict[key] = driver.find_element_by_xpath(cell_xpath).text
        patent_list.append(patent_dict)

    # The next-page link's <li> position changes from page to page, so locate
    # it in the page source instead of hard-coding its index.
    text = driver.page_source
    # Index 5 selects the sixth "pagination" <ul> found in the page source.
    pagination_html = re.findall(r'class="pagination">(.*?)</ul>', text)[5]
    page_list = pagination_html.split('</li>')
    # `number` is the 1-based position of each <li>, which is what xpath's
    # li[...] predicate expects.
    for number, pages in enumerate(page_list, start=1):
        if "&gt;" in pages:  # &gt; is the HTML entity for ">"
            xpath49 = f'/html/body/div[5]/div[2]/div/div[5]/section[5]/div[7]/nav/ul/li[{number}]/a'
            driver.find_element_by_xpath(xpath49).click()  # click "next page"
            # Stop after the first match: page_list is stale once the click
            # has been issued, and later items may also contain "&gt;".
            break

 

posted @ 2020-12-15 21:56  Eliphaz  阅读(1085)  评论(0编辑  收藏  举报