selenium遇到翻页xpath多变怎么解决?
以企查查的翻页为例,如果只是根据固定的xpath用selenium定位"下一页"按钮,你会发现这个按钮的xpath会随着当前页码变化而有很多种,并且部分xpath定位到的并不是"下一页"。此时有一种方法可以解决,那就是通过正则匹配和字符串切割来动态确定xpath。
首先用正则从页面源码中匹配出分页栏(页数标签)的内容,然后按`</li>`切割,判断"下一页"标签的索引是多少,再组合出对应的xpath。
# Scrape every page of the patent table (#zhuanlilist) into ``patent_list``.
#
# Expects ``driver`` (a Selenium WebDriver already on the company page),
# ``patent_list`` (a list collecting one dict per patent) and ``re`` to be
# defined earlier in the script.

# Result key -> cell location relative to a row of the #zhuanlilist table.
# Order matters: it is the insertion order of every record dict.
_PATENT_COLUMNS = (
    ('发明名称', 'td[2]/a'),
    ('专利类型', 'td[3]'),
    ('法律状态', 'td[4]/span'),
    ('申请号', 'td[5]'),
    ('申请日', 'td[6]'),
    ('公开号', 'td[7]'),
    ('公开日期', 'td[8]'),
)

# The table shows at most this many patents per page.
_PATENT_ROWS_PER_PAGE = 10


def _read_patent_row(driver, row):
    """Return one patent record read from row ``tr[row]`` of #zhuanlilist.

    ``row`` is the 1-based XPath row index. Raises whatever
    ``find_element_by_xpath`` raises when a cell is missing.
    """
    return {
        key: driver.find_element_by_xpath(
            f'//*[@id="zhuanlilist"]/table/tbody/tr[{row}]/{cell}'
        ).text
        for key, cell in _PATENT_COLUMNS
    }


def _next_page_item_index(pagination_html):
    """Return the 0-based index of the ``<li>`` holding the ">" link.

    ``pagination_html`` is the inner HTML of a ``<ul class="pagination">``.
    Each ``</li>``-separated fragment is reduced to its visible label, so
    only the item whose label is exactly ">" matches. In serialized page
    source that character appears as the entity ``&gt;``; a bare ">" test
    would match every fragment because all of them contain tags.
    Returns ``None`` when there is no such item.
    """
    for index, item in enumerate(pagination_html.split('</li>')):
        label = re.sub(r'<[^>]*>', '', item).strip()
        if label in ('&gt;', '>'):
            return index
    return None


# Total number of patents, as displayed above the table.
# NOTE(review): the "aspan" step is kept verbatim from the original; it does
# not look like a real element name - confirm against the live page.
patent_number = driver.find_element_by_xpath('//*[@id="zhuanlilist"]/div[1]/aspan/span[1]').text

# patent_page: number of completely filled pages.
# patent_pages: number of rows on the trailing, partially filled page.
patent_page, patent_pages = divmod(int(patent_number), _PATENT_ROWS_PER_PAGE)
total_pages = patent_page + (1 if patent_pages else 0)

for page in range(total_pages):  # patent information
    rows_on_page = _PATENT_ROWS_PER_PAGE if page < patent_page else patent_pages

    # Rows are read starting at tr[2], as in the original loop bounds
    # (presumably tr[1] is the header row - TODO confirm).
    for g in range(2, rows_on_page + 2):
        patent_list.append(_read_patent_row(driver, g))

    # There is no next page to open after the last one.
    if page == total_pages - 1:
        break

    # The position of the ">" item inside the pagination bar changes with the
    # current page, so locate it in the page source instead of hard-coding it.
    # Index 5 selects the sixth pagination bar found in the page source.
    pagination_html = re.findall(r'class="pagination">(.*?)</ul>', driver.page_source)[5]
    number = _next_page_item_index(pagination_html)
    if number is None:
        raise RuntimeError(
            f'next-page link not found on patent page {page + 1} of {total_pages}'
        )

    # XPath positions are 1-based, hence number + 1.
    xpath49 = f'/html/body/div[5]/div[2]/div/div[5]/section[5]/div[7]/nav/ul/li[{number+1}]/a'
    # NOTE(review): no explicit wait follows the click; confirm the table has
    # refreshed before the next iteration reads it.
    driver.find_element_by_xpath(xpath49).click()  # go to the next page