Python 通过lxml遍历html xpath
#coding:utf-8 ''' Created on 2017年10月9日 @author: li.liu ''' from selenium import webdriver from lxml import etree import urllib import urllib2 import time #url='http://www.woyihome.com' url='http://sso.woyihome.com/sso/pc-login' #url='http://www.baidu.com' user_agent='Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36' values = {'name' : 'WHY', 'location' : 'SDU', 'language' : 'Python' } headers = { 'User-Agent' : user_agent } data = urllib.urlencode(values) req = urllib2.Request(url, data, headers) response = urllib2.urlopen(req) html1= response.read().encode('utf-8') def test1(): x1={} #html1=urllib.urlopen(url).read().decode('utf-8') #print html1 hxml=etree.HTML(html1) #print hxml htree=etree.ElementTree(hxml) #print htree id_dite=htree.xpath('//*[@id]') #print id_dite coun=0 for id_items in id_dite: #print id_items.items() #print htree.getpath(id_items) for id_item in id_items.items(): #print id_item if id_item[0]=='id': id_str='//*[@id="'+id_item[1]+'"]' x1[id_str]=[] #print id_str id_path=htree.getpath(htree.xpath(id_str)[0]) #print id_path id_str1=id_str+'//*' idelem_list=htree.xpath(id_str1) #print idelem_list for e in idelem_list: if len(e.items())==0: pass else: e_path=htree.getpath(e) #print e_path e_path1=e_path.split(id_path) #print e_path1[1] if len(e_path1)>1: e_str=id_str+e_path1[1] e_list=e_str.split('/') if 'li' in e_list[len(e_list)-1] or 'ul' in e_list[len(e_list)-1] or 'span' in e_list[len(e_list)-1]: pass else: #print e_str coun+=1 x1[id_str].append(e_str) ''' for i in x1: #print i for i1 in x1[i]: print i1 ''' a=0 b=0 driver=webdriver.Chrome() driver.get(url) #print driver.title for i in x1: #print i for i1 in x1[i]: #print i1 try: d=driver.find_element_by_xpath(i1) a+=1 print d.text time.sleep(2) driver.find_element_by_xpath(i1).click() headx=driver.window_handles #print headx print '当前页面地址:\n',driver.current_url time.sleep(1) print i,'\n' if len(headx)!=1: driver.switch_to_window(headx[1]) durl= driver.current_url print '当前页面地址:\n',durl,'\n' if 'woyihome' in durl: driver.close() driver.switch_to_window(headx[0]) else: k=1 break elif 'localhost' in driver.current_url: print a except : pass #print b print a #driver.quit() #print '====================================================' print coun test1()