Python 通过lxml遍历html xpath

#coding:utf-8
'''
Created on 2017年10月9日

@author: li.liu
'''
from selenium import webdriver
from lxml import etree
import urllib
import urllib2
import time

# Page under test: the SSO login form. The author also left two alternative
# targets commented out: http://www.woyihome.com and http://www.baidu.com.
url = 'http://sso.woyihome.com/sso/pc-login'
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'
# Form fields sent with the request.
# NOTE(review): these look like placeholder values rather than real login
# fields -- confirm against the target form.
values = {
    'name': 'WHY',
    'location': 'SDU',
    'language': 'Python',
}

headers = {'User-Agent': user_agent}
data = urllib.urlencode(values)
# Supplying ``data`` makes urllib2 issue a POST rather than a GET.
req = urllib2.Request(url, data, headers)
response = urllib2.urlopen(req)
try:
    # Keep the body as the raw bytes the server sent.  Calling
    # ``.encode('utf-8')`` on a Python 2 byte string first decodes it as
    # ASCII, which raises UnicodeDecodeError on any non-ASCII page.  For a
    # pure-ASCII page the bytes are identical to what ``.encode`` returned.
    html1 = response.read()
finally:
    # Release the HTTP connection even if reading fails.
    response.close()

def _collect_xpaths(page_html):
    """Index clickable-candidate XPaths by the id-bearing element above them.

    Every element carrying an ``id`` attribute becomes an anchor expressed as
    ``//*[@id="<value>"]``.  For each anchor, every descendant element that
    has at least one attribute is recorded as the anchor expression followed
    by the descendant's structural path relative to the anchor.

    Args:
        page_html: HTML markup of the page, as accepted by ``etree.HTML``.

    Returns:
        A ``(xpaths_by_id, total)`` tuple: a dict mapping each anchor XPath to
        the list of descendant XPaths, and the total number of descendant
        XPaths recorded across all anchors.
    """
    # NOTE(review): this is a substring test on the last path step, so it
    # also drops tags that merely contain these letters (e.g. ``link``).
    # Kept as in the original -- confirm whether exact tag names were meant.
    skipped_fragments = ('li', 'ul', 'span')

    root = etree.HTML(page_html)
    if root is None:
        # Defensive: no root element means there is nothing to index.
        return {}, 0
    tree = etree.ElementTree(root)

    xpaths_by_id = {}
    total = 0
    for anchor in tree.xpath('//*[@id]'):
        anchor_xpath = '//*[@id="' + anchor.get('id') + '"]'
        if anchor_xpath in xpaths_by_id:
            # A repeated id would rebuild the same list and count its
            # entries a second time; only the first occurrence is indexed.
            continue
        found = xpaths_by_id[anchor_xpath] = []
        anchor_path = tree.getpath(anchor)
        # Query relative to the anchor element itself instead of re-running a
        # string-built XPath, which breaks when the id contains a quote.
        for elem in anchor.xpath('.//*'):
            if not elem.items():
                # Elements without any attribute are not recorded.
                continue
            # A descendant's structural path extends the anchor's path, so
            # slicing off that prefix leaves the relative part ("/div/a[2]").
            suffix = tree.getpath(elem)[len(anchor_path):]
            last_step = suffix.rsplit('/', 1)[-1]
            if any(fragment in last_step for fragment in skipped_fragments):
                continue
            found.append(anchor_xpath + suffix)
        total += len(found)
    return xpaths_by_id, total


def test1():
    """Click through every collected XPath of the page in a Chrome session.

    Builds the XPath index from the module-level ``html1``, opens ``url`` in
    Chrome and, for each XPath, prints the element text, clicks the element
    and prints the resulting page address.  When the click opened a second
    window, that window is closed again if its address contains 'woyihome';
    otherwise the remaining XPaths of the current anchor are abandoned.
    Finally prints the number of elements that were located and the number
    of XPaths that were collected.

    The prints use the single-argument ``print(...)`` form, which parses
    under both Python 2 and Python 3 and produces the same output as the
    original print statements.

    NOTE: the browser is intentionally left open afterwards; the original
    author had ``driver.quit()`` commented out.
    """
    x1, coun = _collect_xpaths(html1)

    a = 0  # number of elements successfully located
    driver = webdriver.Chrome()
    driver.get(url)
    for i in x1:
        for i1 in x1[i]:
            try:
                d = driver.find_element_by_xpath(i1)
                a += 1
                print(d.text)
                time.sleep(2)
                # Look the element up again after the pause before clicking.
                driver.find_element_by_xpath(i1).click()
                headx = driver.window_handles
                print('当前页面地址:')
                print(driver.current_url)
                time.sleep(1)
                print('%s \n' % i)
                if len(headx) != 1:
                    # The click opened another window: inspect it.
                    driver.switch_to_window(headx[1])
                    durl = driver.current_url
                    print('当前页面地址:')
                    print('%s \n' % durl)
                    if 'woyihome' in durl:
                        driver.close()
                        driver.switch_to_window(headx[0])
                    else:
                        # NOTE(review): leaves the driver focused on the
                        # foreign window and only exits the inner loop --
                        # confirm this is the intended behaviour.
                        break
                elif 'localhost' in driver.current_url:
                    print(a)
            except Exception:
                # Best effort: elements that cannot be found or clicked are
                # skipped.  Unlike a bare ``except``, this still lets
                # KeyboardInterrupt/SystemExit stop the run.
                continue
    print(a)
    print(coun)
                
                
                
                
            
            
            
            
# Module-level call: the crawl-and-click check runs as soon as this file is executed.
test1()

 

posted @ 2017-12-18 15:57  LLSix  阅读(4156)  评论(0)  编辑  收藏  举报