PyQuery查询html信息

以下代码主要演示如何使用 PyQuery 解析 HTML 文件，包括设定编码、对子块进行查询等操作：

from pyquery import PyQuery as pq
import os
from lxml.html import HTMLParser, fromstring

def getHouseInfoFromPage(page):
    """Parse a saved house-listing HTML page into a ``HouseinfoItem``.

    The file is read as UTF-8, parsed with lxml using an explicit UTF-8
    ``HTMLParser``, and queried through pyquery CSS selectors.

    ``HouseinfoItem`` and ``isNumber`` are not defined in this block and
    must be available in the enclosing module.

    Args:
        page: Path of the HTML file to parse. Its base name is stored as
            ``link`` and the part before the first ``.`` as ``houseId``.

    Returns:
        The populated ``HouseinfoItem``, or ``None`` when the page has no
        contact person name (the ``.person-name`` text is empty).
    """
    houseInfo = HouseinfoItem()
    # Parser with an explicit encoding, so pyquery works on a UTF-8 tree.
    utf8_parser = HTMLParser(encoding='utf-8')
    with open(page, encoding='utf-8') as filehandler:
        file_contents = filehandler.read()
    doc = pq(fromstring(file_contents, parser=utf8_parser))

    # Contact card div.
    contactCard = doc('.right-border')
    houseInfo.houseType = contactCard('.col-right-tit div.fl').text()
    houseInfo.personName = contactCard('.person-name').text()
    houseInfo.companyName = contactCard('p.company-name').text()
    if houseInfo.personName == '':
        # No contact person found: skip the remaining fields.
        return None

    # Keep the raw price text unless isNumber() accepts it.
    # NOTE(review): isNumber presumably checks that float() would succeed
    # on the text - confirm against its definition.
    houseInfo.price = doc('.basic-info-price').text()
    if isNumber(houseInfo.price):
        houseInfo.price = float(houseInfo.price)

    # Basic information div.
    basicInfo = doc('.basic-info')
    houseInfo.addr = basicInfo('li.with-area a:last').text()
    houseInfo.district = basicInfo('li.with-area a:eq(1)').text()
    # The area is the last '-'-separated piece of the <li> containing "㎡".
    huXing = basicInfo('li:contains("㎡")').text()
    houseInfo.area = huXing.split('-')[-1]

    houseInfo.allocation = basicInfo('.peizhi p').text()
    houseInfo.link = os.path.basename(page)
    houseInfo.summary = doc('.summary-cont').text()

    phoneEle = doc('.talk-btn')
    houseInfo.phone = phoneEle.attr['data-phone']
    houseInfo.houseId = houseInfo.link.split('.')[0]

    # The original fell off the end here and returned None, discarding
    # every field extracted above.
    return houseInfo

 

posted @ 2016-01-25 21:40  silverbullet11  阅读(511)  评论(0)  编辑  收藏  举报