使用 PyQuery 查询 HTML 信息
以下代码主要演示如何使用 PyQuery 解析 HTML 文件,包括设定编码、对子块进行查询等操作:
import os

from lxml.html import HTMLParser, fromstring
from pyquery import PyQuery as pq


def getHouseInfoFromPage(page):
    """Parse a saved house-listing HTML page into a ``HouseinfoItem``.

    The file at ``page`` is read as UTF-8 text, parsed with an lxml HTML
    parser that is explicitly configured for UTF-8, and then queried with
    PyQuery CSS selectors to fill in the item's attributes.

    Args:
        page: Path to the HTML file on disk. Its base name is stored as
            ``link`` and the part before the first ``.`` as ``houseId``.

    Returns:
        The populated ``HouseinfoItem``, or ``None`` when the contact card
        yields an empty person name.

    Note:
        ``HouseinfoItem`` and ``isNumber`` are not defined in this snippet;
        they are assumed to be provided elsewhere in the project.
    """
    houseInfo = HouseinfoItem()

    # Force the encoding PyQuery/lxml uses instead of letting lxml guess it.
    utf8_parser = HTMLParser(encoding='utf-8')

    # Only the read needs the file handle; everything below works on the
    # in-memory document.
    with open(page, encoding='utf-8') as file_handle:
        file_contents = file_handle.read()
    doc = pq(fromstring(file_contents, parser=utf8_parser))

    # Contact-information div.
    contactCard = doc('.right-border')
    houseInfo.houseType = contactCard('.col-right-tit div.fl').text()
    houseInfo.personName = contactCard('.person-name').text()
    houseInfo.companyName = contactCard('p.company-name').text()
    if houseInfo.personName == '':
        # No contact person found: give up on this page.
        return None

    # Keep the raw text unless it is recognised as a number.
    houseInfo.price = doc('.basic-info-price').text()
    if isNumber(houseInfo.price):
        houseInfo.price = float(houseInfo.price)

    # Basic-information div.
    basicInfo = doc('.basic-info')
    houseInfo.addr = basicInfo('li.with-area a:last').text()
    houseInfo.district = basicInfo('li.with-area a:eq(1)').text()
    # The area is the last '-'-separated piece of the <li> text containing "㎡".
    huXing = basicInfo('li:contains("㎡")').text()
    houseInfo.area = huXing.split('-')[-1]
    houseInfo.allocation = basicInfo('.peizhi p').text()

    houseInfo.link = os.path.basename(page)
    houseInfo.summary = doc('.summary-cont').text()

    phoneEle = doc('.talk-btn')
    houseInfo.phone = phoneEle.attr['data-phone']

    # The id is the file name up to its first dot.
    houseInfo.houseId = houseInfo.link.split('.')[0]

    return houseInfo