Python 例子:使用 PyQuery 抓取信息

#!/usr/bin/python
#coding:utf-8

from pyquery import PyQuery
import re

# 抓取:http://www.stylebop.com/cn/product_details.php?id=606526&special=sale
# 获得   产品名 品牌 价格 size  图片(大图)
def main():
    pqhtml = PyQuery(url = 'http://www.stylebop.com/cn/product_details.php?id=606526&special=sale')
    #产品图片:
    img_li = pqhtml('li').filter('.image_click_rotator')
    pattern_img = re.compile(".*?'(.*?jpg)'.*?'.*?'.*?'.*?'.*?'(.*?jpg)'.*?")
    img_list = []
    for li in img_li:
        #div = li.getchildren()[0]
        #a = div.getchildren()[0]
        href = li.getchildren()[0].getchildren()[0].get('href')
        items = re.findall(pattern_img,href)
        img_large = list(items[0])[1]
        if img_large[0:4] != 'http' :
            img_large = 'http://www.stylebop.com%s' %img_large
        img_list.append(img_large)
    print '产品图片:' , img_list

    #产品品牌:
    brand = pqhtml('div').filter('.productInfo')('a:first').text()
    print '品牌:%s' %brand

    #价格
    price_div = pqhtml('div').filter('#product_price')  #根据ID获取价格的div
    price_first_span = price_div('span:first') #获取第一个span
    old_price = ''
    new_price = ''
    if price_first_span.hasClass('old_price'):
        old_price = price_first_span.text
        new_price = price_div('span:eq(1)').text() + ' / ' + price_div('span:eq(3)').text()
    else:
        new_price = price_div.text() + ' / ' + price_div('span:first').text
    print '价格:' , new_price
    #print '价格:%s' % new_price #这样打印会报编码错误:'ascii' codec can't encode character u'\u20ac' in position 21: ordinal not in range(128)

    #size
    size_option = pqhtml('select').filter('.newInput2')('option')
    size_list = []
    for size in size_option:        #为HTMLElement对象
        size_list.append(size.text)
    print 'size:', size_list

    #产品名:
    pname = pqhtml('div').filter('.productInfo')('span:first').text()
    print '产品名:%s' % pname

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

 

posted @ 2015-10-20 12:07  超超xc  Views(515)  Comments(0)  Edit  收藏  举报
I suppose,were childrenonec.