lxml 和 pyquery 示例 爬 卡牌

 

 

import requests
from pyquery import PyQuery as pq
import json
import jsonpath
from lxml import etree
import os

# Sample HTML fragment for offline experiments.
# NOTE(review): this value is overwritten by the live page on the very next
# statement, so the sample is never used as the file stands.
html = '''
<div>
    <ul>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>
'''
# Fetch the card-listing page when the module is loaded and decode the raw
# response bytes as GBK. Both scraper functions parse this string.
html = requests.get('http://news.4399.com/gonglue/lscs/kptj/').content.decode('gbk')
# Running counter shared by the scraper functions (via `global num`); it is
# used as the zero-padded numeric prefix of each saved image file.
num = 0
def pq方法():
    """Download every card image listed under ``#dq_list`` using PyQuery.

    Parses the module-level ``html`` page. For each ``<li>`` directly under
    the ``#dq_list`` element, the image URL is read from the ``lz_src``
    attribute of its ``<img>`` and the card name from the ``.kp-name``
    element. Each image is saved to ``e:/py3/002/`` as
    ``<4-digit counter><name>.jpg``.

    Side effects:
        * increments the module-level ``num`` counter once per entry that
          has an image URL;
        * performs one HTTP GET per such entry;
        * prints progress to stdout and writes files to disk.

    Entries without an ``lz_src`` attribute, and downloads that fail or
    return an HTTP error status, are reported and skipped instead of
    aborting the whole run.
    """
    global num
    save_dir = 'e:/py3/002/'
    # open() fails with FileNotFoundError if the target folder is missing.
    os.makedirs(save_dir, exist_ok=True)

    doc = pq(html)
    for item in doc('#dq_list > li').items():
        url = item.find('img').attr('lz_src')
        if not url:
            # attr() gives None when the <img> or its lz_src attribute is
            # absent; requests.get(None) would raise.
            print('skipped: entry without lz_src')
            continue

        num += 1
        print(str(num), url)

        try:
            # A timeout keeps one stalled connection from hanging the run;
            # raise_for_status() prevents saving an error page as a .jpg.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            print('download failed:', url, exc)
            continue

        name = item.find('.kp-name').text()
        target = save_dir + '{:0>4}'.format(str(num)) + name + '.jpg'
        with open(target, 'wb') as file:
            file.write(response.content)

def lxml方法():
    """Download every card image listed under ``#dq_list`` using lxml/XPath.

    Parses the module-level ``html`` page and selects the ``<a>`` elements
    at ``//div[@class='box10-content']//ul[@id='dq_list']/li/a``. For each
    anchor, the card name is the first text node of its child ``<div>`` and
    the image URL is the ``lz_src`` attribute of its child ``<img>``. Each
    image is saved to ``e:/py3/003/`` as ``<4-digit counter>_<name>.jpg``.

    Side effects:
        * prints the whole page source first (debug output kept from the
          original implementation);
        * increments the module-level ``num`` counter once per complete
          entry;
        * performs one HTTP GET per complete entry;
        * prints progress to stdout and writes files to disk.

    Anchors missing the name or the image URL, and downloads that fail or
    return an HTTP error status, are reported and skipped instead of
    aborting the whole run.
    """
    global num
    print(html)

    save_dir = 'e:/py3/003/'
    # open() fails with FileNotFoundError if the target folder is missing.
    os.makedirs(save_dir, exist_ok=True)

    root = etree.HTML(html)
    anchors = root.xpath("//div[@class='box10-content']//ul[@id='dq_list']/li/a")
    for anchor in anchors:
        names = anchor.xpath("./div/text()")
        sources = anchor.xpath("./img/@lz_src")
        if not names or not sources:
            # xpath() returns an empty list when nothing matches; indexing
            # [0] directly would raise IndexError.
            print('skipped: entry without name or lz_src')
            continue

        # Text nodes keep whatever whitespace surrounds them in the markup;
        # strip it so it does not end up in the file name.
        kpname = names[0].strip()
        lzsrc = sources[0]
        num += 1
        print(kpname, lzsrc)

        try:
            # A timeout keeps one stalled connection from hanging the run;
            # raise_for_status() prevents saving an error page as a .jpg.
            response = requests.get(lzsrc, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            print('download failed:', lzsrc, exc)
            continue

        target = save_dir + '{:0>4}'.format(str(num)) + '_' + kpname + '.jpg'
        with open(target, 'wb') as file:
            file.write(response.content)





# Script entry point: runs the lxml-based scraper. The PyQuery variant is
# kept as a commented-out alternative.
if __name__ == '__main__':
    # pq方法()  # PyQuery variant, currently disabled
    lxml方法()

    # Create directories.
    # The bare string below is a disabled snippet (a string expression that
    # is never executed). If enabled it would create e:\py3\001 ... e:\py3\099,
    # printing a message instead for folders that already exist.
    '''
    for dirnum in range(1,100):
        dirnum2='{:0>3}'.format(str(dirnum))
        mkpath="e:\\py3\\{}\\".format(dirnum2)
        print(mkpath)
        print('已存在!') if os.path.exists(mkpath) else os.makedirs(mkpath)
    '''

# Disabled BeautifulSoup example kept as a bare string literal; it is never
# executed. NOTE(review): it refers to `html_doc`, which is not defined in the
# visible part of this file, so it would need adapting before being enabled.
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc)

print(soup.prettify()) # 结构化输出文档
print(soup.title) # 获取title标签
print(soup.title.name) # 获取title标签名称 
print(soup.title.parent.name)
print(soup.p['class'])
'''

 

posted @ 2018-10-28 17:08  快乐多巴胺  阅读(464)  评论(0)  编辑  收藏  举报