# lxml and pyquery examples: scraping card images
import requests
from pyquery import PyQuery as pq
import json
import jsonpath
from lxml import etree
import os
# Static sample markup. It is never used: `html` is rebound to the downloaded
# page immediately after this literal.
html = '''
<div>
<ul>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
# Download the target page when the module is loaded and decode the raw
# response bytes as GBK. Both scraping functions read this module-level `html`.
html = requests.get('http://news.4399.com/gonglue/lscs/kptj/').content.decode('gbk')
# Running counter shared by both scraping functions through `global num`;
# it becomes the zero-padded numeric prefix of each saved file name.
num = 0
def pq方法(out_dir='e:/py3/002/', timeout=10):
    """Download every image listed on the page, located with pyquery.

    Selects each ``#dq_list > li`` element of the module-level ``html``,
    reads the image URL from the ``img`` tag's ``lz_src`` attribute and the
    name from the ``.kp-name`` element, then saves the image as
    ``<4-digit counter><name>.jpg`` inside ``out_dir``.

    Args:
        out_dir: Directory the images are written to; created if missing.
        timeout: Seconds to wait for each image request before giving up.

    Side effects:
        Increments the module-level ``num`` once per list item, performs one
        HTTP request per image, prints progress, and writes files to disk.
        Items without an image URL, or whose download fails, are reported
        and skipped instead of aborting the whole run.
    """
    global num
    # Without this, open() fails with FileNotFoundError on a fresh machine.
    os.makedirs(out_dir, exist_ok=True)
    doc = pq(html)
    for item in doc('#dq_list > li').items():
        url = item.find('img').attr('lz_src')
        name = item.find('.kp-name').text()
        # Count every list item, even skipped ones, so the numeric prefix
        # keeps matching the item's position on the page.
        num += 1
        print(str(num), url)
        if not url:
            # attr() gives None when the attribute (or the img) is absent;
            # passing that to requests.get() would raise and stop the loop.
            print('skipped: no lz_src for', name)
            continue
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            # One broken image must not stop the remaining downloads, and an
            # HTTP error page must not be saved as a .jpg.
            print('skipped:', url, exc)
            continue
        filename = '{:0>4}'.format(str(num)) + name + '.jpg'
        with open(os.path.join(out_dir, filename), 'wb') as file:
            file.write(response.content)
def lxml方法(out_dir='e:/py3/003/', timeout=10):
    """Download every image listed on the page, located with lxml XPath.

    Parses the module-level ``html``, takes each ``li/a`` element under
    ``ul#dq_list`` inside ``div.box10-content``, reads the name from the
    anchor's ``div`` text and the image URL from its ``img/@lz_src``, then
    saves the image as ``<4-digit counter>_<name>.jpg`` inside ``out_dir``.

    Args:
        out_dir: Directory the images are written to; created if missing.
        timeout: Seconds to wait for each image request before giving up.

    Side effects:
        Prints the whole downloaded page, increments the module-level
        ``num`` once per anchor, performs one HTTP request per image, and
        writes files to disk. Entries missing a name or image URL, or whose
        download fails, are reported and skipped instead of aborting.
    """
    global num
    # Debug aid kept from the original: echoes the downloaded page.
    print(html)
    # Without this, open() fails with FileNotFoundError on a fresh machine.
    os.makedirs(out_dir, exist_ok=True)
    tree = etree.HTML(html)
    anchors = tree.xpath("//div[@class='box10-content']//ul[@id='dq_list']/li/a")
    for anchor in anchors:
        names = anchor.xpath("./div/text()")
        sources = anchor.xpath("./img/@lz_src")
        # Count every entry, even skipped ones, so the numeric prefix keeps
        # matching the entry's position on the page.
        num += 1
        if not names or not sources:
            # xpath() returns an empty list when nothing matches; indexing
            # it with [0] would raise IndexError and stop the loop.
            print('skipped: incomplete entry #{}'.format(num))
            continue
        kpname, lzsrc = names[0], sources[0]
        print(kpname, lzsrc)
        try:
            response = requests.get(lzsrc, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            # One broken image must not stop the remaining downloads, and an
            # HTTP error page must not be saved as a .jpg.
            print('skipped:', lzsrc, exc)
            continue
        filename = '{:0>4}'.format(str(num)) + '_' + kpname + '.jpg'
        with open(os.path.join(out_dir, filename), 'wb') as file:
            file.write(response.content)
if __name__ == "__main__":
    # Script entry point: run the lxml-based scraper. Call pq方法() here
    # instead to use the pyquery implementation.
    lxml方法()
# Create directories
# Disabled helper kept inside a bare string literal, so it never runs. If
# enabled, it would loop over 1..99, build e:\py3\001\ ... e:\py3\099\, print
# each path, and create the directory unless it already exists (in which
# case it prints an "already exists" message instead).
'''
for dirnum in range(1,100):
dirnum2='{:0>3}'.format(str(dirnum))
mkpath="e:\\py3\\{}\\".format(dirnum2)
print(mkpath)
print('已存在!') if os.path.exists(mkpath) else os.makedirs(mkpath)
'''
# Disabled BeautifulSoup scratch notes, kept inside a bare string literal so
# they never run.
# NOTE(review): `html_doc` is not defined anywhere in the visible file, so
# this snippet would need adjusting before it could be enabled.
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc)
print(soup.prettify()) # 结构化输出文档
print(soup.title) # 获取title标签
print(soup.title.name) # 获取title标签名称
print(soup.title.parent.name)
print(soup.p['class'])
'''