XPath
将字符串转换成对象:
- 方式一:
response.xpath("//div[@id='content-list']/div[@class='item']")
- 方式二:
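# 注意: HtmlXPathSelector 是旧接口, 在较新的 Scrapy 中已弃用; 推荐改用 Selector (from scrapy.selector import Selector; Selector(response=response))
from scrapy.selector import HtmlXPathSelector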
hxs = HtmlXPathSelector(response=response)
items = hxs.xpath("//div[@id='content-list']/div[@class='item']")
- 方式三 (使用 lxml, 待补充):
from lxml import etree
html=etree.HTML(r.text)
img_urls=html.xpath('.//img/@src')
查找规则:
//a
//div/a
//a[re:test(@id, "i\d+")]
items = hxs.xpath("//div[@id='content-list']/div[@class='item']")
for item in items:
item.xpath('.//div')
解析:
Selector 对象列表 (SelectorList): xpath('/html/body/ul/li/a/@href')
字符串列表: xpath('/html/body/ul/li/a/@href').extract()
单个值 (第一个匹配项, 没有匹配时返回 None): xpath('//body/ul/li/a/@href').extract_first()
# "//" 代表从整个文档中搜索 (匹配任意层级的节点)
In [1]: response.xpath('//a')
Out[1]:
[<Selector xpath='//a' data='<a href="image1.html">Name: My image 1 <'>,
<Selector xpath='//a' data='<a href="image2.html">Name: My image 2 <'>,
<Selector xpath='//a' data='<a href="image3.html">Name: My image 3 <'>,
<Selector xpath='//a' data='<a href="image4.html">Name: My image 4 <'>,
<Selector xpath='//a' data='<a href="image5.html">Name: My image 5 <'>]
In [2]: response.xpath('//a').extract()
Out[2]:
['<a href="image1.html">Name: My image 1 <br><img src="image1_thumb.jpg"></a>',
'<a href="image2.html">Name: My image 2 <br><img src="image2_thumb.jpg"></a>',
'<a href="image3.html">Name: My image 3 <br><img src="image3_thumb.jpg"></a>',
'<a href="image4.html">Name: My image 4 <br><img src="image4_thumb.jpg"></a>',
'<a href="image5.html">Name: My image 5 <br><img src="image5_thumb.jpg"></a>']
In [3]: response.xpath('//a').extract_first()
Out[3]: '<a href="image1.html">Name: My image 1 <br><img src="image1_thumb.jpg"></a>'
# 找儿子 (直接子节点, 用 "/")
In [9]: response.xpath('//div/a').extract()
Out[9]:
['<a href="image1.html">Name: My image 1 <br><img src="image1_thumb.jpg"></a>',
'<a href="image2.html">Name: My image 2 <br><img src="image2_thumb.jpg"></a>',
'<a href="image3.html">Name: My image 3 <br><img src="image3_thumb.jpg"></a>',
'<a href="image4.html">Name: My image 4 <br><img src="image4_thumb.jpg"></a>',
'<a href="image5.html">Name: My image 5 <br><img src="image5_thumb.jpg"></a>']
# 找子孙 (任意层级的后代节点, 用 "//")
In [13]: response.xpath('//div//img').extract()
Out[13]:
['<img src="image1_thumb.jpg">',
'<img src="image2_thumb.jpg">',
'<img src="image3_thumb.jpg">',
'<img src="image4_thumb.jpg">',
'<img src="image5_thumb.jpg">']
#找内容
response.css('a::text').extract()
response.xpath('//a/text()').extract()
#找属性
response.css('img::attr("src")').extract()
response.xpath('//img/@src').extract()
#设置找不到情况下的默认值
In [27]: response.xpath('//img/@srcsssss').extract_first('not found')
Out[27]: 'not found'
#按照属性查找
response.css('#images').extract()
response.xpath('//*[@id="images"]').extract()
response.xpath('//*[@href="image2.html"]').extract()
#模糊匹配
response.css('*[src*="im"]').extract()
response.xpath('//*[contains(@id,"result")]').extract_first()
#嵌套查询
response.xpath('//div').css('a')
response.xpath('//div').xpath('a') # 等价于 response.xpath('//div').xpath('./a')
response.xpath('//div').xpath('img')
response.xpath('//div').xpath('//img') # 注意: 以 "//" 开头的路径仍然从整个文档搜索, 而不是只在 div 内部; 只查 div 的后代应写成 './/img'
# 正则 (re:test 是 EXSLT 正则扩展; 模式中含有 \d 等反斜杠时, 建议使用原始字符串 r'...' 以避免 Python 的无效转义警告)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/text()').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/@href').extract()
# 带变量的 xpath 规则 (表达式中用 $变量名 占位, 变量的值通过同名关键字参数传入)
response.xpath('//*[@id="images"]').extract_first()
response.xpath('//*[@id=$xxx]',xxx='images').extract_first()
response.xpath('//div[count(a)=$xxx]',xxx=5).extract()