# lxml练习 (lxml practice)
# Scrape images from Baidu Tieba: walk a thread-listing page, open each
# linked thread, and save every post image found there to the current
# working directory.

import posixpath
from urllib.parse import urlsplit

import requests
from lxml import etree

# Browser-like User-Agent sent with every request. Defined once so the three
# functions below cannot drift apart.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/61.0.3163.100 Safari/537.36"
    )
}

# Seconds to wait on a request; without a timeout requests.get() can block
# indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 10


def _xpath_from_url(url, expression):
    """Fetch *url*, parse the body as HTML and return the XPath matches.

    Returns an empty list when the response body cannot be parsed into a
    tree, so callers can always iterate over the result.
    """
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    tree = etree.HTML(response.text)
    # NOTE(review): etree.HTML appears to return None for an empty document;
    # guard so the .xpath call below does not fail with AttributeError.
    if tree is None:
        return []
    return tree.xpath(expression)


def loadpage(url):
    """Process every thread linked from the listing page at *url*.

    Each href found under a ``threadlist_lz clearfix`` div is prefixed with
    the Tieba host and handed to :func:`loadimage`.
    """
    thread_links = _xpath_from_url(
        url, '//div[@class="threadlist_lz clearfix"]//a/@href'
    )
    for thread_link in thread_links:
        fulllink = "http://tieba.baidu.com" + thread_link
        # The original called loadpage() here, recursing on the thread URL
        # as if it were another listing and never reaching the image step.
        loadimage(fulllink)


def loadimage(link):
    """Download every ``BDE_Image`` image referenced by the page at *link*."""
    image_links = _xpath_from_url(link, '//img[@class="BDE_Image"]/@src')
    # Use a distinct loop name so the ``link`` parameter is not shadowed.
    for image_link in image_links:
        writeimage(image_link)


def writeimage(link):
    """Fetch the image at *link* and write it to the current directory.

    The file is named after the last segment of the URL path (query string
    ignored). If the path has no final segment, the last five characters of
    the URL are used, which was the original naming rule.
    """
    response = requests.get(link, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    # The original ``link[-5:]`` kept a single character before the
    # extension, so most downloads overwrote one another.
    filename = posixpath.basename(urlsplit(link).path) or link[-5:]
    with open(filename, "wb") as image_file:
        # Binary mode needs bytes: use .content, not the decoded .text.
        image_file.write(response.content)