Xpath数据解析
好段子网内容爬取
# XPath data parsing: scrape image URLs from the haoduanzi.com front page.
from lxml import etree
import requests

url = 'http://www.haoduanzi.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
}
# Fetch the page source; a timeout keeps a dead server from hanging the script.
url_content = requests.get(url=url, headers=headers, timeout=10).text
tree = etree.HTML(url_content)
# xpath() returns a list; the [2:-2] slice drops the leading/trailing
# non-content divs of the #main container.
div_list = tree.xpath('//div[@id="main"]/div')[2:-2]
# Collect the first <img src> of each content div (comprehension replaces
# the original manual append loop).
ur_list = [div.xpath('./div/img/@src')[0] for div in div_list]
print(ur_list)
煎蛋网图片爬取
# Scrape image URLs from jandan.net/ooxx. The page hides each image location
# as a base64-encoded, scheme-less URL inside <span class="img-hash">.
from lxml import etree
import requests
import base64

url = "http://jandan.net/ooxx"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
}
# Fetch the page source; a timeout keeps a dead server from hanging the script.
url_content = requests.get(url=url, headers=headers, timeout=10).text
tree = etree.HTML(url_content)
img_list = tree.xpath('//span[@class="img-hash"]/text()')
# Decode each hash and prepend the scheme to get a usable URL
# (comprehension replaces the original manual append loop).
img_so_list = ["http:" + base64.b64decode(imgcode).decode() for imgcode in img_list]
# Bug fix: the original ended with a bare `img_so_list` expression — a
# Jupyter-notebook leftover that does nothing in a plain script.
print(img_so_list)
站长素材中进行免费简历模板的下载
# Download free resume templates from sc.chinaz.com: walk the listing pages,
# open each resume's detail page, pick one of its mirror links at random,
# and save the archive locally.
from lxml import etree
import requests
import random

url = 'http://sc.chinaz.com/jianli/free_%d.html'
headers = {
    # Bug fix: the original dict was missing the comma between these two
    # entries, which made the file a SyntaxError (implicit string
    # concatenation followed by a stray ':').
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
    'Connection': 'close',
}

for pageNum in range(1, 3):
    # Page 1 has its own URL; subsequent pages follow the free_%d pattern.
    # (The redundant format() wrapper around `url % pageNum` was dropped.)
    if pageNum == 1:
        pageUrl = 'http://sc.chinaz.com/jianli/free.html'
    else:
        pageUrl = url % pageNum
    response = requests.get(url=pageUrl, headers=headers)
    # Force the declared encoding so the Chinese titles decode correctly.
    response.encoding = 'utf-8'
    page_text = response.text
    # Parse the listing page: each div holds a resume title and detail URL.
    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@id="container"]/div')
    for div in div_list:
        # Mirror download links (up to 12) for one resume.
        download_list = []

        detail_url = div.xpath('./p/a/@href')[0]
        title = div.xpath('./p/a/text()')[0]

        # Fetch the detail page; an etree object holds a single document,
        # so build a fresh tree rather than rebinding the listing tree.
        detail_text = requests.get(url=detail_url, headers=headers).text
        detail_tree = etree.HTML(detail_text)
        li_list = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul[@class="clearfix"]/li')
        for li in li_list:
            download_list.append(li.xpath('./a/@href')[0])
        # Pick one mirror at random to spread the load across channels.
        download_data_url = random.choice(download_list)
        # Download the archive bytes and save them under the resume title.
        data = requests.get(url=download_data_url, headers=headers).content
        data_path = title + '.rar'
        with open(data_path, 'wb') as fp:
            fp.write(data)
        print(data_path + '下载成功')