Web Scraping Basics (Part 2)
Parsing data with XPath
Environment setup:
pip install lxml
How the parsing works:
1. Fetch the page's HTML source.
2. Instantiate an etree object and load the page source into it.
3. Call the object's xpath method to locate the specified tags.
Note: the xpath method must be given an XPath expression to locate tags and capture their content (a minimal sketch follows).
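A minimal sketch of the three steps above (https://example.com and the //h1 expression are placeholders for illustration):

import requests
from lxml import etree

# 1. Fetch the page source
page_text = requests.get('https://example.com').text
# 2. Load the source into an etree object
tree = etree.HTML(page_text)
# 3. Locate tags via an XPath expression; xpath() always returns a list
print(tree.xpath('//h1/text()'))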
Examples:
# Requirement: scrape second-hand housing data from 58.com
import requests
from lxml import etree

url = 'https://bj.58.com/shahe/ershoufang/?PGTID=0d30000c-0047-e1d9-5baf-47db5897c065&ClickID=1'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
}

page_text = requests.get(url=url, headers=headers).text

tree = etree.HTML(page_text)
li_list = tree.xpath("//ul[@class='house-list-wrap']/li")
fp = open('58.csv', 'w', encoding='utf-8')
for li in li_list:
    # './' makes the expression relative to the current li element
    title = li.xpath('./div[2]/h2/a/text()')[0]
    price = "".join(li.xpath('./div[3]//text()'))
    fp.write(title + ':' + price + '\n')
fp.close()
print("over!")
# Scrape image data from http://pic.netbian.com/4kmeinv/
import requests
import urllib.request
import os
from lxml import etree

url = 'http://pic.netbian.com/4kmeinv/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
# response.encoding = 'utf-8'
page_text = response.text

tree = etree.HTML(page_text)

li_list = tree.xpath('//div[@class="slist"]/ul/li')

if not os.path.exists('./imgs'):
    os.mkdir('./imgs')

for li in li_list:
    title = li.xpath('./a/b/text()')[0]
    # The site serves GBK-encoded pages; re-encode/decode to repair the garbled title
    img_name = title.encode('iso-8859-1').decode('gbk')
    img_url = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
    img_path = './imgs/' + img_name + '.jpg'
    urllib.request.urlretrieve(url=img_url, filename=img_path)
    print(img_name, "downloaded")
print("over!!!")
# [Key example] Download images from jandan.net: http://jandan.net/ooxx
# The image URLs are obfuscated (an anti-scraping measure): each one is
# base64-encoded inside a <span class="img-hash"> element
import requests
import urllib.request
import os
import base64
from lxml import etree

url = 'http://jandan.net/ooxx'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
}

page_text = requests.get(url=url, headers=headers).text

tree = etree.HTML(page_text)

if not os.path.exists('./jiandan'):
    os.mkdir('./jiandan')

img_hash_list = tree.xpath('//span[@class="img-hash"]/text()')
for img_hash in img_hash_list:
    # base64-decode the hash to recover the protocol-relative image URL
    img_url = 'http:' + base64.b64decode(img_hash).decode()
    img_name = img_url.split('/')[-1]
    img_path = './jiandan/' + img_name
    urllib.request.urlretrieve(url=img_url, filename=img_path)
print('over!')
# Scrape free resume templates from sc.chinaz.com
import requests
import random
import os
from lxml import etree


url = 'http://sc.chinaz.com/jianli/free_%s.html'

headers = {
    'Connection': 'close',  # close the connection once the request completes, promptly releasing it back from the pool
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
}

if not os.path.exists('./jianli'):
    os.mkdir('./jianli')

for i in range(1, 4):
    # Page 1 has its own URL; later pages follow the free_<n>.html pattern
    if i == 1:
        new_url = 'http://sc.chinaz.com/jianli/free.html'
    else:
        new_url = url % i

    response = requests.get(url=new_url, headers=headers)
    response.encoding = 'utf-8'
    page_text = response.text

    tree = etree.HTML(page_text)

    div_list = tree.xpath('//div[@id="container"]/div')
    for div in div_list:
        name = div.xpath('./p//text()')[0]
        detail_url = div.xpath('./a/@href')[0]

        # Visit the detail page and pick one of its download mirrors at random
        detail_page_text = requests.get(url=detail_url, headers=headers).text
        detail_tree = etree.HTML(detail_page_text)
        download_list = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
        download_url = random.choice(download_list)
        data = requests.get(url=download_url, headers=headers).content
        file_name = name + '.rar'
        file_path = './jianli/' + file_name

        with open(file_path, 'wb') as fp:
            fp.write(data)
        print(file_name, "downloaded")

print('over!')
# Scrape the city lists from https://www.aqistudy.cn/historydata/
import requests
from lxml import etree


url = 'https://www.aqistudy.cn/historydata/'
headers = {
    'Connection': 'close',  # close the connection once the request completes, promptly releasing it back from the pool
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
}

page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
div_list = tree.xpath('//div[@class="col-lg-9 col-md-8 col-sm-8 col-xs-12"]/div')
print(div_list)

for div in div_list:
    title = div.xpath('./div[1]//text()')[0]
    print(title)
    ul_list = div.xpath('./div[2]/ul')

    for ul in ul_list:
        fl = ul.xpath('./div[1]//text()')
        if fl:
            print(fl[0])
        # '|' unions two XPath expressions, so <li> elements at either depth are matched
        li_list = ul.xpath('./li | ./div[2]/li')

        for li in li_list:
            city_name = li.xpath('./a/text()')[0]
            print(city_name)
Image lazy loading: target the right attribute. Sites that lazy-load images keep the real URL in a placeholder attribute (commonly src2 or data-src) and only copy it into src via JavaScript once the image scrolls into view. A crawler sees only the raw HTML, so extract the placeholder attribute instead of src (see the sketch below).
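A sketch under the assumption that the placeholder attribute is named src2; the URL and the container XPath below are illustrative, so inspect the raw HTML of your target page to find the actual attribute name:

import requests
from lxml import etree

url = 'http://sc.chinaz.com/tupian/'  # example page; adjust as needed
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text

tree = etree.HTML(page_text)
# Below-the-fold images carry the real URL in @src2, not @src (assumed here)
img_urls = tree.xpath('//div[@id="container"]//img/@src2')
print(img_urls)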
Basic use of a proxy IP
# Set a proxy IP for the request; free proxy lists: www.goubanjia.com, 快代理, 西祠代理
# The proxy type (http/https) must match the scheme of the request URL
import requests

url = 'https://www.baidu.com/s?wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
}

# The URL is https, so the proxy is registered under the 'https' key
page_text = requests.get(url=url, headers=headers, proxies={'https': '114.88.53.19:53281'}).text

with open('./ip.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
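A quick way to confirm the proxy is actually in use (the proxy address below is a placeholder; substitute a live one from the sites above). httpbin.org/ip echoes the IP your request arrived from, so the output should show the proxy's address rather than your own:

import requests

proxies = {
    'http': 'http://114.88.53.19:53281',   # used for http:// URLs
    'https': 'http://114.88.53.19:53281',  # used for https:// URLs
}
response = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=10)
print(response.text)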