爬虫第二弹 图片解析
reptile_second
-
知识点回顾
-
robots.txt:
-
UA:
-
1.指定url
-
2.发起请求
-
3.获取页面数据
-
4.数据解析
-
5.持久化存储
-
bs4:
- 实例化bs对象,将页面源代码数据加载到该对象中
- 定位标签:find('name', class_='someone')、find_all()、select()
- 获取标签中的文本内容或属性值:string、text、get_text();属性用 a['href']
-
-
环境安装: pip install lxml
-
分析原理:
- 获取页面源码数据
- 实例化一个etree的对象,并且将页面源码数据加载到该对象中
- 调用该对象的xpath方法进行指定标签的定位
- 注意: xpath函数必须结合着xpath表达式进行标签定位和内容捕获
项目需求: 解析58 二手房的相关数据
import requests
from lxml import etree

# Fetch a 58.com second-hand housing listing page and write one
# "title:details" line per listing into 58.csv.
url = 'https://bj.58.com/shahe/ershoufang/?utm_source=market&spm=u-2d2yxv86y3v43nkddh1.BDPCPZ_BT&PGTID=0d30000c-0047-e4e6-f587-683307ca570e&ClickID=1'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')

# The context manager guarantees the file is closed even if a listing
# fails to parse halfway through the loop.
with open('58.csv', 'w', encoding='utf-8') as fp:
    for li in li_list:
        title_nodes = li.xpath('./div[2]/h2/a/text()')
        if not title_nodes:
            # An <li> without a title link would crash on [0]; skip it.
            continue
        title = title_nodes[0]
        # Concatenates every text node found under the second <div>.
        # NOTE(review): the variable is called "price" but the expression
        # is not limited to a price element - confirm the intended xpath.
        price = ''.join(li.xpath('./div[2]//text()'))
        fp.write(title + ':' + price + '\n')
print('over')
解析图片数据: http://pic.netbian.com/4kmeinv/
# ctrl + shift + x
# - 解析图片数据: http://pic.netbian.com/4kmeinv/
import os
import urllib
import urllib.request  # `import urllib` alone does not load the request submodule
import requests
from lxml import etree

# Download every thumbnail listed on the pic.netbian.com 4kmeinv page
# into ./imgs, naming each file after the caption text of its entry.
url = 'http://pic.netbian.com/4kmeinv/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
response = requests.get(url=url, headers=headers)

# exist_ok avoids the check-then-create race of exists() + mkdir().
os.makedirs('./imgs', exist_ok=True)

page_text = response.text
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@class="slist"]/ul/li')
for li in li_list:
    name_nodes = li.xpath('./a/b/text()')
    src_nodes = li.xpath('./a/img/@src')
    if not name_nodes or not src_nodes:
        # Entries missing a caption or an image would crash on [0]; skip.
        continue
    # Repair mojibake in the Chinese caption by undoing an ISO-8859-1
    # decode and re-decoding the bytes as GBK.
    # NOTE(review): assumes response.text was decoded as ISO-8859-1 and
    # the page is really GBK - confirm against the live response.
    img_name = name_nodes[0].encode('iso-8859-1').decode('gbk')
    img_url = 'http://pic.netbian.com' + src_nodes[0]
    img_path = './imgs/' + img_name + '.jpg'
    urllib.request.urlretrieve(url=img_url, filename=img_path)
    print(img_path, '下载成功!')
print('over!!')
# [重点] 下载煎蛋网中的图片数据:http://jandan.net/ooxx
# [重点] 下载煎蛋网中的图片数据:http://jandan.net/ooxx
# 数据加密 (反爬机制)
import base64
import urllib
import urllib.request  # `import urllib` alone does not load the request submodule
import requests
from lxml import etree

# Download the images of http://jandan.net/ooxx. The page does not expose
# image URLs directly: each one is stored base64-encoded in a <span>,
# so it has to be decoded before it can be fetched.
url = 'http://jandan.net/ooxx'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)

# NOTE(review): the selector previously read "img-has", which looks like a
# truncated "img-hash" and would match nothing with an exact @class test.
# Confirm the class name against the live page.
img_hash_list = tree.xpath('//span[@class="img-hash"]/text()')
for img_hash in img_hash_list:
    # The decoded value is prefixed with "http:" to form the full URL.
    img_url = 'http:' + base64.b64decode(img_hash).decode()
    # Save under the last path segment of the URL, in the working directory.
    img_name = img_url.split('/')[-1]
    urllib.request.urlretrieve(url=img_url, filename=img_name)
print('over!')
爬取站长素材中的简历模板
import random
import requests
from lxml import etree

# Download the free resume templates from the first three list pages of
# sc.chinaz.com: open each template's detail page, pick one of its download
# mirrors at random and save the archive as "<name>.rar".
headers = {
    # Close the connection as soon as each request finishes so that
    # connection-pool resources are released promptly.
    'Connection': 'close',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
url = 'http://sc.chinaz.com/jianli/free_%d.html'
for page in range(1, 4):
    # The first page has no numeric suffix; later pages follow the template.
    if page == 1:
        new_url = 'http://sc.chinaz.com/jianli/free.html'
    else:
        new_url = url % page

    response = requests.get(url=new_url, headers=headers)
    response.encoding = 'utf-8'
    page_text = response.text
    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@id="container"]/div')

    for div in div_list:
        detail_links = div.xpath('./a/@href')
        names = div.xpath('./a/img/@alt')
        if not detail_links or not names:
            # Not a template card (no link or no alt text); skip it.
            continue
        detail_url = detail_links[0]
        name = names[0]

        detail_page = requests.get(url=detail_url, headers=headers).text
        # Use a separate name so the list-page tree is not overwritten.
        detail_tree = etree.HTML(detail_page)
        download_list = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
        if not download_list:
            # random.choice raises IndexError on an empty sequence.
            continue
        download_url = random.choice(download_list)

        data = requests.get(url=download_url, headers=headers).content
        fileName = name + '.rar'
        with open(fileName, 'wb') as fp:
            fp.write(data)
        print(fileName, '下载成功')
print('GG')
解析所有城市名称
import requests
from lxml import etree

# Print every city name listed on the aqistudy.cn historical-data page.
headers = {
    # Close the connection as soon as each request finishes so that
    # connection-pool resources are released promptly.
    'Connection': 'close',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
url = 'https://www.aqistudy.cn/historydata/'
# Fixed: the attribute is `.text`; the misspelt `.tetx` raised AttributeError.
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
# The xpath union (|) collects <li> nodes from two different places under
# div.bottom in a single query.
li_list = tree.xpath('//div[@class="bottom"]/ul/li | //div[@class="bottom"]/ul/div[2]/li')
for li in li_list:
    name_nodes = li.xpath('./a/text()')
    if not name_nodes:
        # An <li> without link text would crash on [0]; skip it.
        continue
    city_name = name_nodes[0]
    print(city_name)
代理ip
# 设置请求代理ip: www.goubanjia.com 快代理 西祠代理
# 代理ip的类型必须和请求url 的协议保持一致
# Send a search request through a proxy and save the returned HTML to
# ./ip.html. The key of the proxies mapping ("https") matches the scheme
# of the target URL. Relies on `requests` and `headers` defined by an
# earlier cell.
url = 'https://www.baidu.com/s?wd=ip'
response = requests.get(
    url=url,
    headers=headers,
    proxies={'https': '61.7.170.240:8080'},
)
page_text = response.text
with open('./ip.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
tips
- robots
- UA
- 数据加密
- 懒加载
- 代理ip