解析加密数据
解析加密数据
-
对一个新的网站进行爬取之前,首先要确定待爬取的数据是否为动态加载!
-
解析加密数据
# Goal: download the images of one listing page. The page does not expose the
# image URLs directly; each one is embedded as base64 text inside a
# <span class="img-hash"> element and has to be decoded first.
import requests
from lxml import etree
import base64
from urllib import request

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
url = 'http://******.net/ooxx/page-62#comments'
# A timeout keeps the script from hanging forever on an unresponsive server.
page_text = requests.get(url=url, headers=headers, timeout=10).text

# Collect the encoded image addresses from the page.
tree = etree.HTML(page_text)
code_list = tree.xpath('//span[@class="img-hash"]/text()')
for code in code_list:
    # base64-decode the span text to recover the image address.
    # NOTE(review): 'http:' is prepended, so the decoded value is presumably
    # a protocol-relative URL ("//host/path") - confirm against the live page.
    img_url = 'http:' + base64.b64decode(code).decode()
    # Use the last path segment of the URL as the local file name.
    imgName = img_url.split('/')[-1]
    request.urlretrieve(img_url, imgName)
    print(imgName, '下载成功!!!')
-
爬取简历模板信息
# Scrape the free resume templates of a stock-material site: walk the
# requested range of list pages, open each template's detail page, pick one of
# its download links at random and save the archive in the working directory.
import requests
from lxml import etree
import random

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
    'Connection': 'close'
}
# Seconds to wait for any single HTTP request before giving up; without a
# timeout one stalled connection would block the whole crawl indefinitely.
REQUEST_TIMEOUT = 10

url_page_one = 'http://sc.******.com/jianli/free.html'
# Generic URL template for every list page other than the first one.
url_demo = 'http://sc.******.com/jianli/free_%d.html'
start_page = int(input('enter a start page num:'))
end_page = int(input('enter a end page num:'))

for pageNum in range(start_page, end_page + 1):
    # The first list page has its own URL without a numeric suffix.
    if pageNum == 1:
        url = url_page_one
    else:
        url = url_demo % pageNum
    response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf-8'
    page_text = response.text

    # Parse the list page: each child <div> of the container is expected to
    # carry one template's name and the URL of its detail page.
    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@id="container"]/div')
    for div in div_list:
        names = div.xpath('./p/a/text()')
        detail_urls = div.xpath('./p/a/@href')
        # Skip container children that hold no template link instead of
        # crashing with IndexError on the empty XPath result.
        if not names or not detail_urls:
            continue
        name = names[0]
        detail_url = detail_urls[0]

        # Fetch the detail page and locate the list of download links.
        detail_page_text = requests.get(
            url=detail_url, headers=headers, timeout=REQUEST_TIMEOUT
        ).text
        # Separate name so the list-page tree is not shadowed mid-iteration.
        detail_tree = etree.HTML(detail_page_text)
        li_list = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li')
        # random.choice raises IndexError on an empty sequence.
        if not li_list:
            continue
        # Pick one <li> at random; each <li> wraps one download link.
        li = random.choice(li_list)
        download_urls = li.xpath('./a/@href')
        if not download_urls:
            continue
        download_url = download_urls[0]

        # Download the archive and store it under the template's name.
        data = requests.get(
            url=download_url, headers=headers, timeout=REQUEST_TIMEOUT
        ).content
        name = name + '.rar'
        with open(name, 'wb') as fp:
            fp.write(data)
        print(name, '下载成功!')