Crawler data parsing (3 crawler examples)
Data parsing with XPath comes down to two steps (a minimal sketch follows the list):
(1) Locate the target tags
(2) Extract the data (text or attribute values) from them
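A minimal sketch of this two-step flow; the HTML snippet, ids, and attribute names below are invented purely for illustration:

from lxml import etree

html = '<div id="content"><a href="/item/1"><img alt="demo" src="//img.example.com/1.jpg"></a></div>'
tree = etree.HTML(html)

# Step 1: locate the target tags with an XPath expression (returns a list of elements)
a_list = tree.xpath('//div[@id="content"]/a')

# Step 2: extract data from each located tag (text or attribute values)
for a in a_list:
    name = a.xpath('./img/@alt')[0]   # attribute value
    link = a.xpath('./@href')[0]      # attribute value
    print(name, link)                 # -> demo /item/1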
1. Download the images from the first 5 pages of the Qiushibaike pic section
import requests
from urllib import request
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3521.2 Safari/537.36"
}
url = 'https://www.qiushibaike.com/pic/page/{}/?s=5196770'
for page in range(1, 6):
    if page == 1:
        new_url = 'https://www.qiushibaike.com/pic'
    else:
        new_url = url.format(page)
    page_text = requests.get(url=new_url, headers=headers).text
    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@id="content-left"]/div')
    for div in div_list:
        img_name = div.xpath('./div[2]/a/img/@alt')[0]
        img_url = 'https:' + div.xpath('./div[2]/a/img/@src')[0]
        img_path = './day118/01/' + img_name + '.jpg'
        # urlretrieve comes from urllib's request module (not requests); it downloads
        # and saves a file directly: the first argument is the source URL, the second
        # is the local path + file name
        request.urlretrieve(img_url, img_path)
        print(img_name, 'downloaded successfully')
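As the comment above notes, urlretrieve saves a URL straight to disk. The same download can be done with requests alone, the approach example 3 below also relies on: fetch the raw bytes via .content and write them in binary mode. A minimal sketch, where the URL and path are placeholders:

import requests

img_url = 'https://example.com/some.jpg'   # placeholder download URL
img_path = './some.jpg'                    # placeholder local path

# .content returns raw bytes, so the file must be opened in 'wb' mode
data = requests.get(url=img_url).content
with open(img_path, 'wb') as f:
    f.write(data)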
2. Scrape crawler-related job postings from Boss Zhipin (via the detail pages)
import requests
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3521.2 Safari/537.36"
}
url = 'https://www.zhipin.com/c101010100/?query=%E7%88%AC%E8%99%AB&page=1&ka=page-1'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@class="job-list"]/ul/li')
for li in li_list:
    job_url = 'https://www.zhipin.com' + li.xpath('.//h3/a/@href')[0]
    print(job_url)
    job_text = requests.get(url=job_url, headers=headers).text
    job_tree = etree.HTML(job_text)
    job_name = job_tree.xpath('//div[@class="smallbanner"]/div[1]/div[2]/div[1]/h1/text()')[0]
    job_salary = job_tree.xpath('//div[@class="smallbanner"]/div[1]/div[2]/div[1]/span/text()')[0]
    # Jobs 3 through 7 use a different HTML layout than the others, so both XPaths
    # are joined with the union operator (|) and the last item of the combined
    # result list is taken, which is the company name
    job_addr = job_tree.xpath('//div[@class="detail-content"]/div[5]/div[@class="name"]/text() | //*[@id="main"]/div[3]/div/div[2]/div[2]/div[4]/div[1]/text()')[-1]
    # Print each job's title, salary, and company name
    print(job_name, job_salary, job_addr)
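The job_addr line leans on XPath's union operator (|), which merges the matches of two expressions into a single result list; indexing with [-1] then yields the company name whichever of the two page layouts the detail page uses. A self-contained illustration with invented HTML:

from lxml import etree

# Two page variants: the company name sits in a different spot in each
page_a = '<div class="detail-content"><div class="name">Acme Inc.</div></div>'
page_b = '<div id="main"><div class="company">Beta LLC</div></div>'

for html in (page_a, page_b):
    tree = etree.HTML(html)
    # | unions both match sets; only the branch present in this page contributes
    hits = tree.xpath('//div[@class="name"]/text() | //div[@class="company"]/text()')
    print(hits[-1])   # -> Acme Inc., then Beta LLC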
3. Scrape the free resume templates from Zhanzhang Sucai (first 10 pages): http://sc.chinaz.com/jianli/free.html
import requests
import random
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3521.2 Safari/537.36"
}
url = 'http://sc.chinaz.com/jianli/free_{}.html'
for page in range(1, 11):
    if page == 1:
        new_url = 'http://sc.chinaz.com/jianli/free.html'
    else:
        new_url = url.format(page)
    response = requests.get(url=new_url, headers=headers)
    # Fix the encoding (mojibake) problem
    response.encoding = 'utf-8'
    page_text = response.text
    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@id="container"]/div')
    for div in div_list:
        # Name of the resume template
        res_name = div.xpath('./a/img/@alt')[0]
        # A second way to fix mojibake, if response.encoding were left unset:
        # res_name = res_name.encode('iso-8859-1').decode('gbk')
        # Local path to save the file to
        res_path = './day118/03/' + res_name + '.rar'
        # URL of the template's detail page
        res_url = div.xpath('./a/@href')[0]
        # Request the detail page
        res_text = requests.get(url=res_url, headers=headers).text
        # Parse it with etree
        res_tree = etree.HTML(res_text)
        # List of download mirrors (e.g. Fujian Telecom, Xiamen Telecom)
        down_list = res_tree.xpath('//div[@id="down"]/div[2]//a/@href')
        # Pick one download URL at random
        down_url = random.choice(down_list)
        # The template is a binary archive, so use .content, never .text!
        down_res = requests.get(url=down_url, headers=headers).content
        print(res_path)
        # Binary data, so open in 'wb' mode (and pass no encoding argument)
        with open(res_path, 'wb') as f:
            f.write(down_res)
        print(res_name, 'download finished')
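Example 3 shows two ways to repair mojibake: setting response.encoding before reading .text, or round-tripping the already-garbled string through the codec requests fell back to. A small sketch of why the second method works; the GBK bytes here are fabricated for the demo:

# Simulate the failure: the server sends GBK bytes, but the client decodes
# them as ISO-8859-1 (requests' fallback when no charset is declared)
raw = '简历模板'.encode('gbk')        # bytes as the server would send them
garbled = raw.decode('iso-8859-1')    # what a mis-decoded .text looks like

# Fix 1: set the right codec on the response before reading .text
#   response.encoding = 'utf-8'   (or 'gbk', whatever the page actually uses)

# Fix 2: re-encode the garbled str to recover the original bytes, then
# decode them with the correct codec
fixed = garbled.encode('iso-8859-1').decode('gbk')
print(fixed)   # -> 简历模板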