Source code for scraping free resume templates from Chinaz (站长之家)

# -*- coding: utf-8 -*-
# @ModuleName: 3. Free resume template scraper
# @Function: 
# @Author: merry
# @Time: 2021/1/18 17:02
import requests
from lxml import etree
import os

# Define the request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
}

url = 'https://sc.chinaz.com/jianli/free.html'
# Fetch the listing page source
response = requests.get(url, headers=headers)
# Set the response encoding
response.encoding = 'utf-8'
page_text = response.text

# Parse the listing page with etree to extract the detail-page links
tree = etree.HTML(page_text)
# Select the container div for each resume on the current page
div_list = tree.xpath('//div[@id="container"]/div')
# Create the folder the resumes are saved to
if not os.path.exists('./doc'):
    os.mkdir('./doc')

# Iterate over the container divs
for div in div_list:
    # Build the detail-page URL of this resume
    doc_url = 'https:' + div.xpath('./a/@href')[0]
    # Get the resume's name
    doc_name = div.xpath('./a/img/@alt')[0]
    # Request the detail page
    response = requests.get(doc_url, headers=headers)
    # Set the response encoding
    response.encoding = 'utf-8'
    detail_text = response.text
    # Parse the detail page with etree
    detail_tree = etree.HTML(detail_text)
    # Build the output filename
    filename = f'./doc/{doc_name}.rar'
    # Select the first download link in the download list by class attribute
    li_list = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]//li[1]')
    # Iterate over the download links
    for li in li_list:
        # Read the href attribute to get the download URL
        down_url = li.xpath('./a/@href')[0]
        # Request the .rar archive as binary content
        doc = requests.get(down_url, headers=headers).content
        # Save it to the folder
        with open(filename, 'wb') as fp:
            fp.write(doc)
            print(f'\033[32mScraped ----{doc_name}---- done\033[0m')
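The script above only walks the first listing page. As a minimal sketch of pagination (the free_2.html, free_3.html URL pattern is an assumption about the site, not something stated in the post, so verify it in the browser first), later pages could be folded in like this, reusing the headers and parsing logic already defined:

# Hedged sketch: iterate over several listing pages. The URL pattern for
# page 2 onward (free_2.html, free_3.html, ...) is an assumption here.
for page in range(1, 4):  # first three pages, as an illustration
    page_url = ('https://sc.chinaz.com/jianli/free.html' if page == 1
                else f'https://sc.chinaz.com/jianli/free_{page}.html')
    resp = requests.get(page_url, headers=headers)
    resp.encoding = 'utf-8'
    page_tree = etree.HTML(resp.text)
    # ...then reuse the per-resume parsing and download loop above on page_tree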
 
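Downloading each archive with a bare requests.get(...).content call also loads the whole file into memory and lets one network error abort the entire run. A hedged alternative (the 60-second timeout and 8 KB chunk size are illustrative choices, not values from the post) streams the archive to disk instead:

# Hedged sketch: stream the archive with a timeout and basic error handling
# so one failed template does not stop the whole run. Timeout and chunk size
# are illustrative assumptions.
def download_file(down_url, filename, headers):
    try:
        with requests.get(down_url, headers=headers, stream=True, timeout=60) as resp:
            resp.raise_for_status()
            with open(filename, 'wb') as fp:
                for chunk in resp.iter_content(chunk_size=8192):
                    fp.write(chunk)
        return True
    except requests.RequestException as exc:
        print(f'Download failed for {down_url}: {exc}')
        return False

In the inner loop, download_file(down_url, filename, headers) would replace the requests.get(...).content call and the open(...) block.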
posted @ 2021-01-21 09:51  Merry'blog