# -*- coding: utf-8 -*-
# @Time : 2020/9/21 11:13
# @Author : aqiong
# @Site :
# @File : 站长之家简历爬取.py
# @Software: PyCharm
import requests
from lxml import etree
import random
import os
# Pick a random User-Agent so consecutive requests look less uniform.
def getheaders():
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36',
        'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36',
        'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36']
    return random.choice(user_agent_list)
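
# A small optional helper, assuming the listing is paginated as
# fengmian_<n>.html for pages after the first (an assumption about the
# site's URL scheme, not something the original script relies on):
def get_page_url(page_num):
    # Page 1 has no numeric suffix; later pages would be fengmian_2.html, ...
    if page_num == 1:
        return 'http://sc.chinaz.com/jianli/fengmian.html'
    return 'http://sc.chinaz.com/jianli/fengmian_%d.html' % page_num
# Usage sketch: loop `for page in range(1, 4):` and fetch get_page_url(page)
# instead of the single hard-coded url below.
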
if __name__ == '__main__':
    # Create the output directory for the downloaded resume archives.
    if not os.path.exists('./jl'):
        os.mkdir('./jl')
    url = 'http://sc.chinaz.com/jianli/fengmian.html'
    headers = {
        'user-agent': getheaders()
    }
    page_text = requests.get(url=url, headers=headers)
    page_text.encoding = 'utf-8'  # Set the encoding to utf-8; otherwise the scraped text is garbled.
    page_text = page_text.text
    page_html = etree.HTML(page_text)
    # Collect the detail-page link of every resume template on the listing page.
    a_href_list = page_html.xpath('//div[@class="main_list jl_main"]/div[@class="box col3 ws_block"]/a/@href')
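    # Defensive sketch (an assumption, not required by the original page
    # structure): if the site ever returns protocol-relative links like
    # "//sc.chinaz.com/...", normalize them to absolute URLs first. This is a
    # no-op when the hrefs are already absolute.
    a_href_list = ['http:' + u if u.startswith('//') else u for u in a_href_list]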
    for a_url in a_href_list:
        # allow_redirects=False works around "requests.exceptions.TooManyRedirects:
        # Exceeded 30 redirects." raised by some of these detail pages.
        jl_page_text = requests.get(url=a_url, headers=headers, allow_redirects=False).text
        jl_html = etree.HTML(jl_page_text)
        # Grab the first download link on the template's detail page.
        rar_list = jl_html.xpath('//div[@class="clearfix mt20 downlist"]//ul[@class="clearfix"]/li[1]/a/@href')
        if not rar_list:  # Skip pages with no download list (e.g. a redirect stub).
            continue
        rar_url = rar_list[0]
        jl_rar = requests.get(url=rar_url, headers=headers).content
        fileName = './jl/' + rar_url.split('/')[-1]
        with open(fileName, 'wb') as fp:
            fp.write(jl_rar)
        print(fileName + ' saved successfully')
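    # Hardening sketch (not part of the original script): pass timeout=10 to
    # each requests.get call and sleep briefly between downloads, e.g.
    # time.sleep(random.uniform(0.5, 1.5)) after each file, to be gentler on
    # the server (requires `import time` at the top).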