Boss Zhipin job-listing crawler

################################################################################################################################################
#This crawler scrapes Boss Zhipin (zhipin.com) search results for a keyword and collects, for each posting: company name, publish date,
#job title, salary, location, experience requirement, education requirement, job tags, job description and requirements.
#start_url = 'http://www.zhipin.com/c101270100/?query=Python&page=' #set the search keyword here; this example searches for Python
#dethp = 6 #crawl depth; Boss Zhipin blocks crawlers that go too deep in one run, so a depth of about 5 is usually safe
#for i in range(1,dethp):#set the starting page of the crawl here
#excel.save('boss_python_1_150.xlsx') #Excel file the scraped data is saved to; change the name on each run so the files stay distinct
################################################################################################################################################



import requests
from bs4 import BeautifulSoup
import bs4
import xlwt

def get_html(url):
    kv={'User-Agent':'Mozilla/5.0'}
    try:
        r = requests.get(url,headers=kv,timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print('Failed to fetch page')
        print(url)
        return None
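
# Optional sketch, not part of the original script: a short pause between requests lowers
# the chance of the anti-scraping ban mentioned in the header comment. The 1-3 second
# delay range and the name get_html_polite are assumptions for illustration only; it could
# be called in place of get_html() inside get_urllist() and get_dange_info().
import time
import random

def get_html_polite(url):
    time.sleep(random.uniform(1, 3))  # hypothetical random delay before each request
    return get_html(url)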

def get_dange_info(company_info,url):
    html = get_html(url)
    if html is None:#skip pages that failed to download
        return
    soup = BeautifulSoup(html,'html.parser')
    #company name, industry and size
    company = soup.find('div','info-company')
    company_info.append(company.find('h3').get_text())#company name
    company_info.append(company.find('p').get_text())#industry and size
    #publish date, job title and salary, location, experience, education, job tags
    zhiwei_info = soup.find('div','info-primary')
    for i in zhiwei_info.children:
        if isinstance(i,bs4.element.Tag):#skip the bare '\n' text nodes between the tags
            if '\n' in i.get_text():#collapse newlines inside the text
                a = i.get_text().split('\n')
                b = ''
                for j in a:
                    if len(j) != 0:
                        b =  b + j + ' '
                company_info.append(b)
            else:
                company_info.append(i.get_text().strip())
    #job description and requirements
    job_sec = soup.find('div','job-sec')
    text = job_sec.find('div','text')
    #optional variant: split the text into responsibilities ('岗位职责') and requirements ('任职要求') and store them as two fields
    #zhiweizhize0 = text.get_text().split('任职要求')[0].strip().replace('任职要求','')
    #zhiweizhize = '岗位职责:' + zhiweizhize0.split('岗位职责')[-1].strip()
    #renzhiyaoqiu = '任职要求:' + text.get_text().split('任职要求')[-1].strip()
    #company_info.append(zhiweizhize.strip())
    #company_info.append(renzhiyaoqiu.strip())
    company_info.append(text.get_text().strip())

def get_urllist(url_list,url):
    url_list_html = get_html(url)
    if url_list_html is None:#skip result pages that failed to download
        return
    soup = BeautifulSoup(url_list_html,'html.parser')
    job_list = soup.find_all('div','job-primary')#each job-primary div is one posting in the result list
    for i in job_list:
        url_list.append('http://www.zhipin.com' + i.find('a').attrs['href'])


def get_quanbuinfo(tatol_info,url_list):
    for i in url_list:
        company_info = []
        get_dange_info(company_info,i)
        tatol_info.append(company_info)      


def print_html(tatol_info):
    #each row currently holds 7 items because the description and requirements are stored as one field;
    #the original 8-column version assumed the commented-out split in get_dange_info
    tplt = '{0:^10}\t{1:^6}\t{2:^6}\t{3:^6}\t{4:^6}\t{5:^6}\n\n{6:^50}'
    print(tplt.format('公司名称','行业规模','发布时间','职位名称及薪资','地区经验学历','职位标签','职位描述'))
    for i in tatol_info:
        print(tplt.format(i[0],i[1],i[2],i[3],i[4],i[5],i[6]))
        print('\n\n\n')

    with open('boss.txt','w',encoding='utf-8') as f:
        for j in tatol_info:
            f.write(str(j[0:6]) + '\n\n' + str(j[6]) + '\n\n\n')

def save_tatol_info(tatol_info):
    excel = xlwt.Workbook()
    sheet = excel.add_sheet('sheet')
    style = xlwt.XFStyle()
    font = xlwt.Font()
    font.name = 'Times New Roman'
    style.font = font
    #row0 = ['公司名称','行业规模','发布时间','职位名称及薪资','地区经验学历','职位标签','职位描述','任职要求']#8-column header, only needed if the description/requirements split in get_dange_info is enabled
    row0 = ['公司名称','行业规模','发布时间','职位名称及薪资','地区经验学历','职位标签','职位描述']#company name, industry/size, publish date, job title & salary, location/experience/education, job tags, job description
    h = 0
    for k in row0:
        sheet.write(0,h,k,style)
        h += 1
    m = 1
    for i in tatol_info:
        n = 0
        for j in i:
            sheet.write(m,n,j,style)
            n += 1
        m += 1
    excel.save('boss_大数据_151_200.xlsx') #Excel file the scraped data is saved to; change the name on each run so the files stay distinct
        


def main():
    url_list = []
    tatol_info = []
    start_url = 'http://www.zhipin.com/c101270100/?query=Python&page=' #set the search keyword here; this example searches for Python
    dethp = 6 #crawl depth; Boss Zhipin blocks crawlers that go too deep in one run, so a depth of about 5 is usually safe
    for i in range(1,dethp):#set the starting page of the crawl here
        try:
            url = start_url + str(i)
            get_urllist(url_list,url)
        except Exception:
            print('Failed to fetch result page')
            print('url=',url)
            continue
    get_quanbuinfo(tatol_info,url_list)
    #print_html(tatol_info)
    save_tatol_info(tatol_info)

if __name__ == '__main__':
    main()
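
# Optional sketch, an assumption rather than the author's code: instead of editing
# start_url, dethp and the excel.save() filename by hand before every run, the same
# settings could be read from the command line. The script name boss_spider.py and
# the argument names below are hypothetical, e.g.:
#     python boss_spider.py --query Python --pages 5 --out boss_python_1_150.xlsx
import argparse

def parse_args():
    p = argparse.ArgumentParser(description='Boss Zhipin job crawler')
    p.add_argument('--query', default='Python')       # search keyword inserted into start_url
    p.add_argument('--pages', type=int, default=5)     # crawl depth (number of result pages)
    p.add_argument('--out', default='boss.xlsx')       # Excel file passed to excel.save()
    return p.parse_args()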

 

posted on 2017-12-22 14:15  斯文依旧在