Python 爬虫实例(15) 爬取 百度百聘(微信公众号)
今天闲得无聊,爬取了一个网站——百度百聘,仅供学习参考
直接上代码:
# -*- coding: utf-8 -*-
"""Crawl job postings from the Baidu Baipin (zhaopin.baidu.com) search API.

The script requests result pages from the ``/api/qzasync`` endpoint and
prints a fixed set of fields for every posting found under
``data -> disp_data`` in the JSON response.

All ``print`` calls take a single pre-formatted argument so the script
produces the same output on Python 2 and Python 3.
"""
import json
import time

# NOTE: ``session`` (used in ``spider``) is not defined in this script; it is
# expected to be provided by this project-local star import.
from common.contest import *

API_URL = 'http://zhaopin.baidu.com/api/qzasync'

# Headers captured from a browser session.  The Cookie value is hard-coded
# session data; NOTE(review): it will presumably expire and need refreshing.
REQUEST_HEADERS = {
    "Host": "zhaopin.baidu.com",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36",
    "Accept": "*/*",
    "Referer": "http://zhaopin.baidu.com/qz?query=%E7%88%AC%E8%99%AB%E5%B7%A5%E7%A8%8B%E5%B8%88",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Cookie": (
        "BAIDUID=F9ED0B117C16BC97A29D64DD28F4CBB9:FG=1; "
        "BIDUPSID=F9ED0B117C16BC97A29D64DD28F4CBB9; "
        "PSTM=1532340289; locale=zh; "
        "Hm_lvt_c676f95eebbd4fa8a59418f48090ac4d=1532922582; "
        "URLTITLESALARY=%20; "
        "Hm_lvt_80a630f2b5c230ad2a5d3f1356e18706=1532938867; "
        "Hm_lpvt_80a630f2b5c230ad2a5d3f1356e18706=1532941545; "
        "td_cookie=966486977"
    ),
}

# Fields printed for every posting, in output order.  The original script
# printed "ori_salary" three times and "employertype" twice; each field is
# now listed once.
_FIELDS = (
    "ori_salary",
    "ori_size",
    "ori_city",
    "ori_type",
    "StdStl",
    "sourcelink",
    "_version",
    "haswapurl",
    "education",
    # The original printed the *builtin* ``id`` function here because no
    # ``id`` variable was ever assigned.  NOTE(review): presumably the
    # record's own "id" field was intended -- confirm against a response.
    "id",
    "size",
    "format_date",
    "detailidx",
    "title",
    "ori_employertype",
    "requirements",
    "company_id",
    "source",
    "employertype",
    "location",
    "provider",
    "lastmod",
    "_update_time",
    "ori_education",
    "companyaddress",
    "company",
    "commonname",
    "ori_welfare",
    "ori_experience",
)


def _display(value):
    """Return ``value`` in a form that prints readably.

    Containers are rendered as JSON with non-ASCII characters left intact;
    this replaces the Python-2-only ``str(x).decode('unicode_escape')``
    trick the original applied to ``ori_welfare``.  Other values are
    returned unchanged.
    """
    if isinstance(value, (list, tuple, dict)):
        return json.dumps(value, ensure_ascii=False)
    return value


def _print_record(record):
    """Print every field in ``_FIELDS`` for one posting as ``name value``.

    A field missing from ``record`` prints as an empty string instead of
    raising ``KeyError``, so one incomplete posting cannot abort the crawl.
    """
    for name in _FIELDS:
        print("%s %s" % (name, _display(record.get(name, ""))))


def spider():
    """Fetch result pages 0-10 for the hard-coded query and print each posting.

    Takes no arguments and returns nothing; output goes to stdout.  Network
    or JSON-decoding errors raised by ``session.get(...).json()`` propagate
    to the caller.
    """
    for page in range(0, 11):
        print(" ".join(["正在爬取的页数是:", str(page)]))
        params = {
            "query": "爬虫工程师",
            # NOTE(review): this value is already percent-encoded; if
            # ``session`` is a requests session, ``params`` will encode the
            # "%" again.  Left unchanged because it is the form the author
            # reports working -- verify against the live API.
            "city": "%E5%8C%97%E4%BA%AC",
            "pcmod": "1",
            # NOTE(review): sent as 0, 1, ..., 10 alongside rn=10; whether the
            # API treats "pn" as a page index or a row offset cannot be told
            # from this script -- confirm.
            "pn": str(page),
            "rn": "10",
        }
        payload = session.get(url=API_URL, params=params, headers=REQUEST_HEADERS).json()
        # Treat a missing or empty "data"/"disp_data" as a page with no
        # postings rather than crashing.
        records = (payload.get("data") or {}).get("disp_data") or []
        for record in records:
            _print_record(record)
        # Pause between pages.  NOTE(review): the original indentation was
        # lost, so whether this sleep ran per page or per posting is an
        # assumption; per page is used here.
        time.sleep(5)


if __name__ == "__main__":
    spider()
这个网站没有什么难度,只需要简单地请求一下数据接口就能得到数据。注意请求参数 city 需要先做 URL 编码;不了解 URL 编码的同学请自行百度。
如果觉得对您有帮助,麻烦您点一下推荐,谢谢!
好记忆不如烂笔头
好记忆不如烂笔头