实习僧招聘网站信息采集
# _*_ coding:utf-8 _*_

import requests
from lxml import etree
import json

# Browser-like User-Agent shared by every request to the site.
HEADERS = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
                  "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
}


def htmlpage(start_url):
    """
    Fetch one search-result page, collect every job-detail link on it,
    and hand each link to the per-page parser.
    :param start_url: URL of a single search-result page.
    :return: None
    """
    # BUG FIX: the original built a local headers dict but never passed it
    # to requests.get, so the User-Agent was never actually sent.
    r = requests.get(start_url, headers=HEADERS)
    tree = etree.HTML(r.text)
    links = tree.xpath('//div[@class="names cutom_font"]/a/@href')
    for link in links:
        full_url = "https://www.shixiseng.com" + link
        pageparse(full_url)


def pageparse(full_url):
    """
    Parse one job-detail page: scrape the job title and company name,
    then append them to the local JSON-lines file.
    :param full_url: absolute URL of one job-detail page.
    :return: None
    """
    r = requests.get(full_url, headers=HEADERS)  # headers now actually sent
    tree = etree.HTML(r.text)  # distinct name; original shadowed `content`
    jobname = tree.xpath('//div[@class="new_job_name"]/@title')
    comname = tree.xpath('//div[@class="job_com_name cutom_font"]/text()')
    writecontent({
        "jobname": jobname,
        "comname": comname,
    })


def writecontent(content):
    """
    Append one scraped record to shixi.json as a single JSON line.
    :param content: dict holding the scraped fields.
    :return: None
    """
    # Explicit UTF-8 so Chinese text is written correctly regardless of
    # the platform's default locale encoding.
    with open("shixi.json", "a", encoding="utf-8") as f:
        f.write(json.dumps(content, ensure_ascii=False) + "\n")


def main(base_url, begain, end):
    """
    Driver: crawl result pages begain .. end-1 for the given query URL.
    :param base_url: search URL including the job keyword.
    :param begain: first page number (inclusive).
    :param end: last page number (exclusive).
    :return: None
    """
    for page in range(begain, end):
        start_url = base_url + "&p=" + str(page)
        htmlpage(start_url)


if __name__ == "__main__":
    key = input("job:")              # job keyword typed by the user
    begain = int(input("start:"))    # first result page to crawl
    end = int(input("end:"))         # last result page (exclusive)
    url = "https://www.shixiseng.com/interns?k="
    base_url = url + key
    main(base_url, begain, end)
结果
{"jobname": ["爬虫实习"], "comname": ["宜信"]} {"jobname": ["数据挖掘研究实习生(爬虫方向)"], "comname": ["网易游戏"]} {"jobname": ["金融 Java-爬虫方向实习生(广州)"], "comname": ["唯品会"]} {"jobname": ["爬虫工程师"], "comname": ["比地科技"]} {"jobname": ["爬虫工程师"], "comname": ["踪履"]} {"jobname": ["Java爬虫/数据采集工程师实习"], "comname": ["搜狐"]} {"jobname": ["爬虫实习生"], "comname": ["地平线机器人"]} {"jobname": ["Java开发实习生-爬虫开发"], "comname": ["京东金融"]} {"jobname": ["爬虫工程师"], "comname": ["指食针"]} {"jobname": ["爬虫实习生"], "comname": ["同花顺"]} {"jobname": ["爬虫工程师"], "comname": ["TransferEasy"]} {"jobname": ["数据采集(爬虫)工程师"], "comname": ["乐职网"]} {"jobname": ["爬虫工程师"], "comname": ["探迹"]} {"jobname": ["爬虫开发实习生"], "comname": ["妙计旅行"]} {"jobname": ["网络爬虫实习生"], "comname": ["海天瑞声"]} {"jobname": ["爬虫实习生"], "comname": ["阿博茨"]} {"jobname": ["爬虫工程师实习生"], "comname": ["阿博茨"]} {"jobname": ["助理爬虫工程师"], "comname": ["有数金服"]} {"jobname": ["数据采集/爬虫工程师/软件工程师"], "comname": ["上海中估联"]} {"jobname": ["网页爬虫"], "comname": ["赛迪技术"]} {"jobname": ["爬虫实习生"], "comname": ["阿博茨"]} {"jobname": ["JavaEE爬虫数据实习生"], "comname": ["行圆汽车"]} {"jobname": ["Python爬虫数据实习生"], "comname": ["行圆汽车"]} {"jobname": ["Python爬虫实习生"], "comname": ["商智通"]} {"jobname": ["搜狐爬虫开发实习生(python)"], "comname": ["搜狐媒体"]} {"jobname": ["爬虫开发实习生"], "comname": ["北京阿博茨"]} {"jobname": ["爬虫开发实习生"], "comname": ["勤智数码"]} {"jobname": ["爬虫系统工程师(实习)"], "comname": ["爱奇艺"]}