Scraping Job Listings from 实习僧 (shixiseng.com)

# -*- coding: utf-8 -*-

import json

import requests
from lxml import etree

# Pretend to be a desktop browser when requesting pages.
HEADERS = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
                  "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
}


def htmlpage(start_url):
    """
    Request one page of search results for the chosen job keyword, extract
    the link of every listing on it, and pass each link to the page parser.
    :param start_url: URL of one search-result page
    :return: None
    """
    r = requests.get(start_url, headers=HEADERS)
    content = etree.HTML(r.text)
    links = content.xpath('//div[@class="names cutom_font"]/a/@href')
    for link in links:
        full_url = "https://www.shixiseng.com" + link  # links are site-relative
        pageparse(full_url)


def pageparse(full_url):
    """
    Parse a single listing page: scrape the job title and the company name,
    then hand the record to the writer.
    :param full_url: absolute URL of one job listing
    :return: None
    """
    r = requests.get(full_url, headers=HEADERS)
    page = etree.HTML(r.text)
    jobname = page.xpath('//div[@class="new_job_name"]/@title')
    # job_detail = page.xpath('//div[@class="job_detail"]//div/text()')  # full description, not collected here
    comname = page.xpath('//div[@class="job_com_name cutom_font"]/text()')
    content = {
        "jobname": jobname,
        "comname": comname
    }
    writecontent(content)


def writecontent(content):
    """
    Append one scraped record to a local JSON-lines file.
    :param content: dict holding the fields of one listing
    :return: None
    """
    with open("shixi.json", "a", encoding="utf-8") as f:
        f.write(json.dumps(content, ensure_ascii=False) + "\n")


def main(base_url, begin, end):
    """
    Scheduler: build the URL of every page in [begin, end) and crawl it.
    :param base_url: search URL that already contains the keyword
    :param begin: first page number to crawl
    :param end: page number to stop before (exclusive)
    :return: None
    """
    for page in range(begin, end):
        start_url = base_url + "&p=" + str(page)
        htmlpage(start_url)


if __name__ == "__main__":
    key = input("job:")           # job keyword entered by the user
    begin = int(input("start:"))  # first page to crawl
    end = int(input("end:"))      # page to stop before (exclusive)
    url = "https://www.shixiseng.com/interns?k="  # search endpoint
    base_url = url + key          # search URL for this keyword
    main(base_url, begin, end)
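
For example, entering 爬虫 as the keyword with start 1 and end 4 makes main() request https://www.shixiseng.com/interns?k=爬虫&p=1 through &p=3 (range() excludes the end page). Because writecontent() appends one JSON object per line, the output file is in JSON Lines format and can be read back directly; the following is a minimal sketch of that, assuming shixi.json sits in the working directory:

import json

# Minimal sketch: read the scraped records back from shixi.json,
# assuming the one-object-per-line layout produced by writecontent().
with open("shixi.json", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        # xpath() returns lists, so each field is a (possibly empty) list
        print(record["jobname"], record["comname"])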

Results

{"jobname": ["爬虫实习"], "comname": ["宜信"]}
{"jobname": ["数据挖掘研究实习生(爬虫方向)"], "comname": ["网易游戏"]}
{"jobname": ["金融 Java-爬虫方向实习生(广州)"], "comname": ["唯品会"]}
{"jobname": ["爬虫工程师"], "comname": ["比地科技"]}
{"jobname": ["爬虫工程师"], "comname": ["踪履"]}
{"jobname": ["Java爬虫/数据采集工程师实习"], "comname": ["搜狐"]}
{"jobname": ["爬虫实习生"], "comname": ["地平线机器人"]}
{"jobname": ["Java开发实习生-爬虫开发"], "comname": ["京东金融"]}
{"jobname": ["爬虫工程师"], "comname": ["指食针"]}
{"jobname": ["爬虫实习生"], "comname": ["同花顺"]}
{"jobname": ["爬虫工程师"], "comname": ["TransferEasy"]}
{"jobname": ["数据采集(爬虫)工程师"], "comname": ["乐职网"]}
{"jobname": ["爬虫工程师"], "comname": ["探迹"]}
{"jobname": ["爬虫开发实习生"], "comname": ["妙计旅行"]}
{"jobname": ["网络爬虫实习生"], "comname": ["海天瑞声"]}
{"jobname": ["爬虫实习生"], "comname": ["阿博茨"]}
{"jobname": ["爬虫工程师实习生"], "comname": ["阿博茨"]}
{"jobname": ["助理爬虫工程师"], "comname": ["有数金服"]}
{"jobname": ["数据采集/爬虫工程师/软件工程师"], "comname": ["上海中估联"]}
{"jobname": ["网页爬虫"], "comname": ["赛迪技术"]}
{"jobname": ["爬虫实习生"], "comname": ["阿博茨"]}
{"jobname": ["JavaEE爬虫数据实习生"], "comname": ["行圆汽车"]}
{"jobname": ["Python爬虫数据实习生"], "comname": ["行圆汽车"]}
{"jobname": ["Python爬虫实习生"], "comname": ["商智通"]}
{"jobname": ["搜狐爬虫开发实习生(python)"], "comname": ["搜狐媒体"]}
{"jobname": ["爬虫开发实习生"], "comname": ["北京阿博茨"]}
{"jobname": ["爬虫开发实习生"], "comname": ["勤智数码"]}
{"jobname": ["爬虫系统工程师(实习)"], "comname": ["爱奇艺"]}

 
