爬取爱笔智能招聘职位

 

 

 

 1 import urllib.request
 2 import urllib.parse
 3 import requests
 4 from urllib.parse import urlencode
 5 from pyquery import PyQuery as pq
 6 from pymongo import MongoClient
 7 import json
 8 
 9 
# Endpoint that get_page() posts a job id to.
url = 'http://aibee.com/cn/joinus.aspx?action=jobinfo'

# Request headers sent with every lookup; X-Requested-With marks the call
# as an XMLHttpRequest.
headers = dict((
    ('Host', 'aibee.com'),
    ('Referer', 'http://aibee.com/cn/joinus.aspx'),
    (
        'User-Agent',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.36',
    ),
    ('X-Requested-With', 'XMLHttpRequest'),
))

# MongoDB handles: default client, database "aibee", collection "aibee".
client = MongoClient()
db = client.aibee
collection = db.aibee

# Largest job id probed by the main loop (ids 1..max_id inclusive).
max_id = 50
23 
# Response length that is treated as "no posting for this id".
# NOTE(review): presumably the server's fixed empty payload is exactly
# 12 characters long -- confirm against a live response.
_EMPTY_RESPONSE_LENGTH = 12

# Fragments removed from the free-text fields, applied in this order.
_MARKUP_FRAGMENTS = ("\t", "<ul>", "<li>", "</li>", "</ul>", "\\t")

# Highest index read from the split response; shorter responses are rejected.
_LAST_FIELD_INDEX = 16


def _strip_markup(text):
    """Return *text* stripped, with list markup and tab fragments removed."""
    cleaned = text.strip()
    for fragment in _MARKUP_FRAGMENTS:
        cleaned = cleaned.replace(fragment, "")
    return cleaned


def _parse_job(result):
    """Turn a raw response body into a job dict, or 0 when there is no job.

    The body is split positionally: every "," is first turned into ":" and
    the text is then split on ":", so fields are picked by fixed index.
    NOTE(review): this breaks if a field value itself contains "," or ":";
    the real response format is not visible here -- if it is JSON, prefer
    json.loads.
    """
    if len(result) == _EMPTY_RESPONSE_LENGTH:
        return 0

    # Split once instead of re-splitting for every field.
    fields = result.replace(",", ":").split(":")
    if len(fields) <= _LAST_FIELD_INDEX:
        # Unexpectedly short response: report "no job" rather than letting
        # an IndexError abort the whole crawl.
        return 0

    return {
        'id': fields[2].strip(),
        'title': fields[4].strip(),
        # presumably "zhize" = responsibilities and "yaoqiu" = requirements
        'zhize': _strip_markup(fields[6]),
        'yaoqiu': _strip_markup(fields[8]),
        # The last three characters of this field are dropped.
        'dtt': fields[12].strip()[:-3],
        'emailaddr': fields[16].strip(),
    }


def get_page(id):
    """Fetch and parse the job posting with the given id.

    Sends the id as a url-encoded form body (a POST request) to the
    module-level ``url`` with the module-level ``headers``.

    Args:
        id: Job id to look up. The name shadows the builtin but is kept so
            existing callers are unaffected.

    Returns:
        A dict with the keys ``id``, ``title``, ``zhize``, ``yaoqiu``,
        ``dtt`` and ``emailaddr``, or ``0`` when the response holds no
        usable posting.

    Raises:
        urllib.error.URLError: If the request fails.
    """
    # Request data must be bytes, so encode the form body.
    data = urllib.parse.urlencode({'id': id}).encode("utf-8")
    request = urllib.request.Request(url, data=data, headers=headers)
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(request) as response:
        result = response.read().decode('utf-8')
    return _parse_job(result)
72     
73 
def write_to_file(content):
    """Append *content* to ``aibee.json`` as one JSON document per line.

    Args:
        content: A JSON-serialisable object. Non-ASCII characters are
            written as-is (``ensure_ascii=False``) in UTF-8.
    """
    # The with-statement closes the file; no explicit close() is needed.
    with open('aibee.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
78 
79 
def save_to_mongo(result):
    """Insert *result* into the module-level MongoDB ``collection``.

    Uses ``insert_one`` because ``Collection.insert`` was deprecated in
    PyMongo 3.0 and removed in 4.0. Per the PyMongo documentation,
    ``insert_one`` adds an ``_id`` key to *result* when it has none.

    Args:
        result: The document (dict) to store.
    """
    if collection.insert_one(result).inserted_id is not None:
        print('Saved to Mongo')
83 
84 
if __name__ == "__main__":
    # Probe every job id from 1 to max_id and persist the ones that exist.
    # The loop variable is named job_id so the builtin id() is not shadowed
    # at module scope.
    for job_id in range(1, max_id + 1):
        content = get_page(job_id)
        if content == 0:
            # get_page returns 0 when there is no posting for this id.
            continue
        print(content)
        # Write the JSON line before the Mongo insert: the insert adds an
        # "_id" value to the dict, which json.dumps presumably cannot
        # serialise -- keep this order.
        write_to_file(content)
        save_to_mongo(content)
94         

 

 

posted @ 2018-06-25 01:57  王琳杰  阅读(191)  评论(0)  编辑  收藏  举报