# 爬取爱笔智能招聘职位
# Scraper for the job postings published on aibee.com.
#
# For every id in 1..max_id the script POSTs the id to the job-info endpoint,
# parses the text response into a flat dict, appends that dict as one JSON
# line to aibee.json and stores it in the local MongoDB collection
# aibee.aibee.

import json
import urllib.parse
import urllib.request
from urllib.parse import urlencode

import requests
from pymongo import MongoClient
from pyquery import PyQuery as pq


url = 'http://aibee.com/cn/joinus.aspx?action=jobinfo'

headers = {
    'Host': 'aibee.com',
    'Referer': 'http://aibee.com/cn/joinus.aspx',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

client = MongoClient()
db = client['aibee']
collection = db['aibee']
max_id = 50

# A response body of exactly this many characters is treated as "no posting".
# NOTE(review): presumably the length of the server's empty/placeholder
# reply -- confirm against a live response.
EMPTY_RESPONSE_LENGTH = 12

# Substrings removed from the free-text fields, in this exact order: a real
# tab character, the list markup, and finally a literal backslash followed
# by "t".
_MARKUP_TOKENS = ("\t", "<ul>", "<li>", "</li>", "</ul>", "\\t")


def _strip_markup(text):
    """Return *text* stripped of surrounding whitespace and list markup."""
    cleaned = text.strip()
    for token in _MARKUP_TOKENS:
        cleaned = cleaned.replace(token, "")
    return cleaned


def get_page(id):
    """Fetch and parse the job posting with the given id.

    The id is POSTed as form data to ``url``. The response text is split on
    both ``,`` and ``:`` and selected positions are picked out as fields.

    Args:
        id: Job id sent in the ``id`` form field. The name shadows the
            builtin; it is kept so existing keyword callers keep working.

    Returns:
        A dict with the keys ``id``, ``title``, ``zhize``, ``yaoqiu``,
        ``dtt`` and ``emailaddr`` (all strings), or ``0`` when the response
        body is exactly ``EMPTY_RESPONSE_LENGTH`` characters long.

    Raises:
        urllib.error.URLError: If the request fails.
        IndexError: If the response has fewer separated fields than the
            parser expects.
    """
    form_data = {'id': id}

    # urlencode() yields str; the POST body must be bytes.
    data = urllib.parse.urlencode(form_data).encode("utf-8")
    request = urllib.request.Request(url, data=data, headers=headers)
    # Context manager closes the HTTP response even if read/decode fails.
    with urllib.request.urlopen(request) as response:
        result = response.read().decode('utf-8')

    if len(result) == EMPTY_RESPONSE_LENGTH:
        return 0

    # Turn commas into colons so one split yields every key and value as a
    # separate element; the values of interest sit at fixed positions.
    fields = result.replace(",", ":").split(':')

    return {
        'id': fields[2].strip(),
        'title': fields[4].strip(),
        'zhize': _strip_markup(fields[6]),
        'yaoqiu': _strip_markup(fields[8]),
        # Drops the last three characters of the field.
        # NOTE(review): presumably a trailing hour fragment left over from
        # splitting a timestamp on ':' -- confirm.
        'dtt': fields[12].strip()[:-3],
        'emailaddr': fields[16].strip(),
    }


def write_to_file(content):
    """Append *content* to aibee.json as a single JSON line."""
    # The with-statement closes the file; no explicit close() is needed.
    with open('aibee.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def save_to_mongo(result):
    """Insert *result* into the MongoDB collection and report success."""
    # Collection.insert() was removed in PyMongo 4; insert_one() is the
    # supported single-document API.
    if collection.insert_one(result).inserted_id is not None:
        print('Saved to Mongo')


def main():
    """Scrape ids 1..max_id and persist every posting that exists."""
    for job_id in range(1, max_id + 1):
        content = get_page(job_id)
        if content != 0:
            print(content)
            # Write the file first: the Mongo insert adds an ``_id`` key to
            # the dict, which json.dumps cannot serialize.
            write_to_file(content)
            save_to_mongo(content)


if __name__ == "__main__":
    main()