使用selenium爬取51Job职位信息 入库mongoDB
selenium_51job_com.py
#!/usr/bin/env python3 # coding=utf-8 # Version:python3.6.1 # File:51job_com.py # Author:LGSP_Harold import pymongo from selenium import webdriver from selenium.webdriver.firefox.options import Options from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from lxml import etree import time class HandleWebdriver: def __init__(self): # 设置无头模式 options = Options() options.add_argument('--headless') self.browser = webdriver.Firefox(firefox_options=options) # self.browser.maximize_window() def handle_job(self): # 打开目的地址 self.browser.get( 'https://search.51job.com/list/000000,000000,0000,00,9,99,+,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=') # 通过WebDriverWait进行显式等待,等待搜索框 if WebDriverWait(self.browser, 5, 0.5).until(EC.presence_of_element_located((By.ID, 'keywordInput'))): # 外部获取输入岗位信息 input_keyword = input('请输入要查找的岗位:') # 将要查找的信息发送到搜索框 self.browser.find_element_by_id('keywordInput').send_keys(input_keyword) # 点击搜索 self.browser.find_element_by_id('search_btn').click() if WebDriverWait(self.browser, 5, 0.5).until(EC.presence_of_element_located((By.CLASS_NAME, 'j_joblist'))): # 查看网页源代码 # print(self.browser.page_source) while True: time.sleep(2) self.handle_parse(self.browser.page_source) try: if self.browser.find_element_by_xpath('//li[@class="next"]/a'): self.browser.find_element_by_xpath('//li[@class="next"]/a').click() except: break self.browser.quit() def handle_parse(self, page_source): html_obj = etree.HTML(page_source) items = html_obj.xpath('//div[@class="j_joblist"]/div[@class="e"]') data_list = [] for item in items: data = {} data['job_name'] = item.xpath('.//a/p[@class="t"]/span[@class="jname at"]/text()')[0] data['time'] = item.xpath('.//a/p[@class="t"]/span[@class="time"]/text()')[0] try: data['money'] = item.xpath('.//a/p[@class="info"]/span[@class="sal"]/text()')[0] except: data['money'] = '面议' data['address'] = item.xpath('.//a/p[@class="info"]/span[@class="d at"]/text()')[0] try: tags = item.xpath('.//a/p[@class="tags"]/span/i/text()') text = '' for tag in tags: text += tag + ' | ' data['tags'] = text except: data['tags'] = '暂无' data_list.append(data) # print(data_list) self.handle_mongodb(data_list) def handle_mongodb(self, data_list): client = pymongo.MongoClient('mongodb://admin:admin@127.0.0.1:27017') db = client['db_51job_com'] collections = db['collections_51job'] collections.insert_many(data_list) selenium = HandleWebdriver() selenium.handle_job()
略懂,略懂....