Python 使用 selenium 爬取拉勾网

一、爬取方式

　　　用一般的爬取方式（例如直接请求页面）会发现得不到任何信息，所以我们选择用 selenium 驱动浏览器来爬取数据。

二、下面为源码

import json
import time
import urllib.parse

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

class Lagou(object):
    """Headless-Chrome scraper for Lagou job-search result pages.

    Usage: create an instance, call :meth:`init` (it prompts for a job
    keyword and a cookie string and opens the search page), then call
    :meth:`get_html` for the first page and :meth:`get_next_page` for the
    following ones. Every scraped posting is appended to ``lagou.json`` as
    one JSON object per line.
    """

    BASE_URL = 'https://www.lagou.com/'
    OUTPUT_PATH = 'lagou.json'

    # One XPath per scraped column, in the order the record fields are built.
    _XPATHS = (
        '//a[@class="position_link"]',
        '//a[@class="position_link"]/h3',
        '//a[@class="position_link"]/span/em',
        '//div[@class="p_bot"]//div[@class="li_b_l"]',
        '//div[@class="list_item_bot"]//div[@class="li_b_l"]',
    )

    def init(self):
        """Prompt for the search inputs, start the browser and open page one.

        This is a plain method, not ``__init__``: callers must invoke it
        explicitly before using :meth:`get_html` or :meth:`get_next_page`.
        """
        self.flag = True  # cleared as soon as scraping should stop
        opt = webdriver.ChromeOptions()
        # ``set_headless()`` and the ``chrome_options=`` keyword are deprecated
        # (and gone in Selenium 4); these are the supported spellings.
        opt.add_argument('--headless')
        self.driver = webdriver.Chrome(options=opt)  # no visible browser window
        self.wait = WebDriverWait(self.driver, 10)  # timeout in seconds
        self.job = input('请输入想了解的职业')
        cookie = input('请输入cookie：')
        # WebDriver only accepts cookies for the domain of the page that is
        # currently loaded, so visit the site before adding them.
        self.driver.get(self.BASE_URL)
        for entry in self._parse_cookies(cookie):
            self.driver.add_cookie(entry)
        self.url = 'https://www.lagou.com/jobs/list_'+ urllib.parse.quote(self.job)+'?&cl=false&fromSearch=true&labelWords=&suginput='
        self.driver.get(self.url)

    @staticmethod
    def _parse_cookies(cookie):
        """Turn a ``name=value; name2=value2`` string into cookie dicts.

        Only the first ``=`` of a segment separates the name from the value,
        so values that themselves contain ``=`` stay intact. Empty segments
        (e.g. after a trailing ``;``) and segments without a name or without
        ``=`` are skipped instead of raising.

        Returns:
            A list of ``{'name': ..., 'value': ...}`` dicts.
        """
        cookies = []
        for segment in cookie.split(';'):
            name, sep, value = segment.strip().partition('=')
            name = name.strip()
            if not sep or not name:
                continue
            cookies.append({'name': name, 'value': value.strip()})
        return cookies

    def get_html(self):
        """Scrape the postings on the current page and append them to disk.

        Waits for each element group listed in ``_XPATHS``, combines them
        position by position into one record per posting, and appends the
        records to ``OUTPUT_PATH`` as UTF-8 JSON lines.

        If the elements do not show up within the wait timeout, or any other
        WebDriver error occurs, ``self.flag`` is set to ``False`` so that
        :meth:`get_next_page` stops, the reason is printed, and nothing is
        written for this page. File errors are not swallowed.
        """
        try:
            links, titles, addresses, salaries, skills = (
                self.wait.until(
                    EC.presence_of_all_elements_located((By.XPATH, xpath)))
                for xpath in self._XPATHS
            )
            records = [
                {
                    '链接': link.get_attribute('href'),
                    '职业': title.text,
                    '公司地址': address.text,
                    '工资和要求': salary.text,
                    '技能要求': skill.text,
                }
                for link, title, address, salary, skill
                in zip(links, titles, addresses, salaries, skills)
            ]
        except WebDriverException as exc:
            self.flag = False
            print('爬取结束，原因：%r' % (exc,))
            return

        # Open the file once per page and emit real JSON (one object per
        # line) so the ``.json`` file can be parsed back.
        with open(self.OUTPUT_PATH, 'a', encoding='utf-8') as f:
            f.writelines(
                json.dumps(record, ensure_ascii=False) + '\n'
                for record in records
            )

    def get_next_page(self):
        """Click through the following result pages, scraping each one.

        Page 1 is expected to have been scraped by an earlier
        :meth:`get_html` call, so numbering starts at 2. The loop ends when
        ``self.flag`` becomes ``False`` or the next-page button cannot be
        used any more.
        """
        page = 1
        while self.flag:
            try:
                next_button = self.wait.until(
                    EC.element_to_be_clickable((By.CLASS_NAME, 'pager_next')))
            except TimeoutException:
                # No usable next-page button: treat it as the end of results.
                self.flag = False
                break
            # NOTE(review): presumably the site marks the button on the last
            # page with a "pager_next_disabled" class; without this check the
            # loop would re-scrape the last page forever - confirm against
            # the live markup.
            if 'pager_next_disabled' in (next_button.get_attribute('class') or ''):
                self.flag = False
                break
            next_button.click()
            page += 1
            print('正在爬取第%d页'%page)
            time.sleep(3)  # give the page time to load the new results
            self.get_html()

if __name__ == '__main__':
    # Script entry point: scrape the first result page, then keep paging.
    scraper = Lagou()
    try:
        scraper.init()
        scraper.get_html()
        scraper.get_next_page()
    finally:
        # Always shut the browser down so no headless Chrome process is left
        # running. init() may fail before the driver exists, hence getattr.
        driver = getattr(scraper, 'driver', None)
        if driver is not None:
            driver.quit()

posted @ 2019-09-11 14:40 zcb_bai 阅读(408) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

zcb_bai

Python 使用 selenium 爬取拉勾网

一、爬取方式

二、下面为源码

公告