Python 使用 Selenium 爬取拉勾网

一、爬取方式

   用一般的爬取方式会发现得不到任何信息,所以我们选择用 Selenium 驱动浏览器来爬取数据。

二、下面为源码

import json
import time
import urllib.parse

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

class Lagou(object):
    """Scrape job listings for one search keyword from lagou.com.

    The class drives a headless Chrome instance through Selenium.  Creating
    an instance does nothing by itself: the set-up method is named ``init``
    (not ``__init__``), so callers must invoke :meth:`init` explicitly before
    :meth:`get_html` or :meth:`get_next_page`.

    Attributes set by :meth:`init`:
        flag: ``True`` while scraping should continue; set to ``False`` as
            soon as a page cannot be scraped or there is no next page.
        driver: the Selenium Chrome WebDriver.
        wait: a ``WebDriverWait`` bound to ``driver`` with a 10 s timeout.
        job: the search keyword typed by the user.
        url: the search-result URL built from ``job``.
    """

    # Page loaded before cookies are installed: WebDriver only accepts
    # cookies for the domain of the document that is currently loaded.
    BASE_URL = 'https://www.lagou.com/'

    # Records are appended here, one JSON object per line.
    OUTPUT_PATH = 'lagou.json'

    @staticmethod
    def _parse_cookies(cookie):
        """Split a ``Cookie`` header string into ``(name, value)`` pairs.

        Only the first ``=`` of each item separates name from value, so
        values that themselves contain ``=`` (e.g. base64 padding) survive.
        Empty items (such as the one after a trailing ``;``) and items
        without ``=`` or without a name are skipped instead of raising.
        """
        pairs = []
        for item in cookie.split(';'):
            name, sep, value = item.strip().partition('=')
            if sep and name:
                pairs.append((name, value))
        return pairs

    def init(self):
        """Prompt for keyword and cookies, start Chrome, open the result page.

        Reads two lines from standard input (the job keyword and a
        ``name=value; name=value`` cookie string), launches headless Chrome,
        installs the cookies and navigates to the search-result URL.
        """
        self.flag = True  # cleared when the crawl should stop

        # Ask before launching the browser so that aborting at a prompt
        # does not leave a Chrome process behind.
        self.job = input('请输入想了解的职业')
        cookie = input('请输入cookie:')
        self.url = ('https://www.lagou.com/jobs/list_'
                    + urllib.parse.quote(self.job)
                    + '?&cl=false&fromSearch=true&labelWords=&suginput=')

        # ``set_headless()`` / ``chrome_options=`` are deprecated and gone in
        # Selenium 4; the argument form works on both old and new releases.
        opt = webdriver.ChromeOptions()
        opt.add_argument('--headless')  # hide the browser window
        self.driver = webdriver.Chrome(options=opt)
        self.wait = WebDriverWait(self.driver, 10)  # explicit-wait timeout

        # Cookies can only be added once a page of the target domain is
        # loaded, so visit the site first and the search page afterwards.
        self.driver.get(self.BASE_URL)
        for name, value in self._parse_cookies(cookie):
            self.driver.add_cookie({'name': name, 'value': value})
        self.driver.get(self.url)

    def get_html(self):
        """Scrape the currently loaded result page and append it to disk.

        Waits for the listing elements, pairs them up positionally and
        appends one JSON object per listing to ``OUTPUT_PATH`` (UTF-8, one
        object per line).  If Selenium cannot locate the elements or reading
        them fails, the reason is printed and ``self.flag`` is set to
        ``False`` so that :meth:`get_next_page` stops.
        """
        locate_all = EC.presence_of_all_elements_located
        try:
            links = self.wait.until(locate_all(
                (By.XPATH, '//a[@class="position_link"]')))
            titles = self.wait.until(locate_all(
                (By.XPATH, '//a[@class="position_link"]/h3')))
            addresses = self.wait.until(locate_all(
                (By.XPATH, '//a[@class="position_link"]/span/em')))
            salaries = self.wait.until(locate_all(
                (By.XPATH, '//div[@class="p_bot"]//div[@class="li_b_l"]')))
            skills = self.wait.until(locate_all(
                (By.XPATH,
                 '//div[@class="list_item_bot"]//div[@class="li_b_l"]')))

            # Open the file once per page; explicit UTF-8 keeps the Chinese
            # text portable regardless of the platform default encoding.
            with open(self.OUTPUT_PATH, 'a', encoding='utf-8') as f:
                # zip() stops at the shortest list, so a listing missing one
                # of the fields silently truncates the page's records.
                for link, title, address, salary, skill in zip(
                        links, titles, addresses, salaries, skills):
                    record = {
                        '链接': link.get_attribute('href'),
                        '职业': title.text,
                        '公司地址': address.text,
                        '工资和要求': salary.text,
                        '技能要求': skill.text,
                    }
                    f.write(json.dumps(record, ensure_ascii=False) + '\n')
        except WebDriverException as exc:
            # Best effort: a page that cannot be scraped ends the crawl, but
            # the reason is reported instead of being swallowed.
            print('页面解析失败,停止爬取: %s' % exc)
            self.flag = False

    def get_next_page(self):
        """Click through the remaining result pages, scraping each one.

        Assumes the first page has already been scraped by a prior
        :meth:`get_html` call (as the ``__main__`` block does), so counting
        starts at page 2.  The loop ends when ``self.flag`` is cleared, when
        the "next" button cannot be found or clicked, or when it is marked
        as disabled.
        """
        page = 1  # the page currently shown in the browser
        while self.flag:
            try:
                next_button = self.wait.until(
                    EC.element_to_be_clickable((By.CLASS_NAME, 'pager_next')))
                # NOTE(review): the site appears to mark the last page's
                # button with an extra "pager_next_disabled" class -- confirm
                # against the live markup.  The check is harmless otherwise.
                css_classes = next_button.get_attribute('class') or ''
                if 'pager_next_disabled' in css_classes:
                    break
                next_button.click()
            except WebDriverException as exc:
                print('无法翻页,停止爬取: %s' % exc)
                break
            page += 1
            print('正在爬取第%d页' % page)
            time.sleep(3)  # give the new page time to render
            self.get_html()
        self.flag = False

if __name__ == '__main__':
    # Script entry point: set up the scraper, scrape the first result page,
    # then keep paging until the scraper reports there is nothing left.
    l = Lagou()
    try:
        l.init()
        l.get_html()
        l.get_next_page()
    finally:
        # Always shut the browser down, even on error or Ctrl-C; otherwise
        # the headless Chrome process outlives the script.  ``driver`` only
        # exists once init() got far enough to create it.
        driver = getattr(l, 'driver', None)
        if driver is not None:
            driver.quit()

 

posted @ 2019-09-11 14:40  zcb_bai  阅读(408)  评论(0)  编辑  收藏  举报