Scraping Lagou (拉勾网) job listings with Selenium

import time
import pymysql
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
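# connect to the local MySQL database that will hold the scraped postings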
db=pymysql.connect(host='localhost',user='root',password='123qwe',database='ok')
cur=db.cursor()
#sql='create table lagou(name varchar(50),salary varchar(10),tiao varchar(10),company varchar(20),skill varchar(30),daiyu varchar(30))'
#cur.execute(sql)
url='https://www.lagou.com/'
#opt = webdriver.ChromeOptions()
#opt.add_argument('--headless')
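# launch Chrome and open the Lagou home page (uncomment the options above and pass options=opt for headless runs)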
browser=webdriver.Chrome()
browser.get(url)
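# dismiss the pop-up box shown on the home page (its close button has id "cboxClose")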
browser.find_element_by_id("cboxClose").click()
time.sleep(1)
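# focus the search box with a JavaScript click, type the keyword, then trigger the search button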
ok=browser.find_element_by_xpath('//input[@id="search_input"]')
browser.execute_script("arguments[0].click();", ok)
browser.find_element_by_xpath('//input[@id="search_input"]').send_keys('python')
op=browser.find_element_by_id('search_button')
browser.execute_script("arguments[0].click();",op)
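# walk through up to 30 result pages, scraping and saving each one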
k=1
while k <= 30:
  print(k)
  time.sleep(2)
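  # collect job title, salary/requirement line, company, skill tags and benefits for every listing on the page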
  name=browser.find_elements_by_xpath('//a[@class="position_link"]/h3')
  classd=browser.find_elements_by_xpath('//div[@class="p_bot"]/div[@class="li_b_l"]')
  company=browser.find_elements_by_xpath('//div[@class="company_name"]/a')
  skill=browser.find_elements_by_xpath('//div[@class="list_item_bot"]/div[@class="li_b_l"]')
  daiyu=browser.find_elements_by_xpath('//div[@class="list_item_bot"]/div[@class="li_b_r"]')
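  # insert one row per listing; the li_b_l text is split on a space into salary and requirements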
  for i in range(len(name)):
      sql='insert into lagou(name,salary,tiao,company,skill,daiyu) values(%s,%s,%s,%s,%s,%s)'
      value=(name[i].text,classd[i].text.split(' ')[0],classd[i].text.split(' ')[1],company[i].text,skill[i].text,daiyu[i].text)
      cur.execute(sql,value)
      db.commit()
  time.sleep(1)
  js="var q=document.documentElement.scrollTop=3000"
  broswer.execute_script(js)#下拉滚动条
  time.sleep(1)
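  # hover over the pagination bar so the next-page button is in view and clickable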
  pages = browser.find_element_by_xpath('//*[@id="s_position_list"]/div[2]/div/span[6]')
  ActionChains(browser).move_to_element(pages).perform()
  try:
      next_btn = WebDriverWait(browser, 10).until(
          EC.element_to_be_clickable((By.CLASS_NAME, 'pager_next '))
      )
      next_btn.click()
  except TimeoutException:
      # clicking next is the critical step: a plain click() that silently fails here
      # can jump from page 3 straight to the last page, so fall back to a JavaScript click
      next_btn = browser.find_element_by_class_name('pager_next ')
      browser.execute_script("arguments[0].click();", next_btn)
  k=k+1
cur.close()
db.close()
browser.close()
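
Before the first run the target table has to exist. A minimal one-off sketch based on the commented-out statement near the top of the script (it assumes the ok database has already been created on the local MySQL server):

import pymysql

# create the lagou table once, then leave the statement commented out in the scraper
db=pymysql.connect(host='localhost',user='root',password='123qwe',database='ok')
cur=db.cursor()
cur.execute('create table lagou(name varchar(50),salary varchar(10),tiao varchar(10),company varchar(20),skill varchar(30),daiyu varchar(30))')
db.commit()
cur.close()
db.close()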
