python 爬虫系列09-selenium+拉勾

使用selenium爬取拉勾网职位

 1 from selenium import webdriver
 2 from lxml import etree
 3 import re
 4 import time
 5 from selenium.webdriver.support.ui import WebDriverWait
 6 from selenium.webdriver.support import expected_conditions as EC
 7 from selenium.webdriver.common.by import By
 8 class LagouSpider(object):
 9     driver_path = r"D:\driver\chromedriver.exe"
10 
11     def __init__(self):
12         self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
13         self.url = 'https://www.lagou.com/jobs/list_%E4%BA%91%E8%AE%A1%E7%AE%97?labelWords=&fromSearch=true&suginput='
14         self.positions = []
15 
16     def run(self):
17         self.driver.get(self.url)
18         while True:
19             source = self.driver.page_source
20             WebDriverWait(driver=self.driver,timeout=10).until(
21                 EC.presence_of_element_located((By.XPATH, "//div[@class='pager_container']/span[last()]"))
22             )
23             self.parse_list_page(source)
24             try:
25                 next_btn = self.driver.find_element_by_xpath("//div[@class='pager_container']/span[last()]")
26                 if "pager_next_disabled" in next_btn.get_attribute("class"):
27                     break
28                 else:
29                     next_btn.click()
30             except:
31                 print(source)
32 
33             time.sleep(1)
34 
35     def parse_list_page(self,source):
36         html = etree.HTML(source)
37         links = html.xpath("//a[@class='position_link']/@href")
38         for link in links:
39             self.request_detail_page(link)
40             time.sleep(1)
41 
42     def request_detail_page(self,url):
43         # self.driver.get(url)
44         print()
45         print(url)
46         print()
47         self.driver.execute_script("window.open('%s')" % url)
48         self.driver.switch_to.window(self.driver.window_handles[1])
49         WebDriverWait(self.driver,timeout=10).until(
50             EC.presence_of_element_located((By.XPATH,"//div[@class='job-name']/span[@class='name']"))
51         )
52         source = self.driver.page_source
53         self.parse_detail_page(source)
54         self.driver.close()
55         self.driver.switch_to.window(self.driver.window_handles[0])
56 
57     def parse_detail_page(self,source):
58         html = etree.HTML(source)
59         position_name = html.xpath("//span[@class='name']/text()")[0]
60         job_request_spans = html.xpath("//dd[@class='job_request']//span")
61         salary = job_request_spans[0].xpath('.//text()')[0].strip()
62         city = job_request_spans[1].xpath(".//text()")[0].strip()
63         city = re.sub(r"[\s/]", "", city)
64         work_years = job_request_spans[2].xpath(".//text()")[0].strip()
65         work_years = re.sub(r"[\s/]", "", work_years)
66         education = job_request_spans[3].xpath(".//text()")[0].strip()
67         education = re.sub(r"[\s/]", "", education)
68         desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
69         company_name = html.xpath("//h2[@class='f1']/text()")
70         position = {
71             'name': position_name,
72             'company_name': company_name,
73             'salary': salary,
74             'city': city,
75             'work_years': work_years,
76             'education': education,
77             'desc': desc
78         }
79         self.positions.append(position)
80         print(position)
if __name__ == '__main__':
    # Script entry point: build the spider and start crawling.
    LagouSpider().run()

 




 

posted on 2018-11-13 18:29  kingle-l  阅读(396)  评论(0编辑  收藏  举报

levels of contents