Scraping Lagou company detail pages with selenium + lxml

The spider below drives Chrome through Lagou's Python job listings: it pages through the list via the "next page" button, opens each posting in a new tab, follows the posting's link to the company detail page, and buffers the extracted fields into a CSV file.
# encoding: utf-8
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
import time
import csv


class LagouSpider(object):
    # absolute path to chromedriver
    driver_path = '/Users/mac126/chromedriver'

    def __init__(self):
        # create the driver, pointing it at the chromedriver binary
        self.driver = webdriver.Chrome(executable_path=self.driver_path)
        self.company_lists = []
        self.fp = open('lagou.csv', 'a', encoding='utf-8', newline='')
        self.writer = csv.DictWriter(
            self.fp, ['company_name', 'img', 'scale', 'address', 'description'])
        self.writer.writeheader()

    def run(self):
        # entry point: open the job list page and walk through every page
        # url = 'https://www.lagou.com/jobs/list_java?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput='
        url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        self.driver.get(url)
        while True:
            # wait until the "next page" button has rendered before parsing
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//span[contains(@class,'pager_next')]"))
            )
            resource = self.driver.page_source
            self.parse_list_page(resource)
            next_btn = self.driver.find_element_by_xpath(
                "//span[contains(@class,'pager_next')]")
            # a disabled "next" button means we are on the last page
            if "pager_next_disabled" in next_btn.get_attribute('class'):
                break
            next_btn.click()
            time.sleep(5)
        # flush any rows still sitting in the buffer, then clean up
        if self.company_lists:
            self.writer.writerows(self.company_lists)
            self.company_lists.clear()
        self.fp.close()
        self.driver.quit()

    def parse_list_page(self, resource):
        '''
        Pull the job detail links out of one list page.
        :param resource: page source of the list page
        '''
        html = etree.HTML(resource)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.parse_detail_page(link)
            time.sleep(1)

    def parse_detail_page(self, url):
        '''
        Open a job detail page in a new tab and follow its company link.
        :param url: url of the job detail page
        '''
        self.driver.execute_script("window.open('" + url + "')")
        self.driver.switch_to.window(self.driver.window_handles[1])
        WebDriverWait(self.driver, timeout=10).until(
            EC.presence_of_element_located((By.XPATH, "//dd[@class='job_bt']"))
        )
        resource = self.driver.page_source
        html = etree.HTML(resource)
        # link to the company detail page
        third_url = html.xpath('//*[@id="job_company"]/dt/a/@href')[0]
        self.parse_three_page(third_url)
        # close the tab and switch back to the list page
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_three_page(self, url):
        '''
        Parse the company detail page.
        :param url: url of the company page
        '''
        self.driver.get(url)
        resource = self.driver.page_source
        html = etree.HTML(resource)
        company_name = html.xpath(
            '//div[@class="company_info"]/div[@class="company_main"]/h1/a/text()')[0]
        img = html.xpath('//div[@class="top_info"]/div[1]/img/@src')[0]
        scale = html.xpath(
            '//*[@id="basic_container"]/div[2]/ul/li[3]/span/text()')[0]
        address = html.xpath(
            '//*[@id="basic_container"]/div[2]/ul/li[4]/span/text()')[0]
        # the intro text is sometimes wrapped in <p> tags and sometimes bare
        description = html.xpath(
            '//div[@class="company_intro_text"]/span[@class="company_content"]/p/text()')
        if not description:
            description = html.xpath(
                '//div[@class="company_intro_text"]/span[@class="company_content"]/text()')
        if description:
            company = {
                'company_name': company_name,
                'img': img,
                'scale': scale,
                'address': address,
                'description': "".join(description),
            }
            self.write_position(company)

    def write_position(self, company):
        '''
        Buffer one record; write the buffer to the CSV every 100 records.
        :param company: dict holding one company's fields
        '''
        if len(self.company_lists) >= 100:
            self.writer.writerows(self.company_lists)
            self.company_lists.clear()
        self.company_lists.append(company)
        print(company)


def main():
    spider = LagouSpider()
    spider.run()


if __name__ == '__main__':
    main()
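A version note: the script above uses the Selenium 3 API. Selenium 4 removed both the executable_path argument and the find_element_by_* helpers, so running it against a current install requires updating those two call sites. A minimal sketch of the equivalent calls, reusing the chromedriver path from above:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4: the driver path is passed through a Service object
driver = webdriver.Chrome(service=Service('/Users/mac126/chromedriver'))
driver.get('https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=')
# find_element_by_xpath(...) becomes find_element(By.XPATH, ...)
next_btn = driver.find_element(By.XPATH, "//span[contains(@class,'pager_next')]")
print(next_btn.get_attribute('class'))
driver.quit()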
The scraped results are shown in the figure below:
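Beyond the screenshot, a quick way to sanity-check a run is to load the CSV with pandas. This is a hedged sketch: pandas is not used by the spider itself, and since the file is opened in append mode, a second run will append a duplicate header row, so it assumes a single run produced the file:

import pandas as pd

# load the CSV written by the spider and peek at a few columns
df = pd.read_csv('lagou.csv')
print(df.shape)
print(df[['company_name', 'scale', 'address']].head())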