Scraping Lagou company detail pages with selenium + lxml

The spider below drives Chrome through Lagou's Python job listings: it pages through the list via the "next page" button, opens each posting in a new tab, follows the posting's link to the company detail page, and buffers the extracted fields into a CSV file.
# encoding: utf-8
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
import time
import csv


class LagouSpider(object):
    # absolute path to chromedriver
    driver_path = '/Users/mac126/chromedriver'

    def __init__(self):
        # create the driver, pointing it at the chromedriver binary
        self.driver = webdriver.Chrome(executable_path=self.driver_path)
        self.company_lists = []
        self.fp = open('lagou.csv', 'a', encoding='utf-8', newline='')
        self.writer = csv.DictWriter(
            self.fp, ['company_name', 'img', 'scale', 'address', 'description'])
        self.writer.writeheader()

    def run(self):
        # entry point: open the job list page and walk through every page
        # url = 'https://www.lagou.com/jobs/list_java?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput='
        url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        self.driver.get(url)
        while True:
            # wait until the "next page" button has rendered before parsing
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//span[contains(@class,'pager_next')]"))
            )
            resource = self.driver.page_source
            self.parse_list_page(resource)
            next_btn = self.driver.find_element_by_xpath(
                "//span[contains(@class,'pager_next')]")
            # a disabled "next" button means we are on the last page
            if "pager_next_disabled" in next_btn.get_attribute('class'):
                break
            next_btn.click()
            time.sleep(5)
        # flush any rows still sitting in the buffer, then clean up
        if self.company_lists:
            self.writer.writerows(self.company_lists)
            self.company_lists.clear()
        self.fp.close()
        self.driver.quit()

    def parse_list_page(self, resource):
        '''
        Pull the job detail links out of one list page.
        :param resource: page source of the list page
        '''
        html = etree.HTML(resource)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.parse_detail_page(link)
            time.sleep(1)

    def parse_detail_page(self, url):
        '''
        Open a job detail page in a new tab and follow its company link.
        :param url: url of the job detail page
        '''
        self.driver.execute_script("window.open('" + url + "')")
        self.driver.switch_to.window(self.driver.window_handles[1])
        WebDriverWait(self.driver, timeout=10).until(
            EC.presence_of_element_located((By.XPATH, "//dd[@class='job_bt']"))
        )
        resource = self.driver.page_source
        html = etree.HTML(resource)
        # link to the company detail page
        third_url = html.xpath('//*[@id="job_company"]/dt/a/@href')[0]
        self.parse_three_page(third_url)
        # close the tab and switch back to the list page
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_three_page(self, url):
        '''
        Parse the company detail page.
        :param url: url of the company page
        '''
        self.driver.get(url)
        resource = self.driver.page_source
        html = etree.HTML(resource)
        company_name = html.xpath(
            '//div[@class="company_info"]/div[@class="company_main"]/h1/a/text()')[0]
        img = html.xpath('//div[@class="top_info"]/div[1]/img/@src')[0]
        scale = html.xpath(
            '//*[@id="basic_container"]/div[2]/ul/li[3]/span/text()')[0]
        address = html.xpath(
            '//*[@id="basic_container"]/div[2]/ul/li[4]/span/text()')[0]
        # the intro text is sometimes wrapped in <p> tags and sometimes bare
        description = html.xpath(
            '//div[@class="company_intro_text"]/span[@class="company_content"]/p/text()')
        if not description:
            description = html.xpath(
                '//div[@class="company_intro_text"]/span[@class="company_content"]/text()')
        if description:
            company = {
                'company_name': company_name,
                'img': img,
                'scale': scale,
                'address': address,
                'description': "".join(description),
            }
            self.write_position(company)

    def write_position(self, company):
        '''
        Buffer one record; write the buffer to the CSV every 100 records.
        :param company: dict holding one company's fields
        '''
        if len(self.company_lists) >= 100:
            self.writer.writerows(self.company_lists)
            self.company_lists.clear()
        self.company_lists.append(company)
        print(company)


def main():
    spider = LagouSpider()
    spider.run()


if __name__ == '__main__':
    main()
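A version note: the script above uses the Selenium 3 API. Selenium 4 removed both the executable_path argument and the find_element_by_* helpers, so running it against a current install requires updating those two call sites. A minimal sketch of the equivalent calls, reusing the chromedriver path from above:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4: the driver path is passed through a Service object
driver = webdriver.Chrome(service=Service('/Users/mac126/chromedriver'))
driver.get('https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=')
# find_element_by_xpath(...) becomes find_element(By.XPATH, ...)
next_btn = driver.find_element(By.XPATH, "//span[contains(@class,'pager_next')]")
print(next_btn.get_attribute('class'))
driver.quit()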
The scraped results are shown in the figure below:
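Beyond the screenshot, a quick way to sanity-check a run is to load the CSV with pandas. This is a hedged sketch: pandas is not used by the spider itself, and since the file is opened in append mode, a second run will append a duplicate header row, so it assumes a single run produced the file:

import pandas as pd

# load the CSV written by the spider and peek at a few columns
df = pd.read_csv('lagou.csv')
print(df.shape)
print(df[['company_name', 'scale', 'address']].head())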