# -*- coding:utf-8 -*-
# Author:Sure Feng

from selenium import webdriver
from lxml import etree
import time
import json
import openpyxl


class LaGou(object):
    # Path to the chromedriver binary
    driver_path = r'E:\sure\ware\chromedriver.exe'

    def __init__(self):
        # Start a Chrome browser
        self.driver = webdriver.Chrome(executable_path=LaGou.driver_path)
        # URL of the list page (keyword "数据", city 广州)
        self.url = 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE?city=%E5%B9%BF%E5%B7%9E'
        self.positions = []

    def request_detail_page(self, url):
        '''Request a detail page in a new window and parse it.'''
        # Open the detail page in a new window
        self.driver.execute_script("window.open('%s')" % url)
        # Switch to the detail window
        self.driver.switch_to.window(self.driver.window_handles[1])
        # Give the page a moment to render before reading its source
        time.sleep(1)
        # Grab the detail page source
        source = self.driver.page_source
        # Parse the page and extract the fields
        self.parse_detail_page(source)
        # Close the detail window
        self.driver.close()
        # Switch back to the list window
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        '''Parse a detail page and extract the position fields.'''
        html = etree.HTML(source)
        info_list = []
        # Extract each field with XPath; every lookup is guarded because
        # any element may be missing from a given page
        position_id = html.xpath("//a[@class='send-CV-btn s-send-btn fr']/@data-position-id")
        position_web = "https://www.lagou.com/jobs/{}.html".format(position_id[0]) if len(position_id) > 0 else None
        info_list.append(position_web)
        position_name = html.xpath("//div[@class='job-name']/@title")
        position_name = position_name[0].strip() if len(position_name) > 0 else None
        info_list.append(position_name)
        salary = html.xpath("//dd[@class='job_request']/p/span[@class='salary']/text()")
        salary = salary[0].strip() if len(salary) > 0 else None
        info_list.append(salary)
        job_year = html.xpath("//dd[@class='job_request']/p/span[3]/text()")
        job_year = job_year[0].replace("/", "").strip() if len(job_year) > 0 else None
        info_list.append(job_year)
        grade = html.xpath("//dd[@class='job_request']/p/span[4]/text()")
        grade = grade[0].replace("/", "").strip() if len(grade) > 0 else None
        info_list.append(grade)
        publish_time = html.xpath("//p[@class='publish_time']/text()")
        publish_time = publish_time[0].replace("\xa0 发布于拉勾网", "").strip() if len(publish_time) > 0 else None
        info_list.append(publish_time)
        company_name = html.xpath("//img[@class='b2']/@alt")
        company_name = company_name[0] if len(company_name) > 0 else None
        info_list.append(company_name)
        company = html.xpath("//h2[@class='fl']/text()")
        company = company[0].strip() if len(company) > 0 else None
        info_list.append(company)
        job_advantage = html.xpath("//dd[@class='job-advantage']/p/text()")
        job_advantage = job_advantage[0].strip() if len(job_advantage) > 0 else None
        info_list.append(job_advantage)
        job_detail = html.xpath("//div[@class='job-detail']//text()")
        # Join the text nodes rather than stringifying the list
        job_detail = "".join(job_detail).replace(" ", "").strip() if len(job_detail) > 0 else None
        info_list.append(job_detail)
        work_addr = html.xpath("//div[@class='work_addr']/a[2]/text()")
        work_addr = work_addr[0].strip() if len(work_addr) > 0 else None
        info_list.append(work_addr)
        work_addr_detail = html.xpath("//div[@class='work_addr']//text()")
        # The third-from-last text node holds the detailed address in this layout
        work_addr_detail = work_addr_detail[-3].strip() if len(work_addr_detail) > 2 else None
        info_list.append(work_addr_detail)
        position_label_clearfix = html.xpath("//ul[@class='position-label clearfix']/li[@class='labels']//text()")
        position_label_clearfix = ",".join(position_label_clearfix).strip() if len(position_label_clearfix) > 0 else None
        info_list.append(position_label_clearfix)
        c_feature = html.xpath("//ul[@class='c_feature']/li/text()")
        zone = c_feature[1].strip() if len(c_feature) > 1 else None
        info_list.append(zone)
        development = html.xpath("//i[@class='icon-glyph-trend']/../text()")
        development = development[1].strip() if len(development) > 1 else None
        info_list.append(development)
        people_num = html.xpath("//i[@class='icon-glyph-figure']/../text()")
        people_num = people_num[1].strip() if len(people_num) > 1 else None
        info_list.append(people_num)
        Investment_institution = html.xpath("//p[@class='financeOrg']/text()")
        Investment_institution = Investment_institution[0].strip() if len(Investment_institution) > 0 else None
        info_list.append(Investment_institution)

        # Alternative: collect the fields into a dict and append it to the JSON file
        # info_dict = {
        #     'company': company,
        #     'position_name': position_name,
        #     'salary': salary,
        #     'job_year': job_year,
        #     'grade': grade,
        #     'publish_time': publish_time,
        #     'zone': zone,
        #     'job_advantage': job_advantage,
        #     'job_detail': job_detail,
        # }
        # self.positions.append(info_dict)
        # with open("lagou.json", encoding="utf-8", mode="a") as f:
        #     f.write(json.dumps(info_dict, ensure_ascii=False, indent=2))
        #     f.write("\n")

        # Save the row to the position list
        self.positions.append(info_list)
        # Round-trip through GBK so printing doesn't crash on a GBK Windows console
        print(str(self.positions).encode('GBK', 'ignore').decode('GBK'))
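
    # Optional sketch (not in the original script): an explicit wait is more
    # reliable than the fixed time.sleep() in request_detail_page(). The
    # helper below uses the standard selenium support package; for example,
    # request_detail_page() could call
    # self.wait_for_element("//div[@class='job-name']") before reading
    # page_source, since that element appears on every detail page.
    def wait_for_element(self, xpath, timeout=10):
        '''Block until the element located by xpath is present, or time out.'''
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.support import expected_conditions as EC
        from selenium.webdriver.common.by import By
        WebDriverWait(self.driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        )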

    def parse_list_page(self, source):
        '''Parse the list page and visit each detail link.'''
        html = etree.HTML(source)
        # Collect the detail-page URLs
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            print(link)
            # Visit the detail page
            self.request_detail_page(link)
            time.sleep(1)

    def save_excel(self, positions):
        '''Write the collected rows to an Excel workbook.'''
        wb = openpyxl.Workbook()
        now_time = time.time()
        ws = wb.create_sheet("lagou" + str(int(now_time)))
        # Column order must match the append order in parse_detail_page()
        title = ["position_web", "position_name", "salary", "job_year", "grade",
                 "publish_time", "company_name", "company", "job_advantage",
                 "job_detail", "work_addr", "work_addr_detail",
                 "position_label_clearfix", "zone", "development", "people_num",
                 "Investment_institution"]
        ws.append(title)
        for row in positions:
            ws.append(row)
        # openpyxl writes the xlsx format, so use an .xlsx extension
        wb.save("lagou.xlsx")

    def run(self):
        # Open the list page
        self.driver.get(self.url)
        while True:
            # Grab and parse the current list page
            source = self.driver.page_source
            self.parse_list_page(source)
            # Jump to the next page, or stop once the "next" button is disabled
            next_btn = self.driver.find_element_by_xpath("//div[@class='pager_container']/span[last()]")
            if "pager_next pager_next_disabled" in next_btn.get_attribute("class"):
                break
            else:
                next_btn.click()
                time.sleep(3)
        # Save the data to Excel
        self.save_excel(self.positions)


if __name__ == '__main__':
    # Create the spider
    spider = LaGou()
    # Call run() to start crawling
    spider.run()
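
# Optional sanity check (a minimal sketch, not part of the original script):
# re-open the workbook written by save_excel() and report how many rows were
# saved. Assumes the crawl finished and "lagou.xlsx" exists in the working
# directory.
# wb = openpyxl.load_workbook("lagou.xlsx")
# ws = wb.worksheets[-1]                     # the timestamped "lagou..." sheet
# print(ws.max_row - 1, "positions saved")   # subtract the header row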