Crawler 22 - Scraping Information with Selenium
1. Scraping Lagou's AJAX data directly with requests and cookies
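Lagou serves its job list through the positionAjax.json endpoint, which returns JSON. The endpoint rejects bare requests, so the POST must carry a browser-like User-Agent, the list page as Referer, and a logged-in Cookie copied from the browser. Note that the cookie below is tied to one session and expires quickly; replace it with your own before running the script.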
import requests
from lxml import etree
import time
import re

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
    # session cookie copied from a logged-in browser; it expires quickly, so use your own
    "Cookie": "user_trace_token=20200226133453-084540c1-9531-4fa8-873f-0dda32aa3ca4; _ga=GA1.2.836052667.1582695295; LGUID=20200226133454-167deda5-1930-4e79-8834-719427ac01be; index_location_city=%E5%85%A8%E5%9B%BD; lagou_utm_source=A; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221707ffdf39c2c3-0001957fd8ade1-3a614f0b-2073600-1707ffdf39de5f%22%2C%22%24device_id%22%3A%221707ffdf39c2c3-0001957fd8ade1-3a614f0b-2073600-1707ffdf39de5f%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; gate_login_token=5976db005818f45ed7756b1348563965e46f1400511d886af3d4d57dd9d9166a; LG_LOGIN_USER_ID=5b895ff2a4e23c48dc4c9110a6a1361bbf709630b5b17ac6756340fef1babfbf; LG_HAS_LOGIN=1; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; privacyPolicyPopup=false; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1583857959,1583912708,1583912713; JSESSIONID=ABAAAECABGFABFF1412C84500FD39A23D7C1D5172179D66; WEBTJ-ID=20200315123348-170dc782d0e4cf-05e9fb23740e5e-3a614f0b-2073600-170dc782d0f63d; _gid=GA1.2.1720707822.1584246829; _putrc=387928C58CE0A7D1123F89F2B170EADC; login=true; unick=%E7%90%B3%E7%90%B3; TG-TRACK-CODE=index_search; X_MIDDLE_TOKEN=0a8830791829a77f99654a1bb3d568ae; LGSID=20200315140707-568ce08c-c655-44b2-9cd4-66632e1bb6f4; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist%5Fpython%2Fp-city%5F0%3F%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; _gat=1; SEARCH_ID=79abbbd66c2b4a59b7ca19ee8fb77e01; X_HTTP_TOKEN=9944cc335d13b0d30552524851b568c7665cd1a0ff; LGRID=20200315140911-acf5dfc4-1c8f-4943-a93f-983d364a96db",
    "Origin": "https://www.lagou.com",
    "X-Anit-Forge-Code": "0",
    "X-Anit-Forge-Token": "None",
    "X-Requested-With": "XMLHttpRequest"
}

positions = []


def request_list_page():
    url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
    data = {
        "first": "false",
        "pn": "1",
        "kd": "python"
    }
    for x in range(1, 10):
        data['pn'] = x
        response = requests.post(url, data=data, headers=headers)
        result = response.json()  # a JSON response body is parsed into a dict
        position_list = result['content']['positionResult']['result']
        for position in position_list:
            positionId = position['positionId']  # use this id to build the detail page URL
            position_url = 'https://www.lagou.com/jobs/%s.html' % positionId
            parse_position_detail(position_url)
            break  # only the first position, for demonstration
        time.sleep(2)
        break  # only the first page, for demonstration


def parse_position_detail(url):
    response = requests.get(url, headers=headers)
    text = response.text
    html = etree.HTML(text)
    name = html.xpath("//div[@class='job-name']/@title")[0]
    job_span = html.xpath("//dd[@class='job_request']//span")
    salary = job_span[0].xpath('.//text()')[0].strip()
    city = job_span[1].xpath(".//text()")[0].strip()
    city = re.sub(r"[\s/]", "", city)
    position = {
        'name': name,
        'salary': salary,
        'city': city
    }
    positions.append(position)


def main():
    request_list_page()
    print(positions)


if __name__ == '__main__':
    main()
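The hard-coded Cookie string goes stale quickly. A common workaround is to let a requests.Session visit the list page first, so the session picks up fresh cookies, and then reuse the same session for the AJAX POST. This is a minimal sketch of that idea; it is not guaranteed to pass whatever anti-crawler checks Lagou currently runs:

import requests

LIST_URL = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
AJAX_URL = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    "Referer": LIST_URL,
}

session = requests.Session()
session.get(LIST_URL, headers=headers)  # the session stores whatever cookies Lagou sets here
response = session.post(
    AJAX_URL,
    data={"first": "false", "pn": 1, "kd": "python"},
    headers=headers,
)
print(response.json())

With this approach the Cookie header never has to be copied by hand, at the cost of one extra GET per session.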
2. Scraping Lagou's AJAX data with Selenium
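Selenium drives a real browser, so cookies and AJAX calls are handled for us. The spider below loads the list page in Firefox, waits for the pager to render, opens each position link in a new tab to scrape the detail page, and keeps clicking the next-page button until it is disabled.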
#encoding: utf-8
from selenium import webdriver
from lxml import etree
import re
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


class LagouSpider(object):
    def __init__(self):
        self.driver = webdriver.Firefox()
        self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        self.positions = []

    def run(self):
        self.driver.get(self.url)
        while True:
            # wait until the pager has rendered before grabbing the page source
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='pager_container']/span[last()]"))
            )
            source = self.driver.page_source
            self.parse_list_page(source)
            try:
                next_btn = self.driver.find_element_by_xpath("//div[@class='pager_container']/span[last()]")
                if "pager_next_disabled" in next_btn.get_attribute("class"):
                    break  # last page reached
                else:
                    next_btn.click()
            except Exception:
                print(source)
            time.sleep(1)

    def parse_list_page(self, source):
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(1)

    def request_detail_page(self, url):
        # self.driver.get(url)
        self.driver.execute_script("window.open('%s')" % url)  # open the detail page in a new tab
        self.driver.switch_to.window(self.driver.window_handles[1])  # move the driver to the new tab
        WebDriverWait(self.driver, timeout=10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']"))
        )
        source = self.driver.page_source
        self.parse_detail_page(source)
        self.driver.close()  # close the current detail tab
        self.driver.switch_to.window(self.driver.window_handles[0])  # switch back to the list page

    def parse_detail_page(self, source):
        html = etree.HTML(source)
        name = html.xpath("//div[@class='job-name']/@title")[0]
        job_span = html.xpath("//dd[@class='job_request']//span")
        salary = job_span[0].xpath('.//text()')[0].strip()
        city = job_span[1].xpath(".//text()")[0].strip()
        city = re.sub(r"[\s/]", "", city)
        position = {
            'name': name,
            'salary': salary,
            'city': city
        }
        self.positions.append(position)
        print(position)
        print('=' * 40)


if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()
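For long crawls it is often convenient to run the browser without a visible window. This is a minimal sketch, assuming Firefox and geckodriver are installed; the rest of LagouSpider stays unchanged except for how the driver is constructed:

from selenium import webdriver
from selenium.webdriver.firefox.options import Options

options = Options()
options.add_argument("-headless")  # Firefox takes -headless; Chrome uses --headless instead

driver = webdriver.Firefox(options=options)  # no browser window appears
driver.get("https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=")
print(driver.title)
driver.quit()

Headless mode uses less memory and works on servers without a display, but some anti-crawler systems detect it, so test with a visible browser first.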