拉勾爬取部分重写
1. 实现方式:
- scrapy+selenium
- 实现scrapy中的spider即可
2. 实现目标:
- 对接之前的公司项目模板,实现统一化;
- 将地域名称由汉字转为拼音,拼接成URL后发起请求;
- 通过selenium获取网页源码,并解析所需信息;
- 通过yield返回item,交由后续模板流程处理。
4. 汉字转拼音的解决方法(使用pypinyin):
from pypinyin import lazy_pinyin

# Convert the Chinese place name into a list of pinyin syllables.
a = lazy_pinyin("南京")
first_syllable, second_syllable = a[0], a[1]
print(first_syllable)

print(second_syllable)
# String concatenation
print(first_syllable + second_syllable)
5. 结果:
nan
jing
nanjing
6. spider核心代码:
# -*- coding: utf-8 -*-
import time

import scrapy
from lxml import etree
from pypinyin import lazy_pinyin
from selenium import webdriver
from selenium.webdriver import ActionChains

from TZtalent.items import TztalentItem


class LagouproSpider(scrapy.Spider):
    """Scrape Lagou job listings for one keyword in one region.

    The spider owns a Selenium Chrome driver. ``parse`` types the keyword
    into the search box of the page shown in that driver, submits the
    search, and extracts one ``TztalentItem`` per result row from the
    driver's page source.
    """

    name = 'lagoupro'

    def __init__(self, table_name, keyword, site, webhook, *args, **kwargs):
        """Start the browser and build the region-specific start URL.

        Args:
            table_name: Stored on the spider as ``self.table_name``.
            keyword: Search keyword typed into the site's search box.
            site: Region name in Chinese characters; converted to pinyin
                to form the URL path segment.
            webhook: Stored on the spider as ``self.webhook_url``.
        """
        super().__init__(*args, **kwargs)

        # Hide the usual automation markers so the site is less likely to
        # detect Selenium.
        options = webdriver.ChromeOptions()
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        self.driver = webdriver.Chrome(options=options)
        self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                })
            """
        })

        self.keyword = keyword
        self.webhook_url = webhook
        self.table_name = table_name

        # Chinese -> pinyin. Join every syllable so that region names of any
        # length work, not only two-character ones.
        pinyin = lazy_pinyin(site)
        print(pinyin)
        self.site = "".join(pinyin)
        print(self.site)

        # Build the region URL.
        self.start_urls = [f"https://www.lagou.com/{self.site}-zhaopin/"]

    def parse(self, response):
        """Run the keyword search in the browser and yield the result rows.

        Rows that lack any of the required fields are skipped, so every
        yielded item carries ``title``, ``company_name``, ``company_url``
        and ``site``.

        Yields:
            TztalentItem: one freshly created item per complete result row.
        """
        # NOTE(review): nothing in this spider calls self.driver.get();
        # presumably a downloader middleware loads the page in the shared
        # driver before parse runs - confirm.
        # find_element("id", ...) is used because the find_element_by_*
        # helpers were removed in Selenium 4.3.
        self.driver.find_element("id", "keyword").send_keys(self.keyword)

        # Hover over the submit button first, then click it.
        submit = self.driver.find_element("id", "submit")
        ActionChains(self.driver).move_to_element(submit).perform()
        time.sleep(2)
        ActionChains(self.driver).move_to_element(submit).click(submit).perform()
        time.sleep(2)

        # Parse the page as rendered by Selenium, not the Scrapy response.
        html = etree.HTML(self.driver.page_source)
        # Guard in case the parser produced no root element.
        rows = [] if html is None else html.xpath("//ul[@class='item_con_list']/li")
        if not rows:
            print('没有数据')
            return

        for row in rows:
            title = self._first(row, ".//h3/text()")
            company_name = self._first(row, ".//div[@class='company_name']/a/text()")
            company_url = self._first(row, ".//div[@class='company_name']/a/@href")
            location = self._first(row, ".//span[@class='add']/em//text()")
            if None in (title, company_name, company_url, location):
                # Incomplete row: skip it instead of aborting the whole page.
                continue

            # A new item per row; re-yielding one shared item would let later
            # rows overwrite items that were already handed to the pipeline.
            item = TztalentItem()
            item['title'] = title
            item['company_name'] = company_name
            item['company_url'] = company_url
            item['site'] = location
            yield item

    @staticmethod
    def _first(node, expr):
        """Return the first XPath result of ``expr`` on ``node``, or None."""
        found = node.xpath(expr)
        return found[0] if found else None

    def closed(self, reason):
        """Scrapy's spider-closed shortcut hook; releases the browser."""
        self.spider_close(self)

    def spider_close(self, spider):
        """Quit the driver and close all associated windows (idempotent)."""
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.quit()
            self.driver = None