Python Selenium multithreaded crawler
The xxx site's anti-crawling is quite good: go the normal route and the slightest problem in your request headers gets you detected. With Selenium, though, you can grab the rendered page source directly, and that became the plan.
I crawled the Python job postings.
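Selenium itself can still leak automation hints (for example, navigator.webdriver is true in a stock ChromeDriver session). Besides the ChromeOptions flags used below, a CDP call can mask that property; this is a sketch of the idea, not something from the original post:

from selenium import webdriver

option = webdriver.ChromeOptions()
option.add_experimental_option('useAutomationExtension', False)
option.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(options=option)
# Overwrite navigator.webdriver before any page script runs
driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
    'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
})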
class LaGou(object):
    # Anti-detection settings for Chrome
    option = webdriver.ChromeOptions()
    option.add_experimental_option('useAutomationExtension', False)
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver_path = r'E:/chromedriver/chromedriver.exe'

    # Initializer
    def __init__(self, url):
        self.driver = webdriver.Chrome(executable_path=LaGou.driver_path, options=LaGou.option)
        self.url = url
        self.__host = 'localhost'
        self.__port = 3306
        self.__user = 'root'
        self.__password = 'xxxxx'
        self.__db = 'xxxx'
        self.__charset = 'utf8'

    # Connect to MySQL
    def connect(self):
        self.__conn = pymysql.connect(
            host=self.__host,
            port=self.__port,
            user=self.__user,
            password=self.__password,
            db=self.__db,
            charset=self.__charset
        )
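Hard-coded credentials are fine for a demo, but pulling them from environment variables keeps passwords out of the source. A minimal sketch (the MYSQL_* variable names are my own convention, not from the original script):

import os

# Fall back to local defaults; fail fast if the password is unset
host = os.environ.get('MYSQL_HOST', 'localhost')
port = int(os.environ.get('MYSQL_PORT', '3306'))
user = os.environ.get('MYSQL_USER', 'root')
password = os.environ['MYSQL_PASSWORD']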
First, request the listing page with Selenium and parse the detail-page URLs out of it.
    # Request the listing page
    def get_html(self):
        page_list = []
        self.driver.get(self.url)
        time.sleep(random.randint(1, 3))
        for page in range(1, 2):  # raise the upper bound to crawl more pages
            # Click the next-page button via JS to dodge click interception
            a = self.driver.find_element_by_xpath('//*[@id="s_position_list"]/div[2]/div/span[6]')
            self.driver.execute_script('arguments[0].click();', a)
            time.sleep(random.randint(1, 3))
            page_text = self.driver.page_source
            page_list.append(page_text)
        return page_list

    # Parse the detail-page urls out of a listing page
    def parse_detail_url(self, html):
        ul_list = etree.HTML(html)
        ul = ul_list.xpath('//*[@id="s_position_list"]/ul')
        for li in ul:
            # one <ul> per page, so return its hrefs straight away
            detail_url = li.xpath('./li/div[1]/div[1]/div[1]/a/@href')
            return detail_url
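The random sleeps work, but an explicit wait is a more reliable way to make sure the pager button is actually present before clicking. A sketch with Selenium's WebDriverWait, reusing the same XPath (driver here stands in for self.driver):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Poll for up to 10 seconds until the next-page button appears
wait = WebDriverWait(driver, 10)
next_btn = wait.until(EC.presence_of_element_located(
    (By.XPATH, '//*[@id="s_position_list"]/div[2]/div/span[6]')))
driver.execute_script('arguments[0].click();', next_btn)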
Once the detail-page URLs are parsed out, use Selenium again to request each detail page and pull the actual data from it.
    # Request a detail page
    def get_detail_data(self, url):
        detail_data_list = []
        self.driver.get(url)
        time.sleep(random.randint(1, 3))
        detail_data = self.driver.page_source
        detail_data_list.append(detail_data)
        return detail_data_list

    # Parse the fields out of a detail page
    def parse_detail_data(self, html):
        detail_data = etree.HTML(html)
        div_list = detail_data.xpath('//*[@id="__next"]/div[2]')
        for all_data in div_list:
            # Position name
            position = all_data.xpath('./div[1]/div/div[1]/div[1]/h1/span/span/span[1]/text()')[0]
            # Salary
            pay = all_data.xpath('./div[1]/div/div[1]/div[1]/h1/span/span/span[2]/text()')[0]
            # Experience
            experience = all_data.xpath('./div[1]/div/div[1]/dd/h3/span[2]/text()')[0]
            # Education
            education = all_data.xpath('./div[1]/div/div[1]/dd/h3/span[3]/text()')[0].replace('/', '')
            # Job category
            job_title = all_data.xpath('./div[1]/div/div[1]/dd/h3/div/span[2]/text()')[0]
            # Job description
            job_description = all_data.xpath('./div[2]/div[1]/dl[1]/dd[2]/div/text()')
            t_job_description = ''.join(job_description).replace('\n', '')
            self.save_mysql(position, pay, experience, education, job_title, t_job_description)
Since xpath() returns a list, we just grab element [0]. Bear in mind this raises an IndexError whenever a selector matches nothing, so a small fallback helper (sketched below) is safer.
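A minimal helper for that, not part of the original script:

def first(result, default=''):
    # xpath() always returns a list; hand back its first item,
    # or a default when the selector matched nothing
    return result[0] if result else default

# usage inside parse_detail_data, e.g.:
# position = first(all_data.xpath('./div[1]/div/div[1]/div[1]/h1/span/span/span[1]/text()'))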
Save the scraped data to MySQL.
    # Persist one record to MySQL
    def save_mysql(self, position, pay, experience, education, job_title, t_job_description):
        self.connect()
        cursor = self.__conn.cursor()
        # Keep the SQL and its parameters separate so pymysql handles the escaping
        sql = ('insert into python_lagou(position, pay, experience, education, job_title, t_job_description) '
               'values (%s, %s, %s, %s, %s, %s)')
        try:
            cursor.execute(sql, (position, pay, experience, education, job_title, t_job_description))
            self.__conn.commit()
        except Exception as e:
            print('Error:', e)
            self.__conn.rollback()
        finally:
            cursor.close()
            self.__conn.close()
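The insert assumes a python_lagou table already exists. The post never shows the schema, so here is one plausible DDL, run once through pymysql (the column types are assumptions):

import pymysql

# Assumed schema matching the six columns the crawler inserts
ddl = '''
create table if not exists python_lagou (
    id int primary key auto_increment,
    position varchar(100),
    pay varchar(50),
    experience varchar(50),
    education varchar(50),
    job_title varchar(100),
    t_job_description text
) default charset = utf8
'''

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='xxxxx', db='xxxx', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute(ddl)
conn.commit()
conn.close()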
Finally, run the main method and spin up the threads.
    # main method
    def main(self):
        html_list = self.get_html()
        for html in html_list:
            for url in self.parse_detail_url(html):
                for data in self.get_detail_data(url):
                    self.parse_detail_data(data)
        self.driver.quit()  # release the browser when this worker finishes


if __name__ == '__main__':
    # Start the threads: each worker gets its own LaGou instance,
    # i.e. its own Chrome browser and its own MySQL connection
    t_list = []
    for i in range(10):
        run = LaGou('https://www.xxx.com')
        t_list.append(run)
    thread_list = []
    for i in t_list:
        t = Thread(target=i.main, args=())
        thread_list.append(t)
    for t in thread_list:
        t.start()
    for t in thread_list:
        t.join()
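Note that all ten workers above start from the same URL, so they crawl the same pages ten times over. Splitting distinct page URLs across a pool avoids the duplication; a sketch with concurrent.futures (the ?pn= pagination parameter is a placeholder, not the site's real scheme):

from concurrent.futures import ThreadPoolExecutor

def crawl(url):
    # One browser per worker, same as the Thread version
    spider = LaGou(url)
    spider.main()

if __name__ == '__main__':
    # Hypothetical per-page urls; substitute the site's real pagination
    urls = ['https://www.xxx.com/?pn=%d' % n for n in range(1, 11)]
    with ThreadPoolExecutor(max_workers=5) as pool:
        pool.map(crawl, urls)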
The complete code:
import time
import random
import pymysql
from lxml import etree
from threading import Thread
from selenium import webdriver


class LaGou(object):
    # Anti-detection settings for Chrome
    option = webdriver.ChromeOptions()
    option.add_experimental_option('useAutomationExtension', False)
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver_path = r'E:/chromedriver/chromedriver.exe'

    # Initializer
    def __init__(self, url):
        self.driver = webdriver.Chrome(executable_path=LaGou.driver_path, options=LaGou.option)
        self.url = url
        self.__host = 'localhost'
        self.__port = 3306
        self.__user = 'root'
        self.__password = 'xxx'
        self.__db = 'xxx'
        self.__charset = 'utf8'

    # Connect to MySQL
    def connect(self):
        self.__conn = pymysql.connect(
            host=self.__host,
            port=self.__port,
            user=self.__user,
            password=self.__password,
            db=self.__db,
            charset=self.__charset
        )

    # Request the listing page
    def get_html(self):
        page_list = []
        self.driver.get(self.url)
        time.sleep(random.randint(1, 3))
        for page in range(1, 2):  # raise the upper bound to crawl more pages
            a = self.driver.find_element_by_xpath('//*[@id="s_position_list"]/div[2]/div/span[6]')
            self.driver.execute_script('arguments[0].click();', a)
            time.sleep(random.randint(1, 3))
            page_text = self.driver.page_source
            page_list.append(page_text)
        return page_list

    # Parse the detail-page urls out of a listing page
    def parse_detail_url(self, html):
        ul_list = etree.HTML(html)
        ul = ul_list.xpath('//*[@id="s_position_list"]/ul')
        for li in ul:
            detail_url = li.xpath('./li/div[1]/div[1]/div[1]/a/@href')
            return detail_url

    # Request a detail page
    def get_detail_data(self, url):
        detail_data_list = []
        self.driver.get(url)
        time.sleep(random.randint(1, 3))
        detail_data = self.driver.page_source
        detail_data_list.append(detail_data)
        return detail_data_list

    # Parse the fields out of a detail page
    def parse_detail_data(self, html):
        detail_data = etree.HTML(html)
        div_list = detail_data.xpath('//*[@id="__next"]/div[2]')
        for all_data in div_list:
            # Position name
            position = all_data.xpath('./div[1]/div/div[1]/div[1]/h1/span/span/span[1]/text()')[0]
            # Salary
            pay = all_data.xpath('./div[1]/div/div[1]/div[1]/h1/span/span/span[2]/text()')[0]
            # Experience
            experience = all_data.xpath('./div[1]/div/div[1]/dd/h3/span[2]/text()')[0]
            # Education
            education = all_data.xpath('./div[1]/div/div[1]/dd/h3/span[3]/text()')[0].replace('/', '')
            # Job category
            job_title = all_data.xpath('./div[1]/div/div[1]/dd/h3/div/span[2]/text()')[0]
            # Job description
            job_description = all_data.xpath('./div[2]/div[1]/dl[1]/dd[2]/div/text()')
            t_job_description = ''.join(job_description).replace('\n', '')
            self.save_mysql(position, pay, experience, education, job_title, t_job_description)

    # Persist one record to MySQL
    def save_mysql(self, position, pay, experience, education, job_title, t_job_description):
        self.connect()
        cursor = self.__conn.cursor()
        sql = ('insert into python_lagou(position, pay, experience, education, job_title, t_job_description) '
               'values (%s, %s, %s, %s, %s, %s)')
        try:
            cursor.execute(sql, (position, pay, experience, education, job_title, t_job_description))
            self.__conn.commit()
        except Exception as e:
            print('Error:', e)
            self.__conn.rollback()
        finally:
            cursor.close()
            self.__conn.close()

    # main method
    def main(self):
        html_list = self.get_html()
        for html in html_list:
            for url in self.parse_detail_url(html):
                for data in self.get_detail_data(url):
                    self.parse_detail_data(data)
        self.driver.quit()


if __name__ == '__main__':
    # Start the threads: each worker gets its own browser and DB connection
    t_list = []
    for i in range(10):
        run = LaGou('https://www.xxx.com')
        t_list.append(run)
    thread_list = []
    for i in t_list:
        t = Thread(target=i.main, args=())
        thread_list.append(t)
    for t in thread_list:
        t.start()
    for t in thread_list:
        t.join()
And that's it.