Crawling job listings from Guangxi Talent Network (gxrc.com) with requests (multithreaded) and storing them in MongoDB
requests_thread_gxrc_com.py
#!/usr/bin/env python3
# coding=utf-8
# Version:python3.6.1
# File:gxrc_com.py
# Author:LGSP_Harold
import requests
from multiprocessing import Queue
from lxml import etree
import threading
from handle_mongo import MongoClient


# Thread class that handles page numbers
class CrawlPage(threading.Thread):
    # Override the parent __init__ (needed because the subclass CrawlPage takes extra parameters);
    # thread_name, page_queue and data_queue are used by the subclass and are not passed to the parent
    def __init__(self, thread_name, page_queue, data_queue, *args, **kwargs):
        # super finds the parent class (threading.Thread) and calls its __init__()
        super(CrawlPage, self).__init__(*args, **kwargs)
        # Name of this thread
        self.thread_name = thread_name
        # Queue of page numbers
        self.page_queue = page_queue
        # Queue of downloaded page text
        self.data_queue = data_queue
        # Default request headers
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'DNT': '1',
            'Host': 's.gxrc.com',
            'Pragma': 'no-cache',
            'Referer': 'https://www.gxrc.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        }

    def run(self) -> None:
        print('Page-number thread started: %s' % self.thread_name)
        while not page_flag:
            # When calling put or get on a Queue, the block argument matters.
            # block defaults to True, so it must be set to False here:
            # a non-blocking call on an empty (or full) queue raises an exception (Empty / Full).
            try:
                # Take a page number out of the queue with get().
                # With block=True (the default) get() waits forever on an empty queue, or until timeout;
                # with block=False it raises immediately, which is caught below.
                page = self.page_queue.get(block=False)
                page_url = 'https://s.gxrc.com/sJob?schType=1&page=' + str(page)
                print('Built URL: %s' % page_url)
                # Optional proxy configuration
                # proxy = {
                #     'http': 'http://xxxxxx:xxxxxx@baidu.com:9999',
                #     'https': 'http://xxxxxx:xxxxxx@baidu.com:9999'
                # }
                # Request the built URL with requests
                # res = requests.get(url=page_url, headers=self.headers, proxies=proxy)
                res = requests.get(url=page_url, headers=self.headers)
                # Set the page encoding
                res.encoding = 'utf-8'
                # Put the returned page text into the data queue
                self.data_queue.put(res.text)
            except Exception:
                pass


# Thread class that parses the downloaded page text
class CrawlHtml(threading.Thread):
    # The text produced by the page-number threads is read from data_queue
    def __init__(self, thread_name, data_queue, lock, db, collections, *args, **kwargs):
        super(CrawlHtml, self).__init__(*args, **kwargs)
        self.thread_name = thread_name
        self.data_queue = data_queue
        self.lock = lock
        self.db = db
        self.collections = collections

    # Parse one page of HTML
    def parse(self, text):
        # Build the HTML document
        html = etree.HTML(text)
        items = html.xpath('//div[@class="rlOne"]/ul[@class="posDetailUL clearfix"]')
        data_list = []
        for item in items:
            data = {}
            data['job_name'] = item.xpath('.//a[@class="posName"]/text()')[0]
            data['company_name'] = item.xpath('.//a[@class="entName"]/text()')[0]
            try:
                data['company_address'] = item.xpath('.//li[@class="w4"]/text()')[0]
            except Exception:
                data['company_address'] = 'Unknown'
            try:
                data['money'] = item.xpath('.//li[@class="w3"]/text()')[0]
            except Exception:
                data['money'] = 'Negotiable'
            data['date'] = item.xpath('.//li[@class="w5"]/text()')[0]
            data_list.append(data)
        return data_list

    def run(self) -> None:
        print('Parser thread started: %s' % self.thread_name)
        while not data_flag:
            try:
                # Take one page of text from the queue
                text = self.data_queue.get(block=False)
                # Parse it
                result = self.parse(text)
                # print(result)
                # Take the lock while writing to MongoDB
                with self.lock:
                    insert_data = MongoClient(self.db, self.collections)
                    insert_data.insert_db(result)
            except Exception:
                pass


# Two global flags that tell the worker threads when to stop
page_flag = False
data_flag = False


def main():
    # Two queues: one for page numbers, one for page text
    page_queue = Queue()
    data_queue = Queue()
    # A lock shared by the parser threads
    lock = threading.Lock()
    # Put the page numbers into the page queue
    for page in range(1, 504):
        # put() stores a page number in page_queue
        page_queue.put(page)
    # qsize() returns the current length of the queue
    print('Total pages queued: %s' % page_queue.qsize())
    # Start three page-number threads
    crawl_page_list = ['PageThread-1', 'PageThread-2', 'PageThread-3']
    page_thread_list = []
    for thread_name_page in crawl_page_list:
        thread_page = CrawlPage(thread_name_page, page_queue, data_queue)
        # Start the thread
        thread_page.start()
        page_thread_list.append(thread_page)
    # Start three threads that parse the page text
    parse_list = ['ParseThread-1', 'ParseThread-2', 'ParseThread-3']
    parse_thread_list = []
    db = 'db_gxrc'
    collections = 'collections_gxrc'
    for thread_name_parse in parse_list:
        thread_parse = CrawlHtml(thread_name_parse, data_queue, lock, db, collections)
        thread_parse.start()
        parse_thread_list.append(thread_parse)

    # Thread shutdown
    # Page-number threads: the while loop below exits once page_queue is empty
    global page_flag
    while not page_queue.empty():
        pass
    page_flag = True
    # Wait for the page-number threads to finish
    for thread_page_join in page_thread_list:
        thread_page_join.join()
        print(thread_page_join.thread_name, 'finished')

    # Parser threads
    global data_flag
    while not data_queue.empty():
        pass
    data_flag = True
    for thread_data_join in parse_thread_list:
        thread_data_join.join()
        print(thread_data_join.thread_name, 'finished')


if __name__ == '__main__':
    # Entry point
    main()
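The script above shuts the workers down by busy-waiting on empty() and flipping two global flags. A more common pattern is to let each worker block on get() and exit when it receives a None sentinel. Below is a minimal sketch of that idea only, not part of the original script; the worker function, NUM_WORKERS constant and queue name are illustrative placeholders.

# Minimal sentinel-based shutdown sketch (illustrative names, not the original script)
import queue
import threading

NUM_WORKERS = 3
task_queue = queue.Queue()


def worker():
    while True:
        item = task_queue.get()
        if item is None:            # sentinel: no more work, exit the thread
            task_queue.task_done()
            break
        # ... process item here (e.g. fetch and parse one page) ...
        task_queue.task_done()


threads = [threading.Thread(target=worker) for _ in range(NUM_WORKERS)]
for t in threads:
    t.start()

# Queue the real work first
for page in range(1, 504):
    task_queue.put(page)

# One sentinel per worker tells every thread to exit after the real work is drained
for _ in range(NUM_WORKERS):
    task_queue.put(None)

for t in threads:
    t.join()

Because each thread blocks on get(), no CPU is burned while waiting, and no global flags or busy-wait loops are needed.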
handle_mongo.py
#!/usr/bin/env python3
# coding=utf-8
# Version:python3.6.1
# File:handle_mongo.py
# Author:LGSP_Harold
import pymongo


class MongoClient:
    def __init__(self, db, collections, *args, **kwargs):
        super(MongoClient, self).__init__(*args, **kwargs)
        # Connect to the local MongoDB instance (admin/admin credentials)
        client = pymongo.MongoClient('mongodb://admin:admin@127.0.0.1:27017')
        self.db = client[db]
        self.collections = self.db[collections]

    def insert_db(self, item):
        # Insert a list of job records; skip empty lists, which insert_many() rejects
        if item:
            self.collections.insert_many(item)
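To confirm that the records actually landed in MongoDB, a quick count query is enough. This is a minimal verification sketch, assuming the same connection string, database name db_gxrc and collection name collections_gxrc used above.

# Minimal verification sketch (assumes the MongoDB credentials and names used above)
import pymongo

client = pymongo.MongoClient('mongodb://admin:admin@127.0.0.1:27017')
collection = client['db_gxrc']['collections_gxrc']

# Total number of stored job records
print('documents stored:', collection.count_documents({}))

# Show one sample document
print(collection.find_one())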
I only know a little, just a little....