Python: A Multithreaded Qiushibaike (糗事百科) Crawler Example
import requests
from lxml import etree
import json
import threading
import queue


# fetcher thread: downloads the raw HTML of one page at a time
class GetHtml(threading.Thread):
    def __init__(self, page_queue):
        threading.Thread.__init__(self)
        self.page_queue = page_queue

    def run(self):
        self.do_get_html()

    def do_get_html(self):
        headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
        global data_queue
        while True:
            if self.page_queue.empty():
                break
            page = self.page_queue.get()
            url = "https://www.qiushibaike.com/8hr/page/%s/" % str(page)
            timeout = 5  # retry counter: up to 5 attempts per page
            while timeout > 0:
                try:
                    _response = requests.get(url, headers=headers)
                    html = _response.content
                    # hand the raw HTML to the parse queue
                    data_queue.put(html)
                    break
                except requests.RequestException as e:  # covers requests' ConnectionError, timeouts, etc.
                    print(e)
                    timeout -= 1
            if timeout == 0:  # every retry failed
                print("time out, url: " + url)


# parser thread: extracts items from the downloaded HTML
class ParseHtml(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        self.do_parse_data()

    def do_parse_data(self):
        global total, f
        while True:
            if data_queue.empty():
                break
            try:
                html = data_queue.get()
                text = etree.HTML(html)
                list_node = text.xpath("//li[contains(@id, 'qiushi_tag_')]")
                for node in list_node:
                    username = node.xpath(".//a[@class='recmd-user']/img/@alt")[0]
                    user_img = node.xpath(".//a[@class='recmd-user']/img/@src")[0]
                    zan_num = node.xpath(".//div[@class='recmd-num']/span[position()=1]/text()")[0]
                    ping_num = node.xpath(".//div[@class='recmd-num']/span[position()=4]/text()")
                    content = node.xpath(".//a[@class='recmd-content']/text()")
                    if len(ping_num) > 0:
                        ping_num = ping_num[0]
                    else:
                        ping_num = 0
                    if len(content) > 0:
                        content = content[0]
                    else:
                        content = ""
                    result = {
                        "username": username,
                        "imgUrl": user_img,
                        "vote": zan_num,
                        "comments": ping_num,
                        "content": content
                    }
                    # guard the shared counter and file handle against concurrent writes
                    with lock:
                        total += 1
                        f.write((json.dumps(result, ensure_ascii=False) + "\n").encode("utf-8"))
            except Exception as e:  # e.g. IndexError when an expected node is missing
                print(e)


def main():
    # enqueue the page numbers to crawl
    for i in range(1, 21):
        page_queue.put(i)
    # start the fetch threads (most of the 100 exit immediately once
    # the 20-item page queue drains)
    get_html_thread = []
    for i in range(100):
        get_html = GetHtml(page_queue)
        get_html.start()
        get_html_thread.append(get_html)
    # wait for all fetch threads to finish
    for thread in get_html_thread:
        thread.join()
    # start the parse threads
    parse_html_thread = []
    for i in range(100):
        parse_html = ParseHtml()
        parse_html.start()
        parse_html_thread.append(parse_html)
    # wait for all parse threads to finish
    for thread in parse_html_thread:
        thread.join()
    # close the output file
    f.close()
    print("Crawl finished, %s items in total" % total)


if __name__ == '__main__':
    data_queue = queue.Queue()
    page_queue = queue.Queue()
    lock = threading.Lock()  # protects total and the shared file handle
    f = open("./qiushibaike.json", "wb")  # output file, one JSON object per line
    total = 0
    main()
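A note on the design: all fetch threads are joined before the first parse thread starts, so data_queue is fully populated and the empty() check cannot race against producers that are still filling it. If you wanted fetching and parsing to overlap, that check becomes unreliable; one common alternative (a minimal sketch of my own, not part of the original code; SENTINEL and num_parsers are illustrative names) is a blocking get() plus one sentinel per consumer:

    SENTINEL = None  # marker telling a consumer to shut down

    def parse_worker(data_queue):
        while True:
            html = data_queue.get()   # blocks instead of polling empty()
            if html is SENTINEL:      # fetcher side says there is no more work
                break
            # ... parse html exactly as in ParseHtml.do_parse_data ...

    # after joining the fetch threads, wake every parser up exactly once:
    # for _ in range(num_parsers):
    #     data_queue.put(SENTINEL)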
Data:
{"username": "夲少姓〖劉〗", "imgUrl": "//pic.qiushibaike.com/system/avtnew/1187/11878716/thumb/20190520091055.jpg?imageView2/1/w/50/h/50", "vote": "873", "comments": "66", "content": "马中赤兔人中啥了?"}
{"username": "一枕清霜゛", "imgUrl": "//pic.qiushibaike.com/system/avtnew/3371/33712263/thumb/20190511210156.jpg?imageView2/1/w/50/h/50", "vote": "1224", "comments": "7", "content": "一个段子手,一个神回复"}
{"username": "窝里斗窝里", "imgUrl": "//pic.qiushibaike.com/system/avtnew/1427/14275616/thumb/20181228173532.jpg?imageView2/1/w/50/h/50", "vote": "418", "comments": "7", "content": "鹰科猛禽走路的姿势看上去总是屌屌的!!"}
{"username": "2丫头还是个宝宝", "imgUrl": "//pic.qiushibaike.com/system/avtnew/2219/22190863/thumb/20190131225946.jpg?imageView2/1/w/50/h/50", "vote": "801", "comments": "26", "content": "都说孩子玩沙子有助于孩子的智力发育,所以家里买了一车沙子放院子给逗逗玩。逗逗拿了一个铲子和一个望远镜玩具,当着我的面把望远镜在埋沙子里。拉着我的手:妈妈,我在沙"}
{"username": "★像风一样一样★", "imgUrl": "//pic.qiushibaike.com/system/avtnew/2716/27163432/thumb/20180306191622.JPEG?imageView2/1/w/50/h/50", "vote": "274", "comments": "8", "content": "去朋友家看到的,特殊的插排,一个插排才多少钱啊?"}
{"username": "无语滴滴", "imgUrl": "//pic.qiushibaike.com/system/avtnew/3782/37821797/thumb/20190430173233.jpg?imageView2/1/w/50/h/50", "vote": "603", "comments": "15", "content": "朋友和他女友吵架闹分手,我们都去劝。他女友抹抹眼泪看着窗外说了一句话:“要不是还有几个快递在路上,我真想死了算了。”"}
{"username": "愚人愚之不如愚己", "imgUrl": "//pic.qiushibaike.com/system/avtnew/1927/19270659/thumb/20160618154530.jpg?imageView2/1/w/50/h/50", "vote": "2055", "comments": "122", "content": "老司机你发了多大的誓?"}
(remaining records omitted...)
Next steps:
1. Save the results to a database (see the first sketch below).
2. Save to Redis first, then sync from Redis to the database (see the second sketch below).
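For step 1, a minimal sketch that loads the line-delimited JSON file produced above into SQLite; the table name "jokes", the column names, and the choice of sqlite3 are illustrative assumptions, not part of the original:

    import json
    import sqlite3

    conn = sqlite3.connect("qiushibaike.db")
    conn.execute(
        "CREATE TABLE IF NOT EXISTS jokes ("
        "username TEXT, img_url TEXT, vote TEXT, comments TEXT, content TEXT)"
    )
    # one JSON object per line, exactly as the crawler wrote them
    with open("./qiushibaike.json", "rb") as src:
        for line in src:
            item = json.loads(line.decode("utf-8"))
            conn.execute(
                "INSERT INTO jokes VALUES (?, ?, ?, ?, ?)",
                (item["username"], item["imgUrl"], item["vote"],
                 item["comments"], item["content"]),
            )
    conn.commit()
    conn.close()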
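For step 2, a minimal sketch using the third-party redis package, assuming a Redis server on localhost; the list key "qiushi:items" is an illustrative name, and the jokes table comes from the previous sketch. The idea is that each parse thread pushes items into a Redis list instead of a file, and a separate job later drains the list into the database:

    import json
    import sqlite3

    import redis  # third-party: pip install redis

    r = redis.Redis(host="localhost", port=6379, db=0)

    def save_to_redis(result):
        # what each parse thread would call instead of f.write(...)
        r.rpush("qiushi:items", json.dumps(result, ensure_ascii=False))

    def sync_to_db():
        # drain the Redis list into the jokes table
        conn = sqlite3.connect("qiushibaike.db")
        while True:
            raw = r.lpop("qiushi:items")
            if raw is None:  # list is empty, nothing left to sync
                break
            item = json.loads(raw)
            conn.execute(
                "INSERT INTO jokes VALUES (?, ?, ?, ?, ?)",
                (item["username"], item["imgUrl"], item["vote"],
                 item["comments"], item["content"]),
            )
        conn.commit()
        conn.close()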