Pediy (看雪) Excellent-Post Spider
Pediy's built-in search never felt very usable, so I put together a spider.
It currently supports four features:
1. Crawl all thread links in a given sub-forum and save them to a file
2. Automatically sort out the excellent (精华) posts and save them to a separate file (a detection sketch follows this list)
3. Save links whose titles contain a specified keyword to their own file (applied to all links)
4. Save links whose titles contain a specified keyword to their own file (applied to excellent-post links only)
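For feature 2, the spider does not look at post content at all: it checks the icon images in each thread row and treats the row as an excellent post if the icon file name is jhinfo.gif, good_2.gif, or good_3.gif (the same filter_list used in the full script below). Here is a minimal sketch of that check; the sample_row HTML is a simplified, made-up thread row, not the forum's real markup:

from bs4 import BeautifulSoup

# Icon file names that mark a thread as excellent/recommended
# (same as filter_list in the full script below).
GOOD_ICONS = ['jhinfo.gif', 'good_3.gif', 'good_2.gif']

# Hypothetical, simplified thread row -- not the forum's real HTML.
sample_row = '''
<table><tr>
  <td><img src="images/icons/jhinfo.gif" /></td>
  <td id="td_threadtitle_123">
    <a id="thread_title_123" href="showthread.php?t=123">Some title</a>
  </td>
</tr></table>
'''

soup = BeautifulSoup(sample_row, 'html5lib')
td = soup.find(id='td_threadtitle_123')    # the <td> holding the title link
imgs = td.parent.find_all('img')           # the icons live in the parent <tr>
is_good = any(img.get('src').split('/')[-1] in GOOD_ICONS for img in imgs)
print is_good                              # prints: True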
GitHub download:
https://github.com/bingghost/pediy_spider
The following dependency libraries are required:
bs4
requests
html5lib
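They can be installed with pip, assuming pip is available (beautifulsoup4 is the PyPI package that provides bs4):
pip install beautifulsoup4 requests html5lib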
The code is as follows (it is written for Python 2):
#!/usr/bin/env python
# encoding: utf-8
"""
@author: bingghost
@copyright: 2016 bingghost. All rights reserved.
@contact:
@date: 2016-12-1
@description: Pediy (看雪) forum spider
"""
import re
import time
import requests
import argparse
from bs4 import BeautifulSoup

import sys
reload(sys)
sys.setdefaultencoding('utf8')


class PediySpider:
    def __init__(self, spider_url, specified_title):
        self._url = spider_url
        self.file_dict = {"all_title": "all_title.txt",
                          "good_title": "good_title.txt",
                          "filter_title": "filter_title.txt",
                          "filter_good_title": "filter_good_title.txt"}

        # icon file names that mark a good (excellent/recommended) title
        self.filter_list = ['jhinfo.gif', 'good_3.gif', 'good_2.gif']

        # keyword specified on the command line (None means no filtering)
        self.specified_title = specified_title

        self.page_count = self.get_page_count()
        pass

    def get_page_content(self, page_num):
        rep_data = requests.get(self._url + str(page_num))
        soup = BeautifulSoup(rep_data.content, 'html5lib')
        return soup

    def get_page_count(self):
        value = int(self.get_page_content(1).select(
            'div.pagenav td.vbmenu_control')[0].get_text().split(' ')[-2])
        return value

    def save_file(self, content, filename):
        print content
        with open(filename, 'a+') as f:
            f.write(content.encode('utf-8') + '\n')

    def is_good_title(self, item):
        # The item in threads_box is a <td> tag. When we need to find the
        # img tag, we must look in its parent tag, namely the <tr> tag.
        img_list = item.parent.find_all('img')
        for img in img_list:
            if img.get('src').split('/')[-1].lstrip() in self.filter_list:
                return True
        return False
        pass

    def is_specified_title(self, title_content):
        if self.specified_title is None:
            return False

        specified_title_encode = self.specified_title.encode('utf8')
        title_content_encode = title_content.encode('utf8')
        if specified_title_encode in title_content_encode:
            return True
            pass

        return False
        pass

    def is_good_specified_title(self, title_content):
        if self.specified_title is None:
            return False

        specified_title_encode = self.specified_title.encode('utf8')
        title_content_encode = title_content.encode('utf8')
        if specified_title_encode in title_content_encode:
            return True
            pass

        return False
        pass

    def check_content(self, threads_box):
        url_head = 'http://bbs.pediy.com/showthread.php?' + 't='
        for item in threads_box:
            title_box = item.find(id=re.compile('thread_title'))
            title = title_box.get_text()
            title_url = url_head + title_box.get('href').split('=')[-1]

            # now get the title and url
            self.save_file(title + ' ' + title_url, self.file_dict['all_title'])

            # excellent, good and attention title
            is_good_title = self.is_good_title(item)
            if is_good_title:
                # print single_thread_box
                self.save_file(title + ' ' + title_url, self.file_dict['good_title'])
                pass

            # specified title content
            if self.is_specified_title(title):
                self.save_file(title + ' ' + title_url, self.file_dict['filter_title'])
                pass

            # specified good title content
            is_good_title_filter = self.is_good_specified_title(title)
            if is_good_title and is_good_title_filter:
                self.save_file(title + ' ' + title_url, self.file_dict['filter_good_title'])
                pass
            pass

    def worker(self):
        for i in range(1, 100000):
            if i > self.page_count:
                break

            # get all title info in the current page
            threads_box = self.get_page_content(i).find_all(id=re.compile('td_threadtitle'))
            self.check_content(threads_box)
            time.sleep(3)

    def start_work(self):
        print "[-] start spider"
        self.worker()
        print "[-] spider okay"
        pass
    pass


def set_argument():
    # add description
    parser = argparse.ArgumentParser(
        description="A spider for the bbs of pediy's Android security forum, "
                    "also you can modify the url to spider other forums.")

    # add arguments (the three modes are mutually exclusive)
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-a', '--all', action='store_true', help='Get all titles')
    group.add_argument('-f', '--filter', type=str, default=None, help='filter title')
    group.add_argument('-gf', '--gfilter', type=str, default=None, help='filter good title')

    args = parser.parse_args()
    return args
    pass


def main():
    args = set_argument()

    spider_dict = {"android": "http://bbs.pediy.com/forumdisplay.php?f=161&order=desc&page=",
                   "ios": "http://bbs.pediy.com/forumdisplay.php?f=166&order=desc&page="}

    pediy_spider = None
    if args.all:
        pediy_spider = PediySpider(spider_dict['android'], None)
        pass

    if args.filter:
        pediy_spider = PediySpider(spider_dict['android'], args.filter)
        pass

    if args.gfilter:
        pediy_spider = PediySpider(spider_dict['android'], args.gfilter)
        pass

    pediy_spider.start_work()
    pass


if __name__ == '__main__':
    main()
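Example invocations, assuming the script is saved as pediy_spider.py (matching the repo name; adjust if your file name differs) and using "Hook" as a placeholder keyword:
python pediy_spider.py -a          # crawl every title into all_title.txt and good_title.txt
python pediy_spider.py -f Hook     # also write titles containing "Hook" to filter_title.txt
python pediy_spider.py -gf Hook    # also write excellent-post titles containing "Hook" to filter_good_title.txt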
Result: