练手代码,聊作备忘:
# encoding: utf-8
"""Crawl the post listing pages of bh.sb and save each post's images locally.

The script runs unchanged on Python 2 and Python 3.  Page content is kept as
raw bytes at the network boundary (``get_html``) and decoded to text once,
right before it is matched against the regular expressions.
"""
import os
import re
import time
import urllib
from threading import Thread

try:
    import urllib2  # Python 2
    _urlretrieve = urllib.urlretrieve
except ImportError:  # Python 3: the same API lives in urllib.request
    import urllib.request as urllib2
    _urlretrieve = urllib2.urlretrieve


def _to_text(value):
    """Return *value* as text, decoding UTF-8 bytes; other values pass through."""
    if isinstance(value, bytes):
        return value.decode('utf-8', 'replace')
    return value


def _native_str(value):
    """Return *value* as the interpreter's native ``str`` type.

    urllib expects byte strings on Python 2 and text strings on Python 3.
    """
    if isinstance(value, str):
        return value
    if isinstance(value, bytes):  # Python 3 bytes -> text
        return value.decode('utf-8')
    return value.encode('utf-8')  # Python 2 unicode -> bytes


def _log(message):
    """Print *message* without ever raising on an unencodable character.

    A console that cannot encode a file name would otherwise raise
    UnicodeEncodeError and kill the worker thread that is printing it.
    """
    try:
        print(message)
    except UnicodeEncodeError:
        print(message.encode('ascii', 'backslashreplace').decode('ascii'))


# Matches one post entry on a listing page: (post url, post title).
_POST_RE = re.compile(_to_text(
    r'(?s)<h2><a\s+href="(?P<url>[^"]+).*?>\[博海拾贝\d+\](?P<title>[^<]+).*?'))

# Matches a caption paragraph followed by an image: (caption, image url).
_IMAGE_RE = re.compile(_to_text(
    r'(?s)<p>(?P<title>[^<]+).*?<p><img\s+src="(?P<image>[^"]+)"'))

# Characters that cannot appear in a file name on Windows (a superset of the
# POSIX restrictions), plus ASCII control characters.
_UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


class BhsbSpider(object):
    """Walk the listing pages and download the images of every post found.

    Each post gets its own sub-folder below ``_root`` (created in the current
    working directory); each image is stored under its caption text.
    """

    _url = r'https://bh.sb/post/category/main/'
    _page_count = 0
    _page_index = 0
    _root = _to_text('博海拾贝')
    # Only used to number posts in the log output.
    # NOTE(review): assumes 20 posts per listing page - confirm against site.
    _posts_per_page = 20
    # Seconds before a stalled HTTP request is abandoned.
    _timeout = 30

    def __init__(self, url, page_count=0):
        """Remember the listing URL and page limit, and create the root folder.

        Args:
            url: base URL of the listing; page N is fetched from ``<url>/page/N``.
            page_count: number of listing pages to crawl.
        """
        self._url = url
        self._page_count = page_count
        self._ensure_dir(self._root)

    @staticmethod
    def _ensure_dir(path):
        """Create *path* if missing; tolerate another thread creating it first."""
        if os.path.isdir(path):
            return
        try:
            os.makedirs(path)
        except OSError:
            if not os.path.isdir(path):
                raise

    @staticmethod
    def _safe_name(name):
        """Turn arbitrary page text into a string usable as a file name."""
        cleaned = _UNSAFE_CHARS.sub(u'_', _to_text(name).strip())
        # Windows rejects names that end in a dot or a space.
        cleaned = cleaned.rstrip(u'. ')
        return cleaned or u'_'

    def _page_url(self, index):
        """Return the URL of listing page *index*, derived from the base URL."""
        return '%s/page/%d' % (self._url.rstrip('/'), index)

    def spider(self):
        """Crawl listing pages 1.._page_count, one after another."""
        while self._page_index < self._page_count:
            self._page_index += 1
            self.do_spider(self._page_url(self._page_index))

    def do_spider(self, url):
        """Fetch one listing page and start a download thread per post."""
        html = self.get_html(url)
        if not html:
            # get_html already reported the failure; nothing to parse.
            return
        first = (self._page_index - 1) * self._posts_per_page + 1
        for offset, (post_url, title) in enumerate(
                _POST_RE.findall(_to_text(html))):
            _log(u'%d. url: %s, title: %s' % (first + offset, post_url, title))
            # One thread per post so that slow posts do not block the listing.
            Thread(target=self.download, args=(post_url, title)).start()
            time.sleep(2)  # throttle requests to the site

    def download(self, url, title):
        """Fetch one post and save each captioned image into the post folder.

        Args:
            url: URL of the post page.
            title: post title; used (sanitised) as the folder name.
        """
        folder = os.path.join(self._root, self._safe_name(title))
        self._ensure_dir(folder)
        html = self.get_html(url)
        if not html:
            return
        taken = set()
        for i, (caption, img_url) in enumerate(
                _IMAGE_RE.findall(_to_text(html))):
            # Ignore any query string or fragment when picking the extension.
            path_part = img_url.split('?', 1)[0].split('#', 1)[0]
            ext = self._safe_name(os.path.splitext(path_part)[1]) \
                if os.path.splitext(path_part)[1] else u''
            base = self._safe_name(caption)
            img_filename = os.path.join(folder, base + ext)
            if img_filename in taken:
                # Two images share a caption: keep both instead of skipping.
                img_filename = os.path.join(
                    folder, u'%s_%d%s' % (base, i + 1, ext))
            taken.add(img_filename)
            _log(u'download %s ...' % img_filename)
            if not os.path.exists(img_filename):
                Thread(target=self._save_image,
                       args=(img_url, img_filename)).start()
                time.sleep(1)  # throttle only when a download was started

    def _save_image(self, img_url, img_filename):
        """Download one image; on failure report it and drop the partial file."""
        try:
            _urlretrieve(_native_str(img_url), img_filename)
        except Exception as ex:  # thread boundary: report, never propagate
            _log(u'download %s failed, ex=%r' % (img_filename, ex))
            # A truncated file would make later runs believe the image exists.
            if os.path.exists(img_filename):
                try:
                    os.remove(img_filename)
                except OSError:
                    pass

    def get_html(self, url):
        """Return the raw response body of *url*, or None if the fetch failed.

        Failures are printed rather than raised so that one bad page does not
        abort the whole crawl.
        """
        try:
            url = _native_str(url)
            req = urllib2.Request(url)
            req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.9.5.1000 Chrome/39.0.2146.0 Safari/537.36')
            page = urllib2.urlopen(req, timeout=self._timeout)
            try:
                return page.read()
            finally:
                page.close()
        except Exception as ex:
            _log('get url_%s html error, ex=%s' % (url, ex))
            return None


if __name__ == '__main__':
    url = r'https://bh.sb/post/category/main/'
    bs = BhsbSpider(url, 10)
    bs.spider()
未及仔细测试，其间有图片丢失的情况。结果如下图所示：