A code snippet from a practice exercise, kept here as a memo:
# encoding=utf8
from __future__ import unicode_literals
import urllib
import urllib2
import re
import os
import time


def get_html(url):
    try:
        url = url.encode('utf-8')
        page = urllib2.urlopen(url)
        return page.read()
    except Exception as ex:
        print 'get url_%s html error, ex=%s' % (url, ex)
        return ''


def get_images(url):
    html = get_html(url)
    # the class/src attribute order varies between posts, so try both
    # patterns; [^>]+ keeps each match inside a single <img> tag
    pattern = r'<img[^>]+class="BDE_Image"[^>]+src="([^"]+\.jpg)"'
    img_list = re.findall(pattern, html)
    pattern = r'<img[^>]+src="([^"]+\.jpg)"[^>]+class="BDE_Image"'
    img_list.extend(re.findall(pattern, html))
    # deduplicate while preserving the original order
    img_list = sorted(set(img_list), key=img_list.index)
    return img_list


# total number of pages in the given thread
def get_page_count(url):
    html = get_html(url)
    m = re.search(r'"total_page":(\d+)', html)
    return int(m.group(1)) if m else 0


# extract the thread URLs listed on a forum page
def get_page_urls(html):
    id_list = re.findall(r'<a href="/p/(\d+)"', html)
    return ['https://tieba.baidu.com/p/%s' % x for x in id_list]


# download every image of the given thread, page by page
def download_page_images(page_url):
    html = get_html(page_url)
    title = re.search(r'(?<=<title>)(.*)(?=</title>)', html).group(1)
    print title
    page_no = re.search(r'(\d+)', page_url).group(0)
    page_count = get_page_count(page_url)
    print 'page: %s, page_count: %d' % (page_no, page_count)
    for page_idx in range(1, page_count + 1):
        url = page_url + '?pn=%d' % page_idx
        img_list = get_images(url)
        if not img_list:
            continue
        print 'page index: %d, image_count: %d' % (page_idx, len(img_list))
        if not os.path.exists('images'):
            os.mkdir('images')
        img_folder = os.path.join('images', page_no)
        if not os.path.exists(img_folder):
            os.mkdir(img_folder)
        for idx, img_url in enumerate(img_list):
            img_filename = os.path.join(img_folder, '%d_%d.jpg' % (page_idx, idx))
            if not os.path.exists(img_filename):
                urllib.urlretrieve(img_url, img_filename)


def main():
    # maximum number of forum list pages to crawl
    max_pagecount = 30
    # the list page takes the thread offset in the pn parameter
    # (50 threads per page)
    base_url = 'https://tieba.baidu.com/f?kw=图片&ie=utf-8&pn=%s'
    # crawl the list pages one by one
    for idx in range(max_pagecount):
        url = base_url % (idx * 50)
        html = get_html(url)
        for page_url in get_page_urls(html):
            try:
                download_page_images(page_url)
                # throttle requests a little between threads
                time.sleep(2)
            except Exception:
                continue


if __name__ == '__main__':
    main()
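The snippet is Python 2 only (print statements, urllib2). As a minimal, untested sketch, assuming Tieba pages are still served as UTF-8 and the regexes stay the same, the two network helpers would look like this on Python 3 using only the standard library:

# encoding=utf8
# Minimal Python 3 sketch of the two network helpers above.
# Assumption: the target pages are UTF-8 encoded.
import os
import urllib.request


def get_html(url):
    try:
        # urlopen's response works as a context manager in Python 3
        with urllib.request.urlopen(url) as page:
            return page.read().decode('utf-8')
    except Exception as ex:
        print('get url_%s html error, ex=%s' % (url, ex))
        return ''


def save_image(img_url, img_filename):
    # save_image is a hypothetical helper wrapping the inline
    # urlretrieve call used in download_page_images above
    if not os.path.exists(img_filename):
        urllib.request.urlretrieve(img_url, img_filename)

With these two helpers in place, the rest of the script mostly needs its print statements turned into print() calls; urllib.request.urlretrieve is still available in Python 3 as a legacy interface.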