# Python crawler for jandan.net (煎蛋网) image board
import urllib.request
import os
from urllib import error
import re
import base64


def url_open(url):
    """Fetch *url* with a browser User-Agent and return the raw response bytes.

    Returns b"" on an HTTP error so callers can still safely call
    ``.decode('utf-8')`` on the result.  (The original returned the str ""
    here, which made callers crash with AttributeError on ``.decode``.)
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0')
    try:
        response = urllib.request.urlopen(req)
    except error.HTTPError:
        print("有异常的url为:" + url)
        return b""  # bytes, not str, so the .decode() in callers still works
    else:
        return response.read()


def get_page(url):
    """Return the current comment-page number scraped from *url*, as a string.

    Returns None when *url* is empty (original behavior preserved).
    """
    if url != "":
        html = url_open(url).decode('utf-8')
        # The number sits 23 characters after 'current-comment-page' and
        # runs up to the next ']' in the page markup.
        a = html.find('current-comment-page') + 23
        b = html.find(']', a)
        return html[a:b]


def find_image(url):
    """Return a list of absolute image URLs found on the page at *url*.

    Image addresses on the page are base64-encoded inside
    ``<span class="img-hash">...</span>`` elements and are protocol-relative.
    """
    image_addrs = []
    html = url_open(url).decode('utf-8')
    # Non-greedy (.+?) so that several img-hash spans sharing one source
    # line each produce their own match instead of one merged blob.
    reg = r'class="img-hash">(.+?)</span>'
    src_img = re.compile(reg)
    for encoded in src_img.findall(html):
        # 图片地址是用base64加密 -- decode base64, then prepend the scheme.
        image_addrs.append("http:" + base64.b64decode(encoded).decode('utf-8'))
    return image_addrs


def save_image(image_addrs):
    """Download every URL in *image_addrs* into the local ``picture/`` directory."""
    # Original crashed with FileNotFoundError when picture/ did not exist.
    os.makedirs("picture", exist_ok=True)
    for each in image_addrs:
        # Fetch BEFORE opening the file so a failed download does not
        # leave an empty file behind.
        img = url_open(each)
        if not img:
            continue  # HTTP error already reported by url_open; skip it
        filename = each.split('/')[-1]
        with open("picture/" + filename, 'wb') as f:
            f.write(img)


def download_girls(pages=20):
    """Crawl *pages* comment pages, newest first, saving every image found.

    Starts from the current page number reported by the site and walks
    backwards one page per iteration.
    """
    url = 'http://jandan.net/ooxx/'
    page_num = int(get_page(url))
    for _ in range(pages):
        page_num -= 1
        page_url = url + 'page-' + str(page_num) + '#comments'
        image_addrs = find_image(page_url)
        save_image(image_addrs)


if __name__ == '__main__':
    download_girls()
    print("执行结束")