# imgur.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
 
from __future__ import with_statement
import sys
import os
import urllib2
from urlparse import urlparse
import random
import re
import gevent
from gevent import monkey
monkey.patch_all()
 
def get(url):
  """Open *url* and return the response object.

  A fresh opener is built and installed as urllib2's global default on
  every call. If the request fails with HTTPError or URLError, the error
  is reported on stderr and the process exits via sys.exit(-1).

  NOTE(review): when this runs inside a gevent greenlet the SystemExit
  presumably ends the whole run, not just that greenlet -- confirm this
  is the desired policy for a single failed download.
  """
  opener = urllib2.build_opener()
  # OpenerDirector takes its default headers from `addheaders`; the former
  # `add_headers` attribute was a typo and was silently ignored.
  opener.addheaders = [('User-Agent', 'Mozilla/5.0 (compatible; imgur.py)')]
  urllib2.install_opener(opener)
  try:
    # Network errors are raised by open(), not by constructing a Request,
    # so the call has to live inside the try block to be handled at all.
    return opener.open(urllib2.Request(url))
  except (urllib2.HTTPError, urllib2.URLError):
    # sys.exc_info() avoids version-specific exception-binding syntax.
    err = sys.exc_info()[1]
    sys.stderr.write('Failed to fetch %s: %s\n' % (url, err))
    sys.exit(-1)
 
def is_url(url):
  """Return True when *url* points at imgur.com or one of its subdomains.

  The host is compared exactly (or by '.imgur.com' suffix) rather than by
  substring, so look-alike hosts such as 'notimgur.com' or
  'imgur.com.example.org' are rejected. URLs without a host yield False.
  """
  host = urlparse(url).hostname
  if host is None:
    return False
  return host == 'imgur.com' or host.endswith('.imgur.com')
 
def fetch(url):
  """Download the no-script page of the album referenced by *url*.

  The album key is taken from the third '/'-separated piece of the URL
  path (index 2, i.e. 'KEY' in '/a/KEY').

  Returns a (page_body, key) tuple.
  """
  path_parts = urlparse(url).path.split('/')
  key = path_parts[2]
  noscript_url = 'https://imgur.com/a/%s/noscript' % key
  page = get(noscript_url)
  return page.read(), key
 
def get_or_create_folder(key, folder=None):
  """Return the download directory, creating it when it does not exist.

  *folder* is used when given; otherwise the directory is named after
  *key*. Missing intermediate directories are created as well.
  """
  target = key if folder is None else folder
  if os.path.exists(target):
    return target
  os.makedirs(target)
  return target
  
def fetch_images(foldername, images):
  """Download a single image into *foldername*.

  *images* is one match tuple: index 0 is the image URL and index 1 the
  file name to store it under. Prints a 'Done' line once the file has
  been written.
  """
  # Sleep for 0 or 0.0001 seconds before starting the download.
  gevent.sleep(random.randint(0, 1) * 0.0001)
  image_url = images[0]
  destination = os.path.join(foldername, images[1])
  # Download first: opening the file before the request succeeded used to
  # leave an empty file behind whenever the download failed.
  response = get(image_url)
  try:
    payload = response.read()
  finally:
    response.close()
  with open(destination, 'wb') as img:
    img.write(payload)
  # Single parenthesised argument: prints the same text on Python 2.
  print('Done:\t%s' % image_url)
 
def save(url, folder=None):
  """Collect the image links of an album and prepare its target folder.

  Fetches the album page for *url*, extracts every matching <img> tag and
  makes sure the destination directory exists (*folder* if given,
  otherwise one named after the album key).

  Returns (foldername, images), where each entry of *images* is a
  3-tuple of the pattern's groups: (image URL, file name, extension).
  """
  html, key = fetch(url)
  image_pattern = re.compile(r'<img src="(http\:\/\/i\.imgur\.com\/([a-zA-Z0-9]{5}\.(jpg|png|gif)))"')
  matches = image_pattern.findall(html)
  return get_or_create_folder(key, folder), matches
 
 
if __name__ == '__main__':
  # Command line: <album-url> [folder]
  if len(sys.argv) < 2:
    # Without this guard a missing URL surfaced as a bare IndexError.
    sys.stderr.write('Usage: %s <album-url> [folder]\n' % sys.argv[0])
    sys.exit(1)
  url = sys.argv[1]
  # The target folder is optional; save() falls back to the album key.
  folder = sys.argv[2] if len(sys.argv) > 2 else None
  foldername, images = save(url, folder=folder)
  # One greenlet per image, then wait for all of them to finish.
  threads = [gevent.spawn(fetch_images, foldername, image) for image in images]
  gevent.joinall(threads)

 

# Source: blog post by Bai Yunhui, posted 2015-06-22 01:14 (268 views, 0 comments).