A flash of Joy
import re
from datetime import timedelta

from tornado import httpclient, gen, ioloop, queues

# Academy code -> number of IDs to enumerate for that academy
# (used in main() to build the ownerId part of each photo URL).
peoples = {
    '011': 71, '012': 66, '013': 54, '014': 50, '015': 66, '041': 61,
    '042': 103, '044': 31, '061': 32, '062': 41, '063': 33, '073': 93,
    '074': 50, '077': 108, '081': 55, '083': 55, '084': 92, '102': 56,
    '105': 29, '106': 27, '107': 25, '108': 25, '141': 50, '143': 66,
    '144': 68, '161': 52, '162': 50, '163': 50, '164': 52, '167': 50,
    '181': 133, '201': 166, '202': 10, '203': 8, '204': 99, '211': 18,
    '212': 50, '213': 24, '214': 19, '215': 25, '216': 24, '217': 24,
    '221': 67, '222': 52, '224': 67, '261': 67, '271': 8, '274': 31,
    '291': 82, '292': 62, '296': 8, '312': 104, '341': 52, '316': 52,
    '331': 47, '332': 56, '333': 72, '335': 57, '351': 36, '352': 50,
    '371': 120, '372': 50, '373': 56
}


class AsySpider(object):
    """A simple asynchronous spider built on Tornado coroutines and queues."""

    def __init__(self, urls, concurrency=10, results=None, **kwargs):
        urls.reverse()
        self.urls = urls
        self.concurrency = concurrency
        self._q = queues.Queue()
        self._fetching = set()
        self._fetched = set()
        self.results = [] if results is None else results

    def fetch(self, url, **kwargs):
        fetch = getattr(httpclient.AsyncHTTPClient(), 'fetch')
        return fetch(url, raise_error=False, **kwargs)

    def handle_html(self, url, html):
        """Handle one HTML page."""
        print(url)

    def handle_response(self, url, response):
        """Inherit and override this method if necessary."""
        if response.code == 200:
            self.handle_html(url, response.body)
        elif response.code == 599:  # timeout, put the url back to retry
            self._fetching.remove(url)
            self._q.put(url)

    @gen.coroutine
    def get_page(self, url):
        try:
            response = yield self.fetch(url)
            # print('######fetched %s' % url)
        except Exception as e:
            print('Exception: %s %s' % (e, url))
            raise gen.Return(e)
        raise gen.Return(response)

    @gen.coroutine
    def _run(self):
        @gen.coroutine
        def fetch_url():
            current_url = yield self._q.get()
            try:
                if current_url in self._fetching:
                    return
                # print('fetching****** %s' % current_url)
                self._fetching.add(current_url)
                response = yield self.get_page(current_url)
                self.handle_response(current_url, response)  # handle response
                self._fetched.add(current_url)
                # Refill the queue from the pending URL list.
                for i in range(self.concurrency):
                    if self.urls:
                        yield self._q.put(self.urls.pop())
            finally:
                self._q.task_done()

        @gen.coroutine
        def worker():
            while True:
                yield fetch_url()

        self._q.put(self.urls.pop())  # add the first url

        # Start workers, then wait for the work queue to be empty.
        for _ in range(self.concurrency):
            worker()
        yield self._q.join(timeout=timedelta(seconds=300000))
        try:
            assert self._fetching == self._fetched
        except AssertionError:
            print(self._fetching - self._fetched)
            print(self._fetched - self._fetching)

    def run(self):
        io_loop = ioloop.IOLoop.current()
        io_loop.run_sync(self._run)


class MySpider(AsySpider):
    def fetch(self, url, **kwargs):
        """Override the parent class's fetch method to send a cookie and User-Agent."""
        cookies_str = 'JSESSIONID=0000n4jBi_dKg91XbtHHQHDeeDL:1b4e17j2v; iPlanetDire' \
                      'ctoryPro=AQIC5wM2LY4Sfcxu%' \
                      '2FWPIJWGHttZPiXafd%2B1gowyEoxTmyiY%3D%40AAJTSQACMDE%3D%23'
        headers = {
            'User-Agent': 'mozilla/5.0 (compatible; baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
            'cookie': cookies_str
        }
        return super(MySpider, self).fetch(
            url, headers=headers
        )

    def handle_html(self, url, html):
        # Extract the ownerId from the URL and use it as the file name.
        url += 'qwertyu'
        pattern = re.compile('userPhoto&ownerId=(.*)qwertyu')
        filename = re.findall(pattern, url)[0]
        # Note: change dir to the directory where you want to store the photos,
        # e.g. C:/picture/
        dir = '/home/innovation/文档/pic/'
        with open(dir + filename + '.jpg', 'wb') as file:
            file.write(html)


def main():
    urls = []
    url_pic = 'http://myportal.sxu.edu.cn/attachmentDownload.portal?notUseCache=true&type=userPhoto&ownerId='
    for academy in peoples:
        # ownerId = year ('2014') + academy code + zero-padded 3-digit index
        for i in range(1, peoples[academy] + 1):
            urls.append(url_pic + '2014' + academy + '%03d' % i)
    s = MySpider(urls)
    s.run()


if __name__ == '__main__':
    main()
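For reuse beyond this photo crawl, here is a minimal sketch of subclassing AsySpider for a different task. The class name PageSaver and the example.com URLs are hypothetical, not part of the original script; the only required step is passing a URL list and overriding handle_html.

# Minimal reuse sketch (hypothetical names and URLs): pass a URL list and
# override handle_html; run() drives the Tornado IOLoop until the queue drains.
class PageSaver(AsySpider):
    def handle_html(self, url, html):
        # html is the raw response body (bytes); save one file per page
        name = url.rstrip('/').split('/')[-1] or 'index'
        with open(name + '.html', 'wb') as f:
            f.write(html)

PageSaver(['http://example.com/', 'http://example.com/about'], concurrency=2).run()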