A Flash of Joy
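A small asynchronous crawler built on Tornado's coroutines and queues. The AsySpider base class runs a pool of worker coroutines against a shared URL queue, and MySpider specializes it to download student photos from the SXU portal by enumerating student IDs.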

import re
from datetime import timedelta
from tornado import httpclient, gen, ioloop, queues

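# academy code -> head count of the 2014 intake; main() builds one photo URL
# per person by appending a three-digit serial to '2014' + academy code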
peoples = {'011': 71, '012': 66, '013': 54, '014': 50, '015': 66, '041': 61,
           '042': 103, '044': 31, '061': 32, '062': 41, '063': 33, '073': 93, '074': 50, '077': 108, '081': 55,
           '083': 55, '084': 92, '102': 56, '105': 29, '106': 27,
           '107': 25, '108': 25, '141': 50, '143': 66, '144': 68, '161': 52, '162': 50, '163': 50, '164': 52, '167': 50,
           '181': 133, '201': 166, '202': 10, '203': 8, '204': 99, '211': 18,
           '212': 50, '213': 24, '214': 19, '215': 25, '216': 24, '217': 24, '221': 67, '222': 52, '224': 67,
           '261': 67, '271': 8, '274': 31, '291': 82, '292': 62, '296': 8, '312': 104, '341': 52, '316': 52, '331': 47,
           '332': 56, '333': 72, '335': 57, '351': 36, '352': 50, '371': 120, '372': 50,
           '373': 56}


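# AsySpider drives `concurrency` worker coroutines off a shared tornado queue:
# each worker pops a URL, fetches it asynchronously, hands the result to
# handle_response(), and tops the queue back up from self.urls, so at most
# `concurrency` requests are in flight at any time.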
class AsySpider(object):
    def __init__(self, urls, concurrency=10, results=None, **kwargs):
        urls.reverse()  # pop() takes from the end, so reverse to crawl in the original order
        self.urls = urls
        self.concurrency = concurrency
        self._q = queues.Queue()
        self._fetching = set()
        self._fetched = set()
        self.results = [] if results is None else results

    def fetch(self, url, **kwargs):
        # raise_error=False makes fetch() resolve to a response even for
        # non-200 codes, so handle_response() can inspect response.code
        return httpclient.AsyncHTTPClient().fetch(url, raise_error=False, **kwargs)

    def handle_html(self, url, html):
        """Handle one fetched page; override in a subclass."""
        print(url)

    def handle_response(self, url, response):
        """Override this method in a subclass if necessary."""
        if isinstance(response, Exception):  # get_page() hands back the exception on failure
            return

        if response.code == 200:
            self.handle_html(url, response.body)

        elif response.code == 599:  # tornado's catch-all for connection/timeout errors; retry
            self._fetching.remove(url)
            self._q.put(url)

    @gen.coroutine
    def get_page(self, url):
        """Fetch one URL; returns the response, or the exception on failure."""
        try:
            response = yield self.fetch(url)
            # print('######fetched %s' % url)
        except Exception as e:
            print('Exception: %s %s' % (e, url))
            raise gen.Return(e)
        raise gen.Return(response)

    @gen.coroutine
    def _run(self):
        @gen.coroutine
        def fetch_url():
            current_url = yield self._q.get()
            try:
                if current_url in self._fetching:
                    return

                # print('fetching****** %s' % current_url)
                self._fetching.add(current_url)

                response = yield self.get_page(current_url)
                self.handle_response(current_url, response)  # handle the response

                self._fetched.add(current_url)

                for i in range(self.concurrency):
                    if self.urls:
                        yield self._q.put(self.urls.pop())

            finally:
                self._q.task_done()

        @gen.coroutine
        def worker():
            while True:
                yield fetch_url()

        self._q.put(self.urls.pop())  # add first url

        # Start workers, then wait for the work queue to be empty.
        for _ in range(self.concurrency):
            worker()

        yield self._q.join(timeout=timedelta(seconds=300000))  # wait for the queue to drain
        try:
            assert self._fetching == self._fetched
        except AssertionError:
            # report URLs that were queued but never finished, and vice versa
            print(self._fetching - self._fetched)
            print(self._fetched - self._fetching)

    def run(self):
        io_loop = ioloop.IOLoop.current()
        io_loop.run_sync(self._run)


class MySpider(AsySpider):
    def fetch(self, url, **kwargs):
        """Override the parent class's fetch() to send an authenticated session."""
        # Note: this cookie is session-bound; replace it with one from your own
        # logged-in session before running.
        cookies_str = 'JSESSIONID=0000n4jBi_dKg91XbtHHQHDeeDL:1b4e17j2v; iPlanetDire' \
                      'ctoryPro=AQIC5wM2LY4Sfcxu%' \
                      '2FWPIJWGHttZPiXafd%2B1gowyEoxTmyiY%3D%40AAJTSQACMDE%3D%23'
        headers = {
            'User-Agent': 'mozilla/5.0 (compatible; baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
            'cookie': cookies_str
        }
        return super(MySpider, self).fetch(url, headers=headers, **kwargs)

    def handle_html(self, url, html):
        # the student id is everything after ownerId= in the URL
        filename = re.search(r'userPhoto&ownerId=(.*)$', url).group(1)
        # Note: change dir to the directory where you want the photos saved, e.g. C:/picture/
        dir = '/home/innovation/文档/pic/'
        with open(dir + filename + '.jpg', 'wb') as file:
            file.write(html)  # the with block closes the file automatically


def main():
    urls = []
    url_pic = 'http://myportal.sxu.edu.cn/attachmentDownload.portal?notUseCache=true&type=userPhoto&ownerId='
    for academy in peoples:
        # student id = '2014' + academy code + zero-padded three-digit serial
        for i in range(1, peoples[academy] + 1):
            urls.append(url_pic + '2014' + academy + '%03d' % i)
    s = MySpider(urls)
    s.run()


if __name__ == '__main__':
    main()
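
The base class is easy to repurpose. As a minimal sketch (the LengthSpider name and the example.com URLs are placeholders, not part of the original script), a subclass can collect results into self.results instead of writing files:

class LengthSpider(AsySpider):
    """Records (url, body size in bytes) for every page fetched."""
    def handle_html(self, url, html):
        self.results.append((url, len(html)))

spider = LengthSpider(['http://example.com/?page=%d' % i for i in range(1, 21)],
                      concurrency=5)
spider.run()
print(spider.results)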

 

posted @ 2016-12-23 16:55  INnoVation-V2