Douban group crawler.....^_^

A crawler I wrote to grab images from Douban groups... you know the kind...

Python version used: 3.5.2

Adapted from gdp12315's version found online. To use it, just add the codes of the groups you want to crawl to url_list, as in the example below.
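For example (the group code 'example123' is made up, purely for illustration; the second value is the display/folder name used for the downloaded images), a third group would be added like this:

url_list = [
    ('tomorrow', '灵异豆瓣'),
    ('439803', '出差男女'),
    ('example123', 'Example Group'),
    ]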

It can be stopped at any time, and a lot of checks were added to prevent re-crawling and re-downloading; the downside is that updated topics are not detected (updates are rare... so they are simply ignored).
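The idea, as a minimal standalone sketch (the names load_seen and mark_seen are illustrative only, not part of the script; the script itself does this through the UrlInfo class below):

import os
import pickle

def load_seen(path):
    # load the set of topic codes processed by earlier runs, if any
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    return set()

def mark_seen(path, seen, topic_code):
    # record a topic as processed right away, so stopping the script loses almost no work
    seen.add(topic_code)
    with open(path, 'wb') as f:
        pickle.dump(seen, f)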

I suggest adding more entries to user_agents; it is an effective way to avoid 403 errors.
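For example, a few extra desktop UA strings could be appended to the user_agents list in BrowserBase.openurl (these particular strings are only examples; any valid User-Agent values will do):

user_agents += [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
]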

 

If you have any changes or good suggestions, please contact me: lzl_17948876@hotmail.com

 

# -*- coding: utf-8 -*-
# -----------------------------------------------
#   Program: Douban group image crawler
#   Version: 1.2.2
#   Language: Python 3.5.2
#   Author: 刘志林
#
#   Thanks: adapted from gdp12315's 1.0 version  http://blog.csdn.net/gdp12315_gu/article/details/47323613
#
#   2016-11-07
#       Changed how processed links are recorded: one info record per group
#   2016-11-08
#       Fixed the timestamp being recorded at the wrong point; it is now taken before processing starts
#   2016-11-28
#       Added a counter for the total number of loop passes
#       Incremental fetch: if the last scheduled page still contains unfetched topics, keep fetching the next page until a page contains only already-fetched topics
# -----------------------------------------------

    
import random
import socket, http.cookies, http.cookiejar
import urllib.request
import re
import os, sys
import datetime, time
import pickle

class UrlInfo(object):
    # Per-group crawl state: the processed topic codes plus the finish time of the last run,
    # pickled to a .info file so an interrupted run can resume without re-downloading.

    def __init__(self, a_filename):
        self.__filename = a_filename
        self.dic_topic = {}
        self.lastdt = ''

    def load(self):
        if os.path.exists(self.__filename):
            with open(self.__filename, 'rb') as f:
                tmp = pickle.load(f)
            self.__dict__.update(tmp)

    def save(self):
        with open(self.__filename, 'wb') as f:
            pickle.dump(self.__dict__, f)
        

class BrowserBase(object): 

    def __init__(self):
        socket.setdefaulttimeout(20)

    def speak(self, name, content):
        print('[%s] %s' % (name, content))

    def openurl(self,url):
        # a pool of user_agents to choose from, which helps avoid 403 responses
        user_agents = [
                    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
                    'Opera/9.25 (Windows NT 5.1; U; en)',
                    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
                    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
                    ]

        try:
            cookie_support= urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar())
            self.opener = urllib.request.build_opener(cookie_support,urllib.request.HTTPHandler)
            urllib.request.install_opener(self.opener)
            self.opener.addheaders = [
                ('Host','www.douban.com'),
                ('Connection', 'keep-alive'),
                ('Accept', '*/*'),
                ('User-Agent', random.choice(user_agents)),
                ('Referer','http://www.google.com'),
                ]
            
            res = self.opener.open(url)
            #print(res.read())
        except Exception as e:
            self.speak(str(e), url)
            raise
        else:
            return res
        finally:
            time.sleep(1)


if __name__ == '__main__':
    splider = BrowserBase()

# Groups to process; the first value is the group code -> https://www.douban.com/group/<group code>/discussion?start=
url_list = [
    ('tomorrow', '灵异豆瓣'),
    ('439803', '出差男女'),
    ]

# processed topics are recorded per group in a .info file under Pictures\ (see UrlInfo)

workpath = os.getcwd() + '\\'    # Windows-style path separators are used throughout

loopCount = 0

while True:
    for url_rec in url_list:
        print('\n-------- (L-%d) %s  %s: start crawling --------' % (loopCount + 1, datetime.datetime.now().strftime('%Y-%m-%d %X'), url_rec[1]))
        # create the download directory for this group
        filepath = '%sPictures\\%s\\'%(workpath, url_rec[1])
        if not os.path.exists(filepath):
            os.makedirs(filepath)

        url = 'https://www.douban.com/group/%s/discussion?start='%(url_rec[0])
        try:
            html_topic_list = splider.openurl(url).read().decode('utf-8')
        except Exception:
            continue

        # load this group's saved state
        info = UrlInfo('%sPictures\\%s.info'%(workpath, url_rec[1]))
        info.load()

        # time the last run finished; if empty, process every page
        if info.lastdt == '':
            print('first run, processing everything')
            dt_last = None
        else:
            print('last run finished at: %s' % (info.lastdt))
            dt_last = datetime.datetime.strptime(info.lastdt, '%Y-%m-%d %X')

        # the total page count is taken from the data-total-page="N" attribute in the listing HTML
        page_max = int(re.search(r'data-total-page="(\d+)"', html_topic_list).group(1))
        if dt_last is None:
            page_end = page_max
            num_end = (page_end - 1) * 25
        else:
            t2 = (datetime.datetime.now() - dt_last)
            num_end = t2.days * 24 * 6 + t2.seconds // 300  # assume roughly one new topic every 5 minutes
            page_end = num_end // 25 + 1

        # record the time this run started; it becomes info.lastdt once the group finishes
        _lastdt = datetime.datetime.now().strftime('%Y-%m-%d %X')

        num_begin = 0
        page_begin = 1
        while num_begin <= num_end:
            try:
                nFullTopicExists = True   # stays True only if every topic on this page has already been processed
                html_topic_list = splider.openurl(url+str(num_begin)).read().decode('utf-8')
                # collect the topic URLs on this page
                topic_list = re.compile(r'https://www.douban.com/group/topic/\d+/').findall(html_topic_list)
                topic_count = len(topic_list)
                print('%s page: %d/%d - %d'%(url_rec[1], page_begin, page_end, topic_count))

                for topic_url_index in range(topic_count):
                    topic_url = topic_list[topic_url_index]
                    #print('topic_url '+topic_url)
                    
                    # skip topics that have already been processed
                    topic_code = re.findall(r'\d+', topic_url)[0]
                    if topic_code in info.dic_topic:
                        print('#%d '%(topic_url_index + 1), end='')
                        continue
                    else:
                        nFullTopicExists = False
                        print('%d '%(topic_url_index + 1), end='')
                    
                    try:
                        html_topic = splider.openurl(topic_url).read().decode('utf-8')
                    except Exception:
                        continue

                    # mark this topic as processed and persist the state immediately
                    info.dic_topic[topic_code] = ''
                    info.save()
                        
                    # extract the image download URLs from the topic page
                    img_list = re.compile(r'https://img\d.doubanio.com/view/group_topic/large/public/p\d+.jpg').findall(html_topic)
                    
                    # download and save each image
                    for img_url in img_list:
                        #print('img_url: '+img_url)
                        filename = '%s%s-%s.jpg' % (filepath, topic_code, re.findall(r'p\d+', img_url)[0])  # filepath already ends with a backslash
                        if not os.path.exists(filename):
                            try:
                                #print(filename)
                                download_img = urllib.request.urlretrieve(img_url, filename)
                            except Exception as e:
                                print(e)
                                continue
                            finally:
                                time.sleep(2)
                     #waittime = random.randint(10,15)
                     #print('wait %d'%waittime)
                     #time.sleep(waittime)
                num_begin = num_begin + 25
                # incremental mode: if the last scheduled page still contained unseen topics, schedule one more page
                if (dt_last is not None) and (num_begin > num_end) and (not nFullTopicExists):
                    num_end = num_end + 25
            except Exception as e:
                print(e)
                continue
            finally:
                page_begin = page_begin + 1
            print()

        info.lastdt = _lastdt
        info.save()
        print('-------- %s  %s: crawl finished --------\n' % (datetime.datetime.now().strftime('%Y-%m-%d %X'), url_rec[1]))
    loopCount = loopCount + 1

 
