Downloading QQ Zone (QQ空间) blog posts with Python
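The post consists of two Python 2 scripts: a module that walks an account's paginated blog list and saves each article's body to a text file, and a short driver script that imports it.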

# -*-coding:utf-8-*-
# Author: 华亮
#

import urllib2
import re
from HTMLParser import HTMLParser


# HTMLParser subclass that extracts the blog list (title, url pairs) from a QQ Zone blog-list page
class QQBlogList(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.in_key_div = False
        self.in_ul = False
        self.in_li = False
        self.in_a = False
        self.blogList = []   # collected (title, url) pairs
        self.lasturl = ''
    
    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        # The article list lives inside <div class="bloglist">
        if tag == 'div' and 'class' in attrs and attrs['class'] == 'bloglist':
            self.in_key_div = True
        elif self.in_key_div:
            if tag == 'ul':
                self.in_ul = True
            elif self.in_ul and tag == 'li':
                self.in_li = True
            elif self.in_li and tag == 'a' and 'href' in attrs:
                # Each <li><a href="..."> is one post; remember its URL
                self.in_a = True
                self.lasturl = attrs['href']
    
    def handle_data(self, data):
        # Text inside the <a> tag is the post title
        if self.in_a:
            self.blogList.append((data, self.lasturl))
    
    def handle_endtag(self, tag):
        if self.in_key_div and tag == 'div':
            self.in_key_div = False
        elif self.in_ul and tag == 'ul':
            self.in_ul = False
        elif self.in_li and tag == 'li':
            self.in_li = False
        elif self.in_a and tag == 'a':
            self.in_a = False
            
         
            
class QQ:
    '''
    QQ
        Author: 华亮
        Description: automatically downloads the blog posts of a QQ Zone account
    '''

    @staticmethod
    def DownloadBlog(qq, filename=None):
        print 'Start'
        # Page 0 of the account's blog list; later pages are followed automatically
        blogurl = 'http://qz.qq.com/%s/bloglist?page=0' % qq
        QQ.__Download(blogurl, filename)
        print 'End'
    
    @staticmethod
    def __Download(starturl, filename):
        url = starturl

        # Keep QQ Zone's session cookies between requests
        cookieHandler = urllib2.HTTPCookieProcessor()
        opener = urllib2.build_opener(cookieHandler)
        
        # Walk every page of the blog list and collect article titles and URLs
        qq = QQBlogList()
        nextpagePattern = re.compile(r'<a href="(.*?)" title="下一页" class="bt_next"><span>下一页</span></a>')
        while True:
            req = urllib2.Request(url)
            result = opener.open(req)
            text = result.read()

            qq.feed(text)

            # Follow the "下一页" (next page) link until there is none
            nextpage = nextpagePattern.search(text)
            if nextpage:
                url = nextpage.group(1)
            else:
                break
        qq.close()
          
        if not filename:
            filename = "blog.txt"
        output = open(filename, 'w')

        # Download every article and extract the body from <div class="entry_content">
        blogContentPattern = re.compile(r'<div class="entry_content">(.*?)</div>', re.S)
        for title, url in qq.blogList:
            print 'Downloading', title
            req = urllib2.Request(url)
            result = opener.open(req)
            output.write('\n' + title + '\n')
            ret = blogContentPattern.search(result.read())
            if ret:
                output.write(ret.group(1).replace('<p>', '\n'))
        output.close()
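The driver script below expects the module above to be saved as QQ.py: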
# -*-coding:utf-8-*-
# Author: 华亮
#

from QQ import QQ

if __name__ == '__main__':
    # First argument: the QQ number; second argument: the output file name
    QQ.DownloadBlog('1241224798', 'blog.txt')
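For reference, a minimal sketch of running the list parser on its own, assuming a blog-list page has already been saved locally; the file name bloglist.html is hypothetical:

# -*-coding:utf-8-*-
# Standalone check of QQBlogList (sketch; 'bloglist.html' is a hypothetical
# locally saved copy of a QQ Zone blog-list page)
from QQ import QQBlogList

parser = QQBlogList()
parser.feed(open('bloglist.html').read())
parser.close()
for title, url in parser.blogList:
    print title, url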
