Batch-download Qzone (QQ空间) blog posts

The script scrapes the mobile page (z.qq.com). It still hangs from time to time, and the fix is simply to start over…
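One way to soften the hang (a minimal sketch, not part of the original script, assuming the stall is a socket read that never returns): set a global socket timeout before any URL is opened, so a stuck request raises socket.timeout instead of blocking forever, and the run can be retried.

import socket

# assumption: the hang is a blocked socket read; a default timeout makes
# urllib2 raise socket.timeout instead of waiting indefinitely
socket.setdefaulttimeout(30)  # seconds, adjust to taste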

# -*- coding: utf-8 -*-
# Author: fwindpeak
#
import re
import urllib2

# closing tags for the output HTML file (the opening tags are built
# per-QQ inside DownloadBlog so the title can include the QQ number)
htmend = '''</body>
</html>
'''

def cn(s):
    # decode the UTF-8 page text and re-encode as GBK so titles
    # print readably on a Chinese Windows console
    return s.decode("utf-8").encode("gbk")
    
def DownloadBlog(qq, filename=None):
    blogList = []
    print 'Start'
    if filename is None:
        filename = "%s.htm" % qq
    # note: the hard-coded sid looks session-specific and may need refreshing
    blogurl = 'http://z.qq.com/blog/blog_list.jsp?sid=AefvkfGVCCDx2PfXiaquF7pf&B_UID=%s' % qq

    url = blogurl
    cookieHandler = urllib2.HTTPCookieProcessor()
    opener = urllib2.build_opener(cookieHandler)
    opener.addheaders = [('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
                         ('Referer', 'http://z.qq.com/'),
                         ('User-Agent', 'Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) '
                                        'AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1')]
    
    # collect (url, title) for every post, following the "下页" (next page) links
    pagenum = 0
    while True:
        req = urllib2.Request(url)
        result = opener.open(req)
        text = result.read()
        detailPattern = re.compile(r'href="(.+?blog_detail\.jsp\?.+?)">(.+?)</a>')
        blogList.extend(detailPattern.findall(text))
        nextpagePattern = re.compile(r'href="(.*?)">下页</a>')
        nextpage = nextpagePattern.search(text)
        if nextpage:
            pagenum += 1
            url = nextpage.group(1).replace('&amp;', '&')
            print url
            print "page %d" % pagenum
        else:
            break
        
    out = open(filename, 'w')
    htmstart = '''<html>
<head>
<meta charset="utf-8"/>
<title>qzone_blog_%s</title>
</head>
<body>
''' % qq
    out.write(htmstart)
    # download each post body and write it into the output file;
    # the pattern captures everything up to the "相关搜索" (related searches) footer
    blogContentPattern = re.compile(r'<p class="tabs-1">(.+<br/>.+)<p class="tabs-1"><br/>相关搜索', re.S)
    try:
        for url, title in blogList:
            url = url.replace('&amp;', '&')
            url = url + "&isReadAllPage=true"  # request the whole post on one page
            print 'Downloading', cn(title)
            text = opener.open(url).read()
            ret = blogContentPattern.search(text)
            if ret:
                out.write(ret.group(1))
                out.write("<br/><hr/><br/>")
    except Exception, e:
        print e
    finally:
        opener.close()
        out.write(htmend)
        out.close()
        
if __name__ == '__main__':
    print "QZone blog download"
    qq = raw_input("QQ:")
    DownloadBlog(qq)
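To run it (Python 2 only, since it uses urllib2, print statements, and raw_input): save the script as, say, qzone_blog.py, run python qzone_blog.py, and enter the QQ number at the prompt. The script prints each page URL and post title as it works and writes all posts into <qq>.htm in the current directory.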
