Python下载百度空间文章

Posted on 2012-08-21 10:49 moose 阅读(181) 评论(0) 编辑收藏举报

8.20更新:
保存图片到本地（图片处理方法很有问题，求大牛指导～）
PS: 我用IE无法显示图片，但是使用Opera就可以正常显示。汗……

------------------------------------------------------------------------------------------------------

一段很烂的代码～能用不能看……
代码没有任何技术含量，就是简单的网页另存为……

使用方法：
在cmd中输入
> python "F:\Walkbox\Python\mywork\baidu\getArticleId - r1.py" bspeng922 6
命令格式：python 文件存放路径 [用户名] [下载页数]
下载页数可以不填，不填则为全部下载。如果大于实际总页数，则会重复下载第一页的内容

PS: 只能是新版的百度空间，只测试了“低调优雅”模板，生成的是html文件
PPS:突然发现一个奇特的功能，这段代码竟然可以用来刷百度空间的访问量……
百度空间的设定太坑爹了～

# -*- coding: utf8 -*-
import urllib
import re,os,sys,time


def articleDownload(username,pageCount):
    #判断传入的参数是否合法
    if username == "" : username = "bspeng922"
    if pageCount == "" or int(pageCount)<0 :
        pageCount = 0
    else:
        pageCount = int(pageCount) + 1
    print "Blog: http://hi.baidu.com/new/%s"%username

    #文件保存目录，可修改
    saveDrive = "E:\\test"  #directory to save html files

    #html文件保存目录
    if not os.path.exists(saveDrive) :
        os.mkdir(saveDrive)
    
    mydrive = os.path.join(saveDrive,username)
    if not os.path.exists(mydrive) :
        os.mkdir(mydrive)
        
    #图片保存目录
    imgDir = "img"
    imgPath = os.path.join(saveDrive,username,imgDir)
    if not os.path.exists(imgPath):
        os.mkdir(imgPath)

    #判断传入的页数是否为0，为0则全部下载
    if pageCount == 0 :
        fstbaidu = urllib.urlopen("http://hi.baidu.com/new/%s"%username)    
        totalRecord,pagesize=0,0
    
        for fstline in fstbaidu:        
            if fstline.find("allCount")>0:  #only one tag
                totalRecord = int(fstline[fstline.index("'")+1:fstline.rindex("'")])
            if fstline.find("pageSize")>0:
                pagesize = int(fstline[fstline.index("'")+1:fstline.rindex("'")])
    
            if pagesize != 0 and totalRecord != 0:
                pageCount = totalRecord/pagesize
                if totalRecord / float(pagesize) > totalRecord/pagesize:
                    pageCount = pageCount + 2
        
        fstbaidu.close()
    print "Page Count: ",pageCount - 1 


    
    #根据文章ID获得文章实际链接
    articleCount = 0    
    sumHtmlPath = os.path.join(saveDrive,"%s.html"%username)
    sumfile = open(sumHtmlPath,"w") #the sum file
    aTagCmp = re.compile("""<a href="/%s/item/([\w]*?)" class="a-incontent a-title cs-contentblock-hoverlink" target=_blank>(.*?)</a>"""%username)

    for page in range(1,pageCount):
        thisPageUrl = urllib.urlopen("http://hi.baidu.com/new/%s?page=%d"%(username,page))
        print "Page: ",page
    
        for line in thisPageUrl:
            if line.find("a-incontent a-title")>0 :
                articleCount += 1    #博客文章数目
                linefind = aTagCmp.findall(line)
                #print linefind
            
                for line in linefind :

                    #文章的ID和名称
                    myurl = line[0]
                    mytitle = line[1]
                    sumfile.write("""<a href='%s\\%s.html' target='blank'>%s</a><br>"""%(username,myurl,mytitle))

                    #获得真实的文章，并保存
                    thispath = os.path.join(mydrive,"%s.html"%myurl)
                    thisfile = open(thispath,'w')
                    
                    thisArticle = urllib.urlopen("http://hi.baidu.com/%s/item/%s"%(username,myurl))

                    for thisline in thisArticle:
                        imgCount = 0
                        badImg = 0
                        
                        if thisline.find("content-head clearfix")>0:    #只取正文
                            #匹配图片标签
                            imgTagCmp = re.compile("""<img.*?src="(.*?)".*?>""")
                            imglist = imgTagCmp.findall(thisline)

                            for imglink in imglist :
                                imageNewPath = ""
                                #print imglink

                                if imglink.find("""://""")>0:
                                    imageName = imglink[imglink.rindex("/")+1:]

                                    #下载图片
                                    try:
                                        urllib.urlretrieve(imglink,os.path.join(imgPath,imageName))
                                        imgCount += 1
                                    except :    #不能下载则报错
                                        print "cannot download this image: "+imageName

                                    #替换图片链接
                                    imageNewPath = """<img src="%s/%s" />"""%(imgDir,imageName)
                                    thisImgCmp = re.compile("""<img width="\d{1,4}" height="\d{1,4}" src="http://.*?/%s" />|<img src="http://.*?/%s" small="0" />|<img src="http://.*?/%s" />|<img small="0" src="http://.*?/%s" />"""%(imageName,imageName,imageName,imageName))
                                    #print imageNewPath
                                    
                                    try:
                                        #print thisImgCmp.findall(thisline)
                                        thisline = thisImgCmp.sub(imageNewPath,thisline) #每次都对当前图片标签进行替换
                                        #print thisline
                                    except:
                                        print "UnExpect error"

                                else:
                                    badImg += 1
                                    
                            #删除多余的内容
                            pos = thisline.find("mod-post-info clearfix")
                            if pos>0 :
                                thisline = thisline[0:pos-12]

                            thisfile.write(thisline.strip())                

                    thisfile.close()
                    thisArticle.close()
                    #print "Image Count: %d  Bad Image: %d"%(imgCount, badImg)
        thisPageUrl.close()
    sumfile.close()

    print "Article Count: ",articleCount

if __name__ == "__main__":
    # Script entry point (Python 2: uses raw_input).
    # Usage: python <script> [username] [pages]
    st = time.time()

    # Read the command-line arguments.  The page argument is handed over
    # untouched: articleDownload does its own conversion, and an empty
    # string is what it treats as "download every page".
    if len(sys.argv) >= 2:
        uname = sys.argv[1]
        if len(sys.argv) > 2:
            pages = sys.argv[2]
        else:
            pages = ""
    else:
        uname = raw_input("Username -> ")
        pages = raw_input("Page -> ")

    articleDownload(uname,pages)
    et = time.time()
    print("Time used: %0.2fs" % (et - st))

刷新页面返回顶部

Moose

导航

公告

Python下载百度空间文章