This is an alternative to OfflineExplorer.
Thanks to the article [1] listed in the Reference section. I modified several lines to adapt it to my blog; here is the change list (the adapted lines are also shown in the short snippet after it):
1. L193: change "homepage1_BottomPager" to "homepage1_HomePageDays_BottomPager", because "homepage1_BottomPager" does not appear anywhere in the HTML source of my cnblogs pages (a small sketch further below shows how to check which id your own page uses).
2. L394: set url to your last list page, i.e. a page like http://www.cnblogs.com/GnagWang/default.html?page=19 that contains the link to the last page.
3. L396: set output to the directory on your local disk where the backup should be written.
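For quick reference, the three adapted lines look like this in the listing below (the numbers are the listing's own line numbers):

193     pageList = document.findall('.//{0}div[@id=\'homepage1_HomePageDays_BottomPager\']'.format(namespace))
394     url = "http://www.cnblogs.com/yaoyansi/default.html?page=4"
396     output = "/tmp/my_tmp/cnblogs"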
Enjoy it!
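By the way, item 1 assumes your theme uses the same pager id as mine. If it does not, a quick way to find the right id is to fetch your list page with the same urllib2 + html5lib calls the script uses and print every div id that contains "Pager". This is only a small sketch under that assumption (the URL is my own list page and is meant to be replaced), not part of the backup script:

# sketch: print the id of every <div> whose id contains "Pager" on a cnblogs list page
import urllib2
import html5lib

listPage = "http://www.cnblogs.com/yaoyansi/default.html?page=4"   # replace with your own list page
header = {"User-Agent": "Mozilla-Firefox5.0"}
namespace = "{http://www.w3.org/1999/xhtml}"
data = urllib2.urlopen(urllib2.Request(listPage, None, header)).read()
document = html5lib.parse(data)                  # etree element, same call as in the script below
for div in document.findall('.//{0}div'.format(namespace)):
    divId = div.attrib.get("id", "")
    if "Pager" in divId:
        print divId                              # e.g. homepage1_HomePageDays_BottomPager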
  1 #! encoding=utf-8
  2 
  3 # cnblogs blog backup. Usage: edit url and output at the bottom of this file, then run it.
  4 
  5 import urllib2
  6 import re
  7 import os
  8 import sys
  9 # from HTMLParser import HTMLParser
 10 import html5lib
 11 # from xml.etree.ElementTree import ElementTree
 12 from urlparse import urlparse
 13 import xml
 14 import codecs
 15 import traceback
 16 import time
 17 
 18 # class MyHTMLParser(HTMLParser):
 19 
 20 #     def handle_starttag(self, tag, attrs):
 21 #         # if tag.lower() == "img":
 22 #         print "Encountered the beginning of a %s tag,attrs size %d" % (tag ,len(attrs))
 23 #         for x in attrs:
 24 #             print "name %s,value %s" % (x[0],x[1])
 25 #     def handle_endtag(self, tag):
 26 #         print "Encountered the end of a %s tag" % tag
 27 
 28 #     def handle_startendtag(self, tag, attrs):
 29 #         print "Encountered the beginning of a %s tag,attrs size %d" % (tag ,len(attrs))
 30 #         for x in attrs:
 31 #             print "name %s,value %s" % (x[0],x[1])
 32 
 33 # number of download attempts per resource
 34 gTestTime = 5
 35 
 36 def DownloadFile(url,output):  # download one resource (image/css/js), mirroring its URL path under output
 37     responseText = None
 38     dirssPath = None
 39     try:
 40         res = urlparse(url)
 41         url = res.scheme+"://"+res.netloc+res.path
 42         path = res.path
 43         index = path.rfind('/')
 44         dirss = "/"
 45         if index != -1:
 46             dirss = output + "/" + res.netloc.encode("utf-8") + path[0:index].encode("utf-8")
 47         dirssPath = output + "/" + res.netloc.encode("utf-8") + path.encode("utf-8")
 48         dirss_ansi = dirss.decode('utf-8')
 49         if not os.path.exists(dirss_ansi):
 50             os.makedirs(dirss_ansi)
 51         global gTestTime
 52         count = gTestTime
 53         while True:
 54             if count < 0:
 55                 break
 56             count = count - 1
 57             header={"User-Agent": "Mozilla-Firefox5.0"}
 58             if not url.startswith("http://"):
 59                 break
 60             try:
 61                 # print "url: %s:%d" % (url,count)
 62                 time.sleep(0.5)
 63                 request = urllib2.Request(url,None,header)
 64                 response = urllib2.urlopen(request)
 65                 dirssPath_ansi = dirssPath.decode("utf-8")
 66                 if not os.path.exists(dirssPath_ansi):
 67                     resourceFile = open(dirssPath_ansi,"wb")
 68                     responseText = response.read()
 69                     if url.endswith(".js"):
 70                         responseText = responseText.replace("http://","")
 71                         responseText = responseText.replace("https://","")
 72                     resourceFile.write(responseText)
 73                     resourceFile.close()
 74                 break
 75             except Exception,e:
 76                 print "DownloadFile: %s:%s:%d" % (e,url,count)
 77                 # pass
 78                 # exstr = traceback.format_exc()
 79                 # print exstr
 80 
 81     except Exception,e:
 82         pass
 83         # exstr = traceback.format_exc()
 84         # print exstr
 85 
 86     return (responseText,url,output)
 87 
 88 def ReadCss(css):  # download the url(...) resources referenced inside a stylesheet
 89     # print "ReadCss"
 90     mode = 'url\(\"?([^)]+)\"?\)'
 91     pattern = re.compile(mode)
 92     try:
 93         text = css[0]
 94         if css[0] == None:
 95             return
 96         strMatch = pattern.findall(text)
 97         size = len(strMatch)
 98         # print "size: ",size
 99         for i in range(0,size,1):
100             one = strMatch[i]
101             newurl = GetConcatUrl(css[1],one)
102             DownloadFile(newurl,css[2])
103     except Exception,e:
104         pass
105         # exstr = traceback.format_exc()
106         # print exstr
107 
108 def Download(url,output):  # save one article page together with its images, css and js
109     # try:
110     header={"User-Agent": "Mozilla-Firefox5.0"}
111     namespace = "{http://www.w3.org/1999/xhtml}"
112     request = urllib2.Request(url,None,header)
113     response = urllib2.urlopen(request)
114 
115     data = response.read()
116     document = html5lib.parse(data)
117     imgElements = document.findall('.//{0}img'.format(namespace))
118     # print "imgElements %d" % len(imgElements)
119     for img in imgElements:
120         src = img.attrib["src"]
121         # print "src %s" % src
122         try:
123             res = urlparse(src)
124             # skip images that are not hosted on cnblogs
125             if not res.netloc.endswith(".cnblogs.com"):
126                 print "image not download: %s:%s" % (src,res.netloc)
127                 continue
128         except Exception,e:
129             pass
130         DownloadFile(src,output)
131 
132     linkElements = document.findall('.//{0}link'.format(namespace))
133     # print "linkElements %d" % len(linkElements)
134     for link in linkElements:
135         href = link.attrib["href"]
136         # print "href %s" % href
137         text = DownloadFile(href,output)
138         if link.attrib.has_key("rel") and link.attrib["rel"].lower() == "stylesheet":
139             ReadCss(text)
140 
141     scriptElements = document.findall('.//{0}script'.format(namespace))
142     # print "scriptElements %d" % len(scriptElements)
143     for script in scriptElements:
144         if script.attrib.has_key("src"):
145             src = script.attrib["src"]
146             # print "src %s" % src
147             DownloadFile(src,output)
148 
149     htmlNameIndex = url.rfind("/");
150     urlLen = len(url)
151     htmlName = GetHtmlName(url)
152     output = output.decode("utf-8") + "/"+htmlName+".htm"
153     data = data.replace("http://","")
154     data = data.replace("https://","")
155     data = data.replace("www.w3.org/1999/xhtml","http://www.w3.org/1999/xhtml")
156 
157     resourceFile = open(output,"wb")
158     resourceFile.write(data)
159     resourceFile.close()
160 
161 def GetConcatUrl(url,png):  # resolve a relative "../" reference against the stylesheet url
162     # one: "../images/f_icon.png" -- url http://static.csdn.net/public/common/toolbar/css/index.css
163     count = 0
164     index = png.find("..")
165     startindex = None
166     while index != -1:
167         count = count + 1;
168         startindex = index + 2
169         index = png.find("..",startindex)
170 
171     second = png[startindex:]
172     length = len(url)
173     index = url.rfind("/")
174     endindex = 0
175     while count >= 0 and index != -1:
176         endindex = index
177         index = url.rfind("/",0, endindex)
178         count = count - 1
179     first = url[0:endindex]
180     return first+second
181 
182 def getAllListUrl(url):  # build the list of paging URLs (default.html?page=1..N)
183     header={"User-Agent": "Mozilla-Firefox5.0"}
184     request = urllib2.Request(url,None,header)
185     response = urllib2.urlopen(request)
186     data = response.read()
187 
188     # By default, the document will be an xml.etree element instance. Whenever possible, html5lib chooses the accelerated ElementTree implementation (i.e. xml.etree.cElementTree on Python 2.x).
189     document = html5lib.parse(data)
190     namespace = "{http://www.w3.org/1999/xhtml}"
191 
192     # get <div id="homepage1_BottomPager" class="topicListFooter">
193     pageList = document.findall('.//{0}div[@id=\'homepage1_HomePageDays_BottomPager\']'.format(namespace))
194     print( "Debug>len(pageList)=%d"%len(pageList) );
195     # get <div class="pager">
196     alinks = list(pageList[0])
197     # get content in <div class="pager">, like: <a href="http://www.cnblogs.com/GnagWang/default.html?page=1">
198     alinks1 = list(alinks[0])
199     lastArticle = alinks1[len(alinks1)-1]
200 
201     # lastArticleHref = u'http://www.cnblogs.com/GnagWang/default.html?page=20'
202     lastArticleHref = lastArticle.attrib["href"]
203     lastPageIndex = lastArticleHref.rfind("=")
204     lastPageNum = int(lastArticleHref[lastPageIndex+1:])
205     urlInfo = lastArticleHref[0:lastPageIndex]
206 
207     urlList = []
208     for x in xrange(1,lastPageNum+1):
209         listUrl = urlInfo+"="+str(x)
210         urlList.append(listUrl)
211 
212     return urlList
213 
214 
215 def getArticleList(url):  # return [href, title] for every article
216     # collect the URLs of all articles
217     # <div id="article_toplist" class="list"></div>
218     # <div id="article_list" class="list"
219 
220     # <div class="list_item article_item"
221 
222     # <div class="article_title">
223     # <span class="ico ico_type_Original"></span>
224     # <h1>
225     # <span class="link_title">
226     # <a href="/infoworld/article/details/18984183">
227 
228     # <div class="article_manage">
229     # <span class="link_postdate"></span>
230 
231     urlList = getAllListUrl(url)
232     print "文章页数(number of pages) ",len(urlList)
233     header={"User-Agent": "Mozilla-Firefox5.0"}
234 
235     allLists = []
236 
237     strPage = "分析 第 {0} 页 ".decode("utf-8").encode("utf-8")  # "parsing page {0}"
238     pageNum = 0
239     global gTestTime
240     for one in urlList:
241         tryCount = gTestTime # try count
242         pageNum = pageNum + 1
243         pageNumStr = strPage.format(pageNum)
244         print pageNumStr
245 
246         while tryCount > 0:
247             try:
248                 tryCount = tryCount - 1
249                 time.sleep(0.5) # requesting too fast makes the server stop responding
250                 request = urllib2.Request(one,None,header)
251                 response = urllib2.urlopen(request)
252 
253                 data = response.read()
254                 document = html5lib.parse(data,encoding="utf-8")
255                 namespace = "{http://www.w3.org/1999/xhtml}"
256                 # .//{0}div[@id=\'article_toplist\']
257                 #topLists = document.findall('.//{0}div[@id=\'article_toplist\']/{0}div[@class=\'list_item article_item\']'.format(namespace))
258                 #articleLists = document.findall('.//{0}div[@id=\'article_list\']/{0}div[@class=\'list_item article_item\']'.format(namespace))
259                 articleLists = document.findall('.//{0}div[@class=\'postTitle\']'.format(namespace))
260                 allLists = allLists + articleLists
261                 break
262             except Exception, e:
263                 print "getArticleList %s:%s:%d" % (e,one,tryCount)
264 
265 
266     count = 0 # article count
267     artices = []
268     for article in allLists:
269         count = count+1
270         alink = article.find(".//{0}a".format(namespace))
271         # href = u'http://www.cnblogs.com/GnagWang/archive/2010/04/02/1702721.html'
272         href = alink.attrib["href"]
273         #oneHref = "http://blog.csdn.net"+href
274         oneHref = href
275 
276         childElement = list(alink)
277         linkIter = alink.itertext()
278         title = "".encode("utf-8")
279         for x in linkIter:
280             title = title+x.strip().encode("utf-8")
281         artices.append([oneHref,title])
282 
283     return artices
284 
285 def GetUserName(url):  # extract the blog user name from the list url
286     htmlNameIndex = url.rfind("/");
287     urlLen = len(url)
288     htmlName = ""
289     htmlNameIndex1 = url.rfind("/",0,htmlNameIndex)
290     htmlName = url[htmlNameIndex1+1:htmlNameIndex]
291     # if htmlNameIndex+1 == urlLen:
292     #     htmlNameIndex = url.rfind("/",0,htmlNameIndex)
293     #     htmlName = url[htmlNameIndex+1:urlLen-1]
294     # else:
295     #     htmlName = url[htmlNameIndex+1:]
296     return htmlName
297 
298 
299 def GetHtmlName(url):  # extract the html file name part of a url
300     htmlNameIndex = url.rfind("/");
301     urlLen = len(url)
302     htmlName = ""
303     if htmlNameIndex+1 == urlLen:
304         htmlNameIndex = url.rfind("/",0,htmlNameIndex)
305         htmlName = url[htmlNameIndex+1:urlLen-1]
306     else:
307         htmlName = url[htmlNameIndex+1:]
308     return htmlName
309 
310 
311 
312 # url must look like http://www.cnblogs.com/GnagWang/default.html?page=19, and that page must contain the link to the last page. For example, GnagWang has 20 pages in total, so a URL of this form is recommended.
313 def Start(url,output):  # entry point: build the index and navigation pages, then download every article
314 
315     print "备份开始"  # "backup started"
316     lists = getArticleList(url)
317     username = GetUserName(url)
318     output_username = output+"/"+username
319     output_username = output_username.replace("\\","/")  # normalize Windows-style separators
320     if not os.path.exists(output_username.decode("utf-8")):
321         os.mkdir(output_username.decode("utf-8"))
322 
323     totalNum = len(lists)
324     print "总文章数(number of articles): %d" % totalNum
325 
326     # generate the index (frameset) page
327     doctype = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'
328     charset = '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />'
329     indexHtml = output_username + ".htm"
330     f = open(indexHtml.decode("utf-8"),"w")
331     print >> f,doctype
332     print >> f,'<html>'
333     print >> f,'<head>'
334     print >> f,charset
335     print >> f,'</head>'
336     print >> f,'<frameset cols=\"20%,*\">'
337     navigationHtmlName = username+'-navigation.htm'
338     print >> f,'<frame src=\"'+navigationHtmlName+'\" />'
339     firstHtmlName = GetHtmlName(lists[0][0])
340     print >> f,'<frame src=\"'+username+'/'+firstHtmlName+'.htm\" name=\"showframe\">'
341     print >> f,'</frameset>'
342     print >> f,'</html>'
343     f.close()
344 
345     # generate the navigation page
346     navigationHtml = output+"/"+navigationHtmlName
347     # f = open(navigationHtml.decode("utf-8"),"w")
348     f = codecs.open(navigationHtml.decode("utf-8"),"w","utf-8-sig")
349     print >> f,doctype
350     print >> f,'<html>'
351     print >> f,'<head>'
352     print >> f,charset
353     print >> f,'<style> body{font: 12px Verdana, Arial, Helvetica, sans-serif;}a{color: #808080;}</style>'
354     print >> f,'</head>'
355     print >> f,'<body>'
356     count = 0
357     for x in lists:
358         count = count + 1
359         articleIdHtml = username+"/"+GetHtmlName(x[0])+".htm"
360         print >> f,'<a href=\"'+articleIdHtml + '\" target=\"showframe\">'+str(count)+'.'+x[1].decode("utf-8")+'</a><br /><br />'
361     print >> f,'</body>'
362     print >> f,'</html>'
363     f.close()
364 
365     print "开始下载文章"  # "start downloading articles"
366     currentNum = 0
367     strPage = "{0}:{1}.".decode("utf-8").encode("utf-8")
368     global gTestTime
369     for x in lists:
370         count = gTestTime
371         currentNum = currentNum+1
372         while True:
373             if count < 0:
374                 break
375             count = count - 1
376             try:
377                 time.sleep(1) # requesting too fast makes csdn return 503 errors
378                 strPageTemp = strPage.format(totalNum,currentNum)
379                 strPageTemp = strPageTemp+x[1]
380                 print strPageTemp # this sometimes fails with an "output is not utf-8" error when run standalone
381 
382                 print x[0]
383                 print "\n"
384                 Download(x[0],output_username)
385                 break
386             except Exception, e:
387                 # exstr = traceback.format_exc()
388                 # print exstr
389                 pass
390 
391 
392 # url must look like http://www.cnblogs.com/GnagWang/default.html?page=21, and that page must contain the link to the last page. For example, GnagWang has 20 pages in total, so a URL of that form is recommended.
393 if __name__=='__main__':
394     url = "http://www.cnblogs.com/yaoyansi/default.html?page=4"
395     #output = "C:/Users/apple/Desktop/新建文件夹"
396     output = "/tmp/my_tmp/cnblogs"
397     Start(url,output)
398     # Download("http://blog.csdn.net/dcraw/article/details/6858820",
399     #     "C:/Users/apple/Desktop/新建文件夹/infoworld")
Reference:
[1] http://blog.csdn.net/llrraa2010/article/details/35540845
Please credit the source when reposting.