
This is an alternative to OfflineExplorer.

 

Thanks to the article [1] listed in the Reference section; I modified a few of its lines to adapt it to my blog. Here is the change list:

1. L193, change "homepage1_BottomPager" to "homepage1_HomePageDays_BottomPager", because "homepage1_BottomPager" does not appear anywhere in the source of my cnblogs pages (a sketch for checking which id your own blog uses follows this list).

2. L394, set url to a listing page of your blog that contains a link to the last page.

3. L396, set output to a directory on your local disk.

Enjoy it!
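If your blog theme uses a different pager id (change 1 above), a quick way to check is to fetch one of your list pages and print the ids of the pager divs. Below is a minimal sketch using the same urllib2/html5lib approach as the script; the URL is a placeholder for your own blog's list page.

#! encoding=utf-8
# Sketch: print the id of every <div> whose id ends with "BottomPager",
# to find out which pager id your own cnblogs theme uses.
# The URL below is a placeholder; replace it with one of your own list pages.
import urllib2
import html5lib

url = "http://www.cnblogs.com/yaoyansi/default.html?page=1"
header = {"User-Agent": "Mozilla-Firefox5.0"}
request = urllib2.Request(url, None, header)
data = urllib2.urlopen(request).read()

namespace = "{http://www.w3.org/1999/xhtml}"
document = html5lib.parse(data)
for div in document.findall('.//{0}div'.format(namespace)):
  divId = div.attrib.get("id", "")
  if divId.endswith("BottomPager"):
    print divId  # e.g. "homepage1_HomePageDays_BottomPager"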

  1 #! encoding=utf-8
  2 
  3 # cnblogs blog backup. Usage: edit the url and output values at the bottom of the file, then run the script.
  4 
  5 import urllib2
  6 import re
  7 import os
  8 import sys
  9 # from HTMLParser import HTMLParser
 10 import html5lib
 11 # from xml.etree.ElementTree import ElementTree
 12 from urlparse import urlparse
 13 import xml
 14 import codecs
 15 import traceback
 16 import time
 17 
 18 # class MyHTMLParser(HTMLParser):
 19 
 20 #     def handle_starttag(self, tag, attrs):
 21 #         # if tag.lower() == "img":
 22 #             print "Encountered the beginning of a %s tag,attrs size %d" % (tag ,len(attrs))
 23 #             for x in attrs:
 24 #                 print "name %s,value %s" % (x[0],x[1])
 25 #     def handle_endtag(self, tag):
 26 #         print "Encountered the end of a %s tag" % tag
 27 
 28 #     def handle_startendtag(self, tag, attrs):
 29 #         print "Encountered the beginning of a %s tag,attrs size %d" % (tag ,len(attrs))
 30 #         for x in attrs:
 31 #             print "name %s,value %s" % (x[0],x[1])
 32 
 33 # number of attempts per resource
 34 gTestTime = 5
 35 
 36 def DownloadFile(url,output):
 37   responseText = None
 38   dirssPath = None
 39   try:
 40     res = urlparse(url)
 41     url = res.scheme+"://"+res.netloc+res.path
 42     path = res.path
 43     index = path.rfind('/')
 44     dirss = "/"
 45     if index != -1:
 46       dirss =  output + "/" + res.netloc.encode("utf-8") + path[0:index].encode("utf-8")
 47       dirssPath = output + "/" + res.netloc.encode("utf-8") + path.encode("utf-8")
 48       dirss_ansi = dirss.decode('utf-8')
 49       if not os.path.exists(dirss_ansi):
 50         os.makedirs(dirss_ansi)
 51     global gTestTime
 52     count = gTestTime    
 53     while True:
 54       if count < 0:
 55         break
 56       count = count - 1
 57       header={"User-Agent": "Mozilla-Firefox5.0"}
 58       if not url.startswith("http://"):
 59         break
 60       try:
 61         # print "url: %s:%d" % (url,count)
 62         time.sleep(0.5)
 63         request = urllib2.Request(url,None,header)
 64         response = urllib2.urlopen(request)
 65         dirssPath_ansi = dirssPath.decode("utf-8")
 66         if not os.path.exists(dirssPath_ansi):
 67           resourceFile = open(dirssPath_ansi,"wb")
 68           responseText = response.read()
 69           if url.endswith(".js"):
 70             responseText = responseText.replace("http://","")
 71             responseText = responseText.replace("https://","")
 72           resourceFile.write(responseText)
 73           resourceFile.close()
 74         break         
 75       except Exception,e:
 76         print "DownloadFile: %s:%s:%d" % (e,url,count)
 77         # pass
 78         # exstr = traceback.format_exc()
 79         # print exstr
 80 
 81   except Exception,e:
 82       pass
 83       # exstr = traceback.format_exc()
 84       # print exstr
 85   
 86   return (responseText,url,output)
 87 
 88 def ReadCss(css):
 89   # print "ReadCss"
 90   mode = 'url\(\"?([^)]+)\"?\)'
 91   pattern = re.compile(mode)
 92   try:
 93     text = css[0]
 94     if css[0] == None:
 95       return
 96     strMatch = pattern.findall(text)
 97     size = len(strMatch)
 98     # print "size: ",size
 99     for i in range(0,size,1):
100       one = strMatch[i]
101       newurl = GetConcatUrl(css[1],one)
102       DownloadFile(newurl,css[2])
103   except Exception,e:
104       pass
105       # exstr = traceback.format_exc()
106       # print exstr 
107 
108 def Download(url,output):
109   # try:
110   header={"User-Agent": "Mozilla-Firefox5.0"}
111   namespace = "{http://www.w3.org/1999/xhtml}"
112   request = urllib2.Request(url,None,header)
113   response = urllib2.urlopen(request)
114 
115   data = response.read()
116   document = html5lib.parse(data)
117   imgElements = document.findall('.//{0}img'.format(namespace))
118   # print "imgElements %d" % len(imgElements)
119   for img in imgElements:
120     src = img.attrib["src"]
121     # print "src %s" % src
122     try:
123       res = urlparse(src)
 124       # skip images that are not hosted on cnblogs
125       if not res.netloc.endswith(".cnblogs.com"):
126         print "image not download: %s:%s" % (src,res.netloc)
127         continue
128     except Exception,e:
129       pass
130     DownloadFile(src,output)
131 
132   linkElements = document.findall('.//{0}link'.format(namespace))
133   # print "linkElements %d" % len(linkElements)
134   for link in linkElements:
135     href = link.attrib["href"]
136     # print "href %s" % href
137     text = DownloadFile(href,output)
138     if link.attrib.has_key("rel") and link.attrib["rel"].lower() == "stylesheet":
139       ReadCss(text)
140 
141   scriptElements = document.findall('.//{0}script'.format(namespace))
142   # print "scriptElements %d" % len(scriptElements)
143   for script in scriptElements:
144     if script.attrib.has_key("src"):
145       src = script.attrib["src"]
146       # print "src %s" % src
147       DownloadFile(src,output)
148     
149   htmlNameIndex = url.rfind("/");
150   urlLen = len(url)
151   htmlName = GetHtmlName(url)
152   output = output.decode("utf-8") + "/"+htmlName+".htm"
153   data = data.replace("http://","")
154   data = data.replace("https://","")
155   data = data.replace("www.w3.org/1999/xhtml","http://www.w3.org/1999/xhtml")
156 
157   resourceFile = open(output,"wb")
158   resourceFile.write(data)
159   resourceFile.close()
160 
161 def GetConcatUrl(url,png):
162   # one: "../images/f_icon.png" -- url http://static.csdn.net/public/common/toolbar/css/index.css
163   count = 0
164   index = png.find("..")
165   startindex = None
166   while index != -1:
167     count = count + 1;
168     startindex = index + 2
169     index = png.find("..",startindex)
170 
171   second = png[startindex:]
172   length = len(url)
173   index = url.rfind("/")
174   endindex = 0
175   while count >= 0 and index != -1:
176     endindex = index
177     index = url.rfind("/",0, endindex)
178     count = count - 1
179   first = url[0:endindex]
180   return first+second
181 
182 def getAllListUrl(url):
183   header={"User-Agent": "Mozilla-Firefox5.0"}
184   request = urllib2.Request(url,None,header)
185   response = urllib2.urlopen(request)
186   data = response.read()
187   
 188   # By default, the document will be an xml.etree element instance. Whenever possible, html5lib chooses the accelerated ElementTree implementation (i.e. xml.etree.cElementTree on Python 2.x).
189   document = html5lib.parse(data)
190   namespace = "{http://www.w3.org/1999/xhtml}"
191 
192   # get <div id="homepage1_BottomPager" class="topicListFooter">
193   pageList = document.findall('.//{0}div[@id=\'homepage1_HomePageDays_BottomPager\']'.format(namespace))
194   print( "Debug>len(pageList)=%d"%len(pageList) );
195   # get <div class="pager">
196   alinks = list(pageList[0])
197   # get content in <div class="pager">, like:<a href="http://www.cnblogs.com/GnagWang/default.html?page=1">
198   alinks1 = list(alinks[0])
199   lastArticle = alinks1[len(alinks1)-1]
200
201   # lastArticleHref = u'http://www.cnblogs.com/GnagWang/default.html?page=20'
202   lastArticleHref = lastArticle.attrib["href"]
203   lastPageIndex = lastArticleHref.rfind("=")
204   lastPageNum = int(lastArticleHref[lastPageIndex+1:])
205   urlInfo = lastArticleHref[0:lastPageIndex]
206
207   urlList = []
208   for x in xrange(1,lastPageNum+1):
209     listUrl = urlInfo+"="+str(x)
210     urlList.append(listUrl)
211
212   return urlList
213
214
215 def getArticleList(url):
216   # get the urls of all articles
217   # <div id="article_toplist" class="list"></div>
218   # <div id="article_list" class="list"
219
220   # <div class="list_item article_item"
221
222   # <div class="article_title">
223   #   <span class="ico ico_type_Original"></span>
224   #   <h1>
225   #     <span class="link_title">
226   #       <a href="/infoworld/article/details/18984183">
227
228   # <div class="article_manage">
229   #   <span class="link_postdate"></span>
230
231   urlList = getAllListUrl(url)
232   print "文章页数(number of pages) ",len(urlList)
233   header={"User-Agent": "Mozilla-Firefox5.0"}
234
235   allLists = []
236
237   strPage = "分析 第 {0} 页 ".decode("utf-8").encode("utf-8")
238   pageNum = 0
239   global gTestTime
240   for one in urlList:
241     tryCount = gTestTime # try count
242     pageNum = pageNum + 1
243     pageNumStr = strPage.format(pageNum)
244     print pageNumStr
245
246     while tryCount > 0:
247       try:
248         tryCount = tryCount - 1
249         time.sleep(0.5) # the server stops responding if requests come too fast
250         request = urllib2.Request(one,None,header)
251         response = urllib2.urlopen(request)
252
253         data = response.read()
254         document = html5lib.parse(data,encoding="utf-8")
255         namespace = "{http://www.w3.org/1999/xhtml}"
256         # .//{0}div[@id=\'article_toplist\']
257         #topLists = document.findall('.//{0}div[@id=\'article_toplist\']/{0}div[@class=\'list_item article_item\']'.format(namespace))
258         #articleLists = document.findall('.//{0}div[@id=\'article_list\']/{0}div[@class=\'list_item article_item\']'.format(namespace))
259         articleLists = document.findall('.//{0}div[@class=\'postTitle\']'.format(namespace))
260         allLists = allLists + articleLists
261         break
262       except Exception, e:
263         print "getArticleList %s:%s:%d" % (e,one,tryCount)
264
265
266   count = 0 # number of articles
267   artices = []
268   for article in allLists:
269     count = count+1
270     alink = article.find(".//{0}a".format(namespace))
271     # href = u'http://www.cnblogs.com/GnagWang/archive/2010/04/02/1702721.html'
272     href = alink.attrib["href"]
273     #oneHref = "http://blog.csdn.net"+href
274     oneHref = href
275
276     childElement = list(alink)
277     linkIter = alink.itertext()
278     title = "".encode("utf-8")
279     for x in linkIter:
280       title = title+x.strip().encode("utf-8")
281     artices.append([oneHref,title])
282
283   return artices
284
285 def GetUserName(url):
286   htmlNameIndex = url.rfind("/");
287   urlLen = len(url)
288   htmlName = ""
289   htmlNameIndex1 = url.rfind("/",0,htmlNameIndex)
290   htmlName = url[htmlNameIndex1+1:htmlNameIndex]
291   # if htmlNameIndex+1 == urlLen:
292   #   htmlNameIndex = url.rfind("/",0,htmlNameIndex)
293   #   htmlName = url[htmlNameIndex+1:urlLen-1]
294   # else:
295   #   htmlName = url[htmlNameIndex+1:]
296   return htmlName
297
298
299 def GetHtmlName(url):
300   htmlNameIndex = url.rfind("/");
301   urlLen = len(url)
302   htmlName = ""
303   if htmlNameIndex+1 == urlLen:
304     htmlNameIndex = url.rfind("/",0,htmlNameIndex)
305     htmlName = url[htmlNameIndex+1:urlLen-1]
306   else:
307     htmlName = url[htmlNameIndex+1:]
308   return htmlName
309
310
311
312 # url must look like http://www.cnblogs.com/GnagWang/default.html?page=19, and that page must contain a link to the last page. For example, if GnagWang has 20 pages in total, a URL of the form above is recommended.
313 def Start(url,output):
314
315   print "备份开始"
316   lists = getArticleList(url)
317   username = GetUserName(url)
318   output_username = output+"/"+username
319   output_username = output_username.replace("\\","/")
320   if not os.path.exists(output_username.decode("utf-8")):
321     os.mkdir(output_username.decode("utf-8"))
322
323   totalNum = len(lists)
324   print "总文章数(number of articles): %d" % totalNum
325
326   # generate the index (frameset) page
327   doctype = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'
328   charset = '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />'
329   indexHtml = output_username + ".htm"
330   f = open(indexHtml.decode("utf-8"),"w")
331   print >> f,doctype
332   print >> f,'<html>'
333   print >> f,'<head>'
334   print >> f,charset
335   print >> f,'</head>'
336   print >> f,'<frameset cols=\"20%,*\">'
337   navigationHtmlName = username+'-navigation.htm'
338   print >> f,'<frame src=\"'+navigationHtmlName+'\" />'
339   firstHtmlName = GetHtmlName(lists[0][0])
340   print >> f,'<frame src=\"'+username+'/'+firstHtmlName+'.htm\" name=\"showframe\">'
341   print >> f,'</frameset>'
342   print >> f,'</html>'
343   f.close()
344
345   # generate the navigation page
346   navigationHtml = output+"/"+navigationHtmlName
347   # f = open(navigationHtml.decode("utf-8"),"w")
348   f = codecs.open(navigationHtml.decode("utf-8"),"w","utf-8-sig")
349   print >> f,doctype
350   print >> f,'<html>'
351   print >> f,'<head>'
352   print >> f,charset
353   print >> f,'<style> body{font: 12px Verdana, Arial, Helvetica, sans-serif;}a{color: #808080;}</style>'
354   print >> f,'</head>'
355   print >> f,'<body>'
356   count = 0
357   for x in lists:
358     count = count + 1
359     articleIdHtml = username+"/"+GetHtmlName(x[0])+".htm"
360     print >> f,'<a href=\"'+articleIdHtml + '\" target=\"showframe\">'+str(count)+'.'+x[1].decode("utf-8")+'</a><br /><br />'
361   print >> f,'</body>'
362   print >> f,'</html>'
363   f.close()
364
365   print "开始下载文章"
366   currentNum = 0
367   strPage = "{0}:{1}.".decode("utf-8").encode("utf-8")
368   global gTestTime
369   for x in lists:
370     count = gTestTime
371     currentNum = currentNum+1
372     while True:
373       if count < 0:
374         break
375       count = count - 1
376       try:
377         time.sleep(1) # requesting too fast makes csdn return a 503 error
378         strPageTemp = strPage.format(totalNum,currentNum)
379         strPageTemp = strPageTemp+x[1]
380         print strPageTemp # this print sometimes fails with an "output is not utf-8" error when run standalone
381
382         print x[0]
383         print "\n"
384         Download(x[0],output_username)
385         break
386       except Exception, e:
387         # exstr = traceback.format_exc()
388         # print exstr
389         pass
390
391
392 # url must look like http://www.cnblogs.com/GnagWang/default.html?page=21, and that page must contain a link to the last page. For example, if GnagWang has 20 pages in total, a URL of the form above is recommended.
393 if __name__=='__main__':
394   url = "http://www.cnblogs.com/yaoyansi/default.html?page=4"
395   #output = "C:/Users/apple/Desktop/新建文件夹"
396   output = "/tmp/my_tmp/cnblogs"
397   Start(url,output)
398   # Download("http://blog.csdn.net/dcraw/article/details/6858820",
399   #          "C:/Users/apple/Desktop/新建文件夹/infoworld")
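Once url and output near the bottom are set (changes 2 and 3 above), running the file directly starts the backup. The same entry point can also be called from another script; a minimal usage sketch, assuming the code above is saved as cnblogs_backup.py (the module name and paths are placeholders):

# Minimal usage sketch; cnblogs_backup.py and the paths below are placeholders.
from cnblogs_backup import Start

# First argument: a list page of the blog that links to the last page.
# Second argument: a local output directory.
Start("http://www.cnblogs.com/yaoyansi/default.html?page=4", "/tmp/my_tmp/cnblogs")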

 

Reference:

[1] http://blog.csdn.net/llrraa2010/article/details/35540845

posted on 2014-10-24 18:58 by yys