# -*- coding: utf-8 -*-
"""Scrape the example source files listed on webpy.org and mirror them to disk."""
import requests
import re
import os.path


def getCode(url):
    """Fetch *url* and return a dict mapping file name -> matched HTML fragment.

    The page lists files as ``<h*>name</h*>`` headings, each followed by a
    ``<pre><code>...</code>`` block; group(1) is the heading text (the
    relative file name) and group(0) the whole fragment.
    Returns an empty dict on any non-200 response.
    """
    pattern = re.compile(r'<h\d>([^<]+)?</h\d>\n*<pre><code>[^<]*</code>{1}?', re.S)
    dic = {}
    r = requests.get(url)
    if r.status_code == 200:
        # Use the compiled pattern directly instead of re-passing it to re.finditer.
        for g in pattern.finditer(r.text):
            dic[g.group(1)] = g.group(0)
    return dic


def saveFile(saveDir, dic):
    """Write every entry of *dic* beneath *saveDir*.

    Keys are ``/``-separated relative file paths; the text between
    ``<code>`` and ``</code>`` in each value is written to the target file.
    Entries without a ``<code>`` section are skipped with a warning
    (previously the whole save was aborted and an open file handle leaked).
    """
    if not os.path.isdir(saveDir):
        os.makedirs(saveDir)
    for key in dic:
        print(key, dic[key])  # progress / debug output
        outPath = saveDir
        fileName = key
        rIndex = key.rfind("/")
        if rIndex != -1:
            # Key carries a sub-directory part: recreate it under saveDir.
            outPath = os.path.join(outPath, key[:rIndex].replace("/", os.sep))
            fileName = key[rIndex + 1:]
            if not os.path.isdir(outPath):
                os.makedirs(outPath)
        fragment = dic[key]
        beginIndex = fragment.find("<code>")
        endIndex = fragment.find("</code>")
        if beginIndex == -1 or endIndex == -1:
            # Bug fix: check BEFORE opening the output file, and skip only
            # this entry instead of returning (which leaked the handle and
            # silently dropped every remaining file).
            print("<code> have not")
            continue
        # `with` guarantees the file is closed even if write() raises.
        with open(os.path.join(outPath, fileName), 'w') as outFile:
            outFile.write(fragment[beginIndex + len("<code>"):endIndex])


if __name__ == "__main__":
    # Guarded so importing this module no longer triggers network/disk I/O.
    url1 = 'http://webpy.org/skeleton/0.3'
    url2 = 'http://webpy.org/src/blog/0.3'
    url3 = 'http://webpy.org/src/todo-list/0.3'
    url4 = 'http://webpy.org/src/wiki/0.3'
    urls = (url1, url2, url3, url4)
    for url in urls:
        # Dict of file name -> HTML fragment for this page.
        dic = getCode(url)
        # Mirror the URL path as a relative directory (http://a/b -> \a\b);
        # the save root is fixed at f:\pyworkspace (Windows-specific script).
        proPath = url.replace("http://", "\\").replace("/", "\\")
        saveFile(r'f:\pyworkspace' + proPath, dic)