import requests
import re
import os.path

#取得文件名和内容对应字典
# Build a mapping from file name (heading text) to the HTML fragment with its code.
def getCode(url):
    """Fetch *url* and return {heading text: matched HTML fragment}.

    A heading (<h1>..<h6>) immediately followed by a <pre><code> block is
    used as the key; the whole matched fragment is the value.  Returns an
    empty dict on any non-200 response.
    """
    # NOTE: group 1 stays optional to keep the original match positions,
    # but the redundant `{1}?` quantifier on the final '>' was dropped
    # (it matched exactly one '>' either way).
    pattern = re.compile(r'<h\d>([^<]+)?</h\d>\n*<pre><code>[^<]*</code>', re.S)
    dic = {}
    # Timeout so a dead host cannot hang the whole scrape forever.
    r = requests.get(url, timeout=30)
    if r.status_code == 200:
        for g in pattern.finditer(r.text):
            # group(1) is optional and may be None (empty heading); a None
            # key would crash saveFile later (None has no .rfind), so skip it.
            if g.group(1) is not None:
                dic[g.group(1)] = g.group(0)
    return dic
        
#输出到文件中
# Write each scraped code fragment out as a file under saveDir.
def saveFile(saveDir, dic):
    """Extract the <code>...</code> body of every fragment in *dic* and
    save it under *saveDir*, creating sub-directories as needed.

    Keys containing '/' are treated as relative paths (e.g. "blog/model.py").
    Entries without a complete <code>...</code> pair are skipped with a
    warning instead of aborting the remaining entries.
    """
    if not os.path.isdir(saveDir):
        os.makedirs(saveDir)
    for key, fragment in dic.items():
        print(key, fragment)
        # Locate the code body BEFORE opening any output file, so a bad
        # entry never leaves an empty file or a leaked open handle behind.
        beginIndex = fragment.find("<code>")
        endIndex = fragment.find("</code>")
        if beginIndex == -1 or endIndex == -1:
            print("<code> have not")
            continue  # was `return`: one bad entry must not abort the rest
        # Split "dir/name" keys into a sub-directory and a plain file name.
        outPath = saveDir
        fileName = key
        rIndex = key.rfind("/")
        if rIndex != -1:
            # os.path.join instead of hard-coded "\\" so this also works
            # outside Windows.
            outPath = os.path.join(outPath, key[:rIndex])
            fileName = key[rIndex + 1:]
            if not os.path.isdir(outPath):
                os.makedirs(outPath)
        body = fragment[beginIndex + len("<code>"):endIndex]
        # `with` guarantees the handle is closed even if write() raises.
        with open(os.path.join(outPath, fileName), 'w') as outFile:
            outFile.write(body)

# web.py example pages whose source code we mirror locally.
url1 = 'http://webpy.org/skeleton/0.3'
url2 = 'http://webpy.org/src/blog/0.3'
url3 = 'http://webpy.org/src/todo-list/0.3'
url4 = 'http://webpy.org/src/wiki/0.3'

urls = (url1, url2, url3, url4)

# Guard the scrape behind __main__ so importing this module no longer
# triggers network requests and disk writes as a side effect.
if __name__ == "__main__":
    for url in urls:
        # Mapping from file name to the HTML fragment holding its code.
        dic = getCode(url)
        # Mirror the URL path as a local relative directory, e.g.
        # "http://webpy.org/src/blog/0.3" -> "\webpy.org\src\blog\0.3".
        proPath = url.replace("http://", "\\").replace("/", "\\")
        # Save everything under the root directory f:\pyworkspace.
        saveFile(r'f:\pyworkspace' + proPath, dic)