pyrebot

Better not to ignore the past but learn from it instead. Otherwise, history has a way of repeating itself.

博客园 首页 新随笔 联系 订阅 管理
  1 #coding:utf-8
  2 import urllib2
  3 import os,sys
  4 from BeautifulSoup import BeautifulSoup          # For processing HTML
  5 from bs4 import BeautifulSoup
  6 class BookSave():
  7     '''
  8     dir:html文件保存目录  url:index.html目录 static_url:js、css所在目录的上级目录
  9     distinguish:用来区分相同tag.name   dis_key:所需的tag属性  key1:所取tag
 10     key2:tag属性   key3:tag属性值
 11     '''
 12     def __init__(self,dir,url,static_url,distinguish,dis_key,key1,key2,key3):
 13         self.dir=dir
 14         self.url = url
 15         self.static_url = static_url
 16         self.distinguish = distinguish
 17         self.dis_key = dis_key
 18         self.key1 = key1
 19         self.key2 = key2
 20         self.key3 = key3
 21 
 22     def AddUrl(self):
 23         if self.dir != '':
 24             list = os.listdir(self.dir)  #列出目录下的所有文件和目录
 25             for line in list:
 26                 if os.path.isdir(line):
 27                     continue
 28                 elif os.path:
 29                     self.JieXiCsss(line)
 30                     self.JieXiJs(line)
 31 
 32     def JieXiCsss(self,file):
 33         filePath = os.path.join(self.dir,file)
 34         print filePath
 35         fp = open(filePath)
 36         soup = BeautifulSoup(fp)
 37         head = soup.head        
 38         tags = head.findAll('link')#,{'rel':'stylesheet'}
 39         if tags != []:
 40             for item in tags:
 41                 try:
 42                     item['href'] = self.static_url + item['href']
 43                     print item['href']    
 44                 except KeyError:
 45                     continue            
 46         else :
 47             print tags,filePath
 48         self.SaveHtml(soup,filePath)
 49 
 50     def JieXiJs(self,file):
 51         filePath = os.path.join(self.dir,file)
 52         fp = open(filePath)
 53         soup = BeautifulSoup(fp)
 54         head = soup.head        
 55         tags = head.findAll('script')#,{'rel':'stylesheet'}
 56         if tags != []:
 57             for item in tags:
 58                 try:
 59                     item['src'] = self.static_url + item['src']
 60                     print item['src']
 61                     self.SaveHtml(soup,filePath)
 62                 except KeyError:
 63                     continue
 64         else :
 65             print tags,filePath
 66         self.SaveFile(soup,filePath)
 67 
 68     def SaveFile(self,soup,file):
 69         html = str(soup)
 70         with open(file,'wb') as code:
 71             code.write(html)
 72 
 73     def IsNullArr(self,Arr):
 74         if Arr != []:
 75             return Arr
 76         else:
 77             print 'array is null'
 78 
 79     def DownLoadHtml(self,arr):
 80         tags = bs.IsNullArr(arr)
 81         for item in tags:
 82             liName = item.parent.name
 83             if any(liName in s for s in self.distinguish):
 84                 continue
 85             else:
 86                 htmlUrl = self.url + item[self.dis_key]
 87                 print htmlUrl
 88                 fileName = os.path.join(self.dir,item[self.dis_key])
 89                 print 'saving:' + htmlUrl
 90                 self.SaveHtml(fileName,htmlUrl)
 91 
 92     def SaveHtml(self,fileName,htmlUrl):
 93         f = urllib2.urlopen(htmlUrl)
 94         html = f.read()
 95         with open(fileName,"wb") as code:
 96             code.write(html)#.decode('utf-8')
 97                     
 98     def GetSearchResult(self):
 99         doc = urllib2.urlopen(self.url)
100         soup = BeautifulSoup(doc)
101         soup.originalEncoding
102         tag = soup.findAll(self.key1,{self.key2:self.key3})
103         return tag
104 
105     def SplitString(self,source,sep):
106         return source.strip().split('/')
107 
108     def CreateDir(self):
109         if not os.path.exists(self.dir):
110             os.makedirs(os.path.join(self.dir))
if __name__ == '__main__':
    # Script entry point: save the index page, download every page it links
    # to, then rewrite the css/js references of the saved files.
    base_url = 'http://docs.python.org/2/library/'
    static_root = 'http://docs.python.org/2/'
    out_dir = 'E:/demo/PythonLib1/'

    # Select <a class="reference internal"> tags, read their 'href', and
    # pass 'p' as the parent-tag discriminator.
    bs = BookSave(out_dir, base_url, static_root,
                  'p', 'href', 'a', 'class', 'reference internal')
    bs.CreateDir()

    # Save the index page itself before following its links.
    bs.SaveHtml(os.path.join(out_dir, 'index.html'), base_url + 'index.html')

    bs.DownLoadHtml(bs.GetSearchResult())
    bs.AddUrl()

 

posted on 2014-03-19 17:56  pyrebot  阅读(485)  评论(0)  编辑  收藏  举报