# Small exercise: parsing HTML with SGMLParser. (Blog post, category: HTMLParser; posted 2013-11-12 15:52; 472 views, 0 comments.)
#coding:utf-8
"""Collect the text of every <p> found inside <div class='entry-content'>.

The parser is written against the ``sgmllib.SGMLParser`` callback protocol
(``start_<tag>(attrs)`` / ``end_<tag>()``).  ``sgmllib`` only exists on
Python 2, so a small ``html.parser``-based stand-in is used when it is
missing.
"""

try:
    from sgmllib import SGMLParser  # Python 2 only; removed in Python 3.0
except ImportError:
    from html.parser import HTMLParser

    class SGMLParser(HTMLParser):
        """Minimal stand-in for ``sgmllib.SGMLParser`` on Python 3.

        Only the behaviour GetIdList relies on is reproduced: start tags are
        dispatched to ``start_<tag>(attrs)``, end tags to ``end_<tag>()``,
        and a stack of open handled tags is kept so that an end tag also
        closes any handled tags still open inside it, as sgmllib does.
        """

        def reset(self):
            # Tags that had a start_<tag> handler and are not closed yet.
            self._open_tags = []
            HTMLParser.reset(self)

        def handle_starttag(self, tag, attrs):
            handler = getattr(self, "start_" + tag, None)
            if handler is None:
                return  # tags without a handler are ignored
            self._open_tags.append(tag)
            handler(attrs)

        def handle_endtag(self, tag):
            if tag not in self._open_tags:
                return  # unbalanced or unhandled end tag
            # Unwind down to (and including) the innermost matching tag.
            while self._open_tags:
                closed = self._open_tags.pop()
                handler = getattr(self, "end_" + closed, None)
                if handler is not None:
                    handler()
                if closed == tag:
                    break


class GetIdList(SGMLParser):
    """Gather paragraph text located inside ``<div class='entry-content'>``.

    Attributes:
        IDlist: text chunks collected so far, in document order.
        flag: True while the parser is inside an entry-content div.
        getdata: True while inside a <p> that should be recorded.
        verbatim: number of nested <div> levels opened inside the container.
    """

    def reset(self):
        """Clear all collected state, then reset the underlying parser."""
        self.IDlist = []
        self.flag = False
        self.getdata = False
        self.verbatim = 0
        SGMLParser.reset(self)

    def start_div(self, attrs):
        """Handle <div>: track nesting depth and detect the container."""
        if self.flag:
            # Already inside the container, so this is a child div.
            self.verbatim += 1
        for key, value in attrs:
            if key == "class" and value == "entry-content":
                # Entered <div class='entry-content'>.
                self.flag = True

    def end_div(self):
        """Handle </div>: leave the container or step out of a child div."""
        if self.verbatim == 0:
            # No child div is open, so this closes the container itself.
            self.flag = False
        if self.flag:
            # Still inside the container: a child div was closed.
            self.verbatim -= 1

    def start_p(self, attrs):
        """Handle <p>: start recording text only inside the container."""
        if not self.flag:
            return
        self.getdata = True

    def end_p(self):
        """Handle </p>: stop recording text."""
        if self.getdata:
            self.getdata = False

    def handle_data(self, text):
        """Store a chunk of text when it belongs to a recorded <p>."""
        if self.getdata:
            self.IDlist.append(text)

    def printID(self):
        """Print every collected text chunk, one per line."""
        for text in self.IDlist:
            # Only byte strings need decoding; text that is already unicode
            # (Python 3 str, Python 2 unicode) must not be decoded again.
            if isinstance(text, bytes):
                text = text.decode("utf-8")
            print(text)


if __name__ == '__main__':
    the_page = (
        "<html> <head> <title>test</title> </head> <body> <h1>title</h1> "
        "<div class='entry-content'> <p>感兴趣内容1</p> <p>感兴趣内容2</p> …… <p>感兴趣内容n</p> "
        "<div class='entry-content'>我是来捣乱的2<div class= 'ooxx'>我是来捣乱的3</div></div> </div> "
        "<div class='content'> <p>内容1</p> <p>内容2</p> …… <p>内容n</p> </div> "
        "</body> </html> "
    )
    gid = GetIdList()
    gid.feed(the_page)
    # Close before printing so any data still buffered by the parser is
    # processed and included in the output.
    gid.close()
    gid.printID()
# Output:
# 感兴趣内容1
# 感兴趣内容2
# 感兴趣内容n