# Exercise: parse an HTML file and store its links in a dict (blog post, category: HTMLParser / python / exercises, posted 2013-11-11 12:06)
#coding:utf-8
from HTMLParser import HTMLParser
import pprint
class myhtml(HTMLParser):
    """Collect the links of an HTML document into a dictionary.

    After ``feed()``/``close()``, ``self.d`` maps every non-blank text chunk
    found inside an ``<a>`` element (with surrounding whitespace stripped) to
    that element's ``href`` value, or to ``None`` when the anchor has no
    ``href`` attribute.  Declarations, start tags, end tags and comments are
    echoed to stdout as they are encountered.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.d = {}        # link text -> href
        self.flag = None   # 'a' while the parser is inside an <a> element
        self.value = None  # href of the <a> element currently open

    def handle_decl(self, declaration):
        """Echo an HTML declaration (e.g. the DOCTYPE)."""
        print('Declaration: %s' % declaration)

    def handle_starttag(self, tag, attrs):
        """Echo the start tag and, for ``<a>``, remember its ``href``.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        HTMLParser.
        """
        print('Start tag: %s' % tag)
        if tag != 'a':
            return
        # Mark that we are inside a link so handle_data() knows the text
        # it receives is link text.
        self.flag = 'a'
        # Forget the previous link's URL; otherwise an <a> without href
        # would silently be recorded with the URL of the link before it.
        self.value = None
        for key, value in attrs:
            if key == 'href':
                self.value = value

    def handle_endtag(self, tag):
        """Echo the end tag and leave "inside a link" mode on ``</a>``."""
        print('End tag: %s' % tag)
        if tag == 'a':
            # Without this reset every piece of text following the first
            # link would be stored in the dictionary as if it were link text.
            self.flag = None
            self.value = None

    def handle_data(self, data):
        """Record non-blank text that appears inside an ``<a>`` element."""
        text = data.strip()
        # Only text inside a link is of interest; without the flag check the
        # dictionary would also gain entries such as 'test': None.
        if self.flag != 'a' or not text:
            return
        self.d[text] = self.value
        # Decode byte strings only: calling .decode() on text that is already
        # unicode fails for non-ASCII characters on Python 2.
        if isinstance(data, bytes):
            data = data.decode('utf-8')
        print(data)
if __name__ == '__main__':
    # Demo document: a DOCTYPE, a comment, a title and four links whose tag
    # and attribute names are written in different letter cases.
    sample_html = '''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">\
<html><head><!--insert javaScript here!--><title>test</title>
<body>
<a href="http: //www.163.com">163.com</a>
<a href="www.google.com"> goolge.com </a>\
<A Href="www.pythonclub.org">PythonClub </a>
<A HREF='www.sina.com.cn'> sina </a>
</body></html>'''
    link_parser = myhtml()
    link_parser.feed(sample_html)
    link_parser.close()
    # Show the collected {link text: href} mapping.
    print(link_parser.d)
# Result (console output of running the script above):
# Declaration: DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"
# Start tag: html
# Start tag: head
# Comment: insert javaScript here!
# Start tag: title
# End tag: title
# Start tag: body
# Start tag: a
# 163.com
# End tag: a
# Start tag: a
# goolge.com
# End tag: a
# Start tag: a
# PythonClub
# End tag: a
# Start tag: a
# sina
# End tag: a
# End tag: body
# End tag: html
# {'goolge.com': 'www.google.com', 'PythonClub': 'www.pythonclub.org', '163.com': 'http: //www.163.com', 'sina': 'www.sina.com.cn'}
# Copyright notice: this article is the blogger's original work and may not be reproduced without the blogger's permission.