Python模块之HTMLParser:抓取页面上的所有URL链接
# -*- coding: utf-8 -*-
# Written for Python 2.7; the fallback imports below also let it run on Python 3.
# Author: xiaodeng
# HTMLParser module demo: print every URL linked from a page.

from contextlib import closing

try:  # Python 3 module layout
    from html.parser import HTMLParser as _BaseParser
    from urllib.request import urlopen as _urlopen
except ImportError:  # Python 2 module layout
    from HTMLParser import HTMLParser as _BaseParser
    from urllib import urlopen as _urlopen

# Variant 1 (equivalent to the class below): import the class directly with
# ``from HTMLParser import HTMLParser``, subclass ``HTMLParser`` and add an
# ``__init__`` that only calls ``HTMLParser.__init__(self)``.
#
# Variant 2: subclass the parser and override only ``handle_starttag``.


class MyParser(_BaseParser):
    """HTML parser that prints the ``href`` of each ``<a>`` tag starting with ``http``."""

    def handle_starttag(self, tag, attrs):
        """Print link targets found on an opening ``<a>`` tag.

        This redefines the base-class hook that is called for start tags.

        Args:
            tag: Name of the tag that was opened.
            attrs: Sequence of ``(name, value)`` pairs for the tag's attributes.
        """
        if tag != 'a':
            return
        for name, value in attrs:
            # A value-less attribute (``<a href>``) arrives with value None;
            # guard against it so ``startswith`` is never called on None.
            if name == 'href' and value and value.startswith('http'):
                print(value)


def main(url='http://www.cnblogs.com/'):
    """Download *url* and print every ``http...`` link found in its ``<a>`` tags.

    Args:
        url: Address of the page to scan.
    """
    # ``closing`` releases the connection even if ``read`` raises.
    with closing(_urlopen(url)) as response:
        content = response.read()
    # On Python 3 ``read`` returns bytes while ``feed`` needs text.
    # Assumes the page is UTF-8; undecodable bytes are replaced.
    if not isinstance(content, str):
        content = content.decode('utf-8', 'replace')
    parser = MyParser()
    parser.feed(content)
    parser.close()


if __name__ == '__main__':
    main()

# Sample output:
# http://www.cnblogs.com/Jaryleely/p/careertwo.html
# http://www.cnblogs.com/Jaryleely/
# http://www.cnblogs.com/Jaryleely/
# http://www.cnblogs.com/Jaryleely/p/careertwo.html#commentform
# http://www.cnblogs.com/Jaryleely/p/careertwo.html
# http://www.cnblogs.com/AndroidJotting/p/4983688.html
# http://www.cnblogs.com/AndroidJotting/
# http://www.cnblogs.com/AndroidJotting/
# http://www.cnblogs.com/AndroidJotting/p/4983688.html#commentform
# http://www.cnblogs.com/AndroidJotting/p/4983688.html
# http://www.cnblogs.com/fuly550871915/p/4983682.html
# http://www.cnblogs.com/fuly550871915/
# http://www.cnblogs.com/fuly550871915/
# http://www.cnblogs.com/fuly550871915/p/4983682.html#commentform
# http://www.cnblogs.com/fuly550871915/p/4983682.html
# http://www.cnblogs.com/Ray-liang/p/4983592.html
# http://www.cnblogs.com/Ray-liang/
# http://www.cnblogs.com/Ray-liang/
# http://www.cnblogs.com/Ray-liang/p/4983592.html#commentform
# http://www.cnblogs.com/Ray-liang/p/4983592.html
# .......
无语言基础,自学 Python 所做的各种笔记,欢迎大牛指点。