Python 实践 - 抓取网页中的图片和数据
主要用到了 sgmllib 模块中的 SGMLParser 类和 urllib 模块。
#!/usr/bin/env python
# getimg.py
import sys,os
from sgmllib import SGMLParser
# Encoding used to re-encode captured titles for local output.
# NOTE: this name shadows the builtin ``type``; it is kept because
# URLLister.handle_data reads it by this name.
# sys.getfilesystemencoding() may return None on some Python 2 Unix setups,
# which would make ``.encode(type)`` raise TypeError, so fall back to UTF-8.
type = sys.getfilesystemencoding() or 'UTF-8'
class URLLister(SGMLParser):
    """Collect post titles and image URLs from an HTML page.

    After ``feed()`` has been called:

    * ``titles`` holds every text chunk seen while inside a
      ``<div class="posttitle">`` element, decoded from UTF-8 and re-encoded
      to the module-level ``type`` encoding.
    * ``imgs`` holds the ``src`` attribute value of every ``<img>`` tag, in
      document order.
    """

    def reset(self):
        """Reset the base parser and clear all collected state."""
        SGMLParser.reset(self)
        # True while the parser is inside a <div class="posttitle"> element.
        self.is_Contant = False
        self.titles = []
        self.imgs = []

    def start_div(self, attrs):
        """Start capturing text if the div's first class is 'posttitle'.

        ``attrs`` is the list of ``(name, value)`` pairs for the tag.
        """
        for name, value in attrs:
            if name == 'class':
                # Only the first class attribute is examined.
                if value == 'posttitle':
                    self.is_Contant = True
                break

    def end_div(self):
        """Stop capturing text.

        Any closing ``</div>`` ends the capture, including one that belongs
        to a div nested inside the title div.
        """
        self.is_Contant = False

    def start_img(self, attrs):
        """Record the ``src`` value(s) of an ``<img>`` tag."""
        self.imgs.extend(value for name, value in attrs if name == 'src')

    def handle_data(self, text):
        """Store a text chunk when it appears inside a title div."""
        if self.is_Contant:
            # 'replace' keeps one malformed byte, or one character the target
            # encoding cannot represent, from aborting the whole parse.
            text = text.decode('UTF-8', 'replace').encode(type, 'replace')
            self.titles.append(text)
# Script entry point: fetch the page, write the captured titles to
# result.txt in the current directory, and download every image found on the
# page into d:/tmp/.
if __name__ == "__main__":
    import urllib
    import urlparse

    u = 'http://www.cnblogs.com'
    usock = urllib.urlopen(u)
    parser = URLLister()
    try:
        parser.feed(usock.read())
    finally:
        # Release the connection even if reading or parsing fails.
        usock.close()
    parser.close()

    # Binary mode: the line terminator is written explicitly as '\r\n', and
    # text mode on Windows would expand its '\n' again, giving '\r\r\n'.
    f = open('result.txt', 'wb')
    try:
        for title in parser.titles:
            print(title)
            f.write(title + '\r\n')
    finally:
        f.close()

    img_dir = 'd:/tmp/'
    if not os.path.isdir(img_dir):
        os.makedirs(img_dir)

    for img in parser.imgs:
        # urljoin resolves relative, root-relative, protocol-relative and
        # absolute (http/https) references against the page URL; plain
        # concatenation dropped the '/' for relative paths and mangled
        # anything not starting with 'http://'.
        url = urlparse.urljoin(u, img)
        # Name the local file after the last path segment only, so a query
        # string never ends up in the file name.
        name = urlparse.urlsplit(url).path.split('/')[-1]
        if not name:
            # The URL ends in '/', so there is no usable file name.
            continue
        urllib.urlretrieve(url, img_dir + name)
# getimg.py
import sys,os
from sgmllib import SGMLParser
# Encoding used to re-encode captured titles for local output.
# NOTE: this name shadows the builtin ``type``; it is kept because
# URLLister.handle_data reads it by this name.
# sys.getfilesystemencoding() may return None on some Python 2 Unix setups,
# which would make ``.encode(type)`` raise TypeError, so fall back to UTF-8.
type = sys.getfilesystemencoding() or 'UTF-8'
class URLLister(SGMLParser):
    """Collect post titles and image URLs from an HTML page.

    After ``feed()`` has been called:

    * ``titles`` holds every text chunk seen while inside a
      ``<div class="posttitle">`` element, decoded from UTF-8 and re-encoded
      to the module-level ``type`` encoding.
    * ``imgs`` holds the ``src`` attribute value of every ``<img>`` tag, in
      document order.
    """

    def reset(self):
        """Reset the base parser and clear all collected state."""
        SGMLParser.reset(self)
        # True while the parser is inside a <div class="posttitle"> element.
        self.is_Contant = False
        self.titles = []
        self.imgs = []

    def start_div(self, attrs):
        """Start capturing text if the div's first class is 'posttitle'.

        ``attrs`` is the list of ``(name, value)`` pairs for the tag.
        """
        for name, value in attrs:
            if name == 'class':
                # Only the first class attribute is examined.
                if value == 'posttitle':
                    self.is_Contant = True
                break

    def end_div(self):
        """Stop capturing text.

        Any closing ``</div>`` ends the capture, including one that belongs
        to a div nested inside the title div.
        """
        self.is_Contant = False

    def start_img(self, attrs):
        """Record the ``src`` value(s) of an ``<img>`` tag."""
        self.imgs.extend(value for name, value in attrs if name == 'src')

    def handle_data(self, text):
        """Store a text chunk when it appears inside a title div."""
        if self.is_Contant:
            # 'replace' keeps one malformed byte, or one character the target
            # encoding cannot represent, from aborting the whole parse.
            text = text.decode('UTF-8', 'replace').encode(type, 'replace')
            self.titles.append(text)
# Script entry point: fetch the page, write the captured titles to
# result.txt in the current directory, and download every image found on the
# page into d:/tmp/.
if __name__ == "__main__":
    import urllib
    import urlparse

    u = 'http://www.cnblogs.com'
    usock = urllib.urlopen(u)
    parser = URLLister()
    try:
        parser.feed(usock.read())
    finally:
        # Release the connection even if reading or parsing fails.
        usock.close()
    parser.close()

    # Binary mode: the line terminator is written explicitly as '\r\n', and
    # text mode on Windows would expand its '\n' again, giving '\r\r\n'.
    f = open('result.txt', 'wb')
    try:
        for title in parser.titles:
            print(title)
            f.write(title + '\r\n')
    finally:
        f.close()

    img_dir = 'd:/tmp/'
    if not os.path.isdir(img_dir):
        os.makedirs(img_dir)

    for img in parser.imgs:
        # urljoin resolves relative, root-relative, protocol-relative and
        # absolute (http/https) references against the page URL; plain
        # concatenation dropped the '/' for relative paths and mangled
        # anything not starting with 'http://'.
        url = urlparse.urljoin(u, img)
        # Name the local file after the last path segment only, so a query
        # string never ends up in the file name.
        name = urlparse.urlsplit(url).path.split('/')[-1]
        if not name:
            # The URL ends in '/', so there is no usable file name.
            continue
        urllib.urlretrieve(url, img_dir + name)
上面的代码将抓取到的标题保存到当前目录下的 result.txt 文件中，
并将所有图片保存到 d:/tmp/ 目录。