豆瓣电影简易爬取
# coding=utf-8
"""Minimal scraper for the Douban movie front page.

Downloads a page, collects every ``<li>`` start tag that carries a non-empty
``data-title`` attribute, and prints the collected titles and ratings as
JSON.  The module runs on both Python 2 and Python 3.
"""
import json
from contextlib import closing

try:  # Python 3 module names
    from html.parser import HTMLParser
    from urllib.request import urlopen
except ImportError:  # Python 2 fallback
    from HTMLParser import HTMLParser
    from urllib2 import urlopen


def _first_attr(attrs, name):
    """Return the value of the first attribute called *name*, or None.

    *attrs* is the list of ``(name, value)`` pairs that ``HTMLParser`` passes
    to ``handle_starttag``.  The first match wins when a name is repeated.
    """
    for key, value in attrs:
        if key == name:
            return value
    return None


class HttpParser(HTMLParser):
    """HTML parser that records movies found in ``<li data-title=...>`` tags.

    Attributes:
        move: list of ``{'title': ..., 'rate': ...}`` dicts in document
            order.  ``rate`` is None when the tag has no ``data-rate``
            attribute.
    """

    def __init__(self):
        # Call the base initializer explicitly rather than via super():
        # on Python 2, HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        self.move = []

    def handle_starttag(self, tag, attrs):
        """Record the movie described by an ``<li>`` start tag, if any.

        Tags other than ``li`` and ``li`` tags whose ``data-title`` is
        missing or empty are ignored.
        """
        if tag != 'li':
            return
        title = _first_attr(attrs, 'data-title')
        if not title:
            return
        self.move.append({
            'title': title,
            'rate': _first_attr(attrs, 'data-rate'),
        })


def get(url):
    """Fetch *url* and return the movies found in its HTML.

    Args:
        url: address of the page to download.

    Returns:
        A list of ``{'title': ..., 'rate': ...}`` dicts (see ``HttpParser``).

    Raises:
        Whatever ``urlopen`` raises when the request fails.
    """
    # closing() guarantees the response is released even if read() raises.
    with closing(urlopen(url)) as response:
        raw = response.read()

    # The response body is bytes; the parser needs text.  Assumes the page is
    # UTF-8 encoded (TODO confirm for other sites); undecodable bytes are
    # replaced instead of aborting the whole scrape.
    parser = HttpParser()
    parser.feed(raw.decode('utf-8', 'replace'))
    return parser.move


def main():
    """Scrape the Douban movie front page and print the result as JSON."""
    movies = get('https://movie.douban.com/')
    text = json.dumps(movies, ensure_ascii=False, indent=2)
    if not isinstance(text, str):
        # Python 2 only: json.dumps returned ``unicode``; encode it so that
        # printing also works when stdout is a pipe.
        text = text.encode('utf-8')
    print(text)


if __name__ == '__main__':
    main()