# BeautifulSoup
# Fetch one Tieba thread page and print its <title> element.
try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: urllib.request offers the same urlopen() API; keep the
    # legacy module name so the rest of the file can still use `urllib2`.
    import urllib.request as urllib2

from bs4 import BeautifulSoup

html = urllib2.urlopen('http://tieba.baidu.com/p/5058456989')
try:
    # Name the parser explicitly; omitting "html.parser" makes bs4 emit a
    # warning.
    bsobj = BeautifulSoup(html.read(), "html.parser")
finally:
    # Release the connection even if reading or parsing fails.
    html.close()
print(bsobj.title)
# Separator line printed between posts.
underline = '-' * 100


def get_title(url):
    """Fetch *url* and return the page parsed as a BeautifulSoup tree.

    Despite the name, the whole parsed document is returned, not only its
    <title> element; callers search the tree themselves.

    Args:
        url: Address of the page to download.

    Returns:
        The BeautifulSoup object built from the response body with the
        "html.parser" backend.

    Raises:
        Any exception raised by ``urllib2.urlopen`` or by parsing is
        propagated unchanged; this function never returns None.
    """
    response = urllib2.urlopen(url)
    try:
        return BeautifulSoup(response.read(), "html.parser")
    finally:
        # Always release the connection, including on read/parse errors.
        response.close()


url = 'http://tieba.baidu.com/p/4420237089?see_lz=1'
title = get_title(url)

print(underline)
# Post bodies; the trailing space inside the class string is kept exactly as
# the original lookup used it.
tmp = title.findAll("div", {"class": "d_post_content j_d_post_content "})
vmp = title.findAll("span", {"class": "tail-info"})
# Pair each post body with every third "tail-info" span, starting at index 1
# and excluding the last one.
# NOTE(review): presumably this picks one metadata span per post -- verify
# against the page markup.
for val, f in zip(tmp, vmp[1:-1:3]):
    print(val.get_text())
    print('%s %s' % (f.get_text(), underline))