# python re
# Fetch one courseware page, strip its anchor tags, print the cleaned HTML,
# then print the text that directly follows every <br> inside the <p>
# elements of the page's nested layout tables.
import re

import requests
from lxml import etree

url = "http://dec3.jlu.edu.cn/webcourse/t000039/xinshiyeyingyu1/xsyyy1/unit1/a/lp.htm"

# Without a timeout, requests waits indefinitely on an unresponsive server.
html = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently scraping an error page.
html.raise_for_status()

# Drop opening and closing anchor tags (any letter case) while keeping the
# text between them. The \b stops the pattern from also eating other tags
# whose names merely start with "a" (<area>, <abbr>, <address>, ...).
dr = re.compile(r'</?a\b[^>]*>', re.I)
dd = dr.sub('', html.text)
# Single-argument call form of print works on both Python 2 and Python 3.
print(dd)

selector = etree.HTML(dd)
# Absolute path to the paragraphs inside the two levels of layout tables.
content = selector.xpath('/html/body/table/tr/td/table/tr/td/p')
for each in content:
    for br in each.xpath('br'):
        # .tail is the text between this <br> and the next element; lxml
        # reports it as None when there is none, so skip those rather than
        # printing the literal string "None".
        if br.tail is not None:
            print(br.tail)