# Python: strip <a> tags with re, then print the text that follows each <br> (lxml)

from lxml import etree
import requests
import re

# Script: download one course page, remove its anchor tags, and print the
# text that follows every <br> inside the lesson paragraphs.
url = "http://dec3.jlu.edu.cn/webcourse/t000039/xinshiyeyingyu1/xsyyy1/unit1/a/lp.htm"

# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops us from parsing an HTTP error page.
html = requests.get(url, timeout=30)
html.raise_for_status()
# NOTE(review): html.text is decoded with whatever charset requests infers
# from the response headers; if the output looks garbled, confirm the page's
# real encoding and set html.encoding before reading html.text.

# Remove opening and closing anchor tags only. The previous pattern
# '<[a][^>]+>' also swallowed any tag whose name merely starts with "a"
# (<abbr ...>, <area ...>, <address ...>) and left </a>, bare <a> and
# uppercase <A ...> in place.
dr = re.compile(r'</?a\b[^>]*>', re.I)
dd = dr.sub('', html.text)
print(dd)
selector = etree.HTML(dd)

# Paragraphs nested two tables deep in the page body.
content = selector.xpath('/html/body/table/tr/td/table/tr/td/p')
for each in content:
    for br in each.xpath('br'):
        # .tail is the text between this <br> and the next element; it is
        # None when nothing follows, which would otherwise print "None".
        if br.tail is not None:
            print(br.tail)

 

# Posted 2016-04-26 23:30 by lianhuaren (views: 67, comments: 0)