Python BeautifulSoup4 解析网页
# Beautiful Soup 4 walkthrough: parse one small HTML document, then print the
# results of tree navigation and of the different find_all() filter styles.
import re

# NOTE(review): the original script called `BS` without importing it anywhere
# in view. It is bound here to bs4's BeautifulSoup, which matches the
# BS(markup, 'html.parser') call below - confirm this is the intended class.
from bs4 import BeautifulSoup as BS


def func(tag):
    """Filter callable for find_all(), used like a predicate for filter().

    Args:
        tag: the element find_all() is currently testing.

    Returns:
        The tag itself (truthy) when it has a ``class`` attribute and its
        name starts with ``a``; otherwise ``None``.
    """
    if tag.has_attr('class') and tag.name.startswith('a'):
        return tag
    return None


# Sample document. Every backslash-newline inside the literal is a line
# continuation, so the value is a single line of markup (same bytes as the
# original literal); the backslashes only keep the source readable.
html = """ <html><head><title>The Dormouse's story</title></head> <body> \
<p class="title" name="dromouse"><b>The Dormouse's story</b></p> \
<p class="story">Once upon a time there were three little sisters; and their names were \
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>\
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and \
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a> \
and they lived at the bottom of a well.</p> \
<p class="story">...</p></body></html> """

# Parsed once: nothing below mutates the tree, so every section reuses it.
soup = BS(html, 'html.parser')

# --- .text versus .string ---------------------------------------------------
for link in soup.find_all('a'):
    # .text is a plain str and leaves out the content of HTML comments.
    print('i.text:', link.text)
    # .string is a NavigableString object, so the comment inside link1
    # is shown as well.
    print('i.string:', link.string)

# --- children and descendants -----------------------------------------------
print('soup.head.contents:', soup.head.contents, type(soup.head.contents))
print('soup.head.children:', soup.head.children, type(soup.head.children))
# .contents is a list of the direct children ...
print('soup.body.contents:', soup.body.contents)
# ... while .children is an iterator over the same nodes.
print('soup.body.children:', soup.body.children)
for child in soup.body.children:
    print(child)

print('子孙节点 都显示出来')
# .descendants walks every nested node, not only the direct children.
for node in soup.body.descendants:
    print(node)

# --- strings ------------------------------------------------------------------
print('soup.body.string:', soup.body.string)
print('soup.body.strings:', soup.body.strings)
# stripped_strings yields the text pieces with surrounding whitespace removed.
print('soup.body.stripped_strings:', soup.body.stripped_strings)
print('去掉空格的body子元素:')
for text_piece in soup.body.stripped_strings:
    print(text_piece)

# --- parents, siblings, elements ------------------------------------------------
print('soup.a.parent:', soup.a.parent)
# Text nodes (including bare whitespace/newlines) count as siblings too, so
# the previous/next sibling is not necessarily a tag.
print('soup.a.next_sibling:', soup.a.next_sibling)
print('soup.a.previous_sibling:', soup.a.previous_sibling)
# next_element is the next node in document order; it need not be a sibling.
print('soup.a.next_element:', soup.a.next_element)
print('soup.a.previous_element:', soup.a.previous_element)
print('打印所有后面的同级节点:\n')
for sibling in soup.a.next_siblings:
    print(sibling)
# Label corrected: this prints the second entry of next_elements, not
# soup.a.next_element (which was already printed above under that label).
print('list(soup.a.next_elements)[1]:', list(soup.a.next_elements)[1])

# --- find_all by name / regex / list / function ---------------------------------
print('***********find_all*****')
print(soup.find_all('a'))
print('引入正则表达式:')
# A compiled pattern passed positionally is matched against tag names.
print(soup.find_all(re.compile(r'^title')))
print('列表的方式匹配:')
print(soup.find_all(['a', 'b']))
print('函数的方式匹配,类似filter')
print(soup.find_all(func))

# --- find_all by attribute value ------------------------------------------------
print('按属性值查找:')
print(soup.find_all(id='link1'))
print(soup.find_all('a', id='link1'))
# Every find_all() call returns a list, even for a single match.
print(soup.find_all(id='link2', href=re.compile(r'laci')))
# `class` is a Python keyword, hence the trailing underscore in class_.
print(soup.find_all(class_='story'))
print(soup.find_all(attrs={'class': 'sister'}))

# --- find_all by text content ---------------------------------------------------
# `string=` is the current keyword for matching text content; `text=` is its
# legacy spelling. The printed label keeps the original wording.
print('按元素内容查找text参数:')
print(soup.find_all(string='Tillie'))
# These calls return the matching strings themselves, not the enclosing tags.
print(soup.find_all(string=['Tillie', 'Lacie']))
print(soup.find_all(string=re.compile(r'ormous')))
print('通过内容元素 找到上级元素')
# Walk upward from a matched string to the elements that contain it.
print(soup.find_all(string=re.compile(r'ormous'))[1].parent.parent)

# --- limiting and scoping the search --------------------------------------------
# limit caps the number of results returned.
print('limit:')
print(soup.find_all('a', limit=2))
print('只在子节点查找:')
# recursive=False inspects direct children only. The <a> tags live inside a
# <p>, not directly under <body>, so the first search finds nothing.
print(soup.body.find_all('a', limit=2, recursive=False))
print(soup.body.find_all(class_='story', recursive=False))