crawl——bs4的搜索文档树
概要
代码
# Searching the parse tree with bs4 (BeautifulSoup).
#
# find() vs find_all(): same call signature; find() returns only the first
# match, find_all() returns every match as a list.
#
# Five kinds of filters can be passed: a string, a regular expression, a
# list, the value True, or a function.
# String filters: name=<tag name>, class_=<CSS class>, id=<id>, href=<href>.
# Any Tag object returned is itself searchable — you can chain .find() calls
# or use attribute access (e.g. soup.body.p) to keep walking the tree.

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" id='id_pp' name='lqz'>asdfasdf<b>asdfas</b><span>span<b>bbb</b></span></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister1" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# NOTE: the 'lxml' parser is a third-party dependency of bs4.
soup = BeautifulSoup(html_doc, 'lxml')

# --- String filters: match on tag name / attribute values -------------------
# res = soup.find(name='body').p
# res = soup.find(name='body').find(name='p')
# print(type(res))
# print(res)
# res = soup.body.find(id='link2')
# res = soup.body.find(href='http://example.com/lacie')
# res = soup.body.find(name='a', href='http://example.com/lacie')
# print(res)

# --- List filter: match any value in the list -------------------------------
# res = soup.find_all(name=['a', 'p'])
# res = soup.find_all(id=['link2', 'link3'], class_='sister')
# print(res)

# --- Regular-expression filter ----------------------------------------------
# import re
# res = soup.find_all(name=re.compile('^b'))
# res = soup.find_all(class_=re.compile('^s'), name='a')
# print(res)

# --- True filter: match any tag that HAS the attribute ----------------------
# res = soup.find_all(name=True)
# res = soup.find_all(class_=True)
# res = soup.find_all(id=True)
# res = soup.find_all(href=True)
# for i in res:
#     url = i['href']
#     print(url)
# print(res)

# --- Function filter (less common): tag -> bool predicate -------------------
# def aaa(tag):
#     # return tag.has_attr('class') and not tag.has_attr('id')
#     return tag.has_attr('class') and tag.has_attr('id')
#
# res = soup.find_all(name=aaa)
# print(res)