# 用 Beautiful Soup 解析 HTML 源码
# Author: xiaodeng
# Python 3
# Parsing HTML source with Beautiful Soup: a numbered tour of common calls.
#
# Every demonstration ``print`` is left commented out so the script runs
# silently; uncomment the examples you want to try.
import re

from bs4 import BeautifulSoup

# Sample document shared by most of the examples below.
# Note: the anchor with id="link3" appears five times, so id-based lookups
# return several tags rather than one.
html_doc = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<b>测试</b>
<p class="title">
<b>The Dormouse's story</b>
</p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, "html.parser")

# 1. Pretty-print the document (unclosed tags are closed in the output).
# print(soup.prettify())

# 2. Get the page title text.
# print(soup.title.string)
# Related usage:
#   print(soup.title.name)   # -> title
#   print(soup.title.title)  # -> None: attribute access is a shortcut for
#                            #    find(), and <title> has no nested <title>.

# 3. Find every <p> tag (with its contents); the result behaves like a list.
# print(soup.find_all("p"))
# print(soup.find_all("a"))
# print(soup.find_all("title"))
# To get only the first <p> tag: print(soup.p)

# 4. Read the value of the "class" attribute of the first <p> tag.
# print(soup.p["class"])

# 5. Find every tag (with its contents) whose id is "link3".
# print(soup.find_all(id="link3"))

# 6. Collect the link target of every <a> tag in the document.
# for key in soup.find_all("a"):
#     print(key.get("href"))

# 7. Extract all of the text content of the document.
# print(soup.get_text())

# 8. Inspect the data type of a tag.
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', "html.parser")
tag = soup.b
# print(type(tag))  # <class 'bs4.element.Tag'>

# 9. Every tag has a name, available through ``.name``.
# print(soup.b.name)

# 10. Working with tag attributes.
# A tag may carry any number of attributes. The tag <b class="boldest"> has
# one attribute, "class", whose value is "boldest". Attributes are accessed
# the same way as dictionary items.
# print(soup.b["class"])

# 11. Deleting a tag attribute (and similar mutations).
# del tag['class']

# 12. Regular expressions.
# The example below finds every tag whose name starts with "b", which means
# both <body> and <b> are matched.
# soup = BeautifulSoup(html_doc, "html.parser")
# for tag in soup.find_all(re.compile(r"^b")):
#     print(tag.name)
soup = BeautifulSoup(html_doc, "html.parser")
# Links whose href contains "tillie":
# print(soup.find_all(href=re.compile("tillie")))

# 13. Matching against a list (here: every <a> and every <p> tag).
soup = BeautifulSoup(html_doc, "html.parser")
# print(soup.find_all(["a", "p"]))

# 14. Find <a> tags (with their contents) whose id is "link3".
# Signature: find_all(name, attrs, recursive, string, limit, **kwargs)
# find_all() searches the descendants of the current tag and keeps those
# that satisfy the given filters. A few examples follow.
# print(soup.find_all("a", id="link3"))

# 15. Find <a> tags whose CSS class is "sister".
# print(soup.find_all("a", class_="sister"))

# 16. The ``string`` argument searches the document's strings instead of its
# tags. Like ``name``, it accepts a string, a regular expression, a list, or
# True. (Before Beautiful Soup 4.4.0 this argument was called ``text``.)
# print(soup.find_all(string="Elsie"))
# print(soup.find_all(string=["Tillie", "Elsie", "Lacie"]))

# 17. Limit the number of results.
# print(soup.find_all("a", limit=2))

# 18. Pass recursive=False to search only the direct children of a tag.
doc = """<html>
<head>
<title>The Dormouse's story</title>
</head>
</html>
"""
soup = BeautifulSoup(doc, "html.parser")
# <title> is nested under <html>/<head>, so it is not a direct child of the
# document and this prints an empty list.
# print(soup.find_all("title", recursive=False))

# 19. TODO: look into finding parent nodes, sibling nodes, and so on.

# 20. Find the <title> tag with a CSS selector.
soup = BeautifulSoup(html_doc, "html.parser")
# print(soup.select("title"))

# 21. Find the direct child tags of a given tag.
# Author's note: keep a space on both sides of ">"; writing "p>b" was
# rejected when these notes were made (this may depend on the bs4 version).
# print(soup.select("p > b"))  # <b> tags that are direct children of a <p>
# print(soup.select("body > b"))

# 22. Find tags by CSS class name (class="sister").
result = soup.select(".sister")
# print(result)

# 23. Find a tag by its id, e.g. soup.select("#link1").
result = soup.select("#link1")
# print(result)
# -> [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
# 无语言基础,自学 Python 所做的各种笔记,欢迎大牛指点。