python3爬虫(find_all用法等)
#!/usr/bin/env python
# -*- coding:UTF-8 -*-
"""Demo of BeautifulSoup find_all() / select() usage.

Expects two fixture files next to this script:
  read1.html -- the "Dormouse's story" sample document, reproduced below.
  soup.html  -- a second sample document used for the find_all/select demos.
"""
# read1.html contents:
# <html><head><title>The Dormouse's story</title></head>
# <body>
# <p class="title"><b>The Dormouse's story</b></p>
#
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
# <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>
#
# <p class="story">...</p></body></html>

import os
import re

from bs4 import BeautifulSoup
from bs4 import NavigableString


def has_class_no_id(tag):
    """Tag filter: True for tags that have a class attribute but no id."""
    return tag.has_attr('class') and not tag.has_attr('id')


def not_lacie(href):
    """href filter: True for non-empty hrefs that do not contain 'lacie'."""
    return href and not re.compile("lacie").search(href)


def not_tillie(href):
    """href filter: True for non-empty hrefs that do not contain 'tillie'."""
    return href and not re.compile("tillie").search(href)


def not_tillie1(id):
    """id filter: True for non-empty ids that do not contain 'link2'."""
    return id and not re.compile("link2").search(id)


def surrounded_by_strings(tag):
    """Tag filter: True when the tag sits between two NavigableStrings."""
    return (isinstance(tag.next_element, NavigableString)
            and isinstance(tag.previous_element, NavigableString))


curpath = os.path.dirname(os.path.realpath(__file__))
htmlpath = os.path.join(curpath, 'read1.html')

# BUG FIX: the original did requests.get(htmlpath) on a bare filesystem
# path, which raises requests.exceptions.MissingSchema — requests only
# fetches URLs. Read the local file directly instead.
with open(htmlpath, 'r', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), features="html.parser")

# Renamed loop variable: the original used 'str', shadowing the builtin.
for text in soup.stripped_strings:
    print(repr(text))

links = soup.find_all(class_="sister")

# BUG FIX: the original called .parents and .next_sibling on the ResultSet
# itself; those navigation attributes exist only on individual tags.
for link in links:
    for parent in link.parents:
        if parent is None:
            print(parent)
        else:
            print(parent.name)

for link in links:
    print(link.next_element)
    print(link.next_sibling)
    # BUG FIX: these two were misspelled 'privous_element'/'privous_sibling',
    # which would raise AttributeError on a bs4 Tag.
    print(link.previous_element)
    print(link.previous_sibling)

# Read soup.html once; reuse the content for every parse below.
# (The original passed the same exhausted file object to BeautifulSoup
# twice, so the later parses saw an empty stream.)
with open("soup.html", "r", encoding="utf-8") as file:
    content = file.read()
soup = BeautifulSoup(content, features="lxml")

# find_all usage: by regex, by name, by list, by function, by True, by href filter.
tags = soup.find_all(re.compile('^b'))
tags = soup.find_all('b')
tags = soup.find_all(['a', 'b'])
tags = soup.find_all(has_class_no_id)
tags = soup.find_all(True)
tags = soup.find_all(href=not_lacie)
for tag in tags:
    print(tag.name)

tags = soup.find_all(id=not_tillie1)
for tag in tags:
    print(tag)

tags = soup.find_all(attrs={"id": "link3"})
for tag in tags:
    print(tag)

# recursive=False restricts the search to direct children only.
soup.find_all(recursive=False)

# CSS selector equivalents.
tags = soup.select("body a")
tags = soup.select("p > a")
tags = soup.select("p > #link1")
tags = soup.select("html head title")
tags = soup.select(".sister")
tags = soup.select("[class~=sister]")
tags = soup.select("#link1 + .sister")
tags = soup.select("#link1")
tags = soup.select("a#link1")
tags = soup.select("a[href]")
tags = soup.select('a[href^="http://example"]')
tags = soup.select('a[href$="tillie"]')
tags = soup.select('a[href*=".com/el"]')
for tag in tags:
    print(tag)

# Re-parse the same content with html.parser and inspect the tree objects.
soup = BeautifulSoup(content, features="html.parser")
print(soup.prettify())
print(type(soup))
print(type(soup.title))
print(type(soup.title.string))
print(type(soup.b.string))
print(soup.head.name)
print(soup.title.name)
print(soup.a.name)
print(soup.name)

tag = soup.a
print(tag["href"])
print(tag.string)
print(tag["class"])
print(tag.attrs)
print(soup.title.string)
print(soup.title.name)
print(soup.p.attrs)
print(soup.a.attrs)
print(soup.a["class"])
Stay Hungry Stay Foolish
posted on 2019-01-16 01:01 Anderson_An 阅读(1568) 评论(0) 编辑 收藏 举报