Some uses of BeautifulSoup
Auto-completing the HTML:
import requests
from bs4 import BeautifulSoup

response = requests.get('https://www.ithome.com/html/it/340684.htm', timeout=9)
soup = BeautifulSoup(response.content, 'lxml')
print(soup.prettify())  # if the HTML is incomplete, the parser completes it automatically
print(soup.title.string)
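To see the completion in isolation, here is a minimal self-contained sketch; the broken HTML fragment is invented for illustration:

from bs4 import BeautifulSoup

# a deliberately incomplete fragment: <html>, <body> and the closing tags are missing
broken = '<p class="demo">hello <b>world'
soup = BeautifulSoup(broken, 'lxml')
print(soup.prettify())
# lxml wraps the fragment in <html><body>...</body></html>
# and closes the dangling <b> and <p> tags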
Finding tags
# basic usage
soup.title         # <title>xxxxxxx</title>
soup.title.string  # xxxxxx
Getting the tag name
# basic usage
soup.title       # <title>xxxxxxx</title>
soup.title.name  # title
Getting attributes
# basic usage
soup.a          # <a>xxxxxxx</a>
soup.a['name']  # value of the a tag's name attribute
Getting the content
soup.title.string  # xxxxxx
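To make the xxxxxx placeholders above concrete, here is a minimal runnable sketch; the HTML string is invented for illustration:

from bs4 import BeautifulSoup

html = ('<html><head><title>Demo page</title></head>'
        '<body><a name="top" href="/index">home</a></body></html>')
soup = BeautifulSoup(html, 'lxml')

print(soup.title)         # <title>Demo page</title>
print(soup.title.name)    # title
print(soup.title.string)  # Demo page
print(soup.a['name'])     # top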
Nested selection
print(soup.head.title.string)
Child nodes
import requests
from bs4 import BeautifulSoup

response = requests.get('https://www.ithome.com/html/it/340684.htm', timeout=9)
soup = BeautifulSoup(response.content, 'lxml')
# print(soup.prettify())
print(soup.div.contents)  # .contents returns the direct children as a list
Or:
import requests
from bs4 import BeautifulSoup

response = requests.get('https://www.ithome.com/html/it/340684.htm', timeout=9)
soup = BeautifulSoup(response.content, 'lxml')
# print(soup.prettify())
a = soup.div.children  # .children is an iterator over the direct children
print(a)
for i, j in enumerate(a):
    print(i, j)
Descendant nodes
import requests
from bs4 import BeautifulSoup

response = requests.get('https://www.ithome.com/html/it/340684.htm', timeout=9)
soup = BeautifulSoup(response.content, 'lxml')
# print(soup.prettify())
a = soup.div.descendants  # .descendants walks children, grandchildren, and so on
print(a)
for i, j in enumerate(a):
    print(i, j)
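The difference between the three traversals is easiest to see on a tiny document; a sketch with invented HTML:

from bs4 import BeautifulSoup

html = '<div><ul><li>one</li><li>two</li></ul></div>'
soup = BeautifulSoup(html, 'lxml')

print(soup.div.contents)           # list of direct children: [<ul>...</ul>]
print(list(soup.div.children))     # same nodes, but from an iterator
print(list(soup.div.descendants))  # <ul>, both <li> tags, and their text nodes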
Getting the parent node
import requests
from bs4 import BeautifulSoup

response = requests.get('https://www.ithome.com/html/it/340684.htm', timeout=9)
soup = BeautifulSoup(response.content, 'lxml')
# print(soup.prettify())
a = soup.div.parent
print(a)
Getting ancestor nodes
import requests
from bs4 import BeautifulSoup

response = requests.get('https://www.ithome.com/html/it/340684.htm', timeout=9)
soup = BeautifulSoup(response.content, 'lxml')
# print(soup.prettify())
a = soup.div.parents  # .parents yields every ancestor up to the document root
for i, j in enumerate(a):
    print(i, j)
Getting sibling nodes
a = soup.div.next_siblings  # the siblings after this node (an iterator)
The siblings before the node:
a = soup.div.previous_siblings  # the siblings before this node (an iterator)
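A self-contained sketch of sibling navigation (HTML invented for illustration); note that in real pages the whitespace between tags shows up as extra text-node siblings:

from bs4 import BeautifulSoup

html = '<body><p>a</p><div>b</div><span>c</span></body>'
soup = BeautifulSoup(html, 'lxml')

print(soup.div.next_sibling)             # the single following sibling: <span>c</span>
print(list(soup.div.next_siblings))      # all following siblings
print(list(soup.div.previous_siblings))  # all preceding siblings, nearest first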
Standard selectors
find_all(name, attrs, recursive, **kwargs) returns a list of every matching element.
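Besides name and attrs, find_all also accepts recursive and limit keyword arguments; a small sketch with invented HTML:

from bs4 import BeautifulSoup

html = '<div><p>top</p><section><p>nested</p></section></div>'
soup = BeautifulSoup(html, 'lxml')

print(soup.div.find_all('p'))                   # both <p> tags (searches all descendants)
print(soup.div.find_all('p', recursive=False))  # direct children only: [<p>top</p>]
print(soup.find_all('p', limit=1))              # stop after the first match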
name
import requests
from bs4 import BeautifulSoup

response = requests.get('https://www.ithome.com/html/it/340684.htm', timeout=9)
soup = BeautifulSoup(response.content, 'lxml')
print(soup.find_all('ul'))  # look up tags by name
import requests
from bs4 import BeautifulSoup

response = requests.get('https://www.ithome.com/html/it/340684.htm', timeout=9)
soup = BeautifulSoup(response.content, 'lxml')
# find_all can be called on a tag to search within it
for ul in soup.find_all('ul'):
    for li in ul.find_all('li'):
        print(li)
attrs
import re
import requests
from bs4 import BeautifulSoup

response = requests.get('https://www.ithome.com/html/it/340684.htm', timeout=9)
soup = BeautifulSoup(response.content, 'lxml')
a = soup.find_all(attrs={'class': 'lazy'})  # matches e.g. <img class="lazy" ...>
for index, i in enumerate(a):
    # pull the first .png URL out of the tag's markup
    result = re.findall(r'[a-zA-Z]+://[^\s]*png', str(i))
    url = result[0]
    res = requests.get(url)
    with open('%d.png' % index, 'wb') as f:
        f.write(res.content)
import requests
from bs4 import BeautifulSoup

response = requests.get('https://www.ithome.com/html/it/340684.htm', timeout=9)
soup = BeautifulSoup(response.content, 'lxml')
a = soup.find_all(class_='lazy')  # class is a Python keyword, hence class_
a = soup.find_all(id='lazy')
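For completeness, find() takes the same arguments but returns only the first match, or None when nothing matches; a sketch with invented HTML:

from bs4 import BeautifulSoup

html = '<img id="lazy" src="a.png"><img class="lazy" src="b.png">'
soup = BeautifulSoup(html, 'lxml')

print(soup.find(id='lazy'))       # first match only: <img id="lazy" src="a.png"/>
print(soup.find(id='missing'))    # None when nothing matches
print(len(soup.find_all('img')))  # find_all always returns a list: 2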
CSS selectors
import re
import requests
from bs4 import BeautifulSoup

response = requests.get('https://www.ithome.com/html/it/340684.htm', timeout=9)
soup = BeautifulSoup(response.content, 'lxml')
a = soup.select('.lazy')  # select() takes a CSS selector
for index, i in enumerate(a):
    result = re.findall(r'[a-zA-Z]+://[^\s]*png', str(i))
    url = result[0]
    res = requests.get(url)
    with open('./test/%d.png' % (index + 1), 'wb') as f:
        f.write(res.content)
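select() accepts ordinary CSS selector syntax beyond class selectors; a few common forms in one sketch (HTML invented for illustration):

from bs4 import BeautifulSoup

html = ('<div id="main"><ul class="menu">'
        '<li><a href="/a">a</a></li><li><a href="/b">b</a></li>'
        '</ul></div>')
soup = BeautifulSoup(html, 'lxml')

print(soup.select('#main'))          # by id
print(soup.select('ul.menu li'))     # descendant combinator plus class
print(soup.select('a[href]'))        # attribute selector
print(soup.select_one('a')['href'])  # select_one returns the first match: /a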
Getting an attribute from a CSS-selected element
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36"
}
response = requests.get('https://www.zhihu.com/question/20519068/answer/215288567',
                        headers=headers, timeout=None)
soup = BeautifulSoup(response.content, 'lxml')
# print(soup.prettify())
a = soup.select('.lazy')
# print(a)
for index, i in enumerate(a):
    url = i['data-original']  # read the attribute directly instead of regexing the markup
    # result = re.findall(r'[a-zA-Z]+://[^\s]*jpg', str(i))
    # url = result[0]
    res = requests.get(url)
    with open('./test/%d.jpg' % (index + 1), 'wb') as f:
        f.write(res.content)
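Indexing with i['data-original'] raises KeyError if an element lacks the attribute; Tag.get() returns None instead, which is safer when not every matched element carries it. A minimal sketch with invented HTML:

from bs4 import BeautifulSoup

html = '<img class="lazy" data-original="https://example.com/1.jpg"><img class="lazy">'
soup = BeautifulSoup(html, 'lxml')

for i in soup.select('.lazy'):
    url = i.get('data-original')  # None instead of KeyError when the attribute is missing
    if url:
        print(url)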
Getting the text
li.get_text()  # all text inside the tag, nested tags included
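get_text() concatenates all text inside the tag, including text of nested tags, and accepts separator and strip arguments; a small sketch with invented HTML:

from bs4 import BeautifulSoup

html = '<li> Hello <b>world</b> </li>'
soup = BeautifulSoup(html, 'lxml')

print(soup.li.get_text())                           # ' Hello world '
print(soup.li.get_text(separator=' ', strip=True))  # 'Hello world'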