bs4模块
bs4
- 实例化BeautifulSoup对象:BeautifulSoup(page_text, 'lxml') 将页面源码(page_text变量)加载到对象中
- 使用该对象的相关属性和方法实现标签定位和数据提取
相关方法:
"""Demo of common BeautifulSoup (bs4) tag-location and data-extraction APIs."""
from bs4 import BeautifulSoup

# Load local HTML source into a BeautifulSoup object.
# FIX: the original left the file handle open; `with` guarantees it is closed.
with open('./test.html', 'r', encoding='utf-8') as fp:
    soup = BeautifulSoup(fp, 'lxml')

# soup.tag_name: locates only the FIRST occurrence of that tag.
soup.title
soup.div

# soup.find('tag_name'): same as bare attribute access for a tag name.
# (FIX: original comment had the typo 'tag_namae'.)
soup.find('a')  # equivalent to soup.a
soup.find('div', attrs={'class': 'link-item'})  # filter by attribute

# soup.find_all: returns ALL matches as a list.
soup.find_all('div')[2]

# soup.select: CSS selectors.
soup.select('.song')
soup.select('div')

# Hierarchical selection.
soup.select('.song > ul > li > a')  # '>' selects one level down
soup.select('.song a')              # a space selects any number of levels down

# Extracting text.
# .string returns only the tag's direct text; .text returns all nested text.
soup.p.string
soup.find('div', class_='tang').text

# Extracting an attribute value.
soup.a['href']
bs4爬取三国演义示例
"""Scrape every chapter of "Romance of the Three Kingdoms" from shicimingju.com
into ./sanguo.txt (one chapter title followed by its body text per entry)."""
import requests
from bs4 import BeautifulSoup

# Fetch the table-of-contents page.
url = 'http://shicimingju.com/book/sanguoyanyi.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
soup = BeautifulSoup(page_text, 'lxml')
# Each <li> under .book-mulu holds one chapter link.
li_list = soup.select('.book-mulu > ul > li')

# BUG FIX: the original ended with `fp.close` (missing parentheses), so the
# file was never explicitly closed; `with` guarantees close/flush on exit.
with open('./sanguo.txt', 'a', encoding='utf-8') as fp:
    for li in li_list:
        title = li.a.string
        detail_url = 'http://shicimingju.com' + li.a['href']
        # Request each chapter's detail page separately.
        detail_page_text = requests.get(url=detail_url, headers=headers).text
        # FIX: use a distinct name instead of rebinding `soup`, so the
        # table-of-contents soup is not clobbered inside the loop.
        detail_soup = BeautifulSoup(detail_page_text, 'lxml')
        content = detail_soup.find('div', class_='chapter_content').text
        fp.write(title + '\n' + content + '\n')