bs4模块

bs4

  • 实例化BeautifulSoup对象:BeautifulSoup(page_text, 'lxml') 将页面源码(变量 page_text)加载到对象中
  • 使用该对象的相关属性和方法实现标签定位和数据提取

相关方法:

from bs4 import BeautifulSoup

# Load a local HTML file into a BeautifulSoup object.
# Use a context manager so the file handle is always closed
# (the original left `fp` open forever); BeautifulSoup reads the
# whole file during construction, so closing afterwards is safe.
with open('./test.html', 'r', encoding='utf-8') as fp:
    soup = BeautifulSoup(fp, 'lxml')

# soup.tag_name: locates only the FIRST occurrence of that tag
soup.title
soup.div

# soup.find('tag_name')
soup.find('a')  # equivalent to soup.a
soup.find('div', attrs={'class': 'link-item'})  # filter by attribute

# soup.find_all: returns ALL matches as a list
soup.find_all('div')[2]

# soup.select: CSS selectors
soup.select('.song')
soup.select('div')
# hierarchical selection
soup.select('.song > ul > li > a')  # '>' means one direct child level
soup.select('.song a')  # a space means any number of descendant levels

# extracting text
# .string returns only the tag's direct text; .text returns all nested text
soup.p.string
soup.find('div', class_='tang').text

# extracting an attribute value
soup.a['href']

 

bs4爬取三国演义示例

import requests
from bs4 import BeautifulSoup

# Scrape every chapter of "Romance of the Three Kingdoms" from
# shicimingju.com and append "title\ncontent\n" for each chapter
# to ./sanguo.txt.
url = 'http://shicimingju.com/book/sanguoyanyi.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
soup = BeautifulSoup(page_text, 'lxml')
# each <li> under .book-mulu is one chapter link in the table of contents
li_list = soup.select('.book-mulu > ul > li')
# Bug fix: the original ended with `fp.close` (no parentheses), so the
# file was never closed or flushed. A `with` block guarantees cleanup
# even if a request raises mid-loop.
with open('./sanguo.txt', 'a', encoding='utf-8') as fp:
    for li in li_list:
        title = li.a.string
        detail_url = 'http://shicimingju.com' + li.a['href']
        # request the chapter's detail page separately to get its body
        detail_page_text = requests.get(url=detail_url, headers=headers).text
        # use a distinct variable so the outer `soup` (the table of
        # contents) is not shadowed/clobbered inside the loop
        detail_soup = BeautifulSoup(detail_page_text, 'lxml')
        content = detail_soup.find('div', class_='chapter_content').text
        fp.write(title + '\n' + content + '\n')
    

 

posted @ 2019-09-25 17:52  tianqibucuo  阅读(221)  评论(0)    收藏  举报