文本到语音(tts)

Web Speech API

Web Speech API 使你能够将语音数据合并到 Web 应用程序中。它包含两个部分：SpeechSynthesis 语音合成（文本到语音，TTS）和 SpeechRecognition 语音识别（异步语音识别）。

SpeechSynthesis: 语音服务的控制器接口, 获取设备上关于可用的合成声音的信息，开始、暂停语音，或除此之外的其他命令

语音合成通过 SpeechSynthesis 接口进行访问，它提供了文字到语音（TTS）的能力，这使得程序能够读出它们的文字内容（通常使用设备默认的语音合成器）。不同的声音类型通过 SpeechSynthesisVoice 对象进行表示，不同部分的文字则由 SpeechSynthesisUtterance 对象来表示。你可以将它们传递给 SpeechSynthesis.speak() 方法来产生语音。

SpeechSynthesisUtterance: 语音请求。它包含语音服务应该阅读的内容以及如何阅读的信息（例如语言，音高和音量）

SpeechRecognition: 语音识别

语音识别通过 SpeechRecognition 接口进行访问，它提供了从音频输入中识别语音内容的能力（通常借助设备默认的语音识别服务）。一般来说，你将使用该接口的构造函数来构造一个新的 SpeechRecognition 对象，该对象包含了一系列事件处理函数，用来检测设备麦克风何时有语音输入。SpeechGrammar 接口则表示了你应用中想要识别的特定文法，文法通过 JSpeech Grammar Format（JSGF）来定义。

SpeechGrammar: 语音识别对象服务想要识别的一系列词语或模式

文字到语音

<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Web Speech API</title>
  </head>
  <body>
    <!-- Text-to-speech demo controls. The inline script looks each control up by its class name. -->
    <strong>Web Speech API</strong>
    <hr />
    <!-- Voice picker; its <option> elements are created by the script at runtime. -->
    <select class="select-voice"></select>
    <!-- Text to read aloud; the script pre-fills it with a sample sentence. -->
    <textarea class="text" cols="50" rows="10"></textarea><br /><br />
    <!-- Playback buttons: play, pause, resume and stop. -->
    <button type="button" class="btn-play">文字语音播放</button>
    <button type="button" class="btn-pause">暂停播放</button>
    <button type="button" class="btn-resume">恢复播放</button>
    <button type="button" class="btn-end">停止播放</button>

    <script>
      // Web Speech API: text-to-speech demo.

      // Playback control buttons.
      const playBtn = document.querySelector('.btn-play');
      const pauseBtn = document.querySelector('.btn-pause');
      const resumeBtn = document.querySelector('.btn-resume');
      const endBtn = document.querySelector('.btn-end');

      // The browser's speech-synthesis controller: it owns the utterance queue
      // and exposes speak / pause / resume / cancel.
      const synth = globalThis.speechSynthesis;

      // Textarea holding the text to read aloud, pre-filled with a sample sentence.
      const text = document.querySelector('.text');
      text.value = 'hello world, this is a test for web speech api.';

      // Voice picker. `fragment` batches the <option> nodes before they are
      // inserted; `voiceList` keeps the offered voices in the same order as the
      // options so a selected index maps straight to a voice.
      const selectVoice = document.querySelector('.select-voice');
      const fragment = document.createDocumentFragment();
      const voiceList = [];
      /**
       * Fill the voice picker (once) with every installed voice whose language
       * tag contains "zh", then apply the current selection to the utterance
       * and enable the play button.
       *
       * Safe to call repeatedly: the option list is only built while
       * `voiceList` is still empty.
       */
      function loadVoices() {
        if (voiceList.length === 0) {
          for (const voice of synth.getVoices()) {
            if (voice.lang.includes('zh')) {
              const option = document.createElement('option');
              option.dataset.lang = voice.lang;
              option.value = voice.name;
              option.textContent = voice.name;
              fragment.appendChild(option);
              voiceList.push(voice);
            }
          }
          selectVoice.appendChild(fragment);
        }
        // Apply whichever voice is currently selected.
        handleSelectVoice();

        playBtn.removeAttribute('disabled');
      }

      // Some browsers load voices asynchronously and announce them with this event.
      synth.addEventListener('voiceschanged', loadVoices);

      // Others already have the list and never fire `voiceschanged`, so also try
      // once on our own. The call is deferred to a later task because
      // handleSelectVoice() uses `utterance`, which is declared further down in
      // this script and must be initialized first.
      setTimeout(loadVoices, 0);

      selectVoice.addEventListener('change', handleSelectVoice);

      /**
       * Switch the shared utterance to the voice stored in `voiceList` at the
       * position of the picker's currently selected option.
       */
      function handleSelectVoice() {
        const { selectedIndex } = selectVoice;
        utterance.voice = voiceList.at(selectedIndex);
      }

      // The single utterance shared by all playback controls; it starts out
      // carrying the textarea's current content.
      const utterance = new SpeechSynthesisUtterance(text.value);

      // Playback bookkeeping: `start` / `end` / `elapsedTime` are overwritten on
      // every boundary event, `paused` is cleared by the stop button.
      const info = { start: 0, end: 0, elapsedTime: 0, paused: false };
      // Play button: read the textarea's current content from the beginning.
      playBtn.addEventListener('click', () => {
        // Drop whatever is queued or currently being spoken; checking only
        // `pending` would let a click during playback queue a second copy.
        synth.cancel();
        // cancel() is specified not to change the paused state, so clear it
        // explicitly; otherwise a play after a pause would stay silent.
        synth.resume();

        // Re-read the textarea so edits are picked up and any trimming done by
        // the resume button is undone.
        utterance.text = text.value;

        // The new run starts at offset 0, so forget the previous run's progress.
        info.start = 0;
        info.end = 0;
        info.elapsedTime = 0;
        info.paused = false;

        // Queue the utterance; it is spoken once the (now empty) queue reaches it.
        synth.speak(utterance);
      });
      // Pause button: put the synthesizer into its paused state.
      pauseBtn.addEventListener('click', function onPauseClick() {
        synth.pause();
      });
      // Resume button: instead of relying on synth.resume() to continue the
      // interrupted utterance, restart speech from just after the last boundary
      // that was reported.
      resumeBtn.addEventListener('click', () => {
        synth.cancel();
        // cancel() is specified not to change the paused state; clear it so the
        // utterance queued below is actually spoken.
        synth.resume();

        const remainingText = utterance.text.slice(info.end);
        utterance.text = remainingText;

        // `info` offsets are relative to utterance.text, which was just
        // replaced. Reset them so a second click before the next boundary event
        // does not slice the text again and skip content.
        info.start = 0;
        info.end = 0;

        synth.speak(utterance);
      });
      // Stop button: clear the paused flag and flush the speech queue.
      endBtn.addEventListener('click', function onStopClick() {
        info.paused = false;
        synth.cancel();
      });

      // Boundary events report the span being spoken (the event's `name` is
      // `word` or `sentence`). Record the span's start/end character offsets
      // and the elapsed time so playback can later be restarted from there.
      utterance.addEventListener('boundary', (event) => {
        const spanStart = event.charIndex;
        info.start = spanStart;
        info.end = spanStart + event.charLength;
        info.elapsedTime = event.elapsedTime;
      });
      // Trace the utterance's pause / resume / end events by logging the event name.
      for (const eventName of ['pause', 'resume', 'end']) {
        utterance.addEventListener(eventName, () => {
          console.log(eventName);
        });
      }

      // When the page is about to unload, pause and then cancel any speech.
      window.addEventListener('beforeunload', function stopSpeechOnUnload() {
        synth.pause();
        synth.cancel();
      });
    </script>
  </body>
</html>

语音识别

<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Web Speech API</title>
  </head>
  <body>
    <!-- Speech-to-text demo controls. The inline script looks up `.text` and `.btn-speech` by class name. -->
    <strong>Web Speech API</strong>
    <hr />
    <!-- Standalone audio player; the inline script does not reference this element. -->
    <audio src="./music.m4a" controls></audio>
    <!-- Receives the recognized transcript. -->
    <textarea class="text" cols="50" rows="10"></textarea><br /><br />
    <!-- Starts speech recognition when clicked. -->
    <button type="button" class="btn-speech">语音转文字</button>

    <script>
      // Web Speech API: speech-to-text demo.

      // Textarea that displays the recognized transcript.
      const text = document.querySelector('.text');

      // Button that starts recognition.
      const speechBtn = document.querySelector('.btn-speech');

      // Some browsers expose only the `webkit`-prefixed constructor; prefer the
      // standard name when it exists and fail with a clear message when neither
      // is available.
      const SpeechRecognitionCtor =
        globalThis.SpeechRecognition ?? globalThis.webkitSpeechRecognition;
      if (!SpeechRecognitionCtor) {
        throw new Error('SpeechRecognition is not supported in this browser.');
      }

      /** @type {SpeechRecognition} */
      const recognition = new SpeechRecognitionCtor();

      // Keep listening after each result instead of stopping at the first one.
      recognition.continuous = true;
      // Deliver interim (not yet final) results as well.
      recognition.interimResults = true;
      // Recognition language, e.g. zh-CN or en-US.
      recognition.lang = 'zh-CN';

      // Start recognition when the button is clicked.
      speechBtn.addEventListener('click', () => {
        try {
          recognition.start();
        } catch (err) {
          // start() throws InvalidStateError when a session is already running,
          // which is the normal case for a repeated click. Anything else is
          // unexpected and is re-thrown.
          if (err.name !== 'InvalidStateError') {
            throw err;
          }
          console.warn('语音识别已在进行中');
        }
      });

      // Lifecycle hooks, listed in the order the events fire during a session.
      Object.assign(recognition, {
        // Recognition service has started.
        onstart(event) {
          console.log('开始', event);
        },
        // Audio capture has started.
        onaudiostart() {
          console.log('开始录音');
        },
        // Speech has been detected.
        onspeechstart() {
          console.log('开始说话');
        },
        // Speech is no longer detected: stop the current session.
        onspeechend() {
          console.log('语音识别结束');
          recognition.stop();
        },
        // Audio capture has finished.
        onaudioend() {
          console.log('结束录音');
        },
        // Session is over: immediately start a new one.
        onend() {
          console.log('结束');
          recognition.start();
        },
      });

      // Recognition results: show the first alternative of every result so far,
      // one per line, in the textarea, and log it with the event's result index.
      recognition.onresult = (event) => {
        const transcript = Array.from(
          event.results,
          (alternatives) => `${alternatives[0].transcript}\n`,
        ).join('');

        text.value = transcript;
        console.log('识别结果: ', event.resultIndex, transcript);
      };

      // Speech was heard but nothing was recognized.
      recognition.onnomatch = (e) => {
        console.log('No match', e);
      };

      // Recognition error. Codes handled here:
      //   no-speech           - no speech was detected
      //   not-allowed         - the user denied microphone access
      //   service-not-allowed - the recognition service is not permitted
      //   audio-capture       - no usable microphone
      // Other codes (e.g. network) are only logged.
      recognition.onerror = (e) => {
        console.log('识别错误原因: ', e.error);

        // The error code is `no-speech`; the former `not-speech` comparison
        // could never match.
        if (e.error === 'no-speech') {
          recognition.stop();
          return;
        }

        // Permission and hardware failures cannot be fixed by retrying. The
        // `end` event that follows an error would otherwise call start() again
        // and fail in an endless loop, so remove the auto-restart handler.
        const isUnrecoverable =
          e.error === 'not-allowed' ||
          e.error === 'service-not-allowed' ||
          e.error === 'audio-capture';
        if (isUnrecoverable) {
          recognition.onend = null;
        }
      };
    </script>
  </body>
</html>

posted @ 2024-05-02 18:53 _clai 阅读(14) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

_clai

文本到语音(tts)

Web Speech API

SpeechSynthesis: 语音服务的控制器接口, 获取设备上关于可用的合成声音的信息，开始、暂停语音，或除此之外的其他命令

SpeechSynthesisUtterance: 语音请求。 它包含语音服务应该阅读的内容以及如何阅读的信息（例如语言，音高和音量）

SpeechRecognition: 语音识别

SpeechGrammar: 语音识别对象服务想要识别的一系列词语或模式

公告

SpeechSynthesisUtterance: 语音请求。它包含语音服务应该阅读的内容以及如何阅读的信息（例如语言，音高和音量）