Files
ai-agent/digital-human/service/tts_service.go
2026-04-27 14:02:43 +08:00

118 lines
3.4 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package service
import (
"context"
"encoding/base64"
"ai-agent/digital-human/consts/public"
"github.com/gogf/gf/v2/errors/gerror"
"github.com/gogf/gf/v2/frame/g"
)
// tts is a stateless receiver type that groups the speech-related service
// methods declared in this file.
type tts struct{}

// TTS is the package-level instance through which the tts methods are called.
// Per the original author's note, it is the unified wrapper for asynchronous
// model calls made via the model-asynch middleware.
var TTS = &tts{}
// CreateVoiceDesignTask builds a VoiceDesign request payload and submits it
// through createModelAsynchTask under public.ModelNameVoiceDesign, returning
// the task ID and error reported by that helper.
//
// Parameters:
//   - text: sent under the "text" key.
//   - instruct: sent under the "instruct" key.
//   - language: sent under the "language" key; defaults to "Auto" when empty.
//   - speed: sent under the "speed" key; defaults to 1.0 when <= 0.
//
// The payload always requests "wav" as the response format.
func (s *tts) CreateVoiceDesignTask(
	ctx context.Context,
	text string,
	instruct string,
	language string, // empty means "Auto"
	speed float64, // <= 0 means 1.0
) (taskID string, err error) {
	if language == "" {
		language = "Auto"
	}
	if speed <= 0 {
		speed = 1.0
	}
	payload := map[string]any{
		"text":            text,
		"language":        language,
		"instruct":        instruct,
		"speed":           speed,
		"response_format": "wav",
	}
	// Infof rather than Info: Info does not interpret format verbs, so the
	// "%v" would otherwise be written to the log literally.
	g.Log().Infof(ctx, "[CreateVoiceDesignTask] %v", payload)
	return createModelAsynchTask(ctx, public.ModelNameVoiceDesign, payload, "")
}
// CreateCustomVoiceTask builds a CustomVoice (preset speaker) request payload
// and submits it through createModelAsynchTask under
// public.ModelNameCustomVoice, returning the task ID and error reported by
// that helper.
//
// Parameters:
//   - text: sent under the "text" key.
//   - speaker: the preset speaker name (the original note lists Vivian,
//     Serena and Ryan as examples).
//   - language: e.g. "Chinese", "English" or "Auto"; defaults to "Auto" when
//     empty.
//   - instruct: optional emotion/style control; may be empty.
//   - speed: defaults to 1.0 when <= 0. The original note gives 0.5~2.0 as the
//     intended range; this function does not enforce it.
//
// The payload always requests "wav" as the response format.
func (s *tts) CreateCustomVoiceTask(
	ctx context.Context,
	text string,
	speaker string,
	language string, // e.g. "Chinese"/"English"/"Auto"; empty means "Auto"
	instruct string, // may be empty
	speed float64, // 0.5~2.0; <= 0 means 1.0
) (taskID string, err error) {
	if language == "" {
		language = "Auto"
	}
	if speed <= 0 {
		speed = 1.0
	}
	payload := map[string]any{
		"text":            text,
		"language":        language,
		"speaker":         speaker,
		"instruct":        instruct,
		"speed":           speed,
		"response_format": "wav", // wav is used uniformly across the task builders in this file
	}
	// Infof rather than Info: Info does not interpret format verbs, so the
	// "%v" would otherwise be written to the log literally.
	g.Log().Infof(ctx, "[CreateCustomVoiceTask] %v", payload)
	return createModelAsynchTask(ctx, public.ModelNameCustomVoice, payload, "")
}
// CreateBaseTask builds a voice-cloning (Base / clone) request payload and
// submits it through createModelAsynchTask under public.ModelNameBase,
// returning the task ID and error reported by that helper.
//
// Parameters:
//   - text: sent under the "text" key.
//   - language: e.g. "Chinese", "English" or "Auto"; defaults to "Auto" when
//     empty.
//   - refText: per the original note, required when xVectorOnlyMode is false.
//   - refAudioURL / refAudioBase64: per the original note, supply exactly one
//     of the two; the base64 form may omit a "data:" prefix.
//   - xVectorOnlyMode: per the original note, true means refText is not
//     needed, though quality may drop.
//   - speed: defaults to 1.0 when <= 0. The original note gives 0.5~2.0 as the
//     intended range.
//
// NOTE(review): none of the constraints quoted from the original notes are
// validated here; all values are forwarded as given. Confirm that the
// downstream service rejects invalid combinations.
//
// The payload always requests "wav" as the response format.
func (s *tts) CreateBaseTask(
	ctx context.Context,
	text string,
	language string, // e.g. "Chinese"/"English"/"Auto"; empty means "Auto"
	refText string, // required when xVectorOnlyMode is false
	refAudioURL string, // may be empty
	refAudioBase64 string, // may be empty (a "data:" prefix is not required)
	xVectorOnlyMode bool, // true: refText not needed, quality may drop
	speed float64, // 0.5~2.0; <= 0 means 1.0
) (taskID string, err error) {
	if language == "" {
		language = "Auto"
	}
	if speed <= 0 {
		speed = 1.0
	}
	payload := map[string]any{
		"text":               text,
		"language":           language,
		"ref_text":           refText,
		"ref_audio_url":      refAudioURL,
		"x_vector_only_mode": xVectorOnlyMode,
		"speed":              speed,
		"response_format":    "wav",
	}
	// Log before attaching the inline audio so the base64 blob itself never
	// reaches the log; only its length is recorded. Infof (not Info) is
	// required for the format verbs to be interpreted.
	g.Log().Infof(ctx, "[CreateBaseTask] %v ref_audio_base64_len=%d", payload, len(refAudioBase64))
	payload["ref_audio_base64"] = refAudioBase64
	return createModelAsynchTask(ctx, public.ModelNameBase, payload, "")
}
// SpeechToText is a reserved placeholder for speech-to-text conversion.
//
// audioBase64 is expected to hold base64-encoded audio data (the original
// note mentions WAV/MP3 and similar). The input is checked for being
// non-empty and for decoding under the standard base64 alphabet; after that
// the function always returns an error stating that recognition is not yet
// implemented. The returned text is always empty.
func (s *tts) SpeechToText(ctx context.Context, audioBase64 string) (text string, err error) {
	_ = ctx // unused by the current placeholder body

	if len(audioBase64) == 0 {
		return "", gerror.New("audioBase64 不能为空")
	}

	// Decode purely as a validity check; the decoded bytes are discarded.
	_, decodeErr := base64.StdEncoding.DecodeString(audioBase64)
	if decodeErr != nil {
		return "", gerror.Wrap(decodeErr, "audioBase64 非法")
	}

	return "", gerror.New("SpeechToText 暂未实现:后续接入语音识别模型后补齐")
}