118 lines
3.4 KiB
Go
118 lines
3.4 KiB
Go
package service
|
||
|
||
import (
|
||
"context"
|
||
"encoding/base64"
|
||
|
||
"ai-agent/digital-human/consts/public"
|
||
|
||
"github.com/gogf/gf/v2/errors/gerror"
|
||
"github.com/gogf/gf/v2/frame/g"
|
||
)
|
||
|
||
// tts is a stateless receiver type that groups the text-to-speech task
// helpers declared in this file.
type tts struct{}

// TTS is the package-level instance through which the tts methods are called.
// It is the unified wrapper for asynchronous model calls (made via the
// model-asynch middleware).
var TTS = new(tts)
|
||
|
||
// CreateVoiceDesignTask 设计音频任务(VoiceDesign)
|
||
func (s *tts) CreateVoiceDesignTask(
|
||
ctx context.Context,
|
||
text string,
|
||
instruct string,
|
||
language string, // 空则 Auto
|
||
speed float64, // <=0 则 1.0
|
||
) (taskID string, err error) {
|
||
if language == "" {
|
||
language = "Auto"
|
||
}
|
||
if speed <= 0 {
|
||
speed = 1.0
|
||
}
|
||
payload := map[string]any{
|
||
"text": text,
|
||
"language": language,
|
||
"instruct": instruct,
|
||
"speed": speed,
|
||
"response_format": "wav",
|
||
}
|
||
g.Log().Info(ctx, "[CreateVoiceDesignTask] %v", payload)
|
||
return createModelAsynchTask(ctx, public.ModelNameVoiceDesign, payload, "")
|
||
}
|
||
|
||
// CreateCustomVoiceTask 预设音色(CustomVoice)任务
|
||
// - speaker: 预设说话人(如 Vivian/Serena/Ryan/...)
|
||
// - instruct: 可选,情绪/风格控制
|
||
func (s *tts) CreateCustomVoiceTask(
|
||
ctx context.Context,
|
||
text string,
|
||
speaker string,
|
||
language string, // 例如 "Chinese"/"English"/"Auto",空则默认 "Auto"
|
||
instruct string, // 可空
|
||
speed float64, // 0.5~2.0,<=0 则默认 1.0
|
||
) (taskID string, err error) {
|
||
if language == "" {
|
||
language = "Auto"
|
||
}
|
||
if speed <= 0 {
|
||
speed = 1.0
|
||
}
|
||
payload := map[string]any{
|
||
"text": text,
|
||
"language": language,
|
||
"speaker": speaker,
|
||
"instruct": instruct,
|
||
"speed": speed,
|
||
"response_format": "wav", // 建议统一用 wav
|
||
}
|
||
g.Log().Info(ctx, "[CreateCustomVoiceTask] %v", payload)
|
||
return createModelAsynchTask(ctx, public.ModelNameCustomVoice, payload, "")
|
||
}
|
||
|
||
// CreateBaseTask 声音克隆(Base / clone)任务
|
||
// 说明:ref_audio_url 与 ref_audio_base64 二选一
|
||
func (s *tts) CreateBaseTask(
|
||
ctx context.Context,
|
||
text string,
|
||
language string, // 例如 "Chinese"/"English"/"Auto",空则默认 "Auto"
|
||
refText string, // 当 xVectorOnlyMode=false 时必填
|
||
refAudioURL string, // 可空
|
||
refAudioBase64 string, // 可空(不带 data: 前缀也可以)
|
||
xVectorOnlyMode bool, // true=不需要 refText,但质量可能下降
|
||
speed float64, // 0.5~2.0,<=0 则默认 1.0
|
||
) (taskID string, err error) {
|
||
if language == "" {
|
||
language = "Auto"
|
||
}
|
||
if speed <= 0 {
|
||
speed = 1.0
|
||
}
|
||
|
||
payload := map[string]any{
|
||
"text": text,
|
||
"language": language,
|
||
"ref_text": refText,
|
||
"ref_audio_url": refAudioURL,
|
||
"ref_audio_base64": refAudioBase64,
|
||
"x_vector_only_mode": xVectorOnlyMode,
|
||
"speed": speed,
|
||
"response_format": "wav",
|
||
}
|
||
g.Log().Info(ctx, "[CreateBaseTask] %v", payload)
|
||
return createModelAsynchTask(ctx, public.ModelNameBase, payload, "")
|
||
}
|
||
|
||
// SpeechToText 语音转文本(预留)
|
||
// audioBase64:base64 编码的音频数据(WAV/MP3等)
|
||
func (s *tts) SpeechToText(ctx context.Context, audioBase64 string) (text string, err error) {
|
||
_ = ctx
|
||
if audioBase64 == "" {
|
||
return "", gerror.New("audioBase64 不能为空")
|
||
}
|
||
// 简单校验 base64 合法性
|
||
if _, err := base64.StdEncoding.DecodeString(audioBase64); err != nil {
|
||
return "", gerror.Wrap(err, "audioBase64 非法")
|
||
}
|
||
return "", gerror.New("SpeechToText 暂未实现:后续接入语音识别模型后补齐")
|
||
}
|