package service

import (
	"context"
	"encoding/base64"

	"ai-agent/digital-human/consts/public"
	"github.com/gogf/gf/v2/errors/gerror"
	"github.com/gogf/gf/v2/frame/g"
)

// tts groups the text-to-speech task helpers; it carries no state.
type tts struct{}

// TTS is the shared entry point wrapping asynchronous model calls
// (routed through the model-asynch middleware).
var TTS = new(tts)

// CreateVoiceDesignTask submits a voice-design (VoiceDesign) synthesis task.
//
// Parameters:
//   - text: the text to synthesize.
//   - instruct: instruction text forwarded under the "instruct" key.
//   - language: forwarded as-is; an empty value is replaced by "Auto".
//   - speed: forwarded as-is; a value <= 0 is replaced by 1.0.
//
// The payload always requests "wav" output. The task ID and error are
// whatever createModelAsynchTask returns.
func (s *tts) CreateVoiceDesignTask(
	ctx context.Context,
	text string,
	instruct string,
	language string, // empty means "Auto"
	speed float64, // <= 0 means 1.0
) (taskID string, err error) {
	if language == "" {
		language = "Auto"
	}
	if speed <= 0 {
		speed = 1.0
	}

	payload := map[string]any{
		"text":            text,
		"language":        language,
		"instruct":        instruct,
		"speed":           speed,
		"response_format": "wav",
	}

	// Infof, not Info: the message is a format string, and Info would print
	// the "%v" verb literally instead of substituting the payload.
	g.Log().Infof(ctx, "[CreateVoiceDesignTask] %v", payload)

	// NOTE(review): the meaning of the trailing "" argument is defined by
	// createModelAsynchTask, which is not visible here.
	return createModelAsynchTask(ctx, public.ModelNameVoiceDesign, payload, "")
}

// CreateCustomVoiceTask submits a preset-voice (CustomVoice) synthesis task.
//
// Parameters:
//   - text: the text to synthesize.
//   - speaker: preset speaker name (e.g. Vivian/Serena/Ryan/...).
//   - language: e.g. "Chinese"/"English"/"Auto"; empty is replaced by "Auto".
//   - instruct: optional emotion/style control; may be empty.
//   - speed: documented range 0.5~2.0 (not enforced here); a value <= 0 is
//     replaced by 1.0.
//
// The payload always requests "wav" output. The task ID and error are
// whatever createModelAsynchTask returns.
func (s *tts) CreateCustomVoiceTask(
	ctx context.Context,
	text string,
	speaker string,
	language string, // e.g. "Chinese"/"English"/"Auto"; empty defaults to "Auto"
	instruct string, // may be empty
	speed float64, // 0.5~2.0; <= 0 defaults to 1.0
) (taskID string, err error) {
	if language == "" {
		language = "Auto"
	}
	if speed <= 0 {
		speed = 1.0
	}

	payload := map[string]any{
		"text":            text,
		"language":        language,
		"speaker":         speaker,
		"instruct":        instruct,
		"speed":           speed,
		"response_format": "wav", // wav is the recommended uniform format
	}

	// Infof, not Info: the message is a format string, and Info would print
	// the "%v" verb literally instead of substituting the payload.
	g.Log().Infof(ctx, "[CreateCustomVoiceTask] %v", payload)

	return createModelAsynchTask(ctx, public.ModelNameCustomVoice, payload, "")
}

// CreateBaseTask submits a voice-cloning (Base / clone) synthesis task.
//
// The reference audio is meant to be supplied through either refAudioURL or
// refAudioBase64 (one of the two). This function does not validate that, nor
// the refText requirement below; all values are forwarded unchanged.
//
// Parameters:
//   - text: the text to synthesize.
//   - language: e.g. "Chinese"/"English"/"Auto"; empty is replaced by "Auto".
//   - refText: required when xVectorOnlyMode is false.
//   - refAudioURL: may be empty.
//   - refAudioBase64: may be empty (a "data:" prefix is not required).
//   - xVectorOnlyMode: true means refText is not needed, but quality may drop.
//   - speed: documented range 0.5~2.0 (not enforced here); a value <= 0 is
//     replaced by 1.0.
//
// The payload always requests "wav" output. The task ID and error are
// whatever createModelAsynchTask returns.
func (s *tts) CreateBaseTask(
	ctx context.Context,
	text string,
	language string, // e.g. "Chinese"/"English"/"Auto"; empty defaults to "Auto"
	refText string, // required when xVectorOnlyMode is false
	refAudioURL string, // may be empty
	refAudioBase64 string, // may be empty (no "data:" prefix needed)
	xVectorOnlyMode bool, // true: refText not needed, quality may drop
	speed float64, // 0.5~2.0; <= 0 defaults to 1.0
) (taskID string, err error) {
	if language == "" {
		language = "Auto"
	}
	if speed <= 0 {
		speed = 1.0
	}

	payload := map[string]any{
		"text":               text,
		"language":           language,
		"ref_text":           refText,
		"ref_audio_url":      refAudioURL,
		"ref_audio_base64":   refAudioBase64,
		"x_vector_only_mode": xVectorOnlyMode,
		"speed":              speed,
		"response_format":    "wav",
	}

	// Log a copy that carries only the length of the inline audio: the raw
	// base64 can be very large and contains the speaker's voice sample, so it
	// should not be written to the log. The submitted payload is untouched.
	logged := make(map[string]any, len(payload))
	for k, v := range payload {
		logged[k] = v
	}
	delete(logged, "ref_audio_base64")
	logged["ref_audio_base64_len"] = len(refAudioBase64)

	// Infof, not Info: the message is a format string, and Info would print
	// the "%v" verb literally instead of substituting the payload.
	g.Log().Infof(ctx, "[CreateBaseTask] %v", logged)

	return createModelAsynchTask(ctx, public.ModelNameBase, payload, "")
}

// SpeechToText converts speech to text (placeholder; not implemented yet).
//
// audioBase64 is the base64-encoded audio data (WAV/MP3, etc.). The input is
// rejected when it is empty or is not valid standard base64. Valid input
// currently still yields a "not implemented" error, so the returned text is
// always empty.
func (s *tts) SpeechToText(ctx context.Context, audioBase64 string) (text string, err error) {
	_ = ctx // unused until a speech-recognition model is wired in

	if audioBase64 == "" {
		return "", gerror.New("audioBase64 不能为空")
	}

	// Cheap sanity check that the input is well-formed base64; the decoded
	// bytes are discarded.
	if _, decodeErr := base64.StdEncoding.DecodeString(audioBase64); decodeErr != nil {
		return "", gerror.Wrap(decodeErr, "audioBase64 非法")
	}

	return "", gerror.New("SpeechToText 暂未实现:后续接入语音识别模型后补齐")
}