Files
media/service/asr/whisper_service.go
2026-05-20 11:32:39 +08:00

404 lines
12 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package asr
import (
"context"
"fmt"
"io"
"media/service/setup"
"net/http"
"os"
"os/exec"
"path/filepath"
"runtime"
"strings"
"time"
"github.com/gogf/gf/v2/frame/g"
)
// WhisperBackend 后端类型
type WhisperBackend int
const (
backendPython WhisperBackend = iota // python -m whisper
backendCLI // openai-whisper CLI (whisper 命令)
backendCpp // whisper.cpp (whisper-cpp)
)
type whisperService struct{}
// Whisper 语音识别服务单例
var Whisper = new(whisperService)
// TranscribeReq 语音识别请求
type TranscribeReq struct {
AudioPath string // 音频文件路径
Model string // whisper 模型: tiny/base/small/medium/large
Language string // 语言代码,默认 zh中文
}
// TranscribeRes 语音识别响应
type TranscribeRes struct {
Text string // 完整识别文本
Segments []Segment
Model string // 使用的模型
Language string // 识别的语言
OutputPath string // 输出的 txt 文件路径
}
// Segment 识别片段(带时间戳)
type Segment struct {
Start float64 `json:"start"` // 开始时间(秒)
End float64 `json:"end"` // 结束时间(秒)
Text string `json:"text"` // 文本内容
}
// Transcribe 对音频文件进行语音识别(自动检测后端,自动降级)
func (s *whisperService) Transcribe(ctx context.Context, req *TranscribeReq) (res *TranscribeRes, err error) {
// 1. 校验音频文件
if _, err = os.Stat(req.AudioPath); os.IsNotExist(err) {
return nil, fmt.Errorf("音频文件不存在: %s", req.AudioPath)
}
// 2. 设置默认值
model := req.Model
if model == "" {
model = g.Cfg().MustGet(ctx, "whisper.model", "small").String()
}
language := req.Language
if language == "" {
language = g.Cfg().MustGet(ctx, "whisper.language", "zh").String()
}
// 3. 检测后端C++ 版找不到模型文件时自动降级
backend, whisperPath := s.detectBackend()
if backend == backendCpp {
modelPath := s.resolveCppModelPath(model)
if modelPath == "" {
g.Log().Warningf(ctx, "whisper.cpp 模型文件(%s)未找到,降级到 Python whisper", model)
backend = backendPython
} else {
g.Log().Infof(ctx, "语音识别(whisper.cpp): audio=%s, model=%s", req.AudioPath, modelPath)
return s.transcribeWithCpp(ctx, req, whisperPath, modelPath, language)
}
}
switch backend {
case backendCLI:
g.Log().Infof(ctx, "语音识别(CLI): audio=%s, model=%s, language=%s", req.AudioPath, model, language)
return s.transcribeWithCLI(ctx, req, whisperPath, model, language)
default:
g.Log().Infof(ctx, "语音识别(python): audio=%s, model=%s, language=%s", req.AudioPath, model, language)
return s.transcribeWithPython(ctx, req, model, language)
}
}
// transcribeWithCLI 使用 whisper CLI 命令
func (s *whisperService) transcribeWithCLI(ctx context.Context, req *TranscribeReq, whisperPath, model, language string) (res *TranscribeRes, err error) {
outputDir := filepath.Dir(req.AudioPath)
modelDir := g.Cfg().MustGet(ctx, "whisper.model_dir", "").String()
threads := g.Cfg().MustGet(ctx, "whisper.threads", 2).Int()
args := []string{
req.AudioPath,
"--model", model,
"--language", language,
"--output_dir", outputDir,
"--output_format", "txt",
"--threads", fmt.Sprintf("%d", threads),
}
if modelDir != "" {
args = append(args, "--model_dir", modelDir)
}
cmd := exec.CommandContext(ctx, whisperPath, args...)
output, execErr := cmd.CombinedOutput()
if execErr != nil {
g.Log().Errorf(ctx, "whisper CLI 执行失败: %v\n%s", execErr, string(output))
return nil, fmt.Errorf("语音识别失败: %v", execErr)
}
return s.readTxtResult(outputDir, req.AudioPath, model)
}
// transcribeWithPython 使用 python -m whisper
func (s *whisperService) transcribeWithPython(ctx context.Context, req *TranscribeReq, model, language string) (res *TranscribeRes, err error) {
// 查找 python
pythonPath, err := exec.LookPath("python3")
if err != nil {
pythonPath, err = exec.LookPath("python")
if err != nil {
return nil, fmt.Errorf("未找到 python请安装: pip3 install openai-whisper")
}
}
outputDir := filepath.Dir(req.AudioPath)
modelDir := g.Cfg().MustGet(ctx, "whisper.model_dir", "").String()
threads := g.Cfg().MustGet(ctx, "whisper.threads", 2).Int()
args := []string{
"-m", "whisper",
req.AudioPath,
"--model", model,
"--language", language,
"--output_dir", outputDir,
"--output_format", "txt",
"--threads", fmt.Sprintf("%d", threads),
}
if modelDir != "" {
args = append(args, "--model_dir", modelDir)
}
cmd := exec.CommandContext(ctx, pythonPath, args...)
output, execErr := cmd.CombinedOutput()
if execErr != nil {
g.Log().Errorf(ctx, "whisper(python) 执行失败: %v\n%s", execErr, string(output))
return nil, fmt.Errorf("语音识别失败: %v", execErr)
}
return s.readTxtResult(outputDir, req.AudioPath, model)
}
// readTxtResult 读取 whisper 输出的 txt 文件
func (s *whisperService) readTxtResult(outputDir, audioPath, model string) (res *TranscribeRes, err error) {
baseName := strings.TrimSuffix(filepath.Base(audioPath), filepath.Ext(audioPath))
txtPaths := []string{
filepath.Join(outputDir, baseName+".txt"),
filepath.Join(outputDir, baseName+"."+model+".txt"),
}
var textBytes []byte
var txtPath string
for _, p := range txtPaths {
if b, e := os.ReadFile(p); e == nil {
textBytes = b
txtPath = p
break
}
}
if textBytes == nil {
return nil, fmt.Errorf("读取识别结果文件失败")
}
res = &TranscribeRes{
Text: cleanTranscript(string(textBytes)),
Model: model,
OutputPath: txtPath,
}
return
}
// cleanTranscript 清理识别结果:去换行、合并空格
func cleanTranscript(text string) string {
text = strings.ReplaceAll(text, "\r\n", " ")
text = strings.ReplaceAll(text, "\n", " ")
text = strings.ReplaceAll(text, "\r", " ")
// 合并多个空格
for strings.Contains(text, " ") {
text = strings.ReplaceAll(text, " ", " ")
}
return strings.TrimSpace(text)
}
// detectBackend 检测可用的 whisper 后端,返回后端类型和可执行路径
func (s *whisperService) detectBackend() (WhisperBackend, string) {
// 1. 优先检测 C++ 版 whisper.cpp最快但参数格式不同
for _, name := range []string{"whisper-cpp", "whisper-cli"} {
if path, err := exec.LookPath(name); err == nil {
return backendCpp, path
}
}
// 2. 检查 setup 检测到的 C++ 路径
if setup.DetectedWhisperPath != "" {
base := filepath.Base(setup.DetectedWhisperPath)
if base == "whisper-cpp" || base == "whisper-cli" {
if _, err := os.Stat(setup.DetectedWhisperPath); err == nil {
return backendCpp, setup.DetectedWhisperPath
}
}
}
// 3. 检测 Python CLIwhisper 命令)
if path, err := exec.LookPath("whisper"); err == nil {
return backendCLI, path
}
// 4. 检查 setup 检测到的 Python CLI 路径
if setup.DetectedWhisperPath != "" {
if _, err := os.Stat(setup.DetectedWhisperPath); err == nil {
return backendCLI, setup.DetectedWhisperPath
}
}
// 5. 检查配置中的路径
if p := g.Cfg().MustGet(context.Background(), "whisper.path", "").String(); p != "" {
if _, err := os.Stat(p); err == nil {
return backendCLI, p
}
}
return backendPython, ""
}
// resolveCppModelPath 查找或下载 whisper.cpp 模型文件
func (s *whisperService) resolveCppModelPath(model string) string {
modelName := strings.TrimPrefix(model, "ggml-")
modelName = strings.TrimSuffix(modelName, ".bin")
cppModelName := "ggml-" + modelName + ".bin"
home, _ := os.UserHomeDir()
// 目标路径:~/.cache/whisper/ggml-{model}.bin
targetDir := filepath.Join(home, ".cache", "whisper")
targetPath := filepath.Join(targetDir, cppModelName)
// 1. 如果已存在,直接返回
if _, err := os.Stat(targetPath); err == nil {
return targetPath
}
// 2. 检查其他常见位置
altPaths := []string{
cppModelName,
filepath.Join(home, ".cache", "whisper", "ggml-"+modelName+"-q5_0.bin"),
}
// macOS: Homebrew 安装的 whisper.cpp 模型路径
if runtime.GOOS == "darwin" {
altPaths = append(altPaths,
"/opt/homebrew/share/whisper-cpp/models/"+cppModelName,
"/usr/local/share/whisper-cpp/models/"+cppModelName,
)
}
// Linux: 常见系统安装路径
if runtime.GOOS == "linux" {
altPaths = append(altPaths,
"/usr/share/whisper-cpp/models/"+cppModelName,
"/usr/local/share/whisper-cpp/models/"+cppModelName,
)
}
for _, p := range altPaths {
if _, err := os.Stat(p); err == nil {
return p
}
}
// 3. 自动下载
modelSize := map[string]string{
"tiny": "75MB",
"base": "150MB",
"small": "500MB",
"medium": "1.5GB",
}
size, _ := modelSize[modelName]
// 下载源:先试 hf-mirror国内可访问失败再试官方
modelPath := fmt.Sprintf("ggerganov/whisper.cpp/resolve/main/%s", cppModelName)
urls := []string{
fmt.Sprintf("https://hf-mirror.com/%s", modelPath),
fmt.Sprintf("https://huggingface.co/%s", modelPath),
}
g.Log().Infof(context.TODO(), "[whisper.cpp] 正在下载模型 %s (%s)...", cppModelName, size)
// 创建目录
os.MkdirAll(targetDir, 0755)
// 下载文件(多个源,依次尝试)
var lastErr error
for _, url := range urls {
g.Log().Infof(context.TODO(), "[whisper.cpp] 下载地址: %s", url)
if err := s.downloadFile(url, targetPath, 5*time.Minute); err == nil {
g.Log().Infof(context.TODO(), "[whisper.cpp] 模型下载完成: %s", targetPath)
return targetPath
} else {
lastErr = err
g.Log().Warningf(context.TODO(), "[whisper.cpp] 从 %s 下载失败: %v尝试下一个源...", url, err)
}
}
g.Log().Errorf(context.TODO(), "[whisper.cpp] 所有下载源均失败: %v", lastErr)
return ""
}
// downloadFile 下载文件到指定路径(支持超时)
func (s *whisperService) downloadFile(url, destPath string, timeout time.Duration) error {
tmpPath := destPath + ".tmp"
out, err := os.Create(tmpPath)
if err != nil {
return fmt.Errorf("创建临时文件失败: %v", err)
}
defer out.Close()
client := &http.Client{Timeout: timeout}
resp, err := client.Get(url)
if err != nil {
os.Remove(tmpPath)
return err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
os.Remove(tmpPath)
return fmt.Errorf("HTTP %d", resp.StatusCode)
}
written, err := io.Copy(out, resp.Body)
if err != nil {
os.Remove(tmpPath)
return err
}
if err := os.Rename(tmpPath, destPath); err != nil {
return fmt.Errorf("文件重命名失败: %v", err)
}
g.Log().Infof(context.TODO(), "[whisper.cpp] 下载完成: %d bytes", written)
return nil
}
// transcribeWithCpp 使用 whisper.cppC++ 版,参数格式不同)
func (s *whisperService) transcribeWithCpp(ctx context.Context, req *TranscribeReq, binaryPath, model, language string) (res *TranscribeRes, err error) {
outputDir := filepath.Dir(req.AudioPath)
baseName := strings.TrimSuffix(filepath.Base(req.AudioPath), filepath.Ext(req.AudioPath))
outputPrefix := filepath.Join(outputDir, baseName)
threads := g.Cfg().MustGet(ctx, "whisper.threads", 2).Int()
// whisper.cpp 参数:
// -f input.mp3 输入文件
// -l zh 语言
// -t 2 线程数
// -otxt 输出 txt
// -of /path/prefix 输出文件前缀(自动加 .txt
args := []string{
"-f", req.AudioPath,
"-l", language,
"-t", fmt.Sprintf("%d", threads),
"-otxt",
"-of", outputPrefix,
"-m", model,
}
cmd := exec.CommandContext(ctx, binaryPath, args...)
output, execErr := cmd.CombinedOutput()
if execErr != nil {
g.Log().Errorf(ctx, "whisper.cpp 执行失败: %v\n%s", execErr, string(output))
return nil, fmt.Errorf("语音识别失败: %v", execErr)
}
// whisper.cpp 输出: {prefix}.txt
txtPath := outputPrefix + ".txt"
textBytes, readErr := os.ReadFile(txtPath)
if readErr != nil {
return nil, fmt.Errorf("读取识别结果文件失败: %v", readErr)
}
res = &TranscribeRes{
Text: cleanTranscript(string(textBytes)),
Model: model,
Language: language,
OutputPath: txtPath,
}
return
}