media/service/asr/whisper_service.go

package asr

import (
	"context"
	"fmt"
	"io"
	"media/service/setup"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"strings"
	"time"

	"github.com/gogf/gf/v2/frame/g"
)

// WhisperBackend identifies which whisper implementation is used to run a
// transcription.
type WhisperBackend int

const (
	backendPython WhisperBackend = iota // python -m whisper
	backendCLI                          // openai-whisper CLI (the `whisper` command)
	backendCpp                          // whisper.cpp (whisper-cpp)
)

// whisperService groups the speech-recognition methods; the struct itself
// has no fields.
type whisperService struct{}

// Whisper is the package-level singleton of the speech-recognition service.
var Whisper = new(whisperService)

// TranscribeReq is the input of a speech-recognition request.
type TranscribeReq struct {
	AudioPath string // path of the audio file to transcribe
	Model     string // whisper model: tiny/base/small/medium/large
	Language  string // language code; defaults to zh (Chinese)
}

// TranscribeRes is the result of a speech-recognition request.
// NOTE(review): within the code reviewed here nothing fills Segments.
type TranscribeRes struct {
	Text       string // full recognized text
	Segments   []Segment
	Model      string // model that was used
	Language   string // recognized language
	OutputPath string // path of the generated txt file
}

// Segment is one recognized fragment together with its timestamps.
type Segment struct {
	Start float64 `json:"start"` // start time (seconds)
	End   float64 `json:"end"`   // end time (seconds)
	Text  string  `json:"text"`  // text content
}

// Transcribe runs speech recognition on the audio file at req.AudioPath.
//
// An empty req.Model or req.Language falls back to the `whisper.model`
// (default "small") and `whisper.language` (default "zh") config values.
// The backend is chosen by detectBackend; when whisper.cpp is selected but
// resolveCppModelPath cannot supply a model file, the call degrades to
// `python -m whisper`.
//
// It returns an error when req is nil, when the audio path cannot be
// stat'ed or is a directory, or when the selected backend fails.
func (s *whisperService) Transcribe(ctx context.Context, req *TranscribeReq) (res *TranscribeRes, err error) {
	// 1. Validate the request and the audio file.
	if req == nil {
		return nil, fmt.Errorf("语音识别请求为空")
	}
	info, statErr := os.Stat(req.AudioPath)
	switch {
	case os.IsNotExist(statErr):
		return nil, fmt.Errorf("音频文件不存在: %s", req.AudioPath)
	case statErr != nil:
		// Permission problems and similar must not be passed on to the backend.
		return nil, fmt.Errorf("无法访问音频文件 %s: %w", req.AudioPath, statErr)
	case info.IsDir():
		return nil, fmt.Errorf("音频路径是目录而不是文件: %s", req.AudioPath)
	}

	// 2. Apply defaults from configuration.
	model := req.Model
	if model == "" {
		model = g.Cfg().MustGet(ctx, "whisper.model", "small").String()
	}
	language := req.Language
	if language == "" {
		language = g.Cfg().MustGet(ctx, "whisper.language", "zh").String()
	}

	// 3. Pick a backend; whisper.cpp is only usable when a model file is available.
	backend, whisperPath := s.detectBackend()
	if backend == backendCpp {
		if modelPath := s.resolveCppModelPath(model); modelPath != "" {
			g.Log().Infof(ctx, "语音识别(whisper.cpp): audio=%s, model=%s", req.AudioPath, modelPath)
			return s.transcribeWithCpp(ctx, req, whisperPath, modelPath, language)
		}
		g.Log().Warningf(ctx, "whisper.cpp 模型文件(%s)未找到，降级到 Python whisper", model)
		backend = backendPython
	}

	if backend == backendCLI {
		g.Log().Infof(ctx, "语音识别(CLI): audio=%s, model=%s, language=%s", req.AudioPath, model, language)
		return s.transcribeWithCLI(ctx, req, whisperPath, model, language)
	}

	g.Log().Infof(ctx, "语音识别(python): audio=%s, model=%s, language=%s", req.AudioPath, model, language)
	return s.transcribeWithPython(ctx, req, model, language)
}

// transcribeWithCLI transcribes req.AudioPath by running the openai-whisper
// command-line tool located at whisperPath.
//
// The txt output is written next to the audio file and read back through
// readTxtResult. The thread count comes from `whisper.threads` (default 2);
// `--model_dir` is only passed when `whisper.model_dir` is configured.
// The returned result carries the language that was requested, matching
// what the whisper.cpp backend reports.
func (s *whisperService) transcribeWithCLI(ctx context.Context, req *TranscribeReq, whisperPath, model, language string) (res *TranscribeRes, err error) {
	outputDir := filepath.Dir(req.AudioPath)
	modelDir := g.Cfg().MustGet(ctx, "whisper.model_dir", "").String()
	threads := g.Cfg().MustGet(ctx, "whisper.threads", 2).Int()

	args := []string{
		req.AudioPath,
		"--model", model,
		"--language", language,
		"--output_dir", outputDir,
		"--output_format", "txt",
		"--threads", fmt.Sprintf("%d", threads),
	}
	if modelDir != "" {
		args = append(args, "--model_dir", modelDir)
	}

	output, execErr := exec.CommandContext(ctx, whisperPath, args...).CombinedOutput()
	if execErr != nil {
		// The tool's combined output only goes to the log; the returned error
		// wraps execErr so callers can still inspect it with errors.Is/As.
		g.Log().Errorf(ctx, "whisper CLI 执行失败: %v\n%s", execErr, string(output))
		return nil, fmt.Errorf("语音识别失败: %w", execErr)
	}

	res, err = s.readTxtResult(outputDir, req.AudioPath, model)
	if err != nil {
		return nil, err
	}
	res.Language = language
	return res, nil
}

// transcribeWithPython transcribes req.AudioPath by running
// `python -m whisper`, preferring `python3` and falling back to `python`.
//
// The txt output is written next to the audio file and read back through
// readTxtResult. The thread count comes from `whisper.threads` (default 2);
// `--model_dir` is only passed when `whisper.model_dir` is configured.
// The returned result carries the language that was requested, matching
// what the whisper.cpp backend reports.
func (s *whisperService) transcribeWithPython(ctx context.Context, req *TranscribeReq, model, language string) (res *TranscribeRes, err error) {
	// Locate a Python interpreter on PATH.
	pythonPath, lookErr := exec.LookPath("python3")
	if lookErr != nil {
		if pythonPath, lookErr = exec.LookPath("python"); lookErr != nil {
			return nil, fmt.Errorf("未找到 python，请安装: pip3 install openai-whisper")
		}
	}

	outputDir := filepath.Dir(req.AudioPath)
	modelDir := g.Cfg().MustGet(ctx, "whisper.model_dir", "").String()
	threads := g.Cfg().MustGet(ctx, "whisper.threads", 2).Int()

	args := []string{
		"-m", "whisper",
		req.AudioPath,
		"--model", model,
		"--language", language,
		"--output_dir", outputDir,
		"--output_format", "txt",
		"--threads", fmt.Sprintf("%d", threads),
	}
	if modelDir != "" {
		args = append(args, "--model_dir", modelDir)
	}

	output, execErr := exec.CommandContext(ctx, pythonPath, args...).CombinedOutput()
	if execErr != nil {
		// The tool's combined output only goes to the log; the returned error
		// wraps execErr so callers can still inspect it with errors.Is/As.
		g.Log().Errorf(ctx, "whisper(python) 执行失败: %v\n%s", execErr, string(output))
		return nil, fmt.Errorf("语音识别失败: %w", execErr)
	}

	res, err = s.readTxtResult(outputDir, req.AudioPath, model)
	if err != nil {
		return nil, err
	}
	res.Language = language
	return res, nil
}

// readTxtResult loads the txt transcript that whisper wrote into outputDir
// for audioPath and returns it as a TranscribeRes (Text, Model, OutputPath).
//
// Several file names are tried in order and the first readable one wins.
// When none can be read, the error wraps the failure of the primary
// candidate so the offending path is visible to the caller.
func (s *whisperService) readTxtResult(outputDir, audioPath, model string) (res *TranscribeRes, err error) {
	fileName := filepath.Base(audioPath)
	baseName := strings.TrimSuffix(fileName, filepath.Ext(fileName))
	candidates := []string{
		filepath.Join(outputDir, baseName+".txt"),
		filepath.Join(outputDir, baseName+"."+model+".txt"),
		// NOTE(review): older openai-whisper releases appear to keep the audio
		// extension in the output name ("a.mp3.txt") — confirm against the
		// installed version.
		filepath.Join(outputDir, fileName+".txt"),
	}

	var firstErr error
	for _, p := range candidates {
		data, readErr := os.ReadFile(p)
		if readErr != nil {
			if firstErr == nil {
				firstErr = readErr
			}
			continue
		}
		return &TranscribeRes{
			Text:       cleanTranscript(string(data)),
			Model:      model,
			OutputPath: p,
		}, nil
	}
	return nil, fmt.Errorf("读取识别结果文件失败: %w", firstErr)
}

// cleanTranscript flattens a transcript onto one line: every CR / LF becomes
// a space, runs of spaces shrink to a single space, and surrounding
// whitespace is trimmed.
func cleanTranscript(text string) string {
	var sb strings.Builder
	sb.Grow(len(text))

	// Byte-wise scanning is safe: CR, LF and space are ASCII and never occur
	// inside a multi-byte UTF-8 sequence.
	lastWasSpace := false
	for i := 0; i < len(text); i++ {
		ch := text[i]
		if ch == '\r' || ch == '\n' {
			ch = ' '
		}
		if ch == ' ' && lastWasSpace {
			continue
		}
		lastWasSpace = ch == ' '
		sb.WriteByte(ch)
	}
	return strings.TrimSpace(sb.String())
}

// detectBackend picks the whisper backend to use and returns it together
// with the executable path (empty for the `python -m whisper` fallback).
//
// Lookup order:
//  1. `whisper-cpp` / `whisper-cli` on PATH (whisper.cpp)
//  2. setup.DetectedWhisperPath when it names a whisper.cpp binary
//  3. `whisper` on PATH (openai-whisper CLI)
//  4. setup.DetectedWhisperPath as an openai-whisper CLI
//  5. the `whisper.path` config value, classified by its file name
//
// When nothing is found, backendPython with an empty path is returned.
func (s *whisperService) detectBackend() (WhisperBackend, string) {
	// isCpp reports whether path names a whisper.cpp binary, judged by file
	// name only (an optional ".exe" suffix is ignored).
	isCpp := func(path string) bool {
		name := strings.TrimSuffix(filepath.Base(path), ".exe")
		return name == "whisper-cpp" || name == "whisper-cli"
	}
	exists := func(path string) bool {
		_, err := os.Stat(path)
		return err == nil
	}

	// 1. Prefer whisper.cpp on PATH (its argument format differs from the CLI).
	for _, name := range []string{"whisper-cpp", "whisper-cli"} {
		if path, err := exec.LookPath(name); err == nil {
			return backendCpp, path
		}
	}

	detected := setup.DetectedWhisperPath
	detectedUsable := detected != "" && exists(detected)

	// 2. whisper.cpp binary found by the setup package.
	if detectedUsable && isCpp(detected) {
		return backendCpp, detected
	}

	// 3. openai-whisper CLI on PATH.
	if path, err := exec.LookPath("whisper"); err == nil {
		return backendCLI, path
	}

	// 4. Any other executable found by the setup package is treated as the CLI.
	if detectedUsable {
		return backendCLI, detected
	}

	// 5. Explicitly configured path. It is classified by name so that a
	// configured whisper.cpp binary is not invoked with openai-whisper flags.
	if p := g.Cfg().MustGet(context.Background(), "whisper.path", "").String(); p != "" && exists(p) {
		if isCpp(p) {
			return backendCpp, p
		}
		return backendCLI, p
	}

	return backendPython, ""
}

// resolveCppModelPath returns the path of the whisper.cpp model file for
// model, downloading it when it is not present locally. It returns "" when
// no model file could be found or downloaded.
//
// model may be given with or without the "ggml-" prefix and ".bin" suffix.
// Locations are checked in this order: ~/.cache/whisper/ggml-{model}.bin,
// the working directory, the q5_0 variant in ~/.cache/whisper, then the
// platform-specific share directories. A missing model is downloaded into
// ~/.cache/whisper, trying hf-mirror.com before huggingface.co.
func (s *whisperService) resolveCppModelPath(model string) string {
	ctx := context.TODO()

	modelName := strings.TrimPrefix(model, "ggml-")
	modelName = strings.TrimSuffix(modelName, ".bin")
	// The name ends up in a file path and in a download URL, so anything
	// that could leave the model directory is rejected up front.
	if modelName == "" || strings.ContainsAny(modelName, `/\`) || strings.Contains(modelName, "..") {
		g.Log().Warningf(ctx, "[whisper.cpp] 模型名称不合法: %q", model)
		return ""
	}
	cppModelName := "ggml-" + modelName + ".bin"

	// Download target: ~/.cache/whisper/ggml-{model}.bin. Without a home
	// directory only the non-home locations can be searched.
	var targetDir, targetPath string
	candidates := make([]string, 0, 5)
	if home, homeErr := os.UserHomeDir(); homeErr != nil {
		g.Log().Warningf(ctx, "[whisper.cpp] 无法获取用户主目录: %v", homeErr)
	} else {
		targetDir = filepath.Join(home, ".cache", "whisper")
		targetPath = filepath.Join(targetDir, cppModelName)
		candidates = append(candidates, targetPath)
	}

	// Other common locations.
	candidates = append(candidates, cppModelName)
	if targetDir != "" {
		candidates = append(candidates, filepath.Join(targetDir, "ggml-"+modelName+"-q5_0.bin"))
	}
	switch runtime.GOOS {
	case "darwin":
		// Homebrew installs of whisper.cpp.
		candidates = append(candidates,
			"/opt/homebrew/share/whisper-cpp/models/"+cppModelName,
			"/usr/local/share/whisper-cpp/models/"+cppModelName,
		)
	case "linux":
		// Common system-wide install paths.
		candidates = append(candidates,
			"/usr/share/whisper-cpp/models/"+cppModelName,
			"/usr/local/share/whisper-cpp/models/"+cppModelName,
		)
	}
	for _, p := range candidates {
		if info, err := os.Stat(p); err == nil && !info.IsDir() {
			return p
		}
	}

	// Nothing local: download, which needs a known cache directory.
	if targetDir == "" {
		return ""
	}

	modelSize := map[string]string{
		"tiny":   "75MB",
		"base":   "150MB",
		"small":  "500MB",
		"medium": "1.5GB",
	}
	size, known := modelSize[modelName]
	if !known {
		size = "未知大小"
	}

	// Sources: hf-mirror first (reachable from mainland China), then the official host.
	modelPath := fmt.Sprintf("ggerganov/whisper.cpp/resolve/main/%s", cppModelName)
	urls := []string{
		fmt.Sprintf("https://hf-mirror.com/%s", modelPath),
		fmt.Sprintf("https://huggingface.co/%s", modelPath),
	}

	g.Log().Infof(ctx, "[whisper.cpp] 正在下载模型 %s (%s)...", cppModelName, size)

	if err := os.MkdirAll(targetDir, 0755); err != nil {
		g.Log().Errorf(ctx, "[whisper.cpp] 创建模型目录失败: %v", err)
		return ""
	}

	// Try each source in turn and stop at the first success.
	var lastErr error
	for _, url := range urls {
		g.Log().Infof(ctx, "[whisper.cpp] 下载地址: %s", url)
		err := s.downloadFile(url, targetPath, 5*time.Minute)
		if err == nil {
			g.Log().Infof(ctx, "[whisper.cpp] 模型下载完成: %s", targetPath)
			return targetPath
		}
		lastErr = err
		g.Log().Warningf(ctx, "[whisper.cpp] 从 %s 下载失败: %v，尝试下一个源...", url, err)
	}

	g.Log().Errorf(ctx, "[whisper.cpp] 所有下载源均失败: %v", lastErr)
	return ""
}

// downloadFile fetches url with an HTTP GET and stores the body at destPath.
//
// timeout bounds the whole request, including reading the body. The body is
// streamed into destPath+".tmp" and renamed onto destPath only after the
// file has been written and closed successfully, so destPath never holds a
// partial download. The temporary file is removed on every failure path.
// A non-200 response is reported as an error.
func (s *whisperService) downloadFile(url, destPath string, timeout time.Duration) error {
	client := &http.Client{Timeout: timeout}
	resp, err := client.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("HTTP %d", resp.StatusCode)
	}

	tmpPath := destPath + ".tmp"
	out, err := os.Create(tmpPath)
	if err != nil {
		return fmt.Errorf("创建临时文件失败: %w", err)
	}

	written, copyErr := io.Copy(out, resp.Body)
	// Close before renaming: a failed Close can mean lost data, and some
	// platforms refuse to rename a file that is still open.
	if closeErr := out.Close(); copyErr == nil {
		copyErr = closeErr
	}
	if copyErr != nil {
		_ = os.Remove(tmpPath) // best-effort cleanup of the partial file
		return copyErr
	}

	if err := os.Rename(tmpPath, destPath); err != nil {
		_ = os.Remove(tmpPath) // best-effort cleanup
		return fmt.Errorf("文件重命名失败: %w", err)
	}

	g.Log().Infof(context.TODO(), "[whisper.cpp] 下载完成: %d bytes", written)
	return nil
}

// transcribeWithCpp transcribes req.AudioPath with the whisper.cpp binary at
// binaryPath, whose argument format differs from the openai-whisper CLI.
//
// model is passed to `-m` as-is; Transcribe calls this with the path of the
// model file, and the same value is echoed in the result's Model field.
// The thread count comes from `whisper.threads` (default 2). The transcript
// is written to "<audio path without extension>.txt" and read back from there.
func (s *whisperService) transcribeWithCpp(ctx context.Context, req *TranscribeReq, binaryPath, model, language string) (res *TranscribeRes, err error) {
	outputDir := filepath.Dir(req.AudioPath)
	baseName := strings.TrimSuffix(filepath.Base(req.AudioPath), filepath.Ext(req.AudioPath))
	outputPrefix := filepath.Join(outputDir, baseName)
	threads := g.Cfg().MustGet(ctx, "whisper.threads", 2).Int()

	// whisper.cpp arguments:
	//   -f input.mp3         input file
	//   -l zh                language
	//   -t 2                 thread count
	//   -otxt                write a txt file
	//   -of /path/prefix     output file prefix (".txt" is appended)
	//   -m model             model file
	args := []string{
		"-f", req.AudioPath,
		"-l", language,
		"-t", fmt.Sprintf("%d", threads),
		"-otxt",
		"-of", outputPrefix,
		"-m", model,
	}

	output, execErr := exec.CommandContext(ctx, binaryPath, args...).CombinedOutput()
	if execErr != nil {
		// The tool's combined output only goes to the log; the returned error
		// wraps execErr so callers can still inspect it with errors.Is/As.
		g.Log().Errorf(ctx, "whisper.cpp 执行失败: %v\n%s", execErr, string(output))
		return nil, fmt.Errorf("语音识别失败: %w", execErr)
	}

	// whisper.cpp writes its transcript to {prefix}.txt.
	txtPath := outputPrefix + ".txt"
	textBytes, readErr := os.ReadFile(txtPath)
	if readErr != nil {
		return nil, fmt.Errorf("读取识别结果文件失败: %w", readErr)
	}

	return &TranscribeRes{
		Text:       cleanTranscript(string(textBytes)),
		Model:      model,
		Language:   language,
		OutputPath: txtPath,
	}, nil
}