// media/service/asr/transcribe_service.go

package asr

import (
	"context"
	"errors"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"strings"
	"time"

	dto "media/model/dto/audio"
	serviceAudio "media/service/audio"
	serviceScene "media/service/scene"

	"github.com/gogf/gf/v2/frame/g"
)

// VideoTranscribeReq is the input of TranscribeVideo: one video file to be
// converted to text.
type VideoTranscribeReq struct {
	// VideoPath is the video file to process. TranscribeVideo removes this
	// file after a successful transcription.
	VideoPath string
	// Model is forwarded to Whisper.Transcribe.
	Model     string
	// Language is forwarded to Whisper.Transcribe.
	Language  string
	// KeepAudio, when false, makes TranscribeVideo delete the extracted audio
	// file and its sibling .txt files once transcription succeeds.
	KeepAudio bool
}

// VideoTranscribeRes is the result of TranscribeVideo.
//
// Text, Model and Language are copied from the Whisper result; AudioPath,
// AudioSize and AudioDuration are copied from the audio extraction result.
// NOTE(review): AudioPath is still populated when KeepAudio is false, in which
// case the file it names has already been removed.
type VideoTranscribeRes struct {
	Text          string `json:"text"`
	Model         string `json:"model"`
	Language      string `json:"language"`
	AudioPath     string `json:"audioPath"`
	AudioSize     int64  `json:"audioSize"`
	AudioDuration string `json:"audioDuration"`
}

// transcribeService is a field-less type that carries the video
// transcription methods of this package.
type transcribeService struct{}

// VideoTranscribe is the package-level transcribeService instance through
// which the transcription methods are called.
var VideoTranscribe = new(transcribeService)

// TranscribeWithURLs downloads every video listed in req.VideoURLs into the
// temp directory returned by getTempDir and hands the downloaded files to
// processVideos.
//
// A failed download does not abort the batch: the URL is skipped and the
// remaining ones are still processed. An error is returned when req is nil or
// has no URLs, when the temp directory cannot be created, or when every
// download fails; in the last case the error lists the failure of each URL.
func (s *transcribeService) TranscribeWithURLs(ctx context.Context, req *dto.TranscribeReq) (res *dto.TranscribeRes, err error) {
	if req == nil || len(req.VideoURLs) == 0 {
		return nil, errors.New("video_urls 不能为空")
	}

	tempDir := getTempDir(ctx)
	if err = os.MkdirAll(tempDir, 0755); err != nil {
		return nil, fmt.Errorf("创建临时目录失败: %w", err)
	}

	savePaths := make([]string, 0, len(req.VideoURLs))
	// failures keeps one "<url>: <error>" entry per failed download so the
	// causes are not lost when nothing could be fetched.
	var failures []string
	for _, videoURL := range req.VideoURLs {
		savePath, dlErr := downloadFromURL(ctx, videoURL, tempDir)
		if dlErr != nil {
			failures = append(failures, fmt.Sprintf("%v: %v", videoURL, dlErr))
			continue
		}
		savePaths = append(savePaths, savePath)
	}
	if len(savePaths) == 0 {
		return nil, fmt.Errorf("所有视频下载均失败: %s", strings.Join(failures, "; "))
	}

	results := s.processVideos(ctx, savePaths, req.Model, req.Language, req.Threshold)
	return &dto.TranscribeRes{Results: results}, nil
}

// TranscribeUpload transcribes video files that are already saved on disk.
// It forwards all arguments to processVideos unchanged and returns its
// per-file results.
func (s *transcribeService) TranscribeUpload(ctx context.Context, savePaths []string, model, language string, threshold float64) []dto.TranscribeItem {
	return s.processVideos(ctx, savePaths, model, language, threshold)
}

// processVideos handles the given video files one after another and returns
// one TranscribeItem per input path, in input order.
//
// For each file it runs scene analysis (best effort: a failure only leaves
// Scenes nil) and then speech-to-text via TranscribeVideo. When transcription
// fails the video file is removed here and the item carries the error text
// instead of a result. A nil slice is returned for empty input.
func (s *transcribeService) processVideos(ctx context.Context, savePaths []string, model, language string, threshold float64) []dto.TranscribeItem {
	if len(savePaths) == 0 {
		return nil
	}
	results := make([]dto.TranscribeItem, 0, len(savePaths))

	for _, savePath := range savePaths {
		// Report the name without the leading "<prefix>_" part; downloadFromURL
		// prepends a millisecond timestamp in that form.
		fileName := filepath.Base(savePath)
		if idx := strings.Index(fileName, "_"); idx > 0 {
			fileName = fileName[idx+1:]
		}

		// Scene analysis. Must run before TranscribeVideo, which deletes the
		// video file on success.
		var scenes *dto.SceneSummaryDTO
		sceneRes, sceneErr := serviceScene.SceneAnalyzer.Analyze(ctx, &serviceScene.SceneAnalyzeReq{
			VideoPaths:       []string{savePath},
			Threshold:        threshold,
			ExtractKeyframes: false,
		})
		// Guard against a nil response paired with a nil error.
		if sceneErr == nil && sceneRes != nil && len(sceneRes.Analyses) > 0 {
			scenes = toSceneDTO(&sceneRes.Analyses[0])
		}

		// Speech-to-text; removes the video file itself when it succeeds.
		transRes, transErr := s.TranscribeVideo(ctx, &VideoTranscribeReq{
			VideoPath: savePath,
			Model:     model,
			Language:  language,
		})
		if transErr != nil {
			// Best-effort cleanup of the video left behind by the failed run.
			os.Remove(savePath)
			results = append(results, dto.TranscribeItem{FileName: fileName, Error: transErr.Error()})
			continue
		}

		results = append(results, dto.TranscribeItem{
			FileName: fileName,
			Result: &dto.TranscribeResult{
				Text:          transRes.Text,
				Model:         transRes.Model,
				Language:      transRes.Language,
				AudioPath:     transRes.AudioPath,
				AudioSize:     transRes.AudioSize,
				AudioDuration: transRes.AudioDuration,
				Scenes:        scenes,
			},
		})
	}
	return results
}

// TranscribeVideo extracts an mp3 audio track from req.VideoPath and converts
// it to text with Whisper.
//
// On success the video file is removed; unless req.KeepAudio is set, the
// extracted audio file and the "<audio>.txt" / "<audio>.<model>.txt" files
// next to it are removed as well. All removals are best effort and their
// errors are ignored. If audio extraction fails the video file is left in
// place; if recognition fails the audio file is removed and the video file is
// left in place.
//
// Returned errors wrap the underlying cause, so callers can inspect it with
// errors.Is / errors.As.
func (s *transcribeService) TranscribeVideo(ctx context.Context, req *VideoTranscribeReq) (res *VideoTranscribeRes, err error) {
	audioReq := &serviceAudio.ExtractAudioReq{VideoPath: req.VideoPath, Format: "mp3"}
	audioRes, err := serviceAudio.AudioExtract.Extract(ctx, audioReq)
	if err != nil {
		return nil, fmt.Errorf("音频提取失败: %w", err)
	}

	whisperRes, err := Whisper.Transcribe(ctx, &TranscribeReq{AudioPath: audioRes.AudioPath, Model: req.Model, Language: req.Language})
	if err != nil {
		os.Remove(audioRes.AudioPath)
		return nil, fmt.Errorf("语音识别失败: %w", err)
	}

	// Best-effort cleanup of the inputs and intermediate files.
	os.Remove(req.VideoPath)
	if !req.KeepAudio {
		os.Remove(audioRes.AudioPath)
		baseName := strings.TrimSuffix(audioRes.AudioPath, filepath.Ext(audioRes.AudioPath))
		os.Remove(baseName + ".txt")
		os.Remove(baseName + "." + whisperRes.Model + ".txt")
	}

	// NOTE(review): AudioPath is reported even when the audio file was just
	// removed above; callers should not assume the file still exists.
	return &VideoTranscribeRes{
		Text:          whisperRes.Text,
		Model:         whisperRes.Model,
		Language:      whisperRes.Language,
		AudioPath:     audioRes.AudioPath,
		AudioSize:     audioRes.Size,
		AudioDuration: audioRes.Duration,
	}, nil
}

// downloadFromURL fetches rawURL with an HTTP GET and stores the body in
// tempDir as "<unix-millis>_<name>", where <name> is the last segment of the
// URL path (or "video_<unix-millis>.mp4" when that segment is empty).
//
// The request is bound to ctx, so cancelling ctx aborts the download; an
// overall client timeout of 10 minutes also applies. Any response status
// other than 200 is reported as an error. On every error path the returned
// path is empty and no partially written file is left behind.
func downloadFromURL(ctx context.Context, rawURL, tempDir string) (string, error) {
	parsedURL, err := url.Parse(rawURL)
	if err != nil {
		return "", err
	}
	segments := strings.Split(parsedURL.Path, "/")
	fileName := segments[len(segments)-1]
	if fileName == "" {
		fileName = fmt.Sprintf("video_%d.mp4", time.Now().UnixMilli())
	}
	savePath := filepath.Join(tempDir, fmt.Sprintf("%d_%s", time.Now().UnixMilli(), fileName))

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
	if err != nil {
		return "", err
	}
	client := &http.Client{Timeout: 10 * time.Minute}
	resp, err := client.Do(httpReq)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("HTTP %d", resp.StatusCode)
	}

	out, err := os.Create(savePath)
	if err != nil {
		return "", err
	}

	_, err = io.Copy(out, resp.Body)
	// Close explicitly (not deferred): a failed Close can mean the data never
	// reached disk, and the file must be closed before it can be removed on
	// some platforms.
	if closeErr := out.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		os.Remove(savePath)
		return "", err
	}
	return savePath, nil
}

// getTempDir returns the directory used for temporary media files.
//
// The value is read from the "ffmpeg.temp_dir" configuration key and falls
// back to "resource/temp" when the key is unset or empty. A relative path is
// converted to an absolute one; if that conversion fails the relative path is
// returned as is rather than an empty string.
func getTempDir(ctx context.Context) string {
	tempDir := g.Cfg().MustGet(ctx, "ffmpeg.temp_dir", "resource/temp").String()
	if tempDir == "" {
		tempDir = "resource/temp"
	}
	if filepath.IsAbs(tempDir) {
		return tempDir
	}
	absDir, err := filepath.Abs(tempDir)
	if err != nil {
		// Abs returns "" on failure; keep the usable relative path instead.
		return tempDir
	}
	return absDir
}

// toSceneDTO maps a scene-service analysis onto the DTO returned to clients.
// A nil analysis yields nil. Every scene is copied field by field into a
// SceneShotDTO, and the video-level and summary fields are copied onto the
// enclosing SceneSummaryDTO.
func toSceneDTO(analysis *serviceScene.VideoSceneAnalysis) *dto.SceneSummaryDTO {
	if analysis == nil {
		return nil
	}

	converted := make([]dto.SceneShotDTO, 0, len(analysis.Scenes))
	for _, scene := range analysis.Scenes {
		shot := dto.SceneShotDTO{
			SceneIndex:   scene.SceneIndex,
			StartTimeStr: scene.StartTimeStr,
			EndTimeStr:   scene.EndTimeStr,
			DurationStr:  scene.DurationStr,
			ShotType:     scene.ShotType,
			Composition:  scene.Composition,
			NarrativePos: scene.NarrativePos,
			Description:  scene.Description,
		}
		converted = append(converted, shot)
	}

	summary := &dto.SceneSummaryDTO{
		TotalScenes: analysis.TotalScenes,
		DurationStr: analysis.DurationStr,
		AspectRatio: analysis.AspectRatio,
		Orientation: analysis.Orientation,
		Pacing:      analysis.Summary.Pacing,
		ShotTypes:   analysis.Summary.ShotTypeDist,
		Scenes:      converted,
	}
	return summary
}