package asr import ( "context" "errors" "fmt" "io" "net/http" "net/url" "os" "path/filepath" "strings" "time" dto "media/model/dto/audio" serviceAudio "media/service/audio" serviceScene "media/service/scene" "github.com/gogf/gf/v2/frame/g" ) // VideoTranscribeReq 视频语音识别请求 type VideoTranscribeReq struct { VideoPath string Model string Language string KeepAudio bool } // VideoTranscribeRes 视频语音识别响应 type VideoTranscribeRes struct { Text string `json:"text"` Model string `json:"model"` Language string `json:"language"` AudioPath string `json:"audioPath"` AudioSize int64 `json:"audioSize"` AudioDuration string `json:"audioDuration"` } type transcribeService struct{} var VideoTranscribe = new(transcribeService) // TranscribeWithURLs 从 URL 下载视频并转录 func (s *transcribeService) TranscribeWithURLs(ctx context.Context, req *dto.TranscribeReq) (res *dto.TranscribeRes, err error) { if len(req.VideoURLs) == 0 { return nil, errors.New("video_urls 不能为空") } tempDir := getTempDir(ctx) os.MkdirAll(tempDir, 0755) var savePaths []string for _, videoURL := range req.VideoURLs { savePath, dlErr := downloadFromURL(ctx, videoURL, tempDir) if dlErr != nil { continue } savePaths = append(savePaths, savePath) } if len(savePaths) == 0 { return nil, errors.New("所有视频下载均失败") } results := s.processVideos(ctx, savePaths, req.Model, req.Language, req.Threshold) res = &dto.TranscribeRes{Results: results} return } // TranscribeUpload 从已保存的文件转录 func (s *transcribeService) TranscribeUpload(ctx context.Context, savePaths []string, model, language string, threshold float64) []dto.TranscribeItem { return s.processVideos(ctx, savePaths, model, language, threshold) } // processVideos 逐个处理视频 func (s *transcribeService) processVideos(ctx context.Context, savePaths []string, model, language string, threshold float64) []dto.TranscribeItem { var results []dto.TranscribeItem for _, savePath := range savePaths { fileName := filepath.Base(savePath) if idx := strings.Index(fileName, "_"); idx > 0 { fileName = fileName[idx+1:] } // 场景分析 var scenes *dto.SceneSummaryDTO sceneRes, sceneErr := serviceScene.SceneAnalyzer.Analyze(ctx, &serviceScene.SceneAnalyzeReq{ VideoPaths: []string{savePath}, Threshold: threshold, ExtractKeyframes: false, }) if sceneErr == nil && len(sceneRes.Analyses) > 0 { scenes = toSceneDTO(&sceneRes.Analyses[0]) } // 语音转文字(内部删除视频文件) transRes, transErr := s.TranscribeVideo(ctx, &VideoTranscribeReq{ VideoPath: savePath, Model: model, Language: language, }) if transErr != nil { os.Remove(savePath) results = append(results, dto.TranscribeItem{FileName: fileName, Error: transErr.Error()}) continue } results = append(results, dto.TranscribeItem{ FileName: fileName, Result: &dto.TranscribeResult{ Text: transRes.Text, Model: transRes.Model, Language: transRes.Language, AudioPath: transRes.AudioPath, AudioSize: transRes.AudioSize, AudioDuration: transRes.AudioDuration, Scenes: scenes, }, }) } return results } // TranscribeVideo 从视频提取音频并转为文字 func (s *transcribeService) TranscribeVideo(ctx context.Context, req *VideoTranscribeReq) (res *VideoTranscribeRes, err error) { audioReq := &serviceAudio.ExtractAudioReq{VideoPath: req.VideoPath, Format: "mp3"} audioRes, err := serviceAudio.AudioExtract.Extract(ctx, audioReq) if err != nil { return nil, fmt.Errorf("音频提取失败: %v", err) } whisperRes, err := Whisper.Transcribe(ctx, &TranscribeReq{AudioPath: audioRes.AudioPath, Model: req.Model, Language: req.Language}) if err != nil { os.Remove(audioRes.AudioPath) return nil, fmt.Errorf("语音识别失败: %v", err) } os.Remove(req.VideoPath) if !req.KeepAudio { os.Remove(audioRes.AudioPath) baseName := strings.TrimSuffix(audioRes.AudioPath, filepath.Ext(audioRes.AudioPath)) os.Remove(baseName + ".txt") os.Remove(baseName + "." + whisperRes.Model + ".txt") } res = &VideoTranscribeRes{ Text: whisperRes.Text, Model: whisperRes.Model, Language: whisperRes.Language, AudioPath: audioRes.AudioPath, AudioSize: audioRes.Size, AudioDuration: audioRes.Duration, } return } func downloadFromURL(ctx context.Context, rawURL, tempDir string) (string, error) { parsedURL, err := url.Parse(rawURL) if err != nil { return "", err } segments := strings.Split(parsedURL.Path, "/") fileName := segments[len(segments)-1] if fileName == "" { fileName = fmt.Sprintf("video_%d.mp4", time.Now().UnixMilli()) } savePath := filepath.Join(tempDir, fmt.Sprintf("%d_%s", time.Now().UnixMilli(), fileName)) client := &http.Client{Timeout: 10 * time.Minute} resp, err := client.Get(rawURL) if err != nil { return "", err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { return "", fmt.Errorf("HTTP %d", resp.StatusCode) } out, err := os.Create(savePath) if err != nil { return "", err } defer out.Close() _, err = io.Copy(out, resp.Body) if err != nil { os.Remove(savePath) } return savePath, err } func getTempDir(ctx context.Context) string { tempDir := g.Cfg().MustGet(ctx, "ffmpeg.temp_dir", "resource/temp").String() if tempDir == "" { tempDir = "resource/temp" } if !filepath.IsAbs(tempDir) { absDir, _ := filepath.Abs(tempDir) tempDir = absDir } return tempDir } // toSceneDTO 将场景服务的原始结果转为 DTO 格式 func toSceneDTO(analysis *serviceScene.VideoSceneAnalysis) *dto.SceneSummaryDTO { if analysis == nil { return nil } shots := make([]dto.SceneShotDTO, 0, len(analysis.Scenes)) for _, s := range analysis.Scenes { shots = append(shots, dto.SceneShotDTO{ SceneIndex: s.SceneIndex, StartTimeStr: s.StartTimeStr, EndTimeStr: s.EndTimeStr, DurationStr: s.DurationStr, ShotType: s.ShotType, Composition: s.Composition, NarrativePos: s.NarrativePos, Description: s.Description, }) } return &dto.SceneSummaryDTO{ TotalScenes: analysis.TotalScenes, DurationStr: analysis.DurationStr, AspectRatio: analysis.AspectRatio, Orientation: analysis.Orientation, Pacing: analysis.Summary.Pacing, ShotTypes: analysis.Summary.ShotTypeDist, Scenes: shots, } }