Files
media/service/asr/transcribe_service.go
2026-05-19 14:33:06 +08:00

233 lines
6.4 KiB
Go

package asr
import (
"context"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"
dto "media/model/dto/audio"
serviceAudio "media/service/audio"
serviceScene "media/service/scene"
"github.com/gogf/gf/v2/frame/g"
)
// VideoTranscribeReq is the input of transcribeService.TranscribeVideo.
type VideoTranscribeReq struct {
	// VideoPath is the video file to transcribe. TranscribeVideo removes this
	// file after a successful transcription.
	VideoPath string

	// Model is forwarded unchanged to the Whisper transcription request.
	Model string

	// Language is forwarded unchanged to the Whisper transcription request.
	Language string

	// KeepAudio keeps the extracted audio file (and the ".txt" files next to
	// it) on disk after a successful transcription. When false they are
	// removed.
	KeepAudio bool
}
// VideoTranscribeRes is the result of transcribeService.TranscribeVideo.
type VideoTranscribeRes struct {
	// Text is the recognized text reported by Whisper.
	Text string `json:"text"`

	// Model is the model name reported by Whisper.
	Model string `json:"model"`

	// Language is the language reported by Whisper.
	Language string `json:"language"`

	// AudioPath is the path of the extracted audio file. It is filled in even
	// when the file itself was removed because KeepAudio was false.
	AudioPath string `json:"audioPath"`

	// AudioSize is the size reported by the audio extractor.
	// NOTE(review): presumably bytes — confirm against the audio service.
	AudioSize int64 `json:"audioSize"`

	// AudioDuration is the duration string reported by the audio extractor.
	AudioDuration string `json:"audioDuration"`
}
// transcribeService groups the video transcription operations. It has no
// fields, so it carries no state of its own.
type transcribeService struct{}

// VideoTranscribe is the package-level transcribeService instance.
var VideoTranscribe = &transcribeService{}
// TranscribeWithURLs downloads every URL in req.VideoURLs into the temp
// directory and transcribes the files that were downloaded successfully.
//
// Downloads are best-effort: a URL that fails to download is skipped without
// an entry in the result. An error is returned only when
//   - req is nil or contains no URLs,
//   - the temp directory cannot be created, or
//   - every single download failed.
//
// Per-video processing failures are not returned as an error; they are
// reported inside the corresponding result item by processVideos.
func (s *transcribeService) TranscribeWithURLs(ctx context.Context, req *dto.TranscribeReq) (res *dto.TranscribeRes, err error) {
	if req == nil || len(req.VideoURLs) == 0 {
		return nil, errors.New("video_urls 不能为空")
	}

	// Fail early with the real cause instead of letting every download fail
	// later on an unusable directory.
	tempDir := getTempDir(ctx)
	if err = os.MkdirAll(tempDir, 0755); err != nil {
		return nil, fmt.Errorf("创建临时目录失败: %w", err)
	}

	savePaths := make([]string, 0, len(req.VideoURLs))
	for _, videoURL := range req.VideoURLs {
		savePath, dlErr := downloadFromURL(ctx, videoURL, tempDir)
		if dlErr != nil {
			// Deliberately best-effort: one bad URL must not abort the batch.
			continue
		}
		savePaths = append(savePaths, savePath)
	}
	if len(savePaths) == 0 {
		return nil, errors.New("所有视频下载均失败")
	}

	results := s.processVideos(ctx, savePaths, req.Model, req.Language, req.Threshold)
	return &dto.TranscribeRes{Results: results}, nil
}
// TranscribeUpload transcribes video files that are already stored on disk at
// savePaths. It delegates to processVideos and returns its result unchanged,
// one item per path.
func (s *transcribeService) TranscribeUpload(ctx context.Context, savePaths []string, model, language string, threshold float64) []dto.TranscribeItem {
	items := s.processVideos(ctx, savePaths, model, language, threshold)
	return items
}
// processVideos handles the videos one at a time, in the given order, and
// returns one TranscribeItem per path.
//
// For each video it runs scene analysis and then speech-to-text. Scene
// analysis is optional: when it fails or yields no analysis the item simply
// has no scene summary. A transcription failure is recorded in the item's
// Error field and the video file is removed; processing continues with the
// next video.
func (s *transcribeService) processVideos(ctx context.Context, savePaths []string, model, language string, threshold float64) []dto.TranscribeItem {
	var items []dto.TranscribeItem

	for _, videoPath := range savePaths {
		// Reported name: the base name with everything up to and including
		// the first "_" dropped (downloadFromURL saves files as
		// "<unix-millis>_<name>"). A name that starts with "_" is left as is.
		displayName := filepath.Base(videoPath)
		if sep := strings.Index(displayName, "_"); sep >= 1 {
			displayName = displayName[sep+1:]
		}

		// Scene analysis; errors are ignored on purpose.
		analyzeReq := &serviceScene.SceneAnalyzeReq{
			VideoPaths:       []string{videoPath},
			Threshold:        threshold,
			ExtractKeyframes: false,
		}
		var sceneSummary *dto.SceneSummaryDTO
		if analyzed, analyzeErr := serviceScene.SceneAnalyzer.Analyze(ctx, analyzeReq); analyzeErr == nil && len(analyzed.Analyses) > 0 {
			sceneSummary = toSceneDTO(&analyzed.Analyses[0])
		}

		// Speech-to-text. On success TranscribeVideo removes the video file
		// itself; on failure it is removed here.
		transcript, transcribeErr := s.TranscribeVideo(ctx, &VideoTranscribeReq{
			VideoPath: videoPath,
			Model:     model,
			Language:  language,
		})

		item := dto.TranscribeItem{FileName: displayName}
		if transcribeErr != nil {
			os.Remove(videoPath)
			item.Error = transcribeErr.Error()
		} else {
			item.Result = &dto.TranscribeResult{
				Text:          transcript.Text,
				Model:         transcript.Model,
				Language:      transcript.Language,
				AudioPath:     transcript.AudioPath,
				AudioSize:     transcript.AudioSize,
				AudioDuration: transcript.AudioDuration,
				Scenes:        sceneSummary,
			}
		}
		items = append(items, item)
	}

	return items
}
// TranscribeVideo extracts the audio track of req.VideoPath as MP3 and runs
// Whisper speech recognition on it.
//
// Files touched, depending on the outcome:
//   - audio extraction fails: nothing is removed; the video stays with the caller.
//   - recognition fails: the extracted audio file is removed.
//   - success: the video file is removed. Unless req.KeepAudio is set, the
//     audio file and the files "<audio-base>.txt" and
//     "<audio-base>.<model>.txt" are removed as well (presumably transcript
//     files written by Whisper — confirm against the Whisper wrapper).
//
// All removals are best-effort; their errors are ignored because a leftover
// temp file must not turn a successful transcription into a failure.
//
// The returned AudioPath is always populated, even when the audio file has
// just been removed.
//
// Returned errors wrap the underlying cause, so errors.Is / errors.As can
// inspect it.
func (s *transcribeService) TranscribeVideo(ctx context.Context, req *VideoTranscribeReq) (res *VideoTranscribeRes, err error) {
	audioReq := &serviceAudio.ExtractAudioReq{VideoPath: req.VideoPath, Format: "mp3"}
	audioRes, err := serviceAudio.AudioExtract.Extract(ctx, audioReq)
	if err != nil {
		return nil, fmt.Errorf("音频提取失败: %w", err)
	}

	whisperRes, err := Whisper.Transcribe(ctx, &TranscribeReq{
		AudioPath: audioRes.AudioPath,
		Model:     req.Model,
		Language:  req.Language,
	})
	if err != nil {
		os.Remove(audioRes.AudioPath)
		return nil, fmt.Errorf("语音识别失败: %w", err)
	}

	// The video is no longer needed once the text has been produced.
	os.Remove(req.VideoPath)
	if !req.KeepAudio {
		os.Remove(audioRes.AudioPath)
		baseName := strings.TrimSuffix(audioRes.AudioPath, filepath.Ext(audioRes.AudioPath))
		os.Remove(baseName + ".txt")
		os.Remove(baseName + "." + whisperRes.Model + ".txt")
	}

	return &VideoTranscribeRes{
		Text:          whisperRes.Text,
		Model:         whisperRes.Model,
		Language:      whisperRes.Language,
		AudioPath:     audioRes.AudioPath,
		AudioSize:     audioRes.Size,
		AudioDuration: audioRes.Duration,
	}, nil
}
// downloadFromURL streams the resource at rawURL into tempDir and returns the
// path of the saved file.
//
// The file is stored as "<unix-millis>_<name>", where <name> is the last
// segment of the URL path, or "video_<unix-millis>.mp4" when that segment is
// empty. processVideos relies on this "<prefix>_" shape to recover <name>.
//
// The request is bound to ctx, so cancelling ctx aborts the download; an
// overall limit of 10 minutes applies in any case. Any status other than
// 200 OK is reported as an error.
//
// On every error the returned path is empty and no partial file is left
// behind.
func downloadFromURL(ctx context.Context, rawURL, tempDir string) (string, error) {
	parsedURL, err := url.Parse(rawURL)
	if err != nil {
		return "", err
	}

	segments := strings.Split(parsedURL.Path, "/")
	fileName := segments[len(segments)-1]
	if fileName == "" {
		fileName = fmt.Sprintf("video_%d.mp4", time.Now().UnixMilli())
	}
	// The name comes from an untrusted URL. A segment cannot contain "/", but
	// on platforms with another path separator it could still carry one;
	// reduce it to a bare file name so the Join below cannot leave tempDir.
	fileName = filepath.Base(fileName)
	savePath := filepath.Join(tempDir, fmt.Sprintf("%d_%s", time.Now().UnixMilli(), fileName))

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
	if err != nil {
		return "", err
	}
	client := &http.Client{Timeout: 10 * time.Minute}
	resp, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("HTTP %d", resp.StatusCode)
	}

	out, err := os.Create(savePath)
	if err != nil {
		return "", err
	}

	// Close explicitly instead of deferring: a failed Close can mean the data
	// never reached the disk, and the file must be closed before it is
	// removed (removing an open file fails on some platforms).
	_, copyErr := io.Copy(out, resp.Body)
	closeErr := out.Close()
	if copyErr != nil || closeErr != nil {
		os.Remove(savePath)
		if copyErr != nil {
			return "", copyErr
		}
		return "", closeErr
	}
	return savePath, nil
}
// getTempDir returns the directory used for temporary media files.
//
// The value is read from the "ffmpeg.temp_dir" configuration key and falls
// back to "resource/temp" when the key is missing or empty. A relative path is
// converted to an absolute one; if that conversion fails the relative path is
// returned unchanged rather than an empty string.
//
// The directory is not created here.
func getTempDir(ctx context.Context) string {
	const defaultTempDir = "resource/temp"

	tempDir := g.Cfg().MustGet(ctx, "ffmpeg.temp_dir", defaultTempDir).String()
	if tempDir == "" {
		tempDir = defaultTempDir
	}
	if filepath.IsAbs(tempDir) {
		return tempDir
	}

	absDir, err := filepath.Abs(tempDir)
	if err != nil {
		// filepath.Abs returns "" on failure; a relative path is still usable,
		// an empty one is not.
		return tempDir
	}
	return absDir
}
// toSceneDTO converts a raw scene-service analysis into its DTO form.
//
// It returns nil for a nil analysis. Otherwise every scene is copied field by
// field, in order, and the video-level fields plus the summary's pacing and
// shot-type distribution are carried over. The Scenes slice of the result is
// never nil, even when the analysis has no scenes.
func toSceneDTO(analysis *serviceScene.VideoSceneAnalysis) *dto.SceneSummaryDTO {
	if analysis == nil {
		return nil
	}

	summary := &dto.SceneSummaryDTO{
		TotalScenes: analysis.TotalScenes,
		DurationStr: analysis.DurationStr,
		AspectRatio: analysis.AspectRatio,
		Orientation: analysis.Orientation,
		Pacing:      analysis.Summary.Pacing,
		ShotTypes:   analysis.Summary.ShotTypeDist,
		Scenes:      make([]dto.SceneShotDTO, len(analysis.Scenes)),
	}
	for i, shot := range analysis.Scenes {
		summary.Scenes[i] = dto.SceneShotDTO{
			SceneIndex:   shot.SceneIndex,
			StartTimeStr: shot.StartTimeStr,
			EndTimeStr:   shot.EndTimeStr,
			DurationStr:  shot.DurationStr,
			ShotType:     shot.ShotType,
			Composition:  shot.Composition,
			NarrativePos: shot.NarrativePos,
			Description:  shot.Description,
		}
	}
	return summary
}