233 lines
6.4 KiB
Go
233 lines
6.4 KiB
Go
package asr
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
|
|
dto "media/model/dto/audio"
|
|
serviceAudio "media/service/audio"
|
|
serviceScene "media/service/scene"
|
|
|
|
"github.com/gogf/gf/v2/frame/g"
|
|
)
|
|
|
|
// VideoTranscribeReq is the input of transcribeService.TranscribeVideo: it
// identifies one local video file and the speech-recognition options to use.
type VideoTranscribeReq struct {
	// VideoPath is the path of the video file on local disk. The file is
	// removed once transcription succeeds.
	VideoPath string
	// Model and Language are forwarded unchanged to the Whisper transcriber.
	Model, Language string
	// KeepAudio keeps the extracted audio file on disk after transcription;
	// when false the audio and its sibling .txt files are removed.
	KeepAudio bool
}
|
|
|
|
// VideoTranscribeRes is the result of transcribeService.TranscribeVideo.
type VideoTranscribeRes struct {
	// Recognition output, copied from the Whisper result.
	Text     string `json:"text"`
	Model    string `json:"model"`
	Language string `json:"language"`

	// Metadata of the extracted audio track, copied from the audio
	// extraction result.
	AudioPath     string `json:"audioPath"`
	AudioSize     int64  `json:"audioSize"` // presumably bytes — confirm against the audio service
	AudioDuration string `json:"audioDuration"`
}
|
|
|
|
// transcribeService groups the video transcription operations. It carries no
// state of its own.
type transcribeService struct{}

// VideoTranscribe is the package-level transcribeService instance.
var VideoTranscribe = &transcribeService{}
|
|
|
|
// TranscribeWithURLs 从 URL 下载视频并转录
|
|
func (s *transcribeService) TranscribeWithURLs(ctx context.Context, req *dto.TranscribeReq) (res *dto.TranscribeRes, err error) {
|
|
if len(req.VideoURLs) == 0 {
|
|
return nil, errors.New("video_urls 不能为空")
|
|
}
|
|
|
|
tempDir := getTempDir(ctx)
|
|
os.MkdirAll(tempDir, 0755)
|
|
|
|
var savePaths []string
|
|
for _, videoURL := range req.VideoURLs {
|
|
savePath, dlErr := downloadFromURL(ctx, videoURL, tempDir)
|
|
if dlErr != nil {
|
|
continue
|
|
}
|
|
savePaths = append(savePaths, savePath)
|
|
}
|
|
if len(savePaths) == 0 {
|
|
return nil, errors.New("所有视频下载均失败")
|
|
}
|
|
|
|
results := s.processVideos(ctx, savePaths, req.Model, req.Language, req.Threshold)
|
|
res = &dto.TranscribeRes{Results: results}
|
|
return
|
|
}
|
|
|
|
// TranscribeUpload 从已保存的文件转录
|
|
func (s *transcribeService) TranscribeUpload(ctx context.Context, savePaths []string, model, language string, threshold float64) []dto.TranscribeItem {
|
|
return s.processVideos(ctx, savePaths, model, language, threshold)
|
|
}
|
|
|
|
// processVideos 逐个处理视频
|
|
func (s *transcribeService) processVideos(ctx context.Context, savePaths []string, model, language string, threshold float64) []dto.TranscribeItem {
|
|
var results []dto.TranscribeItem
|
|
|
|
for _, savePath := range savePaths {
|
|
fileName := filepath.Base(savePath)
|
|
if idx := strings.Index(fileName, "_"); idx > 0 {
|
|
fileName = fileName[idx+1:]
|
|
}
|
|
|
|
// 场景分析
|
|
var scenes *dto.SceneSummaryDTO
|
|
sceneRes, sceneErr := serviceScene.SceneAnalyzer.Analyze(ctx, &serviceScene.SceneAnalyzeReq{
|
|
VideoPaths: []string{savePath},
|
|
Threshold: threshold,
|
|
ExtractKeyframes: false,
|
|
})
|
|
if sceneErr == nil && len(sceneRes.Analyses) > 0 {
|
|
scenes = toSceneDTO(&sceneRes.Analyses[0])
|
|
}
|
|
|
|
// 语音转文字(内部删除视频文件)
|
|
transRes, transErr := s.TranscribeVideo(ctx, &VideoTranscribeReq{
|
|
VideoPath: savePath,
|
|
Model: model,
|
|
Language: language,
|
|
})
|
|
if transErr != nil {
|
|
os.Remove(savePath)
|
|
results = append(results, dto.TranscribeItem{FileName: fileName, Error: transErr.Error()})
|
|
continue
|
|
}
|
|
|
|
results = append(results, dto.TranscribeItem{
|
|
FileName: fileName,
|
|
Result: &dto.TranscribeResult{
|
|
Text: transRes.Text,
|
|
Model: transRes.Model,
|
|
Language: transRes.Language,
|
|
AudioPath: transRes.AudioPath,
|
|
AudioSize: transRes.AudioSize,
|
|
AudioDuration: transRes.AudioDuration,
|
|
Scenes: scenes,
|
|
},
|
|
})
|
|
}
|
|
return results
|
|
}
|
|
|
|
// TranscribeVideo 从视频提取音频并转为文字
|
|
func (s *transcribeService) TranscribeVideo(ctx context.Context, req *VideoTranscribeReq) (res *VideoTranscribeRes, err error) {
|
|
audioReq := &serviceAudio.ExtractAudioReq{VideoPath: req.VideoPath, Format: "mp3"}
|
|
audioRes, err := serviceAudio.AudioExtract.Extract(ctx, audioReq)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("音频提取失败: %v", err)
|
|
}
|
|
|
|
whisperRes, err := Whisper.Transcribe(ctx, &TranscribeReq{AudioPath: audioRes.AudioPath, Model: req.Model, Language: req.Language})
|
|
if err != nil {
|
|
os.Remove(audioRes.AudioPath)
|
|
return nil, fmt.Errorf("语音识别失败: %v", err)
|
|
}
|
|
|
|
os.Remove(req.VideoPath)
|
|
if !req.KeepAudio {
|
|
os.Remove(audioRes.AudioPath)
|
|
baseName := strings.TrimSuffix(audioRes.AudioPath, filepath.Ext(audioRes.AudioPath))
|
|
os.Remove(baseName + ".txt")
|
|
os.Remove(baseName + "." + whisperRes.Model + ".txt")
|
|
}
|
|
|
|
res = &VideoTranscribeRes{
|
|
Text: whisperRes.Text,
|
|
Model: whisperRes.Model,
|
|
Language: whisperRes.Language,
|
|
AudioPath: audioRes.AudioPath,
|
|
AudioSize: audioRes.Size,
|
|
AudioDuration: audioRes.Duration,
|
|
}
|
|
return
|
|
}
|
|
|
|
// downloadFromURL fetches rawURL with an HTTP GET and streams the response
// body into a new file under tempDir.
//
// The file is named "<unix-millis>_<last path segment of the URL>"; when the
// URL path has no last segment a generated "video_<unix-millis>.mp4" name is
// used instead. tempDir must already exist.
//
// The request is bound to ctx, so cancelling ctx aborts the download; an
// overall timeout of 10 minutes applies in any case. Any response status
// other than 200 is reported as an error.
//
// It returns the path of the saved file. On any error the returned path is
// empty and no partial file is left behind.
func downloadFromURL(ctx context.Context, rawURL, tempDir string) (string, error) {
	parsedURL, err := url.Parse(rawURL)
	if err != nil {
		return "", err
	}
	segments := strings.Split(parsedURL.Path, "/")
	fileName := segments[len(segments)-1]
	if fileName == "" {
		fileName = fmt.Sprintf("video_%d.mp4", time.Now().UnixMilli())
	}
	savePath := filepath.Join(tempDir, fmt.Sprintf("%d_%s", time.Now().UnixMilli(), fileName))

	// Attach ctx to the request so that a cancelled caller stops the transfer.
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
	if err != nil {
		return "", err
	}
	client := &http.Client{Timeout: 10 * time.Minute}
	resp, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("HTTP %d", resp.StatusCode)
	}

	out, err := os.Create(savePath)
	if err != nil {
		return "", err
	}

	_, err = io.Copy(out, resp.Body)
	// Close before inspecting the result: buffered data may only fail to be
	// written at close time, and an open file cannot be removed on every OS.
	if closeErr := out.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		os.Remove(savePath)
		return "", err
	}
	return savePath, nil
}
|
|
|
|
func getTempDir(ctx context.Context) string {
|
|
tempDir := g.Cfg().MustGet(ctx, "ffmpeg.temp_dir", "resource/temp").String()
|
|
if tempDir == "" {
|
|
tempDir = "resource/temp"
|
|
}
|
|
if !filepath.IsAbs(tempDir) {
|
|
absDir, _ := filepath.Abs(tempDir)
|
|
tempDir = absDir
|
|
}
|
|
return tempDir
|
|
}
|
|
|
|
// toSceneDTO 将场景服务的原始结果转为 DTO 格式
|
|
func toSceneDTO(analysis *serviceScene.VideoSceneAnalysis) *dto.SceneSummaryDTO {
|
|
if analysis == nil {
|
|
return nil
|
|
}
|
|
shots := make([]dto.SceneShotDTO, 0, len(analysis.Scenes))
|
|
for _, s := range analysis.Scenes {
|
|
shots = append(shots, dto.SceneShotDTO{
|
|
SceneIndex: s.SceneIndex,
|
|
StartTimeStr: s.StartTimeStr,
|
|
EndTimeStr: s.EndTimeStr,
|
|
DurationStr: s.DurationStr,
|
|
ShotType: s.ShotType,
|
|
Composition: s.Composition,
|
|
NarrativePos: s.NarrativePos,
|
|
Description: s.Description,
|
|
})
|
|
}
|
|
return &dto.SceneSummaryDTO{
|
|
TotalScenes: analysis.TotalScenes,
|
|
DurationStr: analysis.DurationStr,
|
|
AspectRatio: analysis.AspectRatio,
|
|
Orientation: analysis.Orientation,
|
|
Pacing: analysis.Summary.Pacing,
|
|
ShotTypes: analysis.Summary.ShotTypeDist,
|
|
Scenes: shots,
|
|
}
|
|
}
|