feat(component,video): add task to embed audio to video (#939)

What this PR does: - Implement a pipeline task to embed an audio input to a video input. The end result is the original video with audio stream being replaced by the provided audio. CC @xiaofei-du @pinglin
instill-ai · Dec 23, 2024 · 1aa40c2 · 1aa40c2
1 parent 804e56a
commit 1aa40c2
Show file tree

Hide file tree

Showing 11 changed files with 360 additions and 1 deletion.
diff --git a/go.mod b/go.mod
@@ -64,6 +64,7 @@ require (
 	github.com/slack-go/slack v0.12.5
 	github.com/tmc/langchaingo v0.1.10
 	github.com/u2takey/ffmpeg-go v0.5.0
+	github.com/warmans/ffmpeg-go v1.0.0
 	github.com/weaviate/weaviate v1.26.0-rc.1
 	github.com/weaviate/weaviate-go-client/v4 v4.15.0
 	github.com/xuri/excelize/v2 v2.8.1

diff --git a/go.sum b/go.sum
@@ -1924,6 +1924,8 @@ github.com/vishvananda/netns v0.0.0-20180720170159-13995c7128cc/go.mod h1:ZjcWmF
 github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df/go.mod h1:JP3t17pCcGlemwknint6hfoeCVQrEMVwxRLRjXpq+BU=
 github.com/vishvananda/netns v0.0.0-20200728191858-db3c7e526aae/go.mod h1:DD4vA1DwXk04H54A1oHXtwZmA0grkVMdPxx/VGLCah0=
 github.com/vishvananda/netns v0.0.0-20210104183010-2eb08e3e575f/go.mod h1:DD4vA1DwXk04H54A1oHXtwZmA0grkVMdPxx/VGLCah0=
+github.com/warmans/ffmpeg-go v1.0.0 h1:t79UD/WpAeyGnPep5XkZAUTisIVhNbllSty74Ry9tM0=
+github.com/warmans/ffmpeg-go v1.0.0/go.mod h1:c383/BhqdlGa+G2BY+b549J8ChhyL3oeVbDginBDr4M=
 github.com/weaviate/weaviate v1.26.0-rc.1 h1:p+8Cw4VfAbevtf90/sEN+43IHMBtdUc5ZH3r4NZKVR8=
 github.com/weaviate/weaviate v1.26.0-rc.1/go.mod h1:o6nFEB4UozA+B2fnAWyF0HZqB2ab44pRhJHYwvxVyyA=
 github.com/weaviate/weaviate-go-client/v4 v4.15.0 h1:+gSKFLpy6iXTDNtjgYFOuCj0RY7F+sICefKpZarnOuA=

diff --git a/pkg/component/operator/video/v0/README.mdx b/pkg/component/operator/video/v0/README.mdx
@@ -11,6 +11,7 @@ It can carry out the following tasks:
 - [Subsample](#subsample)
 - [Extract Audio](#extract-audio)
 - [Extract Frames](#extract-frames)
+- [Embed Audio](#embed-audio)
 
 
 
@@ -155,3 +156,56 @@ Extract image frames from a video at regular intervals or specified timestamps.
 
 
 
+### Embed Audio
+
+Given a pair of audio and video files, embed the audio to the video. The audio input will override any of the video's existing audio.
+
+<div class="markdown-col-no-wrap" data-col-1 data-col-2>
+
+ | Input | ID | Type | Description |
+ | :--- | :--- | :--- | :--- |
+ | Task ID (required) | `task` | string | `TASK_EMBED_AUDIO` |
+ | Video (required) | `video` | string | The source video file to be merged with the source audio file |
+ | Audio (required) | `audio` | string | The source audio file to embed to the source video file |
+</div>
+
+
+<div class="markdown-col-no-wrap" data-col-1 data-col-2>
+
+ | Output | ID | Type | Description |
+ | :--- | :--- | :--- | :--- |
+ | Video | `video` | string | The final video output with audio stream coming from the provided input. |
+</div>
+
+## Example Recipes
+
+Recipe for the `Embed audio` pipeline.
+
+```yaml
+version: v1beta
+
+variable:
+  video:
+    format: video
+    title: Input video
+    description: Video to embed to
+  audio:
+    format: audio
+    title: Input audio
+    description: Audio to embed from
+
+component:
+  video-0:
+    type: video
+    task: TASK_EMBED_AUDIO
+    input:
+      video: ${variable.video}
+      audio: ${variable.audio}
+
+output:
+  video:
+    title: Embedded video
+    value: ${video-0.output.video}
+
+```
+
diff --git a/pkg/component/operator/video/v0/config/definition.json b/pkg/component/operator/video/v0/config/definition.json
@@ -3,7 +3,8 @@
     "TASK_SEGMENT",
     "TASK_SUBSAMPLE",
     "TASK_EXTRACT_AUDIO",
-    "TASK_EXTRACT_FRAMES"
+    "TASK_EXTRACT_FRAMES",
+    "TASK_EMBED_AUDIO"
   ],
   "documentationUrl": "https://www.instill.tech/docs/component/operator/video",
   "icon": "assets/video.svg",

diff --git a/pkg/component/operator/video/v0/config/tasks.json b/pkg/component/operator/video/v0/config/tasks.json
@@ -247,6 +247,69 @@
       "type": "object"
     }
   },
+  "TASK_EMBED_AUDIO": {
+    "instillShortDescription": "Embed an audio to the existing video",
+    "input": {
+      "description": "Input.",
+      "instillEditOnNodeFields": [
+        "video",
+        "audio"
+      ],
+      "instillUIOrder": 0,
+      "properties": {
+        "audio": {
+          "description": "Audio data to embed to video",
+          "instillAcceptFormats": [
+            "audio/*",
+            "application/octet-stream"
+          ],
+          "instillUIOrder": 0,
+          "instillUpstreamTypes": [
+            "reference"
+          ],
+          "title": "Audio",
+          "type": "string"
+        },
+        "video": {
+          "description": "Video input to be embedded with provided audio",
+          "instillAcceptFormats": [
+            "video/*",
+            "application/octet-stream"
+          ],
+          "instillUIOrder": 1,
+          "instillUpstreamTypes": [
+            "reference"
+          ],
+          "title": "Video",
+          "type": "string"
+        }
+      },
+      "required": [
+        "video",
+        "audio"
+      ],
+      "title": "Input",
+      "type": "object"
+    },
+    "output": {
+      "description": "Output for embedding audio task",
+      "instillUIOrder": 0,
+      "properties": {
+        "video": {
+          "description": "Final video embedded with audio",
+          "instillFormat": "video/*",
+          "instillUIOrder": 0,
+          "title": "Video",
+          "type": "string"
+        }
+      },
+      "required": [
+        "video"
+      ],
+      "title": "Output",
+      "type": "object"
+    }
+  },
   "TASK_EXTRACT_FRAMES": {
     "instillShortDescription": "Extract image frames from a video at regular intervals or specified timestamps. This task takes a video input and either an interval value or a list of timestamps. It then produces an array of image frames corresponding to these specifications, allowing for precise capture of key moments or creation of a sequence of still images from the video content.",
     "input": {

diff --git a/pkg/component/operator/video/v0/io.go b/pkg/component/operator/video/v0/io.go
@@ -48,3 +48,12 @@ type extractFramesInput struct {
 type extractFramesOutput struct {
 	Frames []format.Image `instill:"frames"`
 }
+
+type embedAudioInput struct {
+	Video format.Video `instill:"video"`
+	Audio format.Audio `instill:"audio"`
+}
+
+type embedAudioOutput struct {
+	Video format.Video `instill:"video"`
+}
diff --git a/pkg/component/operator/video/v0/main.go b/pkg/component/operator/video/v0/main.go
@@ -16,6 +16,7 @@ const (
 	taskSubsample     = "TASK_SUBSAMPLE"
 	taskExtractAudio  = "TASK_EXTRACT_AUDIO"
 	taskExtractFrames = "TASK_EXTRACT_FRAMES"
+	taskEmbedAudio    = "TASK_EMBED_AUDIO"
 )
 
 var (
@@ -61,6 +62,8 @@ func (c *component) CreateExecution(x base.ComponentExecution) (base.IExecution,
 		e.execute = extractAudio
 	case taskExtractFrames:
 		e.execute = extractFrames
+	case taskEmbedAudio:
+		e.execute = embedAudio
 	default:
 		return nil, fmt.Errorf("%s task is not supported", x.Task)
 	}

diff --git a/pkg/component/operator/video/v0/task_embed_audio.go b/pkg/component/operator/video/v0/task_embed_audio.go
@@ -0,0 +1,109 @@
+package video
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"github.com/google/uuid"
+
+	ffmpeg "github.com/warmans/ffmpeg-go"
+
+	"github.com/instill-ai/pipeline-backend/pkg/component/base"
+	"github.com/instill-ai/pipeline-backend/pkg/data"
+)
+
+func embedAudio(ctx context.Context, job *base.Job) error {
+	var inputStruct embedAudioInput
+	if err := job.Input.ReadData(ctx, &inputStruct); err != nil {
+		return fmt.Errorf("reading input data: %w", err)
+	}
+
+	// Create temporary input video file
+	tempInputVideoFile, err := os.CreateTemp("", "temp-input-video-*.mp4")
+	if err != nil {
+		return fmt.Errorf("creating temp input video file: %w", err)
+	}
+	defer func() {
+		_ = os.Remove(tempInputVideoFile.Name())
+	}()
+
+	videoBytes, err := inputStruct.Video.Binary()
+	if err != nil {
+		return fmt.Errorf("getting video bytes: %w", err)
+	}
+
+	if err := os.WriteFile(tempInputVideoFile.Name(), videoBytes.ByteArray(), 0600); err != nil {
+		return fmt.Errorf("writing to temp input video file: %w", err)
+	}
+
+	// Create temporary input audio file
+	tempInputAudioFile, err := os.CreateTemp("", "temp-input-audio-*.mp3")
+	if err != nil {
+		return fmt.Errorf("creating temp input audio file: %w", err)
+	}
+	defer func() {
+		_ = os.Remove(tempInputAudioFile.Name())
+	}()
+
+	audioBytes, err := inputStruct.Audio.Binary()
+	if err != nil {
+		return fmt.Errorf("getting audio bytes: %w", err)
+	}
+
+	if err := os.WriteFile(tempInputAudioFile.Name(), audioBytes.ByteArray(), 0600); err != nil {
+		return fmt.Errorf("writing to temp input audio file: %w", err)
+	}
+
+	// Embed audio to video and write to a file
+	outputVideoFilePath, err := embedAudioToVideo(tempInputVideoFile.Name(), tempInputAudioFile.Name())
+	if err != nil {
+		return err
+	}
+	defer func() {
+		_ = os.Remove(outputVideoFilePath)
+	}()
+
+	// Read the output video file and export to standard output
+	outputVideoBytes, err := os.ReadFile(outputVideoFilePath)
+	if err != nil {
+		return fmt.Errorf("reading output video file: %w", err)
+	}
+
+	outputVideoData, err := data.NewVideoFromBytes(outputVideoBytes, "video/mp4", fmt.Sprintf("video-%s.mp4", uuid.New().String()))
+	if err != nil {
+		return fmt.Errorf("creating output video data: %w", err)
+	}
+
+	outputData := embedAudioOutput{
+		Video: outputVideoData,
+	}
+
+	if err := job.Output.WriteData(ctx, outputData); err != nil {
+		return fmt.Errorf("writing output data: %w", err)
+	}
+
+	return nil
+}
+
+func embedAudioToVideo(inputVideoFile string, inputAudioFile string) (string, error) {
+	outputFilePath := filepath.Join(os.TempDir(), fmt.Sprintf("video-%s.mp4", uuid.New().String()))
+
+	input := []*ffmpeg.Stream{ffmpeg.Input(inputVideoFile), ffmpeg.Input(inputAudioFile)}
+
+	// https://www.mux.com/articles/merge-audio-and-video-files-with-ffmpeg
+	// Workaround for multiple maps https://github.com/u2takey/ffmpeg-go/issues/1#issuecomment-2507904461
+	err := ffmpeg.Output(input, outputFilePath, ffmpeg.KwArgs{
+		"c:v":   "copy",
+		"c:a":   "aac",
+		"map_0": "0:v:0",
+		"map_1": "1:a:0",
+	}).OverWriteOutput().Run()
+
+	if err != nil {
+		return "", fmt.Errorf("embedding audio to video: %w", err)
+	}
+
+	return outputFilePath, nil
+}