Skip to content

Commit

Permalink
Merge pull request #9 from FolhaSP/fix/dynamic-news-videos
Browse files Browse the repository at this point in the history
fix: make news generator videos more dynamic
  • Loading branch information
leodiegues authored Nov 22, 2024
2 parents 56323dd + 63d9a46 commit d7d9eab
Show file tree
Hide file tree
Showing 7 changed files with 184 additions and 77 deletions.
11 changes: 5 additions & 6 deletions src/mosaico/script_generators/news/generator.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from typing import TYPE_CHECKING, Any, Sequence
from typing import TYPE_CHECKING, Any, Literal, Sequence

from pydantic import BaseModel
from pydantic_extra_types.language_code import LanguageAlpha2

from mosaico.assets.types import AssetType
from mosaico.media import Media
from mosaico.script_generators.news.prompts import (
MEDIA_SUGGESTING_PROMPT,
Expand All @@ -23,10 +22,10 @@ class ParagraphMediaSuggestion(BaseModel):
paragraph: str
"""The paragraph content to which the media object corresponds."""

media_id: str
"""The media reference for the shot."""
media_ids: list[str]
"""The media IDs for the shot."""

type: AssetType
type: Literal["image", "video", "audio"]
"""The type of media (image, video, or audio)."""

relevance: str
Expand All @@ -44,7 +43,7 @@ class NewsVideoScriptGenerator:
def __init__(
self,
context: str,
model: str = "gpt-4o",
model: str = "claude-3-5-sonnet-20241022",
model_params: dict[str, Any] | None = None,
api_key: str | None = None,
base_url: str | None = None,
Expand Down
73 changes: 50 additions & 23 deletions src/mosaico/script_generators/news/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,17 @@
"""
INSTRUCTIONS:
You are a helpful news assistant tasked with summarizing the key points of the following context for a journalist
in paragraphs. Your summary should be concise, informative, and capture the most important details of the context.
The summary will be used by the journalist to produce a self-contained shooting script for an informative video
based on the context provided.
in paragraphs. Your summary should be very concise, informative, and capture the most important details of the
context. The summary will be used by the journalist to produce a self-contained shooting script for an informative
video based on the context provided.
OUTPUT GUIDELINES:
- The summary should have {num_paragraphs} paragraphs.
- Each paragraph should be 1 sentence long.
- The summary should have {num_paragraphs} short paragraphs.
- Each paragraph should be a very short sentence.
- Adhere to the best practices of journalistic writing.
- Make the paragraphs follow the chronology of the context.
- Make sure the first paragraph is the lead of the story.
- Make sure the last paragraph is the conclusion of the story.
- Return only the paragraphs in {language} without any additional information.
CONTEXT:
Expand All @@ -30,39 +32,64 @@
the visual appeal and storytelling of an informative video. Your selections should be relevant, engaging, and
directly correspond to the content of each paragraph.
From the media objects provided, you will select items that best match the content of each paragraph. Your goal
is to choose media that will enhance the viewer's understanding and create a compelling visual narrative.
From the media objects provided, your goal is to choose media that will enhance the viewer's understanding and
create a compelling visual narrative. Make sure each suggested media object is thoughtfully integrated to enhance
the narrative flow.
OUTPUT GUIDELINES:
- For each paragraph, select one media object from the provided collection
- Use 1-2 media objects for each paragraph, but try to use as many as possible.
- The video should be dynamic, so be sure to select different media objects for different shots.
- Only select media objects that are available in the provided collection
- Avoid selecting the same media object for multiple paragraphs
- Answer only with the structured response format in the same language as the paragraphs
PARAGRAPHS:
{paragraphs}
- Each media object should be used only once.
- If there are characters, places, or things in the paragraph, select a media object that shows the character, place,
or thing.
- Answer only with the structured response format in the same language as the paragraphs.
EXAMPLE:
Paragraph 1: "The president of the United States, Joe Biden, visited the White House on Tuesday."
Paragraph 2: "He met with the vice president, Kamala Harris."
Shot 1:
Paragraph 1: "The president of the United States, Joe Biden, visited the White House on Tuesday."
Media References:
- Media Object: "joe-biden-walking"
Description: "President Joe Biden walking towards the White House"
Type: "video"
Relevance: "Shows the president walking towards the White House"
- Media Object: "white-house-exterior"
Description: "The White House exterior"
Type: "image"
Relevance: "Shows the White House exterior"
Shot 2:
Paragraph 2: "He met with the vice president, Kamala Harris."
Media References:
- Media Object: "biden-meeting-kamala-harris"
Description: "President Joe Biden and Vice President Kamala Harris meeting"
Type: "video"
Relevance: "Shows the president and vice president meeting"
AVAILABLE MEDIA OBJECTS:
{media_objects}
PARAGRAPHS:
{paragraphs}
SUGGESTIONS:
"""
).strip()


SHOOTING_SCRIPT_PROMPT = textwrap.dedent(
"""
INSTRUCTIONS:
You are an experienced journalist and scriptwriter tasked with creating a detailed shooting script for an
informative video based on the following paragraphs and media objects. Your script should suggest specific
shot, effects, and narration that effectively tell the story while incorporating the media assets.
The script should maintain journalistic standards of accuracy and objectivity while being engaging for viewers.
Make sure each suggested media object is thoughtfully integrated to enhance the narrative flow.
You are an experienced video editor tasked with creating a shooting script for an informative video based on the
following paragraphs and media objects. Your script should suggest effects and timings for the media objects to
create a visually engaging video.
OUTPUT GUIDELINES:
- Provide a detailed shooting script that includes shots, effects, and timings.
- Use the paragraphs as subtitles for each shot. Keep them as they are.
- Keep the paragraphs and media objects as they are. Avoid changing them.
- Use the paragraphs as subtitles for the shots.
- Add timings to the media objects. Make sure they do not overlap.
- Respond only with the structured output format in the same language as the paragraphs.
PARAGRAPHS AND MEDIA OBJECTS SUGGESTIONS:
Expand Down
57 changes: 47 additions & 10 deletions src/mosaico/script_generators/script.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,42 @@
from __future__ import annotations

from typing import Literal

from pydantic import BaseModel, PositiveInt
from pydantic.fields import Field
from pydantic.functional_validators import model_validator
from pydantic.types import NonNegativeFloat, PositiveFloat
from typing_extensions import Self

from mosaico.effects.types import VideoEffectType


class ShotMediaReference(BaseModel):
    """A timed reference to a media object used within a shot.

    Pins one media object (by ID) to a time window inside the shot and
    optionally attaches video effects. Validation guarantees the window is
    well-formed (``start_time`` strictly before ``end_time``).
    """

    media_id: str
    """The ID of the media object."""

    type: Literal["image", "video"]
    """The type of the media object."""

    start_time: NonNegativeFloat
    """The start time of the media object in seconds."""

    end_time: PositiveFloat
    """The end time of the media object in seconds."""

    effects: list[VideoEffectType] = Field(default_factory=list)
    """The effects applied to the media object."""

    @model_validator(mode="after")
    def _validate_time_range(self) -> Self:
        """Ensure the reference's time window is well-formed.

        :raises ValueError: if ``start_time`` is not strictly less than ``end_time``.
        """
        # A zero-length or inverted window would produce an invisible or
        # negative-duration clip downstream, so reject it at model creation.
        if self.start_time >= self.end_time:
            raise ValueError("The start time must be less than the end time.")
        return self


class Shot(BaseModel):
"""A shot for a script."""

Expand All @@ -14,20 +46,25 @@ class Shot(BaseModel):
description: str
"""The description of the shot."""

start_time: NonNegativeFloat
"""The start time of the shot in seconds."""

end_time: PositiveFloat
"""The end time of the shot in seconds."""

subtitle: str
"""The subtitle for the shot."""

media_id: str
"""The media reference for the shot."""
media_references: list[ShotMediaReference] = Field(default_factory=list)
"""The media references for the shot."""

effects: list[VideoEffectType] = Field(default_factory=list)
"""The effects applied to the shot."""
@property
def start_time(self) -> float:
"""The start time of the shot in seconds."""
if not self.media_references:
return 0
return min(media_reference.start_time for media_reference in self.media_references)

@property
def end_time(self) -> float:
"""The end time of the shot in seconds."""
if not self.media_references:
return 0
return max(media_reference.end_time for media_reference in self.media_references)

@property
def duration(self) -> float:
Expand Down
113 changes: 78 additions & 35 deletions src/mosaico/video/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,28 +125,44 @@ def from_script_generator(
# Generate assets and scenes from a scene generator.
script = script_generator.generate(media, **kwargs)

# Create assets and asset references from the script.
# Create assets and scenes from the script.
for shot in script.shots:
referenced_media = next(m for m in media if m.id == shot.media_id)
# Create subtitle asset
shot_subtitle = SubtitleAsset.from_data(shot.subtitle)
shot_effects = [create_effect(effect) for effect in shot.effects]
shot_asset = convert_media_to_asset(referenced_media)
shot_scene = (
Scene(description=shot.description)
.add_asset_references(
AssetReference.from_asset(shot_subtitle)
.with_start_time(shot.start_time)
.with_end_time(shot.end_time)
)
.add_asset_references(
AssetReference.from_asset(shot_asset)
.with_start_time(shot.start_time)
.with_end_time(shot.end_time)
.with_effects(shot_effects if shot_asset.type == "image" else [])
)

# Create scene with initial subtitle reference
scene = Scene(description=shot.description).add_asset_references(
AssetReference.from_asset(shot_subtitle).with_start_time(shot.start_time).with_end_time(shot.end_time)
)

project = project.add_assets(shot_asset).add_assets(shot_subtitle).add_timeline_events(shot_scene)
# Add subtitle asset to project
project = project.add_assets(shot_subtitle)

# Process each media reference in the shot
for media_ref in shot.media_references:
# Find the referenced media
referenced_media = next(m for m in media if m.id == media_ref.media_id)

# Convert media to asset
media_asset = convert_media_to_asset(referenced_media)

# Create asset reference with timing and effects
asset_ref = (
AssetReference.from_asset(media_asset)
.with_start_time(media_ref.start_time)
.with_end_time(media_ref.end_time)
)

# Add effects if it's an image asset
if media_asset.type == "image" and media_ref.effects:
asset_ref = asset_ref.with_effects([create_effect(effect) for effect in media_ref.effects])

# Add media asset and its reference to the scene
project = project.add_assets(media_asset)
scene = scene.add_asset_references(asset_ref)

# Add completed scene to project timeline
project = project.add_timeline_events(scene)

return project

Expand Down Expand Up @@ -239,12 +255,13 @@ def add_narration(self, speech_synthesizer: SpeechSynthesizer) -> VideoProject:
"""
Add narration to subtitles inside Scene objects by generating speech audio from subtitle text.
Updates other asset timings within each Scene based on generated speech duration.
Updates asset timings within each Scene to match narration duration, dividing time equally
between multiple images.
:param speech_synthesizer: The speech synthesizer to use for generating narration audio
:return: The updated project with narration added
"""
current_time = 0
current_time = None

for i, scene in enumerate(self.timeline.sort()):
if not isinstance(scene, Scene):
Expand All @@ -266,15 +283,44 @@ def add_narration(self, speech_synthesizer: SpeechSynthesizer) -> VideoProject:
# Add narration assets to project
self.add_assets(narration_assets)

# Calculate new duration based on narration
total_narration_duration = sum(asset.duration for asset in narration_assets)
# Calculate total narration duration for this scene
total_narration_duration = sum(narration.duration for narration in narration_assets)

# Get non-subtitle assets to adjust timing
non_subtitle_refs = [ref for ref in scene.asset_references if ref.asset_type != "subtitle"]
image_refs = [ref for ref in non_subtitle_refs if ref.asset_type == "image"]
other_refs = [ref for ref in non_subtitle_refs if ref.asset_type != "image"]

if current_time is None:
current_time = scene.asset_references[0].start_time

# Create new asset references with scaled timing
new_refs = []
for ref in scene.asset_references:
new_start = current_time
new_end = current_time + total_narration_duration
new_ref = ref.model_copy().with_start_time(new_start).with_end_time(new_end)

# Adjust image timings - divide narration duration equally
if image_refs:
time_per_image = total_narration_duration / len(image_refs)
for idx, ref in enumerate(image_refs):
new_start = current_time + (idx * time_per_image)
new_end = new_start + time_per_image
new_ref = ref.model_copy().with_start_time(new_start).with_end_time(new_end)
new_refs.append(new_ref)

# Add other non-image assets with full narration duration
for ref in other_refs:
new_ref = (
ref.model_copy()
.with_start_time(current_time)
.with_end_time(current_time + total_narration_duration)
)
new_refs.append(new_ref)

# Add subtitle references spanning full narration duration
for ref in subtitle_refs:
new_ref = (
ref.model_copy()
.with_start_time(current_time)
.with_end_time(current_time + total_narration_duration)
)
new_refs.append(new_ref)

# Add narration references
Expand All @@ -286,12 +332,13 @@ def add_narration(self, speech_synthesizer: SpeechSynthesizer) -> VideoProject:
)
new_refs.append(narration_ref)

# Update current_time for next scene
current_time += total_narration_duration

# Create new scene with updated references
new_scene = scene.model_copy(update={"asset_references": new_refs})
self.timeline[i] = new_scene

current_time += total_narration_duration

return self

def add_captions(
Expand Down Expand Up @@ -330,19 +377,15 @@ def add_captions(
if ref.asset_type == "subtitle":
self.remove_asset(ref.asset_id)

current_time = 0
total_phrase_duration = sum(phrase[-1].end_time - phrase[0].start_time for phrase in phrases)

# Calculate time scale factor if needed
time_scale = scene.duration / total_phrase_duration if total_phrase_duration > 0 else 1.0
current_time = scene.start_time

for i, phrase in enumerate(phrases):
for phrase in phrases:
subtitle_text = " ".join(word.text for word in phrase)
subtitle = SubtitleAsset.from_data(subtitle_text)

# Calculate scaled duration
phrase_duration = (phrase[-1].end_time - phrase[0].start_time) * time_scale
phrase_duration = phrase[-1].end_time - phrase[0].start_time

start_time = current_time
end_time = start_time + phrase_duration
Expand Down
1 change: 1 addition & 0 deletions src/mosaico/video/rendering.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ def _render_event_clips(
video_clips = []

for asset, asset_ref in asset_and_ref_pairs:
print(asset.type, asset_ref.start_time, asset_ref.end_time, asset_ref.effects)
clip = make_clip(asset, asset_ref.duration, video_resolution, asset_ref.effects)
clip = clip.set_start(asset_ref.start_time)

Expand Down
Loading

0 comments on commit d7d9eab

Please sign in to comment.