diff --git a/src/mosaico/script_generators/news/generator.py b/src/mosaico/script_generators/news/generator.py
index e3b1222..f71e135 100644
--- a/src/mosaico/script_generators/news/generator.py
+++ b/src/mosaico/script_generators/news/generator.py
@@ -1,9 +1,8 @@
-from typing import TYPE_CHECKING, Any, Sequence
+from typing import TYPE_CHECKING, Any, Literal, Sequence
 
 from pydantic import BaseModel
 from pydantic_extra_types.language_code import LanguageAlpha2
 
-from mosaico.assets.types import AssetType
 from mosaico.media import Media
 from mosaico.script_generators.news.prompts import (
     MEDIA_SUGGESTING_PROMPT,
@@ -23,10 +22,10 @@ class ParagraphMediaSuggestion(BaseModel):
     paragraph: str
     """The paragraph content to which the media object corresponds."""
 
-    media_id: str
-    """The media reference for the shot."""
+    media_ids: list[str]
+    """The IDs of the media objects suggested for the paragraph."""
 
-    type: AssetType
+    type: Literal["image", "video", "audio"]
     """The type of media (image, video, or audio)."""
 
     relevance: str
@@ -44,7 +43,7 @@ class NewsVideoScriptGenerator:
     def __init__(
         self,
         context: str,
-        model: str = "gpt-4o",
+        model: str = "claude-3-5-sonnet-20241022",
         model_params: dict[str, Any] | None = None,
         api_key: str | None = None,
         base_url: str | None = None,
diff --git a/src/mosaico/script_generators/news/prompts.py b/src/mosaico/script_generators/news/prompts.py
index 1282dc9..0f95e47 100644
--- a/src/mosaico/script_generators/news/prompts.py
+++ b/src/mosaico/script_generators/news/prompts.py
@@ -5,15 +5,17 @@
     """
     INSTRUCTIONS:
     You are a helpful news assistant tasked with summarizing the key points of the following context for a journalist
-    in paragraphs. Your summary should be concise, informative, and capture the most important details of the context.
-    The summary will be used by the journalist to produce a self-contained shooting script for an informative video
-    based on the context provided.
-
+    in paragraphs. Your summary should be very concise, informative, and capture the most important details of the
+    context. The summary will be used by the journalist to produce a self-contained shooting script for an informative
+    video based on the context provided.
 
     OUTPUT GUIDELINES:
-    - The summary should have {num_paragraphs} paragraphs.
-    - Each paragraph should be 1 sentence long.
+    - The summary should have {num_paragraphs} short paragraphs.
+    - Each paragraph should be a very short sentence.
     - Adhere to the best practices of journalistic writing.
+    - Make the paragraphs follow the chronology of the context.
+    - Make sure the first paragraph is the lead of the story.
+    - Make sure the last paragraph is the conclusion of the story.
     - Return only the paragraphs in {language} without any additional information.
 
     CONTEXT:
@@ -30,39 +32,64 @@
     the visual appeal and storytelling of an informative video. Your selections should be relevant, engaging,
     and directly correspond to the content of each paragraph.
 
-    From the media objects provided, you will select items that best match the content of each paragraph. Your goal
-    is to choose media that will enhance the viewer's understanding and create a compelling visual narrative.
+    From the media objects provided, your goal is to choose media that will enhance the viewer's understanding and
+    create a compelling visual narrative. Make sure each suggested media object is thoughtfully integrated to enhance
+    the narrative flow.
 
     OUTPUT GUIDELINES:
-    - For each paragraph, select one media object from the provided collection
+    - Use 1-2 media objects for each paragraph, preferring 2 whenever suitable media are available.
+    - The video should be dynamic, so be sure to select different media objects for different shots.
     - Only select media objects that are available in the provided collection
-    - Avoid selecting the same media object for multiple paragraphs
-    - Answer only with the structured response format in the same language as the paragraphs
-
-    PARAGRAPHS:
-    {paragraphs}
+    - Each media object should be used only once.
+    - If the paragraph mentions specific characters, places, or things, select a media object that shows the
+      character, place, or thing.
+    - Answer only with the structured response format in the same language as the paragraphs.
+
+    EXAMPLE:
+    Paragraph 1: "The president of the United States, Joe Biden, visited the White House on Tuesday."
+    Paragraph 2: "He met with the vice president, Kamala Harris."
+
+    Shot 1:
+    Paragraph 1: "The president of the United States, Joe Biden, visited the White House on Tuesday."
+    Media References:
+    - Media Object: "joe-biden-walking"
+      Description: "President Joe Biden walking towards the White House"
+      Type: "video"
+      Relevance: "Shows the president walking towards the White House"
+    - Media Object: "white-house-exterior"
+      Description: "The White House exterior"
+      Type: "image"
+      Relevance: "Shows the White House exterior"
+
+    Shot 2:
+    Paragraph 2: "He met with the vice president, Kamala Harris."
+    Media References:
+    - Media Object: "biden-meeting-kamala-harris"
+      Description: "President Joe Biden and Vice President Kamala Harris meeting"
+      Type: "video"
+      Relevance: "Shows the president and vice president meeting"
 
     AVAILABLE MEDIA OBJECTS:
     {media_objects}
+
+    PARAGRAPHS:
+    {paragraphs}
 
     SUGGESTIONS:
     """
 ).strip()
 
-
 SHOOTING_SCRIPT_PROMPT = textwrap.dedent(
     """
     INSTRUCTIONS:
-    You are an experienced journalist and scriptwriter tasked with creating a detailed shooting script for an
-    informative video based on the following paragraphs and media objects. Your script should suggest specific
-    shot, effects, and narration that effectively tell the story while incorporating the media assets.
-
-    The script should maintain journalistic standards of accuracy and objectivity while being engaging for viewers.
-    Make sure each suggested media object is thoughtfully integrated to enhance the narrative flow.
+    You are an experienced video editor tasked with creating a shooting script for an informative video based on the
+    following paragraphs and media objects. Your script should suggest effects and timings for the media objects to
+    create a visually engaging video.
 
     OUTPUT GUIDELINES:
-    - Provide a detailed shooting script that includes shots, effects, and timings.
-    - Use the paragraphs as subtitles for each shot. Keep them as they are.
+    - Keep the paragraphs and media objects as they are. Avoid changing them.
+    - Use the paragraphs as subtitles for the shots.
+    - Add timings to the media objects. Make sure they do not overlap.
     - Respond only with the structured output format in the same language as the paragraphs.
 
     PARAGRAPHS AND MEDIA OBJECTS SUGGESTIONS:
diff --git a/src/mosaico/script_generators/script.py b/src/mosaico/script_generators/script.py
index dbdeda4..ead0116 100644
--- a/src/mosaico/script_generators/script.py
+++ b/src/mosaico/script_generators/script.py
@@ -1,10 +1,42 @@
+from __future__ import annotations
+
+from typing import Literal
+
 from pydantic import BaseModel, PositiveInt
 from pydantic.fields import Field
+from pydantic.functional_validators import model_validator
 from pydantic.types import NonNegativeFloat, PositiveFloat
+from typing_extensions import Self
 
 from mosaico.effects.types import VideoEffectType
 
 
+class ShotMediaReference(BaseModel):
+    """A reference to a media object."""
+
+    media_id: str
+    """The ID of the media object."""
+
+    type: Literal["image", "video"]
+    """The type of the media object."""
+
+    start_time: NonNegativeFloat
+    """The start time of the media object in seconds."""
+
+    end_time: PositiveFloat
+    """The end time of the media object in seconds."""
+
+    effects: list[VideoEffectType] = Field(default_factory=list)
+    """The effects applied to the media object."""
+
+    @model_validator(mode="after")
+    def _validate_media_references(self) -> Self:
+        """Ensure the start time is strictly less than the end time."""
+        if self.start_time >= self.end_time:
+            raise ValueError("The start time must be less than the end time.")
+        return self
+
+
 class Shot(BaseModel):
     """A shot for a script."""
 
@@ -14,20 +46,25 @@ class Shot(BaseModel):
     description: str
     """The description of the shot."""
 
-    start_time: NonNegativeFloat
-    """The start time of the shot in seconds."""
-
-    end_time: PositiveFloat
-    """The end time of the shot in seconds."""
-
     subtitle: str
     """The subtitle for the shot."""
 
-    media_id: str
-    """The media reference for the shot."""
+    media_references: list[ShotMediaReference] = Field(default_factory=list)
+    """The media references for the shot."""
 
-    effects: list[VideoEffectType] = Field(default_factory=list)
-    """The effects applied to the shot."""
+    @property
+    def start_time(self) -> float:
+        """The start time of the shot in seconds."""
+        if not self.media_references:
+            return 0
+        return min(media_reference.start_time for media_reference in self.media_references)
+
+    @property
+    def end_time(self) -> float:
+        """The end time of the shot in seconds."""
+        if not self.media_references:
+            return 0
+        return max(media_reference.end_time for media_reference in self.media_references)
 
     @property
     def duration(self) -> float:
diff --git a/src/mosaico/video/project.py b/src/mosaico/video/project.py
index 967e71e..58c9af4 100644
--- a/src/mosaico/video/project.py
+++ b/src/mosaico/video/project.py
@@ -125,28 +125,44 @@ def from_script_generator(
         # Generate assets and scenes from a scene generator.
         script = script_generator.generate(media, **kwargs)
 
-        # Create assets and asset references from the script.
+        # Create assets and scenes from the script.
         for shot in script.shots:
-            referenced_media = next(m for m in media if m.id == shot.media_id)
+            # Create subtitle asset
             shot_subtitle = SubtitleAsset.from_data(shot.subtitle)
-            shot_effects = [create_effect(effect) for effect in shot.effects]
-            shot_asset = convert_media_to_asset(referenced_media)
-            shot_scene = (
-                Scene(description=shot.description)
-                .add_asset_references(
-                    AssetReference.from_asset(shot_subtitle)
-                    .with_start_time(shot.start_time)
-                    .with_end_time(shot.end_time)
-                )
-                .add_asset_references(
-                    AssetReference.from_asset(shot_asset)
-                    .with_start_time(shot.start_time)
-                    .with_end_time(shot.end_time)
-                    .with_effects(shot_effects if shot_asset.type == "image" else [])
-                )
+
+            # Create scene with initial subtitle reference
+            scene = Scene(description=shot.description).add_asset_references(
+                AssetReference.from_asset(shot_subtitle).with_start_time(shot.start_time).with_end_time(shot.end_time)
             )
-            project = project.add_assets(shot_asset).add_assets(shot_subtitle).add_timeline_events(shot_scene)
 
+            # Add subtitle asset to project
+            project = project.add_assets(shot_subtitle)
+
+            # Process each media reference in the shot
+            for media_ref in shot.media_references:
+                # Find the referenced media
+                referenced_media = next(m for m in media if m.id == media_ref.media_id)
+
+                # Convert media to asset
+                media_asset = convert_media_to_asset(referenced_media)
+
+                # Create asset reference with timing and effects
+                asset_ref = (
+                    AssetReference.from_asset(media_asset)
+                    .with_start_time(media_ref.start_time)
+                    .with_end_time(media_ref.end_time)
+                )
+
+                # Add effects if it's an image asset
+                if media_asset.type == "image" and media_ref.effects:
+                    asset_ref = asset_ref.with_effects([create_effect(effect) for effect in media_ref.effects])
+
+                # Add media asset and its reference to the scene
+                project = project.add_assets(media_asset)
+                scene = scene.add_asset_references(asset_ref)
+
+            # Add completed scene to project timeline
+            project = project.add_timeline_events(scene)
 
         return project
 
@@ -239,12 +255,13 @@ def add_narration(self, speech_synthesizer: SpeechSynthesizer) -> VideoProject:
         """
         Add narration to subtitles inside Scene objects by generating speech audio from subtitle text.
 
-        Updates other asset timings within each Scene based on generated speech duration.
+        Updates asset timings within each Scene to match narration duration, dividing time equally
+        between multiple images.
        :param speech_synthesizer: The speech synthesizer to use for generating narration audio
        :return: The updated project with narration added
        """
-        current_time = 0
+        current_time = None
 
         for i, scene in enumerate(self.timeline.sort()):
             if not isinstance(scene, Scene):
@@ -266,15 +283,44 @@ def add_narration(self, speech_synthesizer: SpeechSynthesizer) -> VideoProject:
             # Add narration assets to project
             self.add_assets(narration_assets)
 
-            # Calculate new duration based on narration
-            total_narration_duration = sum(asset.duration for asset in narration_assets)
+            # Calculate total narration duration for this scene
+            total_narration_duration = sum(narration.duration for narration in narration_assets)
+
+            # Get non-subtitle assets to adjust timing
+            non_subtitle_refs = [ref for ref in scene.asset_references if ref.asset_type != "subtitle"]
+            image_refs = [ref for ref in non_subtitle_refs if ref.asset_type == "image"]
+            other_refs = [ref for ref in non_subtitle_refs if ref.asset_type != "image"]
+
+            if current_time is None:
+                current_time = scene.asset_references[0].start_time
 
-            # Create new asset references with scaled timing
             new_refs = []
-            for ref in scene.asset_references:
-                new_start = current_time
-                new_end = current_time + total_narration_duration
-                new_ref = ref.model_copy().with_start_time(new_start).with_end_time(new_end)
+
+            # Adjust image timings - divide narration duration equally
+            if image_refs:
+                time_per_image = total_narration_duration / len(image_refs)
+                for idx, ref in enumerate(image_refs):
+                    new_start = current_time + (idx * time_per_image)
+                    new_end = new_start + time_per_image
+                    new_ref = ref.model_copy().with_start_time(new_start).with_end_time(new_end)
+                    new_refs.append(new_ref)
+
+            # Add other non-image assets with full narration duration
+            for ref in other_refs:
+                new_ref = (
+                    ref.model_copy()
+                    .with_start_time(current_time)
+                    .with_end_time(current_time + total_narration_duration)
+                )
+                new_refs.append(new_ref)
+
+            # Add subtitle references spanning full narration duration
+            for ref in subtitle_refs:
+                new_ref = (
+                    ref.model_copy()
+                    .with_start_time(current_time)
+                    .with_end_time(current_time + total_narration_duration)
+                )
                 new_refs.append(new_ref)
 
             # Add narration references
@@ -286,12 +332,13 @@ def add_narration(self, speech_synthesizer: SpeechSynthesizer) -> VideoProject:
             )
             new_refs.append(narration_ref)
 
+            # Update current_time for next scene
+            current_time += total_narration_duration
+
             # Create new scene with updated references
             new_scene = scene.model_copy(update={"asset_references": new_refs})
             self.timeline[i] = new_scene
 
-            current_time += total_narration_duration
-
         return self
 
     def add_captions(
@@ -330,19 +377,15 @@ def add_captions(
                 if ref.asset_type == "subtitle":
                     self.remove_asset(ref.asset_id)
 
-            current_time = 0
-            total_phrase_duration = sum(phrase[-1].end_time - phrase[0].start_time for phrase in phrases)
-            # Calculate time scale factor if needed
-            time_scale = scene.duration / total_phrase_duration if total_phrase_duration > 0 else 1.0
             current_time = scene.start_time
 
-            for i, phrase in enumerate(phrases):
+            for phrase in phrases:
                 subtitle_text = " ".join(word.text for word in phrase)
                 subtitle = SubtitleAsset.from_data(subtitle_text)
 
-                # Calculate scaled duration
-                phrase_duration = (phrase[-1].end_time - phrase[0].start_time) * time_scale
+                # Calculate phrase duration from the transcription timestamps
+                phrase_duration = phrase[-1].end_time - phrase[0].start_time
 
                 start_time = current_time
                 end_time = start_time + phrase_duration
diff --git a/tests/video/test_project.py b/tests/video/test_project.py
index b8eb6c8..09c92fe 100644
--- a/tests/video/test_project.py
+++ b/tests/video/test_project.py
@@ -668,8 +668,8 @@ def test_add_narration_preserves_relative_timing():
     # Calculate expected timings based on MockSpeechSynthesizer's behavior
     narration_duration = len(subtitle_asset.to_string()) * 0.1
-    expected_start = 0  # Start time should be 0 for all assets
-    expected_end = narration_duration  # End time should match narration duration
+    expected_start = 1  # Start time should be 1 second into the scene
+    expected_end = narration_duration + 1  # End time should match narration duration plus the 1-second offset
 
     # Verify relative timing
     text_ref = next(ref for ref in updated_scene.asset_references if ref.asset_id == "text1")
diff --git a/uv.lock b/uv.lock
index e48743e..6a5abcc 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1219,7 +1219,7 @@ wheels = [
 
 [[package]]
 name = "mosaico"
-version = "0.1.0rc0"
+version = "0.1.0rc2"
 source = { editable = "." }
 dependencies = [
     { name = "findsystemfontsfilename" },
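
A minimal usage sketch (not part of the patch) of the reworked Shot model in src/mosaico/script_generators/script.py: timings are now derived from the media references rather than stored on the shot, and the model validator rejects inverted time ranges. Every field below appears in the hunk above except `number`, which is an assumption inferred from the `PositiveInt` import and may differ in the actual model.

from pydantic import ValidationError

from mosaico.script_generators.script import Shot, ShotMediaReference

shot = Shot(
    number=1,  # assumed field, not shown in the hunk
    description="Opening shot",
    subtitle="The president of the United States, Joe Biden, visited the White House on Tuesday.",
    media_references=[
        ShotMediaReference(media_id="joe-biden-walking", type="video", start_time=0.0, end_time=4.0),
        ShotMediaReference(media_id="white-house-exterior", type="image", start_time=4.0, end_time=6.0),
    ],
)

# Shot timings are derived properties over the references.
assert shot.start_time == 0.0  # min of reference start times
assert shot.end_time == 6.0  # max of reference end times

# The model validator rejects inverted time ranges.
try:
    ShotMediaReference(media_id="bad", type="image", start_time=5.0, end_time=2.0)
except ValidationError:
    pass  # "The start time must be less than the end time."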
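The add_narration rewrite spreads a scene's total narration duration equally across its image references, laid out back to back from the scene's start, while non-image and subtitle references span the whole narration window. A standalone sketch of that timing rule under illustrative names (divide_among_images is not part of the mosaico API):

def divide_among_images(current_time: float, narration_duration: float, n_images: int) -> list[tuple[float, float]]:
    """Return one (start, end) window per image, tiling the narration without overlap."""
    per_image = narration_duration / n_images
    return [(current_time + i * per_image, current_time + (i + 1) * per_image) for i in range(n_images)]

# Three images over a 6-second narration that starts 2 seconds into the timeline:
assert divide_among_images(2.0, 6.0, 3) == [(2.0, 4.0), (4.0, 6.0), (6.0, 8.0)]

Consecutive windows share an endpoint, which matches the non-overlap requirement the shooting-script prompt asks the model to respect.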