Skip to content

Commit

Permalink
Merge pull request #9 from FolhaSP/fix/dynamic-news-videos
Browse files Browse the repository at this point in the history
fix: make news generator videos more dynamic
  • Loading branch information
leodiegues authored Nov 22, 2024
2 parents 56323dd + 63d9a46 commit d7d9eab
Show file tree
Hide file tree
Showing 7 changed files with 184 additions and 77 deletions.
11 changes: 5 additions & 6 deletions src/mosaico/script_generators/news/generator.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from typing import TYPE_CHECKING, Any, Sequence
from typing import TYPE_CHECKING, Any, Literal, Sequence

from pydantic import BaseModel
from pydantic_extra_types.language_code import LanguageAlpha2

from mosaico.assets.types import AssetType
from mosaico.media import Media
from mosaico.script_generators.news.prompts import (
MEDIA_SUGGESTING_PROMPT,
Expand All @@ -23,10 +22,10 @@ class ParagraphMediaSuggestion(BaseModel):
paragraph: str
"""The paragraph content to which the media object corresponds."""

media_id: str
"""The media reference for the shot."""
media_ids: list[str]
"""The media IDs for the shot."""

type: AssetType
type: Literal["image", "video", "audio"]
"""The type of media (image, video, or audio)."""

relevance: str
Expand All @@ -44,7 +43,7 @@ class NewsVideoScriptGenerator:
def __init__(
self,
context: str,
model: str = "gpt-4o",
model: str = "claude-3-5-sonnet-20241022",
model_params: dict[str, Any] | None = None,
api_key: str | None = None,
base_url: str | None = None,
Expand Down
73 changes: 50 additions & 23 deletions src/mosaico/script_generators/news/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,17 @@
"""
INSTRUCTIONS:
You are a helpful news assistant tasked with summarizing the key points of the following context for a journalist
in paragraphs. Your summary should be concise, informative, and capture the most important details of the context.
The summary will be used by the journalist to produce a self-contained shooting script for an informative video
based on the context provided.
in paragraphs. Your summary should be very concise, informative, and capture the most important details of the
context. The summary will be used by the journalist to produce a self-contained shooting script for an informative
video based on the context provided.
OUTPUT GUIDELINES:
- The summary should have {num_paragraphs} paragraphs.
- Each paragraph should be 1 sentence long.
- The summary should have {num_paragraphs} short paragraphs.
- Each paragraph should be a very short sentence.
- Adhere to the best practices of journalistic writing.
- Make the paragraphs follow the chronology of the context.
- Make sure the first paragraph is the lead of the story.
- Make sure the last paragraph is the conclusion of the story.
- Return only the paragraphs in {language} without any additional information.
CONTEXT:
Expand All @@ -30,39 +32,64 @@
the visual appeal and storytelling of an informative video. Your selections should be relevant, engaging, and
directly correspond to the content of each paragraph.
From the media objects provided, you will select items that best match the content of each paragraph. Your goal
is to choose media that will enhance the viewer's understanding and create a compelling visual narrative.
From the media objects provided, your goal is to choose media that will enhance the viewer's understanding and
create a compelling visual narrative. Make sure each suggested media object is thoughtfully integrated to enhance
the narrative flow.
OUTPUT GUIDELINES:
- For each paragraph, select one media object from the provided collection
- Use 1-2 media objects for each paragraph, but try to use as many as possible.
- The video should be dynamic, so be sure to select different media objects for different shots.
- Only select media objects that are available in the provided collection
- Avoid selecting the same media object for multiple paragraphs
- Answer only with the structured response format in the same language as the paragraphs
PARAGRAPHS:
{paragraphs}
- Each media object should be used only once.
- If there are characters, places, or things in the paragraph, select a media object that shows the character, place,
or thing.
- Answer only with the structured response format in the same language as the paragraphs.
EXAMPLE:
Paragraph 1: "The president of the United States, Joe Biden, visited the White House on Tuesday."
Paragraph 2: "He met with the vice president, Kamala Harris."
Shot 1:
Paragraph 1: "The president of the United States, Joe Biden, visited the White House on Tuesday."
Media References:
- Media Object: "joe-biden-walking"
Description: "President Joe Biden walking towards the White House"
Type: "video"
Relevance: "Shows the president walking towards the White House"
- Media Object: "white-house-exterior"
Description: "The White House exterior"
Type: "image"
Relevance: "Shows the White House exterior"
Shot 2:
Paragraph 2: "He met with the vice president, Kamala Harris."
Media References:
- Media Object: "biden-meeting-kamala-harris"
Description: "President Joe Biden and Vice President Kamala Harris meeting"
Type: "video"
Relevance: "Shows the president and vice president meeting"
AVAILABLE MEDIA OBJECTS:
{media_objects}
PARAGRAPHS:
{paragraphs}
SUGGESTIONS:
"""
).strip()


SHOOTING_SCRIPT_PROMPT = textwrap.dedent(
"""
INSTRUCTIONS:
You are an experienced journalist and scriptwriter tasked with creating a detailed shooting script for an
informative video based on the following paragraphs and media objects. Your script should suggest specific
shot, effects, and narration that effectively tell the story while incorporating the media assets.
The script should maintain journalistic standards of accuracy and objectivity while being engaging for viewers.
Make sure each suggested media object is thoughtfully integrated to enhance the narrative flow.
You are an experienced video editor tasked with creating a shooting script for an informative video based on the
following paragraphs and media objects. Your script should suggest effects and timings for the media objects to
create a visually engaging video.
OUTPUT GUIDELINES:
- Provide a detailed shooting script that includes shots, effects, and timings.
- Use the paragraphs as subtitles for each shot. Keep them as they are.
- Keep the paragraphs and media objects as they are. Avoid changing them.
- Use the paragraphs as subtitles for the shots.
- Add timings to the media objects. Make sure they do not overlap.
- Respond only with the structured output format in the same language as the paragraphs.
PARAGRAPHS AND MEDIA OBJECTS SUGGESTIONS:
Expand Down
57 changes: 47 additions & 10 deletions src/mosaico/script_generators/script.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,42 @@
from __future__ import annotations

from typing import Literal

from pydantic import BaseModel, PositiveInt
from pydantic.fields import Field
from pydantic.functional_validators import model_validator
from pydantic.types import NonNegativeFloat, PositiveFloat
from typing_extensions import Self

from mosaico.effects.types import VideoEffectType


class ShotMediaReference(BaseModel):
    """A timed reference to a media object used within a shot.

    Pins one media object (by ID) to a time window inside the shot and
    optionally attaches video effects. Validation guarantees the window is
    well-formed (``start_time`` strictly before ``end_time``).
    """

    media_id: str
    """The ID of the media object."""

    type: Literal["image", "video"]
    """The type of the media object."""

    start_time: NonNegativeFloat
    """The start time of the media object in seconds."""

    end_time: PositiveFloat
    """The end time of the media object in seconds."""

    effects: list[VideoEffectType] = Field(default_factory=list)
    """The effects applied to the media object."""

    @model_validator(mode="after")
    def _validate_time_range(self) -> Self:
        """Ensure the reference's time window is well-formed.

        :raises ValueError: if ``start_time`` is not strictly less than ``end_time``.
        """
        # A zero-length or inverted window would produce an invisible or
        # negative-duration clip downstream, so reject it at model creation.
        if self.start_time >= self.end_time:
            raise ValueError("The start time must be less than the end time.")
        return self


class Shot(BaseModel):
"""A shot for a script."""

Expand All @@ -14,20 +46,25 @@ class Shot(BaseModel):
description: str
"""The description of the shot."""

start_time: NonNegativeFloat
"""The start time of the shot in seconds."""

end_time: PositiveFloat
"""The end time of the shot in seconds."""

subtitle: str
"""The subtitle for the shot."""

media_id: str
"""The media reference for the shot."""
media_references: list[ShotMediaReference] = Field(default_factory=list)
"""The media references for the shot."""

effects: list[VideoEffectType] = Field(default_factory=list)
"""The effects applied to the shot."""
@property
def start_time(self) -> float:
"""The start time of the shot in seconds."""
if not self.media_references:
return 0
return min(media_reference.start_time for media_reference in self.media_references)

@property
def end_time(self) -> float:
"""The end time of the shot in seconds."""
if not self.media_references:
return 0
return max(media_reference.end_time for media_reference in self.media_references)

@property
def duration(self) -> float:
Expand Down
113 changes: 78 additions & 35 deletions src/mosaico/video/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,28 +125,44 @@ def from_script_generator(
# Generate assets and scenes from a scene generator.
script = script_generator.generate(media, **kwargs)

# Create assets and asset references from the script.
# Create assets and scenes from the script.
for shot in script.shots:
referenced_media = next(m for m in media if m.id == shot.media_id)
# Create subtitle asset
shot_subtitle = SubtitleAsset.from_data(shot.subtitle)
shot_effects = [create_effect(effect) for effect in shot.effects]
shot_asset = convert_media_to_asset(referenced_media)
shot_scene = (
Scene(description=shot.description)
.add_asset_references(
AssetReference.from_asset(shot_subtitle)
.with_start_time(shot.start_time)
.with_end_time(shot.end_time)
)
.add_asset_references(
AssetReference.from_asset(shot_asset)
.with_start_time(shot.start_time)
.with_end_time(shot.end_time)
.with_effects(shot_effects if shot_asset.type == "image" else [])
)

# Create scene with initial subtitle reference
scene = Scene(description=shot.description).add_asset_references(
AssetReference.from_asset(shot_subtitle).with_start_time(shot.start_time).with_end_time(shot.end_time)
)

project = project.add_assets(shot_asset).add_assets(shot_subtitle).add_timeline_events(shot_scene)
# Add subtitle asset to project
project = project.add_assets(shot_subtitle)

# Process each media reference in the shot
for media_ref in shot.media_references:
# Find the referenced media
referenced_media = next(m for m in media if m.id == media_ref.media_id)

# Convert media to asset
media_asset = convert_media_to_asset(referenced_media)

# Create asset reference with timing and effects
asset_ref = (
AssetReference.from_asset(media_asset)
.with_start_time(media_ref.start_time)
.with_end_time(media_ref.end_time)
)

# Add effects if it's an image asset
if media_asset.type == "image" and media_ref.effects:
asset_ref = asset_ref.with_effects([create_effect(effect) for effect in media_ref.effects])

# Add media asset and its reference to the scene
project = project.add_assets(media_asset)
scene = scene.add_asset_references(asset_ref)

# Add completed scene to project timeline
project = project.add_timeline_events(scene)

return project

Expand Down Expand Up @@ -239,12 +255,13 @@ def add_narration(self, speech_synthesizer: SpeechSynthesizer) -> VideoProject:
"""
Add narration to subtitles inside Scene objects by generating speech audio from subtitle text.
Updates other asset timings within each Scene based on generated speech duration.
Updates asset timings within each Scene to match narration duration, dividing time equally
between multiple images.
:param speech_synthesizer: The speech synthesizer to use for generating narration audio
:return: The updated project with narration added
"""
current_time = 0
current_time = None

for i, scene in enumerate(self.timeline.sort()):
if not isinstance(scene, Scene):
Expand All @@ -266,15 +283,44 @@ def add_narration(self, speech_synthesizer: SpeechSynthesizer) -> VideoProject:
# Add narration assets to project
self.add_assets(narration_assets)

# Calculate new duration based on narration
total_narration_duration = sum(asset.duration for asset in narration_assets)
# Calculate total narration duration for this scene
total_narration_duration = sum(narration.duration for narration in narration_assets)

# Get non-subtitle assets to adjust timing
non_subtitle_refs = [ref for ref in scene.asset_references if ref.asset_type != "subtitle"]
image_refs = [ref for ref in non_subtitle_refs if ref.asset_type == "image"]
other_refs = [ref for ref in non_subtitle_refs if ref.asset_type != "image"]

if current_time is None:
current_time = scene.asset_references[0].start_time

# Create new asset references with scaled timing
new_refs = []
for ref in scene.asset_references:
new_start = current_time
new_end = current_time + total_narration_duration
new_ref = ref.model_copy().with_start_time(new_start).with_end_time(new_end)

# Adjust image timings - divide narration duration equally
if image_refs:
time_per_image = total_narration_duration / len(image_refs)
for idx, ref in enumerate(image_refs):
new_start = current_time + (idx * time_per_image)
new_end = new_start + time_per_image
new_ref = ref.model_copy().with_start_time(new_start).with_end_time(new_end)
new_refs.append(new_ref)

# Add other non-image assets with full narration duration
for ref in other_refs:
new_ref = (
ref.model_copy()
.with_start_time(current_time)
.with_end_time(current_time + total_narration_duration)
)
new_refs.append(new_ref)

# Add subtitle references spanning full narration duration
for ref in subtitle_refs:
new_ref = (
ref.model_copy()
.with_start_time(current_time)
.with_end_time(current_time + total_narration_duration)
)
new_refs.append(new_ref)

# Add narration references
Expand All @@ -286,12 +332,13 @@ def add_narration(self, speech_synthesizer: SpeechSynthesizer) -> VideoProject:
)
new_refs.append(narration_ref)

# Update current_time for next scene
current_time += total_narration_duration

# Create new scene with updated references
new_scene = scene.model_copy(update={"asset_references": new_refs})
self.timeline[i] = new_scene

current_time += total_narration_duration

return self

def add_captions(
Expand Down Expand Up @@ -330,19 +377,15 @@ def add_captions(
if ref.asset_type == "subtitle":
self.remove_asset(ref.asset_id)

current_time = 0
total_phrase_duration = sum(phrase[-1].end_time - phrase[0].start_time for phrase in phrases)

# Calculate time scale factor if needed
time_scale = scene.duration / total_phrase_duration if total_phrase_duration > 0 else 1.0
current_time = scene.start_time

for i, phrase in enumerate(phrases):
for phrase in phrases:
subtitle_text = " ".join(word.text for word in phrase)
subtitle = SubtitleAsset.from_data(subtitle_text)

# Calculate scaled duration
phrase_duration = (phrase[-1].end_time - phrase[0].start_time) * time_scale
phrase_duration = phrase[-1].end_time - phrase[0].start_time

start_time = current_time
end_time = start_time + phrase_duration
Expand Down
1 change: 1 addition & 0 deletions src/mosaico/video/rendering.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ def _render_event_clips(
video_clips = []

for asset, asset_ref in asset_and_ref_pairs:
print(asset.type, asset_ref.start_time, asset_ref.end_time, asset_ref.effects)
clip = make_clip(asset, asset_ref.duration, video_resolution, asset_ref.effects)
clip = clip.set_start(asset_ref.start_time)

Expand Down
Loading

0 comments on commit d7d9eab

Please sign in to comment.