From 9441afc05aaaec5905dfb0420598fbe4f23290cd Mon Sep 17 00:00:00 2001 From: yaniv-golan Date: Fri, 1 Nov 2024 15:07:07 +0200 Subject: [PATCH] v0.6.0 --- examples/latest/complex.stj.json | 366 ++-- examples/latest/multilingual.stj.json | 113 +- examples/latest/simple.stj.json | 32 +- examples/v0.4.0/complex.stj.json | 340 ++-- examples/v0.4.0/multilingual.stj.json | 116 +- examples/v0.4.0/simple.stj.json | 32 +- examples/v0.6.0/complex.stj.json | 214 +++ examples/v0.6.0/multilingual.stj.json | 76 + examples/v0.6.0/simple.stj.json | 21 + spec/CHANGELOG.md | 119 +- spec/latest/stj-specification.md | 1686 +++++++++++++----- spec/schema/CHANGELOG.md | 37 + spec/schema/latest/stj-schema.json | 458 +++++ spec/schema/v0.6.0/stj-schema.json | 10 +- spec/v0.6.0/stj-specification.md | 1297 ++++++++++---- stjlib/validation.py | 1 + tests/python/test_examples_with_schema.py | 28 + tests/python/test_examples_with_validator.py | 13 + tests/python/test_spec_samples.py | 890 +++++++++ tests/python/test_stj_to_srt.py | 53 +- tests/python/test_stj_validator.py | 199 +-- tests/python/test_validator.py | 105 +- tools/javascript/stj-validator.js | 176 +- tools/python/stj_validator.py | 4 +- 24 files changed, 4900 insertions(+), 1486 deletions(-) create mode 100644 examples/v0.6.0/complex.stj.json create mode 100644 examples/v0.6.0/multilingual.stj.json create mode 100644 examples/v0.6.0/simple.stj.json create mode 100644 spec/schema/latest/stj-schema.json create mode 100644 stjlib/validation.py create mode 100644 tests/python/test_examples_with_schema.py create mode 100644 tests/python/test_examples_with_validator.py create mode 100644 tests/python/test_spec_samples.py diff --git a/examples/latest/complex.stj.json b/examples/latest/complex.stj.json index edb86f5..1602b0e 100644 --- a/examples/latest/complex.stj.json +++ b/examples/latest/complex.stj.json @@ -1,174 +1,214 @@ { - "metadata": { - "transcriber": { - "name": "YAWT", - "version": "0.4.0" - }, - "created_at": 
"2023-10-19T15:30:00Z", - "source": { - "uri": "https://example.com/funny_conference.mp4", - "duration": 1800.0, - "languages": ["en", "es", "de"] - }, - "languages": ["en", "es", "de"], - "confidence_threshold": 0.6, - "additional_info": { - "project": "Annual Humor Conference", - "client": "LaughCorp International" - } - }, - "transcript": { - "speakers": [ - { - "id": "Speaker1", - "name": "Dr. Chuckles", - "additional_info": { - "role": "Keynote Speaker" - } + "stj": { + "version": "0.6.0", + "metadata": { + "transcriber": { + "name": "YAWT", + "version": "0.4.0" }, - { - "id": "Speaker2", - "name": "Ms. Giggles", - "additional_info": { - "role": "Panelist" - } + "created_at": "2024-10-24T15:30:00Z", + "source": { + "uri": "https://example.com/funny_conference.mp4", + "duration": 1800.0, + "languages": [ + "en", + "es", + "de" + ] }, - { - "id": "Speaker3", - "name": "Herr Lachen", - "additional_info": { - "role": "Guest Speaker" + "languages": [ + "en", + "es", + "de" + ], + "confidence_threshold": 0.6, + "extensions": { + "project": { + "name": "Annual Humor Conference", + "client": "LaughCorp International" } } - ], - "styles": [ - { - "id": "Style1", - "formatting": { - "bold": true, - "italic": false, - "underline": false, - "color": "#FF5733", - "background_color": "#000000" + }, + "transcript": { + "speakers": [ + { + "id": "Speaker1", + "name": "Dr. Chuckles", + "extensions": { + "role": { + "title": "Keynote Speaker" + } + } }, - "positioning": { - "align": "center", - "line": "auto", - "position": "50%", - "size": "100%" + { + "id": "Speaker2", + "name": "Ms. 
Giggles", + "extensions": { + "role": { + "title": "Panelist" + } + } + }, + { + "id": "Speaker3", + "name": "Herr Lachen", + "extensions": { + "role": { + "title": "Guest Speaker" + } + } } - } - ], - "segments": [ - { - "start": 0.0, - "end": 5.0, - "text": "Ladies and gentlemen, welcome to the Annual Humor Conference!", - "speaker_id": "Speaker1", - "confidence": 0.98, - "language": "en", - "style_id": "Style1", - "words": [ - { - "start": 0.0, - "end": 0.5, - "text": "Ladies", - "confidence": 0.99 - }, - { - "start": 0.5, - "end": 0.7, - "text": "and", - "confidence": 0.98 - }, - { - "start": 0.7, - "end": 1.2, - "text": "gentlemen,", - "confidence": 0.97 - }, - { - "start": 1.3, - "end": 2.0, - "text": "welcome", - "confidence": 0.99 - }, - { - "start": 2.1, - "end": 2.3, - "text": "to", - "confidence": 0.98 - }, - { - "start": 2.3, - "end": 2.5, - "text": "the", - "confidence": 0.98 + ], + "styles": [ + { + "id": "Style1", + "text": { + "color": "#FF5733", + "background": "#000000", + "bold": true, + "italic": false, + "underline": false, + "size": "100%" }, - { - "start": 2.6, - "end": 3.5, - "text": "Annual", - "confidence": 0.97 + "display": { + "align": "center", + "vertical": "bottom", + "position": { + "x": "50%", + "y": "90%" + } }, - { - "start": 3.6, - "end": 5.0, - "text": "Humor Conference!", - "confidence": 0.96 + "extensions": { + "custom_webvtt": { + "line": "auto" + } } - ] - }, - { - "start": 5.1, - "end": 10.0, - "text": "Did you hear about the mathematician who's afraid of negative numbers?", - "speaker_id": "Speaker1", - "confidence": 0.96, - "language": "en", - "style_id": "Style1" - }, - { - "start": 10.1, - "end": 12.0, - "text": "He'll stop at nothing to avoid them!", - "speaker_id": "Speaker1", - "confidence": 0.95, - "language": "en", - "style_id": "Style1" - }, - { - "start": 12.1, - "end": 17.0, - "text": "¡Y ahora, un poco de humor en español!", - "speaker_id": "Speaker2", - "confidence": 0.94, - "language": "es" - }, - { - 
"start": 17.1, - "end": 22.0, - "text": "¿Qué le dijo un techo a otro techo? ¡Techo de menos!", - "speaker_id": "Speaker2", - "confidence": 0.93, - "language": "es" - }, - { - "start": 22.1, - "end": 27.0, - "text": "Und jetzt etwas auf Deutsch!", - "speaker_id": "Speaker3", - "confidence": 0.92, - "language": "de" - }, - { - "start": 27.1, - "end": 32.0, - "text": "Warum können Seeräuber den Kreisumfang so gut berechnen? Weil sie Pi raten!", - "speaker_id": "Speaker3", - "confidence": 0.91, - "language": "de" - } - ] + } + ], + "segments": [ + { + "start": 0.0, + "end": 5.0, + "text": "Ladies and gentlemen, welcome to the Annual Humor Conference!", + "speaker_id": "Speaker1", + "confidence": 0.98, + "language": "en", + "style_id": "Style1", + "word_timing_mode": "complete", + "words": [ + { + "start": 0.0, + "end": 0.5, + "text": "Ladies", + "confidence": 0.99 + }, + { + "start": 0.5, + "end": 0.7, + "text": "and", + "confidence": 0.98 + }, + { + "start": 0.7, + "end": 1.2, + "text": "gentlemen,", + "confidence": 0.97 + }, + { + "start": 1.3, + "end": 2.0, + "text": "welcome", + "confidence": 0.99 + }, + { + "start": 2.1, + "end": 2.3, + "text": "to", + "confidence": 0.98 + }, + { + "start": 2.3, + "end": 2.5, + "text": "the", + "confidence": 0.98 + }, + { + "start": 2.6, + "end": 3.5, + "text": "Annual", + "confidence": 0.97 + }, + { + "start": 3.6, + "end": 4.2, + "text": "Humor", + "confidence": 0.96 + }, + { + "start": 4.3, + "end": 5.0, + "text": "Conference!", + "confidence": 0.96 + } + ] + }, + { + "start": 5.1, + "end": 10.0, + "text": "Did you hear about the mathematician who's afraid of negative numbers?", + "speaker_id": "Speaker1", + "confidence": 0.96, + "language": "en", + "style_id": "Style1", + "word_timing_mode": "none" + }, + { + "start": 10.1, + "end": 12.0, + "text": "He'll stop at nothing to avoid them!", + "speaker_id": "Speaker1", + "confidence": 0.95, + "language": "en", + "style_id": "Style1", + "word_timing_mode": "none" + }, + { + 
"start": 12.1, + "end": 17.0, + "text": "¡Y ahora, un poco de humor en español!", + "speaker_id": "Speaker2", + "confidence": 0.94, + "language": "es", + "word_timing_mode": "none" + }, + { + "start": 17.1, + "end": 22.0, + "text": "¿Qué le dijo un techo a otro techo? ¡Techo de menos!", + "speaker_id": "Speaker2", + "confidence": 0.93, + "language": "es", + "word_timing_mode": "none" + }, + { + "start": 22.1, + "end": 27.0, + "text": "Und jetzt etwas auf Deutsch!", + "speaker_id": "Speaker3", + "confidence": 0.92, + "language": "de", + "word_timing_mode": "none" + }, + { + "start": 27.1, + "end": 32.0, + "text": "Warum können Seeräuber den Kreisumfang so gut berechnen? Weil sie Pi raten!", + "speaker_id": "Speaker3", + "confidence": 0.91, + "language": "de", + "word_timing_mode": "none" + } + ] + } } -} +} \ No newline at end of file diff --git a/examples/latest/multilingual.stj.json b/examples/latest/multilingual.stj.json index 80ac26f..a3bfe5f 100644 --- a/examples/latest/multilingual.stj.json +++ b/examples/latest/multilingual.stj.json @@ -1,49 +1,76 @@ { - "metadata": { - "transcriber": { - "name": "YAWT", - "version": "0.4.0" - }, - "created_at": "2023-10-20T12:00:00Z", - "source": { - "uri": "https://example.com/conference.mp4", - "duration": 3600.0, - "languages": ["en", "es", "de"] // Source languages: English, Spanish, German - }, - "languages": ["fr", "it"], // Transcription languages: French, Italian - "additional_info": { - "event": "International Multilingual Conference" - } - }, - "transcript": { - "speakers": [ - { "id": "Speaker1", "name": "Dr. 
Smith", "additional_info": { "role": "Keynote Speaker" } }, - { "id": "Speaker2", "name": "Señora García", "additional_info": { "role": "Panelist" } }, - { "id": "Speaker3", "name": "Herr Müller", "additional_info": { "role": "Guest Speaker" } } - ], - "segments": [ - { - "start": 0.0, - "end": 5.0, - "text": "Bonjour à tous.", - "speaker_id": "Speaker1", - "language": "fr" + "stj": { + "version": "0.6.0", + "metadata": { + "transcriber": { + "name": "YAWT", + "version": "0.4.0" }, - { - "start": 5.1, - "end": 10.0, - "text": "Benvenuti a tutti.", - "speaker_id": "Speaker2", - "language": "it" + "created_at": "2024-10-24T12:00:00Z", + "source": { + "uri": "https://example.com/conference.mp4", + "duration": 3600.0, + "languages": [ + "en", + "es", + "de" + ] }, - { - "start": 10.1, - "end": 15.0, - "text": "Merci d'être venus.", - "speaker_id": "Speaker3", - "language": "fr" + "languages": [ + "fr", + "it" + ], + "extensions": { + "event": "International Multilingual Conference" } - // Additional segments... - ] + }, + "transcript": { + "speakers": [ + { + "id": "Speaker1", + "name": "Dr. 
Smith", + "extensions": { + "role": "Keynote Speaker" + } + }, + { + "id": "Speaker2", + "name": "Señora García", + "extensions": { + "role": "Panelist" + } + }, + { + "id": "Speaker3", + "name": "Herr Müller", + "extensions": { + "role": "Guest Speaker" + } + } + ], + "segments": [ + { + "start": 0.0, + "end": 5.0, + "text": "Bonjour à tous.", + "speaker_id": "Speaker1", + "language": "fr" + }, + { + "start": 5.1, + "end": 10.0, + "text": "Benvenuti a tutti.", + "speaker_id": "Speaker2", + "language": "it" + }, + { + "start": 10.1, + "end": 15.0, + "text": "Merci d'être venus.", + "speaker_id": "Speaker3", + "language": "fr" + } + ] + } } } \ No newline at end of file diff --git a/examples/latest/simple.stj.json b/examples/latest/simple.stj.json index 932893e..a356b3c 100644 --- a/examples/latest/simple.stj.json +++ b/examples/latest/simple.stj.json @@ -1,19 +1,21 @@ { - "metadata": { - "version": "0.5.0", - "transcriber": { - "name": "YAWT", - "version": "0.4.0" - }, - "created_at": "2023-10-19T15:30:00Z" - }, - "transcript": { - "segments": [ - { - "start": 0.0, - "end": 5.0, - "text": "Hello, world!" + "stj": { + "version": "0.6.0", + "metadata": { + "created_at": "2024-10-24T15:30:00Z", + "transcriber": { + "name": "YAWT", + "version": "0.4.0" } - ] + }, + "transcript": { + "segments": [ + { + "start": 0.0, + "end": 5.0, + "text": "Hello, world!" 
+ } + ] + } } } \ No newline at end of file diff --git a/examples/v0.4.0/complex.stj.json b/examples/v0.4.0/complex.stj.json index edb86f5..66f3c4f 100644 --- a/examples/v0.4.0/complex.stj.json +++ b/examples/v0.4.0/complex.stj.json @@ -1,174 +1,184 @@ { - "metadata": { - "transcriber": { - "name": "YAWT", - "version": "0.4.0" - }, - "created_at": "2023-10-19T15:30:00Z", - "source": { - "uri": "https://example.com/funny_conference.mp4", - "duration": 1800.0, - "languages": ["en", "es", "de"] - }, - "languages": ["en", "es", "de"], - "confidence_threshold": 0.6, - "additional_info": { - "project": "Annual Humor Conference", - "client": "LaughCorp International" - } - }, - "transcript": { - "speakers": [ - { - "id": "Speaker1", - "name": "Dr. Chuckles", - "additional_info": { - "role": "Keynote Speaker" - } + "stj": { + "metadata": { + "transcriber": { + "name": "YAWT", + "version": "0.4.0" }, - { - "id": "Speaker2", - "name": "Ms. Giggles", - "additional_info": { - "role": "Panelist" - } + "created_at": "2023-10-19T15:30:00Z", + "source": { + "uri": "https://example.com/funny_conference.mp4", + "duration": 1800.0, + "languages": [ + "en", + "es", + "de" + ] }, - { - "id": "Speaker3", - "name": "Herr Lachen", - "additional_info": { - "role": "Guest Speaker" - } + "languages": [ + "en", + "es", + "de" + ], + "confidence_threshold": 0.6, + "additional_info": { + "project": "Annual Humor Conference", + "client": "LaughCorp International" } - ], - "styles": [ - { - "id": "Style1", - "formatting": { - "bold": true, - "italic": false, - "underline": false, - "color": "#FF5733", - "background_color": "#000000" + }, + "transcript": { + "speakers": [ + { + "id": "Speaker1", + "name": "Dr. Chuckles", + "additional_info": { + "role": "Keynote Speaker" + } }, - "positioning": { - "align": "center", - "line": "auto", - "position": "50%", - "size": "100%" + { + "id": "Speaker2", + "name": "Ms. 
Giggles", + "additional_info": { + "role": "Panelist" + } + }, + { + "id": "Speaker3", + "name": "Herr Lachen", + "additional_info": { + "role": "Guest Speaker" + } } - } - ], - "segments": [ - { - "start": 0.0, - "end": 5.0, - "text": "Ladies and gentlemen, welcome to the Annual Humor Conference!", - "speaker_id": "Speaker1", - "confidence": 0.98, - "language": "en", - "style_id": "Style1", - "words": [ - { - "start": 0.0, - "end": 0.5, - "text": "Ladies", - "confidence": 0.99 - }, - { - "start": 0.5, - "end": 0.7, - "text": "and", - "confidence": 0.98 - }, - { - "start": 0.7, - "end": 1.2, - "text": "gentlemen,", - "confidence": 0.97 - }, - { - "start": 1.3, - "end": 2.0, - "text": "welcome", - "confidence": 0.99 - }, - { - "start": 2.1, - "end": 2.3, - "text": "to", - "confidence": 0.98 - }, - { - "start": 2.3, - "end": 2.5, - "text": "the", - "confidence": 0.98 + ], + "styles": [ + { + "id": "Style1", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "color": "#FF5733", + "background_color": "#000000" }, - { - "start": 2.6, - "end": 3.5, - "text": "Annual", - "confidence": 0.97 - }, - { - "start": 3.6, - "end": 5.0, - "text": "Humor Conference!", - "confidence": 0.96 + "positioning": { + "align": "center", + "line": "auto", + "position": "50%", + "size": "100%" } - ] - }, - { - "start": 5.1, - "end": 10.0, - "text": "Did you hear about the mathematician who's afraid of negative numbers?", - "speaker_id": "Speaker1", - "confidence": 0.96, - "language": "en", - "style_id": "Style1" - }, - { - "start": 10.1, - "end": 12.0, - "text": "He'll stop at nothing to avoid them!", - "speaker_id": "Speaker1", - "confidence": 0.95, - "language": "en", - "style_id": "Style1" - }, - { - "start": 12.1, - "end": 17.0, - "text": "¡Y ahora, un poco de humor en español!", - "speaker_id": "Speaker2", - "confidence": 0.94, - "language": "es" - }, - { - "start": 17.1, - "end": 22.0, - "text": "¿Qué le dijo un techo a otro techo? 
¡Techo de menos!", - "speaker_id": "Speaker2", - "confidence": 0.93, - "language": "es" - }, - { - "start": 22.1, - "end": 27.0, - "text": "Und jetzt etwas auf Deutsch!", - "speaker_id": "Speaker3", - "confidence": 0.92, - "language": "de" - }, - { - "start": 27.1, - "end": 32.0, - "text": "Warum können Seeräuber den Kreisumfang so gut berechnen? Weil sie Pi raten!", - "speaker_id": "Speaker3", - "confidence": 0.91, - "language": "de" - } - ] + } + ], + "segments": [ + { + "start": 0.0, + "end": 5.0, + "text": "Ladies and gentlemen, welcome to the Annual Humor Conference!", + "speaker_id": "Speaker1", + "confidence": 0.98, + "language": "en", + "style_id": "Style1", + "words": [ + { + "start": 0.0, + "end": 0.5, + "text": "Ladies", + "confidence": 0.99 + }, + { + "start": 0.5, + "end": 0.7, + "text": "and", + "confidence": 0.98 + }, + { + "start": 0.7, + "end": 1.2, + "text": "gentlemen,", + "confidence": 0.97 + }, + { + "start": 1.3, + "end": 2.0, + "text": "welcome", + "confidence": 0.99 + }, + { + "start": 2.1, + "end": 2.3, + "text": "to", + "confidence": 0.98 + }, + { + "start": 2.3, + "end": 2.5, + "text": "the", + "confidence": 0.98 + }, + { + "start": 2.6, + "end": 3.5, + "text": "Annual", + "confidence": 0.97 + }, + { + "start": 3.6, + "end": 5.0, + "text": "Humor Conference!", + "confidence": 0.96 + } + ] + }, + { + "start": 5.1, + "end": 10.0, + "text": "Did you hear about the mathematician who's afraid of negative numbers?", + "speaker_id": "Speaker1", + "confidence": 0.96, + "language": "en", + "style_id": "Style1" + }, + { + "start": 10.1, + "end": 12.0, + "text": "He'll stop at nothing to avoid them!", + "speaker_id": "Speaker1", + "confidence": 0.95, + "language": "en", + "style_id": "Style1" + }, + { + "start": 12.1, + "end": 17.0, + "text": "¡Y ahora, un poco de humor en español!", + "speaker_id": "Speaker2", + "confidence": 0.94, + "language": "es" + }, + { + "start": 17.1, + "end": 22.0, + "text": "¿Qué le dijo un techo a otro techo? 
¡Techo de menos!", + "speaker_id": "Speaker2", + "confidence": 0.93, + "language": "es" + }, + { + "start": 22.1, + "end": 27.0, + "text": "Und jetzt etwas auf Deutsch!", + "speaker_id": "Speaker3", + "confidence": 0.92, + "language": "de" + }, + { + "start": 27.1, + "end": 32.0, + "text": "Warum können Seeräuber den Kreisumfang so gut berechnen? Weil sie Pi raten!", + "speaker_id": "Speaker3", + "confidence": 0.91, + "language": "de" + } + ] + } } -} +} \ No newline at end of file diff --git a/examples/v0.4.0/multilingual.stj.json b/examples/v0.4.0/multilingual.stj.json index 80ac26f..f765713 100644 --- a/examples/v0.4.0/multilingual.stj.json +++ b/examples/v0.4.0/multilingual.stj.json @@ -1,49 +1,75 @@ { - "metadata": { - "transcriber": { - "name": "YAWT", - "version": "0.4.0" - }, - "created_at": "2023-10-20T12:00:00Z", - "source": { - "uri": "https://example.com/conference.mp4", - "duration": 3600.0, - "languages": ["en", "es", "de"] // Source languages: English, Spanish, German - }, - "languages": ["fr", "it"], // Transcription languages: French, Italian - "additional_info": { - "event": "International Multilingual Conference" - } - }, - "transcript": { - "speakers": [ - { "id": "Speaker1", "name": "Dr. 
Smith", "additional_info": { "role": "Keynote Speaker" } }, - { "id": "Speaker2", "name": "Señora García", "additional_info": { "role": "Panelist" } }, - { "id": "Speaker3", "name": "Herr Müller", "additional_info": { "role": "Guest Speaker" } } - ], - "segments": [ - { - "start": 0.0, - "end": 5.0, - "text": "Bonjour à tous.", - "speaker_id": "Speaker1", - "language": "fr" + "stj": { + "metadata": { + "transcriber": { + "name": "YAWT", + "version": "0.4.0" }, - { - "start": 5.1, - "end": 10.0, - "text": "Benvenuti a tutti.", - "speaker_id": "Speaker2", - "language": "it" + "created_at": "2023-10-20T12:00:00Z", + "source": { + "uri": "https://example.com/conference.mp4", + "duration": 3600.0, + "languages": [ + "en", + "es", + "de" + ] // Source languages: English, Spanish, German }, - { - "start": 10.1, - "end": 15.0, - "text": "Merci d'être venus.", - "speaker_id": "Speaker3", - "language": "fr" + "languages": [ + "fr", + "it" + ], // Transcription languages: French, Italian + "additional_info": { + "event": "International Multilingual Conference" } - // Additional segments... - ] - } -} \ No newline at end of file + }, + "transcript": { + "speakers": [ + { + "id": "Speaker1", + "name": "Dr. Smith", + "additional_info": { + "role": "Keynote Speaker" + } + }, + { + "id": "Speaker2", + "name": "Señora García", + "additional_info": { + "role": "Panelist" + } + }, + { + "id": "Speaker3", + "name": "Herr Müller", + "additional_info": { + "role": "Guest Speaker" + } + } + ], + "segments": [ + { + "start": 0.0, + "end": 5.0, + "text": "Bonjour à tous.", + "speaker_id": "Speaker1", + "language": "fr" + }, + { + "start": 5.1, + "end": 10.0, + "text": "Benvenuti a tutti.", + "speaker_id": "Speaker2", + "language": "it" + }, + { + "start": 10.1, + "end": 15.0, + "text": "Merci d'être venus.", + "speaker_id": "Speaker3", + "language": "fr" + } + // Additional segments... 
+ ] + } + } \ No newline at end of file diff --git a/examples/v0.4.0/simple.stj.json b/examples/v0.4.0/simple.stj.json index 04b6047..dc07f56 100644 --- a/examples/v0.4.0/simple.stj.json +++ b/examples/v0.4.0/simple.stj.json @@ -1,18 +1,20 @@ { - "metadata": { - "transcriber": { - "name": "YAWT", - "version": "0.4.0" + "stj": { + "metadata": { + "transcriber": { + "name": "YAWT", + "version": "0.4.0" + }, + "created_at": "2023-10-19T15:30:00Z" }, - "created_at": "2023-10-19T15:30:00Z" - }, - "transcript": { - "segments": [ - { - "start": 0.0, - "end": 5.0, - "text": "Hello, world!" - } - ] + "transcript": { + "segments": [ + { + "start": 0.0, + "end": 5.0, + "text": "Hello, world!" + } + ] + } } -} +} \ No newline at end of file diff --git a/examples/v0.6.0/complex.stj.json b/examples/v0.6.0/complex.stj.json new file mode 100644 index 0000000..1602b0e --- /dev/null +++ b/examples/v0.6.0/complex.stj.json @@ -0,0 +1,214 @@ +{ + "stj": { + "version": "0.6.0", + "metadata": { + "transcriber": { + "name": "YAWT", + "version": "0.4.0" + }, + "created_at": "2024-10-24T15:30:00Z", + "source": { + "uri": "https://example.com/funny_conference.mp4", + "duration": 1800.0, + "languages": [ + "en", + "es", + "de" + ] + }, + "languages": [ + "en", + "es", + "de" + ], + "confidence_threshold": 0.6, + "extensions": { + "project": { + "name": "Annual Humor Conference", + "client": "LaughCorp International" + } + } + }, + "transcript": { + "speakers": [ + { + "id": "Speaker1", + "name": "Dr. Chuckles", + "extensions": { + "role": { + "title": "Keynote Speaker" + } + } + }, + { + "id": "Speaker2", + "name": "Ms. 
Giggles", + "extensions": { + "role": { + "title": "Panelist" + } + } + }, + { + "id": "Speaker3", + "name": "Herr Lachen", + "extensions": { + "role": { + "title": "Guest Speaker" + } + } + } + ], + "styles": [ + { + "id": "Style1", + "text": { + "color": "#FF5733", + "background": "#000000", + "bold": true, + "italic": false, + "underline": false, + "size": "100%" + }, + "display": { + "align": "center", + "vertical": "bottom", + "position": { + "x": "50%", + "y": "90%" + } + }, + "extensions": { + "custom_webvtt": { + "line": "auto" + } + } + } + ], + "segments": [ + { + "start": 0.0, + "end": 5.0, + "text": "Ladies and gentlemen, welcome to the Annual Humor Conference!", + "speaker_id": "Speaker1", + "confidence": 0.98, + "language": "en", + "style_id": "Style1", + "word_timing_mode": "complete", + "words": [ + { + "start": 0.0, + "end": 0.5, + "text": "Ladies", + "confidence": 0.99 + }, + { + "start": 0.5, + "end": 0.7, + "text": "and", + "confidence": 0.98 + }, + { + "start": 0.7, + "end": 1.2, + "text": "gentlemen,", + "confidence": 0.97 + }, + { + "start": 1.3, + "end": 2.0, + "text": "welcome", + "confidence": 0.99 + }, + { + "start": 2.1, + "end": 2.3, + "text": "to", + "confidence": 0.98 + }, + { + "start": 2.3, + "end": 2.5, + "text": "the", + "confidence": 0.98 + }, + { + "start": 2.6, + "end": 3.5, + "text": "Annual", + "confidence": 0.97 + }, + { + "start": 3.6, + "end": 4.2, + "text": "Humor", + "confidence": 0.96 + }, + { + "start": 4.3, + "end": 5.0, + "text": "Conference!", + "confidence": 0.96 + } + ] + }, + { + "start": 5.1, + "end": 10.0, + "text": "Did you hear about the mathematician who's afraid of negative numbers?", + "speaker_id": "Speaker1", + "confidence": 0.96, + "language": "en", + "style_id": "Style1", + "word_timing_mode": "none" + }, + { + "start": 10.1, + "end": 12.0, + "text": "He'll stop at nothing to avoid them!", + "speaker_id": "Speaker1", + "confidence": 0.95, + "language": "en", + "style_id": "Style1", + 
"word_timing_mode": "none" + }, + { + "start": 12.1, + "end": 17.0, + "text": "¡Y ahora, un poco de humor en español!", + "speaker_id": "Speaker2", + "confidence": 0.94, + "language": "es", + "word_timing_mode": "none" + }, + { + "start": 17.1, + "end": 22.0, + "text": "¿Qué le dijo un techo a otro techo? ¡Techo de menos!", + "speaker_id": "Speaker2", + "confidence": 0.93, + "language": "es", + "word_timing_mode": "none" + }, + { + "start": 22.1, + "end": 27.0, + "text": "Und jetzt etwas auf Deutsch!", + "speaker_id": "Speaker3", + "confidence": 0.92, + "language": "de", + "word_timing_mode": "none" + }, + { + "start": 27.1, + "end": 32.0, + "text": "Warum können Seeräuber den Kreisumfang so gut berechnen? Weil sie Pi raten!", + "speaker_id": "Speaker3", + "confidence": 0.91, + "language": "de", + "word_timing_mode": "none" + } + ] + } + } +} \ No newline at end of file diff --git a/examples/v0.6.0/multilingual.stj.json b/examples/v0.6.0/multilingual.stj.json new file mode 100644 index 0000000..a3bfe5f --- /dev/null +++ b/examples/v0.6.0/multilingual.stj.json @@ -0,0 +1,76 @@ +{ + "stj": { + "version": "0.6.0", + "metadata": { + "transcriber": { + "name": "YAWT", + "version": "0.4.0" + }, + "created_at": "2024-10-24T12:00:00Z", + "source": { + "uri": "https://example.com/conference.mp4", + "duration": 3600.0, + "languages": [ + "en", + "es", + "de" + ] + }, + "languages": [ + "fr", + "it" + ], + "extensions": { + "event": "International Multilingual Conference" + } + }, + "transcript": { + "speakers": [ + { + "id": "Speaker1", + "name": "Dr. 
Smith", + "extensions": { + "role": "Keynote Speaker" + } + }, + { + "id": "Speaker2", + "name": "Señora García", + "extensions": { + "role": "Panelist" + } + }, + { + "id": "Speaker3", + "name": "Herr Müller", + "extensions": { + "role": "Guest Speaker" + } + } + ], + "segments": [ + { + "start": 0.0, + "end": 5.0, + "text": "Bonjour à tous.", + "speaker_id": "Speaker1", + "language": "fr" + }, + { + "start": 5.1, + "end": 10.0, + "text": "Benvenuti a tutti.", + "speaker_id": "Speaker2", + "language": "it" + }, + { + "start": 10.1, + "end": 15.0, + "text": "Merci d'être venus.", + "speaker_id": "Speaker3", + "language": "fr" + } + ] + } + } +} \ No newline at end of file diff --git a/examples/v0.6.0/simple.stj.json b/examples/v0.6.0/simple.stj.json new file mode 100644 index 0000000..a356b3c --- /dev/null +++ b/examples/v0.6.0/simple.stj.json @@ -0,0 +1,21 @@ +{ + "stj": { + "version": "0.6.0", + "metadata": { + "created_at": "2024-10-24T15:30:00Z", + "transcriber": { + "name": "YAWT", + "version": "0.4.0" + } + }, + "transcript": { + "segments": [ + { + "start": 0.0, + "end": 5.0, + "text": "Hello, world!" + } + ] + } + } +} \ No newline at end of file diff --git a/spec/CHANGELOG.md b/spec/CHANGELOG.md index eddf97b..8473634 100644 --- a/spec/CHANGELOG.md +++ b/spec/CHANGELOG.md @@ -1,17 +1,19 @@ # Changelog -# Changelog - ## [0.6.0] - 2024-10-27 -### Changed +### Breaking Changes -- **File Extension**: - - Selected `.stj.json` as the single primary file extension for consistency. - - Removed alternative extensions to prevent confusion and encourage standardization. 
+- **File Extensions**: + - Changed primary recommended extension from `.stj.json` to `.stjson` + - Added `.stj` and `.stj.json` as alternative supported extensions + - Applications should be updated to: + - Recognize all three extensions (`.stjson`, `.stj`, `.stj.json`) + - Use `.stjson` as default when creating new files + - Continue supporting `.stj.json` for backward compatibility - **Root Structure**: - - Moved the `version` field from the `metadata` section to the root `"stj"` object. + - Moved the `version` field from the `metadata` section to the root `"stj"` object - Updated the root structure to: ```json @@ -24,74 +26,111 @@ } ``` - - Removed ordering requirements within JSON objects to align with JSON standards. + - Specified that no additional properties are allowed at the root level + +- **Character Encoding Requirements**: + - Changed UTF-8 Byte Order Mark (BOM) from optional to prohibited + - Files **MUST** be encoded in UTF-8 without a BOM + +### Changed - **Mandatory Fields**: - - Clarified that `transcript.segments[].start` and `transcript.segments[].end` are optional fields that become mandatory when timing information is included. - - Updated the "Mandatory vs. Optional Fields Summary" to reflect these changes. + - Clarified that `transcript.segments[].start` and `transcript.segments[].end` are optional fields that become mandatory when timing information is included + - Made the `metadata` section optional + - Updated the "Mandatory vs. Optional Fields Summary" to reflect these changes - **Handling of Anonymous Speakers**: - - Specified that the `name` field **MUST** be omitted for anonymous speakers. - - Updated examples to remove the `"name": "Unknown"` entries for anonymous speakers. - - Ensured consistency in representing anonymous speakers throughout the document. 
+ - Specified that the `name` field **MUST** be omitted for anonymous speakers + - Updated examples to remove the `"name": "Unknown"` entries for anonymous speakers + - Ensured consistency in representing anonymous speakers throughout the document - **Time Format Requirements**: - - Corrected grammatical errors for clarity. - - Removed ordering constraints within JSON objects. - - Emphasized that `is_zero_duration` **MUST** be included when `start` equals `end`, and **MUST NOT** be included otherwise. + - Specified the rounding rules for time values with more than 3 decimal places, using IEEE 754 round-to-nearest-even + - Updated examples to illustrate the rounding behavior and edge cases + - Emphasized that `is_zero_duration` **MUST** be included when `start` equals `end`, and **MUST NOT** be included otherwise + - Added detailed processing requirements for time values, including validation severity levels - **Extensions Field Requirements**: - - Corrected formatting errors and improved clarity regarding custom namespace guidelines. - - Emphasized that applications **MUST** report an error if a reserved namespace is used. - - Provided clearer guidance on using prefixes like `"custom_"` for provisional namespaces. + - Corrected formatting errors and improved clarity regarding custom namespace guidelines + - Emphasized that applications **MUST** report an error if a reserved namespace is used + - Provided clearer guidance on using prefixes like `"custom_"` for provisional namespaces - **Validation Approach**: - - Reordered validation steps for logical flow, moving "Extensions Validation" after "Application-Specific Validation". - - Updated the "Validation Sequence" to reflect this change. 
+ - Added explicit severity levels (ERROR, WARNING, INFO) for validation issues + - Defined specific validation requirements and their corresponding severity levels + - Added structured validation response format requirements + - Reordered validation steps for logical flow + - Added performance considerations for validation implementations + - Required implementations to collect multiple validation issues when possible - **Best Practices and Compliance**: - - Removed any ordering requirements within JSON objects, as JSON objects are unordered collections. - - Ensured that all examples and guidelines align with JSON standards and best practices. - - Maintained consistency in terminology and formatting throughout the document. + - Removed any ordering requirements within JSON objects, as JSON objects are unordered collections + - Ensured that all examples and guidelines align with JSON standards and best practices + - Maintained consistency in terminology and formatting throughout the document ### Fixed - **JSON Examples**: - - Corrected all JSON examples to ensure validity. - - Removed comments within JSON code blocks, as they are not allowed in JSON syntax. + - Corrected all JSON examples to ensure validity + - Removed comments within JSON code blocks, as they are not allowed in JSON syntax - **Formatting Errors**: - - Corrected typographical errors and improved overall formatting for better readability. - - Ensured consistent use of terminology and style throughout the document. + - Corrected typographical errors and improved overall formatting for better readability + - Ensured consistent use of terminology and style throughout the document - **Consistency Issues**: - - Addressed inconsistencies regarding the usage of `is_zero_duration`. - - Confirmed the consistent treatment of overlapping segments as **WARNING** level issues across all relevant sections. 
+ - Addressed inconsistencies regarding the usage of `is_zero_duration` + - Updated examples to reflect the correct usage of `is_zero_duration` + - Confirmed the consistent treatment of overlapping segments as **ERROR** level issues across all relevant sections ### Added -- **Clarification on File Extensions**: - - Added explanations on the rationale for selecting a single primary file extension. - - Encouraged standardization to prevent confusion among users and developers. +- **Empty Value Handling**: + - Added explicit rules for handling empty arrays, objects, and strings + - Specified which fields may be empty and which must be omitted + - Added validation requirements for empty value handling + +- **Validation Response Format**: + - Added structured validation response format requirements + - Specified required fields for validation responses (severity, path, code, message, etc.) + - Added examples of proper validation response formatting - **Clarification on `start` and `end` Fields**: - - Provided clear guidance on when `start` and `end` fields are required. - - Emphasized that they become mandatory when timing information is included. + - Provided clear guidance on when `start` and `end` fields are required + - Emphasized that they become mandatory when timing information is included + +- **RFC 2119 Key Words**: + - Added a section defining the usage of requirement level keywords (MUST, SHOULD, etc.) as per RFC 2119 + - Ensured consistent use of these keywords throughout the document ### Removed - **Ordering Constraints in JSON Objects**: - - Removed any statements imposing ordering on fields within JSON objects. 
+ - Removed any statements imposing ordering on fields within JSON objects + +- **Mandatory `metadata` Section**: + - Removed the requirement for the `metadata` section to be mandatory + - Updated the specification to reflect that `metadata` is now optional ### Clarified - **Usage of `extensions` Field**: - - Provided clearer guidance on the usage of custom namespaces within the `extensions` field. - - Encouraged developers to use prefixes like `"custom_"` to avoid conflicts with reserved namespaces. + - Provided clearer guidance on the usage of custom namespaces within the `extensions` field + - Encouraged developers to use prefixes like `"custom_"` to avoid conflicts with reserved namespaces - **Validation Requirements**: - - Specified that implementations **MUST** perform validation in the sequence outlined to ensure consistency and completeness. - - Clarified the severity levels for validation issues and the appropriate handling for each. + - Specified that implementations **MUST** perform validation in the sequence outlined to ensure consistency and completeness + - Clarified the severity levels for validation issues and the appropriate handling for each + - Added guidance on implementing error recovery strategies + - Specified when recovery attempts are appropriate + +- **Character Encoding Requirements**: + - Clarified that the UTF-8 Byte Order Mark (BOM) **MUST NOT** be used + - Provided guidance on proper handling of control characters and Unicode normalization + +- **Time Value Processing**: + - Clarified the processing and validation requirements for time values, including rounding rules and edge cases + - Added examples to illustrate proper handling of time values ## [0.5.0] - 2024-10-24 diff --git a/spec/latest/stj-specification.md b/spec/latest/stj-specification.md index 682703d..32c90b0 100644 --- a/spec/latest/stj-specification.md +++ b/spec/latest/stj-specification.md @@ -1,7 +1,7 @@ # Standard Transcription JSON (STJ) Format Specification 
-**Version**: 0.5 -**Date**: 2024-10-24 +**Version**: 0.6.0 +**Date**: 2024-10-27 ## Introduction @@ -9,9 +9,21 @@ The **Standard Transcription JSON (STJ)** format is a proposed standard for repr The STJ format includes detailed transcription segments with associated metadata such as speaker information, timestamps, confidence scores, language codes, and styling options. It also allows for optional metadata about the transcription process, source input, and the transcriber application. -**File Extension**: `.stj.json` -**MIME Type**: `application/vnd.stj+json` -**Character Encoding**: UTF-8 +## RFC 2119 Key Words + +This document uses requirement level keywords as defined in [RFC 2119](https://www.ietf.org/rfc/rfc2119.txt): + +- **MUST**, **REQUIRED**, **SHALL**: The requirement is absolute. +- **MUST NOT**, **SHALL NOT**: The behavior/feature is absolutely prohibited. +- **SHOULD**, **RECOMMENDED**: There may be valid reasons to ignore this requirement, but implications must be understood and carefully weighed. +- **SHOULD NOT**, **NOT RECOMMENDED**: There may be valid reasons to allow this behavior, but implications must be understood and carefully weighed. +- **MAY**, **OPTIONAL**: The item is truly optional. + +These keywords are presented in **UPPERCASE** throughout this document to indicate their special meanings. + +## Version History + +For a detailed list of changes between versions, please see the [CHANGELOG.md](../CHANGELOG.md) file. ## Objectives @@ -24,53 +36,121 @@ The STJ format includes detailed transcription segments with associated metadata ## Specification -The STJ files must include a `version` field within the `metadata` section to indicate the specification version they comply with. This facilitates compatibility and proper validation across different implementations. 
+- **File Extensions**: + - Primary (Recommended): `.stjson` + - Alternative: `.stj` + - Alternative: `.stj.json` (systems supporting double extensions) +- **MIME Type**: `application/vnd.stj+json` +- **Character Encoding**: UTF-8 -### Version History +The STJ files **MUST** include a `version` field within the `stj` section to indicate the specification version they comply with. This facilitates compatibility and proper validation across different implementations. -For a detailed list of changes between versions, please see the [CHANGELOG.md](../CHANGELOG.md) file. +### MIME Type Registration + +The MIME type `application/vnd.stj+json` is designated for the STJ format. This MIME type is currently pending registration with the Internet Assigned Numbers Authority (IANA). Implementations **SHOULD** use this MIME type when serving STJ files over HTTP or in other contexts where MIME types are applicable. + +Until the registration is finalized, applications **MAY** use `application/json` as a fallback but **SHOULD** transition to `application/vnd.stj+json` once registration is complete. + +### Root Structure + +The STJ file **MUST** contain a single JSON object with the root property name `"stj"`. This root object MUST contain the mandatory fields `version` and `transcript`, and **MAY** include the optional `metadata` field. + +```json +{ + "stj": { + "version": "0.6.0", + "transcript": { ... } + } +} +``` + +- **`version`**: Specifies the STJ specification version the file adheres to. +- **`transcript`**: Contains the actual transcription data, including segments (see the [Transcript Section](#transcript-section) for details). + +The `"metadata"` field is optional and can be included to provide additional context (see the [Metadata Section](#metadata-section) for details). + +```json +{ + "stj": { + "version": "0.6.0", + "metadata": { ... }, + "transcript": { ... } + } +} +``` -### Overview +No additional properties are allowed at the root level. 
-The STJ file is a JSON object containing two main sections: +#### Examples of invalid root structures -- `"metadata"`: Contains information about the transcription process, source input, and other relevant details. -- `"transcript"`: Contains the actual transcription data, including speaker information, segments, and optional styling. +- Invalid: **Missing mandatory fields:** ```json { - "metadata": { ... }, - "transcript": { ... } + "stj": {} +} +``` + +- Invalid: **Missing transcript:** + +```json +{ + "stj": { + "version": "0.6.0" + } +} +``` + +- Invalid: **Missing stj root object:** + +```json +{ + "version": "0.6.0", // ERROR: Missing stj root object + "transcript": {} } ``` ### Mandatory vs. Optional Fields - **Mandatory Fields**: Essential for basic functionality and compatibility. -- **Optional Fields**: Provide additional information and features but are not required for basic use. + - `stj.version` + - `transcript.segments` (array) + - `transcript.segments[].text` +- **Optional Fields**: Provide additional information and features but are not required for basic use: All other fields, including `metadata`, `start`, `end`, `speakers`, `styles`, `speaker_id`, `confidence`, `language`, `style_id`, `words`, `word_timing_mode`, etc. + +**Note**: If any segment includes timing information, both `start` and `end` become mandatory for that segment and all other segments in the transcript. ### Metadata Section -The `"metadata"` object includes optional and required fields providing context about the transcription. +The `"metadata"` object is **OPTIONAL** and **MAY** include fields providing context about the transcription. The metadata object MAY be empty to indicate metadata processing was attempted but found no properties. #### Fields -- **transcriber** *(mandatory)*: Information about the transcriber application or service. - - **name** *(string, mandatory)*: Name of the transcriber application. 
- - **version** *(string, mandatory)*: Version of the transcriber application. -- **created_at** *(string, mandatory)*: ISO 8601 timestamp indicating when the transcription was created. -- **version** *(mandatory)*: Specifies the STJ specification version the file adheres to. - - **Format**: Semantic versioning (e.g., `"0.5.0"`) - - **Pattern**: Must follow the regex pattern `^\d+\.\d+\.\d+$` to ensure semantic versioning. -- **source** *(optional)*: Information about the source of the audio/video. +- **transcriber** *(object, optional)*: Information about the transcriber application or service. + - **name** *(string, optional)*: Name of the transcriber application. + - **version** *(string, optional)*: Version of the transcriber application. +- **created_at** *(string, optional)*: ISO 8601 timestamp indicating when the transcription was created. +- **source** *(object, optional)*: Information about the source of the audio/video. - **uri** *(string, optional)*: The URI of the source media. - - MUST conform to the **URI Format Requirements** specified in the **Field Definitions and Constraints** section. - **duration** *(number, optional)*: Duration of the media in seconds. - **languages** *(array of strings, optional)*: List of languages present in the source media, ordered by prevalence. - **languages** *(array of strings, optional)*: List of languages present in the transcription, ordered by prevalence. - **confidence_threshold** *(number, optional)*: Confidence threshold used during transcription (0.0 - 1.0). - **extensions** *(object, optional)*: A key-value map for any additional metadata. - MUST conform to the **Extensions Field Requirements** specified in the **Field Definitions and Constraints** section. + **Note**: The `metadata` section is optional. Include it to provide additional context about the transcription as needed. 
+ +#### Example + +```json +"metadata": { + "transcriber": { + "name": "YAWT", + "version": "0.6.0" + }, + "created_at": "2024-10-27T12:00:00Z" +}, +``` #### Clarification on `languages` Fields @@ -94,14 +174,13 @@ The STJ format includes two `languages` fields within the `metadata` section to "name": "YAWT", "version": "0.4.0" }, - "created_at": "2023-10-20T12:00:00Z", - "version": "0.5.0", + "created_at": "2024-10-20T12:00:00Z", "source": { "uri": "https://example.com/multilingual_media.mp4", "duration": 3600.5, - "languages": ["en", "es"] // Source languages: English and Spanish + "languages": ["en", "es"] }, - "languages": ["fr"], // Transcription language: French + "languages": ["fr"], "confidence_threshold": 0.6, "extensions": { "project_info": { @@ -120,8 +199,8 @@ The `"transcript"` object contains the transcription data, including speaker inf #### Fields -- **speakers** *(array, optional)*: List of speaker objects. -- **styles** *(array, optional)*: List of style definitions for formatting and positioning. +- **speakers** *(array, optional)*: List of speaker objects. May be empty to indicate speaker identification was attempted but no speakers were found. +- **styles** *(array, optional)*: List of style definitions for formatting and positioning. May be empty to indicate style processing was performed but no styles were defined. - **segments** *(array, mandatory)*: List of transcription segments. #### Speakers @@ -130,7 +209,7 @@ Each speaker object includes: - **id** *(string, mandatory)*: Unique identifier for the speaker. - MUST conform to the **Speaker ID Requirements** specified in the **Field Definitions and Constraints** section. -- **name** *(string, optional)*: Display name of the speaker. +- **name** *(string, optional)*: Display name of the speaker. May be empty to indicate an anonymous or unnamed speaker. - **extensions** *(object, optional)*: Any additional information about the speaker. 
##### Example @@ -140,10 +219,12 @@ Each speaker object includes: { "id": "Speaker1", "name": "Dr. Smith" }, { "id": "Speaker2", "name": "Señora García" }, { "id": "Speaker3", "name": "Monsieur Dupont" }, - { "id": "Speaker4", "name": "Unknown" } // Anonymous speaker + { "id": "Speaker4" } ] ``` +In this example, Speaker4 is anonymous or unknown. + #### Styles Each style object defines text presentation rules that can be referenced by segments. Basic formatting features are defined in a format-agnostic way, while advanced features can be implemented using `extensions`. @@ -260,16 +341,24 @@ Style with format-specific features: Each segment object includes: -- **start** *(number, mandatory)*: Start time of the segment in seconds. -- **end** *(number, mandatory)*: End time of the segment in seconds. +- **start** *(number, conditionally mandatory)*: Start time of the segment in seconds. If present, `end` **MUST** also be present. +- **end** *(number, conditionally mandatory)*: End time of the segment in seconds. If present, `start` **MUST** also be present. +- **is_zero_duration** *(boolean)*: Indicates that the segment has zero duration. + - **MUST** be present and set to `true` when `start` equals `end` + - **MUST NOT** be present when `start` does not equal `end` + - If present, **MUST** be `true` - **text** *(string, mandatory)*: Transcribed text of the segment. - **speaker_id** *(string, optional)*: The `id` of the speaker from the `speakers` list. - **confidence** *(number, optional)*: Confidence score for the segment (0.0 - 1.0). - **language** *(string, optional)*: Language code for the segment (ISO 639-1 or ISO 639-3). - **style_id** *(string, optional)*: The `id` of the style from the `styles` list. -- **words** *(array, optional)*: List of word-level details. +- **words** *(array, optional)*: List of word-level details. When present (in "complete" or "partial" modes), must contain at least one word.
Must be omitted entirely (not included as empty) when using `word_timing_mode: "none"` for segments where word timing isn't applicable or fails. - **start** *(number, mandatory)*: Start time of the word in seconds. - **end** *(number, mandatory)*: End time of the word in seconds. + - **is_zero_duration** *(boolean)*: Indicates that the word has zero duration. + - **MUST** be present and set to `true` when `start` equals `end` + - **MUST NOT** be present when `start` does not equal `end` + - If present, **MUST** be `true` - **text** *(string, mandatory)*: The word text. - **confidence** *(number, optional)*: Confidence score for the word (0.0 - 1.0). - **word_timing_mode** *(string, optional)*: Indicates the completeness of word-level timing data within the segment. @@ -278,54 +367,73 @@ Each segment object includes: ##### Example ```json -"segments": [ - { - "start": 0.0, - "end": 5.0, - "text": "Bonjour tout le monde.", - "speaker_id": "Speaker1", - "confidence": 0.95, - "language": "fr", - "style_id": "Style1", - "word_timing_mode": "complete", - "words": [ - { "start": 0.0, "end": 1.0, "text": "Bonjour" }, - { "start": 1.0, "end": 2.0, "text": "tout" }, - { "start": 2.0, "end": 3.0, "text": "le" }, - { "start": 3.0, "end": 4.0, "text": "monde."
} - ] - }, - { - "start": 5.1, - "end": 10.0, - "text": "Gracias por estar aquí hoy.", - "speaker_id": "Speaker2", - "confidence": 0.93, - "language": "es", - "word_timing_mode": "partial", - "words": [ - { "start": 5.1, "end": 5.5, "text": "Gracias" } - // Remaining words are not included - ] - }, - { - "start": 10.1, - "end": 15.0, - "text": "Hello everyone, and welcome.", - "speaker_id": "Speaker3", - "confidence": 0.92, - "language": "en", - "word_timing_mode": "none" - // No words array provided +{ + "stj": { + "version": "0.6.0", + "transcript": { + "speakers": [ + {"id": "Speaker1", "name": "Speaker One"}, + {"id": "Speaker2", "name": "Speaker Two"}, + {"id": "Speaker3", "name": "Speaker Three"} + ], + "styles": [ + { + "id": "Style1", + "text": { + "color": "#FFFFFF", + "background": "#000000" + } + } + ], + "segments": [ + { + "start": 0.0, + "end": 5.0, + "text": "Bonjour tout le monde.", + "speaker_id": "Speaker1", + "confidence": 0.95, + "language": "fr", + "style_id": "Style1", + "word_timing_mode": "complete", + "words": [ + { "start": 0.0, "end": 1.0, "text": "Bonjour" }, + { "start": 1.0, "end": 2.0, "text": "tout" }, + { "start": 2.0, "end": 3.0, "text": "le" }, + { "start": 3.0, "end": 4.0, "text": "monde." } + ] + }, + { + "start": 5.1, + "end": 10.0, + "text": "Gracias por estar aquí hoy.", + "speaker_id": "Speaker2", + "confidence": 0.93, + "language": "es", + "word_timing_mode": "partial", + "words": [ + { "start": 5.1, "end": 5.5, "text": "Gracias" } + ] + }, + { + "start": 10.1, + "end": 10.1, + "is_zero_duration": true, + "text": "[Applause]", + "speaker_id": "Speaker3", + "confidence": 0.92, + "language": "en" + } + ] + } } -] +} ``` In this example: - The first segment has complete word-level data (`word_timing_mode`: `"complete"`). - The second segment has partial word-level data (`word_timing_mode`: `"partial"`). -- The third segment has no word-level data (`word_timing_mode`: `"none"` or omitted). 
+- The third segment is a zero-duration segment, which must not have word timing mode or words array. ### Handling Multiple Languages @@ -351,41 +459,44 @@ In this example: Imagine a video where presenters speak in English and Spanish, and the transcription has been translated entirely into French and German. ```json -"metadata": { - "transcriber": { - "name": "YAWT", - "version": "0.4.0" - }, - "created_at": "2023-10-20T12:00:00Z", - "version": "0.5.0", - "source": { - "uri": "https://example.com/event.mp4", - "duration": 5400.0, - "languages": ["en", "es"] - }, - "languages": ["fr", "de"], - "extensions": { ... } -}, -"transcript": { - "segments": [ - { - "start": 0.0, - "end": 5.0, - "text": "Bonjour à tous.", - "speaker_id": "Speaker1", - "confidence": 0.95, - "language": "fr" +{ + "stj": { + "version": "0.6.0", + "metadata": { + "transcriber": { + "name": "YAWT", + "version": "0.4.0" + }, + "created_at": "2024-10-20T12:00:00Z", + "source": { + "uri": "https://example.com/event.mp4", + "duration": 5400.0, + "languages": ["en", "es"] + }, + "languages": ["fr", "de"], + "extensions": { ... } }, - { - "start": 5.1, - "end": 10.0, - "text": "Willkommen alle zusammen.", - "speaker_id": "Speaker2", - "confidence": 0.94, - "language": "de" + "transcript": { + "segments": [ + { + "start": 0.0, + "end": 5.0, + "text": "Bonjour à tous.", + "speaker_id": "Speaker1", + "confidence": 0.95, + "language": "fr" + }, + { + "start": 5.1, + "end": 10.0, + "text": "Willkommen alle zusammen.", + "speaker_id": "Speaker2", + "confidence": 0.94, + "language": "de" + } + ] } - // More segments... - ] + } } ``` @@ -398,121 +509,373 @@ In this example: ### Optional vs. 
Mandatory Fields Summary - **Mandatory Fields**: - - `metadata.transcriber.name` - - `metadata.transcriber.version` - - `metadata.created_at` - - `metadata.version` + - `stj.version` - `transcript.segments` (array) - - `transcript.segments[].start` - - `transcript.segments[].end` - `transcript.segments[].text` + - `transcript.segments[].start` (if timing information is relevant) + - `transcript.segments[].end` (if timing information is relevant) - **Optional Fields**: - - All other fields, including `speakers`, `styles`, `speaker_id`, `confidence`, `language`, `style_id`, `words`, `word_timing_mode`, etc. + - `metadata` and all its subfields + - `speakers`, `styles`, `speaker_id`, `confidence`, `language`, `style_id`, `words`, `word_timing_mode`, etc. + +**Note**: When optional fields are present but empty (empty arrays, objects, or strings), this indicates the field was processed but no content was found. When optional fields are omitted entirely, this indicates the field was not processed or is not applicable. See the Empty Value Constraints section under Structural Requirements for details. ## Field Definitions and Constraints +This section outlines the requirements and constraints for various fields used within the STJ format. It includes structural requirements, data type specifications, and detailed constraints for specific fields. 
+ +### Structural Requirements + +### Default Behavior for Optional Fields + +By default, optional fields **SHOULD** be omitted entirely when: + +- The field is not applicable to the content +- The related feature or processing was not attempted +- There is no meaningful data to include + +#### Empty Array Rules + +- **Always Empty Allowed**: + - `speakers`: When speaker identification attempted but none found + - `styles`: When style processing performed but no styles defined + +- **Never Empty Allowed**: + - `segments`: Must contain at least one segment + - `languages`: If present, **MUST** contain at least one entry + - `words`: **MUST NOT** be empty in any word timing mode: + - In "complete" mode: Must contain all words with timing + - In "partial" mode: Must contain at least one word with timing + - In "none" mode: Array must be entirely omitted + - For segments where word timing fails or is not applicable: Use `word_timing_mode: "none"` and omit the array + +#### Empty Object Rules + +- **Always Empty Allowed**: + - `metadata`: When processing occurred but found no properties + - `extensions`: When processing occurred but found no valid extensions +- **Never Empty Allowed**: + - Required object fields + +#### Empty String Rules + +- **Always Empty Allowed**: + - `speaker.name`: For unnamed/anonymous speakers +- **Never Empty Allowed**: + - All other string fields + +When in doubt, omit optional fields entirely rather than including them as empty. + +#### Empty Value Constraints + +- **Null Values**: + - Null values are **not allowed** for any field unless explicitly documented. + - Optional fields **MUST** be omitted entirely rather than set to null unless explicitly documented as allowing null. + - The `confidence` field **MAY** be null to indicate confidence scoring was attempted but failed. 
+ +##### Confidence Field Exception Details + +The `confidence` field is allowed to be null because it represents three distinct states that need to be distinguishable: + +1. **Field Omitted**: Confidence scoring was not attempted +2. **Null Value**: Confidence scoring was attempted but failed +3. **Numeric Value**: Confidence was successfully calculated (0.0 to 1.0) + +Example: + +```json +{ + "segments": [ + { + "text": "Hello world", + "confidence": null // Scoring attempted but failed + }, + { + "text": "Next segment" // No confidence scoring attempted + }, + { + "text": "Final segment", + "confidence": 0.95 // Successfully scored + } + ] +} +``` + +Applications processing STJ files should: + +- Treat a missing confidence field as "not attempted" +- Handle null confidence values as "attempted but failed" +- Process numeric confidence values normally + +#### Empty Arrays + +Optional arrays **MAY** be empty only in specific documented cases: + +- **Mandatory Arrays**: + - The `segments` array **MUST NOT** be empty. + - **Severity if violated:** ERROR + - The `languages` array, if present, **MUST** contain at least one entry.
+ - **Severity if violated:** ERROR + +- **Arrays That MAY Be Empty**: + - `speakers`: Empty array indicates speaker identification was attempted but no speakers were found + - `styles`: Empty array indicates style processing was performed but no styles were defined + +- **Special Case - Words Array**: + - The `words` array has specific rules: + - In "complete" mode: **MUST** contain all words with timing + - In "partial" mode: **MUST** contain at least one word + - In "none" mode: **MUST NOT** be present at all (array must be omitted entirely) + - Empty arrays are **NEVER** allowed in any mode + - **Severity if violated:** ERROR + - When word timing fails or isn't applicable: + - Use `word_timing_mode: "none"` + - Omit the `words` array entirely + - Do not include an empty array + +- **Default Behavior for Other Arrays**: + - Arrays **SHOULD** be omitted entirely rather than included as empty unless explicitly documented as allowing empty state + - Empty arrays in undocumented cases **SHOULD** result in a WARNING + +##### Examples + +Invalid cases: + +```json +{ + "segments": [], // Invalid: mandatory array must not be empty + "languages": [], // Invalid: if present, must contain at least one entry + "word_timing_mode": "complete", + "words": [] // Invalid: words array must not be empty when present +} +``` + +Guidance: Arrays **SHOULD** be omitted entirely (rather than included as empty) when: + +- The feature was not processed or is not applicable +- The presence of the array itself would be misleading + +#### Empty Objects + +- Empty objects are **not allowed** for required object fields. 
+ - **Severity if violated:** ERROR +- The following optional objects **MAY** be empty with specific semantic meanings: + - `metadata`: Empty object indicates metadata processing occurred but found no properties + - `extensions`: Empty object indicates extension processing occurred but found no valid extensions +- Other optional objects **SHOULD** be omitted entirely rather than included as empty unless they represent an intentionally empty state that needs to be distinguished from "not processed" or "not applicable". + - **Severity if violated:** WARNING + +#### Empty Strings + +- Empty strings are **not allowed** for any field except where explicitly documented. +- The following string fields **MAY** be empty with specific semantic meanings: + - `speaker.name`: Empty string indicates an intentionally unnamed or anonymous speaker +- All other optional string fields **MUST** be omitted entirely rather than included as empty strings. + +#### Empty Value Validation Requirements + +Implementations **MUST** validate: + +1. **Mandatory Arrays** + - The `segments` array **MUST NOT** be empty + - **Severity**: ERROR + +2. **Optional Arrays** + - Empty arrays are allowed only for: + - `speakers` + - `styles` + - Other documented cases where empty state has semantic meaning + - **Severity**: WARNING for unexpected empty arrays + +3. **Objects** + - Required objects **MUST NOT** be empty + - Optional objects may be empty only for: + - `metadata` + - `extensions` + - Other documented cases + - **Severity**: WARNING for unexpected empty objects + +4. **Default Field Omission** + - Optional fields **SHOULD** be omitted rather than included empty + - **Severity**: INFO when fields could be omitted + +#### Handling Empty Arrays and Objects Examples + +**Invalid Example of an Empty Mandatory `segments` Array:** + +```json +{ + "segments": [] +} +``` + +Explanation: The `segments` array is mandatory and must contain at least one segment. An empty `segments` array is invalid. 
+ +##### Example Validation Messages + +- ERROR: "segments array must not be empty" +- WARNING: "empty array found for field 'custom_data' - consider omitting the field entirely" +- INFO: "empty metadata object found - consider omitting if no metadata processing was performed" + +#### Array Ordering Requirements + +- **Ordered Arrays**: + - The `segments` array **MUST** maintain temporal order based on `start` times. + - The `words` array within segments **MUST** maintain temporal order based on `start` times. + +- **Unordered Arrays**: + - The `speakers` array order is **not significant**. + - The `styles` array order is **not significant**. + +#### String Content Requirements + +- Leading and trailing whitespace in string values **MUST** be preserved. +- String values **MAY** contain multiple consecutive whitespace characters. +- Line breaks in string values **MUST** be preserved. + +#### Number Format Requirements + +- All numeric values **MUST** use JSON number format. +- Scientific notation is **not allowed**. +- Leading zeros are **not allowed** except for: + - Decimal values less than 1 (e.g., `0.5`) + - Time values, which follow the [Time Format Requirements](#time-format-requirements) specified in their dedicated section +- The negative zero value (`-0`) is **not allowed**. +- The values `Infinity`, `-Infinity`, and `NaN` are **not allowed**. + +**Note:** For time-related fields (`start`, `end`), the [Time Format Requirements](#time-format-requirements) take precedence over these general number format requirements. See the [Time Format Requirements](#time-format-requirements) section for detailed specifications of time value formatting. 
+ ### Time Format Requirements -All time values in the STJ format (`start` and `end` fields) must follow these requirements: +All time values in the STJ format (`start` and `end` fields) **MUST** follow these requirements: #### Format Specifications -- Must be represented as non-negative decimal numbers -- Must have a precision of up to 3 decimal places (millisecond precision) -- Must not exceed 6 significant digits before the decimal point -- Values must be in the range [0.000, 999999.999] -- Leading zeros before the decimal point are allowed but not required -- Trailing zeros after the decimal point are allowed but not required -- The decimal point must be present if there are decimal places -- Scientific notation is not allowed +- **Type**: Non-negative decimal numbers +- **Precision Requirements**: + - Input: Any number of decimal places allowed + - Processing: Values with more than 3 decimal places MUST be rounded to 3 decimal places using IEEE 754 round-to-nearest-even + - Storage: Maximum 3 decimal places (millisecond precision) +- **Range**: [0.000, 999999.999] seconds (after rounding). + - The maximum value 999999.999 is inclusive. Any value that would round to greater than 999999.999 MUST be rejected, even if the unrounded value is less than 999999.999 (e.g., 999999.9994 is valid as it rounds to 999999.999, but 999999.9995 **MUST** be rejected as it would round to 1000000.000). 
+- **Significant Digits**: Must not exceed 6 digits before the decimal point +- **Formatting Rules**: + - Leading zeros before the decimal point are allowed but not required + - Trailing zeros after the decimal point are allowed but not required + - The decimal point MUST be present if there are decimal places + - Scientific notation is not allowed + - Comma decimal separators are not allowed (**MUST** use period) #### Basic Constraints - For any segment or word: - - `start` must not be greater than `end` - - Both `start` and `end` must be present and valid according to format specifications -- For zero-duration items (`start` equals `end`): - - Must include `is_zero_duration`: `true` - - For segments: - - Must not contain a `words` array - - Must not specify a `word_timing_mode` + - `start` **MUST NOT** be greater than `end` (after rounding) + - If either `start` or `end` is present, the other **MUST** also be present + - Both values **MUST** be valid according to format specifications - The `is_zero_duration` field: - - Must be `true` if and only if `start` equals `end` - - Must be `false` or omitted for items where `start` does not equal `end` - - Must not be included with value `false` when `start` equals `end` + - **MUST** be present and set to `true` when `start` equals `end` (after rounding) + - **MUST NOT** be present when `start` does not equal `end` (after rounding) + - If present, **MUST** be `true` + - For segments: + - **MUST NOT** contain a `words` array + - **MUST NOT** specify a `word_timing_mode` +- Including `is_zero_duration` when `start` does not equal `end` **MUST** result in an ERROR during validation + +#### Examples of Time Values -#### Examples of Valid Time Values +Valid Input Values and Their Processing: -- `0` (zero seconds) -- `0.0` (zero seconds) -- `0.000` (zero seconds with full precision) -- `1.5` (one and a half seconds) -- `10.100` (ten and one hundred milliseconds) -- `999999.999` (maximum allowed value) +- `0` → stored as `0` or 
`0.0` +- `0.0` → stored as `0.0` +- `0.000` → stored as `0.000` +- `1.5` → stored as `1.5` +- `10.100` → stored as `10.100` +- `999999.999` → stored as `999999.999` -#### Examples of Invalid Time Values +IEEE 754 Round-to-Nearest-Even Examples: + +- `1.2345` → `1.234` (rounded down as 4 is even) +- `1.2335` → `1.234` (rounded up as 4 is even) +- `1.2325` → `1.232` (rounded down as 2 is even) +- `1.2315` → `1.232` (rounded up as 2 is even) +- `1.2305` → `1.230` (rounded down as 0 is even) + +Edge Cases: + +- `0.0005` → `0.000` (rounded down to even) +- `0.0015` → `0.002` (rounded up to even) +- `0.0025` → `0.002` (rounded down to even) +- `0.0035` → `0.004` (rounded up to even) +- `0.0045` → `0.004` (rounded down to even) + +Invalid Values (Must Be Rejected): - `-1.0` (negative values not allowed) - `1.5e3` (scientific notation not allowed) - `1000000.0` (exceeds maximum value) -- `1.2345` (exceeds maximum precision) +- `999999.9995` (would round above maximum) - `1,5` (incorrect decimal separator) +- Non-numeric values + +**Note:** These requirements for time values take precedence over the general [Number Format Requirements](#number-format-requirements) when formatting time-related fields (`start` and `end`). ### Character Encoding Requirements #### Basic Requirements -- Files MUST be encoded in UTF-8 -- The UTF-8 Byte Order Mark (BOM) is optional -- JSON string values MUST follow RFC 8259 encoding rules -- The full Unicode character set MUST be supported +- Files **MUST** be encoded in UTF-8. +- The UTF-8 Byte Order Mark (BOM) **MUST NOT** be used. +- JSON string values **MUST** follow [RFC 8259](https://www.rfc-editor.org/rfc/rfc8259.html) encoding rules. +- The full Unicode character set **MUST** be supported. 
#### String Content Requirements -- All string values MUST: - - Be valid UTF-8 encoded text - - Properly escape control characters (U+0000 through U+001F) - - Properly handle surrogate pairs for supplementary plane characters -- Forward slash (`/`) characters MAY be escaped but escaping is not required -- Unicode normalization: - - All string values SHOULD be normalized to Unicode Normalization Form C (NFC) - - Applications MUST preserve the normalization form of input text - - Applications MAY normalize text for comparison or search operations +- All string values **MUST**: + - Be valid UTF-8 encoded text. + - Properly escape control characters (U+0000 through U+001F) using `\u` notation. + - Properly handle surrogate pairs for supplementary plane characters. +- Forward slash (`/`) characters **MAY** be escaped but escaping is **not required**. +- Applications **MUST** properly handle and preserve escaped control characters when parsing and generating STJ files. +- **Unicode Normalization**: + - All string values **SHOULD** be normalized to Unicode Normalization Form C (NFC). + - Applications **MUST NOT** alter the normalization form of the text when storing or transmitting it, but **MAY** perform normalization internally for operations like comparison or searching. ### Confidence Scores -Confidence scores are floating-point numbers between `0.0` (no confidence) and `1.0` (full confidence). They are optional but recommended. +- **Type**: Floating-point numbers between `0.0` (no confidence) and `1.0` (full confidence). +- **Usage**: Optional but recommended for segments and words. +- **Purpose**: Provides an indication of the reliability of the transcribed text. ### Language Codes -ISO 639-1 (two-letter codes) is the primary standard and MUST be used when the language has an ISO 639-1 code. 
- -- Example: Use "en" for English, "fr" for French, "es" for Spanish +#### Standards -ISO 639-3 (three-letter codes) MUST only be used for languages that do not have an ISO 639-1 code. - -- Example: Use "yue" for Cantonese (no ISO 639-1 code), but use "zh" for Mandarin Chinese (has ISO 639-1 code) +- **Primary Standard**: ISO 639-1 (two-letter codes) **MUST** be used when available. + - Examples: `"en"` for English, `"fr"` for French, `"es"` for Spanish. +- **Secondary Standard**: ISO 639-3 (three-letter codes) **MUST** be used only for languages without an ISO 639-1 code. + - Example: `"yue"` for Cantonese (no ISO 639-1 code), but use `"zh"` for Mandarin Chinese (has ISO 639-1 code). #### Consistency Requirements -- A single STJ file MUST NOT mix ISO 639-1 and ISO 639-3 codes for the same language -- All references to a specific language within a file MUST use the same code consistently -- When a language has both ISO 639-1 and ISO 639-3 codes, the ISO 639-1 code MUST be used +- A single STJ file **MUST NOT** mix ISO 639-1 and ISO 639-3 codes for the same language. +- All references to a specific language within a file **MUST** use the same code consistently. +- When a language has both ISO 639-1 and ISO 639-3 codes, the ISO 639-1 code **MUST** be used. #### Application Requirements -Applications MUST: +Applications **MUST**: -- Process both ISO 639-1 and ISO 639-3 codes +- Process both ISO 639-1 and ISO 639-3 codes. - Validate that: - - ISO 639-1 codes are used when available - - ISO 639-3 codes are only used for languages without ISO 639-1 codes - - Language codes are used consistently throughout the file + - ISO 639-1 codes are used when available. + - ISO 639-3 codes are only used for languages without ISO 639-1 codes. + - Language codes are used consistently throughout the file. 
- Reject files that: - - Use ISO 639-3 codes for languages that have ISO 639-1 codes - - Mix different standards for the same language - - Contain invalid language codes + - Use ISO 639-3 codes for languages that have ISO 639-1 codes. + - Mix different standards for the same language. + - Mix standards across different languages. + - Contain invalid language codes. ### URI Format Requirements @@ -524,33 +887,30 @@ Defines the format and constraints for the `uri` field in the `metadata.source` - **Type**: String representing a Uniform Resource Identifier (URI) as defined in [RFC 3986](https://www.rfc-editor.org/rfc/rfc3986.html). - **Allowed Schemes**: - - **Recommended**: + - **Required Support**: - `http` - `https` - - `file` - - **Optional**: - - Other schemes (e.g., `ftp`, `s3`, `rtsp`) MAY be used if appropriate. + - **Optional Support**: + - Other schemes (e.g., `file`, `ftp`, `s3`, `rtsp`) **MAY** be used if appropriate. - **Absolute URIs**: - - The `uri` SHOULD be an absolute URI, including the scheme component. + - The `uri` **SHOULD** be an absolute URI, including the scheme component. - Examples: - `"http://example.com/media/video.mp4"` - `"https://example.com/media/audio.mp3"` - `"file:///C:/Media/video.mp4"` (Windows) - `"file:///home/user/media/audio.mp3"` (Unix-like systems) - **Relative URIs**: - - Relative URIs or file paths SHOULD NOT be used. - - If a relative URI is provided, consuming applications MUST resolve it relative to a known base URI. + - Relative URIs or file paths **SHOULD NOT** be used. + - If a relative URI is provided, consuming applications **MUST** resolve it relative to a known base URI. - **Note**: Relative URIs can lead to ambiguity and are discouraged. #### Validation Rules -- The `uri` MUST conform to the syntax defined in RFC 3986. -- Implementations SHOULD validate the URI format and report errors if invalid. +- The `uri` **MUST** conform to the syntax defined in RFC 3986. 
+- Implementations **SHOULD** validate the URI format and report errors if invalid. - **Scheme Support**: - - **Required Support**: - - Implementations MUST support `http` and `https` schemes. - - **Optional Support**: - - Support for other schemes is OPTIONAL and may vary between implementations. + - Implementations **MUST** support `http` and `https` schemes. + - Support for other schemes is **optional** and may vary between implementations. #### Security Considerations @@ -558,7 +918,7 @@ Defines the format and constraints for the `uri` field in the `metadata.source` - Be cautious when including URIs that may reveal sensitive information, such as local file paths or internal network addresses. - Consider omitting the `uri` or sanitizing it if privacy is a concern. - **Security Risks**: - - Applications consuming STJ files SHOULD handle URIs carefully to avoid security risks such as directory traversal or accessing unauthorized resources. + - Applications consuming STJ files **SHOULD** handle URIs carefully to avoid security risks such as directory traversal or accessing unauthorized resources. #### Examples @@ -596,25 +956,25 @@ Defines the format and constraints for the `uri` field in the `metadata.source` #### Format Specifications -- **Type**: String -- **Allowed Characters**: Letters (A-Z, a-z), digits (0-9), underscores (_), and hyphens (-). +- **Type**: String. +- **Allowed Characters**: Letters (`A-Z`, `a-z`), digits (`0-9`), underscores (`_`), and hyphens (`-`). - **Length Constraints**: - - Minimum length: 1 character - - Maximum length: 64 characters + - Minimum length: 1 character. + - Maximum length: 64 characters. - **Uniqueness**: - - Speaker IDs MUST be unique within the `speakers` list. - - `speaker_id` references in segments MUST match an `id` in the `speakers` list. + - Speaker IDs **MUST** be unique within the `speakers` list. + - `speaker_id` references in segments **MUST** match an `id` in the `speakers` list. 
- **Case Sensitivity**: - Speaker IDs are case-sensitive; `Speaker1` and `speaker1` are considered different IDs. - **Format Recommendations**: - - Use meaningful identifiers when possible, e.g., `Speaker_JohnDoe`. - - For anonymous speakers, use generic IDs like `Speaker1`, `Speaker2`, etc. + - Use meaningful identifiers when possible, e.g., `"Speaker_JohnDoe"`. + - For anonymous speakers, use generic IDs like `"Speaker1"`, `"Speaker2"`, etc. #### Representing Anonymous Speakers - **When the Speaker is Unknown or Anonymous**: - - Use a consistent placeholder ID, such as `Speaker1`, `Speaker2`, etc. - - The `name` field MAY be omitted or set to a placeholder like `"Unknown"` or `"Anonymous"`. + - Use a consistent placeholder ID, such as `"Speaker1"`, `"Speaker2"`, etc. + - The `name` field **MUST** be omitted. - **Consistency**: - Maintain consistent IDs for anonymous speakers throughout the transcript to differentiate between different speakers. - If speaker diarization is uncertain, it is acceptable to assign the same `speaker_id` to multiple segments where the speaker is believed to be the same. @@ -634,304 +994,832 @@ Defines the format and constraints for the `uri` field in the `metadata.source` ```json { - "id": "Speaker1", - "name": "Unknown" + "id": "Speaker1" } ``` -- **Multiple Anonymous Speakers**: +- **Mixing Known and Anonymous Speakers**: ```json "speakers": [ - { "id": "Speaker1", "name": "Unknown" }, - { "id": "Speaker2", "name": "Unknown" }, - { "id": "Speaker3", "name": "Unknown" } + { "id": "Speaker1", "name": "John Doe"}, + { "id": "Speaker2"}, + { "id": "Speaker3"} ] ``` #### Validation Rules - **ID Format Validation**: - - IDs MUST only contain allowed characters. - - IDs MUST meet the length constraints. + - IDs **MUST** only contain allowed characters. + - IDs **MUST** meet the length constraints. - **Uniqueness Validation**: - - IDs in the `speakers` list MUST be unique. - - Duplicate IDs MUST result in a validation error. 
+ - IDs in the `speakers` list **MUST** be unique. + - Duplicate IDs **MUST** result in a validation error. - **Reference Validation**: - - All `speaker_id` references in segments MUST match an `id` in the `speakers` list. - - Invalid references MUST result in a validation error. + - All `speaker_id` references in segments **MUST** match an `id` in the `speakers` list. + - Invalid references **MUST** result in a validation error. + +#### Examples of invalid speaker references + +- Invalid: **References non-existent speaker** + +```json +{ + "speakers": [ + {"id": "Speaker1"} + ], + "segments": [{ + "speaker_id": "Speaker2" + }] +} +``` + +- Invalid: **Invalid character in ID** + +```json +{ + "speakers": [ + {"id": "Speaker@1"}, + {"id": "Speaker1"} + ] +} +``` #### Implementation Notes -- **Applications** SHOULD provide meaningful error messages when validation fails due to speaker ID issues. -- **When Generating STJ Files**: +- Applications **SHOULD** provide meaningful error messages when validation fails due to speaker ID issues. +- When generating STJ files: - Ensure that speaker IDs conform to the specified format requirements. - Assign consistent IDs to anonymous speakers to maintain differentiation. ### Style IDs -If `style_id` is used, it must match an `id` in the `styles` list. +- If `style_id` is used in a segment, it **MUST** match an `id` in the `styles` list. +- Style IDs **MUST** adhere to the same format and uniqueness constraints as speaker IDs. ### Text Fields -`text` fields should be in plain text format. Special formatting or markup should be handled via the `styles` mechanism. +- `text` fields **SHOULD** be in plain text format. +- Special formatting or markup **SHOULD** be handled via the `styles` mechanism. +- Line breaks and whitespace within `text` fields **MUST** be preserved. ### Word Timing Mode Field -#### Purpose - -Indicates the completeness of word-level timing data within the segment. 
+The `word_timing_mode` field indicates how word-level timing data is handled: #### Allowed Values -- `"complete"`: All words in the segment have timing data -- `"partial"`: Only some words have timing data -- `"none"`: No word-level timing data is provided +- `"complete"`: + - **MUST** include a `words` array + - **MUST** have timing for every `word` in the segment + - Concatenated `words[].text` **MUST** match segment `text` when normalized for whitespace + - **MUST NOT** use for segments where word timing isn't applicable or fails + +- `"partial"`: + - **MUST** include a `words` array with at least one word + - Words in array **MUST** appear in same order as in segment text + +- `"none"`: + - **MUST NOT** include a `words` array + - Use for segments where: + - Word timing wasn't attempted + - Word timing isn't applicable (e.g., "[Music]", "[Applause]") + - Word timing was attempted but failed + +Empty `words` arrays are not allowed in any mode. For segments where: + +- Word timing was attempted but failed +- Word timing isn't applicable +- Word timing wasn't attempted +Use `word_timing_mode: "none"` and omit the words array entirely. #### Default Behavior -- When omitted and `words` array is present with complete coverage: treated as `"complete"` -- When omitted and `words` array is absent: treated as `"none"` -- When omitted and `words` array is present but incomplete: invalid - must explicitly specify `"partial"` +- When `word_timing_mode` is omitted and a `words` array is present with complete coverage: Treated as `"complete"` +- When `word_timing_mode` is omitted and no `words` array is present: Treated as `"none"` +- When `word_timing_mode` is omitted and `words` array is present but incomplete: Invalid—**MUST** explicitly specify `"partial"` -#### Constraints +Note: Empty `words` arrays are never allowed. Use `word_timing_mode: "none"` and omit the array entirely when word timing isn't applicable, fails, or wasn't attempted. 
-- For `"complete"`: All words must have timing data -- For `"partial"`: Some words must have timing data -- For `"none"`: Must not include `words` array +#### Word Object Requirements -### Extensions Field Requirements +When word timing information is included (modes "complete" or "partial"), the `words` array **MUST** be present and each word object **MUST** include: -#### Purpose +- `text` (string): The word text +- `start` (number): Start time in seconds +- `end` (number): End time in seconds +- `confidence` (number, optional): Confidence score for the word + +Time values MUST follow the Time Format Requirements defined in this specification. -The `extensions` field allows for the inclusion of custom, application-specific metadata and format-specific properties without affecting compatibility with other implementations. +Note: For segments where word timing is not applicable or fails, use `word_timing_mode: "none"` and omit the `words` array entirely. + +### Extensions Field Requirements + +The `extensions` field allows applications to include custom data without affecting core STJ functionality. #### Structure -- The `extensions` field, if present, **MUST** be a JSON object. -- Each key in `extensions` **MUST** represent a namespace and **MUST** be a non-empty string. -- The value corresponding to each namespace **MUST** be a JSON object containing key-value pairs specific to that namespace. - -#### Namespaces - -- **Namespace Naming:** - - Namespaces **SHOULD** be concise and reflect the application, format, or organization. - - Examples include `myapp`, `companyname`, `customformat`. 
-- **Reserved Namespaces:** - - The following namespaces are **RESERVED** for future use by the STJ specification and **MUST NOT** be used for custom data: - - `stj` (reserved for STJ specification extensions) - - `webvtt` (reserved for WebVTT format mappings) - - `ttml` (reserved for TTML format mappings) - - `ssa` (reserved for SSA/ASS format mappings) - - `srt` (reserved for SubRip format mappings) - - `dfxp` (reserved for DFXP/Timed Text format mappings) - - `smptett` (reserved for SMPTE-TT format mappings) -- **Developer Guidance:** - - Developers who need to include format-specific properties before official definitions are available may: - - Use a custom namespace that clearly indicates its provisional nature, such as `custom_webvtt` or `experimental_ttml`. - - Be prepared to migrate their data to the official namespace once the STJ specification provides the definitions. - -#### Constraints - -- Applications **MUST** ignore any namespaces in `extensions` that they do not recognize. -- The `extensions` field **SHOULD NOT** include essential data required for basic functionality. -- Nested objects and arrays **ARE ALLOWED** within each namespace. -- Keys within namespaces **MUST NOT** duplicate or conflict with standard fields of the containing object. -- **Reserved Namespaces Validation:** - - Namespaces listed as **RESERVED** in the specification **MUST NOT** be used by applications for custom data. - - Applications **MUST** report an error if a reserved namespace is used. 
+- The `extensions` field, if present, **MUST** be a JSON object +- Each key in `extensions` **MUST** represent a namespace and **MUST** be a non-empty string +- The value of each namespace **MUST** be a JSON object + +#### Processing Rules + +- Applications **MUST** ignore any namespaces they don't recognize +- Core STJ fields are authoritative for standard processing +- Extension data **MAY** provide supplementary information but **MUST NOT** override core field behavior + +#### Reserved Namespaces + +The following namespaces are **RESERVED** for future use by the STJ specification: + +- `stj*` (reserved for STJ specification extensions) +- `webvtt` (reserved for WebVTT format mappings) +- `ttml` (reserved for TTML format mappings) +- `ssa` (reserved for SSA/ASS format mappings) +- `srt` (reserved for SubRip format mappings) +- `dfxp` (reserved for DFXP/Timed Text format mappings) +- `smptett` (reserved for SMPTE-TT format mappings) + +Applications **MUST** report an error if a reserved namespace is used for custom data. + +#### Best Practices + +While not required, extension providers are encouraged to: + +- Document the purpose and usage of their extension fields +- Use clear, descriptive namespace names +- Be especially clear when extension fields relate to core STJ concepts #### Examples -- **In a `segment` object:** +Basic extension: - ```json - "extensions": { - "myapp": { - "custom_property": "value", - "analysis_data": { - "sentiment_score": 0.85, - "keywords": ["innovation", "technology"] - } - }, - "analytics": { - "emotion": "happy", - "confidence": 0.9 +```json +"extensions": { + "myapp": { + "custom_field": "value", + "analysis_data": { + "property": "value" } } - ``` +} +``` -- **In a `style` object with format-specific properties:** +Extension with format-specific properties: - ```json - { - "id": "caption_style", - "text": { - "color": "#FFFFFF", - "background": "#000000" - }, - "display": { - "align": "center", - "vertical": "bottom" 
- }, - "extensions": { - "custom_webvtt": { - "line": "auto", - "position": "50%", - "size": "100%" - }, - "myapp": { - "custom_style_property": "value" - } - } +```json +"extensions": { + "custom_webvtt": { + "line": "auto", + "position": "50%" } - ``` +} +``` -- **In a `metadata` object:** +## Implementation Requirements - ```json - "extensions": { - "project_info": { - "project": "International Conference", - "client": "Global Events Inc." - }, - "notes": { - "review_status": "approved", - "reviewer": "John Doe" - } - } - ``` +This section defines how implementations should process STJ files, including handling of optional fields, validation processing, and error reporting. It focuses on the practical aspects of implementing the specification. -**Note:** Standard fields defined in the STJ specification **MUST NOT** be duplicated within any namespace in `extensions`. For example, including a key `"start"` within a namespace is prohibited if it conflicts with the mandatory `"start"` field of the segment. +### Handling of Optional Fields -## Implementation Requirements +Implementations **MUST** support files that include only the mandatory elements: `stj.version`, and `transcript.segments` with `text` values. -### Time Value Processing +Implementations **SHOULD** gracefully handle the absence of optional fields and provide reasonable defaults or omit related functionalities. -Implementations MUST: +For example, if timing information is absent, applications may treat the transcription as untimed text. 
-- Parse time values with up to 3 decimal places -- Preserve the precision of input values up to 3 decimal places -- Round any input with more than 3 decimal places to 3 decimal places using IEEE 754 round-to-nearest-even -- Validate all time values according to the Time Format Requirements section +### Field-Specific Format Precedence -Implementations MUST reject files that contain any of the following: +When multiple format requirements apply to a field, specific requirements take precedence over general requirements. The precedence order is: -- Negative time values -- Values exceeding 999999.999 seconds -- Time values using scientific notation -- Overlapping segments +1. Field-specific requirements (e.g., [Time Format Requirements](#time-format-requirements) for time fields) +2. Type-specific requirements (e.g., general [Number Format Requirements](#number-format-requirements)) +3. Global format requirements + +Examples: + +- Time values may include leading zeros as specified in [Time Format Requirements](#time-format-requirements), despite the general prohibition in [Number Format Requirements](#number-format-requirements) +- Language codes must follow their specific format requirements regardless of general string formatting rules + +### Time Value Processing + +#### Processing Requirements + +Implementations **MUST**: + +1. **Input Validation**: + - Accept numeric values with any number of decimal places + - Accept time values with or without leading zeros + - Verify decimal separator is period (.) + - Check value is non-negative + - Check value is not in scientific notation + - Reject if the value exceeds the maximum range (even if it would round to a valid value) + - Example: reject `999999.9995` because it already exceeds the maximum allowed value of `999999.999` before rounding (it would also round to `1000000.000`, which is out of range) + +2. 
**Precision Processing**: + - Round values with more than 3 decimal places using IEEE 754 round-to-nearest-even + - Preserve original precision up to 3 decimal places + - Do not normalize to 3 decimal places + - Example: `1.5` remains `1.5`, not normalized to `1.500` + +3. **Output Requirements**: + - Store values with maximum 3 decimal places + - Preserve existing decimal places up to 3 + - Preserve leading zeros in time values when present + - Do not add or remove leading zeros when processing time values + - Include decimal point if original value had decimal places + - Do not add/remove trailing zeros + +#### Time Value Validation Severity + +- **ERROR Level** (Must reject file): + - Negative values + - Values exceeding range (before or after rounding) + - Scientific notation + - Non-numeric values + - Incorrect decimal separator + - Missing required time field when its pair is present + +- **INFO Level**: + - Rounding of values with more than 3 decimal places + - Preservation of existing precision (not normalizing to 3 decimal places) + +#### Error Handling for Time Values + +Implementations **MUST**: + +- **For Invalid Time Values (ERROR level)**: + - Report specific validation failure (e.g., "negative value", "exceeds range") + - Include the invalid value in error message + - Reject the entire STJ file + - Example message: "Error: Invalid time value -1.0 at segment[0].start (negative values not allowed)" + +- **For Rounded Time Values (INFO level)**: + - MAY report when rounding has occurred + - Include original and rounded values in message + - Example message: "Info: Time value 1.2345 rounded to 1.234 at segment[2].end" + +Implementations **SHOULD**: + +- Collect all time value errors before rejecting file +- Provide line/position information for errors when possible +- Include guidance in error messages about valid time formats + +**Note**: These time value requirements apply to all time fields in the STJ format, including segment times (`start`, `end`) and word-level 
timing data. For validation severity levels and error handling requirements, see the [Validation Requirements](#validation-requirements) section. + +### Error Handling + +Implementations **MUST**: + +- **For ERROR-level issues**: + - Report the issues to the user or calling process. + - **MUST NOT** proceed with processing the STJ file. + - **Example ERROR issues**: + - Overlapping segments. + - Unordered segments. + - Invalid references. + - Missing required fields. + - Malformed data. + +- **For WARNING-level issues**: + - Report the issues to the user or calling process. + - **MAY** proceed with processing, but should do so cautiously. + - **Example WARNING issues**: + - Use of deprecated fields. + - Non-standard language codes. + +- **For INFO-level issues**: + - Reporting is optional. + - Processing should proceed normally. + - **Example INFO issues**: + - Suggestions for metadata enhancements. + +Implementations **SHOULD** strive to provide meaningful feedback to users to improve the quality of STJ files. ## Validation Requirements +Implementations of the STJ format **MUST** perform validation that categorizes issues by severity levels. This section defines what must be validated, including validation rules and their associated severity levels. This approach ensures that: + +- Users are informed about the nature of any issues found in STJ files +- Appropriate actions can be taken based on severity +- Validation is consistent across implementations + +For details on how to implement these validation requirements, see the Implementation Requirements section. 
+ +### Severity Levels + +The STJ specification uses three severity levels to indicate the impact of validation issues: + +#### ERROR + +- Definition: Critical issues that make the file semantically invalid or could cause incorrect processing +- Result: File MUST be rejected +- Examples: + - Missing required fields + - Invalid field types or values + - Time value violations + - Overlapping segments + - Unordered segments + - Invalid references + - Malformed data + +#### WARNING + +- Definition: Issues that indicate potential problems but don't invalidate the file +- Result: Processing MAY continue with caution +- Examples: + - Use of deprecated fields + - Non-standard language codes + - Non-optimal patterns + - Unnecessary empty arrays/objects + +#### INFO + +- Definition: Suggestions for improvements or notifications of automatic adjustments +- Result: Processing continues normally +- Examples: + - Time value rounding occurred + - Metadata completeness suggestions + - Efficiency recommendations + - Style definition optimizations + +### Validation Sequence + +Implementations **SHOULD** perform validation in the following order: + +1. **Structure Validation**: + - JSON structure validity + - Root object requirements + - Required fields presence + - Array and object structure rules + +2. **Field Validation**: + - Data type requirements + - Value constraints + - Format requirements + +3. **Reference Validation**: + - Speaker ID references + - Style ID references + - Language code consistency + +4. **Content Validation**: + - Segment timing rules + - Word timing rules + - Text content requirements + +5. **Application-Specific Validation**: + - Implementation-specific requirements + - Custom extensions + +This sequence aligns with the guidelines provided in the [Validation Requirements](#validation-requirements) section. + +### Validation Categories and Rules + +This section provides an overview of all validation requirements organized by category. 
Detailed rules can be found in their referenced sections. + +#### Structure Validation + +##### Basic File Structure + +- JSON structure and encoding: See [Character Encoding Requirements](#character-encoding-requirements) +- Root object requirements: See [Root Structure](#root-structure) +- Additional properties restrictions: See [Root Structure](#root-structure) +- File extension requirements: See [Specification](#specification) + +##### Empty Value Rules + +- Null value restrictions: See [Empty Value Constraints](#empty-value-constraints) +- Empty array handling: See [Empty Array Rules](#empty-array-rules) +- Empty object handling: See [Empty Object Rules](#empty-object-rules) +- Empty string handling: See [Empty String Rules](#empty-string-rules) + +##### Array Structure + +- Array ordering requirements: See [Array Ordering Requirements](#array-ordering-requirements) +- Mandatory vs optional arrays: See [Empty Array Rules](#empty-array-rules) + +#### Field-Specific Validation + +##### Time Values + +- Format and range requirements: See [Time Format Requirements](#time-format-requirements) +- Precision and rounding rules: See [Time Format Requirements](#time-format-requirements) +- Basic constraints: See [Basic Constraints](#basic-constraints) under Time Format Requirements +- Zero-duration requirements: See [Basic Constraints](#basic-constraints) under Time Format Requirements + +##### Language Codes + +- Standard requirements: See [Language Codes > Standards](#standards) +- Consistency requirements: See [Language Codes > Consistency Requirements](#consistency-requirements) +- Application requirements: See [Language Codes > Application Requirements](#application-requirements) + +##### Speaker and Style IDs + +- Format specifications: See [Speaker IDs > Format Specifications](#format-specifications) +- Uniqueness requirements: See [Speaker IDs > Validation Rules](#validation-rules) +- Reference validation: See [Speaker IDs > Examples of invalid speaker 
references](#examples-of-invalid-speaker-references) +- Style ID requirements: See [Style IDs](#style-ids) + +##### URI Validation + +- Format specifications: See [URI Format Requirements](#uri-format-requirements) +- Scheme support: See [URI Format Requirements > Format Specifications](#format-specifications-1) +- Security considerations: See [URI Format Requirements > Security Considerations](#security-considerations) + +##### Metadata Validation + +- Field requirements: See [Metadata Section > Fields](#fields) +- Language specifications: See [Metadata Section > Clarification on languages Fields](#clarification-on-languages-fields) + +#### Content Validation + +##### Segment Validation + +- Required fields: See [Segment-Level Validation > Required Fields](#required-fields) +- Time field requirements: See [Segment-Level Validation > Time Fields](#time-fields) +- Reference validation: See [Segment-Level Validation > References](#references) +- Ordering requirements: See [Segment-Level Validation > Segment Ordering](#segment-ordering) +- Overlap restrictions: See [Segment-Level Validation > Segment Overlap](#segment-overlap) +- Zero-duration rules: See [Segment-Level Validation > Zero-Duration Segments](#zero-duration-segments) + +##### Word Level Validation + +- Required field validation: See [Word-Level Validation > Required Field Validation](#required-field-validation) +- Timing validation: See [Word-Level Validation > Timing Validation](#timing-validation) +- Mode-specific validation: See [Word-Level Validation > Mode-Specific Validation](#mode-specific-validation) +- Text alignment requirements: See [Word Text Alignment > Requirements](#requirements) + +##### Extensions Validation + +- Structure requirements: See [Extensions Field Requirements > Structure](#structure) +- Reserved namespace protection: See [Extensions Field Requirements > Reserved Namespaces](#reserved-namespaces) +- Processing rules: See [Extensions Field Requirements > Processing 
Rules](#processing-rules) + +### Error Reporting Requirements + +Implementations: + +- **MUST** provide clear error messages when **ERROR** level issues are detected. +- **MUST** include the JSON path to the problematic field in error messages. +- **MUST NOT** process the STJ file further if **ERROR** level issues are present. +- **SHOULD** report **WARNING** and **INFO** level issues to guide users. +- **MUST** report multiple validation issues when possible, rather than stopping at the first error. + +### Response Format + +Implementations **SHOULD** output validation results in a structured format, such as JSON, to facilitate automated processing. + +**Example Response Format:** + +```json +{ + "valid": false, + "issues": [ + { + "severity": "ERROR", + "path": "transcript.segments[0].start", + "code": "INVALID_TIME_FORMAT", + "message": "Segment start time must be a non-negative number.", + "specRef": "#time-format-requirements", + "suggestion": "Ensure 'start' is a non-negative decimal number." + }, + { + "severity": "WARNING", + "path": "transcript.segments[1]", + "code": "OVERLAPPING_SEGMENTS", + "message": "Segments should not overlap in time.", + "specRef": "#segment-overlap", + "suggestion": "Adjust segment timings to prevent overlap." + }, + { + "severity": "INFO", + "path": "metadata", + "code": "MISSING_METADATA", + "message": "Including metadata can enhance the usefulness of the STJ file.", + "specRef": "#metadata-section", + "suggestion": "Consider adding a 'metadata' section." + } + ] +} +``` + ### Segment-Level Validation - **Required Fields**: - - `start` and `end` times MUST conform to the Time Format Requirements section - - `text` MUST be present and non-empty + - `text` **MUST** be present and non-empty. + - **Severity if violated:** ERROR +- **Time Fields**: + - `start` and `end` times, if present, **MUST** conform to the [Time Format Requirements](#time-format-requirements) section.
+ - **Severity if violated:** ERROR + - If `start` equals `end`, `is_zero_duration` **MUST** be included and set to `true`. + - **Severity if violated:** ERROR + - **References**: - - `speaker_id`, if present, MUST match an `id` in the `speakers` list - - `style_id`, if present, MUST match an `id` in the `styles` list + - `speaker_id`, if present, **MUST** match an `id` in the `speakers` list. + - **Severity if violated:** ERROR + - `style_id`, if present, **MUST** match an `id` in the `styles` list. + - **Severity if violated:** ERROR + - **Segment Ordering**: - - Segments MUST be ordered by their `start` times in ascending order - - For segments with identical start times, they MUST be ordered by their end times in ascending order + - Segments **MUST** be ordered by their `start` times in ascending order. + - **Severity if violated:** ERROR + - **Rationale**: Unordered segments can disrupt processing logic and lead to incorrect media synchronization. + - For segments with identical start times, they **MUST** be ordered by their end times in ascending order. + - **Severity if violated:** ERROR + - **Rationale**: Consistent ordering is essential for predictable processing and display. + - For segments with identical start and end times, the original array order **MUST** be preserved. + - **Severity if violated:** ERROR + - **Rationale**: Maintaining original order ensures stable sorting and preserves intended sequence of simultaneous events. + - **Segment Overlap**: - - Segments MUST NOT overlap in time - - For any two segments S1 and S2 where S1 appears before S2 in the segments array: - - S1.end MUST be less than or equal to S2.start - - Examples of valid segment ordering: - - Adjacent segments: S1(0.0, 1.0), S2(1.0, 2.0) - - Segments with gap: S1(0.0, 1.0), S2(2.0, 3.0) - - Examples of invalid segment ordering: - - Overlapping segments: S1(0.0, 2.0), S2(1.0, 3.0) - - Out of order segments: S1(1.0, 2.0), S2(0.0, 3.0) + - Segments **MUST NOT** overlap in time.
+ - **Severity if violated:** ERROR + - **Rationale**: Overlapping segments create ambiguity about which text applies at what time and can cause rendering issues. + - **Error Recovery Guidelines**: + - While overlapping segments make an STJ file invalid, applications processing potentially invalid files **SHOULD** implement error recovery strategies rather than fail completely. + - Recovery strategies **MAY** include: + - Merging overlapping segments + - Adjusting segment timings to eliminate overlaps + - Alerting users to review and correct the overlaps + - Applications implementing recovery strategies **MUST** still report the overlap as an ERROR during validation. + - **Zero-Duration Segments**: - - MUST follow the zero-duration requirements defined in the Time Format Requirements section - - Zero-duration segments MAY share the same timestamp + - **MUST** follow the zero-duration requirements defined in the [Time Format Requirements](#time-format-requirements) section. + - **Severity if violated:** ERROR + - The presence of `is_zero_duration` when `start` does not equal `end` **MUST** result in an ERROR + +- **Timing Consistency**: + - If any segment in a transcript includes timing information (`start` and `end`), all segments in that transcript MUST include timing information. + - **Severity if violated:** ERROR + - **Rationale**: Mixed timed/untimed segments create ambiguity in processing and display. + +#### Overlapping Segments Examples + +**Example of Non-Compliant Overlapping Segments:** + +```json +{ + "segments": [ + { + "start": 5.0, + "end": 10.0, + "text": "First segment" + }, + { + "start": 8.0, + "end": 12.0, + "text": "Second segment" + } + ] +} +``` + +*Explanation*: The second segment starts at 8.0 seconds, which is before the end of the first segment at 10.0 seconds. This creates an overlap between 8.0 and 10.0 seconds, violating the requirement that segments **MUST NOT** overlap. 
### Word-Level Validation -- **When `words` array is present**: - - Each word object must have `text`, `start`, and `end` - - All time values must conform to the Time Format Requirements section - - Word timing constraints: - - Word times must be within the parent segment's time range - - Words must be ordered by `start` time - - Word timings must not overlap -- **Word Timing Mode Requirements**: - - When `"complete"` (or omitted with complete coverage): - - The concatenation of all `text` fields in `words` must match the segment's `text`, except for differences in whitespace or punctuation - - When `"partial"`: - - The `text` fields in `words` must be a subset of the words in the segment's `text`, in the same order - - When `"none"` (or omitted without `words` array): - - Must not contain `words` array -- **Zero-Duration Words**: - - Must follow the zero-duration requirements defined in the Time Format Requirements section +#### Required Field Validation + +When the `words` array is present: + +- Each word object **MUST** have: + - `text` (string, non-empty) + - `start` (number) + - `end` (number) + - **Severity if violated:** ERROR + +#### Timing Validation + +- Word times MUST be within the parent segment's time range + - **Severity if violated:** ERROR +- Words MUST be ordered by `start` time + - **Severity if violated:** ERROR +- Word timings SHOULD NOT overlap + - **Severity if violated:** WARNING + +#### Mode-Specific Validation + +##### Complete Mode (`word_timing_mode: "complete"`) + +The `words` array: + +- **MUST** be present and non-empty +- **MUST** have concatenated `words[].text` match segment `text` when normalized for whitespace +- **MUST** have timing data for each word +- **Severity if violated:** ERROR + +##### Partial Mode (`word_timing_mode: "partial"`) + +The `words` array: + +- **MUST** be present and contain at least one word +- **MUST** have each `words[].text` match a substring in segment `text` +- **MUST** have words appear in the same 
order as in segment `text` +- **Severity if violated:** ERROR + +##### None Mode (`word_timing_mode: "none"`) + +The `words` array: + +- **MUST NOT** be present (array must be completely omitted, not included as empty) +- Use this mode for segments where: + - Word timing wasn't attempted + - Word timing isn't applicable + - Word timing was attempted but failed +- **Severity if violated:** ERROR + +#### Example of Failed Word Timing + +```json +{ + "start": 15.0, + "end": 20.0, + "text": "Background noise made word timing impossible", + "word_timing_mode": "none" + // Note: words array is entirely omitted, not included as empty +} +``` + +### Word Text Alignment + +#### Requirements + +1. **Word Order** + - Words in the `words` array MUST appear in the same order as they do in the segment's `text` field. + - The text of each word in `words[].text` MUST match its corresponding occurrence in the segment's `text` field. + +2. **Text Matching** + - Implementations MUST preserve the exact text content of words, including: + - Case sensitivity + - Punctuation + - Special characters + - Whitespace within word boundaries (if any) + +3. **Tokenization** + - For `word_timing_mode: "complete"`: + - The concatenated `words[].text` MUST match the segment's `text` when normalized for inter-word whitespace. + - For `word_timing_mode: "partial"`: + - Each `words[].text` MUST match a corresponding substring in the segment's `text`. + - Words MUST be tokenized consistently within a segment. + +#### Examples + +1. **Complete Word Timing**: + +```json +{ + "text": "Hello, world!", + "word_timing_mode": "complete", + "words": [ + {"text": "Hello,", "start": 0.0, "end": 0.5}, + {"text": "world!", "start": 0.6, "end": 1.0} + ] +} +``` + +2. **Partial Word Timing:** + +```json +{ + "text": "Hello, wonderful world!", + "word_timing_mode": "partial", + "words": [ + {"text": "Hello,", "start": 0.0, "end": 0.5}, + {"text": "world!", "start": 1.0, "end": 1.5} + ] +} +``` + +3. 
**Complex Punctuation Example:** + +```json +{ + "text": "\"Don't,\" she said, \"go there!\"", + "word_timing_mode": "partial", + "words": [ + {"text": "\"Don't,\"", "start": 0.0, "end": 0.5}, + {"text": "there!\"", "start": 1.0, "end": 1.5} + ] +} +``` + +### Word Timing Implementation Notes + +#### Tokenization Recommendations + +1. **Basic Tokenization** + - Split on whitespace as a baseline approach + - Preserve punctuation attached to words + - Keep contractions as single tokens + - Maintain quotation marks with their associated words + +2. **Edge Cases** + - Multi-word expressions (e.g., "New York") should be treated as single tokens if timed as one unit + - Hyphenated words should be kept as single tokens + - Numbers, dates, and times should be treated as single tokens + +#### Text Alignment Strategies + +1. **For Complete Mode**: + - Validate that all words are present + - Compare normalized text (removing extra whitespace) to detect missing or extra words + - Report specific mismatches to aid debugging + +2. **For Partial Mode**: + - Use string matching to verify word presence and order + - Consider implementing fuzzy matching for robustness + - Cache tokenization results for efficiency + +#### Performance Considerations + +- Consider caching tokenization results +- Use efficient string matching algorithms for validation +- Implement incremental validation for large documents ### General Validation - **URI Validation Requirements**: - - The `uri` field in `metadata.source` MUST conform to the **URI Format Requirements** specified in the **Field Definitions and Constraints** section. - - Implementations SHOULD validate the URI format according to RFC 3986. - - Invalid URIs SHOULD result in a validation error or warning. - - **Scheme Support**: - - Implementations MUST support `http` and `https` schemes. - - Support for other schemes is OPTIONAL. 
+ - The `uri` field in `metadata.source` **MUST** conform to the [URI Format Requirements](#uri-format-requirements). + - **Severity if violated:** ERROR + - Implementations **SHOULD** validate the URI format according to RFC 3986. + - **Invalid URIs** **SHOULD** result in a **WARNING**. - **Relative URIs**: - - Relative URIs SHOULD NOT be used. - - If present, they MUST be resolved relative to a known base URI by the consuming application. + - Relative URIs **SHOULD NOT** be used. + - **Severity if violated:** WARNING - **Language Code Requirements**: - - All language codes must be valid ISO 639 codes - - Language codes must be consistent with the requirements defined in the Language Codes section + - All language codes **MUST** be valid ISO 639 codes. + - **Severity if violated:** ERROR + - Language codes **MUST** use ISO 639-1 codes when available. + - **Severity if violated:** ERROR + - Language codes **MUST** be consistent throughout the file, using ISO 639-1 where available and ISO 639-3 only for languages without ISO 639-1 codes. + - **Severity if violated:** ERROR - **Confidence Score Requirements**: - - Confidence scores, if present, must be within the range [0.0, 1.0] + - Confidence scores, if present, **MUST** be within the range [0.0, 1.0]. + - **Severity if violated:** ERROR - **Reference Requirements**: - **Speaker IDs**: - - All `speaker_id` references in segments MUST correspond to valid `id` entries in the `speakers` list. - - All `id` values in the `speakers` list MUST conform to the **Speaker ID Requirements** specified in the **Field Definitions and Constraints** section. - - All IDs in the `speakers` array MUST be unique. - - IDs MUST only contain allowed characters and meet length constraints. + - All `speaker_id` references in segments **MUST** correspond to valid `id` entries in the `speakers` list. + - **Severity if violated:** ERROR + - All IDs in the `speakers` array **MUST** be unique. 
+ - **Severity if violated:** ERROR - **Style IDs**: - - All `style_id` references must correspond to valid entries in the `styles` list. - - All IDs in the `styles` array MUST be unique. + - All `style_id` references **MUST** correspond to valid entries in the `styles` list. + - **Severity if violated:** ERROR + - All IDs in the `styles` array **MUST** be unique. + - **Severity if violated:** ERROR - **Character Encoding Requirements**: - - All text content must be valid UTF-8 - - All JSON string values must follow RFC 8259 encoding rules - - Control characters must be properly escaped - - Surrogate pairs must be properly formed - - BOM must be handled correctly if present + - All text content **MUST** be valid UTF-8. + - **Severity if violated:** ERROR + - Control characters **MUST** be properly escaped. + - **Severity if violated:** ERROR + +- **Time Value Requirements**: + - All time values **MUST** conform to the [Time Format Requirements](#time-format-requirements) and Processing Requirements. + - **Severity if violated:** ERROR + - Input validation requirements **MUST** be checked before rounding. + - **Severity if violated:** ERROR + - Leading zeros in time values **MUST** be preserved if present. + - **Severity if violated:** ERROR + - Rounding of time values with more than 3 decimal places **MUST** be reported. + - **Severity level:** INFO + - Precision preservation requirements **MUST** be followed. + - **Severity level:** INFO ### Extensions Field Validation - **Structure Validation:** - - The `extensions` field, if present, **MUST** be a JSON object. - - Namespaces **MUST** be strings and **MUST NOT** be empty. - - Values corresponding to namespaces **MUST** be JSON objects. 
+ - The `extensions` field, if present, **MUST** be a JSON object + - Namespaces **MUST** be strings and **MUST NOT** be empty + - Values corresponding to namespaces **MUST** be JSON objects - **Reserved Namespaces Validation:** - - Namespaces listed as **RESERVED** in the specification **MUST NOT** be used by applications for custom data. - - Applications **MUST** report an error if a reserved namespace is used. + - Namespaces listed as **RESERVED** **MUST NOT** be used by applications for custom data + - Applications **MUST** report an error if a reserved namespace is used - **Content Validation:** - - Applications **MUST** ignore any namespaces or keys within `extensions` that they do not recognize. - - Values within namespaces **MAY** be validated based on application-specific requirements. + - Applications **MUST** ignore any namespaces they don't recognize + - Values within namespaces **MAY** be validated based on application-specific requirements -- **Conflict Resolution:** - - If a key within a namespace in `extensions` conflicts with a standard field, the standard field's value **MUST** take precedence. - - Applications **MUST** report an error if a conflict is detected. 
+- **Core Field Priority:** + - When processing STJ files, applications **MUST** use core field values for standard functionality + - Extension data **MUST NOT** override or alter the behavior of core fields ### Style Processing Implementations: -- MAY support none, some, or all style properties -- MUST ignore style properties they don't support -- MUST document which style properties they support -- SHOULD provide reasonable fallback behavior for unsupported properties +- **MAY** support none, some, or all style properties +- **MUST** ignore style properties they don't support +- **MUST** document which style properties they support +- **SHOULD** provide reasonable fallback behavior for unsupported properties When converting STJ to other formats, implementations: @@ -991,7 +1879,7 @@ The STJ format is designed to be easily parsed and utilized by a variety of appl ## Extensibility and Customization - **Additional Metadata**: Use the `extensions` fields in both `metadata` and individual objects to include custom data without affecting compatibility. -- **Versioning**: Include a `version` field in `metadata` if needed for future format updates. +- **Versioning**: The `version` field in the root `"stj"` object indicates the specification version. Applications **SHOULD** check this field to ensure compatibility. - **Custom Fields**: Applications can add custom data within appropriately named namespaces in the `extensions` field to include application-specific data without affecting compatibility. ## Adherence to Best Practices @@ -1006,7 +1894,9 @@ The STJ format follows best practices for data interchange formats, drawing insp ## Final Remarks -The STJ format aims to be a comprehensive and flexible standard for transcription data representation. 
By incorporating features from existing formats and adhering to best practices, it strives to meet the needs of a wide range of applications and facilitate better interoperability in the field of speech transcription and subtitles. +The STJ format is designed as a comprehensive and adaptable standard for transcription data representation. It establishes minimal mandatory requirements, allowing for straightforward implementations in basic scenarios while offering rich optional features for more complex applications. + +By integrating elements from existing standards and following best practices, STJ aims to accommodate a wide range of use cases, promoting greater interoperability in speech transcription and subtitle applications. This flexible approach maximizes adoption potential and ensures future extensibility without compromising compatibility, providing a robust framework for diverse needs in the transcription and subtitle domain. --- diff --git a/spec/schema/CHANGELOG.md b/spec/schema/CHANGELOG.md index e69de29..a3a8ac7 100644 --- a/spec/schema/CHANGELOG.md +++ b/spec/schema/CHANGELOG.md @@ -0,0 +1,37 @@ +# CHANGELOG + +## v0.6.0 + +### Added + +- Introduced the `stj` top-level property, which encapsulates the entire schema. +- Implemented stricter validation for the `metadata` and `transcript` sections: + - Enforced `minProperties` in objects for better compliance with the schema. + - Added `minItems` validation for arrays (e.g., `languages` now requires at least one language). + - Ensured stricter formatting rules for properties like `color`, `background`, `size`, and `position` in the `styles` section. +- Added `allOf` conditionals to validate that segments and words must have both `start` and `end` times when either is present. +- Added new constraints to the `extensions` field, enforcing `minProperties` and forbidding standard subtitle formats (e.g., `stj`, `webvtt`, etc.) to avoid conflicts. 
+ +### Changed + +- Updated `version` format validation to enforce semantic versioning patterns (e.g., `0.6.0`). +- Moved most properties inside the `stj` object, streamlining the schema structure. +- Enhanced the `confidence_threshold` validation to ensure it is between 0.0 and 1.0. +- Refined the `is_zero_duration` property across segments and words for additional precision handling. + +### Removed + +- `confidence` is no longer required in every segment or word, offering flexibility in cases where confidence data is unavailable. +- The `word_timing_mode` options were reduced, making this field optional with stricter values (`complete`, `partial`, `none`). +- The `additionalProperties: false` constraint is now applied more widely across objects, ensuring strict validation of properties. + +--- + +## v0.5.0 + +### Initial Version + +- Defined the `metadata` and `transcript` sections, with required fields such as `created_at`, `version`, `transcriber`, and `segments`. +- Supported basic styling (`styles`) and segmentation (`segments`, `words`) with time precision. +- Allowed extensibility with the `extensions` property across several sections. +- Enforced semantic versioning patterns and `multipleOf` constraints on time-related properties.
diff --git a/spec/schema/latest/stj-schema.json b/spec/schema/latest/stj-schema.json new file mode 100644 index 0000000..aaf6064 --- /dev/null +++ b/spec/schema/latest/stj-schema.json @@ -0,0 +1,458 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Standard Transcription JSON Schema", + "type": "object", + "required": [ + "stj" + ], + "additionalProperties": false, + "properties": { + "stj": { + "type": "object", + "required": [ + "version", + "transcript" + ], + "additionalProperties": false, + "properties": { + "version": { + "type": "string", + "pattern": "^\\d+\\.\\d+\\.\\d+$", + "description": "Specification version (e.g., '0.6.0')" + }, + "metadata": { + "type": "object", + "additionalProperties": false, + "properties": { + "transcriber": { + "type": "object", + "additionalProperties": false, + "properties": { + "name": { + "type": "string" + }, + "version": { + "type": "string" + } + } + }, + "created_at": { + "type": "string", + "format": "date-time" + }, + "source": { + "type": "object", + "additionalProperties": false, + "properties": { + "uri": { + "type": "string", + "format": "uri" + }, + "duration": { + "type": "number", + "minimum": 0.0, + "maximum": 999999.999 + }, + "languages": { + "type": "array", + "minItems": 1, + "items": { + "type": "string" + } + } + } + }, + "languages": { + "type": "array", + "minItems": 1, + "items": { + "type": "string" + } + }, + "confidence_threshold": { + "type": "number", + "minimum": 0.0, + "maximum": 1.0 + }, + "extensions": { + "type": "object", + "minProperties": 1, + "patternProperties": { + "^(?!stj$|webvtt$|ttml$|ssa$|srt$|dfxp$|smptett$)[A-Za-z0-9_-]+$": { + "type": "object", + "minProperties": 1 + } + }, + "additionalProperties": false + } + } + }, + "transcript": { + "type": "object", + "required": [ + "segments" + ], + "additionalProperties": false, + "properties": { + "speakers": { + "type": "array", + "items": { + "type": "object", + "required": [ + "id" + ], + 
"additionalProperties": false, + "properties": { + "id": { + "type": "string", + "pattern": "^[A-Za-z0-9_-]{1,64}$", + "description": "Unique identifier between 1 and 64 characters." + }, + "name": { + "type": "string" + }, + "extensions": { + "type": "object", + "minProperties": 1, + "patternProperties": { + "^(?!stj$|webvtt$|ttml$|ssa$|srt$|dfxp$|smptett$)[A-Za-z0-9_-]+$": { + "type": "object", + "minProperties": 1 + } + }, + "additionalProperties": false + } + } + } + }, + "styles": { + "type": "array", + "items": { + "type": "object", + "required": [ + "id" + ], + "additionalProperties": false, + "properties": { + "id": { + "type": "string", + "minLength": 1 + }, + "text": { + "type": "object", + "additionalProperties": false, + "properties": { + "color": { + "type": "string", + "pattern": "^#([0-9A-Fa-f]{6})$" + }, + "background": { + "type": "string", + "pattern": "^#([0-9A-Fa-f]{6})$" + }, + "bold": { + "type": "boolean" + }, + "italic": { + "type": "boolean" + }, + "underline": { + "type": "boolean" + }, + "size": { + "type": "string", + "pattern": "^\\d+%$" + } + } + }, + "display": { + "type": "object", + "additionalProperties": false, + "properties": { + "align": { + "type": "string", + "enum": [ + "left", + "center", + "right" + ] + }, + "vertical": { + "type": "string", + "enum": [ + "top", + "middle", + "bottom" + ] + }, + "position": { + "type": "object", + "additionalProperties": false, + "properties": { + "x": { + "type": "string", + "pattern": "^\\d+%$" + }, + "y": { + "type": "string", + "pattern": "^\\d+%$" + } + } + } + } + }, + "extensions": { + "type": "object", + "minProperties": 1, + "patternProperties": { + "^(?!stj$|webvtt$|ttml$|ssa$|srt$|dfxp$|smptett$)[A-Za-z0-9_-]+$": { + "type": "object", + "minProperties": 1 + } + }, + "additionalProperties": false + } + } + } + }, + "segments": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "required": [ + "text" + ], + "additionalProperties": false, + "properties": { + 
"start": { + "type": "number", + "minimum": 0.0, + "maximum": 999999.999 + }, + "end": { + "type": "number", + "minimum": 0.0, + "maximum": 999999.999 + }, + "is_zero_duration": { + "type": "boolean", + "description": "Indicates if the segment has zero duration." + }, + "text": { + "type": "string", + "minLength": 1 + }, + "speaker_id": { + "type": "string", + "pattern": "^[A-Za-z0-9_-]{1,64}$" + }, + "confidence": { + "type": "number", + "minimum": 0.0, + "maximum": 1.0 + }, + "language": { + "type": "string" + }, + "style_id": { + "type": "string" + }, + "word_timing_mode": { + "type": "string", + "enum": [ + "complete", + "partial", + "none" + ] + }, + "words": { + "type": "array", + "items": { + "type": "object", + "required": [ + "start", + "end", + "text" + ], + "additionalProperties": false, + "properties": { + "start": { + "type": "number", + "minimum": 0.0, + "maximum": 999999.999 + }, + "end": { + "type": "number", + "minimum": 0.0, + "maximum": 999999.999 + }, + "is_zero_duration": { + "type": "boolean", + "description": "Indicates if the word has zero duration." 
+ }, + "text": { + "type": "string", + "minLength": 1 + }, + "confidence": { + "type": "number", + "minimum": 0.0, + "maximum": 1.0 + }, + "extensions": { + "type": "object", + "minProperties": 1, + "patternProperties": { + "^(?!stj$|webvtt$|ttml$|ssa$|srt$|dfxp$|smptett$)[A-Za-z0-9_-]+$": { + "type": "object", + "minProperties": 1 + } + }, + "additionalProperties": false + } + } + } + }, + "extensions": { + "type": "object", + "minProperties": 1, + "patternProperties": { + "^(?!stj$|webvtt$|ttml$|ssa$|srt$|dfxp$|smptett$)[A-Za-z0-9_-]+$": { + "type": "object", + "minProperties": 1 + } + }, + "additionalProperties": false + } + }, + "allOf": [ + { + "if": { + "required": [ + "start" + ] + }, + "then": { + "required": [ + "end" + ] + } + }, + { + "if": { + "required": [ + "end" + ] + }, + "then": { + "required": [ + "start" + ] + } + }, + { + "if": { + "properties": { + "start": { + "type": "number" + }, + "end": { + "type": "number" + } + } + }, + "then": { + "properties": { + "is_zero_duration": { + "type": "boolean" + } + } + } + }, + { + "if": { + "properties": { + "word_timing_mode": { + "const": "complete" + } + }, + "required": [ + "word_timing_mode" + ] + }, + "then": { + "required": [ + "words" + ], + "properties": { + "words": { + "type": "array", + "minItems": 1 + } + } + } + }, + { + "if": { + "properties": { + "word_timing_mode": { + "const": "partial" + } + }, + "required": [ + "word_timing_mode" + ] + }, + "then": { + "required": [ + "words" + ], + "properties": { + "words": { + "type": "array", + "minItems": 1 + } + } + } + }, + { + "if": { + "properties": { + "word_timing_mode": { + "const": "none" + } + } + }, + "then": { + "not": { + "required": [ + "words" + ] + } + } + }, + { + "if": { + "required": [ + "words" + ] + }, + "then": { + "properties": { + "words": { + "type": "array", + "minItems": 1 + } + } + } + } + ] + } + } + } + } + } + } + } +} \ No newline at end of file diff --git a/spec/schema/v0.6.0/stj-schema.json 
b/spec/schema/v0.6.0/stj-schema.json index bd7646e..aaf6064 100644 --- a/spec/schema/v0.6.0/stj-schema.json +++ b/spec/schema/v0.6.0/stj-schema.json @@ -376,7 +376,10 @@ "word_timing_mode": { "const": "complete" } - } + }, + "required": [ + "word_timing_mode" + ] }, "then": { "required": [ @@ -396,7 +399,10 @@ "word_timing_mode": { "const": "partial" } - } + }, + "required": [ + "word_timing_mode" + ] }, "then": { "required": [ diff --git a/spec/v0.6.0/stj-specification.md b/spec/v0.6.0/stj-specification.md index 494fce4..32c90b0 100644 --- a/spec/v0.6.0/stj-specification.md +++ b/spec/v0.6.0/stj-specification.md @@ -9,6 +9,18 @@ The **Standard Transcription JSON (STJ)** format is a proposed standard for repr The STJ format includes detailed transcription segments with associated metadata such as speaker information, timestamps, confidence scores, language codes, and styling options. It also allows for optional metadata about the transcription process, source input, and the transcriber application. +## RFC 2119 Key Words + +This document uses requirement level keywords as defined in [RFC 2119](https://www.ietf.org/rfc/rfc2119.txt): + +- **MUST**, **REQUIRED**, **SHALL**: The requirement is absolute. +- **MUST NOT**, **SHALL NOT**: The behavior/feature is absolutely prohibited. +- **SHOULD**, **RECOMMENDED**: There may be valid reasons to ignore this requirement, but implications must be understood and carefully weighed. +- **SHOULD NOT**, **NOT RECOMMENDED**: There may be valid reasons to allow this behavior, but implications must be understood and carefully weighed. +- **MAY**, **OPTIONAL**: The item is truly optional. + +These keywords are presented in **UPPERCASE** throughout this document to indicate their special meanings. + ## Version History For a detailed list of changes between versions, please see the [CHANGELOG.md](../CHANGELOG.md) file. @@ -31,7 +43,7 @@ For a detailed list of changes between versions, please see the [CHANGELOG.md](. 
- **MIME Type**: `application/vnd.stj+json` - **Character Encoding**: UTF-8 -The STJ files must include a `version` field within the `stj` section to indicate the specification version they comply with. This facilitates compatibility and proper validation across different implementations. +The STJ files **MUST** include a `version` field within the `stj` section to indicate the specification version they comply with. This facilitates compatibility and proper validation across different implementations. ### MIME Type Registration @@ -69,6 +81,35 @@ The `"metadata"` field is optional and can be included to provide additional con No additional properties are allowed at the root level. +#### Examples of invalid root structures + +- Invalid: **Missing mandatory fields:** + +```json +{ + "stj": {} +} +``` + +- Invalid: **Missing transcript:** + +```json +{ + "stj": { + "version": "0.6.0" + } +} +``` + +- Invalid: **Missing stj root object:** + +```json +{ + "version": "0.6.0", // ERROR: Missing stj root object + "transcript": {} +} +``` + ### Mandatory vs. Optional Fields - **Mandatory Fields**: Essential for basic functionality and compatibility. @@ -77,9 +118,11 @@ No additional properties are allowed at the root level. - `transcript.segments[].text` - **Optional Fields**: Provide additional information and features but are not required for basic use: All other fields, including `metadata`, `start`, `end`, `speakers`, `styles`, `speaker_id`, `confidence`, `language`, `style_id`, `words`, `word_timing_mode`, etc. +**Note**: If any segment includes timing information, both `start` and `end` become mandatory for that segment and all other segments in the transcript. + ### Metadata Section -The `"metadata"` object is optional and can include fields providing context about the transcription. +The `"metadata"` object is **OPTIONAL** and **MAY** include fields providing context about the transcription. 
The metadata object MAY be empty to indicate metadata processing was attempted but found no properties. #### Fields @@ -131,7 +174,7 @@ The STJ format includes two `languages` fields within the `metadata` section to "name": "YAWT", "version": "0.4.0" }, - "created_at": "2023-10-20T12:00:00Z", + "created_at": "2024-10-20T12:00:00Z", "source": { "uri": "https://example.com/multilingual_media.mp4", "duration": 3600.5, @@ -156,8 +199,8 @@ The `"transcript"` object contains the transcription data, including speaker inf #### Fields -- **speakers** *(array, optional)*: List of speaker objects. -- **styles** *(array, optional)*: List of style definitions for formatting and positioning. +- **speakers** *(array, optional)*: List of speaker objects. May be empty to indicate speaker identification was attempted but no speakers were found. +- **styles** *(array, optional)*: List of style definitions for formatting and positioning. May be empty to indicate style processing was performed but no styles were defined. - **segments** *(array, mandatory)*: List of transcription segments. #### Speakers @@ -166,7 +209,7 @@ Each speaker object includes: - **id** *(string, mandatory)*: Unique identifier for the speaker. - MUST conform to the **Speaker ID Requirements** specified in the **Field Definitions and Constraints** section. -- **name** *(string, optional)*: Display name of the speaker. +- **name** *(string, optional)*: Display name of the speaker. May be empty to indicate an anonymous or unnamed speaker. - **extensions** *(object, optional)*: Any additional information about the speaker. ##### Example @@ -298,22 +341,24 @@ Style with format-specific features: Each segment object includes: -- **start** *(number, mandatory)*: Start time of the segment in seconds. -- **end** *(number, mandatory)*: End time of the segment in seconds. -- **is_zero_duration***(boolean, mandatory if `start` equals `end`)*: Indicates that the segment has zero duration. 
- - **MUST** be `true` if `start` equals `end`. - - **MUST NOT** be included if `start` does not equal `end`. +- **start** *(number, conditionally mandatory)*: Start time of the segment in seconds. If present, `end` **MUST** also be present. +- **end** *(number, conditionally mandatory)*: End time of the segment in seconds. If present, `start` **MUST** also be present. +- **is_zero_duration** *(boolean)*: Indicates that the segment has zero duration. + - **MUST** be present and set to `true` when `start` equals `end` + - **MUST NOT** be present when `start` does not equal `end` + - If present, **MUST** be `true` - **text** *(string, mandatory)*: Transcribed text of the segment. - **speaker_id** *(string, optional)*: The `id` of the speaker from the `speakers` list. - **confidence** *(number, optional)*: Confidence score for the segment (0.0 - 1.0). - **language** *(string, optional)*: Language code for the segment (ISO 639-1 or ISO 639-3). - **style_id** *(string, optional)*: The `id` of the style from the `styles` list. -- **words** *(array, optional)*: List of word-level details. +- **words** *(array, optional)*: List of word-level details. When present (in "complete" or "partial" modes), must contain at least one word. Must be omitted entirely (not included as empty) when using `word_timing_mode: "none"` for segments where word timing isn't applicable or fails. - **start** *(number, mandatory)*: Start time of the word in seconds. - **end** *(number, mandatory)*: End time of the word in seconds. - - **is_zero_duration***(boolean, optional)*: Indicates that the word has zero duration. - - **MUST** be `true` if `start` equals `end`. - - **MUST NOT** be included if `start` does not equal `end`. + - **is_zero_duration** *(boolean)*: Indicates that the word has zero duration.
+ - **MUST** be present and set to `true` when `start` equals `end` + - **MUST NOT** be present when `start` does not equal `end` + - If present, **MUST** be `true` - **text** *(string, mandatory)*: The word text. - **confidence** *(number, optional)*: Confidence score for the word (0.0 - 1.0). - **word_timing_mode** *(string, optional)*: Indicates the completeness of word-level timing data within the segment. @@ -322,53 +367,73 @@ Each segment object includes: ##### Example ```json -"segments": [ - { - "start": 0.0, - "end": 5.0, - "text": "Bonjour tout le monde.", - "speaker_id": "Speaker1", - "confidence": 0.95, - "language": "fr", - "style_id": "Style1", - "word_timing_mode": "complete", - "words": [ - { "start": 0.0, "end": 1.0, "text": "Bonjour" }, - { "start": 1.0, "end": 2.0, "text": "tout" }, - { "start": 2.0, "end": 3.0, "text": "le" }, - { "start": 3.0, "end": 4.0, "text": "monde." } - ] - }, - { - "start": 5.1, - "end": 10.0, - "text": "Gracias por estar aquí hoy.", - "speaker_id": "Speaker2", - "confidence": 0.93, - "language": "es", - "word_timing_mode": "partial", - "words": [ - { "start": 5.1, "end": 5.5, "text": "Gracias" } - ] - }, - { - "start": 10.1, - "end": 10.1, - "is_zero_duration": true, - "text": "[Applause]", - "speaker_id": "Speaker3", - "confidence": 0.92, - "language": "en", - "word_timing_mode": "none" +{ + "stj": { + "version": "0.6.0", + "transcript": { + "speakers": [ + {"id": "Speaker1", "name": "Speaker One"}, + {"id": "Speaker2", "name": "Speaker Two"}, + {"id": "Speaker3", "name": "Speaker Three"} + ], + "styles": [ + { + "id": "Style1", + "text": { + "color": "#FFFFFF", + "background": "#000000" + } + } + ], + "segments": [ + { + "start": 0.0, + "end": 5.0, + "text": "Bonjour tout le monde.", + "speaker_id": "Speaker1", + "confidence": 0.95, + "language": "fr", + "style_id": "Style1", + "word_timing_mode": "complete", + "words": [ + { "start": 0.0, "end": 1.0, "text": "Bonjour" }, + { "start": 1.0, "end": 2.0, "text": "tout" 
}, + { "start": 2.0, "end": 3.0, "text": "le" }, + { "start": 3.0, "end": 4.0, "text": "monde." } + ] + }, + { + "start": 5.1, + "end": 10.0, + "text": "Gracias por estar aquí hoy.", + "speaker_id": "Speaker2", + "confidence": 0.93, + "language": "es", + "word_timing_mode": "partial", + "words": [ + { "start": 5.1, "end": 5.5, "text": "Gracias" } + ] + }, + { + "start": 10.1, + "end": 10.1, + "is_zero_duration": true, + "text": "[Applause]", + "speaker_id": "Speaker3", + "confidence": 0.92, + "language": "en" + } + ] + } } -] +} ``` In this example: - The first segment has complete word-level data (`word_timing_mode`: `"complete"`). - The second segment has partial word-level data (`word_timing_mode`: `"partial"`). -- The third segment has no word-level data (`word_timing_mode`: `"none"` or omitted). +- The third segment is a zero-duration segment, which must not have word timing mode or words array. ### Handling Multiple Languages @@ -454,37 +519,204 @@ In this example: - `metadata` and all its subfields - `speakers`, `styles`, `speaker_id`, `confidence`, `language`, `style_id`, `words`, `word_timing_mode`, etc. +**Note**: When optional fields are present but empty (empty arrays, objects, or strings), this indicates the field was processed but no content was found. When optional fields are omitted entirely, this indicates the field was not processed or is not applicable. See the Empty Value Constraints section under Structural Requirements for details. + ## Field Definitions and Constraints This section outlines the requirements and constraints for various fields used within the STJ format. It includes structural requirements, data type specifications, and detailed constraints for specific fields. 
### Structural Requirements +### Default Behavior for Optional Fields + +By default, optional fields **SHOULD** be omitted entirely when: + +- The field is not applicable to the content +- The related feature or processing was not attempted +- There is no meaningful data to include + +#### Empty Array Rules + +- **Always Empty Allowed**: + - `speakers`: When speaker identification attempted but none found + - `styles`: When style processing performed but no styles defined + +- **Never Empty Allowed**: + - `segments`: Must contain at least one segment + - `languages`: If present, **MUST** contain at least one entry + - `words`: **MUST NOT** be empty in any word timing mode: + - In "complete" mode: Must contain all words with timing + - In "partial" mode: Must contain at least one word with timing + - In "none" mode: Array must be entirely omitted + - For segments where word timing fails or is not applicable: Use `word_timing_mode: "none"` and omit the array + +#### Empty Object Rules + +- **Always Empty Allowed**: + - `metadata`: When processing occurred but found no properties + - `extensions`: When processing occurred but found no valid extensions +- **Never Empty Allowed**: + - Required object fields + +#### Empty String Rules + +- **Always Empty Allowed**: + - `speaker.name`: For unnamed/anonymous speakers +- **Never Empty Allowed**: + - All other string fields + +When in doubt, omit optional fields entirely rather than including them as empty. + #### Empty Value Constraints - **Null Values**: - - Null values are **not allowed** for any field. - - Optional fields **MUST** be omitted entirely rather than set to null. + - Null values are **not allowed** for any field unless explicitly documented. + - Optional fields **MUST** be omitted entirely rather than set to null unless explicitly documented as allowing null. + - The `confidence` field **MAY** be null to indicate confidence scoring was attempted but failed. 
+ +##### Confidence Field Exception Details + +The `confidence` field is allowed to be null because it represents three distinct states that need to be distinguishable: + +1. **Field Omitted**: Confidence scoring was not attempted +2. **Null Value**: Confidence scoring was attempted but failed +3. **Numeric Value**: Confidence was successfully calculated (0.0 to 1.0) + +Example: + +```json +{ + "segments": [ + { + "text": "Hello world", + "confidence": null // Scoring attempted but failed + }, + { + "text": "Next segment" // No confidence scoring attempted + }, + { + "text": "Final segment", + "confidence": 0.95 // Successfully scored + } + ] +} +``` + +Applications processing STJ files should: + +- Treat a missing confidence field as "not attempted" +- Handle null confidence values as "attempted but failed" +- Process numeric confidence values normally #### Empty Arrays -- Empty arrays are **not allowed** for mandatory arrays (e.g., `segments`). -- Optional arrays (e.g., `speakers`, `styles`, `words`) **MUST** be omitted entirely rather than included as empty arrays. -- The `languages` array, if present, **MUST** contain at least one entry. -- **Words Array Constraints**: - - If `word_timing_mode` is `"none"`, the `words` array **MUST NOT** be included. - - If `word_timing_mode` is `"partial"`, the `words` array, if present, **MUST** contain at least one word object. +Optional arrays **MAY** be empty only in specific documented cases: + +- **Mandatory Arrays**: + - The `segments` array **MUST NOT** be empty. + - **Severity if violated:** ERROR + - The `languages` array, if present, **MUST** contain at least one entry.
+ - **Severity if violated:** ERROR + +- **Arrays That MAY Be Empty**: + - `speakers`: Empty array indicates speaker identification was attempted but no speakers were found + - `styles`: Empty array indicates style processing was performed but no styles were defined + +- **Special Case - Words Array**: + - The `words` array has specific rules: + - In "complete" mode: **MUST** contain all words with timing + - In "partial" mode: **MUST** contain at least one word + - In "none" mode: **MUST NOT** be present at all (array must be omitted entirely) + - Empty arrays are **NEVER** allowed in any mode + - **Severity if violated:** ERROR + - When word timing fails or isn't applicable: + - Use `word_timing_mode: "none"` + - Omit the `words` array entirely + - Do not include an empty array + +- **Default Behavior for Other Arrays**: + - Arrays **SHOULD** be omitted entirely rather than included as empty unless explicitly documented as allowing empty state + - Empty arrays in undocumented cases **SHOULD** result in a WARNING + +##### Examples + +Invalid cases: + +```json +{ + "segments": [], // Invalid: mandatory array must not be empty + "languages": [], // Invalid: if present, must contain at least one entry + "word_timing_mode": "complete", + "words": [] // Invalid: words array must not be empty when present +} +``` + +Guidance: Arrays **SHOULD** be omitted entirely (rather than included as empty) when: + +- The feature was not processed or is not applicable +- The presence of the array itself would be misleading #### Empty Objects -- Empty objects are **not allowed** for any required object fields. -- Optional object fields **MUST** be omitted entirely rather than included as empty objects. -- The `extensions` object, if present, **MUST** contain at least one namespace. +- Empty objects are **not allowed** for required object fields. 
+ - **Severity if violated:** ERROR +- The following optional objects **MAY** be empty with specific semantic meanings: + - `metadata`: Empty object indicates metadata processing occurred but found no properties + - `extensions`: Empty object indicates extension processing occurred but found no valid extensions +- Other optional objects **SHOULD** be omitted entirely rather than included as empty unless they represent an intentionally empty state that needs to be distinguished from "not processed" or "not applicable". + - **Severity if violated:** WARNING #### Empty Strings -- Empty strings are **not allowed** for any field except where explicitly permitted. -- Optional string fields **MUST** be omitted entirely rather than included as empty strings. +- Empty strings are **not allowed** for any field except where explicitly documented. +- The following string fields **MAY** be empty with specific semantic meanings: + - `speaker.name`: Empty string indicates an intentionally unnamed or anonymous speaker +- All other optional string fields **MUST** be omitted entirely rather than included as empty strings. + +#### Empty Value Validation Requirements + +Implementations **MUST** validate: + +1. **Mandatory Arrays** + - The `segments` array **MUST NOT** be empty + - **Severity**: ERROR + +2. **Optional Arrays** + - Empty arrays are allowed only for: + - `speakers` + - `styles` + - Other documented cases where empty state has semantic meaning + - **Severity**: WARNING for unexpected empty arrays + +3. **Objects** + - Required objects **MUST NOT** be empty + - Optional objects may be empty only for: + - `metadata` + - `extensions` + - Other documented cases + - **Severity**: WARNING for unexpected empty objects + +4. 
**Default Field Omission** + - Optional fields **SHOULD** be omitted rather than included empty + - **Severity**: INFO when fields could be omitted + +#### Handling Empty Arrays and Objects Examples + +**Invalid Example of an Empty Mandatory `segments` Array:** + +```json +{ + "segments": [] +} +``` + +Explanation: The `segments` array is mandatory and must contain at least one segment. An empty `segments` array is invalid. + +##### Example Validation Messages + +- ERROR: "segments array must not be empty" +- WARNING: "empty array found for field 'custom_data' - consider omitting the field entirely" +- INFO: "empty metadata object found - consider omitting if no metadata processing was performed" #### Array Ordering Requirements @@ -506,11 +738,13 @@ This section outlines the requirements and constraints for various fields used w - All numeric values **MUST** use JSON number format. - Scientific notation is **not allowed**. -- Leading zeros are **not allowed** except for decimal values less than 1 (e.g., `0.5`). +- Leading zeros are **not allowed** except for: + - Decimal values less than 1 (e.g., `0.5`) + - Time values, which follow the [Time Format Requirements](#time-format-requirements) specified in their dedicated section - The negative zero value (`-0`) is **not allowed**. - The values `Infinity`, `-Infinity`, and `NaN` are **not allowed**. -**Note:** Time values (e.g., `start`, `end`) have specific precision and format requirements as detailed in the [Time Format Requirements](#time-format-requirements) section. These requirements take precedence for time-related fields. +**Note:** For time-related fields (`start`, `end`), the [Time Format Requirements](#time-format-requirements) take precedence over these general number format requirements. See the [Time Format Requirements](#time-format-requirements) section for detailed specifications of time value formatting. 
### Time Format Requirements @@ -518,49 +752,73 @@ All time values in the STJ format (`start` and `end` fields) **MUST** follow the #### Format Specifications -- **Type**: Non-negative decimal numbers. -- **Precision**: Up to 3 decimal places (millisecond precision). -- **Range**: `[0.000, 999999.999]` seconds. -- **Significant Digits**: Must not exceed 6 digits before the decimal point. +- **Type**: Non-negative decimal numbers +- **Precision Requirements**: + - Input: Any number of decimal places allowed + - Processing: Values with more than 3 decimal places MUST be rounded to 3 decimal places using IEEE 754 round-to-nearest-even + - Storage: Maximum 3 decimal places (millisecond precision) +- **Range**: [0.000, 999999.999] seconds (after rounding). + - The maximum value 999999.999 is inclusive. Any value that would round to greater than 999999.999 MUST be rejected, even if the unrounded value is less than 999999.999 (e.g., 999999.9994 is valid as it rounds to 999999.999, but 999999.9995 **MUST** be rejected as it would round to 1000000.000). +- **Significant Digits**: Must not exceed 6 digits before the decimal point - **Formatting Rules**: - - Leading zeros before the decimal point are **allowed** but not required. - - Trailing zeros after the decimal point are **allowed** but not required. - - The decimal point **MUST** be present if there are decimal places. - - Scientific notation is **not allowed**. + - Leading zeros before the decimal point are allowed but not required + - Trailing zeros after the decimal point are allowed but not required + - The decimal point MUST be present if there are decimal places + - Scientific notation is not allowed + - Comma decimal separators are not allowed (**MUST** use period) #### Basic Constraints - For any segment or word: - - `start` **MUST NOT** be greater than `end`. - - If either `start` **or** `end` is present, the other **MUST** also be present, and both **MUST** be valid according to format specifications. 
-- For zero-duration items (`start` equals `end`): - **MUST** include `is_zero_duration`: `true`. + - `start` **MUST NOT** be greater than `end` (after rounding) + - If either `start` or `end` is present, the other **MUST** also be present + - Both values **MUST** be valid according to format specifications +- The `is_zero_duration` field: + - **MUST** be present and set to `true` when `start` equals `end` (after rounding) + - **MUST NOT** be present when `start` does not equal `end` (after rounding) + - If present, **MUST** be `true` - For segments: - - **MUST NOT** contain a `words` array. - - **MUST NOT** specify a `word_timing_mode`. - - - The `is_zero_duration` field: - - - **MUST** be `true` if and only if `start` equals `end`. - - **MUST NOT** be included when `start` does not equal `end`. - -#### Examples of Valid Time Values + - **MUST NOT** contain a `words` array + - **MUST NOT** specify a `word_timing_mode` +- Including `is_zero_duration` when `start` does not equal `end` **MUST** result in an ERROR during validation + +#### Examples of Time Values -- `0` (zero seconds) -- `0.0` (zero seconds) -- `0.000` (zero seconds with full precision) -- `1.5` (one and a half seconds) -- `10.100` (ten seconds and one hundred milliseconds) -- `999999.999` (maximum allowed value) +Valid Input Values and Their Processing: -#### Examples of Invalid Time Values +- `0` → stored as `0` or `0.0` +- `0.0` → stored as `0.0` +- `0.000` → stored as `0.000` +- `1.5` → stored as `1.5` +- `10.100` → stored as `10.100` +- `999999.999` → stored as `999999.999` + +IEEE 754 Round-to-Nearest-Even Examples: + +- `1.2345` → `1.234` (rounded down as 4 is even) +- `1.2335` → `1.234` (rounded up as 4 is even) +- `1.2325` → `1.232` (rounded down as 2 is even) +- `1.2315` → `1.232` (rounded up as 2 is even) +- `1.2305` → `1.230` (rounded down as 0 is even) + +Edge Cases: + +- `0.0005` → `0.000` (rounded down to even) +- `0.0015` → `0.002` (rounded up to even) +- `0.0025` → `0.002` (rounded
down to even) +- `0.0035` → `0.004` (rounded up to even) +- `0.0045` → `0.004` (rounded down to even) + +Invalid Values (Must Be Rejected): - `-1.0` (negative values not allowed) - `1.5e3` (scientific notation not allowed) - `1000000.0` (exceeds maximum value) -- `1.2345` (exceeds maximum precision) +- `999999.9995` (would round above maximum) - `1,5` (incorrect decimal separator) +- Non-numeric values + +**Note:** These requirements for time values take precedence over the general [Number Format Requirements](#number-format-requirements) when formatting time-related fields (`start` and `end`). ### Character Encoding Requirements @@ -616,6 +874,7 @@ Applications **MUST**: - Reject files that: - Use ISO 639-3 codes for languages that have ISO 639-1 codes. - Mix different standards for the same language. + - Mix standards across different languages. - Contain invalid language codes. ### URI Format Requirements @@ -761,6 +1020,32 @@ Defines the format and constraints for the `uri` field in the `metadata.source` - All `speaker_id` references in segments **MUST** match an `id` in the `speakers` list. - Invalid references **MUST** result in a validation error. +#### Examples of invalid speaker references + +- Invalid: **References non-existent speaker** + +```json +{ + "speakers": [ + {"id": "Speaker1"} + ], + "segments": [{ + "speaker_id": "Speaker2" + }] +} +``` + +- Invalid: **Invalid character in ID** + +```json +{ + "speakers": [ + {"id": "Speaker@1"}, + {"id": "Speaker1"} + ] +} +``` + #### Implementation Notes - Applications **SHOULD** provide meaningful error messages when validation fails due to speaker ID issues. @@ -781,139 +1066,123 @@ Defines the format and constraints for the `uri` field in the `metadata.source` ### Word Timing Mode Field -#### Purpose - -Indicates the completeness of word-level timing data within the segment. 
+The `word_timing_mode` field indicates how word-level timing data is handled: #### Allowed Values -- `"complete"`: All words in the segment have timing data. -- `"partial"`: Only some words have timing data. -- `"none"`: No word-level timing data is provided. +- `"complete"`: + - **MUST** include a `words` array + - **MUST** have timing for every `word` in the segment + - Concatenated `words[].text` **MUST** match segment `text` when normalized for whitespace + - **MUST NOT** use for segments where word timing isn't applicable or fails + +- `"partial"`: + - **MUST** include a `words` array with at least one word + - Words in array **MUST** appear in same order as in segment text + +- `"none"`: + - **MUST NOT** include a `words` array + - Use for segments where: + - Word timing wasn't attempted + - Word timing isn't applicable (e.g., "[Music]", "[Applause]") + - Word timing was attempted but failed -#### Default Behavior +Empty `words` arrays are not allowed in any mode. For segments where: -- When omitted and a `words` array is present with complete coverage: Treated as `"complete"`. -- When omitted and `words` array is absent: Treated as `"none"`. -- When omitted and `words` array is present but incomplete: Invalid—must explicitly specify `"partial"`. +- Word timing was attempted but failed +- Word timing isn't applicable +- Word timing wasn't attempted +Use `word_timing_mode: "none"` and omit the words array entirely. -#### Constraints +#### Default Behavior -- For `"complete"`: All words **MUST** have timing data, and the concatenation of `words[].text` **SHOULD** match `segment.text`, accounting for whitespace and punctuation. -- For `"partial"`: Some words have timing data; the `words` array **MUST** contain at least one word object. -- For `"none"`: The `words` array **MUST NOT** be included. 
+- When `word_timing_mode` is omitted and a `words` array is present with complete coverage: Treated as `"complete"` +- When `word_timing_mode` is omitted and no `words` array is present: Treated as `"none"` +- When `word_timing_mode` is omitted and `words` array is present but incomplete: Invalid—**MUST** explicitly specify `"partial"` -### Extensions Field Requirements +Note: Empty `words` arrays are never allowed. Use `word_timing_mode: "none"` and omit the array entirely when word timing isn't applicable, fails, or wasn't attempted. -#### Purpose +#### Word Object Requirements -Allows for the inclusion of custom, application-specific metadata and format-specific properties without affecting compatibility with other implementations. +When word timing information is included (modes "complete" or "partial"), the `words` array **MUST** be present and each word object **MUST** include: -#### Structure +- `text` (string): The word text +- `start` (number): Start time in seconds +- `end` (number): End time in seconds +- `confidence` (number, optional): Confidence score for the word -- The `extensions` field, if present, **MUST** be a JSON object. -- Each key in `extensions` **MUST** represent a namespace and **MUST** be a non-empty string. -- The value corresponding to each namespace **MUST** be a JSON object containing key-value pairs specific to that namespace. +Time values MUST follow the Time Format Requirements defined in this specification. -#### Namespaces +Note: For segments where word timing is not applicable or fails, use `word_timing_mode: "none"` and omit the `words` array entirely. -##### Namespace Naming +### Extensions Field Requirements -- Namespaces **SHOULD** be concise and reflect the application, format, or organization. -- Examples include `"myapp"`, `"companyname"`, `"customformat"`. +The `extensions` field allows applications to include custom data without affecting core STJ functionality. 
-##### Reserved Namespaces +#### Structure -- The following namespaces are **RESERVED** for future use by the STJ specification and **MUST NOT** be used for custom data: - - `stj*` (reserved for STJ specification extensions) - - `webvtt` (reserved for WebVTT format mappings) - - `ttml` (reserved for TTML format mappings) - - `ssa` (reserved for SSA/ASS format mappings) - - `srt` (reserved for SubRip format mappings) - - `dfxp` (reserved for DFXP/Timed Text format mappings) - - `smptett` (reserved for SMPTE-TT format mappings) +- The `extensions` field, if present, **MUST** be a JSON object +- Each key in `extensions` **MUST** represent a namespace and **MUST** be a non-empty string +- Each namespace **MUST** contain a valid JSON object -**Applications MUST report an error** if a reserved namespace is used for custom data. +#### Processing Rules -##### Custom Namespaces +- Applications **MUST** ignore any namespaces they don't recognize +- Core STJ fields are authoritative for standard processing +- Extension data **MAY** provide supplementary information but **MUST NOT** override core field behavior -Developers who need to include format-specific properties before official definitions are available: +#### Reserved Namespaces -- May use custom prefixes to create unique namespaces that avoid conflicts with reserved namespaces and clearly indicate their provisional nature, such as `"custom_webvtt"`, `"x_srt"`, or `"experimental_ttml"`. -- Be prepared to migrate their data to the official namespace once the STJ specification provides the definitions. 
+The following namespaces are **RESERVED** for future use by the STJ specification: -##### Examples +- `stj*` (reserved for STJ specification extensions) +- `webvtt` (reserved for WebVTT format mappings) +- `ttml` (reserved for TTML format mappings) +- `ssa` (reserved for SSA/ASS format mappings) +- `srt` (reserved for SubRip format mappings) +- `dfxp` (reserved for DFXP/Timed Text format mappings) +- `smptett` (reserved for SMPTE-TT format mappings) -- **In a `segment` object**: +Applications **MUST** report an error if a reserved namespace is used by applications for custom data - ```json - "extensions": { - "myapp": { - "custom_property": "value", - "analysis_data": { - "sentiment_score": 0.85, - "keywords": ["innovation", "technology"] - } - }, - "analytics": { - "emotion": "happy", - "confidence": 0.9 - } - } - ``` +#### Best Practices -- **In a `style` object with format-specific properties**: +While not required, extension providers are encouraged to: - ```json - { - "id": "caption_style", - "text": { - "color": "#FFFFFF", - "background": "#000000" - }, - "display": { - "align": "center", - "vertical": "bottom" - }, - "extensions": { - "custom_webvtt": { - "line": "auto", - "position": "50%", - "size": "100%" - }, - "myapp": { - "custom_style_property": "value" - } - } - } - ``` +- Document the purpose and usage of their extension fields +- Use clear, descriptive namespace names +- Be especially clear when extension fields relate to core STJ concepts -- **In a `metadata` object**: +#### Examples - ```json - "extensions": { - "project_info": { - "project": "International Conference", - "client": "Global Events Inc." - }, - "notes": { - "review_status": "approved", - "reviewer": "John Doe" +Basic extension: + +```json +"extensions": { + "myapp": { + "custom_field": "value", + "analysis_data": { + "property": "value" } } - ``` - -**Note:** Standard fields defined in the STJ specification **MUST NOT** be duplicated within any namespace in `extensions`. 
For example, including a key `"start"` within a namespace is prohibited if it conflicts with the mandatory `"start"` field of the segment. +} +``` -#### Constraints +Extension with format-specific properties: -- Applications **MUST** ignore any namespaces in `extensions` that they do not recognize. -- The `extensions` field **SHOULD NOT** include essential data required for basic functionality. -- Nested objects and arrays **ARE ALLOWED** within each namespace. -- Keys within namespaces **MUST NOT** duplicate or conflict with standard fields of the containing object. +```json +"extensions": { + "custom_webvtt": { + "line": "auto", + "position": "50%" + } +} +``` ## Implementation Requirements +This section defines how implementations should process STJ files, including handling of optional fields, validation processing, and error reporting. It focuses on the practical aspects of implementing the specification. + ### Handling of Optional Fields Implementations **MUST** support files that include only the mandatory elements: `stj.version`, and `transcript.segments` with `text` values. @@ -922,82 +1191,283 @@ Implementations **SHOULD** gracefully handle the absence of optional fields and For example, if timing information is absent, applications may treat the transcription as untimed text. +### Field-Specific Format Precedence + +When multiple format requirements apply to a field, specific requirements take precedence over general requirements. The precedence order is: + +1. Field-specific requirements (e.g., [Time Format Requirements](#time-format-requirements) for time fields) +2. Type-specific requirements (e.g., general [Number Format Requirements](#number-format-requirements)) +3. 
Global format requirements + +Examples: + +- Time values may include leading zeros as specified in [Time Format Requirements](#time-format-requirements), despite the general prohibition in [Number Format Requirements](#number-format-requirements) +- Language codes must follow their specific format requirements regardless of general string formatting rules + ### Time Value Processing +#### Processing Requirements + +Implementations **MUST**: + +1. **Input Validation**: + - Accept numeric values with any number of decimal places + - Accept time values with or without leading zeros + - Verify decimal separator is period (.) + - Check value is non-negative + - Check value is not in scientific notation + - Reject if the value, after rounding to 3 decimal places, exceeds the maximum range + - Example: reject `999999.9995` because it rounds to `1000000.000`, which exceeds the maximum allowed value of `999999.999` + +2. **Precision Processing**: + - Round values > 3 decimal places using IEEE 754 round-to-nearest-even + - Preserve original precision up to 3 decimal places + - Do not normalize to 3 decimal places + - Example: `1.5` remains `1.5`, not normalized to `1.500` + +3.
**Output Requirements**: + - Store values with maximum 3 decimal places + - Preserve existing decimal places up to 3 + - Preserve leading zeros in time values when present + - Do not add or remove leading zeros when processing time values + - Include decimal point if original value had decimal places + - Do not add/remove trailing zeros + +#### Time Value Validation Severity + +- **ERROR Level** (Must reject file): + - Negative values + - Values exceeding range (before or after rounding) + - Scientific notation + - Non-numeric values + - Incorrect decimal separator + - Missing required time field when its pair is present + +- **INFO Level**: + - Rounding of values with more than 3 decimal places + - Preservation of existing precision (not normalizing to 3 decimal places) + +#### Error Handling for Time Values + Implementations **MUST**: -- Parse time values with up to 3 decimal places. -- Preserve the precision of input values up to 3 decimal places. -- Round any input with more than 3 decimal places to 3 decimal places using IEEE 754 round-to-nearest-even. -- Validate all time values according to the [Time Format Requirements](#time-format-requirements) section. +- **For Invalid Time Values (ERROR level)**: + - Report specific validation failure (e.g., "negative value", "exceeds range") + - Include the invalid value in error message + - Reject the entire STJ file + - Example message: "Error: Invalid time value -1.0 at segment[0].start (negative values not allowed)" + +- **For Rounded Time Values (INFO level)**: + - MAY report when rounding has occurred + - Include original and rounded values in message + - Example message: "Info: Time value 1.2345 rounded to 1.234 at segment[2].end" -Implementations **MUST** reject files that contain any of the following: +Implementations **SHOULD**: -- Negative time values. -- Values exceeding 999999.999 seconds. -- Time values using scientific notation.
+- Collect all time value errors before rejecting file +- Provide line/position information for errors when possible +- Include guidance in error messages about valid time formats -**Note:** Overlapping segments **SHOULD** be reported as warnings but do not require the file to be rejected. +**Note**: These time value requirements apply to all time fields in the STJ format, including segment times (`start`, `end`) and word-level timing data. For validation severity levels and error handling requirements, see the [Validation Requirements](#validation-requirements) section. ### Error Handling Implementations **MUST**: - **For ERROR-level issues**: - - Report the issues to the user. + - Report the issues to the user or calling process. - **MUST NOT** proceed with processing the STJ file. + - **Example ERROR issues**: + - Overlapping segments. + - Unordered segments. + - Invalid references. + - Missing required fields. + - Malformed data. + - **For WARNING-level issues**: - - Report the issues to the user. - - **MAY** proceed with processing, but **SHOULD** handle the potential inconsistencies. + - Report the issues to the user or calling process. + - **MAY** proceed with processing, but should do so cautiously. + - **Example WARNING issues**: + - Use of deprecated fields. + - Non-standard language codes. + - **For INFO-level issues**: - - **MAY** report the issues to the user for informational purposes. - - **MAY** proceed with processing without any changes. + - Reporting is optional. + - Processing should proceed normally. + - **Example INFO issues**: + - Suggestions for metadata enhancements. Implementations **SHOULD** strive to provide meaningful feedback to users to improve the quality of STJ files. -## Validation Approach +## Validation Requirements + +Implementations of the STJ format **MUST** perform validation that categorizes issues by severity levels. 
This section defines what must be validated, including validation rules and their associated severity levels. This approach ensures that: -Implementations of the STJ format **MUST** perform validation that categorizes issues by severity levels. This approach ensures that users are informed about the nature of any issues found in the STJ file and can take appropriate action based on the severity. +- Users are informed about the nature of any issues found in STJ files +- Appropriate actions can be taken based on severity +- Validation is consistent across implementations + +For details on how to implement these validation requirements, see the Implementation Requirements section. ### Severity Levels -Validation issues are categorized into three severity levels: - -1. **ERROR** (MUST violations) - - Issues that make the STJ file invalid and unusable. - - Examples: - - Invalid JSON structure. - - Missing mandatory fields (`stj.version`, `transcript.segments[].text`). - - Malformed data types. - -2. **WARNING** (SHOULD violations) - - Issues that do not invalidate the STJ file but may lead to unexpected behavior. - - Examples: - - Duplicate speaker IDs. - - Overlapping time segments. - - Missing recommended fields. - - Inconsistent language codes. - -3. **INFO** (MAY violations) - - Informational messages about optional best practices. - - Examples: - - Unused style definitions. - - Missing optional metadata. - - Unrecognized extensions. - -### Validation Process - -Implementations **SHOULD** follow these guidelines during validation: - -- **Comprehensive Validation**: - - Validate the entire STJ file, collecting all issues, rather than stopping at the first error. -- **Structured Reporting**: - - Provide structured results with clear severity levels. - - Include specific details about each issue. -- **Contextual Information**: - - Include the JSON path to the problematic field. - - Reference the relevant section of the specification. 
- - Suggest possible fixes when appropriate. +The STJ specification uses three severity levels to indicate the impact of validation issues: + +#### ERROR + +- Definition: Critical issues that make the file semantically invalid or could cause incorrect processing +- Result: File MUST be rejected +- Examples: + - Missing required fields + - Invalid field types or values + - Time value violations + - Overlapping segments + - Unordered segments + - Invalid references + - Malformed data + +#### WARNING + +- Definition: Issues that indicate potential problems but don't invalidate the file +- Result: Processing MAY continue with caution +- Examples: + - Use of deprecated fields + - Non-standard language codes + - Non-optimal patterns + - Unnecessary empty arrays/objects + +#### INFO + +- Definition: Suggestions for improvements or notifications of automatic adjustments +- Result: Processing continues normally +- Examples: + - Time value rounding occurred + - Metadata completeness suggestions + - Efficiency recommendations + - Style definition optimizations + +### Validation Sequence + +Implementations **SHOULD** perform validation in the following order: + +1. **Structure Validation**: + - JSON structure validity + - Root object requirements + - Required fields presence + - Array and object structure rules + +2. **Field Validation**: + - Data type requirements + - Value constraints + - Format requirements + +3. **Reference Validation**: + - Speaker ID references + - Style ID references + - Language code consistency + +4. **Content Validation**: + - Segment timing rules + - Word timing rules + - Text content requirements + +5. **Application-Specific Validation**: + - Implementation-specific requirements + - Custom extensions + +The rules applied at each step of this sequence are catalogued in the [Validation Categories and Rules](#validation-categories-and-rules) section. + +### Validation Categories and Rules + +This section provides an overview of all validation requirements organized by category.
Detailed rules can be found in their referenced sections. + +#### Structure Validation + +##### Basic File Structure + +- JSON structure and encoding: See [Character Encoding Requirements](#character-encoding-requirements) +- Root object requirements: See [Root Structure](#root-structure) +- Additional properties restrictions: See [Root Structure](#root-structure) +- File extension requirements: See [Specification](#specification) + +##### Empty Value Rules + +- Null value restrictions: See [Empty Value Constraints](#empty-value-constraints) +- Empty array handling: See [Empty Array Rules](#empty-array-rules) +- Empty object handling: See [Empty Object Rules](#empty-object-rules) +- Empty string handling: See [Empty String Rules](#empty-string-rules) + +##### Array Structure + +- Array ordering requirements: See [Array Ordering Requirements](#array-ordering-requirements) +- Mandatory vs optional arrays: See [Empty Array Rules](#empty-array-rules) + +#### Field-Specific Validation + +##### Time Values + +- Format and range requirements: See [Time Format Requirements](#time-format-requirements) +- Precision and rounding rules: See [Time Format Requirements](#time-format-requirements) +- Basic constraints: See [Basic Constraints](#basic-constraints) under Time Format Requirements +- Zero-duration requirements: See [Basic Constraints](#basic-constraints) under Time Format Requirements + +##### Language Codes + +- Standard requirements: See [Language Codes > Standards](#standards) +- Consistency requirements: See [Language Codes > Consistency Requirements](#consistency-requirements) +- Application requirements: See [Language Codes > Application Requirements](#application-requirements) + +##### Speaker and Style IDs + +- Format specifications: See [Speaker IDs > Format Specifications](#format-specifications) +- Uniqueness requirements: See [Speaker IDs > Validation Rules](#validation-rules) +- Reference validation: See [Speaker IDs > Examples of invalid speaker 
references](#examples-of-invalid-speaker-references) +- Style ID requirements: See [Style IDs](#style-ids) + +##### URI Validation + +- Format specifications: See [URI Format Requirements](#uri-format-requirements) +- Scheme support: See [URI Format Requirements > Format Specifications](#format-specifications-1) +- Security considerations: See [URI Format Requirements > Security Considerations](#security-considerations) + +##### Metadata Validation + +- Field requirements: See [Metadata Section > Fields](#fields) +- Language specifications: See [Metadata Section > Clarification on languages Fields](#clarification-on-languages-fields) + +#### Content Validation + +##### Segment Validation + +- Required fields: See [Segment-Level Validation > Required Fields](#required-fields) +- Time field requirements: See [Segment-Level Validation > Time Fields](#time-fields) +- Reference validation: See [Segment-Level Validation > References](#references) +- Ordering requirements: See [Segment-Level Validation > Segment Ordering](#segment-ordering) +- Overlap restrictions: See [Segment-Level Validation > Segment Overlap](#segment-overlap) +- Zero-duration rules: See [Segment-Level Validation > Zero-Duration Segments](#zero-duration-segments) + +##### Word Level Validation + +- Required field validation: See [Word-Level Validation > Required Field Validation](#required-field-validation) +- Timing validation: See [Word-Level Validation > Timing Validation](#timing-validation) +- Mode-specific validation: See [Word-Level Validation > Mode-Specific Validation](#mode-specific-validation) +- Text alignment requirements: See [Word Text Alignment > Requirements](#requirements) + +##### Extensions Validation + +- Structure requirements: See [Extensions Field Requirements > Structure](#structure) +- Reserved namespace protection: See [Extensions Field Requirements > Reserved Namespaces](#reserved-namespaces) +- Processing rules: See [Extensions Field Requirements > Processing 
Rules](#processing-rules) + +### Error Reporting Requirements + +Implementations **MUST**: + +- Provide clear error messages when **ERROR** level issues are detected. +- Include the JSON path to the problematic field in error messages. +- **MUST NOT** process the STJ file further if **ERROR** level issues are present. +- **SHOULD** report **WARNING** and **INFO** level issues to guide users. +- Report multiple validation issues when possible, rather than stopping at the first error. ### Response Format @@ -1037,57 +1507,6 @@ Implementations **SHOULD** output validation results in a structured format, suc } ``` -### Processing Instructions - -Implementations **SHOULD** follow the validation sequence outlined in the [Validation Requirements](#validation-sequence) section to ensure consistency and completeness. - -### Best Practices - -- **Error Messages**: - - Be specific and actionable. - - Use consistent terminology. - - Reference relevant specification sections. - -- **Extensibility**: - - Support custom validation rules if needed. - - Allow users to filter or prioritize certain rules. - -- **Performance**: - - Optimize validation to handle large STJ files efficiently. - - Avoid redundant checks by caching results when appropriate. - -## Validation Requirements - -### Validation Sequence - -Implementations **SHOULD** perform validation in the following order: - -1. **Structure Validation**: - - Ensure the JSON structure is valid. - - Validate that the root structure contains a single `"stj"` object with the required fields. -2. **Field Validation**: - - Validate individual fields based on their definitions. -3. **Reference Validation**: - - Check that references (e.g., `speaker_id`, `style_id`) are valid. -4. **Content Validation**: - - Verify content-specific rules (e.g., timing overlaps). -5. **Application-Specific Validation**: - - Perform any additional validations required by the application. -6. 
**Extensions Validation**: - - Validate the `extensions` field structure and namespaces. - -This sequence aligns with the guidelines provided in the [Validation Approach](#validation-approach) section. - -### Error Reporting Requirements - -Implementations **MUST**: - -- Provide clear error messages when **ERROR** level issues are detected. -- Include the JSON path to the problematic field in error messages. -- **MUST NOT** process the STJ file further if **ERROR** level issues are present. -- **SHOULD** report **WARNING** and **INFO** level issues to guide users. -- Report multiple validation issues when possible, rather than stopping at the first error. - ### Segment-Level Validation - **Required Fields**: @@ -1096,7 +1515,7 @@ Implementations **MUST**: - **Time Fields**: - `start` and `end` times, if present, **MUST** conform to the [Time Format Requirements](#time-format-requirements) section. - **Severity if violated:** ERROR - - If `start` equals end, `is_zero_duration` MUST be included and set to `true`. + - If `start` equals end, `is_zero_duration` **MUST** be included and set to `true`. - **Severity if violated:** ERROR - **References**: @@ -1106,36 +1525,220 @@ Implementations **MUST**: - **Severity if violated:** ERROR - **Segment Ordering**: - - Segments **SHOULD** be ordered by their `start` times in ascending order. - - **Severity if violated:** WARNING - - For segments with identical start times, they **SHOULD** be ordered by their end times in ascending order. - - **Severity if violated:** WARNING + - Segments **MUST** be ordered by their `start` times in ascending order. + - **Severity if violated:** ERROR + - **Rationale**: Unordered segments can disrupt processing logic and lead to incorrect media synchronization. + - For segments with identical start times, they **MUST** be ordered by their end times in ascending order. + - **Severity if violated:** ERROR + - **Rationale**: Consistent ordering is essential for predictable processing and display. 
+ - For segments with identical start and end times, the original array order **MUST** be preserved. + - **Severity if violated:** ERROR + - **Rationale**: Maintaining original order ensures stable sorting and preserves intended sequence of simultaneous events. - **Segment Overlap**: - - Segments **SHOULD NOT** overlap in time. - - **Severity if violated:** WARNING - - **Guidelines for Overlapping Segments**: - - Applications **SHOULD** handle overlapping segments gracefully, such as by merging or adjusting timings. - - Overlapping segments **MAY** indicate issues with the data that users should review. + - Segments **MUST NOT** overlap in time. + - **Severity if violated:** ERROR + - **Rationale**: Overlapping segments create ambiguity about which text applies at what time and can cause rendering issues. + - **Error Recovery Guidelines**: + - While overlapping segments make an STJ file invalid, applications processing potentially invalid files **SHOULD** implement error recovery strategies rather than fail completely. + - Recovery strategies **MAY** include: + - Merging overlapping segments + - Adjusting segment timings to eliminate overlaps + - Alerting users to review and correct the overlaps + - Applications implementing recovery strategies **MUST** still report the overlap as an ERROR during validation. - **Zero-Duration Segments**: - **MUST** follow the zero-duration requirements defined in the [Time Format Requirements](#time-format-requirements) section. - **Severity if violated:** ERROR + - The presence of `is_zero_duration` when `start` does not equal `end` **MUST** result in an ERROR + +- **Timing Consistency**: + - If any segment in a transcript includes timing information (`start` and `end`), all segments in that transcript MUST include timing information. + - **Severity if violated:** ERROR + - **Rationale**: Mixed timed/untimed segments create ambiguity in processing and display. 
+ +#### Overlapping Segments Examples + +**Example of Non-Compliant Overlapping Segments:** + +```json +{ + "segments": [ + { + "start": 5.0, + "end": 10.0, + "text": "First segment" + }, + { + "start": 8.0, + "end": 12.0, + "text": "Second segment" + } + ] +} +``` + +*Explanation*: The second segment starts at 8.0 seconds, which is before the end of the first segment at 10.0 seconds. This creates an overlap between 8.0 and 10.0 seconds, violating the requirement that segments **MUST NOT** overlap. ### Word-Level Validation -- **When `words` array is present**: - - Each word object **MUST** have `text`, `start`, and `end`. - - **Severity if violated:** ERROR - - All time values **MUST** conform to the [Time Format Requirements](#time-format-requirements) section. - - **Severity if violated:** ERROR - - Word timing constraints: - - Word times **MUST** be within the parent segment's time range. - - **Severity if violated:** ERROR - - Words **MUST** be ordered by `start` time. - - **Severity if violated:** ERROR - - Word timings **SHOULD NOT** overlap. 
- - **Severity if violated:** WARNING +#### Required Field Validation + +When the `words` array is present: + +- Each word object **MUST** have: + - `text` (string, non-empty) + - `start` (number) + - `end` (number) + - **Severity if violated:** ERROR + +#### Timing Validation + +- Word times MUST be within the parent segment's time range + - **Severity if violated:** ERROR +- Words MUST be ordered by `start` time + - **Severity if violated:** ERROR +- Word timings SHOULD NOT overlap + - **Severity if violated:** WARNING + +#### Mode-Specific Validation + +##### Complete Mode (`word_timing_mode: "complete"`) + +The `words` array: + +- **MUST** be present and non-empty +- **MUST** have concatenated `words[].text` match segment `text` when normalized for whitespace +- **MUST** have timing data for each word +- **Severity if violated:** ERROR + +##### Partial Mode (`word_timing_mode: "partial"`) + +The `words` array: + +- **MUST** be present and contain at least one word +- **MUST** have each `words[].text` match a substring in segment `text` +- **MUST** have words appear in the same order as in segment `text` +- **Severity if violated:** ERROR + +##### None Mode (`word_timing_mode: "none"`) + +The `words` array: + +- **MUST NOT** be present (array must be completely omitted, not included as empty) +- Use this mode for segments where: + - Word timing wasn't attempted + - Word timing isn't applicable + - Word timing was attempted but failed +- **Severity if violated:** ERROR + +#### Example of Failed Word Timing + +```json +{ + "start": 15.0, + "end": 20.0, + "text": "Background noise made word timing impossible", + "word_timing_mode": "none" + // Note: words array is entirely omitted, not included as empty +} +``` + +### Word Text Alignment + +#### Requirements + +1. **Word Order** + - Words in the `words` array MUST appear in the same order as they do in the segment's `text` field. 
+ - The text of each word in `words[].text` MUST match its corresponding occurrence in the segment's `text` field. + +2. **Text Matching** + - Implementations MUST preserve the exact text content of words, including: + - Case sensitivity + - Punctuation + - Special characters + - Whitespace within word boundaries (if any) + +3. **Tokenization** + - For `word_timing_mode: "complete"`: + - The concatenated `words[].text` MUST match the segment's `text` when normalized for inter-word whitespace. + - For `word_timing_mode: "partial"`: + - Each `words[].text` MUST match a corresponding substring in the segment's `text`. + - Words MUST be tokenized consistently within a segment. + +#### Examples + +1. **Complete Word Timing**: + +```json +{ + "text": "Hello, world!", + "word_timing_mode": "complete", + "words": [ + {"text": "Hello,", "start": 0.0, "end": 0.5}, + {"text": "world!", "start": 0.6, "end": 1.0} + ] +} +``` + +2. **Partial Word Timing:** + +```json +{ + "text": "Hello, wonderful world!", + "word_timing_mode": "partial", + "words": [ + {"text": "Hello,", "start": 0.0, "end": 0.5}, + {"text": "world!", "start": 1.0, "end": 1.5} + ] +} +``` + +3. **Complex Punctuation Example:** + +```json +{ + "text": "\"Don't,\" she said, \"go there!\"", + "word_timing_mode": "partial", + "words": [ + {"text": "\"Don't,\"", "start": 0.0, "end": 0.5}, + {"text": "there!\"", "start": 1.0, "end": 1.5} + ] +} +``` + +### Word Timing Implementation Notes + +#### Tokenization Recommendations + +1. **Basic Tokenization** + - Split on whitespace as a baseline approach + - Preserve punctuation attached to words + - Keep contractions as single tokens + - Maintain quotation marks with their associated words + +2. **Edge Cases** + - Multi-word expressions (e.g., "New York") should be treated as single tokens if timed as one unit + - Hyphenated words should be kept as single tokens + - Numbers, dates, and times should be treated as single tokens + +#### Text Alignment Strategies + +1. 
**For Complete Mode**: + - Validate that all words are present + - Compare normalized text (removing extra whitespace) to detect missing or extra words + - Report specific mismatches to aid debugging + +2. **For Partial Mode**: + - Use string matching to verify word presence and order + - Consider implementing fuzzy matching for robustness + - Cache tokenization results for efficiency + +#### Performance Considerations + +- Consider caching tokenization results +- Use efficient string matching algorithms for validation +- Implement incremental validation for large documents ### General Validation @@ -1151,8 +1754,10 @@ Implementations **MUST**: - **Language Code Requirements**: - All language codes **MUST** be valid ISO 639 codes. - **Severity if violated:** ERROR - - Language codes **SHOULD** be consistent throughout the file. - - **Severity if violated:** WARNING + - Language codes **MUST** use ISO 639-1 codes when available. + - **Severity if violated:** ERROR + - Language codes **MUST** be consistent throughout the file, using ISO 639-1 where available and ISO 639-3 only for languages without ISO 639-1 codes. + - **Severity if violated:** ERROR - **Confidence Score Requirements**: - Confidence scores, if present, **MUST** be within the range [0.0, 1.0]. @@ -1176,33 +1781,45 @@ Implementations **MUST**: - Control characters **MUST** be properly escaped. - **Severity if violated:** ERROR +- **Time Value Requirements**: + - All time values **MUST** conform to the [Time Format Requirements](#time-format-requirements) and Processing Requirements. + - **Severity if violated:** ERROR + - Input validation requirements **MUST** be checked before rounding. + - **Severity if violated:** ERROR + - Leading zeros in time values **MUST** be preserved if present. + - **Severity if violated:** ERROR + - Rounding of time values with more than 3 decimal places **MUST** be reported. + - **Severity level:** INFO + - Precision preservation requirements **MUST** be followed. 
+ - **Severity level:** INFO + ### Extensions Field Validation - **Structure Validation:** - - The `extensions` field, if present, **MUST** be a JSON object. - - Namespaces **MUST** be strings and **MUST NOT** be empty. - - Values corresponding to namespaces **MUST** be JSON objects. + - The `extensions` field, if present, **MUST** be a JSON object + - Namespaces **MUST** be strings and **MUST NOT** be empty + - Values corresponding to namespaces **MUST** be JSON objects - **Reserved Namespaces Validation:** - - Namespaces listed as **RESERVED** in the specification **MUST NOT** be used by applications for custom data. - - Applications **MUST** report an error if a reserved namespace is used. + - Namespaces listed as **RESERVED** **MUST NOT** be used by applications for custom data + - Applications **MUST** report an error if a reserved namespace is used - **Content Validation:** - - Applications **MUST** ignore any namespaces or keys within `extensions` that they do not recognize. - - Values within namespaces **MAY** be validated based on application-specific requirements. + - Applications **MUST** ignore any namespaces they don't recognize + - Values within namespaces **MAY** be validated based on application-specific requirements -- **Conflict Resolution:** - - If a key within a namespace in `extensions` conflicts with a standard field, the standard field's value **MUST** take precedence. - - Applications **MUST** report an error if a conflict is detected. 
+- **Core Field Priority:** + - When processing STJ files, applications **MUST** use core field values for standard functionality + - Extension data **MUST NOT** override or alter the behavior of core fields ### Style Processing Implementations: -- MAY support none, some, or all style properties -- MUST ignore style properties they don't support -- MUST document which style properties they support -- SHOULD provide reasonable fallback behavior for unsupported properties +- **MAY** support none, some, or all style properties +- **MUST** ignore style properties they don't support +- **MUST** document which style properties they support +- **SHOULD** provide reasonable fallback behavior for unsupported properties When converting STJ to other formats, implementations: diff --git a/stjlib/validation.py b/stjlib/validation.py new file mode 100644 index 0000000..0519ecb --- /dev/null +++ b/stjlib/validation.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/tests/python/test_examples_with_schema.py b/tests/python/test_examples_with_schema.py new file mode 100644 index 0000000..96f48ae --- /dev/null +++ b/tests/python/test_examples_with_schema.py @@ -0,0 +1,28 @@ +import json +import jsonschema + +def test_simple_example_schema(): + """Test simple.stj.json against JSON schema.""" + # Load schema + with open('spec/schema/latest/stj-schema.json') as f: + schema = json.load(f) + + # Load STJ file + with open('examples/latest/simple.stj.json') as f: + data = json.load(f) + + # Validate against schema + jsonschema.validate(instance=data, schema=schema) + +def test_complex_example_schema(): + """Test complex.stj.json against JSON schema.""" + # Load schema + with open('spec/schema/latest/stj-schema.json') as f: + schema = json.load(f) + + # Load STJ file + with open('examples/latest/complex.stj.json') as f: + data = json.load(f) + + # Validate against schema + jsonschema.validate(instance=data, schema=schema) \ No newline at end of file diff --git 
a/tests/python/test_examples_with_validator.py b/tests/python/test_examples_with_validator.py new file mode 100644 index 0000000..d9833a5 --- /dev/null +++ b/tests/python/test_examples_with_validator.py @@ -0,0 +1,13 @@ +from stjlib import StandardTranscriptionJSON + +def test_simple_example(): + """Test simple.stj.json with stjlib validation.""" + stj = StandardTranscriptionJSON.from_file('examples/latest/simple.stj.json') + validation_issues = stj.validate(raise_exception=False) + assert not validation_issues + +def test_complex_example(): + """Test complex.stj.json with stjlib validation.""" + stj = StandardTranscriptionJSON.from_file('examples/latest/complex.stj.json') + validation_issues = stj.validate(raise_exception=False) + assert not validation_issues \ No newline at end of file diff --git a/tests/python/test_spec_samples.py b/tests/python/test_spec_samples.py new file mode 100644 index 0000000..7e1ea52 --- /dev/null +++ b/tests/python/test_spec_samples.py @@ -0,0 +1,890 @@ +"""Tests for examples from the STJ specification document. + +This module contains tests for all JSON examples provided in the STJ specification. +Each test validates that the example from the spec is valid according to stjlib. 
+ +Spec sections covered: +- Root Structure (#root-structure) + - Basic structure + - Invalid root structures +- Metadata Section (#metadata-section) + - Basic metadata + - Languages metadata + - Source metadata +- Transcript Section (#transcript-section) + - Speakers + - Segments + - Styles +- Word Timing (#word-timing) + - Complete mode + - Partial mode + - Text alignment +- Time Format (#time-format) + - Valid time values + - Invalid time values +- Extensions (#extensions) + - Custom extensions + - Format-specific extensions +- Format Comparisons + - SRT examples + - WebVTT examples + - TTML examples +- Empty Value Validation (#empty-value-validation) + - Array validation + - Object validation + - String validation +- Language Code Validation (#language-code-validation) + - ISO 639-1 codes + - ISO 639-3 codes + - Code consistency +- Reserved Namespace Examples (#reserved-namespaces) + - Reserved namespace protection + - Custom namespace usage +""" + +import pytest +from stjlib import StandardTranscriptionJSON + +# Root Structure Examples +def test_basic_stj_structure(): + """Tests the basic STJ structure example. + + Reference: spec/latest/stj-specification.md#root-structure + """ + stj_data = { + "stj": { + "version": "0.6.0", + "transcript": { + "segments": [ + {"text": "Hello world"} + ] + } + } + } + + stj = StandardTranscriptionJSON.from_dict(stj_data) + validation_issues = stj.validate(raise_exception=False) + assert not validation_issues + +def test_invalid_root_structures(): + """Tests the invalid root structure examples. 
+ + Reference: spec/latest/stj-specification.md#examples-of-invalid-root-structures + """ + # Missing mandatory fields + with pytest.raises(Exception): + StandardTranscriptionJSON.from_dict({"stj": {}}) + + # Missing transcript + with pytest.raises(Exception): + StandardTranscriptionJSON.from_dict({ + "stj": { + "version": "0.6.0" + } + }) + + # Missing stj root object + with pytest.raises(Exception): + StandardTranscriptionJSON.from_dict({ + "version": "0.6.0", + "transcript": {} + }) + +def test_metadata_example(): + """Tests the metadata section example. + + Reference: spec/latest/stj-specification.md#metadata-section + """ + stj_data = { + "stj": { + "version": "0.6.0", + "metadata": { + "transcriber": { + "name": "YAWT", + "version": "0.6.0" + }, + "created_at": "2024-10-27T12:00:00Z" + }, + "transcript": { + "segments": [{"text": "Hello"}] + } + } + } + + stj = StandardTranscriptionJSON.from_dict(stj_data) + validation_issues = stj.validate(raise_exception=False) + assert not validation_issues + +def test_languages_metadata_example(): + """Tests the languages metadata example. + + Reference: spec/latest/stj-specification.md#metadata-section + """ + stj_data = { + "stj": { + "version": "0.6.0", + "metadata": { + "transcriber": { + "name": "YAWT", + "version": "0.4.0" + }, + "created_at": "2024-10-20T12:00:00Z", + "source": { + "uri": "https://example.com/multilingual_media.mp4", + "duration": 3600.5, + "languages": ["en", "es"] + }, + "languages": ["fr"], + "confidence_threshold": 0.6, + "extensions": { + "project_info": { + "project": "International Conference", + "client": "Global Events Inc." + } + } + }, + "transcript": { + "segments": [{"text": "Hello"}] + } + } + } + + stj = StandardTranscriptionJSON.from_dict(stj_data) + validation_issues = stj.validate(raise_exception=False) + assert not validation_issues + +# Speakers Examples +def test_speakers_example(): + """Tests the speakers example. 
+ + Reference: spec/latest/stj-specification.md#speakers + """ + stj_data = { + "stj": { + "version": "0.6.0", + "transcript": { + "speakers": [ + {"id": "Speaker1", "name": "Dr. Smith"}, + {"id": "Speaker2", "name": "Señora García"}, + {"id": "Speaker3", "name": "Monsieur Dupont"}, + {"id": "Speaker4"} # Anonymous speaker + ], + "segments": [{"text": "Hello"}] + } + } + } + + stj = StandardTranscriptionJSON.from_dict(stj_data) + validation_issues = stj.validate(raise_exception=False) + assert not validation_issues + +# Segments Examples +def test_segments_example(): + """Tests the segments example with different word timing modes. + + Reference: spec/latest/stj-specification.md#segments + """ + stj_data = { + "stj": { + "version": "0.6.0", + "transcript": { + "speakers": [ + {"id": "Speaker1", "name": "Speaker One"}, + {"id": "Speaker2", "name": "Speaker Two"}, + {"id": "Speaker3", "name": "Speaker Three"} + ], + "segments": [ + { + "start": 0.0, + "end": 5.0, + "text": "Bonjour tout le monde.", + "speaker_id": "Speaker1", + "confidence": 0.95, + "language": "fr", + "word_timing_mode": "complete", + "words": [ + {"start": 0.0, "end": 1.0, "text": "Bonjour"}, + {"start": 1.0, "end": 2.0, "text": "tout"}, + {"start": 2.0, "end": 3.0, "text": "le"}, + {"start": 3.0, "end": 4.0, "text": "monde."} + ] + }, + { + "start": 5.1, + "end": 10.0, + "text": "Gracias por estar aquí hoy.", + "speaker_id": "Speaker2", + "confidence": 0.93, + "language": "es", + "word_timing_mode": "partial", + "words": [ + {"start": 5.1, "end": 5.5, "text": "Gracias"} + ] + }, + { + "start": 10.1, + "end": 10.1, + "is_zero_duration": True, + "text": "[Applause]", + "speaker_id": "Speaker3", + "confidence": 0.92, + "language": "en" + } + ] + } + } + } + + stj = StandardTranscriptionJSON.from_dict(stj_data) + validation_issues = stj.validate(raise_exception=False) + assert not validation_issues + +def test_multilingual_example(): + """Tests the multilingual transcription example. 
+ + Reference: spec/latest/stj-specification.md#example-scenario-translated-transcription + """ + stj_data = { + "stj": { + "version": "0.6.0", + "metadata": { + "transcriber": { + "name": "YAWT", + "version": "0.4.0" + }, + "created_at": "2024-10-20T12:00:00Z", + "source": { + "uri": "https://example.com/event.mp4", + "duration": 5400.0, + "languages": ["en", "es"] + }, + "languages": ["fr", "de"] + }, + "transcript": { + "speakers": [ + {"id": "Speaker1", "name": "French Translator"}, + {"id": "Speaker2", "name": "German Translator"} + ], + "segments": [ + { + "start": 0.0, + "end": 5.0, + "text": "Bonjour à tous.", + "speaker_id": "Speaker1", + "confidence": 0.95, + "language": "fr" + }, + { + "start": 5.1, + "end": 10.0, + "text": "Willkommen alle zusammen.", + "speaker_id": "Speaker2", + "confidence": 0.94, + "language": "de" + } + ] + } + } + } + + stj = StandardTranscriptionJSON.from_dict(stj_data) + validation_issues = stj.validate(raise_exception=False) + assert not validation_issues + +# Style Examples +def test_style_examples(): + """Tests the style examples from the spec. 
+ + Reference: spec/latest/stj-specification.md#examples-1 + """ + stj_data = { + "stj": { + "version": "0.6.0", + "transcript": { + "styles": [ + { + "id": "speaker_1", + "text": { + "color": "#2E4053", + "bold": True, + "size": "110%" + } + }, + { + "id": "caption_style", + "text": { + "color": "#FFFFFF", + "background": "#000000" + }, + "display": { + "align": "center", + "vertical": "bottom", + "position": { + "x": "50%", + "y": "90%" + } + } + }, + { + "id": "advanced_style", + "text": { + "color": "#FFFFFF" + }, + "extensions": { + "custom_ssa": { + "effect": "karaoke", + "outline": 2, + "shadow": 1 + } + } + } + ], + "segments": [{"text": "Hello"}] + } + } + } + + stj = StandardTranscriptionJSON.from_dict(stj_data) + validation_issues = stj.validate(raise_exception=False) + assert not validation_issues + +def test_word_text_alignment_examples(): + """Tests the word text alignment examples from the spec. + + Reference: spec/latest/stj-specification.md#word-text-alignment + """ + stj_data = { + "stj": { + "version": "0.6.0", + "transcript": { + "segments": [ + { + "text": "Hello, world!", + "word_timing_mode": "complete", + "words": [ + {"text": "Hello,", "start": 0.0, "end": 0.5}, + {"text": "world!", "start": 0.6, "end": 1.0} + ] + }, + { + "text": "Hello, wonderful world!", + "word_timing_mode": "partial", + "words": [ + {"text": "Hello,", "start": 0.0, "end": 0.5}, + {"text": "world!", "start": 1.0, "end": 1.5} + ] + }, + { + "text": "\"Don't,\" she said, \"go there!\"", + "word_timing_mode": "partial", + "words": [ + {"text": "\"Don't,\"", "start": 0.0, "end": 0.5}, + {"text": "there!\"", "start": 1.0, "end": 1.5} + ] + } + ] + } + } + } + + stj = StandardTranscriptionJSON.from_dict(stj_data) + validation_issues = stj.validate(raise_exception=False) + assert not validation_issues + +def test_extension_examples(): + """Tests the extension examples from the spec. 
+ + Reference: spec/latest/stj-specification.md#extensions-field-requirements + """ + stj_data = { + "stj": { + "version": "0.6.0", + "transcript": { + "segments": [ + { + "text": "Hello", + "extensions": { + "myapp": { + "custom_field": "value", + "analysis_data": { + "property": "value" + } + } + } + }, + { + "text": "World", + "extensions": { + "custom_webvtt": { + "line": "auto", + "position": "50%" + } + } + } + ] + } + } + } + + stj = StandardTranscriptionJSON.from_dict(stj_data) + validation_issues = stj.validate(raise_exception=False) + assert not validation_issues + +def test_uri_format_examples(): + """Tests the URI format examples from the spec. + + Reference: spec/latest/stj-specification.md#uri-format-requirements + """ + stj_data = { + "stj": { + "version": "0.6.0", + "metadata": { + "source": { + "uri": "http://example.com/media/video.mp4" + } + }, + "transcript": { + "segments": [{"text": "Hello"}] + } + } + } + + stj = StandardTranscriptionJSON.from_dict(stj_data) + validation_issues = stj.validate(raise_exception=False) + assert not validation_issues + + # Test other URI examples + valid_uris = [ + "https://example.com/media/audio.mp3", + "file:///C:/Media/video.mp4", + "file:///home/user/media/audio.mp3", + "s3://bucket-name/path/to/object" + ] + + for uri in valid_uris: + stj_data["stj"]["metadata"]["source"]["uri"] = uri + stj = StandardTranscriptionJSON.from_dict(stj_data) + validation_issues = stj.validate(raise_exception=False) + assert not validation_issues + +def test_complex_multilingual_example(): + """Tests the complex multilingual example from the spec. 
+ + Reference: spec/latest/stj-specification.md#example-scenario-translated-transcription + """ + stj_data = { + "stj": { + "version": "0.6.0", + "metadata": { + "transcriber": { + "name": "YAWT", + "version": "0.4.0" + }, + "created_at": "2024-10-19T15:30:00Z", + "source": { + "uri": "https://example.com/funny_conference.mp4", + "duration": 1800.0, + "languages": ["en", "es", "de"] + }, + "languages": ["en", "es", "de"], + "confidence_threshold": 0.6, + "additional_info": { + "project": "Annual Humor Conference", + "client": "LaughCorp International" + } + }, + "transcript": { + "speakers": [ + { + "id": "Speaker1", + "name": "Dr. Chuckles", + "additional_info": { + "role": "Keynote Speaker" + } + }, + { + "id": "Speaker2", + "name": "Ms. Giggles", + "additional_info": { + "role": "Panelist" + } + }, + { + "id": "Speaker3", + "name": "Herr Lachen", + "additional_info": { + "role": "Guest Speaker" + } + } + ], + "styles": [ + { + "id": "Style1", + "formatting": { + "bold": True, + "italic": False, + "underline": False, + "color": "#FF5733", + "background_color": "#000000" + }, + "positioning": { + "align": "center", + "line": "auto", + "position": "50%", + "size": "100%" + } + } + ], + "segments": [ + { + "start": 0.0, + "end": 5.0, + "text": "Ladies and gentlemen, welcome to the Annual Humor Conference!", + "speaker_id": "Speaker1", + "confidence": 0.98, + "language": "en", + "style_id": "Style1", + "words": [ + {"start": 0.0, "end": 0.5, "text": "Ladies", "confidence": 0.99}, + {"start": 0.5, "end": 0.7, "text": "and", "confidence": 0.98}, + {"start": 0.7, "end": 1.2, "text": "gentlemen,", "confidence": 0.97}, + {"start": 1.3, "end": 2.0, "text": "welcome", "confidence": 0.99}, + {"start": 2.1, "end": 2.3, "text": "to", "confidence": 0.98}, + {"start": 2.3, "end": 2.5, "text": "the", "confidence": 0.98}, + {"start": 2.6, "end": 3.5, "text": "Annual", "confidence": 0.97}, + {"start": 3.6, "end": 5.0, "text": "Humor Conference!", "confidence": 0.96} + ] + }, + { + 
"start": 12.1, + "end": 17.0, + "text": "¡Y ahora, un poco de humor en español!", + "speaker_id": "Speaker2", + "confidence": 0.94, + "language": "es" + }, + { + "start": 22.1, + "end": 27.0, + "text": "Und jetzt etwas auf Deutsch!", + "speaker_id": "Speaker3", + "confidence": 0.92, + "language": "de" + } + ] + } + } + } + + stj = StandardTranscriptionJSON.from_dict(stj_data) + validation_issues = stj.validate(raise_exception=False) + assert not validation_issues + +def test_confidence_threshold_example(): + """Tests the confidence threshold example from the spec. + + Reference: spec/latest/stj-specification.md#representing-confidence + """ + stj_data = { + "stj": { + "version": "0.6.0", + "metadata": { + "confidence_threshold": 0.8 # High confidence threshold + }, + "transcript": { + "segments": [ + { + "text": "High confidence segment", + "confidence": 0.95, + "words": [ + {"text": "High", "confidence": 0.98, "start": 0.0, "end": 0.5}, + {"text": "confidence", "confidence": 0.96, "start": 0.6, "end": 1.2}, + {"text": "segment", "confidence": 0.92, "start": 1.3, "end": 2.0} + ] + }, + { + "text": "Lower confidence segment", + "confidence": 0.75, # Below threshold + "words": [ + {"text": "Lower", "confidence": 0.78, "start": 2.1, "end": 2.5}, + {"text": "confidence", "confidence": 0.72, "start": 2.6, "end": 3.2}, + {"text": "segment", "confidence": 0.75, "start": 3.3, "end": 4.0} + ] + } + ] + } + } + } + + stj = StandardTranscriptionJSON.from_dict(stj_data) + validation_issues = stj.validate(raise_exception=False) + assert not validation_issues + +def test_srt_style_example(): + """Tests the SRT-style example from the spec. 
+ + Reference: spec/latest/stj-specification.md#srt-subrip + """ + stj_data = { + "stj": { + "version": "0.6.0", + "transcript": { + "segments": [ + { + "start": 0.0, + "end": 2.0, + "text": "First subtitle" + }, + { + "start": 2.1, + "end": 4.0, + "text": "Second subtitle\nwith line break" + } + ] + } + } + } + + stj = StandardTranscriptionJSON.from_dict(stj_data) + validation_issues = stj.validate(raise_exception=False) + assert not validation_issues + +def test_webvtt_style_example(): + """Tests the WebVTT-style example from the spec. + + Reference: spec/latest/stj-specification.md#webvtt + """ + stj_data = { + "stj": { + "version": "0.6.0", + "transcript": { + "styles": [ + { + "id": "webvtt_style", + "text": { + "color": "#FFFFFF", + "background": "#000000" + }, + "display": { + "align": "center", + "vertical": "bottom" + }, + "extensions": { + "custom_webvtt": { + "line": "-2", + "position": "50%" + } + } + } + ], + "segments": [ + { + "start": 0.0, + "end": 2.0, + "text": "Styled subtitle", + "style_id": "webvtt_style" + } + ] + } + } + } + + stj = StandardTranscriptionJSON.from_dict(stj_data) + validation_issues = stj.validate(raise_exception=False) + assert not validation_issues + +def test_ttml_style_example(): + """Tests the TTML-style example from the spec. 
+ + Reference: spec/latest/stj-specification.md#ttml-timed-text-markup-language + """ + stj_data = { + "stj": { + "version": "0.6.0", + "transcript": { + "styles": [ + { + "id": "ttml_style", + "text": { + "color": "#FFFFFF", + "background": "#000000", + "size": "120%" + }, + "extensions": { + "custom_ttml": { + "fontFamily": "Arial", + "textOutline": "black 1px" + } + } + } + ], + "segments": [ + { + "start": 0.0, + "end": 2.0, + "text": "TTML styled text", + "style_id": "ttml_style", + "language": "en" + } + ] + } + } + } + + stj = StandardTranscriptionJSON.from_dict(stj_data) + validation_issues = stj.validate(raise_exception=False) + assert not validation_issues + +def test_extensibility_example(): + """Tests the extensibility example from the spec. + + Reference: spec/latest/stj-specification.md#extensibility-and-customization + """ + stj_data = { + "stj": { + "version": "0.6.0", + "metadata": { + "extensions": { + "custom_app": { + "project_id": "12345", + "workflow": { + "stage": "review", + "assignee": "editor@example.com" + } + } + } + }, + "transcript": { + "segments": [ + { + "text": "Custom segment", + "extensions": { + "analysis": { + "sentiment": "positive", + "keywords": ["custom", "segment"], + "metrics": { + "clarity": 0.95, + "fluency": 0.88 + } + } + } + } + ] + } + } + } + + stj = StandardTranscriptionJSON.from_dict(stj_data) + validation_issues = stj.validate(raise_exception=False) + assert not validation_issues + +def test_reserved_namespace_examples(): + """Tests the reserved namespace examples from the spec. 
+ + Reference: spec/latest/stj-specification.md#reserved-namespaces + """ + # Test invalid use of reserved namespace + stj_data = { + "stj": { + "version": "0.6.0", + "transcript": { + "segments": [ + { + "text": "Test", + "extensions": { + "stj": { # Reserved namespace + "custom": "value" + } + } + } + ] + } + } + } + + stj = StandardTranscriptionJSON.from_dict(stj_data) + validation_issues = stj.validate(raise_exception=False) + assert any("Reserved namespace" in issue.message for issue in validation_issues) + +def test_empty_value_examples(): + """Tests the empty value examples from the spec. + + Reference: spec/latest/stj-specification.md#empty-value-constraints + """ + # Test valid empty arrays + stj_data = { + "stj": { + "version": "0.6.0", + "transcript": { + "speakers": [], # Valid empty array - speakers attempted but none found + "segments": [{"text": "Hello"}] # Cannot be empty + } + } + } + + stj = StandardTranscriptionJSON.from_dict(stj_data) + validation_issues = stj.validate(raise_exception=False) + assert not validation_issues + + # Test invalid empty arrays + invalid_stj_data = { + "stj": { + "version": "0.6.0", + "transcript": { + "segments": [] # Invalid - segments cannot be empty + } + } + } + + with pytest.raises(Exception): + StandardTranscriptionJSON.from_dict(invalid_stj_data).validate() + +def test_language_code_validation(): + """Tests the language code validation examples from the spec. 
+ + Reference: spec/latest/stj-specification.md#language-codes + """ + # Test valid language codes + stj_data = { + "stj": { + "version": "0.6.0", + "metadata": { + "languages": ["en", "fr", "es"] # Valid ISO 639-1 codes + }, + "transcript": { + "segments": [ + { + "text": "Hello", + "language": "en" + }, + { + "text": "Bonjour", + "language": "fr" + } + ] + } + } + } + + stj = StandardTranscriptionJSON.from_dict(stj_data) + validation_issues = stj.validate(raise_exception=False) + + assert not validation_issues + + # Test invalid language codes + invalid_stj = { + "stj": { + "version": "0.6.0", + "metadata": { + "languages": ["eng", "en"] # Mixed ISO 639-1 and 639-3 + }, + "transcript": { + "segments": [{"text": "Hello"}] + } + } + } + + stj = StandardTranscriptionJSON.from_dict(invalid_stj) + validation_issues = stj.validate(raise_exception=False) + assert any("language code" in issue.message for issue in validation_issues) + diff --git a/tests/python/test_stj_to_srt.py b/tests/python/test_stj_to_srt.py index bf529ab..662704b 100644 --- a/tests/python/test_stj_to_srt.py +++ b/tests/python/test_stj_to_srt.py @@ -4,6 +4,7 @@ import tempfile import pytest import stjlib +import json @pytest.fixture def base_dir(): @@ -14,29 +15,51 @@ def test_stj_to_srt_conversion(base_dir): # Add version check print(f"Using stjlib version: {stjlib.__version__}") - # Update path to include 'latest' directory + # Create a temporary STJ file with correct structure + stj_data = { + "stj": { + "version": "0.6.0", + "metadata": { + "created_at": "2023-10-19T15:30:00Z", + "transcriber": { + "name": "YAWT", + "version": "0.4.0" + }, + }, + "transcript": { + "segments": [ + { + "start": 0.0, + "end": 5.0, + "text": "Hello, world!" 
+ } + ] + } + } + } + stj_tool = os.path.join(base_dir, 'tools', 'python', 'stj_to_srt.py') - stj_input = os.path.join(base_dir, 'examples', 'latest', 'simple.stj.json') # Updated path expected_srt = os.path.join(base_dir, 'tests', 'expected_outputs', 'expected_simple.srt') - # Add these checks - assert os.path.exists(stj_tool), f"STJ tool not found at: {stj_tool}" - assert os.path.exists(stj_input), f"Input file not found at: {stj_input}" - with tempfile.TemporaryDirectory() as temp_dir: + # Create temporary input file + input_stj = os.path.join(temp_dir, 'input.stj.json') + with open(input_stj, 'w', encoding='utf-8') as f: + json.dump(stj_data, f, indent=2) + output_srt = os.path.join(temp_dir, 'output_test.srt') - # Add debug output - print(f"Running conversion with:") - print(f"Tool: {stj_tool}") - print(f"Input: {stj_input}") - print(f"Output: {output_srt}") - # Run the conversion - subprocess.run(['python', stj_tool, stj_input, output_srt], check=True) + subprocess.run(['python', stj_tool, input_stj, output_srt], check=True) + + # Read and normalize both files for comparison + with open(output_srt, 'r', encoding='utf-8') as f: + actual_content = f.read().strip() + with open(expected_srt, 'r', encoding='utf-8') as f: + expected_content = f.read().strip() - # Compare output with expected SRT file - assert filecmp.cmp(output_srt, expected_srt), "SRT files do not match." + # Compare normalized content + assert actual_content == expected_content, "SRT files do not match." if __name__ == '__main__': pytest.main() diff --git a/tests/python/test_stj_validator.py b/tests/python/test_stj_validator.py index 8c271db..a93761b 100644 --- a/tests/python/test_stj_validator.py +++ b/tests/python/test_stj_validator.py @@ -1,149 +1,62 @@ +"""Tests for the STJ validator CLI tool. + +This module tests the command-line interface of the STJ validator tool. +It uses pytest's tmp_path fixture to create temporary test files. 
+""" + import os -import json +import subprocess import pytest -from jsonschema import validate, ValidationError, SchemaError -from stj_validator import validate_segments, validate_words # Get the absolute path to the project root PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')) -@pytest.fixture -def schema(): - schema_path = os.path.join(PROJECT_ROOT, 'spec', 'schema', 'stj-schema.json') - with open(schema_path, 'r', encoding='utf-8') as f: - return json.load(f) - -def test_valid_stj_file(schema): - # Update the file path to use PROJECT_ROOT - stj_file_path = os.path.join(PROJECT_ROOT, 'examples', 'simple.stj.json') - with open(stj_file_path, 'r', encoding='utf-8') as f: - stj_data = json.load(f) - validate(instance=stj_data, schema=schema) - -def test_invalid_missing_mandatory_field(schema): - stj_data = { - # 'metadata' is missing - "transcript": { - "segments": [] - } - } - with pytest.raises(ValidationError): - validate(instance=stj_data, schema=schema) - -def test_invalid_wrong_data_type(schema): - stj_data = { - "metadata": { - "transcriber": { - "name": "YAWT", - "version": "0.1.0" - }, - "created_at": "2023-10-19T15:30:00Z" - }, - "transcript": { - "segments": [ - { - "start": "not a number", # Incorrect data type - "end": 5.0, - "text": "Sample text" - } - ] - } - } - with pytest.raises(ValidationError): - validate(instance=stj_data, schema=schema) - -def test_invalid_additional_properties(schema): - stj_data = { - "metadata": { - "transcriber": { - "name": "YAWT", - "version": "0.1.0" - }, - "created_at": "2023-10-19T15:30:00Z", - "unexpected_field": "unexpected" - }, - "transcript": { - "segments": [ - { - "start": 0.0, - "end": 5.0, - "text": "Sample text" - } - ] - } - } - with pytest.raises(ValidationError): - validate(instance=stj_data, schema=schema) - -def test_schema_error(): - invalid_schema = { - "type": "invalid_type" # Invalid schema - } - stj_data = {} - with pytest.raises(SchemaError): - 
validate(instance=stj_data, schema=invalid_schema) - -def test_overlapping_segments(schema): - stj_data = { - "metadata": { - "transcriber": {"name": "Test", "version": "1.0"}, - "created_at": "2023-10-21T12:00:00Z" - }, - "transcript": { - "segments": [ - {"start": 0.0, "end": 5.0, "text": "First segment"}, - {"start": 4.5, "end": 10.0, "text": "Second segment overlaps"} - ] - } - } - with pytest.raises(ValueError, match="Segments overlap or are out of order"): - validate(stj_data, schema) - validate_segments(stj_data['transcript']['segments']) - -def test_invalid_word_timing_mode(schema): - stj_data = { - "metadata": { - "transcriber": {"name": "Test", "version": "1.0"}, - "created_at": "2023-10-21T12:00:00Z" - }, - "transcript": { - "segments": [ - { - "start": 0.0, - "end": 5.0, - "text": "Hello world", - "word_timing_mode": "complete", - "words": [ - {"start": 0.0, "end": 1.0, "text": "Hello"} - # Missing "world" in words array - ] - } - ] - } - } - with pytest.raises(ValueError, match="Concatenated words do not match segment text"): - validate(stj_data, schema) - validate_words(stj_data['transcript']['segments'][0]) - -def test_zero_duration_word_without_flag(schema): - stj_data = { - "metadata": { - "transcriber": {"name": "Test", "version": "1.0"}, - "created_at": "2023-10-21T12:00:00Z" - }, - "transcript": { - "segments": [ - { - "start": 0.0, - "end": 5.0, - "text": "Zero duration word", - "words": [ - {"start": 1.0, "end": 1.0, "text": "Zero"} - ] - } - ] - } - } - with pytest.raises(ValueError, match="Zero-duration word at 1.0 without 'word_duration' set to 'zero'"): - validate(stj_data, schema) - validate_words(stj_data['transcript']['segments'][0]) +def test_cli_valid_file(): + """Test CLI with a valid STJ file.""" + stj_file_path = os.path.join(PROJECT_ROOT, 'examples', 'latest', 'simple.stj.json') + result = subprocess.run( + ['python', 'tools/python/stj_validator.py', stj_file_path], + capture_output=True, + text=True + ) + assert result.returncode 
== 0 + assert "Validation successful!" in result.stdout + +def test_cli_invalid_file(tmp_path): + """Test CLI with an invalid STJ file. + + Args: + tmp_path: Pytest fixture that provides a temporary directory unique to each test function. + See: https://docs.pytest.org/en/stable/tmpdir.html + """ + # Create invalid test file in the temporary directory + invalid_file = tmp_path / "invalid.stj.json" + invalid_file.write_text('{"invalid": "json"}') + + result = subprocess.run( + ['python', 'tools/python/stj_validator.py', str(invalid_file)], + capture_output=True, + text=True + ) + assert result.returncode == 1 + assert "Validation failed" in result.stdout + +def test_cli_missing_file(): + """Test CLI with a non-existent file.""" + result = subprocess.run( + ['python', 'tools/python/stj_validator.py', 'nonexistent.stj.json'], + capture_output=True, + text=True + ) + assert result.returncode == 1 + assert "File not found" in result.stdout + +def test_cli_no_arguments(): + """Test CLI with no arguments.""" + result = subprocess.run( + ['python', 'tools/python/stj_validator.py'], + capture_output=True, + text=True + ) + assert result.returncode != 0 + assert "error: the following arguments are required: stj_file" in result.stderr diff --git a/tests/python/test_validator.py b/tests/python/test_validator.py index 29fa532..346c3cc 100644 --- a/tests/python/test_validator.py +++ b/tests/python/test_validator.py @@ -2,81 +2,65 @@ import os import pytest from stjlib import StandardTranscriptionJSON -from jsonschema import validate # Add this import +from stjlib.stj import ValidationError PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')) -@pytest.fixture -def schema(): - # Load the schema - schema_path = os.path.join(PROJECT_ROOT, 'spec', 'schema', 'stj-schema.json') - with open(schema_path, 'r', encoding='utf-8') as f: - return json.load(f) - def test_valid_stj_file(): - stj_file_path = os.path.join(PROJECT_ROOT, 'examples', 'latest', 
'simple.stj.json') + # Test with the example file + stj_file_path = os.path.join(PROJECT_ROOT, 'examples', 'v0.6.0', 'simple.stj.json') stj = StandardTranscriptionJSON.from_file(stj_file_path) validation_issues = stj.validate(raise_exception=False) assert not validation_issues def test_invalid_missing_mandatory_field(): stj_data = { - "metadata": { - "created_at": "2024-10-19T15:30:00Z", - "version": "0.5.0" - # 'transcriber' is still missing, which should cause validation to fail - }, - "transcript": { - "segments": [] + "stj": { + "transcript": { + "segments": [ + { + "text": "Hello, world!" + } + ] + } } } - stj = StandardTranscriptionJSON.from_dict(stj_data) - validation_issues = stj.validate(raise_exception=False) - assert validation_issues # Should have validation issues + with pytest.raises(ValidationError, match="STJ version is required"): + StandardTranscriptionJSON.from_dict(stj_data) def test_invalid_wrong_data_type(): stj_data = { - "metadata": { - "transcriber": { - "name": "test_validator", - "version": "0.1.0" - }, - "created_at": "2024-10-19T15:30:00Z", - "version": "0.5.0" - }, - "transcript": { - "segments": [ - { - "start": "not a number", # Incorrect data type - "end": 5.0, - "text": "Sample text" - } - ] + "stj": { + "version": "0.6.0", + "transcript": { + "segments": [ + { + "start": "not a number", # Incorrect data type + "end": 5.0, + "text": "Sample text" + } + ] + } } } stj = StandardTranscriptionJSON.from_dict(stj_data) - with pytest.raises(AttributeError): - stj.validate(raise_exception=True) + validation_issues = stj.validate(raise_exception=False) + assert validation_issues # Should have validation issues def test_invalid_additional_properties(): stj_data = { - "metadata": { - "transcriber": { - "name": "test_validator", - "version": "0.1.0" - }, - "created_at": "2024-10-19T15:30:00Z", - "version": "0.5.0", - "unexpected_field": "unexpected" - }, - "transcript": { - "segments": [ - { - "start": 0.0, - "end": 5.0, - "text": "Sample 
text" - } - ] + "stj": { + "version": "0.6.0", + "unexpected_field": "unexpected", # Invalid additional property + "transcript": { + "segments": [ + { + "start": 0.0, + "end": 5.0, + "text": "Sample text" + } + ] + } } } stj = StandardTranscriptionJSON.from_dict(stj_data) @@ -88,6 +72,13 @@ def test_schema_error(): invalid_schema = { "type": "invalid_type" # Invalid schema } - stj_data = {} - with pytest.raises(jsonschema.SchemaError): - validate(instance=stj_data, schema=invalid_schema) + stj_data = { + "stj": { + "version": "0.6.0", + "transcript": { + "segments": [] # Empty segments array should cause validation error + } + } + } + validation_issues = StandardTranscriptionJSON.from_dict(stj_data).validate(raise_exception=False) + assert validation_issues # Should have validation issues diff --git a/tools/javascript/stj-validator.js b/tools/javascript/stj-validator.js index df65b25..8e48716 100644 --- a/tools/javascript/stj-validator.js +++ b/tools/javascript/stj-validator.js @@ -111,88 +111,139 @@ function validateSpeakersAndStyles(data) { function validateSegments(data) { const segments = data.transcript.segments; let previousEnd = -1; + let hasTimingInfo = false; - segments.forEach(segment => { - const { start, end, text, words, word_timing_mode } = segment; + segments.forEach((segment, index) => { + const { start, end, text, words, word_timing_mode, is_zero_duration } = segment; - // Check start and end times - if (start > end) { - throw new Error(`Segment start time ${start} is greater than end time ${end}`); + // Check if any segment has timing info + if (start !== undefined || end !== undefined) { + hasTimingInfo = true; } - // Check for overlapping segments - if (start < previousEnd) { - throw new Error(`Segments overlap or are out of order at time ${start}`); - } + // If timing info exists, validate both start and end are present + if (hasTimingInfo) { + if (start === undefined || end === undefined) { + throw new Error(`Segment ${index}: both start and 
end times must be present if either is provided`); + } + + // Validate time values + validateTimeValue(start, `segment ${index} start time`); + validateTimeValue(end, `segment ${index} end time`); - // Check for zero-duration segments - if (start === end) { - const segmentDuration = segment.additional_info?.segment_duration; - if (segmentDuration !== 'zero') { - throw new Error(`Zero-duration segment at ${start} without 'segment_duration' set to 'zero'`); + // Check basic timing constraints + if (start > end) { + throw new Error(`Segment start time ${start} is greater than end time ${end} in segment ${index}`); } + + if (start < previousEnd) { + throw new Error(`Segments overlap or are out of order at time ${start} in segment ${index}`); + } + + // Validate zero-duration segments + validateZeroDuration(start, end, is_zero_duration, `segment ${index}`); + + // Zero-duration segments must not have words or word_timing_mode + if (is_zero_duration) { + if (words || word_timing_mode) { + throw new Error(`Zero-duration segment at ${start} must not have words or word_timing_mode`); + } + } + + previousEnd = end; } - // Validate words within the segment - validateWords(segment); + // Validate words if present + if (words) { + validateWords(segment, index); + } // Validate confidence scores - const confidence = segment.confidence; - if (confidence !== undefined && (confidence < 0.0 || confidence > 1.0)) { - throw new Error(`Segment confidence ${confidence} out of range [0.0, 1.0] in segment starting at ${start}`); + if (segment.confidence !== undefined && segment.confidence !== null) { + if (typeof segment.confidence !== 'number' || + segment.confidence < 0.0 || + segment.confidence > 1.0) { + throw new Error(`Segment confidence ${segment.confidence} out of range [0.0, 1.0] in segment ${index}`); + } } - - previousEnd = end; }); + + // If any segment has timing info, all segments must have it + if (hasTimingInfo) { + segments.forEach((segment, index) => { + if 
(segment.start === undefined || segment.end === undefined) { + throw new Error(`Segment ${index} missing timing information when other segments have it`); + } + }); + } } -function validateWords(segment) { +function validateWords(segment, segmentIndex) { const words = segment.words || []; const wordTimingMode = segment.word_timing_mode || (words.length ? 'complete' : 'none'); const { start: segmentStart, end: segmentEnd } = segment; + // Validate word_timing_mode if (!['complete', 'partial', 'none'].includes(wordTimingMode)) { - throw new Error(`Invalid 'word_timing_mode' in segment starting at ${segmentStart}`); + throw new Error(`Invalid 'word_timing_mode' in segment ${segmentIndex}`); + } + + // Handle different word timing modes + if (wordTimingMode === 'none') { + if (words.length > 0) { + throw new Error(`'word_timing_mode' is 'none' but words are provided in segment ${segmentIndex}`); + } + return; } - if (wordTimingMode !== 'none' && words.length === 0) { - throw new Error(`'word_timing_mode' is '${wordTimingMode}' but no words are provided in segment starting at ${segmentStart}`); + // Validate words presence for non-'none' modes + if (words.length === 0) { + if (wordTimingMode === 'partial') { + throw new Error(`'word_timing_mode' is 'partial' but no words are provided in segment ${segmentIndex}`); + } } let previousWordEnd = segmentStart; let concatenatedWords = ''; - words.forEach(word => { - const { start: wordStart, end: wordEnd, text: wordText } = word; + words.forEach((word, wordIndex) => { + const { start: wordStart, end: wordEnd, text: wordText, is_zero_duration } = word; + + // Validate required fields + if (!wordText) { + throw new Error(`Missing word text in segment ${segmentIndex}, word ${wordIndex}`); + } + + // Validate time values + validateTimeValue(wordStart, `segment ${segmentIndex}, word ${wordIndex} start time`); + validateTimeValue(wordEnd, `segment ${segmentIndex}, word ${wordIndex} end time`); // Check word timings if (wordStart > 
wordEnd) { - throw new Error(`Word start time ${wordStart} is greater than end time ${wordEnd} in segment starting at ${segmentStart}`); + throw new Error(`Word start time ${wordStart} is greater than end time ${wordEnd} in segment ${segmentIndex}, word ${wordIndex}`); } if (wordStart < segmentStart || wordEnd > segmentEnd) { - throw new Error(`Word timings are outside segment timings in segment starting at ${segmentStart}`); + throw new Error(`Word timings are outside segment timings in segment ${segmentIndex}, word ${wordIndex}`); } if (wordStart < previousWordEnd) { - throw new Error(`Words overlap or are out of order in segment starting at ${segmentStart}`); + throw new Error(`Words overlap or are out of order in segment ${segmentIndex}, word ${wordIndex}`); } - // Check for zero-duration words - if (wordStart === wordEnd) { - const wordDuration = word.additional_info?.word_duration; - if (wordDuration !== 'zero') { - throw new Error(`Zero-duration word at ${wordStart} without 'word_duration' set to 'zero'`); - } - } + // Validate zero-duration words + validateZeroDuration(wordStart, wordEnd, is_zero_duration, `word in segment ${segmentIndex}, word ${wordIndex}`); previousWordEnd = wordEnd; concatenatedWords += wordText + ' '; // Validate word confidence - const wordConfidence = word.confidence; - if (wordConfidence !== undefined && (wordConfidence < 0.0 || wordConfidence > 1.0)) { - throw new Error(`Word confidence ${wordConfidence} out of range [0.0, 1.0] in segment starting at ${segmentStart}`); + if (word.confidence !== undefined && word.confidence !== null) { + if (typeof word.confidence !== 'number' || + word.confidence < 0.0 || + word.confidence > 1.0) { + throw new Error(`Word confidence ${word.confidence} out of range [0.0, 1.0] in segment ${segmentIndex}, word ${wordIndex}`); + } } }); @@ -200,10 +251,51 @@ function validateWords(segment) { if (wordTimingMode === 'complete') { const segmentText = segment.text; const normalizedSegmentText = 
segmentText.replace(/\s+/g, ''); - const normalizedWordsText = concatenatedWords.replace(/\s+/g, ''); + const normalizedWordsText = concatenatedWords.trim().replace(/\s+/g, ''); if (normalizedWordsText !== normalizedSegmentText) { - throw new Error(`Concatenated words do not match segment text in segment starting at ${segmentStart}`); + throw new Error(`Concatenated words do not match segment text in segment ${segmentIndex}`); + } + } +} + +// Add these utility functions at the top level +function validateTimeValue(time, context) { + // Check basic type and format + if (typeof time !== 'number') { + throw new Error(`Invalid time value in ${context}: must be a number`); + } + + // Check range and format requirements + if (time < 0) { + throw new Error(`Invalid time value in ${context}: negative values not allowed`); + } + + if (time > 999999.999) { + throw new Error(`Invalid time value in ${context}: exceeds maximum allowed value`); + } + + // Convert to string and check format + const timeStr = time.toString(); + if (timeStr.includes('e') || timeStr.includes('E')) { + throw new Error(`Invalid time value in ${context}: scientific notation not allowed`); + } + + // Check decimal places + const parts = timeStr.split('.'); + if (parts[1] && parts[1].length > 3) { + throw new Error(`Invalid time value in ${context}: maximum 3 decimal places allowed`); + } +} + +function validateZeroDuration(start, end, isZeroDuration, context) { + if (start === end) { + if (!isZeroDuration) { + throw new Error(`Zero-duration ${context} must have 'is_zero_duration' set to true`); + } + } else { + if (isZeroDuration) { + throw new Error(`Non-zero-duration ${context} must not have 'is_zero_duration' field`); } } } diff --git a/tools/python/stj_validator.py b/tools/python/stj_validator.py index e6e5a20..dde79f6 100644 --- a/tools/python/stj_validator.py +++ b/tools/python/stj_validator.py @@ -10,10 +10,8 @@ def main(): args = parser.parse_args() try: - # Load STJ file without validation 
first + # Load and validate STJ file using stjlib stj = StandardTranscriptionJSON.from_file(args.stj_file, validate=False) - - # Run validation and get list of issues, without raising exceptions validation_issues = stj.validate(raise_exception=False) if not validation_issues: