From 1aae6ac5cc0b81b10a01f39a83d38cc31507f609 Mon Sep 17 00:00:00 2001 From: Lucain Date: Tue, 24 Sep 2024 14:07:19 +0200 Subject: [PATCH] Update TGI specs (+ define TextToSpeech independently) (#915) This PR pulls the latest changes from TGI specs. In particular: - updated `ChatCompletionInputMessageContent` => now supports single message (as before), array of messages and **sending image as input** (in case of VLMs) - new input: `stream_options` - tool inputs have been updated - more documented fields **Note:** TGI specs have a `guideline` input parameter for chat completion. I removed it from the official specs as it's a very specific parameter and subject to deletion in the near future (it was there to fix a chat template issue for `ShieldGemma` models). More context [here](https://huggingface.slack.com/archives/C05CFK1HM0T/p1726822675255589?thread_ts=1724852501.099809&cid=C05CFK1HM0T) (internal). **Note:** I also took the opportunity to define the `TextToSpeech` specs independently from `TextToAudio` (otherwise the semi-automatic scripts are not working properly). Nothing has changed in the specs. I'll open a related PR on the `huggingface_hub` side to reflect this change. 
--- .../tasks/scripts/inference-tei-import.ts | 4 +- .../tasks/scripts/inference-tgi-import.ts | 4 +- .../src/tasks/chat-completion/inference.ts | 87 ++++++-- .../src/tasks/chat-completion/spec/input.json | 203 ++++++++++++++---- .../tasks/chat-completion/spec/output.json | 46 ++-- .../chat-completion/spec/stream_output.json | 71 ++++-- packages/tasks/src/tasks/index.ts | 2 +- .../src/tasks/text-generation/inference.ts | 62 ++++++ .../src/tasks/text-generation/spec/input.json | 24 +++ .../text-generation/spec/stream_output.json | 8 +- .../src/tasks/text-to-speech/inference.ts | 10 +- .../src/tasks/text-to-speech/spec/input.json | 28 ++- 12 files changed, 444 insertions(+), 105 deletions(-) diff --git a/packages/tasks/scripts/inference-tei-import.ts b/packages/tasks/scripts/inference-tei-import.ts index b56ed1d5f..2765294a7 100644 --- a/packages/tasks/scripts/inference-tei-import.ts +++ b/packages/tasks/scripts/inference-tei-import.ts @@ -73,7 +73,9 @@ async function _extractAndAdapt(task: string, mainComponentName: string, type: " } // Add reference to components to export (and scan it too) - const newRef = camelFullName + ref.replace(camelName, ""); + let newRef = camelFullName + ref.replace(camelName, ""); + // remove duplicated InputInput or OutputOutput in naming + newRef = newRef.replace("InputInput", "Input").replace("OutputOutput", "Output"); if (!filteredComponents[newRef]) { components[ref]["title"] = newRef; // Rename title to avoid conflicts filteredComponents[newRef] = components[ref]; diff --git a/packages/tasks/scripts/inference-tgi-import.ts b/packages/tasks/scripts/inference-tgi-import.ts index 49be3b1e9..852d1dc53 100644 --- a/packages/tasks/scripts/inference-tgi-import.ts +++ b/packages/tasks/scripts/inference-tgi-import.ts @@ -66,7 +66,9 @@ async function _extractAndAdapt(task: string, mainComponentName: string, type: " } // Add reference to components to export (and scan it too) - const newRef = camelFullName + ref.replace(camelName, ""); + let 
newRef = camelFullName + ref.replace(camelName, ""); + // remove duplicated InputInput or OutputOutput in naming + newRef = newRef.replace("InputInput", "Input").replace("OutputOutput", "Output"); if (!filteredComponents[newRef]) { components[ref]["title"] = newRef; // Rename title to avoid conflicts filteredComponents[newRef] = components[ref]; diff --git a/packages/tasks/src/tasks/chat-completion/inference.ts b/packages/tasks/src/tasks/chat-completion/inference.ts index 488a1e87e..febaffc8f 100644 --- a/packages/tasks/src/tasks/chat-completion/inference.ts +++ b/packages/tasks/src/tasks/chat-completion/inference.ts @@ -49,7 +49,7 @@ export interface ChatCompletionInput { * [UNUSED] ID of the model to use. See the model endpoint compatibility table for details * on which models work with the Chat API. */ - model: string; + model?: string; /** * UNUSED * How many chat completion choices to generate for each input message. Note that you will @@ -63,12 +63,14 @@ export interface ChatCompletionInput { * increasing the model's likelihood to talk about new topics */ presence_penalty?: number; + response_format?: ChatCompletionInputGrammarType; seed?: number; /** * Up to 4 sequences where the API will stop generating further tokens. */ stop?: string[]; stream?: boolean; + stream_options?: ChatCompletionInputStreamOptions; /** * What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the * output more random, while @@ -77,7 +79,7 @@ export interface ChatCompletionInput { * We generally recommend altering this or `top_p` but not both. */ temperature?: number; - tool_choice?: ChatCompletionInputToolType; + tool_choice?: ChatCompletionInputTool; /** * A prompt to be appended before the tools */ @@ -87,7 +89,7 @@ export interface ChatCompletionInput { * Use this to provide a list of * functions the model may generate JSON inputs for. 
*/ - tools?: ChatCompletionInputTool[]; + tools?: ToolElement[]; /** * An integer between 0 and 5 specifying the number of most likely tokens to return at each * token position, each with @@ -105,40 +107,78 @@ export interface ChatCompletionInput { } export interface ChatCompletionInputMessage { - content?: string; + content: ChatCompletionInputMessageContent; name?: string; role: string; - tool_calls?: ChatCompletionInputToolCall[]; [property: string]: unknown; } -export interface ChatCompletionInputToolCall { - function: ChatCompletionInputFunctionDefinition; - id: number; - type: string; +export type ChatCompletionInputMessageContent = ChatCompletionInputMessageChunk[] | string; + +export interface ChatCompletionInputMessageChunk { + image_url?: ChatCompletionInputURL; + text?: string; + type: ChatCompletionInputMessageChunkType; [property: string]: unknown; } -export interface ChatCompletionInputFunctionDefinition { - arguments: unknown; - description?: string; - name: string; +export interface ChatCompletionInputURL { + url: string; [property: string]: unknown; } -export type ChatCompletionInputToolType = "OneOf" | ChatCompletionInputToolTypeObject; +export type ChatCompletionInputMessageChunkType = "text" | "image_url"; -export interface ChatCompletionInputToolTypeObject { - FunctionName: string; +export interface ChatCompletionInputGrammarType { + type: ChatCompletionInputGrammarTypeType; + /** + * A string that represents a [JSON Schema](https://json-schema.org/). + * + * JSON Schema is a declarative language that allows to annotate JSON documents + * with types and descriptions. + */ + value: unknown; + [property: string]: unknown; +} + +export type ChatCompletionInputGrammarTypeType = "json" | "regex"; + +export interface ChatCompletionInputStreamOptions { + /** + * If set, an additional chunk will be streamed before the data: [DONE] message. 
The usage + * field on this chunk shows the token usage statistics for the entire request, and the + * choices field will always be an empty array. All other chunks will also include a usage + * field, but with a null value. + */ + include_usage: boolean; [property: string]: unknown; } -export interface ChatCompletionInputTool { +export type ChatCompletionInputTool = ChatCompletionInputToolType | string; + +export interface ChatCompletionInputToolType { + function?: ChatCompletionInputFunctionName; + [property: string]: unknown; +} + +export interface ChatCompletionInputFunctionName { + name: string; + [property: string]: unknown; +} + +export interface ToolElement { function: ChatCompletionInputFunctionDefinition; type: string; [property: string]: unknown; } +export interface ChatCompletionInputFunctionDefinition { + arguments: unknown; + description?: string; + name: string; + [property: string]: unknown; +} + /** * Chat Completion Output. * @@ -151,7 +191,6 @@ export interface ChatCompletionOutput { created: number; id: string; model: string; - object: string; system_fingerprint: string; usage: ChatCompletionOutputUsage; [property: string]: unknown; @@ -185,7 +224,6 @@ export interface ChatCompletionOutputTopLogprob { export interface ChatCompletionOutputMessage { content?: string; - name?: string; role: string; tool_calls?: ChatCompletionOutputToolCall[]; [property: string]: unknown; @@ -193,7 +231,7 @@ export interface ChatCompletionOutputMessage { export interface ChatCompletionOutputToolCall { function: ChatCompletionOutputFunctionDefinition; - id: number; + id: string; type: string; [property: string]: unknown; } @@ -224,8 +262,8 @@ export interface ChatCompletionStreamOutput { created: number; id: string; model: string; - object: string; system_fingerprint: string; + usage?: ChatCompletionStreamOutputUsage; [property: string]: unknown; } @@ -275,3 +313,10 @@ export interface ChatCompletionStreamOutputTopLogprob { token: string; [property: string]: unknown; 
} + +export interface ChatCompletionStreamOutputUsage { + completion_tokens: number; + prompt_tokens: number; + total_tokens: number; + [property: string]: unknown; +} diff --git a/packages/tasks/src/tasks/chat-completion/spec/input.json b/packages/tasks/src/tasks/chat-completion/spec/input.json index 0b549cd58..86ca23c82 100644 --- a/packages/tasks/src/tasks/chat-completion/spec/input.json +++ b/packages/tasks/src/tasks/chat-completion/spec/input.json @@ -4,7 +4,7 @@ "description": "Chat Completion Input.\n\nAuto-generated from TGI specs.\nFor more details, check out https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tgi-import.ts.", "title": "ChatCompletionInput", "type": "object", - "required": ["model", "messages"], + "required": ["messages"], "properties": { "frequency_penalty": { "type": "number", @@ -47,7 +47,8 @@ "model": { "type": "string", "description": "[UNUSED] ID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.", - "example": "mistralai/Mistral-7B-Instruct-v0.2" + "example": "mistralai/Mistral-7B-Instruct-v0.2", + "nullable": true }, "n": { "type": "integer", @@ -64,6 +65,15 @@ "example": 0.1, "nullable": true }, + "response_format": { + "allOf": [ + { + "$ref": "#/$defs/ChatCompletionInputGrammarType" + } + ], + "default": "null", + "nullable": true + }, "seed": { "type": "integer", "format": "int64", @@ -83,6 +93,14 @@ "stream": { "type": "boolean" }, + "stream_options": { + "allOf": [ + { + "$ref": "#/$defs/ChatCompletionInputStreamOptions" + } + ], + "nullable": true + }, "temperature": { "type": "number", "format": "float", @@ -93,7 +111,7 @@ "tool_choice": { "allOf": [ { - "$ref": "#/$defs/ChatCompletionInputToolType" + "$ref": "#/$defs/ChatCompletionInputToolChoice" } ], "nullable": true @@ -101,7 +119,7 @@ "tool_prompt": { "type": "string", "description": "A prompt to be appended before the tools", - "example": "\"You will be presented with 
a JSON schema representing a set of tools.\nIf the user request lacks of sufficient information to make a precise tool selection: Do not invent any tool's properties, instead notify with an error message.\n\nJSON Schema:\n\"", + "example": "Given the functions available, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. Respond in the format {name: function name, parameters: dictionary of argument name and its value}.Do not use variables.", "nullable": true }, "tools": { @@ -132,12 +150,10 @@ "$defs": { "ChatCompletionInputMessage": { "type": "object", - "required": ["role"], + "required": ["role", "content"], "properties": { "content": { - "type": "string", - "example": "My name is David and I", - "nullable": true + "$ref": "#/$defs/ChatCompletionInputMessageContent" }, "name": { "type": "string", @@ -147,68 +163,160 @@ "role": { "type": "string", "example": "user" + } + }, + "title": "ChatCompletionInputMessage" + }, + "ChatCompletionInputMessageContent": { + "oneOf": [ + { + "type": "string" }, - "tool_calls": { + { "type": "array", "items": { - "$ref": "#/$defs/ChatCompletionInputToolCall" - }, - "nullable": true + "$ref": "#/$defs/ChatCompletionInputMessageChunk" + } + } + ], + "title": "ChatCompletionInputMessageContent" + }, + "ChatCompletionInputMessageChunk": { + "oneOf": [ + { + "type": "object", + "required": ["text", "type"], + "properties": { + "text": { + "type": "string" + }, + "type": { + "type": "string", + "enum": ["text"] + } + } + }, + { + "type": "object", + "required": ["image_url", "type"], + "properties": { + "image_url": { + "$ref": "#/$defs/ChatCompletionInputUrl" + }, + "type": { + "type": "string", + "enum": ["image_url"] + } + } } + ], + "discriminator": { + "propertyName": "type" }, - "title": "ChatCompletionInputMessage" + "title": "ChatCompletionInputMessageChunk" }, - "ChatCompletionInputToolCall": { + "ChatCompletionInputUrl": { "type": "object", - "required": ["id", 
"type", "function"], + "required": ["url"], "properties": { - "function": { - "$ref": "#/$defs/ChatCompletionInputFunctionDefinition" - }, - "id": { - "type": "integer", - "format": "int32", - "minimum": 0 - }, - "type": { + "url": { "type": "string" } }, - "title": "ChatCompletionInputToolCall" + "title": "ChatCompletionInputUrl" }, - "ChatCompletionInputFunctionDefinition": { + "ChatCompletionInputGrammarType": { + "oneOf": [ + { + "type": "object", + "required": ["type", "value"], + "properties": { + "type": { + "type": "string", + "enum": ["json"] + }, + "value": { + "description": "A string that represents a [JSON Schema](https://json-schema.org/).\n\nJSON Schema is a declarative language that allows to annotate JSON documents\nwith types and descriptions." + } + } + }, + { + "type": "object", + "required": ["type", "value"], + "properties": { + "type": { + "type": "string", + "enum": ["regex"] + }, + "value": { + "type": "string" + } + } + } + ], + "discriminator": { + "propertyName": "type" + }, + "title": "ChatCompletionInputGrammarType" + }, + "ChatCompletionInputStreamOptions": { "type": "object", - "required": ["name", "arguments"], + "required": ["include_usage"], "properties": { - "arguments": {}, - "description": { - "type": "string", - "nullable": true - }, - "name": { - "type": "string" + "include_usage": { + "type": "boolean", + "description": "If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. 
All other chunks will also include a usage field, but with a null value.", + "example": "true" } }, - "title": "ChatCompletionInputFunctionDefinition" + "title": "ChatCompletionInputStreamOptions" + }, + "ChatCompletionInputToolChoice": { + "allOf": [ + { + "$ref": "#/$defs/ChatCompletionInputToolType" + } + ], + "nullable": true, + "title": "ChatCompletionInputToolChoice" }, "ChatCompletionInputToolType": { "oneOf": [ { "type": "object", - "required": ["FunctionName"], + "default": null, + "nullable": true + }, + { + "type": "string" + }, + { + "type": "object", + "required": ["function"], "properties": { - "FunctionName": { - "type": "string" + "function": { + "$ref": "#/$defs/ChatCompletionInputFunctionName" } } }, { - "type": "string", - "enum": ["OneOf"] + "type": "object", + "default": null, + "nullable": true } ], "title": "ChatCompletionInputToolType" }, + "ChatCompletionInputFunctionName": { + "type": "object", + "required": ["name"], + "properties": { + "name": { + "type": "string" + } + }, + "title": "ChatCompletionInputFunctionName" + }, "ChatCompletionInputTool": { "type": "object", "required": ["type", "function"], @@ -222,6 +330,21 @@ } }, "title": "ChatCompletionInputTool" + }, + "ChatCompletionInputFunctionDefinition": { + "type": "object", + "required": ["name", "arguments"], + "properties": { + "arguments": {}, + "description": { + "type": "string", + "nullable": true + }, + "name": { + "type": "string" + } + }, + "title": "ChatCompletionInputFunctionDefinition" } } } diff --git a/packages/tasks/src/tasks/chat-completion/spec/output.json b/packages/tasks/src/tasks/chat-completion/spec/output.json index 5b602ccd6..ce808bf1b 100644 --- a/packages/tasks/src/tasks/chat-completion/spec/output.json +++ b/packages/tasks/src/tasks/chat-completion/spec/output.json @@ -4,7 +4,7 @@ "description": "Chat Completion Output.\n\nAuto-generated from TGI specs.\nFor more details, check out 
https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tgi-import.ts.", "title": "ChatCompletionOutput", "type": "object", - "required": ["id", "object", "created", "model", "system_fingerprint", "choices", "usage"], + "required": ["id", "created", "model", "system_fingerprint", "choices", "usage"], "properties": { "choices": { "type": "array", @@ -25,9 +25,6 @@ "type": "string", "example": "mistralai/Mistral-7B-Instruct-v0.2" }, - "object": { - "type": "string" - }, "system_fingerprint": { "type": "string" }, @@ -110,32 +107,47 @@ "title": "ChatCompletionOutputTopLogprob" }, "ChatCompletionOutputMessage": { + "oneOf": [ + { + "$ref": "#/$defs/ChatCompletionOutputTextMessage" + }, + { + "$ref": "#/$defs/ChatCompletionOutputToolCallMessage" + } + ], + "title": "ChatCompletionOutputMessage" + }, + "ChatCompletionOutputTextMessage": { "type": "object", - "required": ["role"], + "required": ["role", "content"], "properties": { "content": { "type": "string", - "example": "My name is David and I", - "nullable": true - }, - "name": { - "type": "string", - "example": "\"David\"", - "nullable": true + "example": "My name is David and I" }, "role": { "type": "string", "example": "user" + } + }, + "title": "ChatCompletionOutputTextMessage" + }, + "ChatCompletionOutputToolCallMessage": { + "type": "object", + "required": ["role", "tool_calls"], + "properties": { + "role": { + "type": "string", + "example": "assistant" }, "tool_calls": { "type": "array", "items": { "$ref": "#/$defs/ChatCompletionOutputToolCall" - }, - "nullable": true + } } }, - "title": "ChatCompletionOutputMessage" + "title": "ChatCompletionOutputToolCallMessage" }, "ChatCompletionOutputToolCall": { "type": "object", @@ -145,9 +157,7 @@ "$ref": "#/$defs/ChatCompletionOutputFunctionDefinition" }, "id": { - "type": "integer", - "format": "int32", - "minimum": 0 + "type": "string" }, "type": { "type": "string" diff --git 
a/packages/tasks/src/tasks/chat-completion/spec/stream_output.json b/packages/tasks/src/tasks/chat-completion/spec/stream_output.json index 72575d913..53d9c55be 100644 --- a/packages/tasks/src/tasks/chat-completion/spec/stream_output.json +++ b/packages/tasks/src/tasks/chat-completion/spec/stream_output.json @@ -4,7 +4,7 @@ "description": "Chat Completion Stream Output.\n\nAuto-generated from TGI specs.\nFor more details, check out https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tgi-import.ts.", "title": "ChatCompletionStreamOutput", "type": "object", - "required": ["id", "object", "created", "model", "system_fingerprint", "choices"], + "required": ["id", "created", "model", "system_fingerprint", "choices"], "properties": { "choices": { "type": "array", @@ -25,11 +25,16 @@ "type": "string", "example": "mistralai/Mistral-7B-Instruct-v0.2" }, - "object": { - "type": "string" - }, "system_fingerprint": { "type": "string" + }, + "usage": { + "allOf": [ + { + "$ref": "#/$defs/ChatCompletionStreamOutputUsage" + } + ], + "nullable": true } }, "$defs": { @@ -61,28 +66,44 @@ "title": "ChatCompletionStreamOutputChoice" }, "ChatCompletionStreamOutputDelta": { + "oneOf": [ + { + "$ref": "#/$defs/ChatCompletionStreamOutputTextMessage" + }, + { + "$ref": "#/$defs/ChatCompletionStreamOutputToolCallDelta" + } + ], + "title": "ChatCompletionStreamOutputDelta" + }, + "ChatCompletionStreamOutputTextMessage": { "type": "object", - "required": ["role"], + "required": ["role", "content"], "properties": { "content": { "type": "string", - "example": "What is Deep Learning?", - "nullable": true + "example": "My name is David and I" }, "role": { "type": "string", "example": "user" + } + }, + "title": "ChatCompletionStreamOutputTextMessage" + }, + "ChatCompletionStreamOutputToolCallDelta": { + "type": "object", + "required": ["role", "tool_calls"], + "properties": { + "role": { + "type": "string", + "example": "assistant" }, "tool_calls": { - "allOf": 
[ - { - "$ref": "#/$defs/ChatCompletionStreamOutputDeltaToolCall" - } - ], - "nullable": true + "$ref": "#/$defs/ChatCompletionStreamOutputDeltaToolCall" } }, - "title": "ChatCompletionStreamOutputDelta" + "title": "ChatCompletionStreamOutputToolCallDelta" }, "ChatCompletionStreamOutputDeltaToolCall": { "type": "object", @@ -165,6 +186,28 @@ } }, "title": "ChatCompletionStreamOutputTopLogprob" + }, + "ChatCompletionStreamOutputUsage": { + "type": "object", + "required": ["prompt_tokens", "completion_tokens", "total_tokens"], + "properties": { + "completion_tokens": { + "type": "integer", + "format": "int32", + "minimum": 0 + }, + "prompt_tokens": { + "type": "integer", + "format": "int32", + "minimum": 0 + }, + "total_tokens": { + "type": "integer", + "format": "int32", + "minimum": 0 + } + }, + "title": "ChatCompletionStreamOutputUsage" } } } diff --git a/packages/tasks/src/tasks/index.ts b/packages/tasks/src/tasks/index.ts index 4de86795e..e1ee20be9 100644 --- a/packages/tasks/src/tasks/index.ts +++ b/packages/tasks/src/tasks/index.ts @@ -72,7 +72,7 @@ export type * from "./sentence-similarity/inference"; export type * from "./summarization/inference"; export type * from "./table-question-answering/inference"; export type { TextToImageInput, TextToImageOutput, TextToImageParameters } from "./text-to-image/inference"; -export type { TextToAudioParameters, TextToSpeechInput, TextToSpeechOutput } from "./text-to-speech/inference"; +export type { TextToSpeechParameters, TextToSpeechInput, TextToSpeechOutput } from "./text-to-speech/inference"; export type * from "./token-classification/inference"; export type { TranslationInput, TranslationOutput } from "./translation/inference"; export type { diff --git a/packages/tasks/src/tasks/text-generation/inference.ts b/packages/tasks/src/tasks/text-generation/inference.ts index 37395c580..67de53afa 100644 --- a/packages/tasks/src/tasks/text-generation/inference.ts +++ b/packages/tasks/src/tasks/text-generation/inference.ts 
@@ -19,23 +19,84 @@ export interface TextGenerationInput { } export interface TextGenerationInputGenerateParameters { + /** + * Lora adapter id + */ + adapter_id?: string; + /** + * Generate best_of sequences and return the one if the highest token logprobs. + */ best_of?: number; + /** + * Whether to return decoder input token logprobs and ids. + */ decoder_input_details?: boolean; + /** + * Whether to return generation details. + */ details?: boolean; + /** + * Activate logits sampling. + */ do_sample?: boolean; + /** + * The parameter for frequency penalty. 1.0 means no penalty + * Penalize new tokens based on their existing frequency in the text so far, + * decreasing the model's likelihood to repeat the same line verbatim. + */ frequency_penalty?: number; grammar?: TextGenerationInputGrammarType; + /** + * Maximum number of tokens to generate. + */ max_new_tokens?: number; + /** + * The parameter for repetition penalty. 1.0 means no penalty. + * See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. + */ repetition_penalty?: number; + /** + * Whether to prepend the prompt to the generated text + */ return_full_text?: boolean; + /** + * Random sampling seed. + */ seed?: number; + /** + * Stop generating tokens if a member of `stop` is generated. + */ stop?: string[]; + /** + * The value used to module the logits distribution. + */ temperature?: number; + /** + * The number of highest probability vocabulary tokens to keep for top-k-filtering. + */ top_k?: number; + /** + * The number of highest probability vocabulary tokens to keep for top-n-filtering. + */ top_n_tokens?: number; + /** + * Top-p value for nucleus sampling. + */ top_p?: number; + /** + * Truncate inputs tokens to the given size. + */ truncate?: number; + /** + * Typical Decoding mass + * See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) + * for more information. 
+ */ typical_p?: number; + /** + * Watermarking with [A Watermark for Large Language + * Models](https://arxiv.org/abs/2301.10226). + */ watermark?: boolean; [property: string]: unknown; } @@ -125,6 +186,7 @@ export interface TextGenerationStreamOutput { export interface TextGenerationStreamOutputStreamDetails { finish_reason: TextGenerationOutputFinishReason; generated_tokens: number; + input_length: number; seed?: number; [property: string]: unknown; } diff --git a/packages/tasks/src/tasks/text-generation/spec/input.json b/packages/tasks/src/tasks/text-generation/spec/input.json index 0742cefe0..108d9fb3c 100644 --- a/packages/tasks/src/tasks/text-generation/spec/input.json +++ b/packages/tasks/src/tasks/text-generation/spec/input.json @@ -22,8 +22,16 @@ "TextGenerationInputGenerateParameters": { "type": "object", "properties": { + "adapter_id": { + "type": "string", + "description": "Lora adapter id", + "default": "null", + "example": "null", + "nullable": true + }, "best_of": { "type": "integer", + "description": "Generate best_of sequences and return the one if the highest token logprobs.", "default": "null", "example": 1, "nullable": true, @@ -32,20 +40,24 @@ }, "decoder_input_details": { "type": "boolean", + "description": "Whether to return decoder input token logprobs and ids.", "default": "false" }, "details": { "type": "boolean", + "description": "Whether to return generation details.", "default": "true" }, "do_sample": { "type": "boolean", + "description": "Activate logits sampling.", "default": "false", "example": true }, "frequency_penalty": { "type": "number", "format": "float", + "description": "The parameter for frequency penalty. 
1.0 means no penalty\nPenalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim.", "default": "null", "example": 0.1, "nullable": true, @@ -63,6 +75,7 @@ "max_new_tokens": { "type": "integer", "format": "int32", + "description": "Maximum number of tokens to generate.", "default": "100", "example": "20", "nullable": true, @@ -71,6 +84,7 @@ "repetition_penalty": { "type": "number", "format": "float", + "description": "The parameter for repetition penalty. 1.0 means no penalty.\nSee [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.", "default": "null", "example": 1.03, "nullable": true, @@ -78,6 +92,7 @@ }, "return_full_text": { "type": "boolean", + "description": "Whether to prepend the prompt to the generated text", "default": "null", "example": false, "nullable": true @@ -85,6 +100,7 @@ "seed": { "type": "integer", "format": "int64", + "description": "Random sampling seed.", "default": "null", "example": "null", "nullable": true, @@ -96,12 +112,14 @@ "items": { "type": "string" }, + "description": "Stop generating tokens if a member of `stop` is generated.", "example": ["photographer"], "maxItems": 4 }, "temperature": { "type": "number", "format": "float", + "description": "The value used to module the logits distribution.", "default": "null", "example": 0.5, "nullable": true, @@ -110,6 +128,7 @@ "top_k": { "type": "integer", "format": "int32", + "description": "The number of highest probability vocabulary tokens to keep for top-k-filtering.", "default": "null", "example": 10, "nullable": true, @@ -118,6 +137,7 @@ "top_n_tokens": { "type": "integer", "format": "int32", + "description": "The number of highest probability vocabulary tokens to keep for top-n-filtering.", "default": "null", "example": 5, "nullable": true, @@ -127,6 +147,7 @@ "top_p": { "type": "number", "format": "float", + "description": "Top-p value for nucleus sampling.", "default": "null", 
"example": 0.95, "nullable": true, @@ -135,6 +156,7 @@ }, "truncate": { "type": "integer", + "description": "Truncate inputs tokens to the given size.", "default": "null", "example": "null", "nullable": true, @@ -143,6 +165,7 @@ "typical_p": { "type": "number", "format": "float", + "description": "Typical Decoding mass\nSee [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information.", "default": "null", "example": 0.95, "nullable": true, @@ -151,6 +174,7 @@ }, "watermark": { "type": "boolean", + "description": "Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226).", "default": "false", "example": true } diff --git a/packages/tasks/src/tasks/text-generation/spec/stream_output.json b/packages/tasks/src/tasks/text-generation/spec/stream_output.json index e1ef8a0dc..fc566c6fe 100644 --- a/packages/tasks/src/tasks/text-generation/spec/stream_output.json +++ b/packages/tasks/src/tasks/text-generation/spec/stream_output.json @@ -39,7 +39,7 @@ "$defs": { "TextGenerationStreamOutputStreamDetails": { "type": "object", - "required": ["finish_reason", "generated_tokens"], + "required": ["finish_reason", "generated_tokens", "input_length"], "properties": { "finish_reason": { "$ref": "#/$defs/TextGenerationStreamOutputFinishReason" @@ -50,6 +50,12 @@ "example": 1, "minimum": 0 }, + "input_length": { + "type": "integer", + "format": "int32", + "example": 1, + "minimum": 0 + }, "seed": { "type": "integer", "format": "int64", diff --git a/packages/tasks/src/tasks/text-to-speech/inference.ts b/packages/tasks/src/tasks/text-to-speech/inference.ts index 8a72ab56b..7e470b817 100644 --- a/packages/tasks/src/tasks/text-to-speech/inference.ts +++ b/packages/tasks/src/tasks/text-to-speech/inference.ts @@ -5,9 +5,7 @@ */ /** - * Inputs for Text to Speech inference - * - * Inputs for Text To Audio inference + * Inputs for Text To Speech inference */ export interface TextToSpeechInput { /** @@ -17,16 +15,16 
@@ export interface TextToSpeechInput { /** * Additional inference parameters */ - parameters?: TextToAudioParameters; + parameters?: TextToSpeechParameters; [property: string]: unknown; } /** * Additional inference parameters * - * Additional inference parameters for Text To Audio + * Additional inference parameters for Text To Speech */ -export interface TextToAudioParameters { +export interface TextToSpeechParameters { /** * Parametrization of the text generation process */ diff --git a/packages/tasks/src/tasks/text-to-speech/spec/input.json b/packages/tasks/src/tasks/text-to-speech/spec/input.json index 7d2bac092..6246f0fec 100644 --- a/packages/tasks/src/tasks/text-to-speech/spec/input.json +++ b/packages/tasks/src/tasks/text-to-speech/spec/input.json @@ -1,7 +1,31 @@ { - "$ref": "/inference/schemas/text-to-audio/input.json", "$id": "/inference/schemas/text-to-speech/input.json", "$schema": "http://json-schema.org/draft-06/schema#", + "description": "Inputs for Text To Speech inference", "title": "TextToSpeechInput", - "description": "Inputs for Text to Speech inference" + "type": "object", + "properties": { + "inputs": { + "description": "The input text data", + "type": "string" + }, + "parameters": { + "description": "Additional inference parameters", + "$ref": "#/$defs/TextToSpeechParameters" + } + }, + "$defs": { + "TextToSpeechParameters": { + "title": "TextToSpeechParameters", + "description": "Additional inference parameters for Text To Speech", + "type": "object", + "properties": { + "generate": { + "description": "Parametrization of the text generation process", + "$ref": "/inference/schemas/common-definitions.json#/definitions/GenerationParameters" + } + } + } + }, + "required": ["inputs"] }