huggingface · hanouticelina · Nov 28, 2024 · Nov 19, 2024 · Nov 20, 2024 · Nov 20, 2024
diff --git a/docs/source/en/package_reference/inference_types.md b/docs/source/en/package_reference/inference_types.md
@@ -369,8 +369,6 @@ This part of the lib is still under development and will be improved in future r
 
 [[autodoc]] huggingface_hub.ZeroShotClassificationInput
 
-[[autodoc]] huggingface_hub.ZeroShotClassificationInputData
-
 [[autodoc]] huggingface_hub.ZeroShotClassificationOutputElement
 
 [[autodoc]] huggingface_hub.ZeroShotClassificationParameters
@@ -381,8 +379,6 @@ This part of the lib is still under development and will be improved in future r
 
 [[autodoc]] huggingface_hub.ZeroShotImageClassificationInput
 
-[[autodoc]] huggingface_hub.ZeroShotImageClassificationInputData
-
 [[autodoc]] huggingface_hub.ZeroShotImageClassificationOutputElement
 
 [[autodoc]] huggingface_hub.ZeroShotImageClassificationParameters
@@ -395,6 +391,6 @@ This part of the lib is still under development and will be improved in future r
 
 [[autodoc]] huggingface_hub.ZeroShotObjectDetectionInput
 
-[[autodoc]] huggingface_hub.ZeroShotObjectDetectionInputData
-
 [[autodoc]] huggingface_hub.ZeroShotObjectDetectionOutputElement
+
+[[autodoc]] huggingface_hub.ZeroShotObjectDetectionParameters
diff --git a/docs/source/ko/package_reference/inference_types.md b/docs/source/ko/package_reference/inference_types.md
@@ -368,8 +368,6 @@ rendered properly in your Markdown viewer.
 
 [[autodoc]] huggingface_hub.ZeroShotClassificationInput
 
-[[autodoc]] huggingface_hub.ZeroShotClassificationInputData
-
 [[autodoc]] huggingface_hub.ZeroShotClassificationOutputElement
 
 [[autodoc]] huggingface_hub.ZeroShotClassificationParameters
@@ -380,8 +378,6 @@ rendered properly in your Markdown viewer.
 
 [[autodoc]] huggingface_hub.ZeroShotImageClassificationInput
 
-[[autodoc]] huggingface_hub.ZeroShotImageClassificationInputData
-
 [[autodoc]] huggingface_hub.ZeroShotImageClassificationOutputElement
 
 [[autodoc]] huggingface_hub.ZeroShotImageClassificationParameters
@@ -394,6 +390,6 @@ rendered properly in your Markdown viewer.
 
 [[autodoc]] huggingface_hub.ZeroShotObjectDetectionInput
 
-[[autodoc]] huggingface_hub.ZeroShotObjectDetectionInputData
-
 [[autodoc]] huggingface_hub.ZeroShotObjectDetectionOutputElement
+
+[[autodoc]] huggingface_hub.ZeroShotObjectDetectionParameters
diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py
@@ -415,17 +415,15 @@
         "VisualQuestionAnsweringOutputElement",
         "VisualQuestionAnsweringParameters",
         "ZeroShotClassificationInput",
-        "ZeroShotClassificationInputData",
         "ZeroShotClassificationOutputElement",
         "ZeroShotClassificationParameters",
         "ZeroShotImageClassificationInput",
-        "ZeroShotImageClassificationInputData",
         "ZeroShotImageClassificationOutputElement",
         "ZeroShotImageClassificationParameters",
         "ZeroShotObjectDetectionBoundingBox",
         "ZeroShotObjectDetectionInput",
-        "ZeroShotObjectDetectionInputData",
         "ZeroShotObjectDetectionOutputElement",
+        "ZeroShotObjectDetectionParameters",
     ],
     "inference_api": [
         "InferenceApi",
@@ -945,17 +943,15 @@ def __dir__():
         VisualQuestionAnsweringOutputElement,  # noqa: F401
         VisualQuestionAnsweringParameters,  # noqa: F401
         ZeroShotClassificationInput,  # noqa: F401
-        ZeroShotClassificationInputData,  # noqa: F401
         ZeroShotClassificationOutputElement,  # noqa: F401
         ZeroShotClassificationParameters,  # noqa: F401
         ZeroShotImageClassificationInput,  # noqa: F401
-        ZeroShotImageClassificationInputData,  # noqa: F401
         ZeroShotImageClassificationOutputElement,  # noqa: F401
         ZeroShotImageClassificationParameters,  # noqa: F401
         ZeroShotObjectDetectionBoundingBox,  # noqa: F401
         ZeroShotObjectDetectionInput,  # noqa: F401
-        ZeroShotObjectDetectionInputData,  # noqa: F401
         ZeroShotObjectDetectionOutputElement,  # noqa: F401
+        ZeroShotObjectDetectionParameters,  # noqa: F401
     )
     from .inference_api import InferenceApi  # noqa: F401
     from .keras_mixin import (

diff --git a/src/huggingface_hub/inference/_client.py b/src/huggingface_hub/inference/_client.py
@@ -348,7 +348,7 @@ def audio_classification(
             top_k (`int`, *optional*):
                 When specified, limits the output to the top K most probable classes.
             function_to_apply (`"AudioClassificationOutputTransform"`, *optional*):
-                The function to apply to the output.
+                The function to apply to the model outputs in order to retrieve the scores.
 
         Returns:
             `List[AudioClassificationOutputElement]`: List of [`AudioClassificationOutputElement`] items containing the predicted labels and their confidence.
@@ -980,7 +980,7 @@ def document_question_answering(
         >>> from huggingface_hub import InferenceClient
         >>> client = InferenceClient()
         >>> client.document_question_answering(image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", question="What is the invoice number?")
-        [DocumentQuestionAnsweringOutputElement(answer='us-001', end=16, score=0.9999666213989258, start=16, words=None)]
+        [DocumentQuestionAnsweringOutputElement(answer='us-001', end=16, score=0.9999666213989258, start=16)]
         ```
         """
         inputs: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
@@ -1131,7 +1131,7 @@ def image_classification(
                 The model to use for image classification. Can be a model ID hosted on the Hugging Face Hub or a URL to a
                 deployed Inference Endpoint. If not provided, the default recommended model for image classification will be used.
             function_to_apply (`"ImageClassificationOutputTransform"`, *optional*):
-                The function to apply to the output.
+                The function to apply to the model outputs in order to retrieve the scores.
             top_k (`int`, *optional*):
                 When specified, limits the output to the top K most probable classes.
         Returns:
@@ -1814,7 +1814,7 @@ def text_classification(
             top_k (`int`, *optional*):
                 When specified, limits the output to the top K most probable classes.
             function_to_apply (`"TextClassificationOutputTransform"`, *optional*):
-                The function to apply to the output.
+                The function to apply to the model outputs in order to retrieve the scores.
 
         Returns:
             `List[TextClassificationOutputElement]`: a list of [`TextClassificationOutputElement`] items containing the predicted label and associated probability.
@@ -2494,11 +2494,11 @@ def text_to_speech(
             max_length (`int`, *optional*):
                 The maximum length (in tokens) of the generated text, including the input.
             max_new_tokens (`int`, *optional*):
-                The maximum number of tokens to generate. Takes precedence over maxLength.
+                The maximum number of tokens to generate. Takes precedence over max_length.
             min_length (`int`, *optional*):
                 The minimum length (in tokens) of the generated text, including the input.
             min_new_tokens (`int`, *optional*):
-                The minimum number of tokens to generate. Takes precedence over maxLength.
+                The minimum number of tokens to generate. Takes precedence over min_length.
             num_beam_groups (`int`, *optional*):
                 Number of groups to divide num_beams into in order to ensure diversity among different groups of beams.
                 See [this paper](https://hf.co/papers/1610.02424) for more details.
@@ -2801,12 +2801,13 @@ def zero_shot_classification(
                 the label likelihoods for each sequence is 1. If true, the labels are considered independent and
                 probabilities are normalized for each candidate.
             hypothesis_template (`str`, *optional*):
-                The sentence used in conjunction with candidateLabels to attempt the text classification by replacing
-                the placeholder with the candidate labels.
+                The sentence used in conjunction with `candidate_labels` to attempt the text classification by
+                replacing the placeholder with the candidate labels.
             model (`str`, *optional*):
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
                 Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot classification model will be used.
 
+
         Returns:
             `List[ZeroShotClassificationOutputElement]`: List of [`ZeroShotClassificationOutputElement`] items containing the predicted labels and their confidence.
 
@@ -2897,12 +2898,12 @@ def zero_shot_image_classification(
         self,
         image: ContentT,
         # temporarily keeping it optional for backward compatibility.
-        candidate_labels: Optional[List[str]] = None,
+        candidate_labels: List[str] = None,  # type: ignore
         *,
         model: Optional[str] = None,
         hypothesis_template: Optional[str] = None,
         # deprecated argument
-        labels: Optional[List[str]] = None,  # type: ignore
+        labels: List[str] = None,  # type: ignore
     ) -> List[ZeroShotImageClassificationOutputElement]:
         """
         Provide input image and text labels to predict text labels for the image.
@@ -2918,8 +2919,8 @@ def zero_shot_image_classification(
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
                 Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot image classification model will be used.
             hypothesis_template (`str`, *optional*):
-                The sentence used in conjunction with candidateLabels to attempt the text classification by replacing
-                the placeholder with the candidate labels.
+                The sentence used in conjunction with `candidate_labels` to attempt the image classification by
+                replacing the placeholder with the candidate labels.
 
         Returns:
             `List[ZeroShotImageClassificationOutputElement]`: List of [`ZeroShotImageClassificationOutputElement`] items containing the predicted labels and their confidence.

diff --git a/src/huggingface_hub/inference/_generated/_async_client.py b/src/huggingface_hub/inference/_generated/_async_client.py
@@ -381,7 +381,7 @@ async def audio_classification(
             top_k (`int`, *optional*):
                 When specified, limits the output to the top K most probable classes.
             function_to_apply (`"AudioClassificationOutputTransform"`, *optional*):
-                The function to apply to the output.
+                The function to apply to the model outputs in order to retrieve the scores.
 
         Returns:
             `List[AudioClassificationOutputElement]`: List of [`AudioClassificationOutputElement`] items containing the predicted labels and their confidence.
@@ -1023,7 +1023,7 @@ async def document_question_answering(
         >>> from huggingface_hub import AsyncInferenceClient
         >>> client = AsyncInferenceClient()
         >>> await client.document_question_answering(image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", question="What is the invoice number?")
-        [DocumentQuestionAnsweringOutputElement(answer='us-001', end=16, score=0.9999666213989258, start=16, words=None)]
+        [DocumentQuestionAnsweringOutputElement(answer='us-001', end=16, score=0.9999666213989258, start=16)]
         ```
         """
         inputs: Dict[str, Any] = {"question": question, "image": _b64_encode(image)}
@@ -1176,7 +1176,7 @@ async def image_classification(
                 The model to use for image classification. Can be a model ID hosted on the Hugging Face Hub or a URL to a
                 deployed Inference Endpoint. If not provided, the default recommended model for image classification will be used.
             function_to_apply (`"ImageClassificationOutputTransform"`, *optional*):
-                The function to apply to the output.
+                The function to apply to the model outputs in order to retrieve the scores.
             top_k (`int`, *optional*):
                 When specified, limits the output to the top K most probable classes.
         Returns:
@@ -1876,7 +1876,7 @@ async def text_classification(
             top_k (`int`, *optional*):
                 When specified, limits the output to the top K most probable classes.
             function_to_apply (`"TextClassificationOutputTransform"`, *optional*):
-                The function to apply to the output.
+                The function to apply to the model outputs in order to retrieve the scores.
 
         Returns:
             `List[TextClassificationOutputElement]`: a list of [`TextClassificationOutputElement`] items containing the predicted label and associated probability.
@@ -2559,11 +2559,11 @@ async def text_to_speech(
             max_length (`int`, *optional*):
                 The maximum length (in tokens) of the generated text, including the input.
             max_new_tokens (`int`, *optional*):
-                The maximum number of tokens to generate. Takes precedence over maxLength.
+                The maximum number of tokens to generate. Takes precedence over max_length.
             min_length (`int`, *optional*):
                 The minimum length (in tokens) of the generated text, including the input.
             min_new_tokens (`int`, *optional*):
-                The minimum number of tokens to generate. Takes precedence over maxLength.
+                The minimum number of tokens to generate. Takes precedence over min_length.
             num_beam_groups (`int`, *optional*):
                 Number of groups to divide num_beams into in order to ensure diversity among different groups of beams.
                 See [this paper](https://hf.co/papers/1610.02424) for more details.
@@ -2870,12 +2870,13 @@ async def zero_shot_classification(
                 the label likelihoods for each sequence is 1. If true, the labels are considered independent and
                 probabilities are normalized for each candidate.
             hypothesis_template (`str`, *optional*):
-                The sentence used in conjunction with candidateLabels to attempt the text classification by replacing
-                the placeholder with the candidate labels.
+                The sentence used in conjunction with `candidate_labels` to attempt the text classification by
+                replacing the placeholder with the candidate labels.
             model (`str`, *optional*):
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
                 Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot classification model will be used.
 
+
         Returns:
             `List[ZeroShotClassificationOutputElement]`: List of [`ZeroShotClassificationOutputElement`] items containing the predicted labels and their confidence.
 
@@ -2968,12 +2969,12 @@ async def zero_shot_image_classification(
         self,
         image: ContentT,
         # temporarily keeping it optional for backward compatibility.
-        candidate_labels: Optional[List[str]] = None,
+        candidate_labels: List[str] = None,  # type: ignore
         *,
         model: Optional[str] = None,
         hypothesis_template: Optional[str] = None,
         # deprecated argument
-        labels: Optional[List[str]] = None,  # type: ignore
+        labels: List[str] = None,  # type: ignore
     ) -> List[ZeroShotImageClassificationOutputElement]:
         """
         Provide input image and text labels to predict text labels for the image.
@@ -2989,8 +2990,8 @@ async def zero_shot_image_classification(
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
                 Inference Endpoint. This parameter overrides the model defined at the instance level. If not provided, the default recommended zero-shot image classification model will be used.
             hypothesis_template (`str`, *optional*):
-                The sentence used in conjunction with candidateLabels to attempt the text classification by replacing
-                the placeholder with the candidate labels.
+                The sentence used in conjunction with `candidate_labels` to attempt the image classification by
+                replacing the placeholder with the candidate labels.
 
         Returns:
             `List[ZeroShotImageClassificationOutputElement]`: List of [`ZeroShotImageClassificationOutputElement`] items containing the predicted labels and their confidence.

diff --git a/src/huggingface_hub/inference/_generated/types/__init__.py b/src/huggingface_hub/inference/_generated/types/__init__.py
@@ -167,19 +167,17 @@
 )
 from .zero_shot_classification import (
     ZeroShotClassificationInput,
-    ZeroShotClassificationInputData,
     ZeroShotClassificationOutputElement,
     ZeroShotClassificationParameters,
 )
 from .zero_shot_image_classification import (
     ZeroShotImageClassificationInput,
-    ZeroShotImageClassificationInputData,
     ZeroShotImageClassificationOutputElement,
     ZeroShotImageClassificationParameters,
 )
 from .zero_shot_object_detection import (
     ZeroShotObjectDetectionBoundingBox,
     ZeroShotObjectDetectionInput,
-    ZeroShotObjectDetectionInputData,
     ZeroShotObjectDetectionOutputElement,
+    ZeroShotObjectDetectionParameters,
 )
diff --git a/src/huggingface_hub/inference/_generated/types/audio_classification.py b/src/huggingface_hub/inference/_generated/types/audio_classification.py
@@ -19,7 +19,7 @@ class AudioClassificationParameters(BaseInferenceType):
     """
 
     function_to_apply: Optional["AudioClassificationOutputTransform"] = None
-    """The function to apply to the output."""
+    """The function to apply to the model outputs in order to retrieve the scores."""
     top_k: Optional[int] = None
     """When specified, limits the output to the top K most probable classes."""
 

diff --git a/src/huggingface_hub/inference/_generated/types/automatic_speech_recognition.py b/src/huggingface_hub/inference/_generated/types/automatic_speech_recognition.py
@@ -80,7 +80,7 @@ class AutomaticSpeechRecognitionParameters(BaseInferenceType):
     Additional inference parameters for Automatic Speech Recognition
     """
 
-    generate: Optional[AutomaticSpeechRecognitionGenerationParameters] = None
+    generation_parameters: Optional[AutomaticSpeechRecognitionGenerationParameters] = None
     """Parametrization of the text generation process"""
     return_timestamps: Optional[bool] = None
     """Whether to output corresponding timestamps with the generated text"""

diff --git a/src/huggingface_hub/inference/_generated/types/document_question_answering.py b/src/huggingface_hub/inference/_generated/types/document_question_answering.py
@@ -81,5 +81,3 @@ class DocumentQuestionAnsweringOutputElement(BaseInferenceType):
     """The start word index of the answer (in the OCR’d version of the input or provided word
     boxes).
     """
-    words: List[int]
-    """The index of each word/box pair that is in the answer"""
diff --git a/src/huggingface_hub/inference/_generated/types/image_classification.py b/src/huggingface_hub/inference/_generated/types/image_classification.py
@@ -19,7 +19,7 @@ class ImageClassificationParameters(BaseInferenceType):
     """
 
     function_to_apply: Optional["ImageClassificationOutputTransform"] = None
-    """The function to apply to the output."""
+    """The function to apply to the model outputs in order to retrieve the scores."""
     top_k: Optional[int] = None
     """When specified, limits the output to the top K most probable classes."""
 

diff --git a/src/huggingface_hub/inference/_generated/types/image_to_text.py b/src/huggingface_hub/inference/_generated/types/image_to_text.py
@@ -80,7 +80,7 @@ class ImageToTextParameters(BaseInferenceType):
     Additional inference parameters for Image To Text
     """
 
-    generate: Optional[ImageToTextGenerationParameters] = None
+    generation_parameters: Optional[ImageToTextGenerationParameters] = None
     """Parametrization of the text generation process"""
     max_new_tokens: Optional[int] = None
     """The amount of maximum tokens to generate."""

diff --git a/src/huggingface_hub/inference/_generated/types/text_classification.py b/src/huggingface_hub/inference/_generated/types/text_classification.py
@@ -14,18 +14,14 @@
 
 @dataclass
 class TextClassificationParameters(BaseInferenceType):
-    """
-    Additional inference parameters for Text Classification.
+    """
+    Additional inference parameters for Text Classification
     """
 
     function_to_apply: Optional["TextClassificationOutputTransform"] = None
-    """
-    The function to apply to the output.
-    """
+    """The function to apply to the model outputs in order to retrieve the scores."""
     top_k: Optional[int] = None
-    """
-    When specified, limits the output to the top K most probable classes.
-    """
+    """When specified, limits the output to the top K most probable classes."""
 
 
 @dataclass