Fix docstrings
hanouticelina committed Oct 25, 2024
1 parent f7e89bd · commit 2053b26
Showing 3 changed files with 31 additions and 21 deletions.
src/huggingface_hub/inference/_client.py (0 additions, 7 deletions)
@@ -946,30 +946,23 @@ def document_question_answering(
doc_stride (`int`, *optional*):
If the words in the document are too long to fit with the question for the model, it will be split in
several chunks with some overlap. This argument controls the size of that overlap.
-be split in several chunks with some overlap. This argument controls the size of that
-overlap.
handle_impossible_answer (`bool`, *optional*):
Whether to accept impossible as an answer
lang (`str`, *optional*):
Language to use while running OCR. Defaults to english.
max_answer_len (`int`, *optional*):
The maximum length of predicted answers (e.g., only answers with a shorter length are considered).
-considered).
max_question_len (`int`, *optional*):
The maximum length of the question after tokenization. It will be truncated if needed.
max_seq_len (`int`, *optional*):
The maximum length of the total sentence (context + question) in tokens of each chunk passed to the
model. The context will be split in several chunks (using doc_stride as overlap) if needed.
-passed to the model. The context will be split in several chunks (using doc_stride as
-overlap) if needed.
top_k (`int`, *optional*):
The number of answers to return (will be chosen by order of likelihood). Can return less than top_k
answers if there are not enough options available within the context.
-than top_k answers if there are not enough options available within the context.
word_boxes (`List[Union[List[float], str`, *optional*):
A list of words and bounding boxes (normalized 0->1000). If provided, the inference will skip the OCR
step and use the provided bounding boxes instead.
-skip the OCR step and use the provided bounding boxes instead.
Returns:
`List[DocumentQuestionAnsweringOutputElement]`: a list of [`DocumentQuestionAnsweringOutputElement`] items containing the predicted label, associated probability, word ids, and page number.
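
For reference, a minimal sketch of calling this method with the parameters documented above; the document URL and parameter values are illustrative only, not taken from this commit:

```python
from huggingface_hub import InferenceClient

client = InferenceClient()  # optionally pass model=... and token=...

# Hypothetical document image URL, used purely for illustration.
answers = client.document_question_answering(
    image="https://example.com/invoice.png",
    question="What is the invoice total?",
    doc_stride=128,      # overlap between chunks when the document is split
    max_answer_len=30,   # only consider answers shorter than this length
    top_k=3,             # return the three most likely answers
)
for element in answers:
    print(element.answer, element.score)
```
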
src/huggingface_hub/inference/_generated/_async_client.py (0 additions, 7 deletions)
@@ -988,30 +988,23 @@ async def document_question_answering(
doc_stride (`int`, *optional*):
If the words in the document are too long to fit with the question for the model, it will be split in
several chunks with some overlap. This argument controls the size of that overlap.
-be split in several chunks with some overlap. This argument controls the size of that
-overlap.
handle_impossible_answer (`bool`, *optional*):
Whether to accept impossible as an answer
lang (`str`, *optional*):
Language to use while running OCR. Defaults to english.
max_answer_len (`int`, *optional*):
The maximum length of predicted answers (e.g., only answers with a shorter length are considered).
-considered).
max_question_len (`int`, *optional*):
The maximum length of the question after tokenization. It will be truncated if needed.
max_seq_len (`int`, *optional*):
The maximum length of the total sentence (context + question) in tokens of each chunk passed to the
model. The context will be split in several chunks (using doc_stride as overlap) if needed.
-passed to the model. The context will be split in several chunks (using doc_stride as
-overlap) if needed.
top_k (`int`, *optional*):
The number of answers to return (will be chosen by order of likelihood). Can return less than top_k
answers if there are not enough options available within the context.
-than top_k answers if there are not enough options available within the context.
word_boxes (`List[Union[List[float], str`, *optional*):
A list of words and bounding boxes (normalized 0->1000). If provided, the inference will skip the OCR
step and use the provided bounding boxes instead.
-skip the OCR step and use the provided bounding boxes instead.
Returns:
`List[DocumentQuestionAnsweringOutputElement]`: a list of [`DocumentQuestionAnsweringOutputElement`] items containing the predicted label, associated probability, word ids, and page number.
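
The async client mirrors the same signature; a minimal sketch, reusing the same illustrative URL as above:

```python
import asyncio

from huggingface_hub import AsyncInferenceClient


async def main() -> None:
    client = AsyncInferenceClient()
    answers = await client.document_question_answering(
        image="https://example.com/invoice.png",  # illustrative URL
        question="What is the invoice total?",
        top_k=3,
    )
    print(answers)


asyncio.run(main())
```
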
utils/check_task_parameters.py (31 additions, 7 deletions)
@@ -23,6 +23,12 @@
- [x] add missing parameters to methods signature
- [x] detect missing parameters in method docstrings
- [x] add missing parameters to methods docstrings
+- [x] detect outdated parameters in method signature
+- [x] update outdated parameters in method signature
+- [x] detect outdated parameters in method docstrings
+- [x] update outdated parameters in method docstrings
+- [ ] detect when parameter not used in method implementation
+- [ ] update method implementation when parameter not used
Related resources:
- https://github.com/huggingface/huggingface_hub/issues/2063
- https://github.com/huggingface/huggingface_hub/issues/2557
@@ -31,7 +37,6 @@

import argparse
import builtins
-import inspect
import re
import textwrap
from collections import defaultdict
@@ -43,8 +48,6 @@
from libcst.codemod import CodemodContext
from libcst.codemod.visitors import GatherImportsVisitor

-from huggingface_hub.inference._client import InferenceClient
-

# Paths to project files
BASE_DIR = Path(__file__).parents[1] / "src" / "huggingface_hub"
@@ -299,6 +302,19 @@ def _update_existing_params(self, docstring: str, params_to_update: Dict[str, Di
for i, line in enumerate(docstring_lines):
for param_name, param_info in params_to_update.items():
if line.strip().startswith(param_name + " "):
+# Find the end of current parameter documentation
+end_idx = i + 1
+while end_idx < len(docstring_lines):
+    next_line = docstring_lines[end_idx].strip()
+    # Stop if we hit another parameter or section
+    if (
+        (next_line.endswith(":") and not next_line.startswith(description_indentation))
+        or next_line.lower() in ("returns:", "raises:", "example:", "examples:")
+        or not next_line
+    ):
+        break
+    end_idx += 1
+
param_type_str = param_info["type"].replace("Optional[", "").rstrip("]")
optional_str = "*optional*" if "Optional[" in param_info["type"] else ""
param_docstring = (param_info.get("docstring") or "").strip()
@@ -313,9 +329,9 @@ def _update_existing_params(self, docstring: str, params_to_update: Dict[str, Di
initial_indent=description_indentation,
subsequent_indent=description_indentation,
)
-docstring_lines[i : i + 2] = [param_line, wrapped_description]
+docstring_lines[i:end_idx] = [param_line, wrapped_description]
else:
-docstring_lines[i : i + 1] = [param_line]
+docstring_lines[i:end_idx] = [param_line]
return "\n".join(docstring_lines)

def _add_new_params(self, docstring: str, new_params: Dict[str, Dict[str, str]]) -> str:
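
To make the span replacement above concrete, here is a small standalone sketch of the idea (not the actual helper): scan forward until the next, less-indented parameter header, then splice the replacement over the whole [i:end_idx] range instead of a fixed two-line window. The sample docstring lines and values are made up for illustration:

```python
docstring_lines = [
    "        doc_stride (`int`, *optional*):",
    "            If the words in the document are too long to fit with the question for the model,",
    "            it will be split in several chunks with some overlap.",
    "        top_k (`int`, *optional*):",
    "            The number of answers to return.",
]
description_indentation = " " * 12

i = 0  # index of the parameter header line being updated
end_idx = i + 1
while end_idx < len(docstring_lines):
    line = docstring_lines[end_idx]
    # A less-indented line ending with ":" marks the next parameter header.
    if line.strip().endswith(":") and not line.startswith(description_indentation):
        break
    end_idx += 1

# Replace the full multi-line description, however many lines it spans.
docstring_lines[i:end_idx] = [
    "        doc_stride (`int`, *optional*):",
    "            Overlap between chunks when the document is split.",
]
print("\n".join(docstring_lines))
```
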
@@ -519,7 +535,9 @@ def check_missing_parameters(
else:
# Check for type/docstring changes
current = existing_params[param_name]
if current["type"] != param_info["type"] or current["docstring"] != param_info["docstring"]:
normalized_current_doc = _normalize_docstring(current["docstring"])
normalized_new_doc = _normalize_docstring(param_info["docstring"])
if current["type"] != param_info["type"] or normalized_current_doc != normalized_new_doc:
updates[param_name] = {**param_info, "status": "update"}

return updates
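
The `_normalize_docstring` helper added further down in this file is what makes that comparison robust to re-wrapping; a self-contained illustration, with the helper re-declared locally so the snippet runs on its own:

```python
def _normalize_docstring(docstring: str) -> str:
    # Same logic as the helper added in this commit: collapse newlines,
    # indentation and surrounding whitespace into single spaces.
    return " ".join(line.strip() for line in docstring.split("\n")).strip()


wrapped = """The maximum length of the total sentence (context + question) in tokens of each chunk
        passed to the model."""
single_line = (
    "The maximum length of the total sentence (context + question) in tokens of each chunk passed to the model."
)

# Differently wrapped but semantically identical, so no spurious "update" is flagged.
assert _normalize_docstring(wrapped) == _normalize_docstring(single_line)
```
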
Expand Down Expand Up @@ -589,6 +607,12 @@ def _generate_import_statements(import_dict: Dict[str, List[str]]) -> str:
return "\n".join(import_statements)


+def _normalize_docstring(docstring: str) -> str:
+    """Normalize a docstring by removing extra whitespace, newlines and indentation."""
+    # Split into lines, strip whitespace from each line, and join back
+    return " ".join(line.strip() for line in docstring.split("\n")).strip()
+
+
# TODO: Needs to be improved, maybe using `typing.get_type_hints` instead (we gonna need to access the method though)?
def _collect_type_hints_from_annotation(annotation_str: str) -> Set[str]:
"""
@@ -716,7 +740,7 @@ def update_inference_client(update: bool):

# Construct a mapping between method names and their parameters dataclass names
method_params = {}
-for method_name, _ in inspect.getmembers(InferenceClient, predicate=inspect.isfunction):
+for method_name, _ in [("document_question_answering", None)]: #
if method_name.startswith("_") or method_name not in tasks:
continue
parameter_type_name = _get_parameter_type_name(method_name)
