diff --git a/nucleus/__init__.py b/nucleus/__init__.py index 9335d3d9..71aa6402 100644 --- a/nucleus/__init__.py +++ b/nucleus/__init__.py @@ -54,6 +54,7 @@ import json import logging import os +import urllib.request from typing import Any, Dict, List, Optional, Union import aiohttp @@ -62,6 +63,8 @@ import tqdm import tqdm.notebook as tqdm_notebook +from nucleus.url_utils import sanitize_string_args + from .annotation import ( BoxAnnotation, PolygonAnnotation, @@ -300,6 +303,7 @@ def delete_dataset(self, dataset_id: str) -> dict: """ return self.make_request({}, f"dataset/{dataset_id}", requests.delete) + @sanitize_string_args def delete_dataset_item( self, dataset_id: str, item_id: str = None, reference_id: str = None ) -> dict: @@ -862,6 +866,7 @@ def model_run_info(self, model_run_id: str): {}, f"modelRun/{model_run_id}/info", requests.get ) + @sanitize_string_args def dataitem_ref_id(self, dataset_id: str, reference_id: str): """ :param dataset_id: internally controlled dataset id @@ -872,6 +877,7 @@ def dataitem_ref_id(self, dataset_id: str, reference_id: str): {}, f"dataset/{dataset_id}/refloc/{reference_id}", requests.get ) + @sanitize_string_args def predictions_ref_id(self, model_run_id: str, ref_id: str): """ Returns Model Run info For Dataset Item by model_run_id and item reference_id. 
diff --git a/nucleus/dataset.py b/nucleus/dataset.py
index 2b7481c0..756c0619 100644
--- a/nucleus/dataset.py
+++ b/nucleus/dataset.py
@@ -3,6 +3,7 @@
 import requests
 
 from nucleus.job import AsyncJob
+from nucleus.url_utils import sanitize_string_args
 from nucleus.utils import (
     convert_export_payload,
     format_dataset_item_response,
@@ -35,7 +36,6 @@
 )
 from .payload_constructor import construct_model_run_creation_payload
 
-
 WARN_FOR_LARGE_UPLOAD = 50000
 
 
@@ -83,6 +83,7 @@ def size(self) -> int:
     def items(self) -> List[DatasetItem]:
         return self._client.get_dataset_items(self.id)
 
+    @sanitize_string_args
     def autotag_scores(self, autotag_name, for_scores_greater_than=0):
         """Export the autotag scores above a threshold, largest scores first.
 
diff --git a/nucleus/url_utils.py b/nucleus/url_utils.py
new file mode 100644
index 00000000..0889846f
--- /dev/null
+++ b/nucleus/url_utils.py
@@ -0,0 +1,43 @@
+"""Helpers for percent-encoding strings that are interpolated into URLs."""
+import functools
+import urllib.parse
+
+
+def sanitize_field(field):
+    """Percent-encode ``field`` so it is safe as a single URL path segment.
+
+    ``safe=""`` escapes every reserved character, including ``/``, so an id
+    such as ``"a/b"`` becomes ``"a%2Fb"`` instead of adding a path segment.
+
+    :param field: the text to encode (encoded as UTF-8 before escaping).
+    :return: the percent-encoded string.
+    """
+    return urllib.parse.quote(field, safe="")
+
+
+def sanitize_string_args(function):
+    """Decorator that percent-encodes every ``str`` argument of ``function``.
+
+    Positional and keyword arguments that are ``str`` instances are passed
+    through :func:`sanitize_field`; values of any other type are forwarded
+    untouched.
+
+    :param function: the callable to wrap.
+    :return: a wrapper with the same name and docstring as ``function``.
+    """
+
+    # Without functools.wraps the decorated API methods would all be reported
+    # as "sanitized_function" and lose their docstrings in help() output.
+    @functools.wraps(function)
+    def sanitized_function(*args, **kwargs):
+        sanitized_args = [
+            sanitize_field(arg) if isinstance(arg, str) else arg
+            for arg in args
+        ]
+        sanitized_kwargs = {
+            key: sanitize_field(value) if isinstance(value, str) else value
+            for key, value in kwargs.items()
+        }
+        return function(*sanitized_args, **sanitized_kwargs)
+
+    return sanitized_function
diff --git a/pyproject.toml b/pyproject.toml
index de4cab06..9b2e1128 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,7 +21,7 @@ exclude = '''
 
 [tool.poetry]
 name = "scale-nucleus"
-version = "0.1.10"
+version = "0.1.11"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license = "MIT"
 authors = ["Scale AI Nucleus Team "]
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index abae1463..6d0675b9 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -37,7 +37,6
@@ TEST_IMG_URLS, TEST_POLYGON_ANNOTATIONS, TEST_SEGMENTATION_ANNOTATIONS, - TEST_SLICE_NAME, reference_id_from_url, ) @@ -344,6 +343,20 @@ def test_annotate_async_with_error(dataset: Dataset): assert "Item with id fake_garbage doesn" in str(job.errors()) +def test_append_with_special_chars(dataset): + url = TEST_IMG_URLS[0] + ref_id = "test/reference/id" + ds_items = [ + DatasetItem( + image_location=url, + reference_id=ref_id, + metadata={"test": "metadata"}, + ), + ] + dataset.append(ds_items) + dataset.refloc(ref_id) + + def test_append_and_export(dataset): # Dataset upload url = TEST_IMG_URLS[0] diff --git a/tests/test_slice.py b/tests/test_slice.py index c93d06e0..c972c288 100644 --- a/tests/test_slice.py +++ b/tests/test_slice.py @@ -154,6 +154,7 @@ def sort_by_reference_id(items): ) +@pytest.mark.integration def test_slice_send_to_labeling(dataset): # Dataset upload ds_items = []