From 90e47b7a5f2603ad4e62606b1a2f60f860f78fb9 Mon Sep 17 00:00:00 2001
From: Diego Ardila
Date: Tue, 11 May 2021 03:45:50 -0700
Subject: [PATCH 1/8] passing integration test

---
 nucleus/__init__.py            |  12 ++--
 nucleus/constants.py           | 106 +++++++++++++++++----------------
 nucleus/dataset.py             |  48 ++++++++++++---
 nucleus/dataset_item.py        |  22 +++++--
 nucleus/job.py                 |  51 ++++++++++++++++
 nucleus/payload_constructor.py |   4 +-
 nucleus/utils.py               |  36 ++++++++---
 pyproject.toml                 |   1 +
 tests/__init__.py              |   0
 tests/test_annotation.py       |   2 +-
 tests/test_dataset.py          |  66 +++++++++++++-------
 tests/test_indexing.py         |   2 +-
 tests/test_models.py           |   2 +-
 tests/test_prediction.py       |   2 +-
 tests/test_slice.py            |   2 +-
 15 files changed, 248 insertions(+), 108 deletions(-)
 create mode 100644 nucleus/job.py
 create mode 100644 tests/__init__.py

diff --git a/nucleus/__init__.py b/nucleus/__init__.py
index a6d07eba..94144de2 100644
--- a/nucleus/__init__.py
+++ b/nucleus/__init__.py
@@ -70,7 +70,7 @@
 # pylint: disable=C0302

 from requests.packages.urllib3.util.retry import Retry
-from .constants import REFERENCE_IDS_KEY, DATASET_ITEM_IDS_KEY
+from .constants import REFERENCE_IDS_KEY, DATASET_ITEM_IDS_KEY, UPDATE_KEY
 from .dataset import Dataset
 from .dataset_item import DatasetItem
 from .annotation import (
@@ -123,7 +123,6 @@
     AUTOTAGS_KEY,
     ANNOTATION_METADATA_SCHEMA_KEY,
     ITEM_METADATA_SCHEMA_KEY,
-    FORCE_KEY,
     EMBEDDINGS_URL_KEY,
 )
 from .model import Model
@@ -151,11 +150,14 @@ def __init__(
         self,
         api_key: str,
         use_notebook: bool = False,
-        endpoint=NUCLEUS_ENDPOINT,
+        endpoint: str = None,
     ):
         self.api_key = api_key
         self.tqdm_bar = tqdm.tqdm
-        self.endpoint = endpoint
+        if endpoint is None:
+            self.endpoint = os.environ.get(
+                "NUCLEUS_ENDPOINT", NUCLEUS_ENDPOINT
+            )
         self._use_notebook = use_notebook
         if use_notebook:
             self.tqdm_bar = tqdm_notebook.tqdm
@@ -497,7 +499,7 @@ def exception_handler(request, exception):
         items = payload[ITEMS_KEY]
         payloads = [
             # batch_size images per request
-            {ITEMS_KEY: items[i : i + batch_size], FORCE_KEY: update}
+            {ITEMS_KEY: items[i : i + batch_size], UPDATE_KEY: update}
             for i in range(0, len(items), batch_size)
         ]

diff --git a/nucleus/constants.py b/nucleus/constants.py
index 666ceb34..5172068d 100644
--- a/nucleus/constants.py
+++ b/nucleus/constants.py
@@ -1,63 +1,65 @@
-NUCLEUS_ENDPOINT = "https://api.scale.com/v1/nucleus"
-DEFAULT_NETWORK_TIMEOUT_SEC = 120
-ITEMS_KEY = "items"
-ITEM_KEY = "item"
-REFERENCE_ID_KEY = "reference_id"
-REFERENCE_IDS_KEY = "reference_ids"
-DATASET_ID_KEY = "dataset_id"
-IMAGE_KEY = "image"
-IMAGE_URL_KEY = "image_url"
-NEW_ITEMS = "new_items"
-UPDATED_ITEMS = "updated_items"
-IGNORED_ITEMS = "ignored_items"
-ERROR_ITEMS = "upload_errors"
-ERROR_PAYLOAD = "error_payload"
-ERROR_CODES = "error_codes"
+ANNOTATIONS_IGNORED_KEY = "annotations_ignored"
 ANNOTATIONS_KEY = "annotations"
-ANNOTATION_ID_KEY = "annotation_id"
 ANNOTATIONS_PROCESSED_KEY = "annotations_processed"
-ANNOTATIONS_IGNORED_KEY = "annotations_ignored"
-PREDICTIONS_PROCESSED_KEY = "predictions_processed"
-PREDICTIONS_IGNORED_KEY = "predictions_ignored"
+ANNOTATION_ID_KEY = "annotation_id"
+ANNOTATION_METADATA_SCHEMA_KEY = "annotation_metadata_schema"
+BOX_TYPE = "box"
+POLYGON_TYPE = "polygon"
+SEGMENTATION_TYPE = "segmentation"
+ANNOTATION_TYPES = (BOX_TYPE, POLYGON_TYPE, SEGMENTATION_TYPE)
 ANNOTATION_UPDATE_KEY = "update"
-DEFAULT_ANNOTATION_UPDATE_MODE = False
-STATUS_CODE_KEY = "status_code"
-STATUS_KEY = "status"
-SUCCESS_STATUS_CODES = [200, 201, 202]
-ERRORS_KEY = "errors"
-MODEL_RUN_ID_KEY = "model_run_id"
-MODEL_ID_KEY = "model_id"
-DATASET_ITEM_ID_KEY = "dataset_item_id"
-ITEM_ID_KEY = "item_id"
+AUTOTAGS_KEY = "autotags"
+
+CONFIDENCE_KEY = "confidence"
+DATASET_ID_KEY = "dataset_id"
 DATASET_ITEM_IDS_KEY = "dataset_item_ids"
-SLICE_ID_KEY = "slice_id"
-DATASET_NAME_KEY = "name"
+DATASET_ITEM_ID_KEY = "dataset_item_id"
+DATASET_LENGTH_KEY = "length"
 DATASET_MODEL_RUNS_KEY = "model_run_ids"
+DATASET_NAME_KEY = "name"
 DATASET_SLICES_KEY = "slice_ids"
-DATASET_LENGTH_KEY = "length"
-FORCE_KEY = "update"
+DEFAULT_ANNOTATION_UPDATE_MODE = False
+DEFAULT_NETWORK_TIMEOUT_SEC = 120
+EMBEDDINGS_URL_KEY = "embeddings_url"
+ERRORS_KEY = "errors"
+ERROR_CODES = "error_codes"
+ERROR_ITEMS = "upload_errors"
+ERROR_PAYLOAD = "error_payload"
+GEOMETRY_KEY = "geometry"
+HEIGHT_KEY = "height"
+IGNORED_ITEMS = "ignored_items"
+IMAGE_KEY = "image"
+IMAGE_URL_KEY = "image_url"
+INDEX_KEY = "index"
+ITEMS_KEY = "items"
+ITEM_ID_KEY = "item_id"
+ITEM_KEY = "item"
+ITEM_METADATA_SCHEMA_KEY = "item_metadata_schema"
+JOB_ID_KEY = "job_id"
+LABEL_KEY = "label"
+MASK_URL_KEY = "mask_url"
+MESSAGE_KEY = "message"
 METADATA_KEY = "metadata"
+MODEL_ID_KEY = "model_id"
+MODEL_RUN_ID_KEY = "model_run_id"
 NAME_KEY = "name"
-LABEL_KEY = "label"
-CONFIDENCE_KEY = "confidence"
+NEW_ITEMS = "new_items"
+NUCLEUS_ENDPOINT = "https://api.scale.com/v1/nucleus"
 ORIGINAL_IMAGE_URL_KEY = "original_image_url"
-X_KEY = "x"
-Y_KEY = "y"
-WIDTH_KEY = "width"
-HEIGHT_KEY = "height"
+PREDICTIONS_IGNORED_KEY = "predictions_ignored"
+PREDICTIONS_PROCESSED_KEY = "predictions_processed"
+REFERENCE_IDS_KEY = "reference_ids"
+REFERENCE_ID_KEY = "reference_id"
+REQUEST_ID_KEY = "requestId"
+SEGMENTATIONS_KEY = "segmentations"
+SLICE_ID_KEY = "slice_id"
+STATUS_CODE_KEY = "status_code"
+STATUS_KEY = "status"
+SUCCESS_STATUS_CODES = [200, 201, 202]
 TYPE_KEY = "type"
+UPDATED_ITEMS = "updated_items"
+UPDATE_KEY = "update"
 VERTICES_KEY = "vertices"
-BOX_TYPE = "box"
-POLYGON_TYPE = "polygon"
-SEGMENTATION_TYPE = "segmentation"
-ANNOTATION_TYPES = (BOX_TYPE, POLYGON_TYPE, SEGMENTATION_TYPE)
-GEOMETRY_KEY = "geometry"
-AUTOTAGS_KEY = "autotags"
-ANNOTATION_METADATA_SCHEMA_KEY = "annotation_metadata_schema"
-ITEM_METADATA_SCHEMA_KEY = "item_metadata_schema"
-MASK_URL_KEY = "mask_url"
-INDEX_KEY = "index"
-SEGMENTATIONS_KEY = "segmentations"
-EMBEDDINGS_URL_KEY = "embeddings_url"
-JOB_ID_KEY = "job_id"
-MESSAGE_KEY = "message"
+WIDTH_KEY = "width"
+X_KEY = "x"
+Y_KEY = "y"
diff --git a/nucleus/dataset.py b/nucleus/dataset.py
index 79bc8d09..a3206137 100644
--- a/nucleus/dataset.py
+++ b/nucleus/dataset.py
@@ -1,8 +1,13 @@
-from typing import Any, Dict, List, Optional
+import uuid
+from typing import Any, Dict, List, Optional, Union

 import requests

-from nucleus.utils import format_dataset_item_response
+from nucleus.job import AsyncJob
+from nucleus.utils import (
+    format_dataset_item_response,
+    serialize_and_write_to_presigned_url,
+)

 from .annotation import Annotation
 from .constants import (
@@ -14,8 +19,10 @@
     DEFAULT_ANNOTATION_UPDATE_MODE,
     NAME_KEY,
     REFERENCE_IDS_KEY,
+    REQUEST_ID_KEY,
+    UPDATE_KEY,
 )
-from .dataset_item import DatasetItem
+from .dataset_item import DatasetItem, check_all_paths_remote
 from .payload_constructor import construct_model_run_creation_payload


@@ -26,7 +33,11 @@ class Dataset:
     compare model performance on you data.
     """

-    def __init__(self, dataset_id: str, client):
+    def __init__(
+        self,
+        dataset_id: str,
+        client: "NucleusClient",  # type:ignore # noqa: F821
+    ):
         self.id = dataset_id
         self._client = client

@@ -160,16 +171,18 @@ def ingest_tasks(self, task_ids: dict):
     def append(
         self,
         dataset_items: List[DatasetItem],
-        force: Optional[bool] = False,
+        update: Optional[bool] = False,
         batch_size: Optional[int] = 20,
-    ) -> dict:
+        asynchronous=False,
+    ) -> Union[dict, AsyncJob]:
         """
         Appends images with metadata (dataset items) to the dataset. Overwrites images on collision if forced.

         Parameters:
         :param dataset_items: items to upload
-        :param force: if True overwrites images on collision
+        :param update: if True overwrites images and metadata on collision
         :param batch_size: batch parameter for long uploads
+        :param asynchronous: if True, return a job object representing asynchronous ingestion job.
         :return:
         {
             'dataset_id': str,
@@ -178,10 +191,29 @@ def append(
             'ignored_items': int,
         }
         """
+        if asynchronous:
+            check_all_paths_remote(dataset_items)
+            request_id = uuid.uuid4().hex
+            response = self._client.make_request(
+                payload={},
+                route=f"dataset/{self.id}/signedUrl/{request_id}",
+                requests_command=requests.get,
+            )
+            print(response["signed_url"])
+
+            serialize_and_write_to_presigned_url(
+                dataset_items, response["signed_url"]
+            )
+            response = self._client.make_request(
+                payload={REQUEST_ID_KEY: request_id, UPDATE_KEY: update},
+                route=f"dataset/{self.id}/append?async=1",
+            )
+            return AsyncJob(response["job_id"], self._client)
+
         return self._client.populate_dataset(
             self.id,
             dataset_items,
-            force=force,
+            force=update,
             batch_size=batch_size,
         )

diff --git a/nucleus/dataset_item.py b/nucleus/dataset_item.py
index 2a5fc6b6..e18bbd96 100644
--- a/nucleus/dataset_item.py
+++ b/nucleus/dataset_item.py
@@ -1,7 +1,7 @@
 import json
 import os.path
 from dataclasses import dataclass
-from typing import Optional
+from typing import List, Optional

 from .constants import (
     DATASET_ITEM_ID_KEY,
@@ -22,7 +22,7 @@ class DatasetItem:

     def __post_init__(self):
         self.image_url = self.image_location
-        self.local = self._is_local_path(self.image_location)
+        self.local = is_local_path(self.image_location)

     @classmethod
     def from_json(cls, payload: dict):
@@ -36,10 +36,6 @@ def from_json(cls, payload: dict):
             metadata=payload.get(METADATA_KEY, {}),
         )

-    def _is_local_path(self, path: str) -> bool:
-        path_components = [comp.lower() for comp in path.split("/")]
-        return path_components[0] not in {"https:", "http:", "s3:", "gs:"}
-
     def local_file_exists(self):
         return os.path.isfile(self.image_url)

@@ -56,3 +52,17 @@ def to_payload(self) -> dict:

     def to_json(self) -> str:
         return json.dumps(self.to_payload())
+
+
+def is_local_path(path: str) -> bool:
+    path_components = [comp.lower() for comp in path.split("/")]
+    return path_components[0] not in {"https:", "http:", "s3:", "gs:"}
+
+
+def check_all_paths_remote(dataset_items: List[DatasetItem]):
+    for item in dataset_items:
+        if is_local_path(item.image_location):
+            raise ValueError(
+                f"All paths must be remote, but {item.image_location} is either "
+                "local, or a remote URL type that is not supported."
+            )
diff --git a/nucleus/job.py b/nucleus/job.py
new file mode 100644
index 00000000..2319d3f0
--- /dev/null
+++ b/nucleus/job.py
@@ -0,0 +1,51 @@
+from dataclasses import dataclass
+import time
+from typing import Dict, List
+
+import requests
+
+
+@dataclass
+class AsyncJob:
+    id: str
+    client: "NucleusClient"  # type: ignore # noqa: F821
+
+    def status(self) -> Dict[str, str]:
+        return self.client.make_request(
+            payload={},
+            route=f"job/{self.id}",
+            requests_command=requests.get,
+        )
+
+    def errors(self) -> List[str]:
+        return self.client.make_request(
+            payload={},
+            route=f"job/${self.id}/errors",
+            requests_command=requests.get,
+        )
+
+    def sleep_until_complete(self, verbose_std_out=True):
+        while 1:
+            time.sleep(1)
+            status = self.status()
+            if status["status"] == "Running":
+                if verbose_std_out:
+                    print(status)
+                continue
+            break
+        final_status = status
+        if final_status["status"] == "Errored":
+            raise JobError(final_status, self)
+
+
+class JobError(Exception):
+    def __init__(self, job_status: Dict[str, str], job: AsyncJob):
+        final_status_message = job_status["message"]
+        final_status = job_status["status"]
+        message = (
+            f"The job reported a final status of {final_status} "
+            "This could, however, mean a partial success with some successes and some failures. "
+            f"The final status message was: {final_status_message} \n"
+            f"For more detailed error messages you can call {str(job)}.errors()"
+        )
+        super().__init__(message)
diff --git a/nucleus/payload_constructor.py b/nucleus/payload_constructor.py
index c467604d..efe847a7 100644
--- a/nucleus/payload_constructor.py
+++ b/nucleus/payload_constructor.py
@@ -17,7 +17,7 @@
     REFERENCE_ID_KEY,
     ANNOTATIONS_KEY,
     ITEMS_KEY,
-    FORCE_KEY,
+    UPDATE_KEY,
     MODEL_ID_KEY,
     ANNOTATION_METADATA_SCHEMA_KEY,
     SEGMENTATIONS_KEY,
@@ -34,7 +34,7 @@ def construct_append_payload(
     return (
         {ITEMS_KEY: items}
         if not force
-        else {ITEMS_KEY: items, FORCE_KEY: True}
+        else {ITEMS_KEY: items, UPDATE_KEY: True}
     )


diff --git a/nucleus/utils.py b/nucleus/utils.py
index 56e6fa90..1736b29d 100644
--- a/nucleus/utils.py
+++ b/nucleus/utils.py
@@ -1,18 +1,18 @@
 """Shared stateless utility function library"""

-from typing import List, Union, Dict
+import io
+from typing import IO, Dict, List, Sequence, Union
+
+import requests
+from requests.models import HTTPError

 from nucleus.annotation import Annotation
+
+from .constants import ANNOTATION_TYPES, ANNOTATIONS_KEY, ITEM_KEY
 from .dataset_item import DatasetItem
 from .prediction import BoxPrediction, PolygonPrediction
-from .constants import (
-    ITEM_KEY,
-    ANNOTATIONS_KEY,
-    ANNOTATION_TYPES,
-)
-

 def _get_all_field_values(metadata_list: List[dict], key: str):
     return {metadata[key] for metadata in metadata_list if key in metadata}
@@ -73,9 +73,9 @@ def format_dataset_item_response(response: dict) -> dict:


 def serialize_and_write(
-    upload_unit: List[Union[DatasetItem, Annotation]], file_pointer
+    upload_units: Sequence[Union[DatasetItem, Annotation]], file_pointer
 ):
-    for unit in upload_unit:
+    for unit in upload_units:
         try:
             file_pointer.write(unit.to_json() + "\n")
         except TypeError as e:
@@ -92,3 +92,21 @@
             )
             message += f"The specific error was {e}"
             raise ValueError(message) from e
+
+
+def upload_to_presigned_url(presigned_url: str, file_pointer: IO):
+    # TODO optimize this further to deal with truly huge files and flaky internet connection.
+    upload_response = requests.put(presigned_url, file_pointer)
+    if not upload_response.ok:
+        raise HTTPError(
+            f"Tried to put a file to url, but failed with status {upload_response.status_code}. The detailed error was: {upload_response.text}"
+        )
+
+
+def serialize_and_write_to_presigned_url(
+    upload_units: Sequence[Union[DatasetItem, Annotation]], presigned_url
+):
+    strio = io.StringIO()
+    serialize_and_write(upload_units, strio)
+    strio.seek(0)
+    upload_to_presigned_url(presigned_url, strio)
diff --git a/pyproject.toml b/pyproject.toml
index c2ede34f..06a52ba1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,6 +47,7 @@ flake8 = "^3.9.1"
 mypy = "^0.812"
 coverage = "^5.5"
 pre-commit = "^2.12.1"
+jupyterlab = "^3.0.14"


 [build-system]
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_annotation.py b/tests/test_annotation.py
index 0552cd3f..f0e3c772 100644
--- a/tests/test_annotation.py
+++ b/tests/test_annotation.py
@@ -1,6 +1,6 @@
 import pytest

-from helpers import (
+from .helpers import (
     TEST_DATASET_NAME,
     TEST_IMG_URLS,
     TEST_BOX_ANNOTATIONS,
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index 7e7006ef..75e29b01 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -1,7 +1,7 @@
 import pytest
 import os

-from helpers import (
+from .helpers import (
     TEST_SLICE_NAME,
     TEST_DATASET_NAME,
     TEST_IMG_URLS,
@@ -58,7 +58,29 @@ def dataset(CLIENT):
     yield ds

     response = CLIENT.delete_dataset(ds.id)
-    assert response == {}
+    assert response == {"message": "Beginning dataset deletion..."}
+
+
+def make_dataset_items():
+    ds_items_with_metadata = []
+    for i, url in enumerate(TEST_IMG_URLS):
+        ds_items_with_metadata.append(
+            DatasetItem(
+                image_location=url,
+                reference_id=reference_id_from_url(url),
+                metadata={
+                    "made_with_pytest": True,
+                    "example_int": i,
+                    "example_str": "hello",
+                    "example_float": 0.5,
+                    "example_dict": {
+                        "nested": True,
+                    },
+                    "example_list": ["hello", i, False],
+                },
+            )
+        )
+    return ds_items_with_metadata


 def test_dataset_create_and_delete(CLIENT):
@@ -95,26 +117,8 @@ def check_is_expected_response(response):
         check_is_expected_response(response)

     # With reference ids and metadata:
-    ds_items_with_metadata = []
-    for i, url in enumerate(TEST_IMG_URLS):
-        ds_items_with_metadata.append(
-            DatasetItem(
-                image_location=url,
-                reference_id=reference_id_from_url(url),
-                metadata={
-                    "made_with_pytest": True,
-                    "example_int": i,
-                    "example_str": "hello",
-                    "example_float": 0.5,
-                    "example_dict": {
-                        "nested": True,
-                    },
-                    "example_list": ["hello", i, False],
-                },
-            )
-        )
-    response = dataset.append(ds_items_with_metadata)
+    response = dataset.append(make_dataset_items())

     check_is_expected_response(response)

@@ -131,6 +135,26 @@ def test_dataset_append_local(CLIENT, dataset):
     assert ERROR_PAYLOAD not in resp_json


+def test_dataset_append_async(dataset: Dataset):
+    job = dataset.append(make_dataset_items(), asynchronous=True)
+    job.sleep_until_complete()
+    status = job.status()
+    assert status == {
+        "job_id": job.id,
+        "status": "Completed",
+        "message": {
+            "image_upload_step": {"errored": 0, "pending": 0, "completed": 5},
+            "started_image_processing": f"Dataset: {dataset.id}, Job: {job.id}",
+            "ingest_to_reupload_queue": {
+                "epoch": 1,
+                "total": 5,
+                "datasetId": f"{dataset.id}",
+                "processed": 5,
+            },
+        },
+    }
+
+
 def test_dataset_list_autotags(CLIENT, dataset):
     # Creation
     # List of Autotags should be empty
diff --git a/tests/test_indexing.py b/tests/test_indexing.py
index ee341ab7..9ad61eec 100644
--- a/tests/test_indexing.py
+++ b/tests/test_indexing.py
@@ -1,6 +1,6 @@
 import pytest

-from helpers import (
+from .helpers import (
     TEST_INDEX_EMBEDDINGS_FILE,
     TEST_IMG_URLS,
     TEST_DATASET_NAME,
diff --git a/tests/test_models.py b/tests/test_models.py
index 082229c1..b4a05092 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -18,7 +18,7 @@
     ERROR_PAYLOAD,
     DATASET_ID_KEY,
 )
-from helpers import (
+from .helpers import (
     TEST_MODEL_NAME,
     TEST_MODEL_RUN,
     TEST_PREDS,
diff --git a/tests/test_prediction.py b/tests/test_prediction.py
index bfcd459f..04a88636 100644
--- a/tests/test_prediction.py
+++ b/tests/test_prediction.py
@@ -1,6 +1,6 @@
 import pytest
 import time
-from helpers import (
+from .helpers import (
     TEST_DATASET_NAME,
     TEST_MODEL_NAME,
     TEST_MODEL_RUN,
diff --git a/tests/test_slice.py b/tests/test_slice.py
index 6826008c..dc4520fc 100644
--- a/tests/test_slice.py
+++ b/tests/test_slice.py
@@ -1,7 +1,7 @@
 import pytest
 from nucleus import Slice, NucleusClient, DatasetItem, BoxAnnotation
 from nucleus.constants import ERROR_PAYLOAD, ITEM_KEY
-from helpers import (
+from .helpers import (
     TEST_DATASET_NAME,
     TEST_IMG_URLS,
     TEST_SLICE_NAME,

From e6980b8e7df5fb1ece7e418e0f35a1f7d8ed1982 Mon Sep 17 00:00:00 2001
From: Diego Ardila
Date: Tue, 11 May 2021 10:13:02 -0700
Subject: [PATCH 2/8] added error test cases

---
 nucleus/dataset_item.py |  5 ++---
 nucleus/job.py          |  7 ++++---
 tests/test_dataset.py   | 41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/nucleus/dataset_item.py b/nucleus/dataset_item.py
index e18bbd96..490785c2 100644
--- a/nucleus/dataset_item.py
+++ b/nucleus/dataset_item.py
@@ -21,7 +21,6 @@ class DatasetItem:
     metadata: Optional[dict] = None

     def __post_init__(self):
-        self.image_url = self.image_location
         self.local = is_local_path(self.image_location)

     @classmethod
     def from_json(cls, payload: dict):
@@ -37,11 +36,11 @@ def from_json(cls, payload: dict):
         )

     def local_file_exists(self):
-        return os.path.isfile(self.image_url)
+        return os.path.isfile(self.image_location)

     def to_payload(self) -> dict:
         payload = {
-            IMAGE_URL_KEY: self.image_url,
+            IMAGE_URL_KEY: self.image_location,
             METADATA_KEY: self.metadata or {},
         }
         if self.reference_id:
diff --git a/nucleus/job.py b/nucleus/job.py
index 2319d3f0..9f338ff7 100644
--- a/nucleus/job.py
+++ b/nucleus/job.py
@@ -20,7 +20,7 @@ def status(self) -> Dict[str, str]:
     def errors(self) -> List[str]:
         return self.client.make_request(
             payload={},
-            route=f"job/${self.id}/errors",
+            route=f"job/{self.id}/errors",
             requests_command=requests.get,
         )

@@ -28,11 +28,12 @@ def sleep_until_complete(self, verbose_std_out=True):
         while 1:
             time.sleep(1)
             status = self.status()
+            if verbose_std_out:
+                print(status)
             if status["status"] == "Running":
-                if verbose_std_out:
-                    print(status)
                 continue
             break
+
         final_status = status
         if final_status["status"] == "Errored":
             raise JobError(final_status, self)
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index 75e29b01..7809de8f 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -1,3 +1,4 @@
+from nucleus.job import JobError
 import pytest
 import os

@@ -155,6 +156,46 @@ def test_dataset_append_async(dataset: Dataset):
     }


+def test_dataset_append_async_with_local_path(dataset: Dataset):
+    ds_items = make_dataset_items()
+    ds_items[
+        0
+    ].image_location = "/a/fake/local/path/you/can/tell/is/local/but/is/fake"
+    with pytest.raises(ValueError):
+        dataset.append(ds_items, asynchronous=True)
+
+
+def test_dataset_append_async_with_1_bad_url(dataset: Dataset):
+    ds_items = make_dataset_items()
+    ds_items[0].image_location = "https://looks.ok.but.is.not.accessible"
+    job = dataset.append(ds_items, asynchronous=True)
+    with pytest.raises(JobError):
+        job.sleep_until_complete()
+    assert job.status() == {
+        "job_id": f"{job.id}",
+        "status": "Errored",
+        "message": {
+            "final_error": (
+                "One or more of the images you attempted to upload did not process"
+                " correctly. Please see the status for an overview and the errors for "
+                "more detailed messages."
+            ),
+            "image_upload_step": {"errored": 1, "pending": 0, "completed": 4},
+            "ingest_to_reupload_queue": {
+                "epoch": 1,
+                "total": 5,
+                "datasetId": f"{dataset.id}",
+                "processed": 5,
+            },
+            "started_image_processing": f"Dataset: {dataset.id}, Job: {job.id}",
+        },
+    }
+    assert job.errors() == [
+        "One or more of the images you attempted to upload did not process correctly. Please see the status for an overview and the errors for more detailed messages.",
+        'Failure when processing the image "https://looks.ok.but.is.not.accessible": {}',
+    ]
+
+
 def test_dataset_list_autotags(CLIENT, dataset):
     # Creation
     # List of Autotags should be empty

From 53e2271a74641aacf80fd97da92d116d9d93e146 Mon Sep 17 00:00:00 2001
From: Diego Ardila
Date: Tue, 11 May 2021 10:13:49 -0700
Subject: [PATCH 3/8] Add todo

---
 tests/test_dataset.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index 7809de8f..8c62c4ad 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -192,6 +192,7 @@ def test_dataset_append_async_with_1_bad_url(dataset: Dataset):
     }
     assert job.errors() == [
         "One or more of the images you attempted to upload did not process correctly. Please see the status for an overview and the errors for more detailed messages.",
+        # Todo: figure out why this error isn't propagating from image upload.
         'Failure when processing the image "https://looks.ok.but.is.not.accessible": {}',
     ]


From 1672a1429cecbe08c68b61e5f4c708e5580cf2d6 Mon Sep 17 00:00:00 2001
From: Diego Ardila
Date: Tue, 11 May 2021 10:19:31 -0700
Subject: [PATCH 4/8] bump version to prepare for eventual release

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 06a52ba1..2a32c52e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,7 +21,7 @@ exclude = '''

 [tool.poetry]
 name = "scale-nucleus"
-version = "0.1.3"
+version = "0.1.4"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license = "MIT"
 authors = ["Scale AI Nucleus Team "]

From fcc648f5ac20d1d9772e7389d1b0719f2f609f0a Mon Sep 17 00:00:00 2001
From: Diego Ardila
Date: Wed, 12 May 2021 14:26:39 -0700
Subject: [PATCH 5/8] reduced polling interval and tested against prod

---
 nucleus/job.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/nucleus/job.py b/nucleus/job.py
index 9f338ff7..3e24bb39 100644
--- a/nucleus/job.py
+++ b/nucleus/job.py
@@ -4,6 +4,8 @@

 import requests

+JOB_POLLING_INTERVAL = 5
+

 @dataclass
 class AsyncJob:
@@ -26,10 +28,12 @@ def errors(self) -> List[str]:

     def sleep_until_complete(self, verbose_std_out=True):
         while 1:
-            time.sleep(1)
             status = self.status()
+
+            time.sleep(JOB_POLLING_INTERVAL)
+
             if verbose_std_out:
-                print(status)
+                print(f"Status at {time.ctime()}: {status}")
             if status["status"] == "Running":
                 continue
             break

From 934e22a28abbb2ea6bf2a510e63421253c06eb2f Mon Sep 17 00:00:00 2001
From: Diego Ardila
Date: Wed, 12 May 2021 16:19:05 -0700
Subject: [PATCH 6/8] Don't install jupyterlab in CI + urllibparse

---
 .circleci/config.yml    | 2 +-
 nucleus/dataset_item.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 39fb5616..e63e5255 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -13,7 +13,7 @@ jobs:
          command: | # install dependencies
            pip install --upgrade pip
            pip install poetry
-           poetry install
+           poetry install --no-dev
      - run:
          name: Black Formatting Check # Only validation, without re-formatting
          command: |
diff --git a/nucleus/dataset_item.py b/nucleus/dataset_item.py
index ab1d7673..91fdac52 100644
--- a/nucleus/dataset_item.py
+++ b/nucleus/dataset_item.py
@@ -3,6 +3,7 @@
 import os.path
 from dataclasses import dataclass
 from typing import Optional, Sequence
+from urllib.parse import urlparse

 from .constants import (
     DATASET_ITEM_ID_KEY,
@@ -55,8 +56,7 @@ def to_json(self) -> str:


 def is_local_path(path: str) -> bool:
-    path_components = [comp.lower() for comp in path.split("/")]
-    return path_components[0] not in {"https:", "http:", "s3:", "gs:"}
+    return urlparse(path).scheme not in {"https", "http", "s3", "gs"}


 def check_all_paths_remote(dataset_items: Sequence[DatasetItem]):

From 03878b3316c1f5676a8f4e89f301a6dbe2e2fc47 Mon Sep 17 00:00:00 2001
From: Diego Ardila
Date: Wed, 12 May 2021 16:28:17 -0700
Subject: [PATCH 7/8] remove jupyter but still install things we need for test

---
 .circleci/config.yml | 1 +
 pyproject.toml       | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index e63e5255..851e90d8 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -14,6 +14,7 @@ jobs:
            pip install --upgrade pip
            pip install poetry
            poetry install --no-dev
+
      - run:
          name: Black Formatting Check # Only validation, without re-formatting
          command: |
diff --git a/pyproject.toml b/pyproject.toml
index 2a32c52e..6c002276 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,7 +47,6 @@ flake8 = "^3.9.1"
 mypy = "^0.812"
 coverage = "^5.5"
 pre-commit = "^2.12.1"
-jupyterlab = "^3.0.14"


 [build-system]

From 854462d61b176bf3fbf49619dcc6ade065704817 Mon Sep 17 00:00:00 2001
From: Diego Ardila
Date: Wed, 12 May 2021 16:30:05 -0700
Subject: [PATCH 8/8] restore dev requirements

---
 .circleci/config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 851e90d8..22d30747 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -13,7 +13,7 @@ jobs:
          command: | # install dependencies
            pip install --upgrade pip
            pip install poetry
-           poetry install --no-dev
+           poetry install
      - run:
          name: Black Formatting Check # Only validation, without re-formatting
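
Usage sketch for the series: the patches above add an asynchronous ingest path, where Dataset.append(..., asynchronous=True) returns an AsyncJob that can be polled. A minimal end-to-end sketch follows; the API key, dataset ID, and image URL are placeholders, and retrieving an existing dataset via NucleusClient.get_dataset is assumed to behave as in the released client.

import nucleus
from nucleus import DatasetItem
from nucleus.job import JobError

# Placeholder credentials and identifiers, shown for illustration only.
client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")
dataset = client.get_dataset("ds_example_id")  # assumed helper returning an existing Dataset

items = [
    DatasetItem(
        image_location="https://example.com/image_0.jpg",  # async uploads require remote paths
        reference_id="image_0",
        metadata={"split": "train"},
    )
]

# asynchronous=True returns an AsyncJob instead of a synchronous response dict.
job = dataset.append(items, update=True, asynchronous=True)

try:
    job.sleep_until_complete()  # polls job.status() until it leaves the "Running" state
except JobError:
    print(job.errors())  # per-item error messages reported by the ingest job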