From 90e47b7a5f2603ad4e62606b1a2f60f860f78fb9 Mon Sep 17 00:00:00 2001
From: Diego Ardila
Date: Tue, 11 May 2021 03:45:50 -0700
Subject: [PATCH 1/8] passing integration test

---
 nucleus/__init__.py            |  12 ++--
 nucleus/constants.py           | 106 +++++++++++++++++----------------
 nucleus/dataset.py             |  48 ++++++++++++---
 nucleus/dataset_item.py        |  22 +++++--
 nucleus/job.py                 |  51 ++++++++++++++++
 nucleus/payload_constructor.py |   4 +-
 nucleus/utils.py               |  36 ++++++++---
 pyproject.toml                 |   1 +
 tests/__init__.py              |   0
 tests/test_annotation.py       |   2 +-
 tests/test_dataset.py          |  66 +++++++++++++-------
 tests/test_indexing.py         |   2 +-
 tests/test_models.py           |   2 +-
 tests/test_prediction.py       |   2 +-
 tests/test_slice.py            |   2 +-
 15 files changed, 248 insertions(+), 108 deletions(-)
 create mode 100644 nucleus/job.py
 create mode 100644 tests/__init__.py

diff --git a/nucleus/__init__.py b/nucleus/__init__.py
index a6d07eba..94144de2 100644
--- a/nucleus/__init__.py
+++ b/nucleus/__init__.py
@@ -70,7 +70,7 @@
 # pylint: disable=C0302

 from requests.packages.urllib3.util.retry import Retry
-from .constants import REFERENCE_IDS_KEY, DATASET_ITEM_IDS_KEY
+from .constants import REFERENCE_IDS_KEY, DATASET_ITEM_IDS_KEY, UPDATE_KEY
 from .dataset import Dataset
 from .dataset_item import DatasetItem
 from .annotation import (
@@ -123,7 +123,6 @@
     AUTOTAGS_KEY,
     ANNOTATION_METADATA_SCHEMA_KEY,
     ITEM_METADATA_SCHEMA_KEY,
-    FORCE_KEY,
     EMBEDDINGS_URL_KEY,
 )
 from .model import Model
@@ -151,11 +150,14 @@ def __init__(
         self,
         api_key: str,
         use_notebook: bool = False,
-        endpoint=NUCLEUS_ENDPOINT,
+        endpoint: str = None,
     ):
         self.api_key = api_key
         self.tqdm_bar = tqdm.tqdm
-        self.endpoint = endpoint
+        if endpoint is None:
+            self.endpoint = os.environ.get(
+                "NUCLEUS_ENDPOINT", NUCLEUS_ENDPOINT
+            )
         self._use_notebook = use_notebook
         if use_notebook:
             self.tqdm_bar = tqdm_notebook.tqdm
@@ -497,7 +499,7 @@ def exception_handler(request, exception):
         items = payload[ITEMS_KEY]
         payloads = [
             # batch_size images per request
-            {ITEMS_KEY: items[i : i + batch_size], FORCE_KEY: update}
+            {ITEMS_KEY: items[i : i + batch_size], UPDATE_KEY: update}
             for i in range(0, len(items), batch_size)
         ]

diff --git a/nucleus/constants.py b/nucleus/constants.py
index 666ceb34..5172068d 100644
--- a/nucleus/constants.py
+++ b/nucleus/constants.py
@@ -1,63 +1,65 @@
-NUCLEUS_ENDPOINT = "https://api.scale.com/v1/nucleus"
-DEFAULT_NETWORK_TIMEOUT_SEC = 120
-ITEMS_KEY = "items"
-ITEM_KEY = "item"
-REFERENCE_ID_KEY = "reference_id"
-REFERENCE_IDS_KEY = "reference_ids"
-DATASET_ID_KEY = "dataset_id"
-IMAGE_KEY = "image"
-IMAGE_URL_KEY = "image_url"
-NEW_ITEMS = "new_items"
-UPDATED_ITEMS = "updated_items"
-IGNORED_ITEMS = "ignored_items"
-ERROR_ITEMS = "upload_errors"
-ERROR_PAYLOAD = "error_payload"
-ERROR_CODES = "error_codes"
+ANNOTATIONS_IGNORED_KEY = "annotations_ignored"
 ANNOTATIONS_KEY = "annotations"
-ANNOTATION_ID_KEY = "annotation_id"
 ANNOTATIONS_PROCESSED_KEY = "annotations_processed"
-ANNOTATIONS_IGNORED_KEY = "annotations_ignored"
-PREDICTIONS_PROCESSED_KEY = "predictions_processed"
-PREDICTIONS_IGNORED_KEY = "predictions_ignored"
+ANNOTATION_ID_KEY = "annotation_id"
+ANNOTATION_METADATA_SCHEMA_KEY = "annotation_metadata_schema"
+BOX_TYPE = "box"
+POLYGON_TYPE = "polygon"
+SEGMENTATION_TYPE = "segmentation"
+ANNOTATION_TYPES = (BOX_TYPE, POLYGON_TYPE, SEGMENTATION_TYPE)
 ANNOTATION_UPDATE_KEY = "update"
-DEFAULT_ANNOTATION_UPDATE_MODE = False
-STATUS_CODE_KEY = "status_code"
-STATUS_KEY = "status"
-SUCCESS_STATUS_CODES = [200, 201, 202]
-ERRORS_KEY = "errors"
-MODEL_RUN_ID_KEY = "model_run_id"
-MODEL_ID_KEY = "model_id"
-DATASET_ITEM_ID_KEY = "dataset_item_id"
-ITEM_ID_KEY = "item_id"
+AUTOTAGS_KEY = "autotags"
+
+CONFIDENCE_KEY = "confidence"
+DATASET_ID_KEY = "dataset_id"
 DATASET_ITEM_IDS_KEY = "dataset_item_ids"
-SLICE_ID_KEY = "slice_id"
-DATASET_NAME_KEY = "name"
+DATASET_ITEM_ID_KEY = "dataset_item_id"
+DATASET_LENGTH_KEY = "length"
 DATASET_MODEL_RUNS_KEY = "model_run_ids"
+DATASET_NAME_KEY = "name"
 DATASET_SLICES_KEY = "slice_ids"
-DATASET_LENGTH_KEY = "length"
-FORCE_KEY = "update"
+DEFAULT_ANNOTATION_UPDATE_MODE = False
+DEFAULT_NETWORK_TIMEOUT_SEC = 120
+EMBEDDINGS_URL_KEY = "embeddings_url"
+ERRORS_KEY = "errors"
+ERROR_CODES = "error_codes"
+ERROR_ITEMS = "upload_errors"
+ERROR_PAYLOAD = "error_payload"
+GEOMETRY_KEY = "geometry"
+HEIGHT_KEY = "height"
+IGNORED_ITEMS = "ignored_items"
+IMAGE_KEY = "image"
+IMAGE_URL_KEY = "image_url"
+INDEX_KEY = "index"
+ITEMS_KEY = "items"
+ITEM_ID_KEY = "item_id"
+ITEM_KEY = "item"
+ITEM_METADATA_SCHEMA_KEY = "item_metadata_schema"
+JOB_ID_KEY = "job_id"
+LABEL_KEY = "label"
+MASK_URL_KEY = "mask_url"
+MESSAGE_KEY = "message"
 METADATA_KEY = "metadata"
+MODEL_ID_KEY = "model_id"
+MODEL_RUN_ID_KEY = "model_run_id"
 NAME_KEY = "name"
-LABEL_KEY = "label"
-CONFIDENCE_KEY = "confidence"
+NEW_ITEMS = "new_items"
+NUCLEUS_ENDPOINT = "https://api.scale.com/v1/nucleus"
 ORIGINAL_IMAGE_URL_KEY = "original_image_url"
-X_KEY = "x"
-Y_KEY = "y"
-WIDTH_KEY = "width"
-HEIGHT_KEY = "height"
+PREDICTIONS_IGNORED_KEY = "predictions_ignored"
+PREDICTIONS_PROCESSED_KEY = "predictions_processed"
+REFERENCE_IDS_KEY = "reference_ids"
+REFERENCE_ID_KEY = "reference_id"
+REQUEST_ID_KEY = "requestId"
+SEGMENTATIONS_KEY = "segmentations"
+SLICE_ID_KEY = "slice_id"
+STATUS_CODE_KEY = "status_code"
+STATUS_KEY = "status"
+SUCCESS_STATUS_CODES = [200, 201, 202]
 TYPE_KEY = "type"
+UPDATED_ITEMS = "updated_items"
+UPDATE_KEY = "update"
 VERTICES_KEY = "vertices"
-BOX_TYPE = "box"
-POLYGON_TYPE = "polygon"
-SEGMENTATION_TYPE = "segmentation"
-ANNOTATION_TYPES = (BOX_TYPE, POLYGON_TYPE, SEGMENTATION_TYPE)
-GEOMETRY_KEY = "geometry"
-AUTOTAGS_KEY = "autotags"
-ANNOTATION_METADATA_SCHEMA_KEY = "annotation_metadata_schema"
-ITEM_METADATA_SCHEMA_KEY = "item_metadata_schema"
-MASK_URL_KEY = "mask_url"
-INDEX_KEY = "index"
-SEGMENTATIONS_KEY = "segmentations"
-EMBEDDINGS_URL_KEY = "embeddings_url"
-JOB_ID_KEY = "job_id"
-MESSAGE_KEY = "message"
+WIDTH_KEY = "width"
+X_KEY = "x"
+Y_KEY = "y"
diff --git a/nucleus/dataset.py b/nucleus/dataset.py
index 79bc8d09..a3206137 100644
--- a/nucleus/dataset.py
+++ b/nucleus/dataset.py
@@ -1,8 +1,13 @@
-from typing import Any, Dict, List, Optional
+import uuid
+from typing import Any, Dict, List, Optional, Union

 import requests

-from nucleus.utils import format_dataset_item_response
+from nucleus.job import AsyncJob
+from nucleus.utils import (
+    format_dataset_item_response,
+    serialize_and_write_to_presigned_url,
+)

 from .annotation import Annotation
 from .constants import (
@@ -14,8 +19,10 @@
     DEFAULT_ANNOTATION_UPDATE_MODE,
     NAME_KEY,
     REFERENCE_IDS_KEY,
+    REQUEST_ID_KEY,
+    UPDATE_KEY,
 )
-from .dataset_item import DatasetItem
+from .dataset_item import DatasetItem, check_all_paths_remote
 from .payload_constructor import construct_model_run_creation_payload


@@ -26,7 +33,11 @@ class Dataset:
     compare model performance on you data.
     """

-    def __init__(self, dataset_id: str, client):
+    def __init__(
+        self,
+        dataset_id: str,
+        client: "NucleusClient",  # type:ignore # noqa: F821
+    ):
         self.id = dataset_id
         self._client = client

@@ -160,16 +171,18 @@ def ingest_tasks(self, task_ids: dict):
     def append(
         self,
         dataset_items: List[DatasetItem],
-        force: Optional[bool] = False,
+        update: Optional[bool] = False,
         batch_size: Optional[int] = 20,
-    ) -> dict:
+        asynchronous=False,
+    ) -> Union[dict, AsyncJob]:
         """
         Appends images with metadata (dataset items) to the dataset. Overwrites images on collision if forced.

         Parameters:
         :param dataset_items: items to upload
-        :param force: if True overwrites images on collision
+        :param update: if True overwrites images and metadata on collision
         :param batch_size: batch parameter for long uploads
+        :param asynchronous: if True, return a job object representing asynchronous ingestion job.
         :return:
         {
             'dataset_id': str,
@@ -178,10 +191,29 @@ def append(
             'ignored_items': int,
         }
         """
+        if asynchronous:
+            check_all_paths_remote(dataset_items)
+            request_id = uuid.uuid4().hex
+            response = self._client.make_request(
+                payload={},
+                route=f"dataset/{self.id}/signedUrl/{request_id}",
+                requests_command=requests.get,
+            )
+            print(response["signed_url"])
+
+            serialize_and_write_to_presigned_url(
+                dataset_items, response["signed_url"]
+            )
+            response = self._client.make_request(
+                payload={REQUEST_ID_KEY: request_id, UPDATE_KEY: update},
+                route=f"dataset/{self.id}/append?async=1",
+            )
+            return AsyncJob(response["job_id"], self._client)
+
         return self._client.populate_dataset(
             self.id,
             dataset_items,
-            force=force,
+            force=update,
             batch_size=batch_size,
         )

diff --git a/nucleus/dataset_item.py b/nucleus/dataset_item.py
index 2a5fc6b6..e18bbd96 100644
--- a/nucleus/dataset_item.py
+++ b/nucleus/dataset_item.py
@@ -1,7 +1,7 @@
 import json
 import os.path
 from dataclasses import dataclass
-from typing import Optional
+from typing import List, Optional

 from .constants import (
     DATASET_ITEM_ID_KEY,
@@ -22,7 +22,7 @@ class DatasetItem:

     def __post_init__(self):
         self.image_url = self.image_location
-        self.local = self._is_local_path(self.image_location)
+        self.local = is_local_path(self.image_location)

     @classmethod
     def from_json(cls, payload: dict):
@@ -36,10 +36,6 @@ def from_json(cls, payload: dict):
             metadata=payload.get(METADATA_KEY, {}),
         )

-    def _is_local_path(self, path: str) -> bool:
-        path_components = [comp.lower() for comp in path.split("/")]
-        return path_components[0] not in {"https:", "http:", "s3:", "gs:"}
-
     def local_file_exists(self):
         return os.path.isfile(self.image_url)

@@ -56,3 +52,17 @@ def to_payload(self) -> dict:

     def to_json(self) -> str:
         return json.dumps(self.to_payload())
+
+
+def is_local_path(path: str) -> bool:
+    path_components = [comp.lower() for comp in path.split("/")]
+    return path_components[0] not in {"https:", "http:", "s3:", "gs:"}
+
+
+def check_all_paths_remote(dataset_items: List[DatasetItem]):
+    for item in dataset_items:
+        if is_local_path(item.image_location):
+            raise ValueError(
+                f"All paths must be remote, but {item.image_location} is either "
+                "local, or a remote URL type that is not supported."
+            )
diff --git a/nucleus/job.py b/nucleus/job.py
new file mode 100644
index 00000000..2319d3f0
--- /dev/null
+++ b/nucleus/job.py
@@ -0,0 +1,51 @@
+from dataclasses import dataclass
+import time
+from typing import Dict, List
+
+import requests
+
+
+@dataclass
+class AsyncJob:
+    id: str
+    client: "NucleusClient"  # type: ignore # noqa: F821
+
+    def status(self) -> Dict[str, str]:
+        return self.client.make_request(
+            payload={},
+            route=f"job/{self.id}",
+            requests_command=requests.get,
+        )
+
+    def errors(self) -> List[str]:
+        return self.client.make_request(
+            payload={},
+            route=f"job/${self.id}/errors",
+            requests_command=requests.get,
+        )
+
+    def sleep_until_complete(self, verbose_std_out=True):
+        while 1:
+            time.sleep(1)
+            status = self.status()
+            if status["status"] == "Running":
+                if verbose_std_out:
+                    print(status)
+                continue
+            break
+        final_status = status
+        if final_status["status"] == "Errored":
+            raise JobError(final_status, self)
+
+
+class JobError(Exception):
+    def __init__(self, job_status: Dict[str, str], job: AsyncJob):
+        final_status_message = job_status["message"]
+        final_status = job_status["status"]
+        message = (
+            f"The job reported a final status of {final_status} "
+            "This could, however, mean a partial success with some successes and some failures. "
+            f"The final status message was: {final_status_message} \n"
+            f"For more detailed error messages you can call {str(job)}.errors()"
+        )
+        super().__init__(message)
diff --git a/nucleus/payload_constructor.py b/nucleus/payload_constructor.py
index c467604d..efe847a7 100644
--- a/nucleus/payload_constructor.py
+++ b/nucleus/payload_constructor.py
@@ -17,7 +17,7 @@
     REFERENCE_ID_KEY,
     ANNOTATIONS_KEY,
     ITEMS_KEY,
-    FORCE_KEY,
+    UPDATE_KEY,
     MODEL_ID_KEY,
     ANNOTATION_METADATA_SCHEMA_KEY,
     SEGMENTATIONS_KEY,
@@ -34,7 +34,7 @@ def construct_append_payload(
     return (
         {ITEMS_KEY: items}
         if not force
-        else {ITEMS_KEY: items, FORCE_KEY: True}
+        else {ITEMS_KEY: items, UPDATE_KEY: True}
     )


diff --git a/nucleus/utils.py b/nucleus/utils.py
index 56e6fa90..1736b29d 100644
--- a/nucleus/utils.py
+++ b/nucleus/utils.py
@@ -1,18 +1,18 @@
 """Shared stateless utility function library"""

-from typing import List, Union, Dict
+import io
+from typing import IO, Dict, List, Sequence, Union
+
+import requests
+from requests.models import HTTPError

 from nucleus.annotation import Annotation
+
+from .constants import ANNOTATION_TYPES, ANNOTATIONS_KEY, ITEM_KEY
 from .dataset_item import DatasetItem
 from .prediction import BoxPrediction, PolygonPrediction
-from .constants import (
-    ITEM_KEY,
-    ANNOTATIONS_KEY,
-    ANNOTATION_TYPES,
-)
-

 def _get_all_field_values(metadata_list: List[dict], key: str):
     return {metadata[key] for metadata in metadata_list if key in metadata}
@@ -73,9 +73,9 @@ def format_dataset_item_response(response: dict) -> dict:


 def serialize_and_write(
-    upload_unit: List[Union[DatasetItem, Annotation]], file_pointer
+    upload_units: Sequence[Union[DatasetItem, Annotation]], file_pointer
 ):
-    for unit in upload_unit:
+    for unit in upload_units:
         try:
             file_pointer.write(unit.to_json() + "\n")
         except TypeError as e:
@@ -92,3 +92,21 @@
             )
             message += f"The specific error was {e}"
             raise ValueError(message) from e
+
+
+def upload_to_presigned_url(presigned_url: str, file_pointer: IO):
+    # TODO optimize this further to deal with truly huge files and flaky internet connection.
+    upload_response = requests.put(presigned_url, file_pointer)
+    if not upload_response.ok:
+        raise HTTPError(
+            f"Tried to put a file to url, but failed with status {upload_response.status_code}. The detailed error was: {upload_response.text}"
+        )
+
+
+def serialize_and_write_to_presigned_url(
+    upload_units: Sequence[Union[DatasetItem, Annotation]], presigned_url
+):
+    strio = io.StringIO()
+    serialize_and_write(upload_units, strio)
+    strio.seek(0)
+    upload_to_presigned_url(presigned_url, strio)
diff --git a/pyproject.toml b/pyproject.toml
index c2ede34f..06a52ba1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,6 +47,7 @@ flake8 = "^3.9.1"
 mypy = "^0.812"
 coverage = "^5.5"
 pre-commit = "^2.12.1"
+jupyterlab = "^3.0.14"


 [build-system]
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_annotation.py b/tests/test_annotation.py
index 0552cd3f..f0e3c772 100644
--- a/tests/test_annotation.py
+++ b/tests/test_annotation.py
@@ -1,6 +1,6 @@
 import pytest

-from helpers import (
+from .helpers import (
     TEST_DATASET_NAME,
     TEST_IMG_URLS,
     TEST_BOX_ANNOTATIONS,
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index 7e7006ef..75e29b01 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -1,7 +1,7 @@
 import pytest
 import os

-from helpers import (
+from .helpers import (
     TEST_SLICE_NAME,
     TEST_DATASET_NAME,
     TEST_IMG_URLS,
@@ -58,7 +58,29 @@ def dataset(CLIENT):
     yield ds

     response = CLIENT.delete_dataset(ds.id)
-    assert response == {}
+    assert response == {"message": "Beginning dataset deletion..."}
+
+
+def make_dataset_items():
+    ds_items_with_metadata = []
+    for i, url in enumerate(TEST_IMG_URLS):
+        ds_items_with_metadata.append(
+            DatasetItem(
+                image_location=url,
+                reference_id=reference_id_from_url(url),
+                metadata={
+                    "made_with_pytest": True,
+                    "example_int": i,
+                    "example_str": "hello",
+                    "example_float": 0.5,
+                    "example_dict": {
+                        "nested": True,
+                    },
+                    "example_list": ["hello", i, False],
+                },
+            )
+        )
+    return ds_items_with_metadata


 def test_dataset_create_and_delete(CLIENT):
@@ -95,26 +117,8 @@ def check_is_expected_response(response):
         check_is_expected_response(response)

     # With reference ids and metadata:
-    ds_items_with_metadata = []
-    for i, url in enumerate(TEST_IMG_URLS):
-        ds_items_with_metadata.append(
-            DatasetItem(
-                image_location=url,
-                reference_id=reference_id_from_url(url),
-                metadata={
-                    "made_with_pytest": True,
-                    "example_int": i,
-                    "example_str": "hello",
-                    "example_float": 0.5,
-                    "example_dict": {
-                        "nested": True,
-                    },
-                    "example_list": ["hello", i, False],
-                },
-            )
-        )
-    response = dataset.append(ds_items_with_metadata)
+    response = dataset.append(make_dataset_items())

     check_is_expected_response(response)

@@ -131,6 +135,26 @@ def test_dataset_append_local(CLIENT, dataset):
     assert ERROR_PAYLOAD not in resp_json


+def test_dataset_append_async(dataset: Dataset):
+    job = dataset.append(make_dataset_items(), asynchronous=True)
+    job.sleep_until_complete()
+    status = job.status()
+    assert status == {
+        "job_id": job.id,
+        "status": "Completed",
+        "message": {
+            "image_upload_step": {"errored": 0, "pending": 0, "completed": 5},
+            "started_image_processing": f"Dataset: {dataset.id}, Job: {job.id}",
+            "ingest_to_reupload_queue": {
+                "epoch": 1,
+                "total": 5,
+                "datasetId": f"{dataset.id}",
+                "processed": 5,
+            },
+        },
+    }
+
+
 def test_dataset_list_autotags(CLIENT, dataset):
     # Creation
     # List of Autotags should be empty
diff --git a/tests/test_indexing.py b/tests/test_indexing.py
index ee341ab7..9ad61eec 100644
--- a/tests/test_indexing.py
+++ b/tests/test_indexing.py
@@ -1,6 +1,6 @@
 import pytest

-from helpers import (
+from .helpers import (
     TEST_INDEX_EMBEDDINGS_FILE,
     TEST_IMG_URLS,
     TEST_DATASET_NAME,
diff --git a/tests/test_models.py b/tests/test_models.py
index 082229c1..b4a05092 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -18,7 +18,7 @@
     ERROR_PAYLOAD,
     DATASET_ID_KEY,
 )
-from helpers import (
+from .helpers import (
     TEST_MODEL_NAME,
     TEST_MODEL_RUN,
     TEST_PREDS,
diff --git a/tests/test_prediction.py b/tests/test_prediction.py
index bfcd459f..04a88636 100644
--- a/tests/test_prediction.py
+++ b/tests/test_prediction.py
@@ -1,6 +1,6 @@
 import pytest
 import time
-from helpers import (
+from .helpers import (
     TEST_DATASET_NAME,
     TEST_MODEL_NAME,
     TEST_MODEL_RUN,
diff --git a/tests/test_slice.py b/tests/test_slice.py
index 6826008c..dc4520fc 100644
--- a/tests/test_slice.py
+++ b/tests/test_slice.py
@@ -1,7 +1,7 @@
 import pytest
 from nucleus import Slice, NucleusClient, DatasetItem, BoxAnnotation
 from nucleus.constants import ERROR_PAYLOAD, ITEM_KEY
-from helpers import (
+from .helpers import (
     TEST_DATASET_NAME,
     TEST_IMG_URLS,
     TEST_SLICE_NAME,

From e6980b8e7df5fb1ece7e418e0f35a1f7d8ed1982 Mon Sep 17 00:00:00 2001
From: Diego Ardila
Date: Tue, 11 May 2021 10:13:02 -0700
Subject: [PATCH 2/8] added error test cases

---
 nucleus/dataset_item.py |  5 ++---
 nucleus/job.py          |  7 ++++---
 tests/test_dataset.py   | 41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/nucleus/dataset_item.py b/nucleus/dataset_item.py
index e18bbd96..490785c2 100644
--- a/nucleus/dataset_item.py
+++ b/nucleus/dataset_item.py
@@ -21,7 +21,6 @@ class DatasetItem:
     metadata: Optional[dict] = None

     def __post_init__(self):
-        self.image_url = self.image_location
         self.local = is_local_path(self.image_location)

     @classmethod
     def from_json(cls, payload: dict):
@@ -37,11 +36,11 @@ def from_json(cls, payload: dict):
         )

     def local_file_exists(self):
-        return os.path.isfile(self.image_url)
+        return os.path.isfile(self.image_location)

     def to_payload(self) -> dict:
         payload = {
-            IMAGE_URL_KEY: self.image_url,
+            IMAGE_URL_KEY: self.image_location,
             METADATA_KEY: self.metadata or {},
         }
         if self.reference_id:
diff --git a/nucleus/job.py b/nucleus/job.py
index 2319d3f0..9f338ff7 100644
--- a/nucleus/job.py
+++ b/nucleus/job.py
@@ -20,7 +20,7 @@ def status(self) -> Dict[str, str]:
     def errors(self) -> List[str]:
         return self.client.make_request(
             payload={},
-            route=f"job/${self.id}/errors",
+            route=f"job/{self.id}/errors",
             requests_command=requests.get,
         )

@@ -28,11 +28,12 @@ def sleep_until_complete(self, verbose_std_out=True):
         while 1:
             time.sleep(1)
             status = self.status()
+            if verbose_std_out:
+                print(status)
             if status["status"] == "Running":
-                if verbose_std_out:
-                    print(status)
                 continue
             break
+
         final_status = status
         if final_status["status"] == "Errored":
             raise JobError(final_status, self)
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index 75e29b01..7809de8f 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -1,3 +1,4 @@
+from nucleus.job import JobError
 import pytest
 import os

@@ -155,6 +156,46 @@ def test_dataset_append_async(dataset: Dataset):
     }


+def test_dataset_append_async_with_local_path(dataset: Dataset):
+    ds_items = make_dataset_items()
+    ds_items[
+        0
+    ].image_location = "/a/fake/local/path/you/can/tell/is/local/but/is/fake"
+    with pytest.raises(ValueError):
+        dataset.append(ds_items, asynchronous=True)
+
+
+def test_dataset_append_async_with_1_bad_url(dataset: Dataset):
+    ds_items = make_dataset_items()
+    ds_items[0].image_location = "https://looks.ok.but.is.not.accessible"
+    job = dataset.append(ds_items, asynchronous=True)
+    with pytest.raises(JobError):
+        job.sleep_until_complete()
+    assert job.status() == {
+        "job_id": f"{job.id}",
+        "status": "Errored",
+        "message": {
+            "final_error": (
+                "One or more of the images you attempted to upload did not process"
+                " correctly. Please see the status for an overview and the errors for "
+                "more detailed messages."
+            ),
+            "image_upload_step": {"errored": 1, "pending": 0, "completed": 4},
+            "ingest_to_reupload_queue": {
+                "epoch": 1,
+                "total": 5,
+                "datasetId": f"{dataset.id}",
+                "processed": 5,
+            },
+            "started_image_processing": f"Dataset: {dataset.id}, Job: {job.id}",
+        },
+    }
+    assert job.errors() == [
+        "One or more of the images you attempted to upload did not process correctly. Please see the status for an overview and the errors for more detailed messages.",
+        'Failure when processing the image "https://looks.ok.but.is.not.accessible": {}',
+    ]
+
+
 def test_dataset_list_autotags(CLIENT, dataset):
     # Creation
     # List of Autotags should be empty

From 53e2271a74641aacf80fd97da92d116d9d93e146 Mon Sep 17 00:00:00 2001
From: Diego Ardila
Date: Tue, 11 May 2021 10:13:49 -0700
Subject: [PATCH 3/8] Add todo

---
 tests/test_dataset.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index 7809de8f..8c62c4ad 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -192,6 +192,7 @@ def test_dataset_append_async_with_1_bad_url(dataset: Dataset):
     }
     assert job.errors() == [
         "One or more of the images you attempted to upload did not process correctly. Please see the status for an overview and the errors for more detailed messages.",
+        # Todo: figure out why this error isn't propagating from image upload.
         'Failure when processing the image "https://looks.ok.but.is.not.accessible": {}',
     ]


From 1672a1429cecbe08c68b61e5f4c708e5580cf2d6 Mon Sep 17 00:00:00 2001
From: Diego Ardila
Date: Tue, 11 May 2021 10:19:31 -0700
Subject: [PATCH 4/8] bump version to prepare for eventual release

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 06a52ba1..2a32c52e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,7 +21,7 @@ exclude = '''

 [tool.poetry]
 name = "scale-nucleus"
-version = "0.1.3"
+version = "0.1.4"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license = "MIT"
 authors = ["Scale AI Nucleus Team "]

From fcc648f5ac20d1d9772e7389d1b0719f2f609f0a Mon Sep 17 00:00:00 2001
From: Diego Ardila
Date: Wed, 12 May 2021 14:26:39 -0700
Subject: [PATCH 5/8] reduced polling interval and tested against prod

---
 nucleus/job.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/nucleus/job.py b/nucleus/job.py
index 9f338ff7..3e24bb39 100644
--- a/nucleus/job.py
+++ b/nucleus/job.py
@@ -4,6 +4,8 @@

 import requests

+JOB_POLLING_INTERVAL = 5
+

 @dataclass
 class AsyncJob:
@@ -26,10 +28,12 @@ def errors(self) -> List[str]:

     def sleep_until_complete(self, verbose_std_out=True):
         while 1:
-            time.sleep(1)
             status = self.status()
+
+            time.sleep(JOB_POLLING_INTERVAL)
+
             if verbose_std_out:
-                print(status)
+                print(f"Status at {time.ctime()}: {status}")
             if status["status"] == "Running":
                 continue
             break

From 934e22a28abbb2ea6bf2a510e63421253c06eb2f Mon Sep 17 00:00:00 2001
From: Diego Ardila
Date: Wed, 12 May 2021 16:19:05 -0700
Subject: [PATCH 6/8] Don't install jupyterlab in CI + urllibparse

---
 .circleci/config.yml    | 2 +-
 nucleus/dataset_item.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 39fb5616..e63e5255 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -13,7 +13,7 @@ jobs:
          command: | # install dependencies
            pip install --upgrade pip
            pip install poetry
-           poetry install
+           poetry install --no-dev
      - run:
          name: Black Formatting Check # Only validation, without re-formatting
          command: |
diff --git a/nucleus/dataset_item.py b/nucleus/dataset_item.py
index ab1d7673..91fdac52 100644
--- a/nucleus/dataset_item.py
+++ b/nucleus/dataset_item.py
@@ -3,6 +3,7 @@
 import os.path
 from dataclasses import dataclass
 from typing import Optional, Sequence
+from urllib.parse import urlparse

 from .constants import (
     DATASET_ITEM_ID_KEY,
@@ -55,8 +56,7 @@ def to_json(self) -> str:


 def is_local_path(path: str) -> bool:
-    path_components = [comp.lower() for comp in path.split("/")]
-    return path_components[0] not in {"https:", "http:", "s3:", "gs:"}
+    return urlparse(path).scheme not in {"https", "http", "s3", "gs"}


 def check_all_paths_remote(dataset_items: Sequence[DatasetItem]):

From 03878b3316c1f5676a8f4e89f301a6dbe2e2fc47 Mon Sep 17 00:00:00 2001
From: Diego Ardila
Date: Wed, 12 May 2021 16:28:17 -0700
Subject: [PATCH 7/8] remove jupyter but still install things we need for test

---
 .circleci/config.yml | 1 +
 pyproject.toml       | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index e63e5255..851e90d8 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -14,6 +14,7 @@ jobs:
            pip install --upgrade pip
            pip install poetry
            poetry install --no-dev
+
      - run:
          name: Black Formatting Check # Only validation, without re-formatting
          command: |
diff --git a/pyproject.toml b/pyproject.toml
index 2a32c52e..6c002276 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,7 +47,6 @@ flake8 = "^3.9.1"
 mypy = "^0.812"
 coverage = "^5.5"
 pre-commit = "^2.12.1"
-jupyterlab = "^3.0.14"


 [build-system]

From 854462d61b176bf3fbf49619dcc6ade065704817 Mon Sep 17 00:00:00 2001
From: Diego Ardila
Date: Wed, 12 May 2021 16:30:05 -0700
Subject: [PATCH 8/8] restore dev requirements

---
 .circleci/config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 851e90d8..22d30747 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -13,7 +13,7 @@ jobs:
          command: | # install dependencies
            pip install --upgrade pip
            pip install poetry
-           poetry install --no-dev
+           poetry install
      - run:
          name: Black Formatting Check # Only validation, without re-formatting
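
Usage sketch for the series: the patches above add an asynchronous ingest path, where Dataset.append(..., asynchronous=True) returns an AsyncJob that can be polled. A minimal end-to-end sketch follows; the API key, dataset ID, and image URL are placeholders, and retrieving an existing dataset via NucleusClient.get_dataset is assumed to behave as in the released client.

import nucleus
from nucleus import DatasetItem
from nucleus.job import JobError

# Placeholder credentials and identifiers, shown for illustration only.
client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")
dataset = client.get_dataset("ds_example_id")  # assumed helper returning an existing Dataset

items = [
    DatasetItem(
        image_location="https://example.com/image_0.jpg",  # async uploads require remote paths
        reference_id="image_0",
        metadata={"split": "train"},
    )
]

# asynchronous=True returns an AsyncJob instead of a synchronous response dict.
job = dataset.append(items, update=True, asynchronous=True)

try:
    job.sleep_until_complete()  # polls job.status() until it leaves the "Running" state
except JobError:
    print(job.errors())  # per-item error messages reported by the ingest job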