Skip to content

Commit

Permalink
Reduce peak memory consumption during Datumaro and COCO extractors (#1061)
Browse files Browse the repository at this point in the history
  • Loading branch information
wonjuleee authored Jul 6, 2023
1 parent 9be801a commit f00acf8
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 10 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
(<https://github.com/openvinotoolkit/datumaro/pull/1055>)
- Add CVAT data format document
(<https://github.com/openvinotoolkit/datumaro/pull/1060>)
- Reduce peak memory usage when importing COCO and Datumaro formats
(<https://github.com/openvinotoolkit/datumaro/pull/1061>)
- Enhance the error message for datum stats to be more user friendly
(<https://github.com/openvinotoolkit/datumaro/pull/1069>)

Expand Down
16 changes: 13 additions & 3 deletions src/datumaro/plugins/data_formats/coco/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,8 @@ def __init__(
self._mask_dir = osp.splitext(path)[0]
self._items = self._load_items(json_data)

del json_data

def __iter__(self):
yield from self._items.values()

Expand Down Expand Up @@ -225,10 +227,16 @@ def _load_person_kp_categories(self, json_cat):

def _load_items(self, json_data):
pbars = self._ctx.progress_reporter.split(2)

def _gen_ann(info_lists):
while info_lists:
yield info_lists.pop()

items = {}
img_infos = {}
img_lists = self._parse_field(json_data, "images", list)
for img_info in pbars[0].iter(
self._parse_field(json_data, "images", list),
_gen_ann(img_lists),
desc=f"Parsing image info in '{osp.basename(self._path)}'",
):
img_id = None
Expand Down Expand Up @@ -258,8 +266,9 @@ def _load_items(self, json_data):
self._ctx.error_policy.report_item_error(e, item_id=(img_id, self._subset))

if self._task is not CocoTask.panoptic:
ann_lists = self._parse_field(json_data, "annotations", list)
for ann in pbars[1].iter(
self._parse_field(json_data, "annotations", list),
_gen_ann(ann_lists),
desc=f"Parsing annotations in '{osp.basename(self._path)}'",
):
img_id = None
Expand All @@ -277,8 +286,9 @@ def _load_items(self, json_data):
e, item_id=(img_id, self._subset)
)
else:
ann_lists = self._parse_field(json_data, "annotations", list)
for ann in pbars[1].iter(
self._parse_field(json_data, "annotations", list),
_gen_ann(ann_lists),
desc=f"Parsing annotations in '{osp.basename(self._path)}'",
):
img_id = None
Expand Down
4 changes: 3 additions & 1 deletion src/datumaro/plugins/data_formats/datumaro/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,9 @@ def _load_categories(parsed):
def _load_items(self, parsed):
items = []

for item_desc in parsed["items"]:
item_descs = parsed["items"]
while item_descs:
item_desc = item_descs.pop()
item_id = item_desc["id"]

media = None
Expand Down
15 changes: 9 additions & 6 deletions tests/unit/data_formats/datumaro/test_datumaro_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def _test_save_and_load(
[
pytest.param(
"fxt_test_datumaro_format_dataset",
compare_datasets_strict,
compare_datasets,
True,
id="test_can_save_and_load",
),
Expand All @@ -76,13 +76,13 @@ def _test_save_and_load(
),
pytest.param(
"fxt_relative_paths",
compare_datasets_strict,
compare_datasets,
True,
id="test_relative_paths",
),
pytest.param(
"fxt_can_save_dataset_with_cjk_categories",
compare_datasets_strict,
compare_datasets,
True,
id="test_can_save_dataset_with_cjk_categories",
),
Expand All @@ -94,7 +94,7 @@ def _test_save_and_load(
),
pytest.param(
"fxt_can_save_and_load_image_with_arbitrary_extension",
compare_datasets_strict,
compare_datasets,
True,
id="test_can_save_and_load_image_with_arbitrary_extension",
),
Expand Down Expand Up @@ -203,8 +203,11 @@ def test_inplace_save_writes_only_updated_data_with_direct_changes(self, test_di
set(os.listdir(osp.join(test_dir, "annotations"))),
)
helper_tc.assertEqual({"2.jpg"}, set(os.listdir(osp.join(test_dir, "images", "a"))))
compare_datasets_strict(
helper_tc, expected, Dataset.import_from(test_dir, format=self.format)
compare_datasets(
helper_tc,
expected,
Dataset.import_from(test_dir, format=self.format),
require_media=True,
)

@mark_requirement(Requirements.DATUM_GENERAL_REQ)
Expand Down

0 comments on commit f00acf8

Please sign in to comment.