Fix merging of stream datasets (#1609)

### Summary

When a stream dataset with multiple sources is imported in eager mode (i.e., with `error_policy` or `progress_reporting` specified), an error occurs:
```
'_MergedStreamDataset' object has no attribute '_data'
```
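The failure mode can be sketched in isolation. The classes below are simplified, hypothetical stand-ins (not Datumaro's actual implementation): a merged-dataset wrapper stores the merge result under one attribute, while code inherited from the eager base class reads a different attribute, `_data`, that the wrapper's `__init__` never assigned.

```python
class EagerDatasetBase:
    """Stand-in for an eager dataset base class."""

    def get_subset_names(self):
        # Eager code paths read self._data directly.
        return sorted(self._data)


class BuggyMergedStreamDataset(EagerDatasetBase):
    def __init__(self, merged_storage):
        # Bug: the merge result is stored, but _data is never assigned,
        # so inherited eager code paths raise AttributeError.
        self.merged = merged_storage


class FixedMergedStreamDataset(EagerDatasetBase):
    def __init__(self, merged_storage):
        self._merged = merged_storage
        # Fix: mirror the merged dataset's storage so inherited eager
        # code paths that read _data keep working.
        self._data = merged_storage


storage = {"train": [1, 2], "val": [3]}

try:
    BuggyMergedStreamDataset(storage).get_subset_names()
except AttributeError as e:
    print(e)  # reports a missing '_data' attribute

print(FixedMergedStreamDataset(storage).get_subset_names())  # ['train', 'val']
```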



### Checklist
- [x] I have added unit tests to cover my changes.
- [x] I have added integration tests to cover my changes.
- [x] I have added the description of my changes into the [CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md).
- [x] I have updated the [documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs) accordingly.

### License

- [x] I submit _my code changes_ under the same [MIT
License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE)
that covers the project.
  Feel free to contact the maintainers if that's a concern.
- [x] I have updated the license header for each file (see an example
below).

```python
# Copyright (C) 2024 Intel Corporation
#
# SPDX-License-Identifier: MIT
```

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: Ilya Trushkin <ilya.trushkin@intel.com>
Co-authored-by: williamcorsel <31770711+williamcorsel@users.noreply.github.com>
Co-authored-by: Sooah Lee <sooah.lee@intel.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Yunchu Lee <yunchu.lee@intel.com>
Co-authored-by: Wonju Lee <wonju.lee@intel.com>
6 people authored Sep 24, 2024
1 parent 7d7b327 commit ad84aa7
Showing 6 changed files with 22 additions and 10 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/codeql.yml
```diff
@@ -52,7 +52,7 @@ jobs:
 
       # Initializes the CodeQL tools for scanning.
       - name: Initialize CodeQL
-        uses: github/codeql-action/init@f0f3afee809481da311ca3a6ff1ff51d81dbeb24 # v3.26.4
+        uses: github/codeql-action/init@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6
         with:
           languages: ${{ matrix.language }}
           # If you wish to specify custom queries, you can do so here or in a config file.
@@ -73,7 +73,7 @@ jobs:
           python -m build
       - name: Perform CodeQL Analysis
-        uses: github/codeql-action/analyze@f0f3afee809481da311ca3a6ff1ff51d81dbeb24 # v3.26.4
+        uses: github/codeql-action/analyze@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6
         with:
           category: "/language:${{matrix.language}}"
       - name: Generate Security Report
```
4 changes: 2 additions & 2 deletions .github/workflows/publish_to_pypi.yml
```diff
@@ -80,12 +80,12 @@ jobs:
           file_glob: true
       - name: Publish package distributions to PyPI
         if: ${{ steps.check-tag.outputs.match != '' }}
-        uses: pypa/gh-action-pypi-publish@v1.9.0
+        uses: pypa/gh-action-pypi-publish@v1.10.1
         with:
           password: ${{ secrets.PYPI_API_TOKEN }}
       - name: Publish package distributions to TestPyPI
         if: ${{ steps.check-tag.outputs.match == '' }}
-        uses: pypa/gh-action-pypi-publish@v1.9.0
+        uses: pypa/gh-action-pypi-publish@v1.10.1
         with:
           password: ${{ secrets.TESTPYPI_API_TOKEN }}
           repository-url: https://test.pypi.org/legacy/
```
2 changes: 1 addition & 1 deletion .github/workflows/scorecard.yml
```diff
@@ -67,6 +67,6 @@ jobs:
 
   # Upload the results to GitHub's code scanning dashboard.
   - name: "Upload to code-scanning"
-    uses: github/codeql-action/upload-sarif@f0f3afee809481da311ca3a6ff1ff51d81dbeb24 # v3.26.4
+    uses: github/codeql-action/upload-sarif@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6
     with:
       sarif_file: results.sarif
```
2 changes: 2 additions & 0 deletions CHANGELOG.md
```diff
@@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   (<https://github.com/openvinotoolkit/datumaro/pull/1607>)
 
 ### Bug fixes
+- Fix StreamDataset merging when importing in eager mode
+  (<https://github.com/openvinotoolkit/datumaro/pull/1609>)
 
 ## Q3 2024 Release 1.9.0
 ### New features
```
11 changes: 8 additions & 3 deletions src/datumaro/components/dataset.py
```diff
@@ -1023,17 +1023,22 @@ class _MergedStreamDataset(cls):
     def __init__(self, *sources: IDataset):
         from datumaro.components.hl_ops import HLOps
 
-        self.merged = HLOps.merge(*sources, merge_policy=merge_policy)
+        self._merged = HLOps.merge(*sources, merge_policy=merge_policy)
+        self._data = self._merged._data
         self._env = env
         self._format = DEFAULT_FORMAT
         self._source_path = None
         self._options = {}
 
     def __iter__(self):
-        yield from self.merged
+        yield from self._merged
+
+    @property
+    def is_stream(self):
+        return True
 
     def subsets(self) -> Dict[str, DatasetSubset]:
-        return self.merged.subsets()
+        return self._merged.subsets()
 
     return _MergedStreamDataset(*sources)
```
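The shape of the fixed wrapper can be sketched with simplified stand-in classes (`MergedResult` is a hypothetical placeholder for the object returned by `HLOps.merge`, not Datumaro's actual API): iteration, `subsets()`, and the stream flag all defer to the wrapped merge result, and `_data` is mirrored so eager code paths keep working.

```python
class MergedResult:
    """Hypothetical stand-in for the result of merging several sources."""

    def __init__(self, items, subsets):
        self._data = items          # storage that eager code paths expect
        self._subsets = subsets

    def __iter__(self):
        return iter(self._data)

    def subsets(self):
        return dict(self._subsets)


class MergedStreamDataset:
    """Simplified sketch of the fixed wrapper's delegation pattern."""

    def __init__(self, merged: MergedResult):
        self._merged = merged
        self._data = merged._data   # mirror storage for eager code paths

    def __iter__(self):
        yield from self._merged

    @property
    def is_stream(self):
        return True

    def subsets(self):
        return self._merged.subsets()


ds = MergedStreamDataset(MergedResult([1, 2, 3], {"train": [1, 2], "val": [3]}))
print(list(ds))      # [1, 2, 3]
print(ds.is_stream)  # True
print(ds.subsets())  # {'train': [1, 2], 'val': [3]}
```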
9 changes: 7 additions & 2 deletions tests/unit/test_imagenet_format.py
```diff
@@ -7,6 +7,7 @@
 import pytest
 
 from datumaro.components.annotation import AnnotationType, Label, LabelCategories
+from datumaro.components.contexts.importer import ImportErrorPolicy
 from datumaro.components.dataset import Dataset, StreamDataset
 from datumaro.components.dataset_base import DatasetItem
 from datumaro.components.environment import Environment
@@ -214,7 +215,9 @@ def _create_expected_dataset(self):
     @pytest.mark.parametrize("dataset_cls, is_stream", [(Dataset, False), (StreamDataset, True)])
     def test_can_import(self, dataset_cls, is_stream, helper_tc):
         expected_dataset = self._create_expected_dataset()
-        dataset = dataset_cls.import_from(self.DUMMY_DATASET_DIR, self.IMPORTER_NAME)
+        dataset = dataset_cls.import_from(
+            self.DUMMY_DATASET_DIR, self.IMPORTER_NAME, error_policy=ImportErrorPolicy()
+        )
         assert dataset.is_stream == is_stream
 
         compare_datasets(helper_tc, expected_dataset, dataset, require_media=True)
@@ -240,7 +243,9 @@ class ImagenetWithSubsetDirsImporterTest(ImagenetImporterTest):
     @mark_requirement(Requirements.DATUM_GENERAL_REQ)
     @pytest.mark.parametrize("dataset_cls, is_stream", [(Dataset, False), (StreamDataset, True)])
     def test_can_import(self, dataset_cls, is_stream, helper_tc):
-        dataset = dataset_cls.import_from(self.DUMMY_DATASET_DIR, self.IMPORTER_NAME)
+        dataset = dataset_cls.import_from(
+            self.DUMMY_DATASET_DIR, self.IMPORTER_NAME, error_policy=ImportErrorPolicy()
+        )
         assert dataset.is_stream == is_stream
 
         for subset_name, subset in dataset.subsets().items():
```