-
Notifications
You must be signed in to change notification settings - Fork 129
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Ticket no.112037 - Support python api, project cli, cli for prune
- Loading branch information
Showing
13 changed files
with
1,063 additions
and
44 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
# Copyright (C) 2023 Intel Corporation | ||
# | ||
# SPDX-License-Identifier: MIT | ||
|
||
import argparse | ||
import logging as log | ||
|
||
from datumaro.components.algorithms.hash_key_inference.prune import Prune | ||
from datumaro.components.errors import ProjectNotFoundError | ||
from datumaro.util.scope import scope_add, scoped | ||
|
||
from ..util import MultilineFormatter | ||
from ..util.project import load_project, parse_full_revpath | ||
|
||
|
||
def build_parser(parser_ctor=argparse.ArgumentParser):
    """Build the argument parser for the `prune` CLI command.

    Args:
        parser_ctor: Factory producing the parser; in normal CLI operation this
            is a subcommand registrar's ``add_parser``, which accepts ``help=``.

    Returns:
        The configured parser, with ``command`` defaulted to ``prune_command``.
    """
    parser = parser_ctor(
        help="Prune dataset and make a representative subset",
        description="""
        Apply data pruning to a dataset.|n
        The command can be useful if you have to extract representative subset.
        |n
        The current project (-p/--project) is used as a context for plugins
        and models. It is used when there is a dataset path in target.
        When not specified, the current project's working tree is used.|n
        |n
        By default, datasets are updated in-place. The '-o/--output-dir'
        option can be used to specify another output directory. When
        updating in-place, use the '--overwrite' parameter (in-place
        updates fail by default to prevent data loss), unless a project
        target is modified.|n
        |n
        The command can be applied to a dataset or a project build target,
        a stage or the combined 'project' target, in which case all the
        targets will be affected.|n
        |n
        Examples:|n
        - Prune dataset with selecting random and ratio 80%:|n
        |s|s%(prog)s -m random -r 0.8|n
        - Prune dataset with clustering in image hash and ratio 50%:|n
        |s|s%(prog)s -m query_clust --hash-type img -r 0.5|n
        - Prune dataset based on entropy with clustering in image hash and ratio 50%:|n
        |s|s%(prog)s -m entropy --hash-type img -r 0.5|n
        """,
        # Fixes vs. original: the examples referred to a nonexistent '-h' option
        # ('-h' is argparse's help flag; the real option is '--hash-type'), and
        # two example lines ended with a bare '|' instead of the '|n' marker.
        formatter_class=MultilineFormatter,
    )

    parser.add_argument("target", nargs="?", help="Target dataset revpath (default: project)")
    parser.add_argument("-m", "--method", dest="method", help="Method to apply to the dataset")
    parser.add_argument(
        "-r",
        "--ratio",
        type=float,
        dest="ratio",
        help="How much of the dataset to remain after pruning",
    )
    parser.add_argument(
        "--hash-type",
        type=str,
        dest="hash_type",
        default="img",
        help="Hashtype to extract feature from data information between image and text(label)",
    )
    parser.add_argument(
        "-p",
        "--project",
        dest="project_dir",
        help="Directory of the project to operate on (default: current dir)",
    )

    parser.add_argument(
        "-o",
        "--output-dir",
        dest="dst_dir",
        help="""
        Output directory. Can be omitted for main project targets
        (i.e. data sources and the 'project' target, but not
        intermediate stages) and dataset targets.
        If not specified, the results will be saved inplace.
        """,
    )
    # NOTE(review): '--overwrite' is declared but prune_command never reads
    # args.overwrite — confirm whether in-place saves should honor it.
    parser.add_argument(
        "--overwrite", action="store_true", help="Overwrite existing files in the save directory"
    )
    parser.set_defaults(command=prune_command)

    return parser
|
||
|
||
def get_sensitive_args():
    """Map each command handler to the names of its privacy-sensitive CLI arguments."""
    sensitive = [
        "target",
        "method",
        "ratio",
        "hash_type",
        "project_dir",
        "dst_dir",
    ]
    return {prune_command: sensitive}
|
||
|
||
@scoped
def prune_command(args):
    """CLI handler: prune the target dataset to a representative subset.

    Resolves the target revpath (or the project's first source when no target
    is given), runs pruning, and saves the result to ``--output-dir`` or
    in-place.
    """
    project = None
    try:
        project = scope_add(load_project(args.project_dir))
    except ProjectNotFoundError:
        # A missing project is only fatal when one was explicitly requested;
        # otherwise a plain dataset revpath target can still be processed.
        if args.project_dir:
            raise

    targets = [args.target] if args.target else list(project.working_tree.sources)

    # Only the first target is ever pruned; the original resolved *every*
    # target and discarded all but the first, wasting work.
    source_dataset = parse_full_revpath(targets[0], project)[0]

    prune = Prune(source_dataset, cluster_method=args.method, hash_type=args.hash_type)

    # Persist the hash keys computed for the source dataset so later runs
    # can reuse them.
    source_dataset.save(source_dataset.data_path, save_media=True, save_hashkey_meta=True)

    result = prune.get_pruned(args.ratio)

    dst_dir = args.dst_dir or source_dataset.data_path
    result.save(dst_dir, save_media=True)

    log.info("Results have been saved to '%s'", dst_dir)
3 changes: 3 additions & 0 deletions
3
src/datumaro/components/algorithms/hash_key_inference/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Copyright (C) 2023 Intel Corporation | ||
# | ||
# SPDX-License-Identifier: MIT |
37 changes: 37 additions & 0 deletions
37
src/datumaro/components/algorithms/hash_key_inference/base.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# Copyright (C) 2023 Intel Corporation | ||
# | ||
# SPDX-License-Identifier: MIT | ||
|
||
from typing import Sequence | ||
|
||
from datumaro.components.dataset import Dataset | ||
from datumaro.plugins.explorer import ExplorerLauncher | ||
|
||
|
||
class HashInference:
    """Base class for components that infer hash keys over datasets.

    The CLIP-based feature extractors are created lazily on first access so
    subclasses only pay for the models they actually use.
    """

    def __init__(self, *datasets: "Sequence[Dataset]") -> None:
        # Bug fix: the lazy properties below compare these attributes against
        # None, so they must exist. The original empty __init__ left them
        # unset, making the first 'model'/'text_model' access raise
        # AttributeError.
        self._model = None
        self._text_model = None

    @property
    def model(self):
        """Image-feature model, instantiated on first use."""
        if self._model is None:
            self._model = ExplorerLauncher(model_name="clip_visual_ViT-B_32")
        return self._model

    @property
    def text_model(self):
        """Text-feature model, instantiated on first use."""
        if self._text_model is None:
            self._text_model = ExplorerLauncher(model_name="clip_text_ViT-B_32")
        return self._text_model

    def _compute_hash_key(self, datasets, datasets_to_infer):
        """Run the image model over each non-empty dataset in
        *datasets_to_infer* and merge the resulting annotations back into the
        corresponding entries of *datasets* (mutated in place and returned).
        """
        for dataset_to_infer in datasets_to_infer:
            if dataset_to_infer:  # skip empty datasets — nothing to infer
                dataset_to_infer.run_model(self.model, append_annotation=True)
        for dataset, dataset_to_infer in zip(datasets, datasets_to_infer):
            updated_items = [
                dataset.get(item.id, item.subset).wrap(annotations=item.annotations)
                for item in dataset_to_infer
            ]
            dataset.update(updated_items)
        return datasets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.