diff --git a/COCO_DATA_FORMAT.md b/COCO_DATA_FORMAT.md index 619b641..1fd675d 100644 --- a/COCO_DATA_FORMAT.md +++ b/COCO_DATA_FORMAT.md @@ -139,8 +139,8 @@ This is Visual Question & Answering with bboxes in the groundtruth. {"id": 2, "zip_file": "test2.zip", "file_name": "test/1/image_2.jpg"} ], "annotations": [ - {"image_id": 1, "id": 1, "question": "whats animal are in the image?", "answer": [{"text": "a cat", "bbox": [10, 10, 100, 100]}, {"text": "a bird", "bbox": [15, 15, 30, 30]}]}, - {"image_id": 2, "id": 2, "question": "What is the title of the book on the shelf?", "answer": [{"text": "a cat", "bbox": [10, 10, 100, 100]}]} + {"image_id": 1, "id": 1, "question": "whats animal are in the image?", "answer": "cat and bird", "grounding": [{"text": "a cat", "bbox": [10, 10, 100, 100]}, {"text": "a bird", "bbox": [15, 15, 30, 30]}]}, + {"image_id": 2, "id": 2, "question": "What is the title and author of the book on the shelf?", "answer": "Title is baking and author is John", "grounding": [{"text": "Title: Baking", "bbox": [10, 10, 100, 100]}, {"text": "Author: John", "bbox": [0, 0, 50, 50]}]} ] } ``` diff --git a/tests/resources/util.py b/tests/resources/util.py index 7f7d58c..0301964 100644 --- a/tests/resources/util.py +++ b/tests/resources/util.py @@ -315,8 +315,19 @@ class VisualObjectGroundingTestCases: {"id": 1, "file_name": "test1.zip@test/0/image_1.jpg"}, {"id": 2, "file_name": "test2.zip@test/1/image_2.jpg"} ], "annotations": [ - {"image_id": 1, "id": 1, "question": "where are the apples", "answer": [{"text": "left top corner", "bbox": [0, 10, 10, 10]}]}, - {"image_id": 2, "id": 2, "question": "where are the banana", "answer": [{"text": "right bottom corner", "bbox": [90, 90, 10, 10]}]}, + { + "image_id": 1, + "id": 1, + "question": "where are the apples", + "answer": "who knows", + "grounding": [{"text": "left top corner", "bbox": [0, 10, 10, 10]}]}, + { + "image_id": 2, + "id": 2, + "question": "where are the banana", + "answer": "check the grounding", + 
"grounding": [{"text": "right bottom corner", "bbox": [90, 90, 10, 10]}] + }, ] }, { @@ -324,9 +335,26 @@ class VisualObjectGroundingTestCases: {"id": 1, "file_name": "test1.zip@test/0/image_1.jpg"}, {"id": 2, "file_name": "test2.zip@test/1/image_2.jpg"}, {"id": 3, "file_name": "test2.zip@test/1/image_3.jpg"} ], "annotations": [ - {"image_id": 1, "id": 1, "question": "Describe the image", "answer": [{"text": "left top corner", "bbox": [0, 10, 10, 10]}, {"text": "right bottom corner", "bbox": [90, 90, 10, 10]}]}, - {"image_id": 2, "id": 2, "question": "where is an banana", "answer": [{"text": "mid of the image", "bbox": [50, 50, 10, 10]}]}, - {"image_id": 2, "id": 3, "question": "describe the top half of the image", "answer": [{"text": "Sun rise", "bbox": [0, 0, 100, 50]}]}, + { + "image_id": 1, + "id": 1, + "question": "Describe the image", + "answer": "many books", + "grounding": [{"text": "20 books", "bbox": [0, 10, 10, 10]}, {"text": "10 books", "bbox": [90, 90, 10, 10]}]}, + { + "image_id": 2, + "id": 2, + "question": "where is an banana", + "answer": "present in the image", + "grounding": [{"text": "mid of the image", "bbox": [50, 50, 10, 10]}] + }, + { + "image_id": 2, + "id": 3, + "question": "describe the top half of the image", + "answer": "ok", + "grounding": [{"text": "Sun rise", "bbox": [0, 0, 100, 50]}] + }, ] } ] diff --git a/vision_datasets/visual_object_grounding/coco_manifest_adaptor.py b/vision_datasets/visual_object_grounding/coco_manifest_adaptor.py index eb95af3..4190341 100644 --- a/vision_datasets/visual_object_grounding/coco_manifest_adaptor.py +++ b/vision_datasets/visual_object_grounding/coco_manifest_adaptor.py @@ -8,5 +8,5 @@ def __init__(self) -> None: super().__init__(DatasetTypes.VISUAL_OBJECT_GROUNDING) def process_label(self, image: ImageDataManifest, annotation: dict, coco_manifest: dict): - image.labels.append(VisualObjectGroundingLabelManifest({"question": annotation['question'], "answer": annotation['answer']}, - 
additional_info=self._get_additional_info(annotation, {'id', 'question', 'answer'}))) + image.labels.append(VisualObjectGroundingLabelManifest({'question': annotation['question'], 'answer': annotation['answer'], 'grounding': annotation['grounding']}, + additional_info=self._get_additional_info(annotation, {'id', 'question', 'answer', 'grounding'}))) diff --git a/vision_datasets/visual_object_grounding/manifest.py b/vision_datasets/visual_object_grounding/manifest.py index 7ed8208..035b435 100644 --- a/vision_datasets/visual_object_grounding/manifest.py +++ b/vision_datasets/visual_object_grounding/manifest.py @@ -2,7 +2,7 @@ from ..common import ImageLabelManifest -class GroundingAnswer: +class Grounding: def __init__(self, label_data: dict): self._label_data = label_data @@ -17,18 +17,21 @@ def bbox(self): class VisualObjectGroundingLabelManifest(ImageLabelManifest): """ - {"question": "a question about the image", "answer": [{"text": " in text", "bbox": [left, top, right, bottom]}, ...]} + {"question": "a question about the image", "answer": "generic caption or answer to the question", "grounding": [{"text": "....", "bbox": [left, top, right, bottom]}, ...]} """ def _read_label_data(self): raise NotImplementedError def _check_label(self, label_data): - if label_data is None or "question" not in label_data or "answer" not in label_data: + def is_present(key): + return key in label_data and label_data[key] is not None + + if label_data is None or any(not is_present(key) for key in ['question', 'answer', 'grounding']): raise ValueError - for ans in label_data["answer"]: - if "text" not in ans or "bbox" not in ans or len(ans['bbox']) != 4: + for grounding in label_data["grounding"]: + if "text" not in grounding or "bbox" not in grounding or len(grounding['bbox']) != 4: raise ValueError @property @@ -36,5 +39,9 @@ def question(self) -> str: return self._label_data["question"] @property - def answer(self) -> List[GroundingAnswer]: - return [GroundingAnswer(x) for x in 
self._label_data["answer"]] + def answer(self) -> str: + return self._label_data["answer"] + + @property + def grounding(self) -> List[Grounding]: + return [Grounding(x) for x in self._label_data["grounding"]] \ No newline at end of file