change format for grounding

microsoft · Aug 31, 2023 · 260d783 · 260d783
1 parent a340496
commit 260d783
Show file tree

Hide file tree

Showing 4 changed files with 51 additions and 16 deletions.
diff --git a/COCO_DATA_FORMAT.md b/COCO_DATA_FORMAT.md
@@ -139,8 +139,8 @@ This is Visual Question & Answering with bboxes in the groundtruth.
         {"id": 2, "zip_file": "test2.zip", "file_name": "test/1/image_2.jpg"}
     ],
     "annotations": [
-        {"image_id": 1, "id": 1, "question": "whats animal are in the image?", "answer": [{"text": "a cat", "bbox": [10, 10, 100, 100]}, {"text": "a bird", "bbox": [15, 15, 30, 30]}]},
-        {"image_id": 2, "id": 2, "question": "What is the title of the book on the shelf?",  "answer": [{"text": "a cat", "bbox": [10, 10, 100, 100]}]}
+        {"image_id": 1, "id": 1, "question": "what animals are in the image?", "answer": "cat and bird", "grounding": [{"text": "a cat", "bbox": [10, 10, 100, 100]}, {"text": "a bird", "bbox": [15, 15, 30, 30]}]},
+        {"image_id": 2, "id": 2, "question": "What is the title and author of the book on the shelf?", "answer": "Title is Baking and author is John", "grounding": [{"text": "Title: Baking", "bbox": [10, 10, 100, 100]}, {"text": "Author: John", "bbox": [0, 0, 50, 50]}]}
     ]
 }
 ```

diff --git a/tests/resources/util.py b/tests/resources/util.py
@@ -315,18 +315,46 @@ class VisualObjectGroundingTestCases:
                 {"id": 1, "file_name": "test1.zip@test/0/image_1.jpg"}, {"id": 2, "file_name": "test2.zip@test/1/image_2.jpg"}
             ],
             "annotations": [
-                {"image_id": 1, "id": 1, "question": "where are the apples", "answer": [{"text": "left top corner", "bbox": [0, 10, 10, 10]}]},
-                {"image_id": 2, "id": 2, "question": "where are the banana", "answer": [{"text": "right bottom corner", "bbox": [90, 90, 10, 10]}]},
+                {
+                    "image_id": 1,
+                    "id": 1,
+                    "question": "where are the apples",
+                    "answer": "who knows",
+                    "grounding": [{"text": "left top corner", "bbox": [0, 10, 10, 10]}]},
+                {
+                    "image_id": 2,
+                    "id": 2,
+                    "question": "where are the banana",
+                    "answer": "check the grounding",
+                    "grounding": [{"text": "right bottom corner", "bbox": [90, 90, 10, 10]}]
+                },
             ]
         },
         {
             "images": [
                 {"id": 1, "file_name": "test1.zip@test/0/image_1.jpg"}, {"id": 2, "file_name": "test2.zip@test/1/image_2.jpg"}, {"id": 3, "file_name": "test2.zip@test/1/image_3.jpg"}
             ],
             "annotations": [
-                {"image_id": 1, "id": 1, "question": "Describe the image", "answer": [{"text": "left top corner", "bbox": [0, 10, 10, 10]}, {"text": "right bottom corner", "bbox": [90, 90, 10, 10]}]},
-                {"image_id": 2, "id": 2, "question": "where is an banana", "answer": [{"text": "mid of the image", "bbox": [50, 50, 10, 10]}]},
-                {"image_id": 2, "id": 3, "question": "describe the top half of the image", "answer": [{"text": "Sun rise", "bbox": [0, 0, 100, 50]}]},
+                {
+                    "image_id": 1,
+                    "id": 1,
+                    "question": "Describe the image",
+                    "answer": "many books",
+                    "grounding": [{"text": "20 books", "bbox": [0, 10, 10, 10]}, {"text": "10 books", "bbox": [90, 90, 10, 10]}]},
+                {
+                    "image_id": 2,
+                    "id": 2,
+                    "question": "where is an banana",
+                    "answer": "present in the image",
+                    "grounding": [{"text": "mid of the image", "bbox": [50, 50, 10, 10]}]
+                },
+                {
+                    "image_id": 2,
+                    "id": 3,
+                    "question": "describe the top half of the image",
+                    "answer": "ok",
+                    "grounding": [{"text": "Sun rise", "bbox": [0, 0, 100, 50]}]
+                },
             ]
         }
     ]

diff --git a/vision_datasets/visual_object_grounding/coco_manifest_adaptor.py b/vision_datasets/visual_object_grounding/coco_manifest_adaptor.py
@@ -8,5 +8,5 @@ def __init__(self) -> None:
         super().__init__(DatasetTypes.VISUAL_OBJECT_GROUNDING)
 
     def process_label(self, image: ImageDataManifest, annotation: dict, coco_manifest: dict):
-        image.labels.append(VisualObjectGroundingLabelManifest({"question": annotation['question'], "answer": annotation['answer']},
-                                                               additional_info=self._get_additional_info(annotation, {'id', 'question', 'answer'})))
+        image.labels.append(VisualObjectGroundingLabelManifest({'question': annotation['question'], 'answer': annotation['answer'], 'grounding': annotation['grounding']},
+                                                               additional_info=self._get_additional_info(annotation, {'id', 'question', 'answer', 'grounding'})))
diff --git a/vision_datasets/visual_object_grounding/manifest.py b/vision_datasets/visual_object_grounding/manifest.py
@@ -2,7 +2,7 @@
 from ..common import ImageLabelManifest
 
 
-class GroundingAnswer:
+class Grounding:
     def __init__(self, label_data: dict):
         self._label_data = label_data
 
@@ -17,24 +17,31 @@ def bbox(self):
 
 class VisualObjectGroundingLabelManifest(ImageLabelManifest):
     """
-    {"question": "a question about the image",  "answer": [{"text": " in text", "bbox": [left, top, right, bottom]}, ...]}
+    {"question": "a question about the image",  "answer": "generic caption or answer to the question", "grounding": [{"text": "....", "bbox": [left, top, right, bottom]}, ...]}
     """
 
     def _read_label_data(self):
         raise NotImplementedError
 
     def _check_label(self, label_data):
-        if label_data is None or "question" not in label_data or "answer" not in label_data:
+        def is_present(key):
+            return key in label_data and label_data[key] is not None
+
+        if label_data is None or any(not is_present(key) for key in ['question', 'answer', 'grounding']):
             raise ValueError
 
-        for ans in label_data["answer"]:
-            if "text" not in ans or "bbox" not in ans or len(ans['bbox']) != 4:
+        for grounding in label_data["grounding"]:
+            if "text" not in grounding or "bbox" not in grounding or len(grounding['bbox']) != 4:
                 raise ValueError
 
     @property
     def question(self) -> str:
         return self._label_data["question"]
 
     @property
-    def answer(self) -> List[GroundingAnswer]:
-        return [GroundingAnswer(x) for x in self._label_data["answer"]]
+    def answer(self) -> str:
+        return self._label_data["answer"]
+
+    @property
+    def grounding(self) -> List[Grounding]:
+        return [Grounding(x) for x in self._label_data["grounding"]]