flatten_any_VQA.py
import json
import os
from typing import List, Tuple

from torch.utils.data import Dataset
def inconvertible_grounding_visual(text: str) -> bool:
    """Return True if a grounding answer still carries markup that blocks flattening."""
    # Braces, pipe separators, <p> tags, commas, or full stops mean the text cannot be
    # kept as a single flat answer.
    return any(token in text for token in ("{", "}", "|", "<p>", ",", "."))
def inconvertible_grounding_description(text: str) -> bool:
    """Return True if a grounding description still carries markup that blocks flattening."""
    # Same idea as above, but commas and full stops are allowed in free-text descriptions.
    return any(token in text for token in ("{", "}", "|", "<p>", "</p>"))
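
# Illustrative usage (added for clarity, not part of the original script): the "visual" filter
# is stricter than the "description" filter, since commas and full stops are also disallowed.
assert inconvertible_grounding_visual("a red car, parked near the gate") is True       # comma blocks it
assert inconvertible_grounding_description("a red car parked near the gate") is False  # plain text passes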
class flatten_dataset(Dataset):
    r"""
    Goal: turn the many QA-pairs stored under 'conversations' in each VQA sample into
    one (image, question, answer) tuple per pair.

    Input: a VQA dataset, usually formatted as [sample_1 Dict, sample_2 Dict, ..., sample_n Dict]:
    [
        sample_1 Dict{
            'image': relative/path/to/image,
            'conversations':            # contains many QA-pairs
            [
                {
                    'from': 'human',
                    'value': "Describe this image"
                },
                {
                    'from': 'gpt',
                    'value': "It has Formula Ford cars?"
                },
                {
                    'from': 'human',
                    'value': "What is at this region <loc_20><loc_49><loc_142><loc_121>?"
                },
                {
                    'from': 'gpt',
                    'value': "Nam's favourite night club"
                },
            ]
        },
        sample_2 Dict{
            ....
        },
        ... and so on ...
    ]

    Output: a flattened VQA dataset with one tuple per QA-pair, [pair_1 Tuple, pair_2 Tuple, ..., pair_m Tuple]:
    [
        (relative/path/to/image, "Describe this image", "It has Formula Ford cars?"),
        (relative/path/to/image, "What is at this region <loc_20><loc_49><loc_142><loc_121>?", "Nam's favourite night club"),
        (relative/path/to/image, sample_2 1st question, sample_2 1st answer),
        (relative/path/to/image, sample_2 2nd question, sample_2 2nd answer),
        ... and so on ...
    ]
    """
    def __init__(self, data_path: str):
        super(flatten_dataset, self).__init__()
        with open(data_path, "r") as f:
            list_data_dict = json.load(f)
        self.list_data_dict = list_data_dict
        self.all_pairs = []
        for VQA_sample in self.list_data_dict:
            conversations = VQA_sample['conversations']
            # Walk the conversation two turns at a time: (human question, gpt answer).
            for i in range(0, len(conversations) - 1, 2):
                if not (conversations[i]['from'] == 'human' and conversations[i + 1]['from'] == 'gpt'):
                    continue
                # Skip dense-region-caption answers that still carry unconvertible markup.
                if "<DENSE_REGION_CAPTION>" in conversations[i + 1]['value']:
                    if inconvertible_grounding_visual(conversations[i + 1]['value']):
                        continue
                # Skip pairs whose question or answer contains grounding markup we cannot flatten.
                if inconvertible_grounding_description(conversations[i]['value']) or inconvertible_grounding_description(conversations[i + 1]['value']):
                    continue
                self.all_pairs.append((VQA_sample['image'], conversations[i]['value'], conversations[i + 1]['value']))
    def _get_data_dict(self) -> List[Tuple[str, str, str]]:
        print(f"Initial VQA size is {len(self.list_data_dict)}")
        print(f"Flattened VQA size is {len(self.all_pairs)}")
        return self.all_pairs
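
    # Not in the original script: a minimal sketch of the standard torch Dataset protocol
    # (__len__/__getitem__), assuming the flattened pairs should also be iterable from a
    # DataLoader. The method names follow the Dataset API; the layout mirrors all_pairs.
    def __len__(self) -> int:
        # One entry per flattened (image, question, answer) pair.
        return len(self.all_pairs)

    def __getitem__(self, index: int) -> Tuple[str, str, str]:
        # (relative image path, question text, answer text)
        return self.all_pairs[index]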


if __name__ == "__main__":
    source_dir = "/home/ubuntu/Documents/nam/GeoChat_images"
    relative_data_path = "Florence_Instruct_grounding_only.json"
    absolute_data_path = os.path.join(source_dir, relative_data_path)

    # Flatten the multi-turn VQA annotations into (image, question, answer) pairs.
    VQA_dataset = flatten_dataset(absolute_data_path)
    VQA_allPairs = VQA_dataset._get_data_dict()

    # Write the flattened pairs back out next to the source annotations.
    relative_flatten_data_path = "flatten_Florence_Instruct_grounding_only.json"
    absolute_flatten_data_path = os.path.join(source_dir, relative_flatten_data_path)
    with open(absolute_flatten_data_path, "w") as file:
        json.dump(VQA_allPairs, file, indent=4)
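
    # Optional sanity check (not in the original script): reload the dump and confirm each
    # entry is an (image, question, answer) triple. json.dump stores tuples as 3-element lists.
    with open(absolute_flatten_data_path, "r") as file:
        reloaded = json.load(file)
    assert all(len(pair) == 3 for pair in reloaded)
    print(f"Wrote {len(reloaded)} flattened pairs to {absolute_flatten_data_path}")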