-
Notifications
You must be signed in to change notification settings - Fork 6
/
mmmu_download.py
52 lines (50 loc) · 2.07 KB
/
mmmu_download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from datasets import load_dataset
from tqdm import tqdm
import os
# Subject names iterated by the download loop; each one is passed to
# load_dataset as the configuration name of the "MMMU/MMMU" dataset.
cate = [
    'Accounting', 'Agriculture', 'Architecture_and_Engineering', 'Art',
    'Art_Theory', 'Basic_Medical_Science', 'Biology', 'Chemistry',
    'Clinical_Medicine', 'Computer_Science', 'Design',
    'Diagnostics_and_Laboratory_Medicine', 'Economics', 'Electronics',
    'Energy_and_Power', 'Finance', 'Geography', 'History', 'Literature',
    'Manage', 'Marketing', 'Materials', 'Math', 'Mechanical_Engineering',
    'Music', 'Pharmacy', 'Physics', 'Psychology', 'Public_Health',
    'Sociology',
]

# Maps every subject in `cate` to its broader discipline. The mapping is
# expanded from (discipline, subjects) groups; group and subject order are
# chosen so the resulting key order matches the flat literal it replaces.
temp = {
    subject: discipline
    for discipline, subjects in (
        ("art_and_design", ("Art", "Design", "Music", "Art_Theory")),
        ("business", (
            "Accounting", "Economics", "Finance", "Manage", "Marketing",
        )),
        ("science", ("Biology", "Chemistry", "Geography", "Math", "Physics")),
        ("health_and_medicine", (
            "Basic_Medical_Science",
            "Clinical_Medicine",
            "Diagnostics_and_Laboratory_Medicine",
            "Pharmacy",
            "Public_Health",
        )),
        ("humanities_and_social_sci", (
            "History", "Literature", "Psychology", "Sociology",
        )),
        ("tech_and_engineering", (
            "Agriculture",
            "Architecture_and_Engineering",
            "Computer_Science",
            "Electronics",
            "Energy_and_Power",
            "Materials",
            "Mechanical_Engineering",
        )),
    )
    for subject in subjects
}
# Download loop: for every subject in `cate`, load that configuration of the
# "MMMU/MMMU" dataset and write each entry of its 'image_1' column to
# save_dir as a sequentially numbered PNG file (0.png, 1.png, ...).
ids = 0  # running counter across all subjects and splits; used as file name
save_dir = "./mmmu"
# exist_ok=True replaces the os.path.exists() pre-check, which left a window
# in which the directory could be created between the check and makedirs().
os.makedirs(save_dir, exist_ok=True)
# NOTE(review): `pattern` is not referenced anywhere in this script; it is
# kept only so the module-level name does not disappear.
pattern = r"\['(.*?)'\]"
# The split names do not depend on the subject, so define them once.
splits = ['dev', 'test', 'validation']
for c in tqdm(cate):
    dataset = load_dataset("MMMU/MMMU", c)
    for s in splits:
        # presumably PIL images (they are saved via .save below) -- confirm
        images = dataset[s]['image_1']
        for img in images:
            path = f"{save_dir}/{ids}.png"
            img.save(path)
            # Advance per image so every file gets a unique name.
            ids += 1