extract_features.py (forked from matsui528/sis)
import os
# fix cuda not being detected on windows :|
os.add_dll_directory("C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.4/bin")
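# NOTE: the DLL path above assumes CUDA v11.4 in its default Windows location; adjust it to match
# the local install. os.add_dll_directory exists only on Windows (Python 3.8+), so the call can be
# dropped entirely on Linux/macOS.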
from PIL import Image
from tensorflow.python.keras import backend
from feature_extractor import FeatureExtractor
from tensorflow.data import AUTOTUNE
from imutils import paths
import tensorflow as tf
import argparse
import gc
import h5py
from tensorflow.python.keras.applications.vgg16 import preprocess_input
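
# NOTE: the tensorflow.python.keras.* imports above are private TensorFlow paths; the public
# tensorflow.keras equivalents (tensorflow.keras.backend, tensorflow.keras.applications.vgg16)
# are the safer choice on newer TensorFlow versions.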

def load_images(image_path):
    # read the image from disk, decode it, resize it, expand from 3 to 4 dimensions and preprocess it
    x = tf.io.read_file(image_path)
    x = tf.cond(
        tf.image.is_jpeg(x),
        lambda: tf.image.decode_jpeg(x, channels=3, try_recover_truncated=True, acceptable_fraction=0.5),
        lambda: tf.image.decode_png(x, channels=3)
    )
    x = tf.image.resize(x, (224, 224))
    x = tf.expand_dims(x, axis=0)
    x = preprocess_input(x)
    return tf.strings.split(image_path, "\\")[-1], x
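
# Each element produced by load_images is a (filename, image) pair whose image already carries a
# leading batch dimension, so batching below yields tensors of shape (BS, 1, 224, 224, 3) and the
# inner loop further down hands predict_and_normalize one (1, 224, 224, 3) batch per image.
# Splitting on "\\" to recover the filename assumes Windows-style path separators.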
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-d", "--dataset", required=True, help="path to input dataset")
args = vars(ap.parse_args())
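
# example invocation (the dataset path is illustrative; any directory of images works):
#   python extract_features.py --dataset ./static/img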
# initialize batch size
BS = 32
# grab the list of images in our dataset directory and grab all
# unique class names
print("[INFO] loading image paths...")
imagePaths = list(paths.list_images(args["dataset"]))

# checking for corrupted images; build a filtered list rather than calling remove() on the list
# being iterated, which would skip the entry right after every removed one
validPaths = []
for path in imagePaths:
    try:
        img = Image.open(path)
        img.verify()
        validPaths.append(path)
    except (IOError, SyntaxError):
        print('[WARN] skipping corrupted: ' + path)
imagePaths = validPaths
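
# note: PIL's verify() catches obviously broken files but not every undecodable image, which is why
# decode_jpeg above is also called with try_recover_truncated=True and a relaxed acceptable_fraction.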
total = len(imagePaths)

# build the dataset and data input pipeline
print("[INFO] creating a tf.data input pipeline..")
dataset = tf.data.Dataset.from_tensor_slices(imagePaths)
dataset = (dataset
    .shuffle(1024)
    .map(load_images, num_parallel_calls=AUTOTUNE)
    .cache()
    .batch(BS)
    .prefetch(AUTOTUNE)
)
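
# In the pipeline above, shuffle() randomizes the processing order, map() decodes and preprocesses
# images in parallel, cache() keeps decoded tensors in memory after their first use, batch() groups
# BS images per step, and prefetch() overlaps preprocessing with feature extraction.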
datasetGen = iter(dataset)
current = 0
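# FeatureExtractor comes from the accompanying feature_extractor.py; in the upstream sis project it
# wraps a VGG16 model and L2-normalizes the extracted features, so predict_and_normalize is assumed
# here to return one normalized feature vector per preprocessed image batch.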
fe = FeatureExtractor()
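# opening data.h5 in "a" (append) mode makes the script resumable: features already stored in the
# file are skipped by the "if name in hf" check below; the ./static/feature directory must exist.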
hf = h5py.File("./static/feature/data.h5", "a")

# loop through all images
for pathBatch, imageBatch in datasetGen:
    for name, image in zip(pathBatch, imageBatch):
        current += 1
        # tensor byte array to string
        name = name.numpy().decode('ascii')
        print("[PARSING {}/{}] {}".format(current, total, name))
        if name in hf:
            continue
        # extract feature
        feature = fe.predict_and_normalize(image)
        # store it in a file :)
        hf.create_dataset(name, data=feature, compression="gzip")
        # cleanup, if it even helps at all
        del feature
        del name
        del image
    # cleanup, if it even helps at all; clearing the Keras session between batches is a
    # best-effort guard against memory growth from repeated predict() calls
    del pathBatch
    del imageBatch
    backend.clear_session()
    gc.collect()
hf.close()
print('[INFO] DONE~')