extract_features.py (forked from matsui528/sis)
import os
# fix cuda not being detected on windows :|
os.add_dll_directory("C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.4/bin")
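# NOTE: the DLL path above assumes CUDA v11.4 in its default Windows location; adjust it to match
# the local install. os.add_dll_directory exists only on Windows (Python 3.8+), so the call can be
# dropped entirely on Linux/macOS.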
from PIL import Image
from tensorflow.python.keras import backend
from feature_extractor import FeatureExtractor
from tensorflow.data import AUTOTUNE
from imutils import paths
import tensorflow as tf
import argparse
import gc
import h5py
from tensorflow.python.keras.applications.vgg16 import preprocess_input
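
# NOTE: the tensorflow.python.keras.* imports above are private TensorFlow paths; the public
# tensorflow.keras equivalents (tensorflow.keras.backend, tensorflow.keras.applications.vgg16)
# are the safer choice on newer TensorFlow versions.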

def load_images(image_path):
    # read the image from disk, decode it, resize it, expand from 3 to 4 dimensions and preprocess it
    x = tf.io.read_file(image_path)
    x = tf.cond(
        tf.image.is_jpeg(x),
        lambda: tf.image.decode_jpeg(x, channels=3, try_recover_truncated=True, acceptable_fraction=0.5),
        lambda: tf.image.decode_png(x, channels=3)
    )
    x = tf.image.resize(x, (224, 224))
    x = tf.expand_dims(x, axis=0)
    x = preprocess_input(x)
    return tf.strings.split(image_path, "\\")[-1], x
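
# Each element produced by load_images is a (filename, image) pair whose image already carries a
# leading batch dimension, so batching below yields tensors of shape (BS, 1, 224, 224, 3) and the
# inner loop further down hands predict_and_normalize one (1, 224, 224, 3) batch per image.
# Splitting on "\\" to recover the filename assumes Windows-style path separators.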
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-d", "--dataset", required=True, help="path to input dataset")
args = vars(ap.parse_args())
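
# example invocation (the dataset path is illustrative; any directory of images works):
#   python extract_features.py --dataset ./static/img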
# initialize batch size
BS = 32
# grab the list of images in our dataset directory and grab all
# unique class names
print("[INFO] loading image paths...")
imagePaths = list(paths.list_images(args["dataset"]))

# checking for corrupted images; build a filtered list rather than calling remove() on the list
# being iterated, which would skip the entry right after every removed one
validPaths = []
for path in imagePaths:
    try:
        img = Image.open(path)
        img.verify()
        validPaths.append(path)
    except (IOError, SyntaxError):
        print('[WARN] skipping corrupted: ' + path)
imagePaths = validPaths
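
# note: PIL's verify() catches obviously broken files but not every undecodable image, which is why
# decode_jpeg above is also called with try_recover_truncated=True and a relaxed acceptable_fraction.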
total = len(imagePaths)

# build the dataset and data input pipeline
print("[INFO] creating a tf.data input pipeline..")
dataset = tf.data.Dataset.from_tensor_slices(imagePaths)
dataset = (dataset
    .shuffle(1024)
    .map(load_images, num_parallel_calls=AUTOTUNE)
    .cache()
    .batch(BS)
    .prefetch(AUTOTUNE)
)
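
# In the pipeline above, shuffle() randomizes the processing order, map() decodes and preprocesses
# images in parallel, cache() keeps decoded tensors in memory after their first use, batch() groups
# BS images per step, and prefetch() overlaps preprocessing with feature extraction.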
datasetGen = iter(dataset)
current = 0
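# FeatureExtractor comes from the accompanying feature_extractor.py; in the upstream sis project it
# wraps a VGG16 model and L2-normalizes the extracted features, so predict_and_normalize is assumed
# here to return one normalized feature vector per preprocessed image batch.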
fe = FeatureExtractor()
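# opening data.h5 in "a" (append) mode makes the script resumable: features already stored in the
# file are skipped by the "if name in hf" check below; the ./static/feature directory must exist.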
hf = h5py.File("./static/feature/data.h5", "a")

# loop through all images
for pathBatch, imageBatch in datasetGen:
    for name, image in zip(pathBatch, imageBatch):
        current += 1
        # tensor byte array to string
        name = name.numpy().decode('ascii')
        print("[PARSING {}/{}] {}".format(current, total, name))
        if name in hf:
            continue
        # extract feature
        feature = fe.predict_and_normalize(image)
        # store it in a file :)
        hf.create_dataset(name, data=feature, compression="gzip")
        # cleanup, if it even helps at all
        del feature
        del name
        del image
    # cleanup, if it even helps at all; clearing the Keras session between batches is a
    # best-effort guard against memory growth from repeated predict() calls
    del pathBatch
    del imageBatch
    backend.clear_session()
    gc.collect()
hf.close()
print('[INFO] DONE~')