From f3551f7abfd74ac6faa822a6a200114759b3dcdb Mon Sep 17 00:00:00 2001
From: Sefik Ilkin Serengil <serengil@gmail.com>
Date: Mon, 8 Jan 2024 16:53:57 +0000
Subject: [PATCH 1/6] discard sandbox notebooks and outputs

---
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/.gitignore b/.gitignore
index 02f9bbe3..63ebe2c8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,5 +8,6 @@ Pipfile.lock
 .idea/
 deepface.egg-info/
 tests/dataset/*.pkl
-tests/sandbox.ipynb
+tests/*.ipynb
+tests/*.csv
 *.pyc

From d498d510bdfa97da5d1698590450ad4de51ec9c1 Mon Sep 17 00:00:00 2001
From: Sefik Ilkin Serengil <serengil@gmail.com>
Date: Mon, 8 Jan 2024 16:55:24 +0000
Subject: [PATCH 2/6] new output shape of vgg is 4096

---
 README.md                       | 6 +++---
 tests/test_enforce_detection.py | 2 +-
 tests/test_represent.py         | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index b136e6bf..a4549c3f 100644
--- a/README.md
+++ b/README.md
@@ -90,15 +90,15 @@ Face recognition models basically represent facial images as multi-dimensional v
 embedding_objs = DeepFace.represent(img_path = "img.jpg")
 ```
 
-This function returns an array as embedding. The size of the embedding array would be different based on the model name. For instance, VGG-Face is the default model and it represents facial images as 2622 dimensional vectors.
+This function returns an array as embedding. The size of the embedding array would be different based on the model name. For instance, VGG-Face is the default model and it represents facial images as 4096 dimensional vectors.
 
 ```python
 embedding = embedding_objs[0]["embedding"]
 assert isinstance(embedding, list)
-assert model_name = "VGG-Face" and len(embedding) == 2622
+assert model_name == "VGG-Face" and len(embedding) == 4096
 ```
 
-Here, embedding is also [plotted](https://sefiks.com/2020/05/01/a-gentle-introduction-to-face-recognition-in-deep-learning/) with 2622 slots horizontally. Each slot is corresponding to a dimension value in the embedding vector and dimension value is explained in the colorbar on the right. Similar to 2D barcodes, vertical dimension stores no information in the illustration.
+Here, embedding is also [plotted](https://sefiks.com/2020/05/01/a-gentle-introduction-to-face-recognition-in-deep-learning/) with 4096 slots horizontally. Each slot corresponds to a dimension value in the embedding vector, and the dimension value is explained in the colorbar on the right. Similar to 2D barcodes, the vertical dimension stores no information in the illustration.
 
 <p align="center"><img src="https://raw.githubusercontent.com/serengil/deepface/master/icon/embedding.jpg" width="95%" height="95%"></p>
 
diff --git a/tests/test_enforce_detection.py b/tests/test_enforce_detection.py
index 7fa281da..74c4704b 100644
--- a/tests/test_enforce_detection.py
+++ b/tests/test_enforce_detection.py
@@ -33,7 +33,7 @@ def test_disabled_enforce_detection_for_non_facial_input_on_represent():
     assert "w" in objs[0]["facial_area"].keys()
     assert "h" in objs[0]["facial_area"].keys()
     assert isinstance(objs[0]["embedding"], list)
-    assert len(objs[0]["embedding"]) == 2622  # embedding of VGG-Face
+    assert len(objs[0]["embedding"]) == 4096  # embedding of VGG-Face
 
     logger.info("✅ disabled enforce detection with non facial input test for represent tests done")
 
diff --git a/tests/test_represent.py b/tests/test_represent.py
index 2dd68eae..4b455944 100644
--- a/tests/test_represent.py
+++ b/tests/test_represent.py
@@ -10,7 +10,7 @@ def test_standard_represent():
     for embedding_obj in embedding_objs:
         embedding = embedding_obj["embedding"]
         logger.debug(f"Function returned {len(embedding)} dimensional vector")
-        assert len(embedding) == 2622
+        assert len(embedding) == 4096
     logger.info("✅ test standard represent function done")
 
 

From 19f057a712773c6f81a9083e611d830d47e09c9e Mon Sep 17 00:00:00 2001
From: Sefik Ilkin Serengil <serengil@gmail.com>
Date: Mon, 8 Jan 2024 16:57:07 +0000
Subject: [PATCH 3/6] new structure for vgg-face model

---
 deepface/basemodels/VGGFace.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/deepface/basemodels/VGGFace.py b/deepface/basemodels/VGGFace.py
index d02909d6..a1494258 100644
--- a/deepface/basemodels/VGGFace.py
+++ b/deepface/basemodels/VGGFace.py
@@ -19,7 +19,9 @@
         Flatten,
         Dropout,
         Activation,
+        Lambda,
     )
+    from keras import backend as K
 else:
     from tensorflow.keras.models import Model, Sequential
     from tensorflow.keras.layers import (
@@ -29,7 +31,9 @@
         Flatten,
         Dropout,
         Activation,
+        Lambda,
     )
+    from tensorflow.keras import backend as K
 
 # ---------------------------------------
 
@@ -98,6 +102,18 @@ def loadModel(
 
     model.load_weights(output)
 
-    vgg_face_descriptor = Model(inputs=model.layers[0].input, outputs=model.layers[-2].output)
+    # 2622 dimensional model
+    # vgg_face_descriptor = Model(inputs=model.layers[0].input, outputs=model.layers[-2].output)
+
+    # 4096 dimensional model offers a 6% to 14% increase in accuracy!
+    # - softmax causes underfitting
+    # - added normalization layer to avoid underfitting with euclidean
+    # as described here: https://github.com/serengil/deepface/issues/944
+    base_model_output = Sequential()
+    base_model_output = Flatten()(model.layers[-5].output)
+    base_model_output = Lambda(lambda x: K.l2_normalize(x, axis=1), name="norm_layer")(
+        base_model_output
+    )
+    vgg_face_descriptor = Model(inputs=model.input, outputs=base_model_output)
 
     return vgg_face_descriptor

From 05013e550cda3d460c7cdd216b27e17971135103 Mon Sep 17 00:00:00 2001
From: Sefik Ilkin Serengil <serengil@gmail.com>
Date: Mon, 8 Jan 2024 16:57:41 +0000
Subject: [PATCH 4/6] new threshold values for vgg-face

tuned on lfw dataset
---
 deepface/commons/distance.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/deepface/commons/distance.py b/deepface/commons/distance.py
index 4ed22472..17447368 100644
--- a/deepface/commons/distance.py
+++ b/deepface/commons/distance.py
@@ -41,7 +41,12 @@ def findThreshold(model_name: str, distance_metric: str) -> float:
     base_threshold = {"cosine": 0.40, "euclidean": 0.55, "euclidean_l2": 0.75}
 
     thresholds = {
-        "VGG-Face": {"cosine": 0.40, "euclidean": 0.60, "euclidean_l2": 0.86},
+        # "VGG-Face": {"cosine": 0.40, "euclidean": 0.60, "euclidean_l2": 0.86}, # 2622d
+        "VGG-Face": {
+            "cosine": 0.68,
+            "euclidean": 1.17,
+            "euclidean_l2": 1.17,
+        },  # 4096d - tuned with LFW
         "Facenet": {"cosine": 0.40, "euclidean": 10, "euclidean_l2": 0.80},
         "Facenet512": {"cosine": 0.30, "euclidean": 23.56, "euclidean_l2": 1.04},
         "ArcFace": {"cosine": 0.68, "euclidean": 4.15, "euclidean_l2": 1.13},

From 0eb1515e11361483e3c390168427c22824b84a7c Mon Sep 17 00:00:00 2001
From: Sefik Ilkin Serengil <serengil@gmail.com>
Date: Mon, 8 Jan 2024 16:59:20 +0000
Subject: [PATCH 5/6] avoid dimension incompatibility error

The existing pickle may hold 2622 dimensional vectors but VGG-Face is now
creating 4096 dimensional vectors. If they mismatch, raise a meaningful error.
---
 deepface/DeepFace.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/deepface/DeepFace.py b/deepface/DeepFace.py
index 7a83587c..4a157d3b 100644
--- a/deepface/DeepFace.py
+++ b/deepface/DeepFace.py
@@ -616,6 +616,15 @@ def find(
         for index, instance in df.iterrows():
             source_representation = instance[f"{model_name}_representation"]
 
+            target_dims = len(list(target_representation))
+            source_dims = len(list(source_representation))
+            if target_dims != source_dims:
+                raise ValueError(
+                    "Source and target embeddings must have same dimensions but "
+                    + f"{target_dims}:{source_dims}. Model structure may change"
+                    + f" after pickle created. Delete the {file_name} and re-run."
+                )
+
             if distance_metric == "cosine":
                 distance = dst.findCosineDistance(source_representation, target_representation)
             elif distance_metric == "euclidean":
@@ -636,6 +645,7 @@ def find(
 
         threshold = dst.findThreshold(model_name, distance_metric)
         result_df = result_df.drop(columns=[f"{model_name}_representation"])
+        # pylint: disable=unsubscriptable-object
         result_df = result_df[result_df[f"{model_name}_{distance_metric}"] <= threshold]
         result_df = result_df.sort_values(
             by=[f"{model_name}_{distance_metric}"], ascending=True

From d35833e4e1d9b70254c64fb04ebc3a3f2cd8d8ca Mon Sep 17 00:00:00 2001
From: Sefik Ilkin Serengil <serengil@gmail.com>
Date: Mon, 8 Jan 2024 17:31:49 +0000
Subject: [PATCH 6/6] some more checks for find test

---
 tests/test_find.py | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/tests/test_find.py b/tests/test_find.py
index aefe98e9..423567b6 100644
--- a/tests/test_find.py
+++ b/tests/test_find.py
@@ -7,19 +7,42 @@
 
 
 def test_find_with_exact_path():
-    dfs = DeepFace.find(img_path="dataset/img1.jpg", db_path="dataset", silent=True)
+    img_path = "dataset/img1.jpg"
+    dfs = DeepFace.find(img_path=img_path, db_path="dataset", silent=True)
+    assert len(dfs) > 0
     for df in dfs:
         assert isinstance(df, pd.DataFrame)
+
+        # one is img1.jpg itself
+        identity_df = df[df["identity"] == img_path]
+        assert identity_df.shape[0] > 0
+
+        # validate reproducibility
+        assert identity_df["VGG-Face_cosine"].values[0] == 0
+
+        df = df[df["identity"] != img_path]
         logger.debug(df.head())
         assert df.shape[0] > 0
     logger.info("✅ test find for exact path done")
 
 
 def test_find_with_array_input():
-    img1 = cv2.imread("dataset/img1.jpg")
+    img_path = "dataset/img1.jpg"
+    img1 = cv2.imread(img_path)
     dfs = DeepFace.find(img1, db_path="dataset", silent=True)
-
+    assert len(dfs) > 0
     for df in dfs:
+        assert isinstance(df, pd.DataFrame)
+
+        # one is img1.jpg itself
+        identity_df = df[df["identity"] == img_path]
+        assert identity_df.shape[0] > 0
+
+        # validate reproducibility
+        assert identity_df["VGG-Face_cosine"].values[0] == 0
+
+
+        df = df[df["identity"] != img_path]
         logger.debug(df.head())
         assert df.shape[0] > 0