From b4fce30302283314fefb1b0f08e4e19cefefbf9c Mon Sep 17 00:00:00 2001
From: Bogdan Buduroiu <bogdan@buduroiu.com>
Date: Tue, 28 Nov 2023 10:45:36 +0800
Subject: [PATCH] Functional tests: capture algo performance

---
 semantic_router/layer.py                 | 19 +++----
 semantic_router/linear.py                | 30 +++++++++++
 tests/functional/test_linear.py          | 69 ++++++++++++++++++++++++
 tests/{ => unit}/encoders/test_base.py   |  0
 tests/{ => unit}/encoders/test_cohere.py |  0
 tests/{ => unit}/encoders/test_openai.py |  0
 tests/{ => unit}/test_layer.py           |  0
 tests/{ => unit}/test_schema.py          |  0
 8 files changed, 105 insertions(+), 13 deletions(-)
 create mode 100644 semantic_router/linear.py
 create mode 100644 tests/functional/test_linear.py
 rename tests/{ => unit}/encoders/test_base.py (100%)
 rename tests/{ => unit}/encoders/test_cohere.py (100%)
 rename tests/{ => unit}/encoders/test_openai.py (100%)
 rename tests/{ => unit}/test_layer.py (100%)
 rename tests/{ => unit}/test_schema.py (100%)

diff --git a/semantic_router/layer.py b/semantic_router/layer.py
index 089f2793..dd746d0e 100644
--- a/semantic_router/layer.py
+++ b/semantic_router/layer.py
@@ -2,6 +2,7 @@
 from numpy.linalg import norm
 
 from semantic_router.encoders import BaseEncoder, CohereEncoder, OpenAIEncoder
+from semantic_router.linear import similarity_matrix, top_scores
 from semantic_router.schema import Decision
 
 
@@ -63,18 +64,12 @@ def _query(self, text: str, top_k: int = 5):
         xq = np.squeeze(xq)  # Reduce to 1d array.
 
         if self.index is not None:
-            index_norm = norm(self.index, axis=1)
-            xq_norm = norm(xq.T)
-            sim = np.dot(self.index, xq.T) / (index_norm * xq_norm)
-            # get indices of top_k records
-            top_k = min(top_k, sim.shape[0])
-            idx = np.argpartition(sim, -top_k)[-top_k:]
-            scores = sim[idx]
+            # calculate similarity matrix
+            sim = similarity_matrix(xq, self.index)
+            scores, idx = top_scores(sim, top_k)
             # get the utterance categories (decision names)
             decisions = self.categories[idx] if self.categories is not None else []
-            return [
-                {"decision": d, "score": s.item()} for d, s in zip(decisions, scores)
-            ]
+            return [{"decision": d, "score": s.item()} for d, s in zip(decisions, scores)]
         else:
             return []
 
@@ -89,9 +84,7 @@ def _semantic_classify(self, query_results: list[dict]) -> tuple[str, list[float
                 scores_by_class[decision] = [score]
 
         # Calculate total score for each class
-        total_scores = {
-            decision: sum(scores) for decision, scores in scores_by_class.items()
-        }
+        total_scores = {decision: sum(scores) for decision, scores in scores_by_class.items()}
         top_class = max(total_scores, key=lambda x: total_scores[x], default=None)
 
         # Return the top class and its associated scores
diff --git a/semantic_router/linear.py b/semantic_router/linear.py
new file mode 100644
index 00000000..1c13262f
--- /dev/null
+++ b/semantic_router/linear.py
@@ -0,0 +1,30 @@
+from typing import Tuple
+
+import numpy as np
+from numpy.linalg import norm
+
+
+def similarity_matrix(xq: np.ndarray, index: np.ndarray) -> np.ndarray:
+    """Compute the cosine similarity between a query vector and a set of vectors.
+
+    Args:
+        xq: A query vector (1d ndarray)
+        index: A set of vectors (2d ndarray, one vector per row).
+
+    Returns:
+        A 1d ndarray holding the cosine similarity of xq to each row of index.
+    """
+
+    index_norm = norm(index, axis=1)
+    xq_norm = norm(xq.T)
+    sim = np.dot(index, xq.T) / (index_norm * xq_norm)
+    return sim
+
+
+def top_scores(sim: np.ndarray, top_k: int = 5) -> Tuple[np.ndarray, np.ndarray]:
+    # get indices of top_k records
+    top_k = min(top_k, sim.shape[0])
+    idx = np.argpartition(sim, -top_k)[-top_k:]
+    scores = sim[idx]
+
+    return scores, idx
diff --git a/tests/functional/test_linear.py b/tests/functional/test_linear.py
new file mode 100644
index 00000000..6771fd9c
--- /dev/null
+++ b/tests/functional/test_linear.py
@@ -0,0 +1,69 @@
+import pytest
+import numpy as np
+
+from semantic_router.linear import similarity_matrix, top_scores
+
+
+@pytest.fixture
+def ident_vector():
+    return np.identity(10)[0]
+
+
+@pytest.fixture
+def test_index():
+    return np.array([[3, 0, 0], [2, 1, 0], [0, 1, 0]])
+
+
+def test_similarity_matrix__dimensionality():
+    """Test that the result holds one similarity score per index row."""
+    xq = np.random.random((10,))  # 10-dimensional embedding vector
+    index = np.random.random((100, 10))
+    S = similarity_matrix(xq, index)
+    assert S.shape == (100,)
+
+
+def test_similarity_matrix__is_norm_max(ident_vector):
+    """
+    Using identical vectors should yield a maximum similarity of 1
+    """
+    index = np.repeat(np.atleast_2d(ident_vector), 3, axis=0)
+    sim = similarity_matrix(ident_vector, index)
+    assert sim.max() == 1.0
+
+
+def test_similarity_matrix__is_norm_min(ident_vector):
+    """
+    Using orthogonal vectors should yield a minimum similarity of 0
+    """
+    orth_v = np.roll(np.atleast_2d(ident_vector), 1)
+    index = np.repeat(orth_v, 3, axis=0)
+    sim = similarity_matrix(ident_vector, index)
+    assert sim.min() == 0.0
+
+
+def test_top_scores__is_sorted(test_index):
+    """
+    Test that the top_scores function returns a sorted list of scores.
+    """
+
+    xq = test_index[0]  # should have max similarity
+
+    sim = similarity_matrix(xq, test_index)
+    _, idx = top_scores(sim, 3)
+
+    # Indexes should be ordered by ascending similarity score
+    assert np.array_equal(idx, np.array([2, 1, 0]))
+
+
+def test_top_scores__scores(test_index):
+    """
+    Test that for a known vector and a known index, the top_scores function
+    returns exactly the expected scores.
+    """
+    xq = test_index[0]  # should have max similarity
+
+    sim = similarity_matrix(xq, test_index)
+    scores, _ = top_scores(sim, 3)
+
+    # Scores should come back in ascending order
+    assert np.allclose(scores, np.array([0.0, 0.89442719, 1.0]))
diff --git a/tests/encoders/test_base.py b/tests/unit/encoders/test_base.py
similarity index 100%
rename from tests/encoders/test_base.py
rename to tests/unit/encoders/test_base.py
diff --git a/tests/encoders/test_cohere.py b/tests/unit/encoders/test_cohere.py
similarity index 100%
rename from tests/encoders/test_cohere.py
rename to tests/unit/encoders/test_cohere.py
diff --git a/tests/encoders/test_openai.py b/tests/unit/encoders/test_openai.py
similarity index 100%
rename from tests/encoders/test_openai.py
rename to tests/unit/encoders/test_openai.py
diff --git a/tests/test_layer.py b/tests/unit/test_layer.py
similarity index 100%
rename from tests/test_layer.py
rename to tests/unit/test_layer.py
diff --git a/tests/test_schema.py b/tests/unit/test_schema.py
similarity index 100%
rename from tests/test_schema.py
rename to tests/unit/test_schema.py