From 863032eea98006064335c65c1ebb5b4bfa6c86d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Pedersen?= Date: Fri, 15 Mar 2024 12:55:57 +0100 Subject: [PATCH] feat: Support for setting aggregation for (Hybrid)RouteLayer (#202) * Enable to set top_k for RouteLayer * Add top_k unit test for RouteLayer * Added support for setting different aggregation method for HybridRL * Add unit test to verify that aggregation works * Add support for setting aggregation in RouteLayer * Fix bug where agg method not used in RL * Add tests for agg * Linted code * Updated codecov file * Change aggregation method names to lower case --------- Co-authored-by: James Briggs <35938317+jamescalam@users.noreply.github.com> --- coverage.xml | 385 +++++++++++++++++--------------- semantic_router/hybrid_layer.py | 24 +- semantic_router/layer.py | 31 ++- tests/unit/test_hybrid_layer.py | 43 ++++ tests/unit/test_layer.py | 44 +++- 5 files changed, 344 insertions(+), 183 deletions(-) diff --git a/coverage.xml b/coverage.xml index ef0c214f..321f6c5c 100644 --- a/coverage.xml +++ b/coverage.xml @@ -1,12 +1,12 @@ - + /Users/andreped/workspace/semantic-router/semantic_router - + @@ -18,7 +18,7 @@ - + @@ -33,104 +33,116 @@ - - - + + - - + + - - + + - + + - - + + + - - - + + - - + + + - - - - - - + + + + + - - + - + + + - - + + + - - + + - - - - - - - + + + + + - - + - + - + - + + - - + - - + + + - - - + + + + + + + + + + + + + + + - + @@ -204,217 +216,232 @@ - + + - - + + - - + + - + - - - - + + + + - + - + - + - - + + - + - - - + - - + + + - + + - + - - - - - - - - + + + + + + + + + + - - - - - - + + + - - - - - - - - - + + + + + + + + + + + + - - - - - + + + + - + + - - - - + + + - - - - - - + + + - + + + + + - - - + + + - - + - - - - - - - - - + + + + + + + + + + + - - - - - - + + + - + - - + + + + - - - + - - - + + + + + - - - + + + - - - + + + - + + - - + + - - - - - + + + - + - - + - - + - - - - + + + - + + + + - - - - + + + + + + + - + + + + + + + + + + + @@ -671,7 +698,7 @@ - + @@ -878,7 +905,7 @@ - + @@ -903,12 +930,12 @@ - + - - - - + + + + diff --git a/semantic_router/hybrid_layer.py b/semantic_router/hybrid_layer.py index 9791786f..5f223384 100644 --- a/semantic_router/hybrid_layer.py +++ b/semantic_router/hybrid_layer.py @@ -25,6 +25,7 @@ def __init__( routes: List[Route] = [], alpha: float = 0.3, top_k: int = 5, + aggregation: str = "sum", ): self.encoder = encoder self.score_threshold = self.encoder.score_threshold @@ -39,6 +40,12 @@ def __init__( self.top_k = top_k if self.top_k < 1: raise ValueError(f"top_k needs to be >= 1, but was: {self.top_k}.") + self.aggregation = aggregation + if self.aggregation not in ["sum", "mean", "max"]: + raise ValueError( + f"Unsupported aggregation method chosen: {aggregation}. Choose either 'SUM', 'MEAN', or 'MAX'." + ) + self.aggregation_method = self._set_aggregation_method(self.aggregation) self.routes = routes if isinstance(self.sparse_encoder, TfidfEncoder) and hasattr( self.sparse_encoder, "fit" @@ -165,6 +172,18 @@ def _convex_scaling(self, dense: np.ndarray, sparse: np.ndarray): sparse = np.array(sparse) * (1 - self.alpha) return dense, sparse + def _set_aggregation_method(self, aggregation: str = "sum"): + if aggregation == "sum": + return lambda x: sum(x) + elif aggregation == "mean": + return lambda x: np.mean(x) + elif aggregation == "max": + return lambda x: max(x) + else: + raise ValueError( + f"Unsupported aggregation method chosen: {aggregation}. Choose either 'SUM', 'MEAN', or 'MAX'." + ) + def _semantic_classify(self, query_results: List[Dict]) -> Tuple[str, List[float]]: scores_by_class: Dict[str, List[float]] = {} for result in query_results: @@ -176,7 +195,10 @@ def _semantic_classify(self, query_results: List[Dict]) -> Tuple[str, List[float scores_by_class[route] = [score] # Calculate total score for each class - total_scores = {route: sum(scores) for route, scores in scores_by_class.items()} + total_scores = { + route: self.aggregation_method(scores) + for route, scores in scores_by_class.items() + } top_class = max(total_scores, key=lambda x: total_scores[x], default=None) # Return the top class and its associated scores diff --git a/semantic_router/layer.py b/semantic_router/layer.py index d0d3e33a..221de2be 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -182,6 +182,8 @@ def __init__( llm: Optional[BaseLLM] = None, routes: Optional[List[Route]] = None, index: Optional[BaseIndex] = None, # type: ignore + top_k: int = 5, + aggregation: str = "sum", ): logger.info("local") self.index: BaseIndex = index if index is not None else LocalIndex() @@ -196,6 +198,16 @@ def __init__( self.llm = llm self.routes: list[Route] = routes if routes is not None else [] self.score_threshold = self.encoder.score_threshold + self.top_k = top_k + if self.top_k < 1: + raise ValueError(f"top_k needs to be >= 1, but was: {self.top_k}.") + self.aggregation = aggregation + if self.aggregation not in ["sum", "mean", "max"]: + raise ValueError( + f"Unsupported aggregation method chosen: {aggregation}. Choose either 'SUM', 'MEAN', or 'MAX'." + ) + self.aggregation_method = self._set_aggregation_method(self.aggregation) + # set route score thresholds if not already set for route in self.routes: if route.score_threshold is None: @@ -266,7 +278,7 @@ def _retrieve_top_route( Returns a tuple of the route (if any) and the scores of the top class. """ # get relevant results (scores and routes) - results = self._retrieve(xq=np.array(vector)) + results = self._retrieve(xq=np.array(vector), top_k=self.top_k) # decide most relevant routes top_class, top_class_scores = self._semantic_classify(results) # TODO do we need this check? @@ -391,6 +403,18 @@ def _retrieve(self, xq: Any, top_k: int = 5) -> List[dict]: scores, routes = self.index.query(vector=xq, top_k=top_k) return [{"route": d, "score": s.item()} for d, s in zip(routes, scores)] + def _set_aggregation_method(self, aggregation: str = "sum"): + if aggregation == "sum": + return lambda x: sum(x) + elif aggregation == "mean": + return lambda x: np.mean(x) + elif aggregation == "max": + return lambda x: max(x) + else: + raise ValueError( + f"Unsupported aggregation method chosen: {aggregation}. Choose either 'SUM', 'MEAN', or 'MAX'." + ) + def _semantic_classify(self, query_results: List[dict]) -> Tuple[str, List[float]]: scores_by_class: Dict[str, List[float]] = {} for result in query_results: @@ -402,7 +426,10 @@ def _semantic_classify(self, query_results: List[dict]) -> Tuple[str, List[float scores_by_class[route] = [score] # Calculate total score for each class - total_scores = {route: sum(scores) for route, scores in scores_by_class.items()} + total_scores = { + route: self.aggregation_method(scores) + for route, scores in scores_by_class.items() + } top_class = max(total_scores, key=lambda x: total_scores[x], default=None) # Return the top class and its associated scores diff --git a/tests/unit/test_hybrid_layer.py b/tests/unit/test_hybrid_layer.py index d4896509..bf0c2ad2 100644 --- a/tests/unit/test_hybrid_layer.py +++ b/tests/unit/test_hybrid_layer.py @@ -193,5 +193,48 @@ def test_add_route_tfidf(self, cohere_encoder, tfidf_encoder, routes): assert hybrid_route_layer.sparse_index is not None assert len(hybrid_route_layer.sparse_index) == len(all_utterances) + def test_setting_aggregation_methods(self, openai_encoder, routes): + for agg in ["sum", "mean", "max"]: + route_layer = HybridRouteLayer( + encoder=openai_encoder, + sparse_encoder=sparse_encoder, + routes=routes, + aggregation=agg, + ) + assert route_layer.aggregation == agg + + def test_semantic_classify_multiple_routes_with_different_aggregation( + self, openai_encoder, routes + ): + route_scores = [ + {"route": "Route 1", "score": 0.5}, + {"route": "Route 1", "score": 0.5}, + {"route": "Route 1", "score": 0.5}, + {"route": "Route 1", "score": 0.5}, + {"route": "Route 2", "score": 0.4}, + {"route": "Route 2", "score": 0.6}, + {"route": "Route 2", "score": 0.8}, + {"route": "Route 3", "score": 0.1}, + {"route": "Route 3", "score": 1.0}, + ] + for agg in ["sum", "mean", "max"]: + route_layer = HybridRouteLayer( + encoder=openai_encoder, + sparse_encoder=sparse_encoder, + routes=routes, + aggregation=agg, + ) + classification, score = route_layer._semantic_classify(route_scores) + + if agg == "sum": + assert classification == "Route 1" + assert score == [0.5, 0.5, 0.5, 0.5] + elif agg == "mean": + assert classification == "Route 2" + assert score == [0.4, 0.6, 0.8] + elif agg == "max": + assert classification == "Route 3" + assert score == [0.1, 1.0] + # Add more tests for edge cases and error handling as needed. diff --git a/tests/unit/test_layer.py b/tests/unit/test_layer.py index 415150a5..4a55777b 100644 --- a/tests/unit/test_layer.py +++ b/tests/unit/test_layer.py @@ -120,9 +120,10 @@ def test_data(): class TestRouteLayer: def test_initialization(self, openai_encoder, routes): - route_layer = RouteLayer(encoder=openai_encoder, routes=routes) + route_layer = RouteLayer(encoder=openai_encoder, routes=routes, top_k=10) assert openai_encoder.score_threshold == 0.82 assert route_layer.score_threshold == 0.82 + assert route_layer.top_k == 10 assert len(route_layer.index) if route_layer.index is not None else 0 == 5 assert ( len(set(route_layer._get_route_names())) @@ -522,3 +523,44 @@ def test_remove(self): layer_config = LayerConfig(routes=[route]) layer_config.remove("test") assert layer_config.routes == [] + + def test_setting_aggregation_methods(self, openai_encoder, routes): + for agg in ["sum", "mean", "max"]: + route_layer = RouteLayer( + encoder=openai_encoder, + routes=routes, + aggregation=agg, + ) + assert route_layer.aggregation == agg + + def test_semantic_classify_multiple_routes_with_different_aggregation( + self, openai_encoder, routes + ): + route_scores = [ + {"route": "Route 1", "score": 0.5}, + {"route": "Route 1", "score": 0.5}, + {"route": "Route 1", "score": 0.5}, + {"route": "Route 1", "score": 0.5}, + {"route": "Route 2", "score": 0.4}, + {"route": "Route 2", "score": 0.6}, + {"route": "Route 2", "score": 0.8}, + {"route": "Route 3", "score": 0.1}, + {"route": "Route 3", "score": 1.0}, + ] + for agg in ["sum", "mean", "max"]: + route_layer = RouteLayer( + encoder=openai_encoder, + routes=routes, + aggregation=agg, + ) + classification, score = route_layer._semantic_classify(route_scores) + + if agg == "sum": + assert classification == "Route 1" + assert score == [0.5, 0.5, 0.5, 0.5] + elif agg == "mean": + assert classification == "Route 2" + assert score == [0.4, 0.6, 0.8] + elif agg == "max": + assert classification == "Route 3" + assert score == [0.1, 1.0]