diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index e43183cd..8a2880a8 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -127,6 +127,7 @@ jobs: - name: Extract metadata (tags, labels) for Docker id: meta-grpc + if: ${{ matrix.grpc }} uses: docker/metadata-action@v5 with: images: | @@ -142,6 +143,7 @@ jobs: - name: Build and push Docker image id: build-and-push-grpc + if: ${{ matrix.grpc }} uses: docker/build-push-action@v6 with: context: . diff --git a/.github/workflows/matrix.json b/.github/workflows/matrix.json index c83fcf34..230bd986 100644 --- a/.github/workflows/matrix.json +++ b/.github/workflows/matrix.json @@ -6,6 +6,7 @@ "sccache": true, "cudaComputeCap": 75, "extraBuildArgs": "DEFAULT_USE_FLASH_ATTENTION=False", + "grpc": true, "dockerfile": "Dockerfile-cuda" }, { @@ -14,6 +15,7 @@ "runOn": "always", "sccache": true, "cudaComputeCap": 80, + "grpc": true, "dockerfile": "Dockerfile-cuda" }, { @@ -22,6 +24,7 @@ "runOn": "main", "sccache": true, "cudaComputeCap": 86, + "grpc": true, "dockerfile": "Dockerfile-cuda" }, { @@ -30,6 +33,7 @@ "runOn": "main", "sccache": true, "cudaComputeCap": 89, + "grpc": true, "dockerfile": "Dockerfile-cuda" }, { @@ -38,6 +42,7 @@ "runOn": "main", "sccache": true, "cudaComputeCap": 90, + "grpc": true, "dockerfile": "Dockerfile-cuda" }, { @@ -45,6 +50,7 @@ "imageNamePrefix": "cuda-", "runOn": "main", "sccache": false, + "grpc": false, "dockerfile": "Dockerfile-cuda-all" }, { @@ -52,6 +58,7 @@ "imageNamePrefix": "cpu-", "runOn": "main", "sccache": true, + "grpc": true, "dockerfile": "Dockerfile" } ] diff --git a/router/src/http/server.rs b/router/src/http/server.rs index e2623c38..ae3d8a60 100644 --- a/router/src/http/server.rs +++ b/router/src/http/server.rs @@ -89,6 +89,8 @@ example = json ! ({"error": "Inference failed", "error_type": "backend"})), example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})), (status = 422, description = "Tokenization error", body = ErrorResponse, example = json ! ({"error": "Tokenization error", "error_type": "tokenizer"})), +(status = 400, description = "Batch is empty", body = ErrorResponse, +example = json ! ({"error": "Batch is empty", "error_type": "empty"})), (status = 413, description = "Batch size error", body = ErrorResponse, example = json ! ({"error": "Batch size error", "error_type": "validation"})), ) @@ -285,6 +287,8 @@ example = json ! ({"error": "Inference failed", "error_type": "backend"})), example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})), (status = 422, description = "Tokenization error", body = ErrorResponse, example = json ! ({"error": "Tokenization error", "error_type": "tokenizer"})), +(status = 400, description = "Batch is empty", body = ErrorResponse, +example = json ! ({"error": "Batch is empty", "error_type": "empty"})), (status = 413, description = "Batch size error", body = ErrorResponse, example = json ! ({"error": "Batch size error", "error_type": "validation"})), ) @@ -306,7 +310,7 @@ async fn rerank( tracing::error!("{message}"); let err = ErrorResponse { error: message, - error_type: ErrorType::Validation, + error_type: ErrorType::Empty, }; let counter = metrics::counter!("te_request_failure", "err" => "validation"); counter.increment(1); @@ -471,6 +475,8 @@ example = json ! ({"error": "Inference failed", "error_type": "backend"})), example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})), (status = 422, description = "Tokenization error", body = ErrorResponse, example = json ! ({"error": "Tokenization error", "error_type": "tokenizer"})), +(status = 400, description = "Batch is empty", body = ErrorResponse, +example = json ! ({"error": "Batch is empty", "error_type": "empty"})), (status = 413, description = "Batch size error", body = ErrorResponse, example = json ! ({"error": "Batch size error", "error_type": "validation"})), ) @@ -489,7 +495,7 @@ async fn similarity( tracing::error!("{message}"); let err = ErrorResponse { error: message, - error_type: ErrorType::Validation, + error_type: ErrorType::Empty, }; let counter = metrics::counter!("te_request_failure", "err" => "validation"); counter.increment(1); @@ -553,6 +559,8 @@ example = json ! ({"error": "Inference failed", "error_type": "backend"})), example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})), (status = 422, description = "Tokenization error", body = ErrorResponse, example = json ! ({"error": "Tokenization error", "error_type": "tokenizer"})), +(status = 400, description = "Batch is empty", body = ErrorResponse, +example = json ! ({"error": "Batch is empty", "error_type": "empty"})), (status = 413, description = "Batch size error", body = ErrorResponse, example = json ! ({"error": "Batch size error", "error_type": "validation"})), ) @@ -615,7 +623,7 @@ async fn embed( tracing::error!("{message}"); let err = ErrorResponse { error: message, - error_type: ErrorType::Validation, + error_type: ErrorType::Empty, }; let counter = metrics::counter!("te_request_failure", "err" => "validation"); counter.increment(1); @@ -722,6 +730,8 @@ example = json ! ({"error": "Inference failed", "error_type": "backend"})), example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})), (status = 422, description = "Tokenization error", body = ErrorResponse, example = json ! ({"error": "Tokenization error", "error_type": "tokenizer"})), +(status = 400, description = "Batch is empty", body = ErrorResponse, +example = json ! ({"error": "Batch is empty", "error_type": "empty"})), (status = 413, description = "Batch size error", body = ErrorResponse, example = json ! ({"error": "Batch size error", "error_type": "validation"})), ) @@ -792,7 +802,7 @@ async fn embed_sparse( tracing::error!("{message}"); let err = ErrorResponse { error: message, - error_type: ErrorType::Validation, + error_type: ErrorType::Empty, }; let counter = metrics::counter!("te_request_failure", "err" => "validation"); counter.increment(1); @@ -900,6 +910,8 @@ example = json ! ({"error": "Inference failed", "error_type": "backend"})), example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})), (status = 422, description = "Tokenization error", body = ErrorResponse, example = json ! ({"error": "Tokenization error", "error_type": "tokenizer"})), +(status = 400, description = "Batch is empty", body = ErrorResponse, +example = json ! ({"error": "Batch is empty", "error_type": "empty"})), (status = 413, description = "Batch size error", body = ErrorResponse, example = json ! ({"error": "Batch size error", "error_type": "validation"})), ) @@ -961,7 +973,7 @@ async fn embed_all( tracing::error!("{message}"); let err = ErrorResponse { error: message, - error_type: ErrorType::Validation, + error_type: ErrorType::Empty, }; let counter = metrics::counter!("te_request_failure", "err" => "validation"); counter.increment(1); @@ -1067,6 +1079,8 @@ example = json ! ({"message": "Inference failed", "type": "backend"})), example = json ! ({"message": "Model is overloaded", "type": "overloaded"})), (status = 422, description = "Tokenization error", body = OpenAICompatErrorResponse, example = json ! ({"message": "Tokenization error", "type": "tokenizer"})), +(status = 400, description = "Batch is empty", body = OpenAICompatErrorResponse, +example = json ! ({"message": "Batch is empty", "type": "empty"})), (status = 413, description = "Batch size error", body = OpenAICompatErrorResponse, example = json ! ({"message": "Batch size error", "type": "validation"})), ) @@ -1150,7 +1164,7 @@ async fn openai_embed( tracing::error!("{message}"); let err = ErrorResponse { error: message, - error_type: ErrorType::Validation, + error_type: ErrorType::Empty, }; let counter = metrics::counter!("te_request_failure", "err" => "validation"); counter.increment(1); @@ -1265,8 +1279,12 @@ path = "/tokenize", request_body = TokenizeRequest, responses( (status = 200, description = "Tokenized ids", body = TokenizeResponse), +(status = 400, description = "Batch is empty", body = ErrorResponse, +example = json ! ({"error": "Batch is empty", "error_type": "empty"})), +(status = 413, description = "Batch size error", body = ErrorResponse, +example = json ! ({"error": "Batch size error", "error_type": "validation"})), (status = 422, description = "Tokenization error", body = ErrorResponse, -example = json ! ({"message": "Tokenization error", "type": "tokenizer"})), +example = json ! ({"error": "Tokenization error", "error_type": "tokenizer"})), ) )] #[instrument(skip_all)] @@ -1327,7 +1345,7 @@ async fn tokenize( tracing::error!("{message}"); let err = ErrorResponse { error: message, - error_type: ErrorType::Validation, + error_type: ErrorType::Empty, }; let counter = metrics::counter!("te_request_failure", "err" => "validation"); counter.increment(1); @@ -1377,8 +1395,12 @@ path = "/decode", request_body = DecodeRequest, responses( (status = 200, description = "Decoded ids", body = DecodeResponse), +(status = 400, description = "Batch is empty", body = ErrorResponse, +example = json ! ({"error": "Batch is empty", "error_type": "empty"})), +(status = 413, description = "Batch size error", body = ErrorResponse, +example = json ! ({"error": "Batch size error", "error_type": "validation"})), (status = 422, description = "Tokenization error", body = ErrorResponse, -example = json ! ({"message": "Tokenization error", "type": "tokenizer"})), +example = json ! ({"error": "Tokenization error", "error_type": "tokenizer"})), ) )] #[instrument(skip_all)] @@ -1403,7 +1425,7 @@ async fn decode( tracing::error!("{message}"); let err = ErrorResponse { error: message, - error_type: ErrorType::Validation, + error_type: ErrorType::Empty, }; let counter = metrics::counter!("te_request_failure", "err" => "validation"); counter.increment(1); @@ -1454,6 +1476,8 @@ example = json ! ({"error": "Inference failed", "error_type": "backend"})), example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})), (status = 422, description = "Tokenization error", body = ErrorResponse, example = json ! ({"error": "Tokenization error", "error_type": "tokenizer"})), +(status = 400, description = "Batch is empty", body = ErrorResponse, +example = json ! ({"error": "Batch is empty", "error_type": "empty"})), (status = 413, description = "Batch size error", body = ErrorResponse, example = json ! ({"error": "Batch size error", "error_type": "validation"})), ) @@ -1804,6 +1828,7 @@ impl From<&ErrorType> for StatusCode { ErrorType::Overloaded => StatusCode::TOO_MANY_REQUESTS, ErrorType::Tokenizer => StatusCode::UNPROCESSABLE_ENTITY, ErrorType::Validation => StatusCode::PAYLOAD_TOO_LARGE, + ErrorType::Empty => StatusCode::BAD_REQUEST, } } } diff --git a/router/src/lib.rs b/router/src/lib.rs index 540e61af..68c86b87 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -516,6 +516,7 @@ pub enum ErrorType { Overloaded, Validation, Tokenizer, + Empty, } #[derive(Serialize)]