Merge branch 'main' into tmp/fc-model

Signed-off-by: Avik Basu <ab93@users.noreply.github.com>
numaproj · Jun 11, 2024 · f3252a2 · f3252a2
2 parents 7114f76 + 0e1a928
commit f3252a2
Show file tree

Hide file tree

Showing 23 changed files with 974 additions and 872 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -36,7 +36,8 @@ var/
 *.egg-info/
 .installed.cfg
 *.egg
+site
 
 # Virtual environment
 .env
-venv/
+**/.venv/
diff --git a/.github/workflows/changelog.yml b/.github/workflows/changelog.yml
@@ -10,7 +10,7 @@ jobs:
     runs-on: ubuntu-latest
     name: Generate changelog
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
         with:
           ref: main
           fetch-depth: 0

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -13,16 +13,16 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9", "3.10", "3.11"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
 
     - name: Install poetry
       run: pipx install poetry==1.6.1
 
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
         cache: 'poetry'

diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
@@ -16,13 +16,13 @@ jobs:
         python-version: ["3.9"]
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
 
     - name: Install poetry
       run: pipx install poetry==1.6.1
 
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
         cache: 'poetry'

diff --git a/.github/workflows/gh-pages.yaml b/.github/workflows/gh-pages.yaml
@@ -15,9 +15,9 @@ jobs:
     if: github.repository == 'numaproj/numalogic'
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Setup Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: 3.9
       - name: build

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -11,7 +11,7 @@ jobs:
     name: Black format
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: psf/black@stable
         with:
           options: "--check --verbose"

diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml
@@ -16,13 +16,13 @@ jobs:
 
     name: Publish to PyPi
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Install poetry
         run: pipx install poetry==1.6.1
 
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
           cache: 'poetry'

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -19,7 +19,7 @@ jobs:
         python-version: ["3.11"]
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Docker Login
         uses: docker/login-action@v2.1.0

diff --git a/Dockerfile b/Dockerfile
@@ -3,9 +3,9 @@
 ####################################################################################################
 
 ARG PYTHON_VERSION=3.11
-FROM python:${PYTHON_VERSION}-slim-bookworm AS builder
+FROM python:${PYTHON_VERSION}-bookworm AS builder
 
-ARG POETRY_VERSION=1.6
+ARG POETRY_VERSION=1.8
 ARG INSTALL_EXTRAS
 
 ENV POETRY_NO_INTERACTION=1 \
@@ -16,28 +16,22 @@ ENV POETRY_NO_INTERACTION=1 \
     POETRY_HOME="/opt/poetry" \
     PATH="$POETRY_HOME/bin:$PATH"
 
-RUN apt-get update \
-    && apt-get install --no-install-recommends -y build-essential \
-    && apt-get clean && rm -rf /var/lib/apt/lists/* \
-    && pip install --no-cache-dir poetry==$POETRY_VERSION
+RUN pip install --no-cache-dir poetry==$POETRY_VERSION
 
 WORKDIR /app
 COPY poetry.lock pyproject.toml ./
 
 RUN poetry install --without dev --no-root --extras "${INSTALL_EXTRAS}"  \
     && poetry run pip install --no-cache-dir "torch>=2.0,<3.0" --index-url https://download.pytorch.org/whl/cpu \
-    && poetry run pip install --no-cache-dir "lightning[pytorch]" \
-    && rm -rf $POETRY_CACHE_DIR \
-    && pip cache purge \
-    && apt-get purge -y --auto-remove build-essential
+    && poetry run pip install --no-cache-dir "lightning[pytorch]<3.0"
 
 ####################################################################################################
 # runtime: used for running the udf vertices
 ####################################################################################################
 FROM python:${PYTHON_VERSION}-slim-bookworm AS runtime
 
 RUN apt-get update \
-    && apt-get install --no-install-recommends -y dumb-init \
+    && apt-get install dumb-init \
     && apt-get clean && rm -rf /var/lib/apt/lists/* \
     && apt-get purge -y --auto-remove
 

diff --git a/examples/multichannel_ae.ipynb b/examples/multichannel_ae.ipynb
diff --git a/numalogic/config/factory.py b/numalogic/config/factory.py
@@ -51,6 +51,7 @@ class PreprocessFactory(_ObjectFactory):
         GaussianNoiseAdder,
         DifferenceTransform,
         FlattenVector,
+        FlattenVectorWithPadding,
         PercentileScaler,
         ExpMovingAverage,
     )
@@ -67,6 +68,7 @@ class PreprocessFactory(_ObjectFactory):
         "GaussianNoiseAdder": GaussianNoiseAdder,
         "DifferenceTransform": DifferenceTransform,
         "FlattenVector": FlattenVector,
+        "FlattenVectorWithPadding": FlattenVectorWithPadding,
         "PercentileScaler": PercentileScaler,
         "ExpMovingAverage": ExpMovingAverage,
     }
@@ -123,6 +125,7 @@ class ModelFactory(_ObjectFactory):
     from numalogic.models.autoencoder.variants import (
         VanillaAE,
         SparseVanillaAE,
+        MultichannelAE,
         Conv1dAE,
         SparseConv1dAE,
         LSTMAE,
@@ -135,6 +138,7 @@ class ModelFactory(_ObjectFactory):
     _CLS_MAP: ClassVar[dict] = {
         "VanillaAE": VanillaAE,
         "SparseVanillaAE": SparseVanillaAE,
+        "MultichannelAE": MultichannelAE,
         "Conv1dAE": Conv1dAE,
         "SparseConv1dAE": SparseConv1dAE,
         "LSTMAE": LSTMAE,

diff --git a/numalogic/connectors/_config.py b/numalogic/connectors/_config.py
@@ -52,7 +52,7 @@ class DruidFetcherConf:
     dimensions: list[str] = field(default_factory=list)
     aggregations: dict = field(default_factory=dict)
     group_by: list[str] = field(default_factory=list)
-    pivot: Pivot = field(default_factory=lambda: Pivot())
+    pivot: Optional[Pivot] = field(default_factory=lambda: Pivot())
     granularity: str = "minute"
 
     def __post_init__(self):
@@ -92,7 +92,7 @@ class RDSFetcherConf:
     # metric column names
     metrics: list[str]
     group_by: list[str] = field(default_factory=list)
-    pivot: Pivot = field(default_factory=lambda: Pivot())
+    pivot: Optional[Pivot] = field(default_factory=lambda: Pivot())
     hash_query_type: bool = True
     hash_column_name: str = "model_md5_hash"
     datetime_column_name: str = "eventdatetime"

diff --git a/numalogic/models/autoencoder/variants/vanilla.py b/numalogic/models/autoencoder/variants/vanilla.py
@@ -210,7 +210,7 @@ def predict_step(self, batch: Tensor, batch_idx: int, dataloader_idx: int = 0):
 
 
 class MultichannelAE(BaseAE):
-    r"""Multichannel Autoencoder model based on the vanilla encoder and decoder.
+    r"""Multichannel Vanilla Autoencoder model based on the vanilla encoder and decoder.
         Each channel is an isolated neural network.
 
     Args:
@@ -234,14 +234,12 @@ def __init__(
         decoder_layersizes: Sequence[int] = (8, 16),
         dropout_p: float = 0.25,
         batchnorm: bool = False,
-        encoderinfo: bool = False,
         **kwargs,
     ):
         super().__init__(**kwargs)
         self.seq_len = seq_len
         self.dropout_prob = dropout_p
         self.n_channels = n_channels
-        self.encoderinfo = encoderinfo
         # The number of features per channel default to 1 in this architecture
         self.n_features = 1
 
@@ -291,16 +289,11 @@ def forward(self, batch: Tensor) -> tuple[Tensor, Tensor]:
             encoded = encoder(batch_channel)
             decoded = decoder(encoded)
 
-            if self.encoderinfo:
-                encoded_all.append(encoded)
+            encoded_all.append(encoded)
             decoded_all.append(decoded)
 
-        if self.encoderinfo:
-            # Stack the encoded outputs of all channels when required
-            encoded_all = torch.stack(encoded_all, dim=-1)
-            encoded_all = torch.squeeze(encoded_all, 1)
-        else:
-            encoded_all = EMPTY_TENSOR
+        encoded_all = torch.stack(encoded_all, dim=-1)
+        encoded_all = torch.squeeze(encoded_all, 1)
 
         decoded_all = torch.stack(decoded_all, dim=-1)
         decoded_all = torch.squeeze(decoded_all, 1)

diff --git a/numalogic/transforms/__init__.py b/numalogic/transforms/__init__.py
@@ -22,6 +22,7 @@
     GaussianNoiseAdder,
     DifferenceTransform,
     FlattenVector,
+    FlattenVectorWithPadding,
 )
 from numalogic.transforms._movavg import ExpMovingAverage, expmov_avg_aggregator
 from numalogic.transforms._postprocess import TanhNorm, tanh_norm, SigmoidNorm
@@ -38,6 +39,7 @@
     "GaussianNoiseAdder",
     "DifferenceTransform",
     "FlattenVector",
+    "FlattenVectorWithPadding",
     "PercentileScaler",
     "SigmoidNorm",
 ]
diff --git a/numalogic/transforms/_stateless.py b/numalogic/transforms/_stateless.py
@@ -156,3 +156,56 @@ def transform(self, X: npt.NDArray[float], **__) -> npt.NDArray[float]:
 
     def inverse_transform(self, X: npt.NDArray[float]) -> npt.NDArray[float]:
         return X.reshape(-1, self.n_features)
+
+
+class FlattenVectorWithPadding(StatelessTransformer):
+    """A stateless transformer that flattens some of the columns and does padding on the rest.
+
+    Args:
+    ____
+        features: list of all feature names in the order of columns of the payload matrix
+        flatten_features: list of feature names to be flattened
+        padding_with: numerical value to be used for padding, default is 0
+
+    """
+
+    __slots__ = (
+        "features",
+        "flatten_features",
+        "padding_with",
+        "padding_features",
+        "flatten_indexes",
+        "padding_indexes",
+    )
+
+    @staticmethod
+    def _feature_indexes(features_all: list[str], features: list[str]) -> list[int]:
+        return [features_all.index(f) for f in features]
+
+    def __init__(self, features: list[str], flatten_features: list[str], padding_with: float = 0.0):
+        self.features = features
+        self.flatten_features = flatten_features
+        self.padding_with = padding_with
+
+        self.padding_features = list(set(features) - set(flatten_features))
+        if not self.padding_features:
+            raise ValueError("At least one feature should be left for padding.")
+        self.flatten_indexes = self._feature_indexes(features, self.flatten_features)
+        self.padding_indexes = self._feature_indexes(features, self.padding_features)
+
+    def transform(self, X: npt.NDArray[float], **__) -> npt.NDArray[float]:
+        X_flatten = X[:, self.flatten_indexes].flatten().reshape(-1, 1)
+        padding_len = X_flatten.shape[0] - X.shape[0]
+        X_padding = np.pad(
+            X[:, self.padding_indexes],
+            ((0, padding_len), (0, 0)),
+            mode="constant",
+            constant_values=self.padding_with,
+        )
+        return np.concatenate([X_flatten, X_padding], axis=1)
+
+    def inverse_transform(self, X: npt.NDArray[float]) -> npt.NDArray[float]:
+        X_flatten = X[:, 0].reshape(-1, len(self.flatten_features))
+        original_len = X.shape[0] - X_flatten.shape[0]
+        X_padding_removed = X[:original_len, 1:]
+        return np.concatenate([X_flatten, X_padding_removed], axis=1)
diff --git a/numalogic/udfs/__main__.py b/numalogic/udfs/__main__.py
@@ -28,7 +28,7 @@ def init_server(step: str, server_type: str):
         redis_client = get_redis_client_from_conf(pipeline_conf.redis_conf)
         udf = UDFFactory.get_udf_instance(step, r_client=redis_client, pl_conf=pipeline_conf)
 
-    return ServerFactory.get_server_instance(server_type, handler=udf)
+    return ServerFactory.get_server_instance(server_type, mapper_instance=udf)
 
 
 def start_server() -> None:

diff --git a/numalogic/udfs/factory.py b/numalogic/udfs/factory.py
@@ -94,12 +94,12 @@ def get_udf_instance(cls, udf_name: str, **kwargs) -> nl_udf_t:
 class ServerFactory:
     """Factory class to fetch the right pynumaflow function server/mapper."""
 
-    from pynumaflow.mapper import Mapper, MultiProcMapper, AsyncMapper
+    from pynumaflow.mapper import MapServer, MapMultiprocServer, MapAsyncServer
 
     _SERVER_MAP: ClassVar[dict] = {
-        "sync": Mapper,
-        "async": AsyncMapper,
-        "multiproc": MultiProcMapper,
+        "sync": MapServer,
+        "async": MapAsyncServer,
+        "multiproc": MapMultiprocServer,
     }
 
     @classmethod