fix: performance adjustments, migrate
ralphrass committed Oct 4, 2024
1 parent 5f7028b commit 142a29b
Showing 2 changed files with 39 additions and 16 deletions.
27 changes: 23 additions & 4 deletions butterfree/migrations/database_migration/cassandra_migration.py
@@ -86,11 +86,30 @@
     def _get_alter_column_type_query(self, column: Diff, table_name: str) -> str:
         """Creates CQL statement to alter columns' types.
 
+        In Cassandra 3.4.x to 3.11.x alter type is not allowed.
+        This method creates a temp column to comply.
+
         Args:
             columns: list of Diff objects with ALTER_TYPE kind.
             table_name: table name.
 
         Returns:
             Alter column type query.
 
         """
-        parsed_columns = self._get_parsed_columns([column])
-
-        return (
-            f"ALTER TABLE {table_name} ALTER {parsed_columns.replace(' ', ' TYPE ')};"
-        )
+        temp_column_name = f"{column.column}_temp"
+
+        add_temp_column_query = f"ALTER TABLE {table_name} ADD {temp_column_name} {column.value};"
+        copy_data_to_temp_query = f"UPDATE {table_name} SET {temp_column_name} = {column.column};"
+
+        drop_old_column_query = f"ALTER TABLE {table_name} DROP {column.column};"
+        rename_temp_column_query = f"ALTER TABLE {table_name} RENAME {temp_column_name} TO {column.column};"
+
+        return f"{add_temp_column_query} {copy_data_to_temp_query} {drop_old_column_query} {rename_temp_column_query};"
 
     @staticmethod
     def _get_create_table_query(columns: List[Dict[str, Any]], table_name: str) -> str:
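For reference, a minimal standalone sketch of the query string the new _get_alter_column_type_query builds. SimpleDiff and the orders/amount example names are illustrative stand-ins, not butterfree's real Diff class:

# Reproduces the string construction from the added lines above.
from collections import namedtuple

SimpleDiff = namedtuple("SimpleDiff", ["column", "value"])


def build_alter_column_type_query(column: SimpleDiff, table_name: str) -> str:
    temp_column_name = f"{column.column}_temp"

    # Four-step workaround: add a temp column, copy the data over, drop the
    # old column, then rename the temp column back to the original name.
    add_temp = f"ALTER TABLE {table_name} ADD {temp_column_name} {column.value};"
    copy_data = f"UPDATE {table_name} SET {temp_column_name} = {column.column};"
    drop_old = f"ALTER TABLE {table_name} DROP {column.column};"
    rename_temp = f"ALTER TABLE {table_name} RENAME {temp_column_name} TO {column.column};"

    return f"{add_temp} {copy_data} {drop_old} {rename_temp}"


print(build_alter_column_type_query(SimpleDiff(column="amount", value="double"), "orders"))
# ALTER TABLE orders ADD amount_temp double; UPDATE orders SET amount_temp = amount;
# ALTER TABLE orders DROP amount; ALTER TABLE orders RENAME amount_temp TO amount;

The method returns all four statements as one string; in Cassandra versions where ALTER ... TYPE is not allowed, this sequence replaces the direct type change.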
28 changes: 16 additions & 12 deletions butterfree/transform/aggregated_feature_set.py
@@ -576,14 +576,17 @@ def construct(
 
         pre_hook_df = self.run_pre_hooks(dataframe)
 
-        output_df = reduce(
-            lambda df, feature: feature.transform(df),
-            self.keys + [self.timestamp],
-            pre_hook_df,
-        )
+        # Apply transformations
+        for feature in self.keys + [self.timestamp]:
+            output_df = feature.transform(pre_hook_df)
+
+        # Early filter data
+        output_df = self.incremental_strategy.filter_with_incremental_strategy(
+            dataframe=output_df, start_date=start_date, end_date=end_date
+        )
 
         if self._windows and end_date is not None:
-            # run aggregations for each window
+            # Run aggregations for each window
             agg_list = [
                 self._aggregate(
                     dataframe=output_df,
@@ -603,13 +606,12 @@ def construct(
 
             # keeping this logic to maintain the same behavior for already implemented
             # feature sets
-
             if self._windows[0].slide == "1 day":
                 base_df = self._get_base_dataframe(
                     client=client, dataframe=output_df, end_date=end_date
                 )
 
-                # left join each aggregation result to our base dataframe
+                # Left join each aggregation result to our base dataframe
                 output_df = reduce(
                     lambda left, right: self._dataframe_join(
                         left,
@@ -635,19 +637,21 @@ def construct(
         else:
             output_df = self._aggregate(output_df, features=self.features)
 
-        output_df = self.incremental_strategy.filter_with_incremental_strategy(
-            dataframe=output_df, start_date=start_date, end_date=end_date
-        )
 
         output_df = output_df.select(*self.columns).replace(  # type: ignore
             float("nan"), None
         )
 
         if not output_df.isStreaming and self.deduplicate_rows:
             output_df = self._filter_duplicated_rows(output_df)
 
         post_hook_df = self.run_post_hooks(output_df)
 
+        # Eager evaluation, only if needed and manageable
         if not output_df.isStreaming and self.eager_evaluation:
-            post_hook_df.cache().count()
+            # Small dataframes only
+            if output_df.count() < 1_000_000:
+                post_hook_df.cache().count()
+            else:
+                post_hook_df.cache()  # Cache without materialization for large volumes
 
         return post_hook_df
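A standalone sketch of the caching policy from the last hunk, assuming plain PySpark DataFrames; the helper name and the threshold constant are illustrative, not butterfree API:

from pyspark.sql import DataFrame

EAGER_EVALUATION_ROW_LIMIT = 1_000_000  # same cut-off as in the diff above


def cache_post_hook_df(
    output_df: DataFrame, post_hook_df: DataFrame, eager_evaluation: bool
) -> DataFrame:
    # Streaming dataframes cannot be counted, so skip eager materialization.
    if output_df.isStreaming or not eager_evaluation:
        return post_hook_df

    if output_df.count() < EAGER_EVALUATION_ROW_LIMIT:
        # Small volume: cache and force materialization immediately.
        post_hook_df.cache().count()
    else:
        # Large volume: mark for caching; the first downstream action materializes it.
        post_hook_df.cache()

    return post_hook_df

In Spark, cache() alone is lazy; only an action such as count() materializes the cached data, which is why the small-volume branch forces a count.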

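The other construct() change moves IncrementalStrategy.filter_with_incremental_strategy so it runs right after the key and timestamp transforms rather than after aggregation. A rough standalone equivalent, assuming the strategy simply bounds a timestamp column by start_date and end_date (the column name here is an assumption for illustration):

from typing import Optional

from pyspark.sql import DataFrame, functions as F


def filter_incrementally(
    df: DataFrame, start_date: Optional[str] = None, end_date: Optional[str] = None
) -> DataFrame:
    # Bound the dataframe by the requested date window before any aggregation,
    # so the windowed aggregations only scan the relevant slice of data.
    if start_date:
        df = df.filter(F.col("timestamp") >= F.to_timestamp(F.lit(start_date)))
    if end_date:
        df = df.filter(F.col("timestamp") <= F.to_timestamp(F.lit(end_date)))
    return df

Pushing this filter ahead of the windowed aggregations means they operate on an already-bounded dataframe, which lines up with the commit's stated performance adjustments.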