apache · Jackie-Jiang · Apr 5, 2024 · Nov 8, 2023 · Nov 9, 2023 · Mar 9, 2024
diff --git a/...l/src/main/java/org/apache/pinot/segment/local/upsert/BaseTableUpsertMetadataManager.java b/...l/src/main/java/org/apache/pinot/segment/local/upsert/BaseTableUpsertMetadataManager.java
@@ -22,7 +22,6 @@
 import java.io.File;
 import java.util.Collections;
 import java.util.List;
-import java.util.Map;
 import javax.annotation.concurrent.ThreadSafe;
 import org.apache.commons.collections.CollectionUtils;
 import org.apache.pinot.segment.local.data.manager.TableDataManager;
@@ -60,12 +59,7 @@ public void init(TableConfig tableConfig, Schema schema, TableDataManager tableD
 
     PartialUpsertHandler partialUpsertHandler = null;
     if (upsertConfig.getMode() == UpsertConfig.Mode.PARTIAL) {
-      Map<String, UpsertConfig.Strategy> partialUpsertStrategies = upsertConfig.getPartialUpsertStrategies();
-      Preconditions.checkArgument(partialUpsertStrategies != null,
-          "Partial-upsert strategies must be configured for partial-upsert enabled table: %s", _tableNameWithType);
-      partialUpsertHandler =
-          new PartialUpsertHandler(schema, partialUpsertStrategies, upsertConfig.getDefaultPartialUpsertStrategy(),
-              comparisonColumns);
+      partialUpsertHandler = new PartialUpsertHandler(schema, comparisonColumns, upsertConfig);
     }
 
     String deleteRecordColumn = upsertConfig.getDeleteRecordColumn();

diff --git a/...va/org/apache/pinot/segment/local/upsert/ConcurrentMapPartitionUpsertMetadataManager.java b/...va/org/apache/pinot/segment/local/upsert/ConcurrentMapPartitionUpsertMetadataManager.java
@@ -19,7 +19,9 @@
 package org.apache.pinot.segment.local.upsert;
 
 import com.google.common.annotations.VisibleForTesting;
+import java.util.HashMap;
 import java.util.Iterator;
+import java.util.Map;
 import java.util.Objects;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.atomic.AtomicBoolean;
@@ -49,6 +51,7 @@ public class ConcurrentMapPartitionUpsertMetadataManager extends BasePartitionUp
 
   // Used to initialize a reference to previous row for merging in partial upsert
   private final LazyRow _reusePreviousRow = new LazyRow();
+  private final Map<String, Object> _reuseMergeResultHolder = new HashMap<>();
 
   @VisibleForTesting
   final ConcurrentHashMap<Object, RecordLocation> _primaryKeyToRecordLocationMap = new ConcurrentHashMap<>();
@@ -340,7 +343,8 @@ protected GenericRow doUpdateRecord(GenericRow record, RecordInfo recordInfo) {
             int currentDocId = recordLocation.getDocId();
             if (currentQueryableDocIds == null || currentQueryableDocIds.contains(currentDocId)) {
               _reusePreviousRow.init(currentSegment, currentDocId);
-              _partialUpsertHandler.merge(_reusePreviousRow, record);
+              _partialUpsertHandler.merge(_reusePreviousRow, record, _reuseMergeResultHolder);
+              _reuseMergeResultHolder.clear();
             }
           }
           return recordLocation;

diff --git a/...gment-local/src/main/java/org/apache/pinot/segment/local/upsert/PartialUpsertHandler.java b/...gment-local/src/main/java/org/apache/pinot/segment/local/upsert/PartialUpsertHandler.java
@@ -18,90 +18,73 @@
  */
 package org.apache.pinot.segment.local.upsert;
 
-import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.TreeMap;
+import javax.annotation.Nullable;
 import org.apache.pinot.segment.local.segment.readers.LazyRow;
-import org.apache.pinot.segment.local.upsert.merger.OverwriteMerger;
 import org.apache.pinot.segment.local.upsert.merger.PartialUpsertMerger;
 import org.apache.pinot.segment.local.upsert.merger.PartialUpsertMergerFactory;
 import org.apache.pinot.spi.config.table.UpsertConfig;
+import org.apache.pinot.spi.data.FieldSpec;
 import org.apache.pinot.spi.data.Schema;
 import org.apache.pinot.spi.data.readers.GenericRow;
 
 
 /**
  * Handler for partial-upsert.
+ *
+ * This class is responsible for merging the new record with the previous record.
+ * It uses the configured merge strategies to merge the columns. If no merge strategy is configured for a column,
+ * it uses the default merge strategy.
+ *
+ * Custom row-merging logic can be plugged in by implementing {@link PartialUpsertMerger}.
+ * If a custom row merger is defined, it takes precedence and the column mergers are ignored.
  */
 public class PartialUpsertHandler {
-  // _column2Mergers maintains the mapping of merge strategies per columns.
-  private final Map<String, PartialUpsertMerger> _column2Mergers = new HashMap<>();
-  private final PartialUpsertMerger _defaultPartialUpsertMerger;
-  private final List<String> _comparisonColumns;
   private final List<String> _primaryKeyColumns;
+  private final List<String> _comparisonColumns;
+  private final TreeMap<String, FieldSpec> _fieldSpecMap;
+  private final PartialUpsertMerger _partialUpsertMerger;
 
-  public PartialUpsertHandler(Schema schema, Map<String, UpsertConfig.Strategy> partialUpsertStrategies,
-      UpsertConfig.Strategy defaultPartialUpsertStrategy, List<String> comparisonColumns) {
-    _defaultPartialUpsertMerger = PartialUpsertMergerFactory.getMerger(defaultPartialUpsertStrategy);
-    _comparisonColumns = comparisonColumns;
+  public PartialUpsertHandler(Schema schema, List<String> comparisonColumns, UpsertConfig upsertConfig) {
     _primaryKeyColumns = schema.getPrimaryKeyColumns();
+    _comparisonColumns = comparisonColumns;
+    _fieldSpecMap = schema.getFieldSpecMap();
+    _partialUpsertMerger =
+        PartialUpsertMergerFactory.getPartialUpsertMerger(_primaryKeyColumns, comparisonColumns, upsertConfig);
+  }
 
-    for (Map.Entry<String, UpsertConfig.Strategy> entry : partialUpsertStrategies.entrySet()) {
-      _column2Mergers.put(entry.getKey(), PartialUpsertMergerFactory.getMerger(entry.getValue()));
+  public void merge(LazyRow previousRow, GenericRow newRow, Map<String, Object> resultHolder) {
+    _partialUpsertMerger.merge(previousRow, newRow, resultHolder);
+
+    // Iterate over the merger results only and apply each merged value to newRow
+    for (Map.Entry<String, Object> entry : resultHolder.entrySet()) {
+      // Skip primary key and comparison columns even if the merger produced a value for them
+      String column = entry.getKey();
+      if (_primaryKeyColumns.contains(column) || _comparisonColumns.contains(column)) {
+        continue;
+      }
+      setMergedValue(newRow, column, entry.getValue());
     }
-  }
 
-  /**
-   * Merges records and returns the merged record.
-   * We used a map to indicate all configured fields for partial upsert. For these fields
-   * (1) If the prev value is null, return the new value
-   * (2) If the prev record is not null, the new value is null, return the prev value.
-   * (3) If neither values are not null, then merge the value and return.
-   * For un-configured fields, they are using default override behavior, regardless null values.
-   *
-   * For example, overwrite merger will only override the prev value if the new value is not null.
-   * Null values will override existing values if not configured. They can be ignored by using ignoreMerger.
-   *
-   * @param prevRecord wrapper for previous record, which lazily reads column values of previous row and caches for
-   *                   re-reads.
-   * @param newRecord the new consumed record.
-   */
-  public void merge(LazyRow prevRecord, GenericRow newRecord) {
-    for (String column : prevRecord.getColumnNames()) {
-      if (!_primaryKeyColumns.contains(column)) {
-        PartialUpsertMerger merger = _column2Mergers.getOrDefault(column, _defaultPartialUpsertMerger);
-        // Non-overwrite mergers
-        // (1) If the value of the previous is null value, skip merging and use the new value
-        // (2) Else If the value of new value is null, use the previous value (even for comparison columns).
-        // (3) Else If the column is not a comparison column, we applied the merged value to it.
-        if (!(merger instanceof OverwriteMerger)) {
-          Object prevValue = prevRecord.getValue(column);
-          if (prevValue != null) {
-            if (newRecord.isNullValue(column)) {
-              // Note that we intentionally want to overwrite any previous _comparisonColumn value in the case of
-              // using
-              // multiple comparison columns. We never apply a merge function to it, rather we just take any/all
-              // non-null comparison column values from the previous record, and the sole non-null comparison column
-              // value from the new record.
-              newRecord.putValue(column, prevValue);
-              newRecord.removeNullValueField(column);
-            } else if (!_comparisonColumns.contains(column)) {
-              newRecord.putValue(column, merger.merge(prevValue, newRecord.getValue(column)));
-            }
-          }
-        } else {
-          // Overwrite mergers.
-          // (1) If the merge strategy is Overwrite merger and newValue is not null, skip and use the new value
-          // (2) Otherwise, if previous is not null, init columnReader and use the previous value.
-          if (newRecord.isNullValue(column)) {
-            Object prevValue = prevRecord.getValue(column);
-            if (prevValue != null) {
-              newRecord.putValue(column, prevValue);
-              newRecord.removeNullValueField(column);
-            }
-          }
-        }
+    // Comparison columns: carry over the previous value when the new row has null and the previous row does not
+    for (String column : _comparisonColumns) {
+      if (newRow.isNullValue(column) && !previousRow.isNullValue(column)) {
+        newRow.putValue(column, previousRow.getValue(column));
+        newRow.removeNullValueField(column);
       }
     }
   }
+  // NOTE(review): a null mergedValue for a column missing from _fieldSpecMap would NPE below - confirm unreachable
+  private void setMergedValue(GenericRow row, String column, @Nullable Object mergedValue) {
+    if (mergedValue != null) {
+      // Remove the null value field (if it was set) before storing the merged value
+      row.removeNullValueField(column);
+      row.putValue(column, mergedValue);
+    } else {
+      // Merger produced null for this column: put the default null value taken from the column's field spec
+      row.putDefaultNullValue(column, _fieldSpecMap.get(column).getDefaultNullValue());
+    }
+  }
 }
diff --git a/...l/src/main/java/org/apache/pinot/segment/local/upsert/merger/BasePartialUpsertMerger.java b/...l/src/main/java/org/apache/pinot/segment/local/upsert/merger/BasePartialUpsertMerger.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.upsert.merger;
+
+import java.util.List;
+import org.apache.pinot.spi.config.table.UpsertConfig;
+
+
+public abstract class BasePartialUpsertMerger implements PartialUpsertMerger {
+  protected final List<String> _primaryKeyColumns;
+  protected final List<String> _comparisonColumns;
+  protected final UpsertConfig _upsertConfig;
+  // Shared state for mergers: the constructor stores its arguments as-is (no copy, no validation).
+  protected BasePartialUpsertMerger(List<String> primaryKeyColumns, List<String> comparisonColumns,
+      UpsertConfig upsertConfig) {
+    _primaryKeyColumns = primaryKeyColumns;
+    _comparisonColumns = comparisonColumns;
+    _upsertConfig = upsertConfig;
+  }
+}
diff --git a/...c/main/java/org/apache/pinot/segment/local/upsert/merger/PartialUpsertColumnarMerger.java b/...c/main/java/org/apache/pinot/segment/local/upsert/merger/PartialUpsertColumnarMerger.java
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.upsert.merger;
+
+import com.google.common.base.Preconditions;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.apache.pinot.segment.local.segment.readers.LazyRow;
+import org.apache.pinot.segment.local.upsert.merger.columnar.OverwriteMerger;
+import org.apache.pinot.segment.local.upsert.merger.columnar.PartialUpsertColumnMerger;
+import org.apache.pinot.segment.local.upsert.merger.columnar.PartialUpsertColumnMergerFactory;
+import org.apache.pinot.spi.config.table.UpsertConfig;
+import org.apache.pinot.spi.data.readers.GenericRow;
+
+
+/**
+ * Default Partial upsert merger implementation.
+ * PartialUpsertColumnarMerger iterates over each column and merges them based on the defined strategy per column in
+ * table config.
+ */
+public class PartialUpsertColumnarMerger extends BasePartialUpsertMerger {
+  private final PartialUpsertColumnMerger _defaultColumnValueMerger;
+  private final Map<String, PartialUpsertColumnMerger> _column2Mergers = new HashMap<>();
+
+  public PartialUpsertColumnarMerger(List<String> primaryKeyColumns, List<String> comparisonColumns,
+      UpsertConfig upsertConfig) {
+    super(primaryKeyColumns, comparisonColumns, upsertConfig);
+    _defaultColumnValueMerger =
+        PartialUpsertColumnMergerFactory.getMerger(upsertConfig.getDefaultPartialUpsertStrategy());
+    Map<String, UpsertConfig.Strategy> partialUpsertStrategies = upsertConfig.getPartialUpsertStrategies();
+    Preconditions.checkArgument(partialUpsertStrategies != null, "Partial upsert strategies must be configured");
+    for (Map.Entry<String, UpsertConfig.Strategy> entry : partialUpsertStrategies.entrySet()) {
+      _column2Mergers.put(entry.getKey(), PartialUpsertColumnMergerFactory.getMerger(entry.getValue()));
+    }
+  }
+
+  /**
+   * Merges the previous row with the new row column by column and puts the outcome into {@code resultHolder};
+   * nothing is returned. Primary key and comparison columns are skipped.
+   * For every other column of the previous row, with the merger configured for it or else the default merger:
+   * (1) If the previous value is null, nothing is put into the holder.
+   * (2) If the previous value is not null and the new value is null, the previous value is put into the holder.
+   * (3) If both values are not null, the merged value is put into the holder, except for the overwrite merger
+   *     which puts nothing.
+   *
+   * This method does nothing further for columns that it does not put into the holder.
+   */
+  @Override
+  public void merge(LazyRow previousRow, GenericRow newRow, Map<String, Object> resultHolder) {
+    for (String column : previousRow.getColumnNames()) {
+      // Skip primary key and comparison columns
+      if (_primaryKeyColumns.contains(column) || _comparisonColumns.contains(column)) {
+        continue;
+      }
+      PartialUpsertColumnMerger merger = _column2Mergers.getOrDefault(column, _defaultColumnValueMerger);
+      // Non-overwrite mergers
+      // (1) If the previous value is null, skip merging so that the new value is used
+      // (2) Else if the new value is null, use the previous value
+      // (3) Else put the merged value (comparison columns never reach here, they are skipped above)
+      if (!(merger instanceof OverwriteMerger)) {
+        Object prevValue = previousRow.getValue(column);
+        if (prevValue != null) {
+          if (newRow.isNullValue(column)) {
+            resultHolder.put(column, prevValue);
+          } else {
+            resultHolder.put(column, merger.merge(prevValue, newRow.getValue(column)));
+          }
+        }
+      } else {
+        // Overwrite mergers
+        // (1) If the new value is not null, put nothing so that the new value is used
+        // (2) Otherwise, use previous value if it is not null
+        if (newRow.isNullValue(column)) {
+          Object prevValue = previousRow.getValue(column);
+          if (prevValue != null) {
+            resultHolder.put(column, prevValue);
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/...local/src/main/java/org/apache/pinot/segment/local/upsert/merger/PartialUpsertMerger.java b/...local/src/main/java/org/apache/pinot/segment/local/upsert/merger/PartialUpsertMerger.java
@@ -18,13 +18,20 @@
  */
 package org.apache.pinot.segment.local.upsert.merger;
 
+import java.util.Map;
+import org.apache.pinot.segment.local.segment.readers.LazyRow;
+import org.apache.pinot.spi.data.readers.GenericRow;
+
+
+/**
+ * Merges the previously persisted row with the new incoming row.
+ * A custom implementation can be plugged in by implementing this interface and adding its class name to the upsert config.
+ */
 public interface PartialUpsertMerger {
+
   /**
-   * Handle partial upsert merge.
-   *
-   * @param previousValue the value of given field from the last derived full record during ingestion.
-   * @param currentValue the value of given field from the new consumed record.
-   * @return a new value after merge
+   * Merges the previous row with the new incoming row and puts the merged value of each column into resultHolder.
+   * Implementations should not merge primary key and comparison columns because their values must not be modified.
    */
-  Object merge(Object previousValue, Object currentValue);
+  void merge(LazyRow previousRow, GenericRow newRow, Map<String, Object> resultHolder);
 }