[server] Add threadsafe mode to venice-server which adjusts message processing order #910

Open · wants to merge 12 commits into base: main
@@ -73,6 +73,7 @@
import static com.linkedin.venice.ConfigKeys.SERVER_INGESTION_ISOLATION_SERVICE_PORT;
import static com.linkedin.venice.ConfigKeys.SERVER_INGESTION_MODE;
import static com.linkedin.venice.ConfigKeys.SERVER_INGESTION_TASK_MAX_IDLE_COUNT;
import static com.linkedin.venice.ConfigKeys.SERVER_INGESTION_TASK_THREAD_SAFE_MODE;
import static com.linkedin.venice.ConfigKeys.SERVER_KAFKA_CONSUMER_OFFSET_COLLECTION_ENABLED;
import static com.linkedin.venice.ConfigKeys.SERVER_KAFKA_MAX_POLL_RECORDS;
import static com.linkedin.venice.ConfigKeys.SERVER_LEADER_COMPLETE_STATE_CHECK_IN_FOLLOWER_ENABLED;
@@ -445,6 +446,8 @@ public class VeniceServerConfig extends VeniceClusterConfig {

private final int ingestionTaskMaxIdleCount;

private final boolean threadSafeMode;
Contributor:
We should probably find another name for this functionality, since technically the old code is also supposed to be threadsafe... it's just that the new code is intended to make it easier to maintain thread-safety (less likely to introduce concurrency bugs)...

I guess the most significant functional change with this mode is that the leader persists changes locally prior to writing to Kafka, and as a result the TransientRecordCache becomes unnecessary. Perhaps a name along those lines might be more clear?

How about: leaderPersistsLocallyBeforeProducingToVT / leader.persists.locally.before.producing.to.vt

It's a bit of a mouthful, but seems more clear... a more concise version might be "Leader Persists Before Producing", and in day-to-day operations we might end up calling it "LPBP", for short. IDK, I'm just riffing at this point 😂

Open to other suggestions too, of course.
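For illustration, a minimal sketch of how the suggested rename might look, assuming it follows the existing ConfigKeys conventions (the constant name and property string below come from the suggestion above, not from the PR):

    // Hypothetical rename sketch; this PR currently defines SERVER_INGESTION_TASK_THREAD_SAFE_MODE.
    public static final String SERVER_LEADER_PERSISTS_LOCALLY_BEFORE_PRODUCING_TO_VT =
        "leader.persists.locally.before.producing.to.vt";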


private final long metaStoreWriterCloseTimeoutInMS;
private final int metaStoreWriterCloseConcurrency;

@@ -727,6 +730,7 @@ public VeniceServerConfig(VeniceProperties serverProperties, Map<String, Map<Str
pubSubClientsFactory = new PubSubClientsFactory(serverProperties);
routerPrincipalName = serverProperties.getString(ROUTER_PRINCIPAL_NAME, "CN=venice-router");
ingestionTaskMaxIdleCount = serverProperties.getInt(SERVER_INGESTION_TASK_MAX_IDLE_COUNT, 10000);
threadSafeMode = serverProperties.getBoolean(SERVER_INGESTION_TASK_THREAD_SAFE_MODE, false);
metaStoreWriterCloseTimeoutInMS = serverProperties.getLong(META_STORE_WRITER_CLOSE_TIMEOUT_MS, 300000L);
metaStoreWriterCloseConcurrency = serverProperties.getInt(META_STORE_WRITER_CLOSE_CONCURRENCY, -1);
ingestionHeartbeatIntervalMs =
@@ -1280,6 +1284,10 @@ public int getIngestionTaskMaxIdleCount() {
return ingestionTaskMaxIdleCount;
}

public boolean isThreadSafeMode() {
return threadSafeMode;
}

public boolean isKMERegistrationFromMessageHeaderEnabled() {
return isKMERegistrationFromMessageHeaderEnabled;
}
@@ -21,15 +21,17 @@ public ActiveActiveProducerCallback(
LeaderProducedRecordContext leaderProducedRecordContext,
int subPartition,
String kafkaUrl,
long beforeProcessingRecordTimestamp) {
long beforeProcessingRecordTimestamp,
boolean syncOffsetsOnlyAfterProducing) {
super(
ingestionTask,
sourceConsumerRecord,
partitionConsumptionState,
leaderProducedRecordContext,
subPartition,
kafkaUrl,
beforeProcessingRecordTimestamp);
beforeProcessingRecordTimestamp,
syncOffsetsOnlyAfterProducing);
}

@Override
@@ -178,7 +178,7 @@ protected DelegateConsumerRecordResult delegateConsumerRecord(
beforeProcessingBatchRecordsTimestampMs);
} else {
/**
* The below flow must be executed in a critical session for the same key:
* The below flow must be executed in a critical section for the same key:
Contributor:
Thanks 😄 ...

* Read existing value/RMD from transient record cache/disk -> perform DCR and decide incoming value wins
* -> update transient record cache -> produce to VT (just call send, no need to wait for the produce future in the critical section)
*
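A minimal sketch of the per-key critical section described in that comment, using hypothetical names (keyLevelLocks, readFromCacheOrDisk, doDcr, and veniceWriter are illustrative stand-ins, not the actual Venice APIs):

    // Sketch only: leader-side Active-Active update flow under a per-key lock.
    ReentrantLock lock = keyLevelLocks.get(key);        // hypothetical lock striped by key
    lock.lock();
    try {
      ValueAndRmd existing = readFromCacheOrDisk(key);  // transient record cache first, then disk
      MergeResult merge = doDcr(existing, incoming);    // deterministic conflict resolution
      if (merge.incomingValueWins()) {
        transientRecordCache.put(key, merge.result());  // cache update inside the critical section
        veniceWriter.send(key, merge.result());         // async send; no need to await the future here
      }
    } finally {
      lock.unlock();
    }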
@@ -499,7 +499,6 @@ protected void processMessageAndMaybeProduceToKafka(
.updateLatestIgnoredUpstreamRTOffset(kafkaClusterIdToUrlMap.get(kafkaClusterId), sourceOffset);
} else {
validatePostOperationResultsAndRecord(mergeConflictResult, offsetSumPreOperation, recordTimestampsPreOperation);

// Apply this update to any views for this store
// TODO: It'd be good to be able to do this in LeaderFollowerStoreIngestionTask instead, however, AA currently is
// the
@@ -1572,14 +1571,16 @@ protected LeaderProducerCallback createProducerCallback(
LeaderProducedRecordContext leaderProducedRecordContext,
int subPartition,
String kafkaUrl,
long beforeProcessingRecordTimestampNs) {
long beforeProcessingRecordTimestampNs,
boolean syncOffsetsOnlyAfterProducing) {
return new ActiveActiveProducerCallback(
this,
consumerRecord,
partitionConsumptionState,
leaderProducedRecordContext,
subPartition,
kafkaUrl,
beforeProcessingRecordTimestampNs);
beforeProcessingRecordTimestampNs,
syncOffsetsOnlyAfterProducing);
}
}
@@ -1602,13 +1602,27 @@ protected void produceToLocalKafka(
String kafkaUrl,
int kafkaClusterId,
long beforeProcessingRecordTimestampNs) {

if (this.runInThreadSafeMode) {
// Write to rocksdb. At the time of writing, this is the last step after a huge amount of processing and compression
// and whatnot. At this stage we do not sync the offset, instead doing that after a successful produce.
this.processConsumerRecord(
consumerRecord,
leaderProducedRecordContext,
subPartition,
kafkaUrl,
beforeProcessingRecordTimestampNs,
false);
}

LeaderProducerCallback callback = createProducerCallback(
consumerRecord,
partitionConsumptionState,
leaderProducedRecordContext,
subPartition,
kafkaUrl,
beforeProcessingRecordTimestampNs);
beforeProcessingRecordTimestampNs,
this.runInThreadSafeMode);
long sourceTopicOffset = consumerRecord.getOffset();
LeaderMetadataWrapper leaderMetadataWrapper = new LeaderMetadataWrapper(sourceTopicOffset, kafkaClusterId);
partitionConsumptionState.setLastLeaderPersistFuture(leaderProducedRecordContext.getPersistedToDBFuture());
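Put differently, the ordering change in threadsafe mode might be summarized as follows (a simplified sketch; persistToRocksDB, produceToVersionTopic, and queueToDrainer are illustrative stand-ins for the calls above, not the actual method names):

    // Sketch of the two message-processing orders (not the actual method bodies).
    if (runInThreadSafeMode) {
      persistToRocksDB(record);                  // 1. persist locally first, without syncing offsets
      produceToVersionTopic(record, (result, ex) -> {
        if (ex == null) {
          maybeSyncOffsets(record);              // 2. sync offsets only after a successful produce
        }
      });
    } else {
      produceToVersionTopic(record, (result, ex) -> {
        if (ex == null) {
          queueToDrainer(record);                // persist + offset sync happen later, in the drainer
        }
      });
    }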
@@ -2093,7 +2107,8 @@ private void propagateHeartbeatFromUpstreamTopicToLocalVersionTopic(
leaderProducedRecordContext,
partition,
kafkaUrl,
beforeProcessingRecordTimestampNs);
beforeProcessingRecordTimestampNs,
this.runInThreadSafeMode);
LeaderMetadataWrapper leaderMetadataWrapper = new LeaderMetadataWrapper(consumerRecord.getOffset(), kafkaClusterId);
List<Integer> subPartitions =
PartitionUtils.getSubPartitions(partitionConsumptionState.getUserPartition(), amplificationFactor);
@@ -2160,7 +2175,7 @@ protected void recordHeartbeatReceived(
* This function should be called as one of the first steps in the processing pipeline for all messages consumed from any kafka topic.
*
* The caller of this function should only process this {@param consumerRecord} further if the return is
* {@link DelegateConsumerRecordResult#QUEUED_TO_DRAINER}.
* {@link DelegateConsumerRecordResult#QUEUE_TO_DRAINER}.
*
* This function assumes {@link #shouldProcessRecord(PubSubMessage, int)} has been called which happens in
* {@link StoreIngestionTask#produceToStoreBufferServiceOrKafka(Iterable, PubSubTopicPartition, String, int)}
@@ -2182,7 +2197,6 @@ protected DelegateConsumerRecordResult delegateConsumerRecord(
int kafkaClusterId,
long beforeProcessingPerRecordTimestampNs,
long beforeProcessingBatchRecordsTimestampMs) {
boolean produceToLocalKafka = false;
try {
KafkaKey kafkaKey = consumerRecord.getKey();
KafkaMessageEnvelope kafkaValue = consumerRecord.getValue();
@@ -2198,9 +2212,9 @@ protected DelegateConsumerRecordResult delegateConsumerRecord(
PartitionConsumptionState partitionConsumptionState = partitionConsumptionStateMap.get(subPartition);
if (partitionConsumptionState == null) {
// The partition is likely unsubscribed, will skip these messages.
return DelegateConsumerRecordResult.SKIPPED_MESSAGE;
return DelegateConsumerRecordResult.END_PROCESSING;
}
produceToLocalKafka = shouldProduceToVersionTopic(partitionConsumptionState);
boolean produceToLocalKafka = shouldProduceToVersionTopic(partitionConsumptionState);
// UPDATE message is only expected in LEADER which must be produced to kafka.
MessageType msgType = MessageType.valueOf(kafkaValue);
if (msgType == MessageType.UPDATE && !produceToLocalKafka) {
@@ -2222,7 +2236,10 @@ protected DelegateConsumerRecordResult delegateConsumerRecord(
* (i) it's a follower or (ii) leader is consuming from VT
*/
if (!produceToLocalKafka) {
return DelegateConsumerRecordResult.QUEUED_TO_DRAINER;
// TODO: The next step will put the record in the drainer queue. When threadsafe mode is enabled, we skip
// the drainer during RT consumption and commit straight to RocksDB. To remove the drainer completely,
// we should do the same here.
return DelegateConsumerRecordResult.QUEUE_TO_DRAINER;
}

// If we are here the message must be produced to local kafka or silently consumed.
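As an aside, the return-value changes scattered through this diff suggest the DelegateConsumerRecordResult enum is being consolidated; a rough before/after sketch, inferred from the renames in this file (the exact enum definition and javadoc are not shown in the diff):

    // Inferred from the diff, not shown verbatim in the PR:
    // before: QUEUED_TO_DRAINER, PRODUCED_TO_KAFKA, SKIPPED_MESSAGE, DUPLICATE_MESSAGE
    // after:  QUEUE_TO_DRAINER, END_PROCESSING
    enum DelegateConsumerRecordResult {
      QUEUE_TO_DRAINER,  // hand the record to the drainer for persistence and offset sync
      END_PROCESSING     // terminal: produced, skipped, or duplicate; nothing further to do
    }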
@@ -2271,7 +2288,7 @@ protected DelegateConsumerRecordResult delegateConsumerRecord(
*/
divErrorMetricCallback.accept(e);
LOGGER.debug("{} : Skipping a duplicate record at offset: {}", ingestionTaskName, consumerRecord.getOffset());
return DelegateConsumerRecordResult.DUPLICATE_MESSAGE;
return DelegateConsumerRecordResult.END_PROCESSING;
}

if (kafkaKey.isControlMessage()) {
@@ -2427,7 +2444,7 @@ protected DelegateConsumerRecordResult delegateConsumerRecord(
if (isDataRecovery && !partitionConsumptionState.isBatchOnly()) {
// Ignore remote VT's TS message since we might need to consume more RT or incremental push data from VT
// that's no longer in the local/remote RT due to retention.
return DelegateConsumerRecordResult.SKIPPED_MESSAGE;
return DelegateConsumerRecordResult.END_PROCESSING;
}
leaderProducedRecordContext =
LeaderProducedRecordContext.newControlMessageRecord(kafkaKey.getKey(), controlMessage);
@@ -2448,7 +2465,7 @@ protected DelegateConsumerRecordResult delegateConsumerRecord(
beforeProcessingPerRecordTimestampNs);
break;
case VERSION_SWAP:
return DelegateConsumerRecordResult.QUEUED_TO_DRAINER;
return DelegateConsumerRecordResult.QUEUE_TO_DRAINER;
default:
// do nothing
break;
@@ -2478,7 +2495,7 @@ protected DelegateConsumerRecordResult delegateConsumerRecord(
beforeProcessingPerRecordTimestampNs,
beforeProcessingBatchRecordsTimestampMs);
}
return DelegateConsumerRecordResult.PRODUCED_TO_KAFKA;
return DelegateConsumerRecordResult.END_PROCESSING;
} catch (Exception e) {
throw new VeniceException(
ingestionTaskName + " hasProducedToKafka: exception for message received from: "
@@ -3418,15 +3435,17 @@ protected LeaderProducerCallback createProducerCallback(
LeaderProducedRecordContext leaderProducedRecordContext,
int subPartition,
String kafkaUrl,
long beforeProcessingRecordTimestampNs) {
long beforeProcessingRecordTimestampNs,
boolean syncOffsetsOnlyAfterProducing) {
return new LeaderProducerCallback(
this,
consumerRecord,
partitionConsumptionState,
leaderProducedRecordContext,
subPartition,
kafkaUrl,
beforeProcessingRecordTimestampNs);
beforeProcessingRecordTimestampNs,
syncOffsetsOnlyAfterProducing);
}

protected Lazy<VeniceWriter<byte[], byte[], byte[]>> getVeniceWriter() {
@@ -55,14 +55,17 @@ public class LeaderProducerCallback implements ChunkAwareCallback {
protected ChunkedValueManifest oldValueManifest = null;
protected ChunkedValueManifest oldRmdManifest = null;

private final boolean syncOffsetsOnlyAfterProducing;

public LeaderProducerCallback(
LeaderFollowerStoreIngestionTask ingestionTask,
PubSubMessage<KafkaKey, KafkaMessageEnvelope, Long> sourceConsumerRecord,
PartitionConsumptionState partitionConsumptionState,
LeaderProducedRecordContext leaderProducedRecordContext,
int subPartition,
String kafkaUrl,
long beforeProcessingRecordTimestampNs) {
long beforeProcessingRecordTimestampNs,
boolean syncOffsetsOnlyAfterProducing) {
this.ingestionTask = ingestionTask;
this.sourceConsumerRecord = sourceConsumerRecord;
this.partitionConsumptionState = partitionConsumptionState;
@@ -71,6 +74,7 @@ public LeaderProducerCallback(
this.leaderProducedRecordContext = leaderProducedRecordContext;
this.produceTimeNs = ingestionTask.isUserSystemStore() ? 0 : System.nanoTime();
this.beforeProcessingRecordTimestampNs = beforeProcessingRecordTimestampNs;
this.syncOffsetsOnlyAfterProducing = syncOffsetsOnlyAfterProducing;
}

@Override
@@ -156,7 +160,7 @@ public void onCompletion(PubSubProduceResult produceResult, Exception e) {
*/
if (chunkedValueManifest == null) {
leaderProducedRecordContext.setProducedOffset(produceResult.getOffset());
ingestionTask.produceToStoreBufferService(
produceToStoreBufferService(
sourceConsumerRecord,
leaderProducedRecordContext,
subPartition,
@@ -194,7 +198,7 @@ public void onCompletion(PubSubProduceResult produceResult, Exception e) {
manifestPut,
leaderProducedRecordContext.getPersistedToDBFuture());
producedRecordForManifest.setProducedOffset(produceResult.getOffset());
ingestionTask.produceToStoreBufferService(
produceToStoreBufferService(
sourceConsumerRecord,
producedRecordForManifest,
subPartition,
@@ -321,7 +325,7 @@ private long produceChunksToStoreBufferService(
LeaderProducedRecordContext producedRecordForChunk =
LeaderProducedRecordContext.newChunkPutRecord(ByteUtils.extractByteArray(chunkKey), chunkPut);
producedRecordForChunk.setProducedOffset(-1);
ingestionTask.produceToStoreBufferService(
produceToStoreBufferService(
sourceConsumerRecord,
producedRecordForChunk,
subPartition,
@@ -347,7 +351,7 @@ void produceDeprecatedChunkDeletionToStoreBufferService(ChunkedValueManifest man
LeaderProducedRecordContext producedRecordForChunk =
LeaderProducedRecordContext.newChunkDeleteRecord(ByteUtils.extractByteArray(chunkKey), chunkDelete);
producedRecordForChunk.setProducedOffset(-1);
ingestionTask.produceToStoreBufferService(
produceToStoreBufferService(
sourceConsumerRecord,
producedRecordForChunk,
subPartition,
@@ -357,6 +361,28 @@ void produceDeprecatedChunkDeletionToStoreBufferService(ChunkedValueManifest man
}
}

protected void produceToStoreBufferService(
PubSubMessage<KafkaKey, KafkaMessageEnvelope, Long> consumedRecord,
LeaderProducedRecordContext leaderProducedRecordContext,
int subPartition,
String kafkaUrl,
long beforeProcessingRecordTimestampNs,
long currentTimeForMetricsMs) throws InterruptedException {
if (this.syncOffsetsOnlyAfterProducing) {
// sync offsets
ingestionTask
.maybeSyncOffsets(consumedRecord, leaderProducedRecordContext, partitionConsumptionState, subPartition);
Contributor: Does the PCS instance passed here have a chance of having been modified by another thread (the processing thread)? I imagine we don't clone the PCS in order to make them immutable, so I wonder if what we would be checkpointing here is guaranteed to represent the correct state, up until what was just produced, rather than up until what's been consumed by another thread?

Contributor (Author): That's a good catch, yeah it could happen.


} else {
ingestionTask.produceToStoreBufferService(
consumedRecord,
leaderProducedRecordContext,
subPartition,
kafkaUrl,
beforeProcessingRecordTimestampNs,
currentTimeForMetricsMs);
}
}
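
To make the concern in that review thread concrete, here is a hypothetical interleaving (the thread roles, method names, and offsets are invented for illustration; they are not from the PR):

    // Hypothetical race on a shared, mutable PartitionConsumptionState (PCS):
    // T2 (processing thread): pcs.setLatestProcessedOffset(105);           // 101-105 processed, not yet produced
    // T1 (producer callback): checkpoint(pcs.getLatestProcessedOffset());  // persists 105, though only 100 was produced
    // A crash before 101-105 finish producing could then skip those records on restart.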

// Visible for VeniceWriter unit test.
public PartitionConsumptionState getPartitionConsumptionState() {
return partitionConsumptionState;
@@ -223,7 +223,14 @@ public class PartitionConsumptionState {
*/
private boolean firstHeartBeatSOSReceived;

public PartitionConsumptionState(int partition, int amplificationFactor, OffsetRecord offsetRecord, boolean hybrid) {
private boolean threadSafeMode;

public PartitionConsumptionState(
int partition,
int amplificationFactor,
OffsetRecord offsetRecord,
boolean hybrid,
boolean threadSafeMode) {
this.partition = partition;
this.amplificationFactor = amplificationFactor;
this.userPartition = PartitionUtils.getUserPartition(partition, amplificationFactor);
@@ -237,6 +244,7 @@ public PartitionConsumptionState(int partition, int amplificationFactor, OffsetR
this.processedRecordSizeSinceLastSync = 0;
this.leaderFollowerState = LeaderFollowerStateType.STANDBY;
this.expectedSSTFileChecksum = null;
this.threadSafeMode = threadSafeMode;
/**
* Initialize the latest consumed time with current time; otherwise, it's 0 by default
* and leader will be promoted immediately.
@@ -565,6 +573,10 @@ public void setTransientRecord(
int valueLen,
int valueSchemaId,
GenericRecord replicationMetadataRecord) {
if (this.threadSafeMode) {
// NoOp
return;
}
TransientRecord transientRecord =
new TransientRecord(value, valueOffset, valueLen, valueSchemaId, kafkaClusterId, kafkaConsumedOffset);
if (replicationMetadataRecord != null) {
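
This no-op ties back to the earlier naming discussion: if the leader persists to RocksDB before producing, readers no longer need the transient record cache. A rough sketch of the corresponding read path under that assumption (getTransientRecord, fromTransientRecord, and the storage call are illustrative, not verified against the actual API):

    // Sketch: how a read might behave once setTransientRecord() is a no-op in threadsafe mode.
    PartitionConsumptionState.TransientRecord cached = pcs.getTransientRecord(key);
    ValueAndRmd existing = (cached != null)
        ? fromTransientRecord(cached)            // default mode: cache holds not-yet-flushed leader writes
        : storageEngine.get(partition, key);     // threadsafe mode: RocksDB was updated before producing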