From 14fde53d6c7a799777186ae8e6f6f00886f67d20 Mon Sep 17 00:00:00 2001 From: Leon Burdinov Date: Mon, 6 Feb 2023 12:10:03 +0200 Subject: [PATCH 01/52] Ds scalafmt (#32485) * format: data streams/ * format: iptf/ GitOrigin-RevId: a31eb0dc0eaff014511665e52497dea67a3f9ff1 --- .../dst/greyhound/core/AdminClientIT.scala | 6 +- .../greyhound/core/consumer/Consumer.scala | 7 +- .../greyhound/core/consumer/EventLoop.scala | 12 +- .../core/consumer/OffsetsInitializer.scala | 6 +- .../core/consumer/ReportingConsumer.scala | 24 ++-- .../retry/NonBlockingRetryHelper.scala | 66 ++++++----- .../retry/NonBlockingRetryRecordHandler.scala | 7 +- .../retry/RetryRecordHandlerMetric.scala | 3 +- .../consumer/OffsetsInitializerTest.scala | 104 +++++++++--------- 9 files changed, 126 insertions(+), 109 deletions(-) diff --git a/core/src/it/scala/com/wixpress/dst/greyhound/core/AdminClientIT.scala b/core/src/it/scala/com/wixpress/dst/greyhound/core/AdminClientIT.scala index 95d2353a..dec5377c 100644 --- a/core/src/it/scala/com/wixpress/dst/greyhound/core/AdminClientIT.scala +++ b/core/src/it/scala/com/wixpress/dst/greyhound/core/AdminClientIT.scala @@ -10,7 +10,7 @@ import com.wixpress.dst.greyhound.core.producer.ProducerRecord import com.wixpress.dst.greyhound.core.testkit.{BaseTestWithSharedEnv, TestMetrics} import com.wixpress.dst.greyhound.core.zioutils.CountDownLatch import com.wixpress.dst.greyhound.testenv.ITEnv -import com.wixpress.dst.greyhound.testenv.ITEnv.{Env, TestResources, testResources} +import com.wixpress.dst.greyhound.testenv.ITEnv.{testResources, Env, TestResources} import org.apache.kafka.common.config.TopicConfig.{DELETE_RETENTION_MS_CONFIG, MAX_MESSAGE_BYTES_CONFIG, RETENTION_MS_CONFIG} import org.apache.kafka.common.errors.InvalidTopicException import org.specs2.specification.core.Fragments @@ -83,7 +83,7 @@ class AdminClientIT extends BaseTestWithSharedEnv[Env, TestResources] { } } - //todo uncomment this after https://github.com/wix-private/core-server-build-tools/pull/13043 is merged + // todo uncomment this after https://github.com/wix-private/core-server-build-tools/pull/13043 is merged // "reflect errors" in { // val topic1 = aTopicConfig() // val topic2 = aTopicConfig("x" * 250) @@ -104,7 +104,7 @@ class AdminClientIT extends BaseTestWithSharedEnv[Env, TestResources] { // created === Map(badTopic.name -> None) // } // } - //todo uncomment this after https://github.com/wix-private/core-server-build-tools/pull/13043 is merged + // todo uncomment this after https://github.com/wix-private/core-server-build-tools/pull/13043 is merged // ================================================================================================================================= "ignore TopicExistsException by default" in { val topic = aTopicConfig() diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala index 3a53c551..7592362e 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala @@ -382,7 +382,10 @@ object UnsafeOffsetOperations { } override def offsetsForTimes(partitions: Set[TopicPartition], timeEpoch: Long, timeout: Duration): Map[TopicPartition, Option[Long]] = - consumer.offsetsForTimes(partitions.map(_.asKafka).map(tp => (tp, new lang.Long(timeEpoch))).toMap.asJava, timeout) - .asScala.toMap.map { case (tp, of) => TopicPartition(tp) -> (Option(of).map(_.offset())) } + 
consumer + .offsetsForTimes(partitions.map(_.asKafka).map(tp => (tp, new lang.Long(timeEpoch))).toMap.asJava, timeout) + .asScala + .toMap + .map { case (tp, of) => TopicPartition(tp) -> (Option(of).map(_.offset())) } } } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala index 6901b5c9..5985f80d 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala @@ -56,14 +56,14 @@ object EventLoop { partitionsAssigned <- Promise.make[Nothing, Unit] // TODO how to handle errors in subscribe? rebalanceListener = listener(pausedPartitionsRef, config, dispatcher, partitionsAssigned, group, consumer, clientId, offsets) - _ <- report(SubscribingToInitialSubAndRebalanceListener(clientId, group, consumerAttributes)) + _ <- report(SubscribingToInitialSubAndRebalanceListener(clientId, group, consumerAttributes)) _ <- subscribe(initialSubscription, rebalanceListener)(consumer) running <- Ref.make[EventLoopState](Running) - _ <- report(CreatingPollOnceFiber(clientId, group, consumerAttributes)) + _ <- report(CreatingPollOnceFiber(clientId, group, consumerAttributes)) fiber <- pollOnce(running, consumer, dispatcher, pausedPartitionsRef, positionsRef, offsets, config, clientId, group) .repeatWhile(_ == true) .forkDaemon - _ <- report(AwaitingPartitionsAssignment(clientId, group, consumerAttributes)) + _ <- report(AwaitingPartitionsAssignment(clientId, group, consumerAttributes)) _ <- partitionsAssigned.await env <- ZIO.environment[Env] } yield (dispatcher, fiber, offsets, positionsRef, running, rebalanceListener.provideEnvironment(env)) @@ -303,9 +303,11 @@ object EventLoopMetric { case class FailedToUpdatePositions(t: Throwable, clientId: ClientId, attributes: Map[String, String] = Map.empty) extends EventLoopMetric - case class CreatingDispatcher(clientId: ClientId, group: Group, attributes: Map[String, String], startPaused: Boolean) extends EventLoopMetric + case class CreatingDispatcher(clientId: ClientId, group: Group, attributes: Map[String, String], startPaused: Boolean) + extends EventLoopMetric - case class SubscribingToInitialSubAndRebalanceListener(clientId: ClientId, group: Group, attributes: Map[String, String]) extends EventLoopMetric + case class SubscribingToInitialSubAndRebalanceListener(clientId: ClientId, group: Group, attributes: Map[String, String]) + extends EventLoopMetric case class CreatingPollOnceFiber(clientId: ClientId, group: Group, attributes: Map[String, String]) extends EventLoopMetric diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializer.scala index 97f1b341..e684eef7 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializer.scala @@ -40,8 +40,10 @@ class OffsetsInitializer( offsetOperations.pause(toPause) val rewindUncommittedOffsets = if (offsetResetIsEarliest || notCommitted.isEmpty || rewindUncommittedOffsetsBy.isZero) Map.empty - else offsetOperations.offsetsForTimes(notCommitted, clock.millis() - rewindUncommittedOffsetsBy.toMillis, effectiveTimeout) - .map{case (tp, maybeRewindedOffset) => (tp, maybeRewindedOffset.orElse(endOffsets.get(tp)).getOrElse(0L))} + else + offsetOperations + 
.offsetsForTimes(notCommitted, clock.millis() - rewindUncommittedOffsetsBy.toMillis, effectiveTimeout) + .map { case (tp, maybeRewindedOffset) => (tp, maybeRewindedOffset.orElse(endOffsets.get(tp)).getOrElse(0L)) } val positions = notCommitted.map(tp => tp -> offsetOperations.position(tp, effectiveTimeout)).toMap ++ toOffsets ++ rewindUncommittedOffsets diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala index 96a929f8..d0f93a00 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala @@ -41,10 +41,14 @@ case class ReportingConsumer(clientId: ClientId, group: Group, internal: Consume implicit trace: Trace ): UIO[DelayedRebalanceEffect] = (report(PartitionsRevoked(clientId, group, partitions, config.consumerAttributes)) *> - rebalanceListener.onPartitionsRevoked(consumer, partitions) - .timed.tap { case (duration, _) => report(PartitionsRevokedComplete(clientId, group, partitions, config.consumerAttributes, duration.toMillis)) } - .map(_._2) - ).provideEnvironment(r) + rebalanceListener + .onPartitionsRevoked(consumer, partitions) + .timed + .tap { + case (duration, _) => + report(PartitionsRevokedComplete(clientId, group, partitions, config.consumerAttributes, duration.toMillis)) + } + .map(_._2)).provideEnvironment(r) override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])(implicit trace: Trace): UIO[Any] = (report(PartitionsAssigned(clientId, group, partitions, config.consumerAttributes)) *> @@ -241,11 +245,13 @@ object ConsumerMetric { attributes: Map[String, String] = Map.empty ) extends ConsumerMetric - case class PartitionsRevokedComplete(clientId: ClientId, - group: Group, - partitions: Set[TopicPartition], - attributes: Map[String, String] = Map.empty, - durationMs: Long) extends ConsumerMetric + case class PartitionsRevokedComplete( + clientId: ClientId, + group: Group, + partitions: Set[TopicPartition], + attributes: Map[String, String] = Map.empty, + durationMs: Long + ) extends ConsumerMetric case class SubscribeFailed(clientId: ClientId, group: Group, error: Throwable, attributes: Map[String, String] = Map.empty) extends ConsumerMetric diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryHelper.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryHelper.scala index c8dd3edd..a73b7dce 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryHelper.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryHelper.scala @@ -1,18 +1,18 @@ package com.wixpress.dst.greyhound.core.consumer.retry -import java.time.{Instant, Duration => JavaDuration} +import java.time.{Duration => JavaDuration, Instant} import java.util.concurrent.TimeUnit.MILLISECONDS import java.util.regex.Pattern import com.wixpress.dst.greyhound.core.Serdes.StringSerde import com.wixpress.dst.greyhound.core.consumer.domain.ConsumerSubscription.{TopicPattern, Topics} -import com.wixpress.dst.greyhound.core.consumer.retry.RetryAttempt.{RetryAttemptNumber, currentTime} +import com.wixpress.dst.greyhound.core.consumer.retry.RetryAttempt.{currentTime, RetryAttemptNumber} import com.wixpress.dst.greyhound.core.consumer.retry.RetryDecision.{NoMoreRetries, RetryWith} import 
com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, ConsumerSubscription} import com.wixpress.dst.greyhound.core.consumer.retry.RetryRecordHandlerMetric.WaitingForRetry import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics.report import com.wixpress.dst.greyhound.core.producer.ProducerRecord -import com.wixpress.dst.greyhound.core.{Group, Headers, Topic, durationDeserializer, instantDeserializer} +import com.wixpress.dst.greyhound.core.{durationDeserializer, instantDeserializer, Group, Headers, Topic} import zio.Clock import zio.Duration import zio.Schedule.spaced @@ -24,14 +24,14 @@ trait NonBlockingRetryHelper { def retryTopicsFor(originalTopic: Topic): Set[Topic] def retryAttempt(topic: Topic, headers: Headers, subscription: ConsumerSubscription)( - implicit trace: Trace + implicit trace: Trace ): UIO[Option[RetryAttempt]] def retryDecision[E]( - retryAttempt: Option[RetryAttempt], - record: ConsumerRecord[Chunk[Byte], Chunk[Byte]], - error: E, - subscription: ConsumerSubscription + retryAttempt: Option[RetryAttempt], + record: ConsumerRecord[Chunk[Byte], Chunk[Byte]], + error: E, + subscription: ConsumerSubscription )(implicit trace: Trace): URIO[Any, RetryDecision] def retrySteps = retryTopicsFor("").size @@ -47,46 +47,44 @@ object NonBlockingRetryHelper { .getOrElse(NonBlockingBackoffPolicy.empty) override def retryTopicsFor(topic: Topic): Set[Topic] = - policy(topic).intervals.indices.foldLeft(Set.empty[String])((acc, attempt) => - acc + s"$topic-$group-retry-$attempt" - ) + policy(topic).intervals.indices.foldLeft(Set.empty[String])((acc, attempt) => acc + s"$topic-$group-retry-$attempt") override def retryAttempt(topic: Topic, headers: Headers, subscription: ConsumerSubscription)( - implicit trace: Trace + implicit trace: Trace ): UIO[Option[RetryAttempt]] = { (for { submitted <- headers.get(RetryHeader.Submitted, instantDeserializer) backoff <- headers.get(RetryHeader.Backoff, durationDeserializer) originalTopic <- headers.get[String](RetryHeader.OriginalTopic, StringSerde) } yield for { - ta <- topicAttempt(subscription, topic, originalTopic) + ta <- topicAttempt(subscription, topic, originalTopic) TopicAttempt(originalTopic, attempt) = ta - s <- submitted - b <- backoff + s <- submitted + b <- backoff } yield RetryAttempt(originalTopic, attempt, s, b)) .catchAll(_ => ZIO.none) } private def topicAttempt( - subscription: ConsumerSubscription, - topic: Topic, - originalTopicHeader: Option[String] + subscription: ConsumerSubscription, + topic: Topic, + originalTopicHeader: Option[String] ) = subscription match { - case _: Topics => extractTopicAttempt(group, topic) + case _: Topics => extractTopicAttempt(group, topic) case _: TopicPattern => extractTopicAttemptFromPatternRetryTopic(group, topic, originalTopicHeader) } override def retryDecision[E]( - retryAttempt: Option[RetryAttempt], - record: ConsumerRecord[Chunk[Byte], Chunk[Byte]], - error: E, - subscription: ConsumerSubscription + retryAttempt: Option[RetryAttempt], + record: ConsumerRecord[Chunk[Byte], Chunk[Byte]], + error: E, + subscription: ConsumerSubscription )(implicit trace: Trace): URIO[Any, RetryDecision] = currentTime.map(now => { val nextRetryAttempt = retryAttempt.fold(0)(_.attempt + 1) val originalTopic = retryAttempt.fold(record.topic)(_.originalTopic) - val retryTopic = subscription match { + val retryTopic = subscription match { case _: TopicPattern => patternRetryTopic(group, nextRetryAttempt) case _: Topics => 
fixedRetryTopic(originalTopic, group, nextRetryAttempt) } @@ -123,19 +121,19 @@ object NonBlockingRetryHelper { inputTopic.split(s"-$group-retry-").toSeq match { case Seq(topic, attempt) if Try(attempt.toInt).isSuccess => Some(TopicAttempt(topic, attempt.toInt)) - case _ => None + case _ => None } private def extractTopicAttemptFromPatternRetryTopic[E]( - group: Group, - inputTopic: Topic, - originalTopicHeader: Option[String] + group: Group, + inputTopic: Topic, + originalTopicHeader: Option[String] ) = { originalTopicHeader.flatMap(originalTopic => { inputTopic.split(s"__gh_pattern-retry-$group-attempt-").toSeq match { case Seq(_, attempt) if Try(attempt.toInt).isSuccess => Some(TopicAttempt(originalTopic, attempt.toInt)) - case _ => None + case _ => None } }) } @@ -176,14 +174,14 @@ object RetryHeader { } case class RetryAttempt( - originalTopic: Topic, - attempt: RetryAttemptNumber, - submittedAt: Instant, - backoff: Duration + originalTopic: Topic, + attempt: RetryAttemptNumber, + submittedAt: Instant, + backoff: Duration ) { def sleep(implicit trace: Trace): URIO[GreyhoundMetrics, Unit] = - (RetryUtil.sleep(submittedAt, backoff) race reportWaitingInIntervals(every = 60.seconds)) + RetryUtil.sleep(submittedAt, backoff) race reportWaitingInIntervals(every = 60.seconds) private def reportWaitingInIntervals(every: Duration) = report(WaitingForRetry(originalTopic, attempt, submittedAt.toEpochMilli, backoff.toMillis)) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryRecordHandler.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryRecordHandler.scala index b6e410d3..a15f1e5b 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryRecordHandler.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryRecordHandler.scala @@ -49,14 +49,17 @@ private[retry] object NonBlockingRetryRecordHandler { } private def delayRetry(record: ConsumerRecord[_, _], awaitShutdown: TopicPartition => UIO[AwaitShutdown])( - retryAttempt: RetryAttempt) = + retryAttempt: RetryAttempt + ) = zio.Random.nextInt.flatMap(correlationId => report( WaitingBeforeRetry(record.topic, retryAttempt, record.partition, record.offset, correlationId) ) *> awaitShutdown(record.topicPartition) .flatMap(_.interruptOnShutdown(retryAttempt.sleep)) - .reporting(r => DoneWaitingBeforeRetry(record.topic, record.partition, record.offset, retryAttempt, r.duration, r.failed, correlationId)) + .reporting(r => + DoneWaitingBeforeRetry(record.topic, record.partition, record.offset, retryAttempt, r.duration, r.failed, correlationId) + ) ) override def isHandlingRetryTopicMessage(group: Group, record: ConsumerRecord[K, V]): Boolean = { diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryRecordHandlerMetric.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryRecordHandlerMetric.scala index 980a71f7..b1580b16 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryRecordHandlerMetric.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryRecordHandlerMetric.scala @@ -20,7 +20,8 @@ object RetryRecordHandlerMetric { case class NoRetryOnNonRetryableFailure(partition: TopicPartition, offset: Long, cause: Exception) extends RetryRecordHandlerMetric case object Silent extends RetryRecordHandlerMetric - case class WaitingBeforeRetry(retryTopic: Topic, retryAttempt: RetryAttempt, 
partition: Int, offset:Long, correlationId: Int) extends RetryRecordHandlerMetric + case class WaitingBeforeRetry(retryTopic: Topic, retryAttempt: RetryAttempt, partition: Int, offset: Long, correlationId: Int) + extends RetryRecordHandlerMetric case class DoneWaitingBeforeRetry( retryTopic: Topic, diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializerTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializerTest.scala index d8c0c28e..5e708452 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializerTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializerTest.scala @@ -16,7 +16,7 @@ class OffsetsInitializerTest extends SpecificationWithJUnit with Mockito { private val Seq(p1, p2, p3) = Seq("t1" -> 1, "t2" -> 2, "t3" -> 3).map(tp => TopicPartition(tp._1, tp._2)) private val partitions = Set(p1, p2, p3) private val p1Pos, p2Pos, p3Pos = randomInt.toLong - val epochTimeToRewind = 1000L + val epochTimeToRewind = 1000L "do nothing if no missing offsets" in new ctx { @@ -124,71 +124,73 @@ class OffsetsInitializerTest extends SpecificationWithJUnit with Mockito { reported must contain(CommittedMissingOffsetsFailed(clientId, group, partitions, Map.empty, elapsed = Duration.ZERO, e)) } - "rewind uncommitted offsets" in new ctx { - givenCommittedOffsets(partitions)(Map(p2 -> randomInt)) - givenPositions(p2 -> p2Pos, p3 -> p3Pos) - givenOffsetsForTimes(epochTimeToRewind, p1 -> 0L, p2 -> 1L) - - committer.initializeOffsets(partitions) + "rewind uncommitted offsets" in + new ctx { + givenCommittedOffsets(partitions)(Map(p2 -> randomInt)) + givenPositions(p2 -> p2Pos, p3 -> p3Pos) + givenOffsetsForTimes(epochTimeToRewind, p1 -> 0L, p2 -> 1L) - val missingOffsets = Map( - p1 -> p1Pos, - p3 -> p3Pos - ) + committer.initializeOffsets(partitions) - val rewindedOffsets = Map( - p1 -> 0L, - ) + val missingOffsets = Map( + p1 -> p1Pos, + p3 -> p3Pos + ) - there was - one(offsetOps).commit( - missingOffsets ++ rewindedOffsets, - timeout + val rewindedOffsets = Map( + p1 -> 0L ) - } - "rewind to endOffsets for uncommitted partitions when offsetsForTimes return null offsets " in new ctx { - givenCommittedOffsets(partitions)(Map(p2 -> randomInt, p3 -> randomInt)) - givenPositions(p3 -> p3Pos) - givenEndOffsets(partitions, timeout)(Map(p1 -> p1Pos)) - givenOffsetsForTimes(Set(p1))(p1 -> None /*kafka SDK returned null*/) + there was + one(offsetOps).commit( + missingOffsets ++ rewindedOffsets, + timeout + ) + } - committer.initializeOffsets(partitions) + "rewind to endOffsets for uncommitted partitions when offsetsForTimes return null offsets " in + new ctx { + givenCommittedOffsets(partitions)(Map(p2 -> randomInt, p3 -> randomInt)) + givenPositions(p3 -> p3Pos) + givenEndOffsets(partitions, timeout)(Map(p1 -> p1Pos)) + givenOffsetsForTimes(Set(p1))(p1 -> None /*kafka SDK returned null*/ ) - val committedOffsets = Map( - p1 -> p1Pos, - ) + committer.initializeOffsets(partitions) - there was - one(offsetOps).commit( - committedOffsets, - timeout + val committedOffsets = Map( + p1 -> p1Pos ) - } - "not rewind uncommitted offsets when offset reset is earliest" in new ctx(offsetReset = OffsetReset.Earliest) { - givenCommittedOffsets(partitions)(Map(p2 -> randomInt)) - givenPositions(p2 -> p2Pos, p3 -> p3Pos) - givenOffsetsForTimes(epochTimeToRewind, p1 -> 0L, p2 -> 1L) + there was + one(offsetOps).commit( + committedOffsets, + timeout + ) + } - 
committer.initializeOffsets(partitions) + "not rewind uncommitted offsets when offset reset is earliest" in + new ctx(offsetReset = OffsetReset.Earliest) { + givenCommittedOffsets(partitions)(Map(p2 -> randomInt)) + givenPositions(p2 -> p2Pos, p3 -> p3Pos) + givenOffsetsForTimes(epochTimeToRewind, p1 -> 0L, p2 -> 1L) - val missingOffsets = Map( - p1 -> p1Pos, - p3 -> p3Pos - ) + committer.initializeOffsets(partitions) - val rewindedOffsets = Map( - p1 -> 0L, - ) + val missingOffsets = Map( + p1 -> p1Pos, + p3 -> p3Pos + ) - there was - one(offsetOps).commit( - missingOffsets ++ rewindedOffsets, - timeout + val rewindedOffsets = Map( + p1 -> 0L ) - } + there was + one(offsetOps).commit( + missingOffsets ++ rewindedOffsets, + timeout + ) + } class ctx(val seekTo: Map[TopicPartition, SeekTo] = Map.empty, offsetReset: OffsetReset = OffsetReset.Latest) extends Scope { private val metricsLogRef = new AtomicReference(Seq.empty[GreyhoundMetric]) @@ -256,4 +258,4 @@ class OffsetsInitializerTest extends SpecificationWithJUnit with Mockito { private def randomStr = Random.alphanumeric.take(5).mkString private def randomInt = Random.nextInt(200) private def randomPartition = TopicPartition(randomStr, randomInt) -} \ No newline at end of file +} From 08a62da0213fdb7eedc2c597f666d10f2852c8e5 Mon Sep 17 00:00:00 2001 From: Vaidas Pilkauskas Date: Fri, 10 Mar 2023 21:27:29 +0200 Subject: [PATCH 02/52] Remove unused test deps (#33323) * Remove unused test deps * Remove unused imports * Add runtime deps * Initial cross-repo check * Updating cross-repo check * Updating cross-repo check * Updating cross-repo check --------- Co-authored-by: wixapiregistry <58037308+wixapiregistry@users.noreply.github.com> GitOrigin-RevId: bd83b6b6f4ca9706f6b9e0f247a8cac8bd3f1292 --- .../src/it/scala/com/wixpress/dst/greyhound/core/BUILD.bazel | 5 ----- .../scala/com/wixpress/dst/greyhound/core/offset/BUILD.bazel | 3 --- .../com/wixpress/dst/greyhound/core/rabalance/BUILD.bazel | 3 --- .../scala/com/wixpress/dst/greyhound/core/retry/BUILD.bazel | 4 ---- .../it/scala/com/wixpress/dst/greyhound/testenv/BUILD.bazel | 4 ---- .../it/scala/com/wixpress/dst/greyhound/testkit/BUILD.bazel | 1 - .../com/wixpress/dst/greyhound/core/consumer/BUILD.bazel | 3 --- .../wixpress/dst/greyhound/core/consumer/retry/BUILD.bazel | 1 - .../com/wixpress/dst/greyhound/core/producer/BUILD.bazel | 1 - .../com/wixpress/dst/greyhound/core/testkit/BUILD.bazel | 1 - .../it/scala/com/wixpress/dst/greyhound/future/BUILD.bazel | 3 --- 11 files changed, 29 deletions(-) diff --git a/core/src/it/scala/com/wixpress/dst/greyhound/core/BUILD.bazel b/core/src/it/scala/com/wixpress/dst/greyhound/core/BUILD.bazel index 28d79c08..9735d210 100644 --- a/core/src/it/scala/com/wixpress/dst/greyhound/core/BUILD.bazel +++ b/core/src/it/scala/com/wixpress/dst/greyhound/core/BUILD.bazel @@ -24,13 +24,8 @@ specs2_ite2e_test( "//core/src/main/scala/com/wixpress/dst/greyhound/core/producer", "//core/src/main/scala/com/wixpress/dst/greyhound/core/zioutils", "//core/src/test/resources", - #"//core/src/test/scala/com/wixpress/dst/greyhound/core/consumer", - #"//core/src/test/scala/com/wixpress/dst/greyhound/core/testkit", - "@ch_qos_logback_logback_classic", # "@dev_zio_izumi_reflect_2_12", "@dev_zio_zio_2_12", - "@dev_zio_zio_test_2_12", - "@org_apache_kafka_kafka_2_12", "@org_apache_kafka_kafka_clients", "//core/src/test/scala/com/wixpress/dst/greyhound/core/testkit", ], diff --git a/core/src/it/scala/com/wixpress/dst/greyhound/core/offset/BUILD.bazel 
b/core/src/it/scala/com/wixpress/dst/greyhound/core/offset/BUILD.bazel index ce9f7faf..a5c08ae5 100644 --- a/core/src/it/scala/com/wixpress/dst/greyhound/core/offset/BUILD.bazel +++ b/core/src/it/scala/com/wixpress/dst/greyhound/core/offset/BUILD.bazel @@ -13,7 +13,6 @@ specs2_ite2e_test( "@dev_zio_izumi_reflect_2_12", "@dev_zio_zio_managed_2_12", "//core/src/it/resources", - "//core/src/it/scala/com/wixpress/dst/greyhound/core", "//core/src/it/scala/com/wixpress/dst/greyhound/testenv", "//core/src/it/scala/com/wixpress/dst/greyhound/testkit", "//core/src/main/scala/com/wixpress/dst/greyhound/core", @@ -23,9 +22,7 @@ specs2_ite2e_test( "//core/src/main/scala/com/wixpress/dst/greyhound/core/metrics", "//core/src/main/scala/com/wixpress/dst/greyhound/core/producer", "//core/src/main/scala/com/wixpress/dst/greyhound/core/zioutils", - "//core/src/test/scala/com/wixpress/dst/greyhound/core/consumer", "//core/src/test/scala/com/wixpress/dst/greyhound/core/testkit", - "@ch_qos_logback_logback_classic", # "@dev_zio_izumi_reflect_2_12", "@dev_zio_zio_2_12", "@org_apache_kafka_kafka_clients", diff --git a/core/src/it/scala/com/wixpress/dst/greyhound/core/rabalance/BUILD.bazel b/core/src/it/scala/com/wixpress/dst/greyhound/core/rabalance/BUILD.bazel index 3da73eba..c763308a 100644 --- a/core/src/it/scala/com/wixpress/dst/greyhound/core/rabalance/BUILD.bazel +++ b/core/src/it/scala/com/wixpress/dst/greyhound/core/rabalance/BUILD.bazel @@ -13,7 +13,6 @@ specs2_ite2e_test( "@dev_zio_izumi_reflect_2_12", "@dev_zio_zio_managed_2_12", "//core/src/it/resources", - "//core/src/it/scala/com/wixpress/dst/greyhound/core", "//core/src/it/scala/com/wixpress/dst/greyhound/testenv", "//core/src/it/scala/com/wixpress/dst/greyhound/testkit", "//core/src/main/scala/com/wixpress/dst/greyhound/core", @@ -23,9 +22,7 @@ specs2_ite2e_test( "//core/src/main/scala/com/wixpress/dst/greyhound/core/metrics", "//core/src/main/scala/com/wixpress/dst/greyhound/core/producer", "//core/src/main/scala/com/wixpress/dst/greyhound/core/zioutils", - "//core/src/test/scala/com/wixpress/dst/greyhound/core/consumer", "//core/src/test/scala/com/wixpress/dst/greyhound/core/testkit", - "@ch_qos_logback_logback_classic", # "@dev_zio_izumi_reflect_2_12", "@dev_zio_zio_2_12", "@org_apache_kafka_kafka_clients", diff --git a/core/src/it/scala/com/wixpress/dst/greyhound/core/retry/BUILD.bazel b/core/src/it/scala/com/wixpress/dst/greyhound/core/retry/BUILD.bazel index 3f83ff43..cb001169 100644 --- a/core/src/it/scala/com/wixpress/dst/greyhound/core/retry/BUILD.bazel +++ b/core/src/it/scala/com/wixpress/dst/greyhound/core/retry/BUILD.bazel @@ -12,7 +12,6 @@ specs2_ite2e_test( deps = [ "@dev_zio_izumi_reflect_2_12", "@dev_zio_zio_managed_2_12", - "//core/src/it/scala/com/wixpress/dst/greyhound/core", "//core/src/it/scala/com/wixpress/dst/greyhound/testenv", "//core/src/it/scala/com/wixpress/dst/greyhound/testkit", "//core/src/main/scala/com/wixpress/dst/greyhound/core", @@ -21,11 +20,8 @@ specs2_ite2e_test( "//core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry", "//core/src/main/scala/com/wixpress/dst/greyhound/core/metrics", "//core/src/main/scala/com/wixpress/dst/greyhound/core/producer", - "//core/src/main/scala/com/wixpress/dst/greyhound/core/zioutils", "//core/src/test/resources", - "//core/src/test/scala/com/wixpress/dst/greyhound/core/consumer", "//core/src/test/scala/com/wixpress/dst/greyhound/core/testkit", - "@ch_qos_logback_logback_classic", # "@dev_zio_izumi_reflect_2_12", "@dev_zio_zio_2_12", 
"@org_apache_kafka_kafka_clients", diff --git a/core/src/it/scala/com/wixpress/dst/greyhound/testenv/BUILD.bazel b/core/src/it/scala/com/wixpress/dst/greyhound/testenv/BUILD.bazel index 35ab06f9..6b5f15d9 100644 --- a/core/src/it/scala/com/wixpress/dst/greyhound/testenv/BUILD.bazel +++ b/core/src/it/scala/com/wixpress/dst/greyhound/testenv/BUILD.bazel @@ -13,15 +13,11 @@ scala_library( "@dev_zio_zio_managed_2_12", "//core/src/it/scala/com/wixpress/dst/greyhound/testkit", "//core/src/main/scala/com/wixpress/dst/greyhound/core", - "//core/src/main/scala/com/wixpress/dst/greyhound/core/admin", "//core/src/main/scala/com/wixpress/dst/greyhound/core/metrics", "//core/src/main/scala/com/wixpress/dst/greyhound/core/producer", "//core/src/test/scala/com/wixpress/dst/greyhound/core/testkit", # "@dev_zio_izumi_reflect_2_12", "@dev_zio_zio_2_12", "@dev_zio_zio_test_2_12", - "@org_apache_curator_curator_test", - "@org_apache_kafka_kafka_2_12", - "@org_apache_kafka_kafka_clients", ], ) diff --git a/core/src/it/scala/com/wixpress/dst/greyhound/testkit/BUILD.bazel b/core/src/it/scala/com/wixpress/dst/greyhound/testkit/BUILD.bazel index fb990677..f3345b95 100644 --- a/core/src/it/scala/com/wixpress/dst/greyhound/testkit/BUILD.bazel +++ b/core/src/it/scala/com/wixpress/dst/greyhound/testkit/BUILD.bazel @@ -16,7 +16,6 @@ scala_library( "//core/src/main/scala/com/wixpress/dst/greyhound/core/metrics", # "//core/src/main/scala/com/wixpress/dst/greyhound/core/producer", "@dev_zio_zio_2_12", - "@dev_zio_zio_test_2_12", "@org_apache_curator_curator_test", "@org_apache_kafka_kafka_2_12", "@org_apache_kafka_kafka_clients", diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/BUILD.bazel b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/BUILD.bazel index 90ed4056..3e104e18 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/BUILD.bazel +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/BUILD.bazel @@ -11,17 +11,14 @@ specs2_unit_test( deps = [ "@dev_zio_zio_managed_2_12", "@dev_zio_zio_stacktracer_2_12", - "//core/src/it/scala/com/wixpress/dst/greyhound/testkit", "//core/src/main/scala/com/wixpress/dst/greyhound/core", "//core/src/main/scala/com/wixpress/dst/greyhound/core/consumer", "//core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/domain", "//core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry", "//core/src/main/scala/com/wixpress/dst/greyhound/core/metrics", - "//core/src/main/scala/com/wixpress/dst/greyhound/core/producer", "//core/src/main/scala/com/wixpress/dst/greyhound/core/zioutils", "//core/src/test/resources", "//core/src/test/scala/com/wixpress/dst/greyhound/core/testkit", - "@ch_qos_logback_logback_classic", # "@dev_zio_izumi_reflect_2_12", "@dev_zio_zio_2_12", "@dev_zio_zio_streams_2_12", diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/BUILD.bazel b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/BUILD.bazel index b02990a0..dea7a061 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/BUILD.bazel +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/BUILD.bazel @@ -17,7 +17,6 @@ specs2_unit_test( "//core/src/main/scala/com/wixpress/dst/greyhound/core/metrics", "//core/src/main/scala/com/wixpress/dst/greyhound/core/producer", "//core/src/main/scala/com/wixpress/dst/greyhound/core/zioutils", - "//core/src/test/scala/com/wixpress/dst/greyhound/core/consumer", 
"//core/src/test/scala/com/wixpress/dst/greyhound/core/testkit", # "@dev_zio_izumi_reflect_2_12", "@dev_zio_zio_2_12", diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/producer/BUILD.bazel b/core/src/test/scala/com/wixpress/dst/greyhound/core/producer/BUILD.bazel index 27f12381..46310b8b 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/producer/BUILD.bazel +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/producer/BUILD.bazel @@ -16,7 +16,6 @@ specs2_unit_test( "//core/src/main/scala/com/wixpress/dst/greyhound/core/producer", "//core/src/test/resources", "//core/src/test/scala/com/wixpress/dst/greyhound/core/testkit", - "@ch_qos_logback_logback_classic", # "@dev_zio_izumi_reflect_2_12", "@dev_zio_zio_2_12", "@dev_zio_zio_test_2_12", diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/BUILD.bazel b/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/BUILD.bazel index 17c68cef..75f65d20 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/BUILD.bazel +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/BUILD.bazel @@ -15,7 +15,6 @@ scala_library( "//core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry", "//core/src/main/scala/com/wixpress/dst/greyhound/core/metrics", "//core/src/main/scala/com/wixpress/dst/greyhound/core/producer", - "//core/src/main/scala/com/wixpress/dst/greyhound/core/zioutils", # "@dev_zio_izumi_reflect_2_12", # "@dev_zio_izumi_reflect_thirdparty_boopickle_shaded_2_12", "@dev_zio_zio_2_12", diff --git a/future-interop/src/it/scala/com/wixpress/dst/greyhound/future/BUILD.bazel b/future-interop/src/it/scala/com/wixpress/dst/greyhound/future/BUILD.bazel index 45e8069e..af2ef67c 100644 --- a/future-interop/src/it/scala/com/wixpress/dst/greyhound/future/BUILD.bazel +++ b/future-interop/src/it/scala/com/wixpress/dst/greyhound/future/BUILD.bazel @@ -21,11 +21,8 @@ specs2_ite2e_test( "//core/src/test/resources", "//core/src/test/scala/com/wixpress/dst/greyhound/core/testkit", "//future-interop/src/main/scala/com/wixpress/dst/greyhound/future", - "//java-interop/src/main/java/com/wixpress/dst/greyhound/java", - "@ch_qos_logback_logback_classic", "@dev_zio_izumi_reflect_2_12", "@dev_zio_zio_2_12", "@dev_zio_zio_stacktracer_2_12", - "@org_apache_kafka_kafka_clients", ], ) From 66f88e5faa343c274355e1c2a11db45a2a486bf2 Mon Sep 17 00:00:00 2001 From: Alexey Dsiuba Date: Tue, 14 Mar 2023 12:14:06 +0200 Subject: [PATCH 03/52] Blocking retries attempts tracking fix (#32757) * Blocking retries attempts tracking fix #pr * Factored out RetryAttempt logic, TODO tests * Added RetryAttemptTest * Reverted RetryHeader back * Fixed retry test GitOrigin-RevId: 7e0e8cbccdd4c5fe6446775262b6695697785b2b --- .../retry/BlockingRetryRecordHandler.scala | 47 +++++- .../retry/NonBlockingRetryHelper.scala | 137 ++++++------------ .../retry/NonBlockingRetryRecordHandler.scala | 7 +- .../core/consumer/retry/RetryAttempt.scala | 98 +++++++++++++ .../consumer/retry/RetryRecordHandler.scala | 13 +- .../consumer/retry/RetryAttemptTest.scala | 78 ++++++++++ .../RetryConsumerRecordHandlerTest.scala | 32 +--- .../core/testkit/FakeRetryHelper.scala | 43 +----- 8 files changed, 280 insertions(+), 175 deletions(-) create mode 100644 core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala create mode 100644 core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttemptTest.scala diff --git 
a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/BlockingRetryRecordHandler.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/BlockingRetryRecordHandler.scala index 502e31e1..2f5d59a9 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/BlockingRetryRecordHandler.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/BlockingRetryRecordHandler.scala @@ -3,9 +3,8 @@ package com.wixpress.dst.greyhound.core.consumer.retry import java.util.concurrent.TimeUnit import com.wixpress.dst.greyhound.core.{Group, TopicPartition} import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, RecordHandler} -import com.wixpress.dst.greyhound.core.consumer.retry.BlockingState.{Blocked, Blocking => InternalBlocking, IgnoringOnce} +import com.wixpress.dst.greyhound.core.consumer.retry.BlockingState.{Blocked, IgnoringOnce, Blocking => InternalBlocking} import com.wixpress.dst.greyhound.core.consumer.retry.RetryRecordHandlerMetric.{BlockingRetryHandlerInvocationFailed, DoneBlockingBeforeRetry, NoRetryOnNonRetryableFailure} -import com.wixpress.dst.greyhound.core.consumer.retry.ZIOHelper.foreachWhile import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics.report import com.wixpress.dst.greyhound.core.zioutils.AwaitShutdown @@ -31,7 +30,7 @@ private[retry] object BlockingRetryRecordHandler { override def handle(record: ConsumerRecord[K, V])(implicit trace: Trace): ZIO[GreyhoundMetrics with R, Nothing, LastHandleResult] = { val topicPartition = TopicPartition(record.topic, record.partition) - def pollBlockingStateWithSuspensions(interval: Duration, start: Long): URIO[GreyhoundMetrics, PollResult] = { + def pollBlockingStateWithSuspensions(record: ConsumerRecord[K, V], interval: Duration, start: Long): URIO[GreyhoundMetrics, PollResult] = { for { shouldBlock <- blockingStateResolver.resolve(record) shouldPollAgain <- @@ -43,14 +42,14 @@ private[retry] object BlockingRetryRecordHandler { } yield shouldPollAgain } - def blockOnErrorFor(interval: Duration) = { + def blockOnErrorFor(record: ConsumerRecord[K, V], interval: Duration) = { for { start <- currentTime(TimeUnit.MILLISECONDS) continueBlocking <- if (interval.toMillis > 100L) { awaitShutdown(record.topicPartition).flatMap( _.interruptOnShutdown( - pollBlockingStateWithSuspensions(interval, start).repeatWhile(result => result.pollAgain).map(_.blockHandling) + pollBlockingStateWithSuspensions(record, interval, start).repeatWhile(result => result.pollAgain).map(_.blockHandling) ).reporting(r => DoneBlockingBeforeRetry(record.topic, record.partition, record.offset, r.duration, r.failed)) ) } else { @@ -63,6 +62,7 @@ private[retry] object BlockingRetryRecordHandler { } def handleAndMaybeBlockOnErrorFor( + record: ConsumerRecord[K, V], interval: Option[Duration] ): ZIO[R with GreyhoundMetrics, Nothing, LastHandleResult] = { handler.handle(record).map(_ => LastHandleResult(lastHandleSucceeded = true, shouldContinue = false)).catchAll { @@ -73,7 +73,7 @@ private[retry] object BlockingRetryRecordHandler { case error => interval .map { interval => - report(BlockingRetryHandlerInvocationFailed(topicPartition, record.offset, error.toString)) *> blockOnErrorFor(interval) + report(BlockingRetryHandlerInvocationFailed(topicPartition, record.offset, error.toString)) *> blockOnErrorFor(record, interval) } .getOrElse(ZIO.succeed(LastHandleResult(lastHandleSucceeded = false, shouldContinue = false))) } 
@@ -96,13 +96,44 @@ private[retry] object BlockingRetryRecordHandler { } else { val durationsIncludingForInvocationWithNoErrorHandling = retryConfig.blockingBackoffs(record.topic)().map(Some(_)) :+ None for { - result <- foreachWhile(durationsIncludingForInvocationWithNoErrorHandling) { interval => handleAndMaybeBlockOnErrorFor(interval) } - _ <- maybeBackToStateBlocking + result <- retryEvery(record, durationsIncludingForInvocationWithNoErrorHandling) { (rec, interval) => + handleAndMaybeBlockOnErrorFor(rec, interval) + } + _ <- maybeBackToStateBlocking } yield result } } } + private def retryEvery[K, V, R, E](record: ConsumerRecord[K, V], as: Iterable[Option[Duration]])( + f: (ConsumerRecord[K, V], Option[Duration]) => ZIO[R, E, LastHandleResult] + )(implicit trace: Trace): ZIO[R, E, LastHandleResult] = { + ZIO.succeed(as.iterator).flatMap { i => + def loop(retryAttempt: Option[RetryAttempt]): ZIO[R, E, LastHandleResult] = + if (i.hasNext) { + val nextDelay = i.next + val recordWithAttempt = retryAttempt.fold(record) { attempt => + record.copy(headers = record.headers ++ RetryAttempt.toHeaders(attempt)) + } + f(recordWithAttempt, nextDelay).flatMap { result => + if (result.shouldContinue) Clock.instant.flatMap { now => + val nextAttempt = RetryAttempt( + originalTopic = record.topic, + attempt = retryAttempt.fold(0)(_.attempt + 1), + submittedAt = now, + backoff = nextDelay getOrElse Duration.Zero + ) + loop(Some(nextAttempt)) + } + else ZIO.succeed(result) + } + } + else ZIO.succeed(LastHandleResult(lastHandleSucceeded = false, shouldContinue = false)) + + loop(None) + } + } + private def handleNonRetriable[K, V, E, R](record: ConsumerRecord[K, V], topicPartition: TopicPartition, cause: Exception) = report(NoRetryOnNonRetryableFailure(topicPartition, record.offset, cause)) .as(LastHandleResult(lastHandleSucceeded = false, shouldContinue = false)) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryHelper.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryHelper.scala index a73b7dce..1d1cd24c 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryHelper.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryHelper.scala @@ -1,32 +1,23 @@ package com.wixpress.dst.greyhound.core.consumer.retry -import java.time.{Duration => JavaDuration, Instant} -import java.util.concurrent.TimeUnit.MILLISECONDS -import java.util.regex.Pattern -import com.wixpress.dst.greyhound.core.Serdes.StringSerde import com.wixpress.dst.greyhound.core.consumer.domain.ConsumerSubscription.{TopicPattern, Topics} -import com.wixpress.dst.greyhound.core.consumer.retry.RetryAttempt.{currentTime, RetryAttemptNumber} -import com.wixpress.dst.greyhound.core.consumer.retry.RetryDecision.{NoMoreRetries, RetryWith} import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, ConsumerSubscription} +import com.wixpress.dst.greyhound.core.consumer.retry.RetryDecision.{NoMoreRetries, RetryWith} import com.wixpress.dst.greyhound.core.consumer.retry.RetryRecordHandlerMetric.WaitingForRetry import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics.report import com.wixpress.dst.greyhound.core.producer.ProducerRecord -import com.wixpress.dst.greyhound.core.{durationDeserializer, instantDeserializer, Group, Headers, Topic} -import zio.Clock -import zio.Duration +import 
com.wixpress.dst.greyhound.core.{Group, Topic} import zio.Schedule.spaced -import zio.{Chunk, UIO, URIO, _} +import zio.{Chunk, Clock, Duration, URIO, _} +import java.time.Instant +import java.util.regex.Pattern import scala.util.Try trait NonBlockingRetryHelper { def retryTopicsFor(originalTopic: Topic): Set[Topic] - def retryAttempt(topic: Topic, headers: Headers, subscription: ConsumerSubscription)( - implicit trace: Trace - ): UIO[Option[RetryAttempt]] - def retryDecision[E]( retryAttempt: Option[RetryAttempt], record: ConsumerRecord[Chunk[Byte], Chunk[Byte]], @@ -34,7 +25,7 @@ trait NonBlockingRetryHelper { subscription: ConsumerSubscription )(implicit trace: Trace): URIO[Any, RetryDecision] - def retrySteps = retryTopicsFor("").size + def retrySteps: Int = retryTopicsFor("").size } object NonBlockingRetryHelper { @@ -49,82 +40,70 @@ object NonBlockingRetryHelper { override def retryTopicsFor(topic: Topic): Set[Topic] = policy(topic).intervals.indices.foldLeft(Set.empty[String])((acc, attempt) => acc + s"$topic-$group-retry-$attempt") - override def retryAttempt(topic: Topic, headers: Headers, subscription: ConsumerSubscription)( - implicit trace: Trace - ): UIO[Option[RetryAttempt]] = { - (for { - submitted <- headers.get(RetryHeader.Submitted, instantDeserializer) - backoff <- headers.get(RetryHeader.Backoff, durationDeserializer) - originalTopic <- headers.get[String](RetryHeader.OriginalTopic, StringSerde) - } yield for { - ta <- topicAttempt(subscription, topic, originalTopic) - TopicAttempt(originalTopic, attempt) = ta - s <- submitted - b <- backoff - } yield RetryAttempt(originalTopic, attempt, s, b)) - .catchAll(_ => ZIO.none) - } - - private def topicAttempt( - subscription: ConsumerSubscription, - topic: Topic, - originalTopicHeader: Option[String] - ) = - subscription match { - case _: Topics => extractTopicAttempt(group, topic) - case _: TopicPattern => - extractTopicAttemptFromPatternRetryTopic(group, topic, originalTopicHeader) - } - override def retryDecision[E]( retryAttempt: Option[RetryAttempt], record: ConsumerRecord[Chunk[Byte], Chunk[Byte]], error: E, subscription: ConsumerSubscription - )(implicit trace: Trace): URIO[Any, RetryDecision] = currentTime.map(now => { - val nextRetryAttempt = retryAttempt.fold(0)(_.attempt + 1) + )(implicit trace: Trace): URIO[Any, RetryDecision] = Clock.instant.map(now => { + val blockingRetriesBefore = RetryAttempt.maxBlockingAttempts( + NonBlockingRetryHelper.originalTopic(record.topic, group), + retryConfig + ).getOrElse(0) + + // attempt if present contains full number of retries + val nextNonBlockingAttempt = retryAttempt.fold(0)(_.attempt + 1 - blockingRetriesBefore) + val nextRetryAttempt = nextNonBlockingAttempt + blockingRetriesBefore val originalTopic = retryAttempt.fold(record.topic)(_.originalTopic) val retryTopic = subscription match { - case _: TopicPattern => patternRetryTopic(group, nextRetryAttempt) - case _: Topics => fixedRetryTopic(originalTopic, group, nextRetryAttempt) + case _: TopicPattern => patternRetryTopic(group, nextNonBlockingAttempt) + case _: Topics => fixedRetryTopic(originalTopic, group, nextNonBlockingAttempt) } val topicRetryPolicy = policy(record.topic) topicRetryPolicy.intervals - .lift(nextRetryAttempt) + .lift(nextNonBlockingAttempt) .map { backoff => + val attempt = RetryAttempt( + attempt = nextRetryAttempt, + originalTopic = originalTopic, + submittedAt = now, + backoff = backoff + ) topicRetryPolicy.recordMutate( ProducerRecord( topic = retryTopic, value = record.value, key = record.key, 
partition = None, - headers = record.headers + - (RetryHeader.Submitted -> toChunk(now.toEpochMilli)) + - (RetryHeader.Backoff -> toChunk(backoff.toMillis)) + - (RetryHeader.OriginalTopic -> toChunk(originalTopic)) + - (RetryHeader.RetryAttempt -> toChunk(nextRetryAttempt)) + headers = record.headers ++ RetryAttempt.toHeaders(attempt) ) ) } .fold[RetryDecision](NoMoreRetries)(RetryWith) }) + } - private def toChunk(long: Long): Chunk[Byte] = - Chunk.fromArray(long.toString.getBytes) - - private def toChunk(str: String): Chunk[Byte] = - Chunk.fromArray(str.getBytes) + private[retry] def attemptNumberFromTopic( + subscription: ConsumerSubscription, + topic: Topic, + originalTopicHeader: Option[String], + group: Group + ) = + subscription match { + case _: Topics => extractTopicAttempt(group, topic) + case _: TopicPattern => + extractTopicAttemptFromPatternRetryTopic(group, topic, originalTopicHeader) } - private def extractTopicAttempt[E](group: Group, inputTopic: Topic) = + private def extractTopicAttempt(group: Group, inputTopic: Topic) = inputTopic.split(s"-$group-retry-").toSeq match { case Seq(topic, attempt) if Try(attempt.toInt).isSuccess => Some(TopicAttempt(topic, attempt.toInt)) - case _ => None + case _ => None } - private def extractTopicAttemptFromPatternRetryTopic[E]( + private def extractTopicAttemptFromPatternRetryTopic( group: Group, inputTopic: Topic, originalTopicHeader: Option[String] @@ -166,49 +145,27 @@ object DelayHeaders { val Backoff = "backOffTimeMs" } -object RetryHeader { - val Submitted = "submitTimestamp" - val Backoff = DelayHeaders.Backoff - val OriginalTopic = "GH_OriginalTopic" - val RetryAttempt = "GH_RetryAttempt" -} - -case class RetryAttempt( - originalTopic: Topic, - attempt: RetryAttemptNumber, - submittedAt: Instant, - backoff: Duration -) { - - def sleep(implicit trace: Trace): URIO[GreyhoundMetrics, Unit] = - RetryUtil.sleep(submittedAt, backoff) race reportWaitingInIntervals(every = 60.seconds) - - private def reportWaitingInIntervals(every: Duration) = - report(WaitingForRetry(originalTopic, attempt, submittedAt.toEpochMilli, backoff.toMillis)) - .repeat(spaced(every)) - .unit -} - object RetryUtil { + def sleep(attempt: RetryAttempt)(implicit trace: Trace): URIO[GreyhoundMetrics, Unit] = + sleep(attempt.submittedAt, attempt.backoff) race + report(WaitingForRetry(attempt.originalTopic, attempt.attempt, attempt.submittedAt.toEpochMilli, attempt.backoff.toMillis)) + .repeat(spaced(60.seconds)) + .unit + def sleep(submittedAt: Instant, backoff: Duration)(implicit trace: Trace): URIO[Any, Unit] = { val expiresAt = submittedAt.plus(backoff.asJava) - currentTime + Clock.instant .map(_.isAfter(expiresAt)) .flatMap(expired => if (expired) ZIO.unit else - ZIO.sleep(1.seconds).repeatUntilZIO(_ => currentTime.map(_.isAfter(expiresAt))).unit + ZIO.sleep(1.second).repeatUntilZIO(_ => Clock.instant.map(_.isAfter(expiresAt))).unit ) } } private case class TopicAttempt(originalTopic: Topic, attempt: Int) -object RetryAttempt { - type RetryAttemptNumber = Int - val currentTime = Clock.currentTime(MILLISECONDS).map(Instant.ofEpochMilli) -} - sealed trait RetryDecision object RetryDecision { diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryRecordHandler.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryRecordHandler.scala index a15f1e5b..a6ff6560 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryRecordHandler.scala +++ 
b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryRecordHandler.scala @@ -31,11 +31,12 @@ private[retry] object NonBlockingRetryRecordHandler { retryConfig: RetryConfig, subscription: ConsumerSubscription, nonBlockingRetryHelper: NonBlockingRetryHelper, + groupId: Group, awaitShutdown: TopicPartition => UIO[AwaitShutdown] )(implicit evK: K <:< Chunk[Byte], evV: V <:< Chunk[Byte]): NonBlockingRetryRecordHandler[V, K, R] = new NonBlockingRetryRecordHandler[V, K, R] { override def handle(record: ConsumerRecord[K, V]): ZIO[GreyhoundMetrics with R, Nothing, Any] = { - nonBlockingRetryHelper.retryAttempt(record.topic, record.headers, subscription).flatMap { retryAttempt => + RetryAttempt.extract(record.headers, record.topic, groupId, subscription, Some(retryConfig)).flatMap { retryAttempt => maybeDelayRetry(record, retryAttempt) *> handler.handle(record).catchAll { case Right(_: NonRetriableException) => ZIO.unit @@ -56,7 +57,7 @@ private[retry] object NonBlockingRetryRecordHandler { WaitingBeforeRetry(record.topic, retryAttempt, record.partition, record.offset, correlationId) ) *> awaitShutdown(record.topicPartition) - .flatMap(_.interruptOnShutdown(retryAttempt.sleep)) + .flatMap(_.interruptOnShutdown(RetryUtil.sleep(retryAttempt))) .reporting(r => DoneWaitingBeforeRetry(record.topic, record.partition, record.offset, retryAttempt, r.duration, r.failed, correlationId) ) @@ -74,7 +75,7 @@ private[retry] object NonBlockingRetryRecordHandler { override def handleAfterBlockingFailed( record: ConsumerRecord[K, V] ): ZIO[GreyhoundMetrics with R, Nothing, Any] = { - nonBlockingRetryHelper.retryAttempt(record.topic, record.headers, subscription).flatMap { retryAttempt => + RetryAttempt.extract(record.headers, record.topic, groupId, subscription, Some(retryConfig)).flatMap { retryAttempt => maybeRetry(retryAttempt, BlockingHandlerFailed, record) } } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala new file mode 100644 index 00000000..9265eab8 --- /dev/null +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala @@ -0,0 +1,98 @@ +package com.wixpress.dst.greyhound.core.consumer.retry + +import com.wixpress.dst.greyhound.core.Serdes.StringSerde +import com.wixpress.dst.greyhound.core._ +import com.wixpress.dst.greyhound.core.consumer.domain.ConsumerSubscription +import com.wixpress.dst.greyhound.core.consumer.retry.NonBlockingRetryHelper.attemptNumberFromTopic +import com.wixpress.dst.greyhound.core.consumer.retry.RetryAttempt.RetryAttemptNumber +import zio._ + +import java.time.Instant + +/** + * Description of a retry attempt + * @param attempt contains which attempt is it, starting from 0 including blocking and non-blocking attempts + */ +case class RetryAttempt( + originalTopic: Topic, + attempt: RetryAttemptNumber, + submittedAt: Instant, + backoff: Duration +) + +object RetryHeader { + val Submitted = "submitTimestamp" + val Backoff = DelayHeaders.Backoff + val OriginalTopic = "GH_OriginalTopic" + val RetryAttempt = "GH_RetryAttempt" +} + +object RetryAttempt { + type RetryAttemptNumber = Int + + private def toChunk(str: String): Chunk[Byte] = Chunk.fromArray(str.getBytes) + + def toHeaders(attempt: RetryAttempt): Headers = Headers( + RetryHeader.Submitted -> toChunk(attempt.submittedAt.toEpochMilli.toString), + RetryHeader.Backoff -> toChunk(attempt.backoff.toMillis.toString), + 
RetryHeader.OriginalTopic -> toChunk(attempt.originalTopic), + RetryHeader.RetryAttempt -> toChunk(attempt.attempt.toString), + ) + + private case class RetryAttemptHeaders( + originalTopic: Option[Topic], + attempt: Option[RetryAttemptNumber], + submittedAt: Option[Instant], + backoff: Option[Duration] + ) + + private def fromHeaders(headers: Headers): Task[RetryAttemptHeaders] = + for { + submitted <- headers.get(RetryHeader.Submitted, instantDeserializer) + backoff <- headers.get(RetryHeader.Backoff, durationDeserializer) + topic <- headers.get[String](RetryHeader.OriginalTopic, StringSerde) + attempt <- headers.get(RetryHeader.RetryAttempt, longDeserializer) + } yield RetryAttemptHeaders(topic, attempt.map(_.toInt), submitted, backoff) + + /** @return None on infinite blocking retries */ + def maxBlockingAttempts(topic: Topic, retryConfig: Option[RetryConfig]): Option[Int] = + retryConfig.map(_.blockingBackoffs(topic)()).fold(Option(0)) { + case finite if finite.hasDefiniteSize => Some(finite.size) + case _ => None + } + + /** @return None on infinite retries */ + def maxOverallAttempts(topic: Topic, retryConfig: Option[RetryConfig]): Option[Int] = + maxBlockingAttempts(topic, retryConfig).map { + _ + retryConfig.fold(0)(_.nonBlockingBackoffs(topic).length) + } + + def extract( + headers: Headers, + topic: Topic, + group: Group, + subscription: ConsumerSubscription, + retryConfig: Option[RetryConfig], + )(implicit trace: Trace): UIO[Option[RetryAttempt]] = { + + def nonBlockingAttempt(hs: RetryAttemptHeaders): Option[RetryAttempt] = + for { + submitted <- hs.submittedAt + backoff <- hs.backoff + TopicAttempt(originalTopic, attempt) <- attemptNumberFromTopic(subscription, topic, hs.originalTopic, group) + blockingRetries = maxBlockingAttempts(originalTopic, retryConfig).getOrElse(0) + } yield RetryAttempt(originalTopic, blockingRetries + attempt, submitted, backoff) + + def blockingAttempt(hs: RetryAttemptHeaders): Option[RetryAttempt] = + for { + submitted <- hs.submittedAt + backoff <- hs.backoff + originalTopic <- hs.originalTopic + attempt <- hs.attempt + } yield RetryAttempt(originalTopic, attempt, submitted, backoff) + + fromHeaders(headers).map { hs => + nonBlockingAttempt(hs) orElse blockingAttempt(hs) + } + }.catchAll(_ => ZIO.none) +} diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryRecordHandler.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryRecordHandler.scala index af0749c6..8ee8ab9f 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryRecordHandler.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryRecordHandler.scala @@ -33,7 +33,7 @@ object RetryRecordHandler { ): RecordHandler[R with R2 with GreyhoundMetrics, Nothing, K, V] = { val nonBlockingHandler = - NonBlockingRetryRecordHandler(handler, producer, retryConfig, subscription, nonBlockingRetryHelper, awaitShutdown) + NonBlockingRetryRecordHandler(handler, producer, retryConfig, subscription, nonBlockingRetryHelper, groupId, awaitShutdown) val blockingHandler = BlockingRetryRecordHandler(groupId, handler, retryConfig, blockingState, nonBlockingHandler, awaitShutdown) val blockingAndNonBlockingHandler = BlockingAndNonBlockingRetryRecordHandler(groupId, blockingHandler, nonBlockingHandler) @@ -55,15 +55,4 @@ object RetryRecordHandler { record.headers.get[String](key, StringSerde).catchAll(_ => ZIO.none) } -object ZIOHelper { - def foreachWhile[R, E, A](as: Iterable[A])(f: A => ZIO[R, E, 
LastHandleResult])(implicit trace: Trace): ZIO[R, E, LastHandleResult] = - ZIO.succeed(as.iterator).flatMap { i => - def loop: ZIO[R, E, LastHandleResult] = - if (i.hasNext) f(i.next).flatMap(result => if (result.shouldContinue) loop else ZIO.succeed(result)) - else ZIO.succeed(LastHandleResult(lastHandleSucceeded = false, shouldContinue = false)) - - loop - } -} - case class LastHandleResult(lastHandleSucceeded: Boolean, shouldContinue: Boolean) diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttemptTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttemptTest.scala new file mode 100644 index 00000000..3139f845 --- /dev/null +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttemptTest.scala @@ -0,0 +1,78 @@ +package com.wixpress.dst.greyhound.core.consumer.retry + +import com.wixpress.dst.greyhound.core.consumer.domain.ConsumerSubscription +import com.wixpress.dst.greyhound.core.testkit.BaseTest +import zio.test.TestEnvironment + +import java.time.{Duration, Instant} +import scala.util.Random +import scala.concurrent.duration._ + +class RetryAttemptTest extends BaseTest[TestEnvironment] { + + "RetryAttempt.extract" should { + "deserialize attempt from headers for blocking retries" in { + val attempt = randomRetryAttempt + val headers = RetryAttempt.toHeaders(attempt) + val subscription = ConsumerSubscription.Topics(Set(attempt.originalTopic)) + for (result <- RetryAttempt.extract(headers, attempt.originalTopic, randomStr, subscription, None)) + yield result must beSome(attempt) + } + "deserialize attempt from headers and topic for non-blocking retries" in { + val attempt = randomRetryAttempt + // topic and attempt must be extracted from retryTopic + val headers = RetryAttempt.toHeaders(attempt.copy(originalTopic = "", attempt = -1)) + val subscription = ConsumerSubscription.Topics(Set(attempt.originalTopic)) + val group = randomStr + val retryTopic = NonBlockingRetryHelper.fixedRetryTopic(attempt.originalTopic, group, attempt.attempt) + for (result <- RetryAttempt.extract(headers, retryTopic, group, subscription, None)) + yield result must beSome(attempt) + } + "deserialize attempt for non-blocking retry after blocking retries" in { + val attempt = randomRetryAttempt + val headers = RetryAttempt.toHeaders(attempt) + val subscription = ConsumerSubscription.Topics(Set(attempt.originalTopic)) + val group = randomStr + val retries = RetryConfig.blockingFollowedByNonBlockingRetry( + blockingBackoffs = 1.milli :: 1.second :: Nil, + nonBlockingBackoffs = 5.minutes :: Nil, + ) + val retryTopic = NonBlockingRetryHelper.fixedRetryTopic(attempt.originalTopic, group, attempt.attempt) + for (result <- RetryAttempt.extract(headers, retryTopic, group, subscription, Some(retries))) + yield result must beSome(attempt.copy(attempt = attempt.attempt + 2)) // with 2 blocking retries before + } + } + + "RetryAttempt.maxOverallAttempts" should { + "return 0 if no retries configured" in { + RetryAttempt.maxOverallAttempts(randomStr, None) must beSome(0) + } + "return max attempts for blocking retries" in { + val config = RetryConfig.finiteBlockingRetry(1.milli, 1.second) + RetryAttempt.maxOverallAttempts(randomStr, Some(config)) must beSome(2) + } + "return max attempts for non-blocking retries" in { + val config = RetryConfig.nonBlockingRetry(1.milli, 1.second, 5.minutes) + RetryAttempt.maxOverallAttempts(randomStr, Some(config)) must beSome(3) + } + "return max attempts for blocking retries followed by 
non-blocking" in { + val config = RetryConfig.blockingFollowedByNonBlockingRetry(1.milli :: 2.seconds :: Nil, 1.minute :: Nil) + RetryAttempt.maxOverallAttempts(randomStr, Some(config)) must beSome(3) + } + "return None for infinite blocking retries" in { + val config = RetryConfig.infiniteBlockingRetry(1.milli) + RetryAttempt.maxOverallAttempts(randomStr, Some(config)) must beNone + } + } + + override def env = testEnvironment + + private def randomStr = Random.alphanumeric.take(10).mkString + + private def randomRetryAttempt = RetryAttempt( + originalTopic = randomStr, + attempt = Random.nextInt(1000), + submittedAt = Instant.ofEpochMilli(math.abs(Random.nextLong())), + backoff = Duration.ofMillis(Random.nextInt(100000)) + ) +} diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConsumerRecordHandlerTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConsumerRecordHandlerTest.scala index c713fb12..b34e8d82 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConsumerRecordHandlerTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConsumerRecordHandlerTest.scala @@ -4,7 +4,7 @@ import java.time.Instant import com.wixpress.dst.greyhound.core.Serdes._ import com.wixpress.dst.greyhound.core._ import com.wixpress.dst.greyhound.core.consumer.domain.ConsumerSubscription.Topics -import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, ConsumerSubscription, RecordHandler} +import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, RecordHandler} import com.wixpress.dst.greyhound.core.consumer.retry.BlockingState.{Blocked, Blocking => InternalBlocking, IgnoringAll, IgnoringOnce} import com.wixpress.dst.greyhound.core.consumer.retry.RetryConsumerRecordHandlerTest.{offset, partition, _} import com.wixpress.dst.greyhound.core.consumer.retry.RetryRecordHandlerMetric.{BlockingIgnoredForAllFor, BlockingIgnoredOnceFor, BlockingRetryHandlerInvocationFailed, NoRetryOnNonRetryableFailure} @@ -21,8 +21,6 @@ import zio.Random.{nextBytes, nextIntBounded} import zio.managed.UManaged import zio.test.TestClock -import scala.concurrent.TimeoutException - class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics] { override def env: UManaged[ZEnvironment[TestClock with TestMetrics]] = @@ -52,9 +50,6 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics _ <- retryHandler.handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L)) record <- producer.records.take now <- currentTime - retryAttempt <- IntSerde.serialize(retryTopic, 0) - submittedAt <- InstantSerde.serialize(retryTopic, now) - backoff <- DurationSerde.serialize(retryTopic, 1.second) } yield { record === ProducerRecord( @@ -62,7 +57,7 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics value, Some(key), partition = None, - headers = Headers("retry-attempt" -> retryAttempt, "retry-submitted-at" -> submittedAt, "retry-backoff" -> backoff) + headers = RetryAttempt.toHeaders(RetryAttempt(topic, 0, now, 1.second)) ) } } @@ -86,10 +81,7 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics ) value <- bytes begin <- currentTime - retryAttempt <- IntSerde.serialize(retryTopic, 0) - submittedAt <- InstantSerde.serialize(retryTopic, begin) - backoff <- DurationSerde.serialize(retryTopic, 1.second) - headers = Headers("retry-attempt" -> retryAttempt, 
"retry-submitted-at" -> submittedAt, "retry-backoff" -> backoff) + headers = RetryAttempt.toHeaders(RetryAttempt(topic, 0, begin, 1.second)) _ <- retryHandler.handle(ConsumerRecord(retryTopic, partition, offset, headers, None, value, 0L, 0L, 0L)).fork _ <- TestClock.adjust(1.second).repeat(Schedule.once) end <- executionTime.await.disconnect.timeoutFail(TimeoutWaitingForAssertion)(5.seconds) @@ -404,7 +396,8 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics producer <- FakeProducer.make topic <- randomTopicName blockingState <- Ref.make[Map[BlockingTarget, BlockingState]](Map.empty) - retryHelper = alwaysBackOffRetryHelper(3.seconds) + retryHelper = FakeRetryHelper(topic) + now <- Clock.instant handling <- AwaitShutdown.makeManaged.flatMap { awaitShutdown => val retryHandler = RetryRecordHandler.withRetries( group, @@ -416,11 +409,12 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics retryHelper, awaitShutdown = _ => ZIO.succeed(awaitShutdown) ) + val headers = RetryAttempt.toHeaders(RetryAttempt(topic, 0, now, 3.seconds)) for { key <- bytes value <- bytes handling <- retryHandler - .handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L)) + .handle(ConsumerRecord(topic, partition, offset, headers, Some(key), value, 0L, 0L, 0L)) .forkDaemon } yield handling } @@ -534,18 +528,6 @@ object RetryConsumerRecordHandlerTest { def randomTopicName = randomStr.map(suffix => s"some-topic-$suffix") val cause = new RuntimeException("cause") - - def alwaysBackOffRetryHelper(backoff: Duration) = { - new FakeNonBlockingRetryHelper { - override val topic: Topic = "" - - override def retryAttempt(topic: Topic, headers: Headers, subscription: ConsumerSubscription)( - implicit trace: Trace - ): UIO[Option[RetryAttempt]] = ZIO.succeed( - Some(RetryAttempt(topic, 1, Instant.now, backoff)) - ) - } - } } object TimeoutWaitingForAssertion extends RuntimeException diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/FakeRetryHelper.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/FakeRetryHelper.scala index cee13948..619ada36 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/FakeRetryHelper.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/FakeRetryHelper.scala @@ -1,35 +1,21 @@ package com.wixpress.dst.greyhound.core.testkit import java.time.Instant -import java.util.concurrent.TimeUnit.MILLISECONDS - -import com.wixpress.dst.greyhound.core.Serdes._ import com.wixpress.dst.greyhound.core._ import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, ConsumerSubscription} import com.wixpress.dst.greyhound.core.consumer.retry.RetryDecision.{NoMoreRetries, RetryWith} -import com.wixpress.dst.greyhound.core.consumer.retry.{BlockingHandlerFailed, NonBlockingRetryHelper, RetryAttempt, RetryDecision} +import com.wixpress.dst.greyhound.core.consumer.retry.{BlockingHandlerFailed, NonBlockingRetryHelper, RetryAttempt, RetryDecision, RetryHeader} import com.wixpress.dst.greyhound.core.producer.ProducerRecord import com.wixpress.dst.greyhound.core.testkit.FakeRetryHelper._ import zio._ import zio.Clock -import zio.Clock - trait FakeNonBlockingRetryHelper extends NonBlockingRetryHelper { val topic: Topic override def retryTopicsFor(originalTopic: Topic): Set[Topic] = Set(s"$originalTopic-retry") - override def retryAttempt(topic: Topic, headers: Headers, subscription: ConsumerSubscription)( - implicit trace: Trace - ): 
UIO[Option[RetryAttempt]] = - (for { - attempt <- headers.get(Header.Attempt, IntSerde) - submittedAt <- headers.get(Header.SubmittedAt, InstantSerde) - backoff <- headers.get(Header.Backoff, DurationSerde) - } yield retryAttemptInternal(topic, attempt, submittedAt, backoff)).orElse(ZIO.none) - override def retryDecision[E]( retryAttempt: Option[RetryAttempt], record: ConsumerRecord[Chunk[Byte], Chunk[Byte]], @@ -38,35 +24,23 @@ trait FakeNonBlockingRetryHelper extends NonBlockingRetryHelper { )(implicit trace: Trace): URIO[Any, RetryDecision] = error match { case RetriableError | BlockingHandlerFailed => - currentTime.flatMap(now => - recordFrom(now, retryAttempt, record) - .fold(_ => NoMoreRetries, RetryWith) + currentTime.map(now => + RetryWith(recordFrom(now, retryAttempt, record)) ) case NonRetriableError => ZIO.succeed(NoMoreRetries) } - private def retryAttemptInternal(topic: Topic, attempt: Option[Int], submittedAt: Option[Instant], backoff: Option[Duration]) = - for { - a <- attempt - s <- submittedAt - b <- backoff - } yield RetryAttempt(topic, a, s, b) - private def recordFrom(now: Instant, retryAttempt: Option[RetryAttempt], record: ConsumerRecord[Chunk[Byte], Chunk[Byte]])( implicit trace: Trace ) = { val nextRetryAttempt = retryAttempt.fold(0)(_.attempt + 1) - for { - retryAttempt <- IntSerde.serialize(topic, nextRetryAttempt) - submittedAt <- InstantSerde.serialize(topic, now) - backoff <- DurationSerde.serialize(topic, 1.second) - } yield ProducerRecord( + ProducerRecord( topic = s"$topic-retry", value = record.value, key = record.key, partition = None, - headers = Headers(Header.Attempt -> retryAttempt, Header.SubmittedAt -> submittedAt, Header.Backoff -> backoff) + headers = RetryAttempt.toHeaders(RetryAttempt(topic, nextRetryAttempt, now, 1.second)) ) } } @@ -75,13 +49,8 @@ case class FakeRetryHelper(topic: Topic) extends FakeNonBlockingRetryHelper object FakeRetryHelper { implicit private val trace = Trace.empty - object Header { - val Attempt = "retry-attempt" - val SubmittedAt = "retry-submitted-at" - val Backoff = "retry-backoff" - } - val currentTime = Clock.currentTime(MILLISECONDS).map(Instant.ofEpochMilli) + val currentTime: UIO[Instant] = Clock.instant } sealed trait HandlerError From 2a49a320443156dc527450475fb952a36ce69893 Mon Sep 17 00:00:00 2001 From: Natan Silnitsky Date: Mon, 3 Apr 2023 15:22:47 +0300 Subject: [PATCH 04/52] Revert "Blocking retries attempts tracking fix" (#33818) Revert "Blocking retries attempts tracking fix (#32757)" This reverts commit 7e0e8cbccdd4c5fe6446775262b6695697785b2b. 
GitOrigin-RevId: eb859fcf70860c06a0bd0492b746ed3ff00bc8ac --- .../retry/BlockingRetryRecordHandler.scala | 47 +----- .../retry/NonBlockingRetryHelper.scala | 137 ++++++++++++------ .../retry/NonBlockingRetryRecordHandler.scala | 7 +- .../core/consumer/retry/RetryAttempt.scala | 98 ------------- .../consumer/retry/RetryRecordHandler.scala | 13 +- .../consumer/retry/RetryAttemptTest.scala | 78 ---------- .../RetryConsumerRecordHandlerTest.scala | 32 +++- .../core/testkit/FakeRetryHelper.scala | 43 +++++- 8 files changed, 175 insertions(+), 280 deletions(-) delete mode 100644 core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala delete mode 100644 core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttemptTest.scala diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/BlockingRetryRecordHandler.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/BlockingRetryRecordHandler.scala index 2f5d59a9..502e31e1 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/BlockingRetryRecordHandler.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/BlockingRetryRecordHandler.scala @@ -3,8 +3,9 @@ package com.wixpress.dst.greyhound.core.consumer.retry import java.util.concurrent.TimeUnit import com.wixpress.dst.greyhound.core.{Group, TopicPartition} import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, RecordHandler} -import com.wixpress.dst.greyhound.core.consumer.retry.BlockingState.{Blocked, IgnoringOnce, Blocking => InternalBlocking} +import com.wixpress.dst.greyhound.core.consumer.retry.BlockingState.{Blocked, Blocking => InternalBlocking, IgnoringOnce} import com.wixpress.dst.greyhound.core.consumer.retry.RetryRecordHandlerMetric.{BlockingRetryHandlerInvocationFailed, DoneBlockingBeforeRetry, NoRetryOnNonRetryableFailure} +import com.wixpress.dst.greyhound.core.consumer.retry.ZIOHelper.foreachWhile import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics.report import com.wixpress.dst.greyhound.core.zioutils.AwaitShutdown @@ -30,7 +31,7 @@ private[retry] object BlockingRetryRecordHandler { override def handle(record: ConsumerRecord[K, V])(implicit trace: Trace): ZIO[GreyhoundMetrics with R, Nothing, LastHandleResult] = { val topicPartition = TopicPartition(record.topic, record.partition) - def pollBlockingStateWithSuspensions(record: ConsumerRecord[K, V], interval: Duration, start: Long): URIO[GreyhoundMetrics, PollResult] = { + def pollBlockingStateWithSuspensions(interval: Duration, start: Long): URIO[GreyhoundMetrics, PollResult] = { for { shouldBlock <- blockingStateResolver.resolve(record) shouldPollAgain <- @@ -42,14 +43,14 @@ private[retry] object BlockingRetryRecordHandler { } yield shouldPollAgain } - def blockOnErrorFor(record: ConsumerRecord[K, V], interval: Duration) = { + def blockOnErrorFor(interval: Duration) = { for { start <- currentTime(TimeUnit.MILLISECONDS) continueBlocking <- if (interval.toMillis > 100L) { awaitShutdown(record.topicPartition).flatMap( _.interruptOnShutdown( - pollBlockingStateWithSuspensions(record, interval, start).repeatWhile(result => result.pollAgain).map(_.blockHandling) + pollBlockingStateWithSuspensions(interval, start).repeatWhile(result => result.pollAgain).map(_.blockHandling) ).reporting(r => DoneBlockingBeforeRetry(record.topic, record.partition, record.offset, r.duration, r.failed)) ) } else { @@ -62,7 +63,6 @@ 
private[retry] object BlockingRetryRecordHandler { } def handleAndMaybeBlockOnErrorFor( - record: ConsumerRecord[K, V], interval: Option[Duration] ): ZIO[R with GreyhoundMetrics, Nothing, LastHandleResult] = { handler.handle(record).map(_ => LastHandleResult(lastHandleSucceeded = true, shouldContinue = false)).catchAll { @@ -73,7 +73,7 @@ private[retry] object BlockingRetryRecordHandler { case error => interval .map { interval => - report(BlockingRetryHandlerInvocationFailed(topicPartition, record.offset, error.toString)) *> blockOnErrorFor(record, interval) + report(BlockingRetryHandlerInvocationFailed(topicPartition, record.offset, error.toString)) *> blockOnErrorFor(interval) } .getOrElse(ZIO.succeed(LastHandleResult(lastHandleSucceeded = false, shouldContinue = false))) } @@ -96,44 +96,13 @@ private[retry] object BlockingRetryRecordHandler { } else { val durationsIncludingForInvocationWithNoErrorHandling = retryConfig.blockingBackoffs(record.topic)().map(Some(_)) :+ None for { - result <- retryEvery(record, durationsIncludingForInvocationWithNoErrorHandling) { (rec, interval) => - handleAndMaybeBlockOnErrorFor(rec, interval) - } - _ <- maybeBackToStateBlocking + result <- foreachWhile(durationsIncludingForInvocationWithNoErrorHandling) { interval => handleAndMaybeBlockOnErrorFor(interval) } + _ <- maybeBackToStateBlocking } yield result } } } - private def retryEvery[K, V, R, E](record: ConsumerRecord[K, V], as: Iterable[Option[Duration]])( - f: (ConsumerRecord[K, V], Option[Duration]) => ZIO[R, E, LastHandleResult] - )(implicit trace: Trace): ZIO[R, E, LastHandleResult] = { - ZIO.succeed(as.iterator).flatMap { i => - def loop(retryAttempt: Option[RetryAttempt]): ZIO[R, E, LastHandleResult] = - if (i.hasNext) { - val nextDelay = i.next - val recordWithAttempt = retryAttempt.fold(record) { attempt => - record.copy(headers = record.headers ++ RetryAttempt.toHeaders(attempt)) - } - f(recordWithAttempt, nextDelay).flatMap { result => - if (result.shouldContinue) Clock.instant.flatMap { now => - val nextAttempt = RetryAttempt( - originalTopic = record.topic, - attempt = retryAttempt.fold(0)(_.attempt + 1), - submittedAt = now, - backoff = nextDelay getOrElse Duration.Zero - ) - loop(Some(nextAttempt)) - } - else ZIO.succeed(result) - } - } - else ZIO.succeed(LastHandleResult(lastHandleSucceeded = false, shouldContinue = false)) - - loop(None) - } - } - private def handleNonRetriable[K, V, E, R](record: ConsumerRecord[K, V], topicPartition: TopicPartition, cause: Exception) = report(NoRetryOnNonRetryableFailure(topicPartition, record.offset, cause)) .as(LastHandleResult(lastHandleSucceeded = false, shouldContinue = false)) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryHelper.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryHelper.scala index 1d1cd24c..a73b7dce 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryHelper.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryHelper.scala @@ -1,23 +1,32 @@ package com.wixpress.dst.greyhound.core.consumer.retry +import java.time.{Duration => JavaDuration, Instant} +import java.util.concurrent.TimeUnit.MILLISECONDS +import java.util.regex.Pattern +import com.wixpress.dst.greyhound.core.Serdes.StringSerde import com.wixpress.dst.greyhound.core.consumer.domain.ConsumerSubscription.{TopicPattern, Topics} -import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, 
ConsumerSubscription} +import com.wixpress.dst.greyhound.core.consumer.retry.RetryAttempt.{currentTime, RetryAttemptNumber} import com.wixpress.dst.greyhound.core.consumer.retry.RetryDecision.{NoMoreRetries, RetryWith} +import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, ConsumerSubscription} import com.wixpress.dst.greyhound.core.consumer.retry.RetryRecordHandlerMetric.WaitingForRetry import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics.report import com.wixpress.dst.greyhound.core.producer.ProducerRecord -import com.wixpress.dst.greyhound.core.{Group, Topic} +import com.wixpress.dst.greyhound.core.{durationDeserializer, instantDeserializer, Group, Headers, Topic} +import zio.Clock +import zio.Duration import zio.Schedule.spaced -import zio.{Chunk, Clock, Duration, URIO, _} +import zio.{Chunk, UIO, URIO, _} -import java.time.Instant -import java.util.regex.Pattern import scala.util.Try trait NonBlockingRetryHelper { def retryTopicsFor(originalTopic: Topic): Set[Topic] + def retryAttempt(topic: Topic, headers: Headers, subscription: ConsumerSubscription)( + implicit trace: Trace + ): UIO[Option[RetryAttempt]] + def retryDecision[E]( retryAttempt: Option[RetryAttempt], record: ConsumerRecord[Chunk[Byte], Chunk[Byte]], @@ -25,7 +34,7 @@ trait NonBlockingRetryHelper { subscription: ConsumerSubscription )(implicit trace: Trace): URIO[Any, RetryDecision] - def retrySteps: Int = retryTopicsFor("").size + def retrySteps = retryTopicsFor("").size } object NonBlockingRetryHelper { @@ -40,70 +49,82 @@ object NonBlockingRetryHelper { override def retryTopicsFor(topic: Topic): Set[Topic] = policy(topic).intervals.indices.foldLeft(Set.empty[String])((acc, attempt) => acc + s"$topic-$group-retry-$attempt") + override def retryAttempt(topic: Topic, headers: Headers, subscription: ConsumerSubscription)( + implicit trace: Trace + ): UIO[Option[RetryAttempt]] = { + (for { + submitted <- headers.get(RetryHeader.Submitted, instantDeserializer) + backoff <- headers.get(RetryHeader.Backoff, durationDeserializer) + originalTopic <- headers.get[String](RetryHeader.OriginalTopic, StringSerde) + } yield for { + ta <- topicAttempt(subscription, topic, originalTopic) + TopicAttempt(originalTopic, attempt) = ta + s <- submitted + b <- backoff + } yield RetryAttempt(originalTopic, attempt, s, b)) + .catchAll(_ => ZIO.none) + } + + private def topicAttempt( + subscription: ConsumerSubscription, + topic: Topic, + originalTopicHeader: Option[String] + ) = + subscription match { + case _: Topics => extractTopicAttempt(group, topic) + case _: TopicPattern => + extractTopicAttemptFromPatternRetryTopic(group, topic, originalTopicHeader) + } + override def retryDecision[E]( retryAttempt: Option[RetryAttempt], record: ConsumerRecord[Chunk[Byte], Chunk[Byte]], error: E, subscription: ConsumerSubscription - )(implicit trace: Trace): URIO[Any, RetryDecision] = Clock.instant.map(now => { - val blockingRetriesBefore = RetryAttempt.maxBlockingAttempts( - NonBlockingRetryHelper.originalTopic(record.topic, group), - retryConfig - ).getOrElse(0) - - // attempt if present contains full number of retries - val nextNonBlockingAttempt = retryAttempt.fold(0)(_.attempt + 1 - blockingRetriesBefore) - val nextRetryAttempt = nextNonBlockingAttempt + blockingRetriesBefore + )(implicit trace: Trace): URIO[Any, RetryDecision] = currentTime.map(now => { + val nextRetryAttempt = retryAttempt.fold(0)(_.attempt + 1) val originalTopic = 
retryAttempt.fold(record.topic)(_.originalTopic) val retryTopic = subscription match { - case _: TopicPattern => patternRetryTopic(group, nextNonBlockingAttempt) - case _: Topics => fixedRetryTopic(originalTopic, group, nextNonBlockingAttempt) + case _: TopicPattern => patternRetryTopic(group, nextRetryAttempt) + case _: Topics => fixedRetryTopic(originalTopic, group, nextRetryAttempt) } val topicRetryPolicy = policy(record.topic) topicRetryPolicy.intervals - .lift(nextNonBlockingAttempt) + .lift(nextRetryAttempt) .map { backoff => - val attempt = RetryAttempt( - attempt = nextRetryAttempt, - originalTopic = originalTopic, - submittedAt = now, - backoff = backoff - ) topicRetryPolicy.recordMutate( ProducerRecord( topic = retryTopic, value = record.value, key = record.key, partition = None, - headers = record.headers ++ RetryAttempt.toHeaders(attempt) + headers = record.headers + + (RetryHeader.Submitted -> toChunk(now.toEpochMilli)) + + (RetryHeader.Backoff -> toChunk(backoff.toMillis)) + + (RetryHeader.OriginalTopic -> toChunk(originalTopic)) + + (RetryHeader.RetryAttempt -> toChunk(nextRetryAttempt)) ) ) } .fold[RetryDecision](NoMoreRetries)(RetryWith) }) - } - private[retry] def attemptNumberFromTopic( - subscription: ConsumerSubscription, - topic: Topic, - originalTopicHeader: Option[String], - group: Group - ) = - subscription match { - case _: Topics => extractTopicAttempt(group, topic) - case _: TopicPattern => - extractTopicAttemptFromPatternRetryTopic(group, topic, originalTopicHeader) + private def toChunk(long: Long): Chunk[Byte] = + Chunk.fromArray(long.toString.getBytes) + + private def toChunk(str: String): Chunk[Byte] = + Chunk.fromArray(str.getBytes) } - private def extractTopicAttempt(group: Group, inputTopic: Topic) = + private def extractTopicAttempt[E](group: Group, inputTopic: Topic) = inputTopic.split(s"-$group-retry-").toSeq match { case Seq(topic, attempt) if Try(attempt.toInt).isSuccess => Some(TopicAttempt(topic, attempt.toInt)) - case _ => None + case _ => None } - private def extractTopicAttemptFromPatternRetryTopic( + private def extractTopicAttemptFromPatternRetryTopic[E]( group: Group, inputTopic: Topic, originalTopicHeader: Option[String] @@ -145,27 +166,49 @@ object DelayHeaders { val Backoff = "backOffTimeMs" } -object RetryUtil { - def sleep(attempt: RetryAttempt)(implicit trace: Trace): URIO[GreyhoundMetrics, Unit] = - sleep(attempt.submittedAt, attempt.backoff) race - report(WaitingForRetry(attempt.originalTopic, attempt.attempt, attempt.submittedAt.toEpochMilli, attempt.backoff.toMillis)) - .repeat(spaced(60.seconds)) - .unit +object RetryHeader { + val Submitted = "submitTimestamp" + val Backoff = DelayHeaders.Backoff + val OriginalTopic = "GH_OriginalTopic" + val RetryAttempt = "GH_RetryAttempt" +} + +case class RetryAttempt( + originalTopic: Topic, + attempt: RetryAttemptNumber, + submittedAt: Instant, + backoff: Duration +) { + + def sleep(implicit trace: Trace): URIO[GreyhoundMetrics, Unit] = + RetryUtil.sleep(submittedAt, backoff) race reportWaitingInIntervals(every = 60.seconds) + private def reportWaitingInIntervals(every: Duration) = + report(WaitingForRetry(originalTopic, attempt, submittedAt.toEpochMilli, backoff.toMillis)) + .repeat(spaced(every)) + .unit +} + +object RetryUtil { def sleep(submittedAt: Instant, backoff: Duration)(implicit trace: Trace): URIO[Any, Unit] = { val expiresAt = submittedAt.plus(backoff.asJava) - Clock.instant + currentTime .map(_.isAfter(expiresAt)) .flatMap(expired => if (expired) ZIO.unit else - 
ZIO.sleep(1.second).repeatUntilZIO(_ => Clock.instant.map(_.isAfter(expiresAt))).unit + ZIO.sleep(1.seconds).repeatUntilZIO(_ => currentTime.map(_.isAfter(expiresAt))).unit ) } } private case class TopicAttempt(originalTopic: Topic, attempt: Int) +object RetryAttempt { + type RetryAttemptNumber = Int + val currentTime = Clock.currentTime(MILLISECONDS).map(Instant.ofEpochMilli) +} + sealed trait RetryDecision object RetryDecision { diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryRecordHandler.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryRecordHandler.scala index a6ff6560..a15f1e5b 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryRecordHandler.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryRecordHandler.scala @@ -31,12 +31,11 @@ private[retry] object NonBlockingRetryRecordHandler { retryConfig: RetryConfig, subscription: ConsumerSubscription, nonBlockingRetryHelper: NonBlockingRetryHelper, - groupId: Group, awaitShutdown: TopicPartition => UIO[AwaitShutdown] )(implicit evK: K <:< Chunk[Byte], evV: V <:< Chunk[Byte]): NonBlockingRetryRecordHandler[V, K, R] = new NonBlockingRetryRecordHandler[V, K, R] { override def handle(record: ConsumerRecord[K, V]): ZIO[GreyhoundMetrics with R, Nothing, Any] = { - RetryAttempt.extract(record.headers, record.topic, groupId, subscription, Some(retryConfig)).flatMap { retryAttempt => + nonBlockingRetryHelper.retryAttempt(record.topic, record.headers, subscription).flatMap { retryAttempt => maybeDelayRetry(record, retryAttempt) *> handler.handle(record).catchAll { case Right(_: NonRetriableException) => ZIO.unit @@ -57,7 +56,7 @@ private[retry] object NonBlockingRetryRecordHandler { WaitingBeforeRetry(record.topic, retryAttempt, record.partition, record.offset, correlationId) ) *> awaitShutdown(record.topicPartition) - .flatMap(_.interruptOnShutdown(RetryUtil.sleep(retryAttempt))) + .flatMap(_.interruptOnShutdown(retryAttempt.sleep)) .reporting(r => DoneWaitingBeforeRetry(record.topic, record.partition, record.offset, retryAttempt, r.duration, r.failed, correlationId) ) @@ -75,7 +74,7 @@ private[retry] object NonBlockingRetryRecordHandler { override def handleAfterBlockingFailed( record: ConsumerRecord[K, V] ): ZIO[GreyhoundMetrics with R, Nothing, Any] = { - RetryAttempt.extract(record.headers, record.topic, groupId, subscription, Some(retryConfig)).flatMap { retryAttempt => + nonBlockingRetryHelper.retryAttempt(record.topic, record.headers, subscription).flatMap { retryAttempt => maybeRetry(retryAttempt, BlockingHandlerFailed, record) } } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala deleted file mode 100644 index 9265eab8..00000000 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala +++ /dev/null @@ -1,98 +0,0 @@ -package com.wixpress.dst.greyhound.core.consumer.retry - -import com.wixpress.dst.greyhound.core.Serdes.StringSerde -import com.wixpress.dst.greyhound.core._ -import com.wixpress.dst.greyhound.core.consumer.domain.ConsumerSubscription -import com.wixpress.dst.greyhound.core.consumer.retry.NonBlockingRetryHelper.attemptNumberFromTopic -import com.wixpress.dst.greyhound.core.consumer.retry.RetryAttempt.RetryAttemptNumber -import zio._ - -import java.time.Instant - -/** - * Description 
of a retry attempt - * @param attempt contains which attempt is it, starting from 0 including blocking and non-blocking attempts - */ -case class RetryAttempt( - originalTopic: Topic, - attempt: RetryAttemptNumber, - submittedAt: Instant, - backoff: Duration -) - -object RetryHeader { - val Submitted = "submitTimestamp" - val Backoff = DelayHeaders.Backoff - val OriginalTopic = "GH_OriginalTopic" - val RetryAttempt = "GH_RetryAttempt" -} - -object RetryAttempt { - type RetryAttemptNumber = Int - - private def toChunk(str: String): Chunk[Byte] = Chunk.fromArray(str.getBytes) - - def toHeaders(attempt: RetryAttempt): Headers = Headers( - RetryHeader.Submitted -> toChunk(attempt.submittedAt.toEpochMilli.toString), - RetryHeader.Backoff -> toChunk(attempt.backoff.toMillis.toString), - RetryHeader.OriginalTopic -> toChunk(attempt.originalTopic), - RetryHeader.RetryAttempt -> toChunk(attempt.attempt.toString), - ) - - private case class RetryAttemptHeaders( - originalTopic: Option[Topic], - attempt: Option[RetryAttemptNumber], - submittedAt: Option[Instant], - backoff: Option[Duration] - ) - - private def fromHeaders(headers: Headers): Task[RetryAttemptHeaders] = - for { - submitted <- headers.get(RetryHeader.Submitted, instantDeserializer) - backoff <- headers.get(RetryHeader.Backoff, durationDeserializer) - topic <- headers.get[String](RetryHeader.OriginalTopic, StringSerde) - attempt <- headers.get(RetryHeader.RetryAttempt, longDeserializer) - } yield RetryAttemptHeaders(topic, attempt.map(_.toInt), submitted, backoff) - - /** @return None on infinite blocking retries */ - def maxBlockingAttempts(topic: Topic, retryConfig: Option[RetryConfig]): Option[Int] = - retryConfig.map(_.blockingBackoffs(topic)()).fold(Option(0)) { - case finite if finite.hasDefiniteSize => Some(finite.size) - case _ => None - } - - /** @return None on infinite retries */ - def maxOverallAttempts(topic: Topic, retryConfig: Option[RetryConfig]): Option[Int] = - maxBlockingAttempts(topic, retryConfig).map { - _ + retryConfig.fold(0)(_.nonBlockingBackoffs(topic).length) - } - - def extract( - headers: Headers, - topic: Topic, - group: Group, - subscription: ConsumerSubscription, - retryConfig: Option[RetryConfig], - )(implicit trace: Trace): UIO[Option[RetryAttempt]] = { - - def nonBlockingAttempt(hs: RetryAttemptHeaders): Option[RetryAttempt] = - for { - submitted <- hs.submittedAt - backoff <- hs.backoff - TopicAttempt(originalTopic, attempt) <- attemptNumberFromTopic(subscription, topic, hs.originalTopic, group) - blockingRetries = maxBlockingAttempts(originalTopic, retryConfig).getOrElse(0) - } yield RetryAttempt(originalTopic, blockingRetries + attempt, submitted, backoff) - - def blockingAttempt(hs: RetryAttemptHeaders): Option[RetryAttempt] = - for { - submitted <- hs.submittedAt - backoff <- hs.backoff - originalTopic <- hs.originalTopic - attempt <- hs.attempt - } yield RetryAttempt(originalTopic, attempt, submitted, backoff) - - fromHeaders(headers).map { hs => - nonBlockingAttempt(hs) orElse blockingAttempt(hs) - } - }.catchAll(_ => ZIO.none) -} diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryRecordHandler.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryRecordHandler.scala index 8ee8ab9f..af0749c6 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryRecordHandler.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryRecordHandler.scala @@ -33,7 +33,7 @@ object RetryRecordHandler { 
): RecordHandler[R with R2 with GreyhoundMetrics, Nothing, K, V] = { val nonBlockingHandler = - NonBlockingRetryRecordHandler(handler, producer, retryConfig, subscription, nonBlockingRetryHelper, groupId, awaitShutdown) + NonBlockingRetryRecordHandler(handler, producer, retryConfig, subscription, nonBlockingRetryHelper, awaitShutdown) val blockingHandler = BlockingRetryRecordHandler(groupId, handler, retryConfig, blockingState, nonBlockingHandler, awaitShutdown) val blockingAndNonBlockingHandler = BlockingAndNonBlockingRetryRecordHandler(groupId, blockingHandler, nonBlockingHandler) @@ -55,4 +55,15 @@ object RetryRecordHandler { record.headers.get[String](key, StringSerde).catchAll(_ => ZIO.none) } +object ZIOHelper { + def foreachWhile[R, E, A](as: Iterable[A])(f: A => ZIO[R, E, LastHandleResult])(implicit trace: Trace): ZIO[R, E, LastHandleResult] = + ZIO.succeed(as.iterator).flatMap { i => + def loop: ZIO[R, E, LastHandleResult] = + if (i.hasNext) f(i.next).flatMap(result => if (result.shouldContinue) loop else ZIO.succeed(result)) + else ZIO.succeed(LastHandleResult(lastHandleSucceeded = false, shouldContinue = false)) + + loop + } +} + case class LastHandleResult(lastHandleSucceeded: Boolean, shouldContinue: Boolean) diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttemptTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttemptTest.scala deleted file mode 100644 index 3139f845..00000000 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttemptTest.scala +++ /dev/null @@ -1,78 +0,0 @@ -package com.wixpress.dst.greyhound.core.consumer.retry - -import com.wixpress.dst.greyhound.core.consumer.domain.ConsumerSubscription -import com.wixpress.dst.greyhound.core.testkit.BaseTest -import zio.test.TestEnvironment - -import java.time.{Duration, Instant} -import scala.util.Random -import scala.concurrent.duration._ - -class RetryAttemptTest extends BaseTest[TestEnvironment] { - - "RetryAttempt.extract" should { - "deserialize attempt from headers for blocking retries" in { - val attempt = randomRetryAttempt - val headers = RetryAttempt.toHeaders(attempt) - val subscription = ConsumerSubscription.Topics(Set(attempt.originalTopic)) - for (result <- RetryAttempt.extract(headers, attempt.originalTopic, randomStr, subscription, None)) - yield result must beSome(attempt) - } - "deserialize attempt from headers and topic for non-blocking retries" in { - val attempt = randomRetryAttempt - // topic and attempt must be extracted from retryTopic - val headers = RetryAttempt.toHeaders(attempt.copy(originalTopic = "", attempt = -1)) - val subscription = ConsumerSubscription.Topics(Set(attempt.originalTopic)) - val group = randomStr - val retryTopic = NonBlockingRetryHelper.fixedRetryTopic(attempt.originalTopic, group, attempt.attempt) - for (result <- RetryAttempt.extract(headers, retryTopic, group, subscription, None)) - yield result must beSome(attempt) - } - "deserialize attempt for non-blocking retry after blocking retries" in { - val attempt = randomRetryAttempt - val headers = RetryAttempt.toHeaders(attempt) - val subscription = ConsumerSubscription.Topics(Set(attempt.originalTopic)) - val group = randomStr - val retries = RetryConfig.blockingFollowedByNonBlockingRetry( - blockingBackoffs = 1.milli :: 1.second :: Nil, - nonBlockingBackoffs = 5.minutes :: Nil, - ) - val retryTopic = NonBlockingRetryHelper.fixedRetryTopic(attempt.originalTopic, group, attempt.attempt) - for (result <- 
RetryAttempt.extract(headers, retryTopic, group, subscription, Some(retries))) - yield result must beSome(attempt.copy(attempt = attempt.attempt + 2)) // with 2 blocking retries before - } - } - - "RetryAttempt.maxOverallAttempts" should { - "return 0 if no retries configured" in { - RetryAttempt.maxOverallAttempts(randomStr, None) must beSome(0) - } - "return max attempts for blocking retries" in { - val config = RetryConfig.finiteBlockingRetry(1.milli, 1.second) - RetryAttempt.maxOverallAttempts(randomStr, Some(config)) must beSome(2) - } - "return max attempts for non-blocking retries" in { - val config = RetryConfig.nonBlockingRetry(1.milli, 1.second, 5.minutes) - RetryAttempt.maxOverallAttempts(randomStr, Some(config)) must beSome(3) - } - "return max attempts for blocking retries followed by non-blocking" in { - val config = RetryConfig.blockingFollowedByNonBlockingRetry(1.milli :: 2.seconds :: Nil, 1.minute :: Nil) - RetryAttempt.maxOverallAttempts(randomStr, Some(config)) must beSome(3) - } - "return None for infinite blocking retries" in { - val config = RetryConfig.infiniteBlockingRetry(1.milli) - RetryAttempt.maxOverallAttempts(randomStr, Some(config)) must beNone - } - } - - override def env = testEnvironment - - private def randomStr = Random.alphanumeric.take(10).mkString - - private def randomRetryAttempt = RetryAttempt( - originalTopic = randomStr, - attempt = Random.nextInt(1000), - submittedAt = Instant.ofEpochMilli(math.abs(Random.nextLong())), - backoff = Duration.ofMillis(Random.nextInt(100000)) - ) -} diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConsumerRecordHandlerTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConsumerRecordHandlerTest.scala index b34e8d82..c713fb12 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConsumerRecordHandlerTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConsumerRecordHandlerTest.scala @@ -4,7 +4,7 @@ import java.time.Instant import com.wixpress.dst.greyhound.core.Serdes._ import com.wixpress.dst.greyhound.core._ import com.wixpress.dst.greyhound.core.consumer.domain.ConsumerSubscription.Topics -import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, RecordHandler} +import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, ConsumerSubscription, RecordHandler} import com.wixpress.dst.greyhound.core.consumer.retry.BlockingState.{Blocked, Blocking => InternalBlocking, IgnoringAll, IgnoringOnce} import com.wixpress.dst.greyhound.core.consumer.retry.RetryConsumerRecordHandlerTest.{offset, partition, _} import com.wixpress.dst.greyhound.core.consumer.retry.RetryRecordHandlerMetric.{BlockingIgnoredForAllFor, BlockingIgnoredOnceFor, BlockingRetryHandlerInvocationFailed, NoRetryOnNonRetryableFailure} @@ -21,6 +21,8 @@ import zio.Random.{nextBytes, nextIntBounded} import zio.managed.UManaged import zio.test.TestClock +import scala.concurrent.TimeoutException + class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics] { override def env: UManaged[ZEnvironment[TestClock with TestMetrics]] = @@ -50,6 +52,9 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics _ <- retryHandler.handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L)) record <- producer.records.take now <- currentTime + retryAttempt <- IntSerde.serialize(retryTopic, 0) + submittedAt <- 
InstantSerde.serialize(retryTopic, now) + backoff <- DurationSerde.serialize(retryTopic, 1.second) } yield { record === ProducerRecord( @@ -57,7 +62,7 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics value, Some(key), partition = None, - headers = RetryAttempt.toHeaders(RetryAttempt(topic, 0, now, 1.second)) + headers = Headers("retry-attempt" -> retryAttempt, "retry-submitted-at" -> submittedAt, "retry-backoff" -> backoff) ) } } @@ -81,7 +86,10 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics ) value <- bytes begin <- currentTime - headers = RetryAttempt.toHeaders(RetryAttempt(topic, 0, begin, 1.second)) + retryAttempt <- IntSerde.serialize(retryTopic, 0) + submittedAt <- InstantSerde.serialize(retryTopic, begin) + backoff <- DurationSerde.serialize(retryTopic, 1.second) + headers = Headers("retry-attempt" -> retryAttempt, "retry-submitted-at" -> submittedAt, "retry-backoff" -> backoff) _ <- retryHandler.handle(ConsumerRecord(retryTopic, partition, offset, headers, None, value, 0L, 0L, 0L)).fork _ <- TestClock.adjust(1.second).repeat(Schedule.once) end <- executionTime.await.disconnect.timeoutFail(TimeoutWaitingForAssertion)(5.seconds) @@ -396,8 +404,7 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics producer <- FakeProducer.make topic <- randomTopicName blockingState <- Ref.make[Map[BlockingTarget, BlockingState]](Map.empty) - retryHelper = FakeRetryHelper(topic) - now <- Clock.instant + retryHelper = alwaysBackOffRetryHelper(3.seconds) handling <- AwaitShutdown.makeManaged.flatMap { awaitShutdown => val retryHandler = RetryRecordHandler.withRetries( group, @@ -409,12 +416,11 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics retryHelper, awaitShutdown = _ => ZIO.succeed(awaitShutdown) ) - val headers = RetryAttempt.toHeaders(RetryAttempt(topic, 0, now, 3.seconds)) for { key <- bytes value <- bytes handling <- retryHandler - .handle(ConsumerRecord(topic, partition, offset, headers, Some(key), value, 0L, 0L, 0L)) + .handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L)) .forkDaemon } yield handling } @@ -528,6 +534,18 @@ object RetryConsumerRecordHandlerTest { def randomTopicName = randomStr.map(suffix => s"some-topic-$suffix") val cause = new RuntimeException("cause") + + def alwaysBackOffRetryHelper(backoff: Duration) = { + new FakeNonBlockingRetryHelper { + override val topic: Topic = "" + + override def retryAttempt(topic: Topic, headers: Headers, subscription: ConsumerSubscription)( + implicit trace: Trace + ): UIO[Option[RetryAttempt]] = ZIO.succeed( + Some(RetryAttempt(topic, 1, Instant.now, backoff)) + ) + } + } } object TimeoutWaitingForAssertion extends RuntimeException diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/FakeRetryHelper.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/FakeRetryHelper.scala index 619ada36..cee13948 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/FakeRetryHelper.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/FakeRetryHelper.scala @@ -1,21 +1,35 @@ package com.wixpress.dst.greyhound.core.testkit import java.time.Instant +import java.util.concurrent.TimeUnit.MILLISECONDS + +import com.wixpress.dst.greyhound.core.Serdes._ import com.wixpress.dst.greyhound.core._ import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, ConsumerSubscription} import 
com.wixpress.dst.greyhound.core.consumer.retry.RetryDecision.{NoMoreRetries, RetryWith} -import com.wixpress.dst.greyhound.core.consumer.retry.{BlockingHandlerFailed, NonBlockingRetryHelper, RetryAttempt, RetryDecision, RetryHeader} +import com.wixpress.dst.greyhound.core.consumer.retry.{BlockingHandlerFailed, NonBlockingRetryHelper, RetryAttempt, RetryDecision} import com.wixpress.dst.greyhound.core.producer.ProducerRecord import com.wixpress.dst.greyhound.core.testkit.FakeRetryHelper._ import zio._ import zio.Clock +import zio.Clock + trait FakeNonBlockingRetryHelper extends NonBlockingRetryHelper { val topic: Topic override def retryTopicsFor(originalTopic: Topic): Set[Topic] = Set(s"$originalTopic-retry") + override def retryAttempt(topic: Topic, headers: Headers, subscription: ConsumerSubscription)( + implicit trace: Trace + ): UIO[Option[RetryAttempt]] = + (for { + attempt <- headers.get(Header.Attempt, IntSerde) + submittedAt <- headers.get(Header.SubmittedAt, InstantSerde) + backoff <- headers.get(Header.Backoff, DurationSerde) + } yield retryAttemptInternal(topic, attempt, submittedAt, backoff)).orElse(ZIO.none) + override def retryDecision[E]( retryAttempt: Option[RetryAttempt], record: ConsumerRecord[Chunk[Byte], Chunk[Byte]], @@ -24,23 +38,35 @@ trait FakeNonBlockingRetryHelper extends NonBlockingRetryHelper { )(implicit trace: Trace): URIO[Any, RetryDecision] = error match { case RetriableError | BlockingHandlerFailed => - currentTime.map(now => - RetryWith(recordFrom(now, retryAttempt, record)) + currentTime.flatMap(now => + recordFrom(now, retryAttempt, record) + .fold(_ => NoMoreRetries, RetryWith) ) case NonRetriableError => ZIO.succeed(NoMoreRetries) } + private def retryAttemptInternal(topic: Topic, attempt: Option[Int], submittedAt: Option[Instant], backoff: Option[Duration]) = + for { + a <- attempt + s <- submittedAt + b <- backoff + } yield RetryAttempt(topic, a, s, b) + private def recordFrom(now: Instant, retryAttempt: Option[RetryAttempt], record: ConsumerRecord[Chunk[Byte], Chunk[Byte]])( implicit trace: Trace ) = { val nextRetryAttempt = retryAttempt.fold(0)(_.attempt + 1) - ProducerRecord( + for { + retryAttempt <- IntSerde.serialize(topic, nextRetryAttempt) + submittedAt <- InstantSerde.serialize(topic, now) + backoff <- DurationSerde.serialize(topic, 1.second) + } yield ProducerRecord( topic = s"$topic-retry", value = record.value, key = record.key, partition = None, - headers = RetryAttempt.toHeaders(RetryAttempt(topic, nextRetryAttempt, now, 1.second)) + headers = Headers(Header.Attempt -> retryAttempt, Header.SubmittedAt -> submittedAt, Header.Backoff -> backoff) ) } } @@ -49,8 +75,13 @@ case class FakeRetryHelper(topic: Topic) extends FakeNonBlockingRetryHelper object FakeRetryHelper { implicit private val trace = Trace.empty + object Header { + val Attempt = "retry-attempt" + val SubmittedAt = "retry-submitted-at" + val Backoff = "retry-backoff" + } - val currentTime: UIO[Instant] = Clock.instant + val currentTime = Clock.currentTime(MILLISECONDS).map(Instant.ofEpochMilli) } sealed trait HandlerError From 87d8c14db8b43567b5021f6f2d530de72b612ff2 Mon Sep 17 00:00:00 2001 From: Noam Berman Date: Mon, 3 Apr 2023 17:26:15 +0300 Subject: [PATCH 05/52] [greyhound] remove internal topic creation - wix adapter (#33820) not create retry topics inside RecordConsumer builder, from wix adapter (it's already created). Removing last reference to an actual AdminClient in wix-adapter when asking to use proxy. 
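For illustration only, a minimal sketch of how a service whose retry topics are provisioned externally (as in the Wix adapter's proxy setup described above) could opt out of the builder's internal topic creation via the new createRetryTopics flag. The leading constructor arguments used here (bootstrapServers, group, initialSubscription) are assumptions about the existing RecordConsumerConfig signature and are not part of this diff; the flag itself defaults to true, so existing callers keep the old behaviour.

    import com.wixpress.dst.greyhound.core.consumer.RecordConsumerConfig
    import com.wixpress.dst.greyhound.core.consumer.domain.ConsumerSubscription
    import com.wixpress.dst.greyhound.core.consumer.retry.RetryConfig
    import zio._

    // Hypothetical config: retry topics already exist, so skip the internal
    // AdminClient.createTopics call performed when the consumer is created.
    val config = RecordConsumerConfig(
      bootstrapServers = "localhost:9092",                                  // assumed field name
      group = "my-group",                                                   // assumed field name
      initialSubscription = ConsumerSubscription.Topics(Set("my-topic")),   // assumed field name
      retryConfig = Some(RetryConfig.nonBlockingRetry(1.second, 10.seconds)),
      createRetryTopics = false
    )
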
GitOrigin-RevId: fb1416141b6b05559b1b606576cc03556b6a78f4 --- .../dst/greyhound/core/consumer/RecordConsumer.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala index 883f17a8..1b426d60 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala @@ -79,7 +79,7 @@ object RecordConsumer { (initialSubscription, topicsToCreate) = config.retryConfig.fold((config.initialSubscription, Set.empty[Topic]))(policy => maybeAddRetryTopics(policy, config, nonBlockingRetryHelper) ) - _ <- AdminClient + _ <- ZIO.when(config.createRetryTopics)(AdminClient .make(AdminClientConfig(config.bootstrapServers, config.kafkaAuthProperties), config.consumerAttributes) .tap(client => client.createTopics( @@ -87,7 +87,7 @@ object RecordConsumer { TopicConfig(topic, partitions = 1, replicationFactor = 1, cleanupPolicy = CleanupPolicy.Delete(86400000L)) ) ) - ) + )) blockingState <- Ref.make[Map[BlockingTarget, BlockingState]](Map.empty) blockingStateResolver = BlockingStateResolver(blockingState) workersShutdownRef <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) @@ -321,7 +321,8 @@ case class RecordConsumerConfig( decryptor: Decryptor[Any, Throwable, Chunk[Byte], Chunk[Byte]] = new NoOpDecryptor, retryProducerAttributes: Map[String, String] = Map.empty, commitMetadataString: Metadata = OffsetAndMetadata.NO_METADATA, - rewindUncommittedOffsetsBy: Duration = 0.millis + rewindUncommittedOffsetsBy: Duration = 0.millis, + createRetryTopics: Boolean = true ) extends CommonGreyhoundConfig { override def kafkaProps: Map[String, String] = extraProperties From 199c66cebe54e89c9608b72351b22200f9aa635a Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Tue, 4 Apr 2023 12:03:59 +0300 Subject: [PATCH 06/52] [greyhound] parallel consumer OffsetsAndGaps (#33605) GitOrigin-RevId: f178c94663c7cbcb22bd7266c3e15919d8997d8c --- .../core/consumer/OffsetsAndGaps.scala | 92 +++++++++++++++++++ .../core/consumer/OffsetsAndGapsTest.scala | 62 +++++++++++++ 2 files changed, 154 insertions(+) create mode 100644 core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala create mode 100644 core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGapsTest.scala diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala new file mode 100644 index 00000000..f5718555 --- /dev/null +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala @@ -0,0 +1,92 @@ +package com.wixpress.dst.greyhound.core.consumer + +import com.wixpress.dst.greyhound.core.{Offset, TopicPartition} +import zio._ + +trait OffsetsAndGaps { + def getCommittableAndClear: UIO[Map[TopicPartition, OffsetAndGaps]] + + def gapsForPartition(partition: TopicPartition): UIO[Seq[Gap]] + + def update(partition: TopicPartition, batch: Seq[Offset]): UIO[Unit] + + def contains(partition: TopicPartition, offset: Offset): UIO[Boolean] +} + +object OffsetsAndGaps { + def make: UIO[OffsetsAndGaps] = + Ref.make(Map.empty[TopicPartition, OffsetAndGaps]).map { ref => + new OffsetsAndGaps { + override def getCommittableAndClear: 
UIO[Map[TopicPartition, OffsetAndGaps]] = + ref.modify(offsetsAndGaps => { + val committable = offsetsAndGaps.filter(_._2.committable) + val updated = offsetsAndGaps.mapValues(_.markCommitted) + (committable, updated) + }) + + override def gapsForPartition(partition: TopicPartition): UIO[Seq[Gap]] = + ref.get.map(_.get(partition).fold(Seq.empty[Gap])(_.gaps.sortBy(_.start))) + + override def update(partition: TopicPartition, batch: Seq[Offset]): UIO[Unit] = + ref.update { offsetsAndGaps => + val sortedBatch = batch.sorted + val maxBatchOffset = sortedBatch.last + val maybeOffsetAndGaps = offsetsAndGaps.get(partition) + val prevOffset = maybeOffsetAndGaps.fold(-1L)(_.offset) + val partitionOffsetAndGaps = maybeOffsetAndGaps.fold(OffsetAndGaps(maxBatchOffset))(identity) + + val newGaps = gapsInBatch(sortedBatch, prevOffset) + + val updatedGaps = updateGapsByOffsets( + partitionOffsetAndGaps.gaps ++ newGaps, + sortedBatch + ) + + offsetsAndGaps + (partition -> OffsetAndGaps(maxBatchOffset max prevOffset, updatedGaps)) + }.unit + + override def contains(partition: TopicPartition, offset: Offset): UIO[Boolean] = + ref.get.map(_.get(partition).fold(false)(_.contains(offset))) + + private def gapsInBatch(batch: Seq[Offset], prevLastOffset: Offset): Seq[Gap] = + batch.sorted + .foldLeft(Seq.empty[Gap], prevLastOffset) { + case ((gaps, lastOffset), offset) => + if (offset <= lastOffset) (gaps, lastOffset) + else if (offset == lastOffset + 1) (gaps, offset) + else { + val newGap = Gap(lastOffset + 1, offset - 1) + (newGap +: gaps, offset) + } + } + ._1 + .reverse + + private def updateGapsByOffsets(gaps: Seq[Gap], offsets: Seq[Offset]): Seq[Gap] = { + val gapsToOffsets = gaps.map(gap => gap -> offsets.filter(o => o >= gap.start && o <= gap.end)).toMap + gapsToOffsets.flatMap { + case (gap, offsets) => + if (offsets.isEmpty) Seq(gap) + else if (offsets.size == (gap.size)) Seq.empty[Gap] + else gapsInBatch(offsets ++ Seq(gap.start - 1, gap.end + 1), gap.start - 2) + }.toSeq + } + } + } +} + +case class Gap(start: Offset, end: Offset) { + def contains(offset: Offset): Boolean = start <= offset && offset <= end + + def size: Long = end - start + 1 +} + +case class OffsetAndGaps(offset: Offset, gaps: Seq[Gap], committable: Boolean = true) { + def contains(offset: Offset): Boolean = gaps.exists(_.contains(offset)) + + def markCommitted: OffsetAndGaps = copy(committable = false) +} + +object OffsetAndGaps { + def apply(offset: Offset): OffsetAndGaps = OffsetAndGaps(offset, Seq.empty[Gap]) +} diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGapsTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGapsTest.scala new file mode 100644 index 00000000..df765ce5 --- /dev/null +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGapsTest.scala @@ -0,0 +1,62 @@ +package com.wixpress.dst.greyhound.core.consumer + +import com.wixpress.dst.greyhound.core.TopicPartition +import com.wixpress.dst.greyhound.core.consumer.OffsetGapsTest._ +import com.wixpress.dst.greyhound.core.testkit.BaseTestNoEnv + +class OffsetsAndGapsTestGapsTest extends BaseTestNoEnv { + + "calculate gaps created by handled batch" in { + for { + offsetGaps <- OffsetsAndGaps.make + _ <- offsetGaps.update(topicPartition, Seq(1L, 3L, 7L)) + currentGaps <- offsetGaps.gapsForPartition(topicPartition) + } yield currentGaps must beEqualTo(Seq(Gap(0L, 0L), Gap(2L, 2L), Gap(4L, 6L))) + } + + "update offset and gaps according to handled batch" in { + for { + offsetGaps <- 
OffsetsAndGaps.make + _ <- offsetGaps.update(topicPartition, Seq(1L, 3L, 7L)) + _ <- offsetGaps.update(topicPartition, Seq(2L, 5L)) + getCommittableAndClear <- offsetGaps.getCommittableAndClear + } yield getCommittableAndClear must havePair(topicPartition -> OffsetAndGaps(7L, Seq(Gap(0L, 0L), Gap(4L, 4L), Gap(6L, 6L)))) + } + + "clear committable offsets" in { + for { + offsetGaps <- OffsetsAndGaps.make + _ <- offsetGaps.update(topicPartition, Seq(1L, 3L, 7L)) + _ <- offsetGaps.getCommittableAndClear + getCommittableAndClear <- offsetGaps.getCommittableAndClear + } yield getCommittableAndClear must beEmpty + } + + "do not clear gaps on retrieving current" in { + for { + offsetGaps <- OffsetsAndGaps.make + _ <- offsetGaps.update(topicPartition, Seq(1L, 3L, 7L)) + _ <- offsetGaps.gapsForPartition(topicPartition) + currentGaps <- offsetGaps.gapsForPartition(topicPartition) + } yield currentGaps must beEqualTo(Seq(Gap(0L, 0L), Gap(2L, 2L), Gap(4L, 6L))) + } + + "update with larger offset" in { + val partition0 = TopicPartition(topic, 0) + val partition1 = TopicPartition(topic, 1) + + for { + offsetGaps <- OffsetsAndGaps.make + _ <- offsetGaps.update(partition0, Seq(1L)) + _ <- offsetGaps.update(partition0, Seq(0L)) + _ <- offsetGaps.update(partition1, Seq(0L)) + current <- offsetGaps.getCommittableAndClear + } yield current must havePairs(partition0 -> OffsetAndGaps(1L, Seq()), partition1 -> OffsetAndGaps(0L, Seq())) + } + +} + +object OffsetGapsTest { + val topic = "some-topic" + val topicPartition = TopicPartition(topic, 0) +} From b46ad513a6af5cea1d934f7bde40728c900a5917 Mon Sep 17 00:00:00 2001 From: Natan Silnitsky Date: Tue, 4 Apr 2023 13:53:02 +0300 Subject: [PATCH 07/52] Blocking retries attempts tracking fix + fix transitive header bug (#33827) * Revert "Revert "Blocking retries attempts tracking fix" (#33818)" This reverts commit eb859fcf70860c06a0bd0492b746ed3ff00bc8ac. * fix bug on retryAttempt resolution - ignore propagated headers * CR change * fix test. 
change retry name to correctly structured one GitOrigin-RevId: 53fb7cf3efa315f496a00ade574397e30f20eeaf --- .../retry/BlockingRetryRecordHandler.scala | 47 +++++- .../retry/NonBlockingRetryHelper.scala | 137 ++++++------------ .../retry/NonBlockingRetryRecordHandler.scala | 7 +- .../core/consumer/retry/RetryAttempt.scala | 98 +++++++++++++ .../consumer/retry/RetryRecordHandler.scala | 13 +- .../consumer/retry/RetryAttemptTest.scala | 94 ++++++++++++ .../RetryConsumerRecordHandlerTest.scala | 35 ++--- .../core/testkit/FakeRetryHelper.scala | 43 +----- 8 files changed, 298 insertions(+), 176 deletions(-) create mode 100644 core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala create mode 100644 core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttemptTest.scala diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/BlockingRetryRecordHandler.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/BlockingRetryRecordHandler.scala index 502e31e1..2f5d59a9 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/BlockingRetryRecordHandler.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/BlockingRetryRecordHandler.scala @@ -3,9 +3,8 @@ package com.wixpress.dst.greyhound.core.consumer.retry import java.util.concurrent.TimeUnit import com.wixpress.dst.greyhound.core.{Group, TopicPartition} import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, RecordHandler} -import com.wixpress.dst.greyhound.core.consumer.retry.BlockingState.{Blocked, Blocking => InternalBlocking, IgnoringOnce} +import com.wixpress.dst.greyhound.core.consumer.retry.BlockingState.{Blocked, IgnoringOnce, Blocking => InternalBlocking} import com.wixpress.dst.greyhound.core.consumer.retry.RetryRecordHandlerMetric.{BlockingRetryHandlerInvocationFailed, DoneBlockingBeforeRetry, NoRetryOnNonRetryableFailure} -import com.wixpress.dst.greyhound.core.consumer.retry.ZIOHelper.foreachWhile import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics.report import com.wixpress.dst.greyhound.core.zioutils.AwaitShutdown @@ -31,7 +30,7 @@ private[retry] object BlockingRetryRecordHandler { override def handle(record: ConsumerRecord[K, V])(implicit trace: Trace): ZIO[GreyhoundMetrics with R, Nothing, LastHandleResult] = { val topicPartition = TopicPartition(record.topic, record.partition) - def pollBlockingStateWithSuspensions(interval: Duration, start: Long): URIO[GreyhoundMetrics, PollResult] = { + def pollBlockingStateWithSuspensions(record: ConsumerRecord[K, V], interval: Duration, start: Long): URIO[GreyhoundMetrics, PollResult] = { for { shouldBlock <- blockingStateResolver.resolve(record) shouldPollAgain <- @@ -43,14 +42,14 @@ private[retry] object BlockingRetryRecordHandler { } yield shouldPollAgain } - def blockOnErrorFor(interval: Duration) = { + def blockOnErrorFor(record: ConsumerRecord[K, V], interval: Duration) = { for { start <- currentTime(TimeUnit.MILLISECONDS) continueBlocking <- if (interval.toMillis > 100L) { awaitShutdown(record.topicPartition).flatMap( _.interruptOnShutdown( - pollBlockingStateWithSuspensions(interval, start).repeatWhile(result => result.pollAgain).map(_.blockHandling) + pollBlockingStateWithSuspensions(record, interval, start).repeatWhile(result => result.pollAgain).map(_.blockHandling) ).reporting(r => DoneBlockingBeforeRetry(record.topic, record.partition, record.offset, 
r.duration, r.failed)) ) } else { @@ -63,6 +62,7 @@ private[retry] object BlockingRetryRecordHandler { } def handleAndMaybeBlockOnErrorFor( + record: ConsumerRecord[K, V], interval: Option[Duration] ): ZIO[R with GreyhoundMetrics, Nothing, LastHandleResult] = { handler.handle(record).map(_ => LastHandleResult(lastHandleSucceeded = true, shouldContinue = false)).catchAll { @@ -73,7 +73,7 @@ private[retry] object BlockingRetryRecordHandler { case error => interval .map { interval => - report(BlockingRetryHandlerInvocationFailed(topicPartition, record.offset, error.toString)) *> blockOnErrorFor(interval) + report(BlockingRetryHandlerInvocationFailed(topicPartition, record.offset, error.toString)) *> blockOnErrorFor(record, interval) } .getOrElse(ZIO.succeed(LastHandleResult(lastHandleSucceeded = false, shouldContinue = false))) } @@ -96,13 +96,44 @@ private[retry] object BlockingRetryRecordHandler { } else { val durationsIncludingForInvocationWithNoErrorHandling = retryConfig.blockingBackoffs(record.topic)().map(Some(_)) :+ None for { - result <- foreachWhile(durationsIncludingForInvocationWithNoErrorHandling) { interval => handleAndMaybeBlockOnErrorFor(interval) } - _ <- maybeBackToStateBlocking + result <- retryEvery(record, durationsIncludingForInvocationWithNoErrorHandling) { (rec, interval) => + handleAndMaybeBlockOnErrorFor(rec, interval) + } + _ <- maybeBackToStateBlocking } yield result } } } + private def retryEvery[K, V, R, E](record: ConsumerRecord[K, V], as: Iterable[Option[Duration]])( + f: (ConsumerRecord[K, V], Option[Duration]) => ZIO[R, E, LastHandleResult] + )(implicit trace: Trace): ZIO[R, E, LastHandleResult] = { + ZIO.succeed(as.iterator).flatMap { i => + def loop(retryAttempt: Option[RetryAttempt]): ZIO[R, E, LastHandleResult] = + if (i.hasNext) { + val nextDelay = i.next + val recordWithAttempt = retryAttempt.fold(record) { attempt => + record.copy(headers = record.headers ++ RetryAttempt.toHeaders(attempt)) + } + f(recordWithAttempt, nextDelay).flatMap { result => + if (result.shouldContinue) Clock.instant.flatMap { now => + val nextAttempt = RetryAttempt( + originalTopic = record.topic, + attempt = retryAttempt.fold(0)(_.attempt + 1), + submittedAt = now, + backoff = nextDelay getOrElse Duration.Zero + ) + loop(Some(nextAttempt)) + } + else ZIO.succeed(result) + } + } + else ZIO.succeed(LastHandleResult(lastHandleSucceeded = false, shouldContinue = false)) + + loop(None) + } + } + private def handleNonRetriable[K, V, E, R](record: ConsumerRecord[K, V], topicPartition: TopicPartition, cause: Exception) = report(NoRetryOnNonRetryableFailure(topicPartition, record.offset, cause)) .as(LastHandleResult(lastHandleSucceeded = false, shouldContinue = false)) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryHelper.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryHelper.scala index a73b7dce..1d1cd24c 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryHelper.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryHelper.scala @@ -1,32 +1,23 @@ package com.wixpress.dst.greyhound.core.consumer.retry -import java.time.{Duration => JavaDuration, Instant} -import java.util.concurrent.TimeUnit.MILLISECONDS -import java.util.regex.Pattern -import com.wixpress.dst.greyhound.core.Serdes.StringSerde import com.wixpress.dst.greyhound.core.consumer.domain.ConsumerSubscription.{TopicPattern, Topics} -import 
com.wixpress.dst.greyhound.core.consumer.retry.RetryAttempt.{currentTime, RetryAttemptNumber} -import com.wixpress.dst.greyhound.core.consumer.retry.RetryDecision.{NoMoreRetries, RetryWith} import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, ConsumerSubscription} +import com.wixpress.dst.greyhound.core.consumer.retry.RetryDecision.{NoMoreRetries, RetryWith} import com.wixpress.dst.greyhound.core.consumer.retry.RetryRecordHandlerMetric.WaitingForRetry import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics.report import com.wixpress.dst.greyhound.core.producer.ProducerRecord -import com.wixpress.dst.greyhound.core.{durationDeserializer, instantDeserializer, Group, Headers, Topic} -import zio.Clock -import zio.Duration +import com.wixpress.dst.greyhound.core.{Group, Topic} import zio.Schedule.spaced -import zio.{Chunk, UIO, URIO, _} +import zio.{Chunk, Clock, Duration, URIO, _} +import java.time.Instant +import java.util.regex.Pattern import scala.util.Try trait NonBlockingRetryHelper { def retryTopicsFor(originalTopic: Topic): Set[Topic] - def retryAttempt(topic: Topic, headers: Headers, subscription: ConsumerSubscription)( - implicit trace: Trace - ): UIO[Option[RetryAttempt]] - def retryDecision[E]( retryAttempt: Option[RetryAttempt], record: ConsumerRecord[Chunk[Byte], Chunk[Byte]], @@ -34,7 +25,7 @@ trait NonBlockingRetryHelper { subscription: ConsumerSubscription )(implicit trace: Trace): URIO[Any, RetryDecision] - def retrySteps = retryTopicsFor("").size + def retrySteps: Int = retryTopicsFor("").size } object NonBlockingRetryHelper { @@ -49,82 +40,70 @@ object NonBlockingRetryHelper { override def retryTopicsFor(topic: Topic): Set[Topic] = policy(topic).intervals.indices.foldLeft(Set.empty[String])((acc, attempt) => acc + s"$topic-$group-retry-$attempt") - override def retryAttempt(topic: Topic, headers: Headers, subscription: ConsumerSubscription)( - implicit trace: Trace - ): UIO[Option[RetryAttempt]] = { - (for { - submitted <- headers.get(RetryHeader.Submitted, instantDeserializer) - backoff <- headers.get(RetryHeader.Backoff, durationDeserializer) - originalTopic <- headers.get[String](RetryHeader.OriginalTopic, StringSerde) - } yield for { - ta <- topicAttempt(subscription, topic, originalTopic) - TopicAttempt(originalTopic, attempt) = ta - s <- submitted - b <- backoff - } yield RetryAttempt(originalTopic, attempt, s, b)) - .catchAll(_ => ZIO.none) - } - - private def topicAttempt( - subscription: ConsumerSubscription, - topic: Topic, - originalTopicHeader: Option[String] - ) = - subscription match { - case _: Topics => extractTopicAttempt(group, topic) - case _: TopicPattern => - extractTopicAttemptFromPatternRetryTopic(group, topic, originalTopicHeader) - } - override def retryDecision[E]( retryAttempt: Option[RetryAttempt], record: ConsumerRecord[Chunk[Byte], Chunk[Byte]], error: E, subscription: ConsumerSubscription - )(implicit trace: Trace): URIO[Any, RetryDecision] = currentTime.map(now => { - val nextRetryAttempt = retryAttempt.fold(0)(_.attempt + 1) + )(implicit trace: Trace): URIO[Any, RetryDecision] = Clock.instant.map(now => { + val blockingRetriesBefore = RetryAttempt.maxBlockingAttempts( + NonBlockingRetryHelper.originalTopic(record.topic, group), + retryConfig + ).getOrElse(0) + + // attempt if present contains full number of retries + val nextNonBlockingAttempt = retryAttempt.fold(0)(_.attempt + 1 - blockingRetriesBefore) + val nextRetryAttempt = nextNonBlockingAttempt + 
blockingRetriesBefore val originalTopic = retryAttempt.fold(record.topic)(_.originalTopic) val retryTopic = subscription match { - case _: TopicPattern => patternRetryTopic(group, nextRetryAttempt) - case _: Topics => fixedRetryTopic(originalTopic, group, nextRetryAttempt) + case _: TopicPattern => patternRetryTopic(group, nextNonBlockingAttempt) + case _: Topics => fixedRetryTopic(originalTopic, group, nextNonBlockingAttempt) } val topicRetryPolicy = policy(record.topic) topicRetryPolicy.intervals - .lift(nextRetryAttempt) + .lift(nextNonBlockingAttempt) .map { backoff => + val attempt = RetryAttempt( + attempt = nextRetryAttempt, + originalTopic = originalTopic, + submittedAt = now, + backoff = backoff + ) topicRetryPolicy.recordMutate( ProducerRecord( topic = retryTopic, value = record.value, key = record.key, partition = None, - headers = record.headers + - (RetryHeader.Submitted -> toChunk(now.toEpochMilli)) + - (RetryHeader.Backoff -> toChunk(backoff.toMillis)) + - (RetryHeader.OriginalTopic -> toChunk(originalTopic)) + - (RetryHeader.RetryAttempt -> toChunk(nextRetryAttempt)) + headers = record.headers ++ RetryAttempt.toHeaders(attempt) ) ) } .fold[RetryDecision](NoMoreRetries)(RetryWith) }) + } - private def toChunk(long: Long): Chunk[Byte] = - Chunk.fromArray(long.toString.getBytes) - - private def toChunk(str: String): Chunk[Byte] = - Chunk.fromArray(str.getBytes) + private[retry] def attemptNumberFromTopic( + subscription: ConsumerSubscription, + topic: Topic, + originalTopicHeader: Option[String], + group: Group + ) = + subscription match { + case _: Topics => extractTopicAttempt(group, topic) + case _: TopicPattern => + extractTopicAttemptFromPatternRetryTopic(group, topic, originalTopicHeader) } - private def extractTopicAttempt[E](group: Group, inputTopic: Topic) = + private def extractTopicAttempt(group: Group, inputTopic: Topic) = inputTopic.split(s"-$group-retry-").toSeq match { case Seq(topic, attempt) if Try(attempt.toInt).isSuccess => Some(TopicAttempt(topic, attempt.toInt)) - case _ => None + case _ => None } - private def extractTopicAttemptFromPatternRetryTopic[E]( + private def extractTopicAttemptFromPatternRetryTopic( group: Group, inputTopic: Topic, originalTopicHeader: Option[String] @@ -166,49 +145,27 @@ object DelayHeaders { val Backoff = "backOffTimeMs" } -object RetryHeader { - val Submitted = "submitTimestamp" - val Backoff = DelayHeaders.Backoff - val OriginalTopic = "GH_OriginalTopic" - val RetryAttempt = "GH_RetryAttempt" -} - -case class RetryAttempt( - originalTopic: Topic, - attempt: RetryAttemptNumber, - submittedAt: Instant, - backoff: Duration -) { - - def sleep(implicit trace: Trace): URIO[GreyhoundMetrics, Unit] = - RetryUtil.sleep(submittedAt, backoff) race reportWaitingInIntervals(every = 60.seconds) - - private def reportWaitingInIntervals(every: Duration) = - report(WaitingForRetry(originalTopic, attempt, submittedAt.toEpochMilli, backoff.toMillis)) - .repeat(spaced(every)) - .unit -} - object RetryUtil { + def sleep(attempt: RetryAttempt)(implicit trace: Trace): URIO[GreyhoundMetrics, Unit] = + sleep(attempt.submittedAt, attempt.backoff) race + report(WaitingForRetry(attempt.originalTopic, attempt.attempt, attempt.submittedAt.toEpochMilli, attempt.backoff.toMillis)) + .repeat(spaced(60.seconds)) + .unit + def sleep(submittedAt: Instant, backoff: Duration)(implicit trace: Trace): URIO[Any, Unit] = { val expiresAt = submittedAt.plus(backoff.asJava) - currentTime + Clock.instant .map(_.isAfter(expiresAt)) .flatMap(expired => if (expired) 
ZIO.unit else - ZIO.sleep(1.seconds).repeatUntilZIO(_ => currentTime.map(_.isAfter(expiresAt))).unit + ZIO.sleep(1.second).repeatUntilZIO(_ => Clock.instant.map(_.isAfter(expiresAt))).unit ) } } private case class TopicAttempt(originalTopic: Topic, attempt: Int) -object RetryAttempt { - type RetryAttemptNumber = Int - val currentTime = Clock.currentTime(MILLISECONDS).map(Instant.ofEpochMilli) -} - sealed trait RetryDecision object RetryDecision { diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryRecordHandler.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryRecordHandler.scala index a15f1e5b..a6ff6560 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryRecordHandler.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/NonBlockingRetryRecordHandler.scala @@ -31,11 +31,12 @@ private[retry] object NonBlockingRetryRecordHandler { retryConfig: RetryConfig, subscription: ConsumerSubscription, nonBlockingRetryHelper: NonBlockingRetryHelper, + groupId: Group, awaitShutdown: TopicPartition => UIO[AwaitShutdown] )(implicit evK: K <:< Chunk[Byte], evV: V <:< Chunk[Byte]): NonBlockingRetryRecordHandler[V, K, R] = new NonBlockingRetryRecordHandler[V, K, R] { override def handle(record: ConsumerRecord[K, V]): ZIO[GreyhoundMetrics with R, Nothing, Any] = { - nonBlockingRetryHelper.retryAttempt(record.topic, record.headers, subscription).flatMap { retryAttempt => + RetryAttempt.extract(record.headers, record.topic, groupId, subscription, Some(retryConfig)).flatMap { retryAttempt => maybeDelayRetry(record, retryAttempt) *> handler.handle(record).catchAll { case Right(_: NonRetriableException) => ZIO.unit @@ -56,7 +57,7 @@ private[retry] object NonBlockingRetryRecordHandler { WaitingBeforeRetry(record.topic, retryAttempt, record.partition, record.offset, correlationId) ) *> awaitShutdown(record.topicPartition) - .flatMap(_.interruptOnShutdown(retryAttempt.sleep)) + .flatMap(_.interruptOnShutdown(RetryUtil.sleep(retryAttempt))) .reporting(r => DoneWaitingBeforeRetry(record.topic, record.partition, record.offset, retryAttempt, r.duration, r.failed, correlationId) ) @@ -74,7 +75,7 @@ private[retry] object NonBlockingRetryRecordHandler { override def handleAfterBlockingFailed( record: ConsumerRecord[K, V] ): ZIO[GreyhoundMetrics with R, Nothing, Any] = { - nonBlockingRetryHelper.retryAttempt(record.topic, record.headers, subscription).flatMap { retryAttempt => + RetryAttempt.extract(record.headers, record.topic, groupId, subscription, Some(retryConfig)).flatMap { retryAttempt => maybeRetry(retryAttempt, BlockingHandlerFailed, record) } } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala new file mode 100644 index 00000000..3fa3cba8 --- /dev/null +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala @@ -0,0 +1,98 @@ +package com.wixpress.dst.greyhound.core.consumer.retry + +import com.wixpress.dst.greyhound.core.Serdes.StringSerde +import com.wixpress.dst.greyhound.core._ +import com.wixpress.dst.greyhound.core.consumer.domain.ConsumerSubscription +import com.wixpress.dst.greyhound.core.consumer.retry.NonBlockingRetryHelper.attemptNumberFromTopic +import com.wixpress.dst.greyhound.core.consumer.retry.RetryAttempt.RetryAttemptNumber +import zio._ + +import java.time.Instant + +/** + * 
Description of a retry attempt + * @param attempt contains which attempt is it, starting from 0 including blocking and non-blocking attempts + */ +case class RetryAttempt( + originalTopic: Topic, + attempt: RetryAttemptNumber, + submittedAt: Instant, + backoff: Duration +) + +object RetryHeader { + val Submitted = "submitTimestamp" + val Backoff = DelayHeaders.Backoff + val OriginalTopic = "GH_OriginalTopic" + val RetryAttempt = "GH_RetryAttempt" +} + +object RetryAttempt { + type RetryAttemptNumber = Int + + private def toChunk(str: String): Chunk[Byte] = Chunk.fromArray(str.getBytes) + + def toHeaders(attempt: RetryAttempt): Headers = Headers( + RetryHeader.Submitted -> toChunk(attempt.submittedAt.toEpochMilli.toString), + RetryHeader.Backoff -> toChunk(attempt.backoff.toMillis.toString), + RetryHeader.OriginalTopic -> toChunk(attempt.originalTopic), + RetryHeader.RetryAttempt -> toChunk(attempt.attempt.toString), + ) + + private case class RetryAttemptHeaders( + originalTopic: Option[Topic], + attempt: Option[RetryAttemptNumber], + submittedAt: Option[Instant], + backoff: Option[Duration] + ) + + private def fromHeaders(headers: Headers): Task[RetryAttemptHeaders] = + for { + submitted <- headers.get(RetryHeader.Submitted, instantDeserializer) + backoff <- headers.get(RetryHeader.Backoff, durationDeserializer) + topic <- headers.get[String](RetryHeader.OriginalTopic, StringSerde) + attempt <- headers.get(RetryHeader.RetryAttempt, longDeserializer) + } yield RetryAttemptHeaders(topic, attempt.map(_.toInt), submitted, backoff) + + /** @return None on infinite blocking retries */ + def maxBlockingAttempts(topic: Topic, retryConfig: Option[RetryConfig]): Option[Int] = + retryConfig.map(_.blockingBackoffs(topic)()).fold(Option(0)) { + case finite if finite.hasDefiniteSize => Some(finite.size) + case _ => None + } + + /** @return None on infinite retries */ + def maxOverallAttempts(topic: Topic, retryConfig: Option[RetryConfig]): Option[Int] = + maxBlockingAttempts(topic, retryConfig).map { + _ + retryConfig.fold(0)(_.nonBlockingBackoffs(topic).length) + } + + def extract( + headers: Headers, + topic: Topic, + group: Group, + subscription: ConsumerSubscription, + retryConfig: Option[RetryConfig], + )(implicit trace: Trace): UIO[Option[RetryAttempt]] = { + + def maybeNonBlockingAttempt(hs: RetryAttemptHeaders): Option[RetryAttempt] = + for { + submitted <- hs.submittedAt + backoff <- hs.backoff + TopicAttempt(originalTopic, attempt) <- attemptNumberFromTopic(subscription, topic, hs.originalTopic, group) + blockingRetries = maxBlockingAttempts(originalTopic, retryConfig).getOrElse(0) + } yield RetryAttempt(originalTopic, blockingRetries + attempt, submitted, backoff) + + def maybeBlockingAttempt(hs: RetryAttemptHeaders): Option[RetryAttempt] = + for { + submitted <- hs.submittedAt + backoff <- hs.backoff + originalTopic <- hs.originalTopic if originalTopic == topic + attempt <- hs.attempt + } yield RetryAttempt(originalTopic, attempt, submitted, backoff) + + fromHeaders(headers).map { hs => + maybeNonBlockingAttempt(hs) orElse maybeBlockingAttempt(hs) + } + }.catchAll(_ => ZIO.none) +} diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryRecordHandler.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryRecordHandler.scala index af0749c6..8ee8ab9f 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryRecordHandler.scala +++ 
b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryRecordHandler.scala @@ -33,7 +33,7 @@ object RetryRecordHandler { ): RecordHandler[R with R2 with GreyhoundMetrics, Nothing, K, V] = { val nonBlockingHandler = - NonBlockingRetryRecordHandler(handler, producer, retryConfig, subscription, nonBlockingRetryHelper, awaitShutdown) + NonBlockingRetryRecordHandler(handler, producer, retryConfig, subscription, nonBlockingRetryHelper, groupId, awaitShutdown) val blockingHandler = BlockingRetryRecordHandler(groupId, handler, retryConfig, blockingState, nonBlockingHandler, awaitShutdown) val blockingAndNonBlockingHandler = BlockingAndNonBlockingRetryRecordHandler(groupId, blockingHandler, nonBlockingHandler) @@ -55,15 +55,4 @@ object RetryRecordHandler { record.headers.get[String](key, StringSerde).catchAll(_ => ZIO.none) } -object ZIOHelper { - def foreachWhile[R, E, A](as: Iterable[A])(f: A => ZIO[R, E, LastHandleResult])(implicit trace: Trace): ZIO[R, E, LastHandleResult] = - ZIO.succeed(as.iterator).flatMap { i => - def loop: ZIO[R, E, LastHandleResult] = - if (i.hasNext) f(i.next).flatMap(result => if (result.shouldContinue) loop else ZIO.succeed(result)) - else ZIO.succeed(LastHandleResult(lastHandleSucceeded = false, shouldContinue = false)) - - loop - } -} - case class LastHandleResult(lastHandleSucceeded: Boolean, shouldContinue: Boolean) diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttemptTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttemptTest.scala new file mode 100644 index 00000000..9e27c10b --- /dev/null +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttemptTest.scala @@ -0,0 +1,94 @@ +package com.wixpress.dst.greyhound.core.consumer.retry + +import com.wixpress.dst.greyhound.core.consumer.domain.ConsumerSubscription +import com.wixpress.dst.greyhound.core.testkit.BaseTest +import zio.test.TestEnvironment + +import java.time.{Duration, Instant} +import scala.util.Random +import scala.concurrent.duration._ + +class RetryAttemptTest extends BaseTest[TestEnvironment] { + + "RetryAttempt.extract" should { + "deserialize attempt from headers for blocking retries" in { + val attempt = randomRetryAttempt + val headers = RetryAttempt.toHeaders(attempt) + val subscription = ConsumerSubscription.Topics(Set(attempt.originalTopic)) + for (result <- RetryAttempt.extract(headers, attempt.originalTopic, randomStr, subscription, None)) + yield result must beSome(attempt) + } + "deserialize attempt from headers and topic for non-blocking retries" in { + val attempt = randomRetryAttempt + // topic and attempt must be extracted from retryTopic + val headers = RetryAttempt.toHeaders(attempt.copy(originalTopic = "", attempt = -1)) + val subscription = ConsumerSubscription.Topics(Set(attempt.originalTopic)) + val group = randomStr + val retryTopic = NonBlockingRetryHelper.fixedRetryTopic(attempt.originalTopic, group, attempt.attempt) + for (result <- RetryAttempt.extract(headers, retryTopic, group, subscription, None)) + yield result must beSome(attempt) + } + "deserialize attempt for non-blocking retry after blocking retries" in { + val attempt = randomRetryAttempt + val headers = RetryAttempt.toHeaders(attempt) + val subscription = ConsumerSubscription.Topics(Set(attempt.originalTopic)) + val group = randomStr + val retries = RetryConfig.blockingFollowedByNonBlockingRetry( + blockingBackoffs = 1.milli :: 1.second :: Nil, + nonBlockingBackoffs = 5.minutes :: Nil, + ) + val 
retryTopic = NonBlockingRetryHelper.fixedRetryTopic(attempt.originalTopic, group, attempt.attempt) + for (result <- RetryAttempt.extract(headers, retryTopic, group, subscription, Some(retries))) + yield result must beSome(attempt.copy(attempt = attempt.attempt + 2)) // with 2 blocking retries before + } + "In case incorrect originalTopic header propagated from a different consumer group, ignore it," + + "do NOT consider it as if it's a non-blocking retry" in { + val attempt = propagatedRetryAttempt + val headers = RetryAttempt.toHeaders(attempt) + val currentTopic = "relevant-topic" + val subscription = ConsumerSubscription.Topics(Set(currentTopic)) + for (result <- RetryAttempt.extract(headers, currentTopic, randomStr, subscription, None)) + yield result must beNone + } + } + + "RetryAttempt.maxOverallAttempts" should { + "return 0 if no retries configured" in { + RetryAttempt.maxOverallAttempts(randomStr, None) must beSome(0) + } + "return max attempts for blocking retries" in { + val config = RetryConfig.finiteBlockingRetry(1.milli, 1.second) + RetryAttempt.maxOverallAttempts(randomStr, Some(config)) must beSome(2) + } + "return max attempts for non-blocking retries" in { + val config = RetryConfig.nonBlockingRetry(1.milli, 1.second, 5.minutes) + RetryAttempt.maxOverallAttempts(randomStr, Some(config)) must beSome(3) + } + "return max attempts for blocking retries followed by non-blocking" in { + val config = RetryConfig.blockingFollowedByNonBlockingRetry(1.milli :: 2.seconds :: Nil, 1.minute :: Nil) + RetryAttempt.maxOverallAttempts(randomStr, Some(config)) must beSome(3) + } + "return None for infinite blocking retries" in { + val config = RetryConfig.infiniteBlockingRetry(1.milli) + RetryAttempt.maxOverallAttempts(randomStr, Some(config)) must beNone + } + } + + override def env = testEnvironment + + private def randomStr = Random.alphanumeric.take(10).mkString + + private def randomRetryAttempt = RetryAttempt( + originalTopic = randomStr, + attempt = Random.nextInt(1000), + submittedAt = Instant.ofEpochMilli(math.abs(Random.nextLong())), + backoff = Duration.ofMillis(Random.nextInt(100000)) + ) + + private def propagatedRetryAttempt = RetryAttempt( + originalTopic = "some-other-topic", + attempt = Random.nextInt(1000), + submittedAt = Instant.ofEpochMilli(math.abs(Random.nextLong())), + backoff = Duration.ofMillis(Random.nextInt(100000)) + ) +} diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConsumerRecordHandlerTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConsumerRecordHandlerTest.scala index c713fb12..0797ba25 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConsumerRecordHandlerTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConsumerRecordHandlerTest.scala @@ -4,7 +4,7 @@ import java.time.Instant import com.wixpress.dst.greyhound.core.Serdes._ import com.wixpress.dst.greyhound.core._ import com.wixpress.dst.greyhound.core.consumer.domain.ConsumerSubscription.Topics -import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, ConsumerSubscription, RecordHandler} +import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, RecordHandler} import com.wixpress.dst.greyhound.core.consumer.retry.BlockingState.{Blocked, Blocking => InternalBlocking, IgnoringAll, IgnoringOnce} import com.wixpress.dst.greyhound.core.consumer.retry.RetryConsumerRecordHandlerTest.{offset, partition, _} import 
com.wixpress.dst.greyhound.core.consumer.retry.RetryRecordHandlerMetric.{BlockingIgnoredForAllFor, BlockingIgnoredOnceFor, BlockingRetryHandlerInvocationFailed, NoRetryOnNonRetryableFailure} @@ -21,8 +21,6 @@ import zio.Random.{nextBytes, nextIntBounded} import zio.managed.UManaged import zio.test.TestClock -import scala.concurrent.TimeoutException - class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics] { override def env: UManaged[ZEnvironment[TestClock with TestMetrics]] = @@ -52,9 +50,6 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics _ <- retryHandler.handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L)) record <- producer.records.take now <- currentTime - retryAttempt <- IntSerde.serialize(retryTopic, 0) - submittedAt <- InstantSerde.serialize(retryTopic, now) - backoff <- DurationSerde.serialize(retryTopic, 1.second) } yield { record === ProducerRecord( @@ -62,7 +57,7 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics value, Some(key), partition = None, - headers = Headers("retry-attempt" -> retryAttempt, "retry-submitted-at" -> submittedAt, "retry-backoff" -> backoff) + headers = RetryAttempt.toHeaders(RetryAttempt(topic, 0, now, 1.second)) ) } } @@ -71,7 +66,8 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics for { producer <- FakeProducer.make topic <- randomTopicName - retryTopic = s"$topic-retry" + attempt = 0 + retryTopic = NonBlockingRetryHelper.fixedRetryTopic(topic, group, attempt) executionTime <- Promise.make[Nothing, Instant] handler = RecordHandler[Clock, HandlerError, Chunk[Byte], Chunk[Byte]] { _ => currentTime.flatMap(executionTime.succeed) } blockingState <- Ref.make[Map[BlockingTarget, BlockingState]](Map.empty) @@ -86,10 +82,7 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics ) value <- bytes begin <- currentTime - retryAttempt <- IntSerde.serialize(retryTopic, 0) - submittedAt <- InstantSerde.serialize(retryTopic, begin) - backoff <- DurationSerde.serialize(retryTopic, 1.second) - headers = Headers("retry-attempt" -> retryAttempt, "retry-submitted-at" -> submittedAt, "retry-backoff" -> backoff) + headers = RetryAttempt.toHeaders(RetryAttempt(topic, attempt, begin, 1.second)) _ <- retryHandler.handle(ConsumerRecord(retryTopic, partition, offset, headers, None, value, 0L, 0L, 0L)).fork _ <- TestClock.adjust(1.second).repeat(Schedule.once) end <- executionTime.await.disconnect.timeoutFail(TimeoutWaitingForAssertion)(5.seconds) @@ -404,7 +397,8 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics producer <- FakeProducer.make topic <- randomTopicName blockingState <- Ref.make[Map[BlockingTarget, BlockingState]](Map.empty) - retryHelper = alwaysBackOffRetryHelper(3.seconds) + retryHelper = FakeRetryHelper(topic) + now <- Clock.instant handling <- AwaitShutdown.makeManaged.flatMap { awaitShutdown => val retryHandler = RetryRecordHandler.withRetries( group, @@ -416,11 +410,12 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics retryHelper, awaitShutdown = _ => ZIO.succeed(awaitShutdown) ) + val headers = RetryAttempt.toHeaders(RetryAttempt(topic, 0, now, 3.seconds)) for { key <- bytes value <- bytes handling <- retryHandler - .handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L)) + .handle(ConsumerRecord(topic, partition, offset, headers, Some(key), value, 0L, 0L, 0L)) 
.forkDaemon } yield handling } @@ -534,18 +529,6 @@ object RetryConsumerRecordHandlerTest { def randomTopicName = randomStr.map(suffix => s"some-topic-$suffix") val cause = new RuntimeException("cause") - - def alwaysBackOffRetryHelper(backoff: Duration) = { - new FakeNonBlockingRetryHelper { - override val topic: Topic = "" - - override def retryAttempt(topic: Topic, headers: Headers, subscription: ConsumerSubscription)( - implicit trace: Trace - ): UIO[Option[RetryAttempt]] = ZIO.succeed( - Some(RetryAttempt(topic, 1, Instant.now, backoff)) - ) - } - } } object TimeoutWaitingForAssertion extends RuntimeException diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/FakeRetryHelper.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/FakeRetryHelper.scala index cee13948..619ada36 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/FakeRetryHelper.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/FakeRetryHelper.scala @@ -1,35 +1,21 @@ package com.wixpress.dst.greyhound.core.testkit import java.time.Instant -import java.util.concurrent.TimeUnit.MILLISECONDS - -import com.wixpress.dst.greyhound.core.Serdes._ import com.wixpress.dst.greyhound.core._ import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, ConsumerSubscription} import com.wixpress.dst.greyhound.core.consumer.retry.RetryDecision.{NoMoreRetries, RetryWith} -import com.wixpress.dst.greyhound.core.consumer.retry.{BlockingHandlerFailed, NonBlockingRetryHelper, RetryAttempt, RetryDecision} +import com.wixpress.dst.greyhound.core.consumer.retry.{BlockingHandlerFailed, NonBlockingRetryHelper, RetryAttempt, RetryDecision, RetryHeader} import com.wixpress.dst.greyhound.core.producer.ProducerRecord import com.wixpress.dst.greyhound.core.testkit.FakeRetryHelper._ import zio._ import zio.Clock -import zio.Clock - trait FakeNonBlockingRetryHelper extends NonBlockingRetryHelper { val topic: Topic override def retryTopicsFor(originalTopic: Topic): Set[Topic] = Set(s"$originalTopic-retry") - override def retryAttempt(topic: Topic, headers: Headers, subscription: ConsumerSubscription)( - implicit trace: Trace - ): UIO[Option[RetryAttempt]] = - (for { - attempt <- headers.get(Header.Attempt, IntSerde) - submittedAt <- headers.get(Header.SubmittedAt, InstantSerde) - backoff <- headers.get(Header.Backoff, DurationSerde) - } yield retryAttemptInternal(topic, attempt, submittedAt, backoff)).orElse(ZIO.none) - override def retryDecision[E]( retryAttempt: Option[RetryAttempt], record: ConsumerRecord[Chunk[Byte], Chunk[Byte]], @@ -38,35 +24,23 @@ trait FakeNonBlockingRetryHelper extends NonBlockingRetryHelper { )(implicit trace: Trace): URIO[Any, RetryDecision] = error match { case RetriableError | BlockingHandlerFailed => - currentTime.flatMap(now => - recordFrom(now, retryAttempt, record) - .fold(_ => NoMoreRetries, RetryWith) + currentTime.map(now => + RetryWith(recordFrom(now, retryAttempt, record)) ) case NonRetriableError => ZIO.succeed(NoMoreRetries) } - private def retryAttemptInternal(topic: Topic, attempt: Option[Int], submittedAt: Option[Instant], backoff: Option[Duration]) = - for { - a <- attempt - s <- submittedAt - b <- backoff - } yield RetryAttempt(topic, a, s, b) - private def recordFrom(now: Instant, retryAttempt: Option[RetryAttempt], record: ConsumerRecord[Chunk[Byte], Chunk[Byte]])( implicit trace: Trace ) = { val nextRetryAttempt = retryAttempt.fold(0)(_.attempt + 1) - for { - retryAttempt <- IntSerde.serialize(topic, nextRetryAttempt) - 
submittedAt <- InstantSerde.serialize(topic, now) - backoff <- DurationSerde.serialize(topic, 1.second) - } yield ProducerRecord( + ProducerRecord( topic = s"$topic-retry", value = record.value, key = record.key, partition = None, - headers = Headers(Header.Attempt -> retryAttempt, Header.SubmittedAt -> submittedAt, Header.Backoff -> backoff) + headers = RetryAttempt.toHeaders(RetryAttempt(topic, nextRetryAttempt, now, 1.second)) ) } } @@ -75,13 +49,8 @@ case class FakeRetryHelper(topic: Topic) extends FakeNonBlockingRetryHelper object FakeRetryHelper { implicit private val trace = Trace.empty - object Header { - val Attempt = "retry-attempt" - val SubmittedAt = "retry-submitted-at" - val Backoff = "retry-backoff" - } - val currentTime = Clock.currentTime(MILLISECONDS).map(Instant.ofEpochMilli) + val currentTime: UIO[Instant] = Clock.instant } sealed trait HandlerError From 693ab111b41e0e0351ce55a5b2cec704136134cb Mon Sep 17 00:00:00 2001 From: Noam Berman Date: Sun, 23 Apr 2023 00:49:05 +0300 Subject: [PATCH 08/52] [greyhound] expose internal kafka producer metrics (#34160) GitOrigin-RevId: ab792e2227f389d5642802944a982f3a8d3587d8 --- .../dst/greyhound/core/producer/Producer.scala | 17 +++++++++++++++-- .../core/producer/ReportingProducer.scala | 6 +++++- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/producer/Producer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/producer/Producer.scala index 2200b6cc..538c0b81 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/producer/Producer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/producer/Producer.scala @@ -5,13 +5,15 @@ import org.apache.kafka.clients.producer.{Callback, KafkaProducer, ProducerConfi import org.apache.kafka.common.header.Header import org.apache.kafka.common.header.internals.RecordHeader import org.apache.kafka.common.serialization.ByteArraySerializer +import org.apache.kafka.common.{Metric, MetricName} +import zio.ZIO.attemptBlocking import zio._ import scala.collection.JavaConverters._ -import zio.ZIO.attemptBlocking -import zio.managed._ trait ProducerR[-R] { self => + def metrics : UIO[Option[Map[MetricName, Metric]]] = ZIO.none + def produceAsync(record: ProducerRecord[Chunk[Byte], Chunk[Byte]])( implicit trace: Trace ): ZIO[R, ProducerError, IO[ProducerError, RecordMetadata]] @@ -80,6 +82,9 @@ object Producer { val acquire = ZIO.attemptBlocking(new KafkaProducer(config.properties, serializer, serializer)) ZIO.acquireRelease(acquire)(producer => attemptBlocking(producer.close()).ignore).map { producer => new ProducerR[R] { + override def metrics: UIO[Option[Map[MetricName, Metric]]] = + ZIO.succeed(Option(producer.metrics().asScala.toMap)) + private def recordFrom(record: ProducerRecord[Chunk[Byte], Chunk[Byte]]) = new KafkaProducerRecord( record.topic, @@ -153,6 +158,8 @@ object ProducerR { override def partitionsFor(topic: Topic)(implicit trace: Trace): RIO[Any, Seq[PartitionInfo]] = producer.partitionsFor(topic).provideEnvironment(env) + + override def metrics: UIO[Option[Map[MetricName, Metric]]] = producer.metrics } def onShutdown(onShutdown: => UIO[Unit])(implicit trace: Trace): ProducerR[R] = new ProducerR[R] { override def produceAsync( @@ -165,6 +172,8 @@ object ProducerR { override def attributes: Map[String, String] = producer.attributes override def partitionsFor(topic: Topic)(implicit trace: Trace) = producer.partitionsFor(topic) + + override def metrics: UIO[Option[Map[MetricName, Metric]]] = 
producer.metrics } def tapBoth(onError: (Topic, Cause[ProducerError]) => URIO[R, Unit], onSuccess: RecordMetadata => URIO[R, Unit]) = new ProducerR[R] { @@ -186,6 +195,8 @@ object ProducerR { override def attributes: Map[String, String] = producer.attributes override def partitionsFor(topic: Topic)(implicit trace: Trace) = producer.partitionsFor(topic) + + override def metrics: UIO[Option[Map[MetricName, Metric]]] = producer.metrics } def map(f: ProducerRecord[Chunk[Byte], Chunk[Byte]] => ProducerRecord[Chunk[Byte], Chunk[Byte]]) = new ProducerR[R] { @@ -199,6 +210,8 @@ object ProducerR { override def shutdown(implicit trace: Trace): UIO[Unit] = producer.shutdown override def attributes: Map[String, String] = producer.attributes + + override def metrics: UIO[Option[Map[MetricName, Metric]]] = producer.metrics } } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/producer/ReportingProducer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/producer/ReportingProducer.scala index 10eab6a7..69306db1 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/producer/ReportingProducer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/producer/ReportingProducer.scala @@ -5,14 +5,18 @@ import java.util.concurrent.TimeUnit.MILLISECONDS import com.wixpress.dst.greyhound.core.PartitionInfo import com.wixpress.dst.greyhound.core.metrics.{GreyhoundMetric, GreyhoundMetrics} import com.wixpress.dst.greyhound.core.producer.ProducerMetric._ -import zio.{Chunk, IO, RIO, Trace, ULayer, ZIO} +import zio.{Chunk, IO, RIO, Trace, UIO, ULayer, ZIO} import GreyhoundMetrics._ +import org.apache.kafka.common.{Metric, MetricName} import scala.concurrent.duration.FiniteDuration import zio.Clock.currentTime case class ReportingProducer[-R](internal: ProducerR[R], extraAttributes: Map[String, String]) extends ProducerR[GreyhoundMetrics with R] { + + override def metrics: UIO[Option[Map[MetricName, Metric]]] = internal.metrics + override def produceAsync( record: ProducerRecord[Chunk[Byte], Chunk[Byte]] )(implicit trace: Trace): ZIO[GreyhoundMetrics with R, ProducerError, IO[ProducerError, RecordMetadata]] = From 659129ccd81d979de77dd89a09807aac8097c633 Mon Sep 17 00:00:00 2001 From: Leon Burdinov Date: Sun, 23 Apr 2023 08:12:42 +0300 Subject: [PATCH 09/52] [greyhound] Cooperative Rebalance fix (#34153) * [greyhound] code cleanup #pr * fix for paused partitions unrevoked in cooperative GitOrigin-RevId: 7cf8fe80c2ac9108b3ebe72472355fe73158df9a --- .../greyhound/core/consumer/Dispatcher.scala | 11 ++++------ .../greyhound/core/consumer/EventLoop.scala | 20 +++++++++++-------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala index cbe8d883..b618d0ba 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala @@ -93,13 +93,10 @@ object Dispatcher { override def revoke(partitions: Set[TopicPartition]): URIO[GreyhoundMetrics, Unit] = workers .modify { workers => - partitions.foldLeft((List.empty[(TopicPartition, Worker)], workers)) { - case ((revoked, remaining), partition) => - remaining.get(partition) match { - case Some(worker) => ((partition, worker) :: revoked, remaining - partition) - case None => (revoked, remaining) - } - } + val revoked = workers.filterKeys(partitions.contains) + val 
remaining = workers -- partitions + + (revoked, remaining) } .flatMap(shutdownWorkers) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala index 5985f80d..0d6be6ee 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala @@ -170,12 +170,14 @@ object EventLoop { consumer: Consumer, partitions: Set[TopicPartition] )(implicit trace: Trace): URIO[GreyhoundMetrics, DelayedRebalanceEffect] = { - pausedPartitionsRef.set(Set.empty) *> - dispatcher.revoke(partitions).timeout(config.drainTimeout).flatMap { drained => - ZIO.when(drained.isEmpty)( - report(DrainTimeoutExceeded(clientId, group, config.drainTimeout.toMillis, consumer.config.consumerAttributes)) - ) - } *> commitOffsetsOnRebalance(consumer0, offsets) + for { + _ <- pausedPartitionsRef.update(_ -- partitions) + isRevokeTimedOut <- dispatcher.revoke(partitions).timeout(config.drainTimeout).map(_.isEmpty) + _ <- ZIO.when(isRevokeTimedOut)( + report(DrainTimeoutExceeded(clientId, group, config.drainTimeout.toMillis, consumer.config.consumerAttributes)) + ) + delayedRebalanceEffect <- commitOffsetsOnRebalance(consumer0, offsets) + } yield delayedRebalanceEffect } override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])(implicit trace: Trace): UIO[Any] = @@ -258,7 +260,8 @@ case class EventLoopConfig( highWatermark: Int, rebalanceListener: RebalanceListener[Any], delayResumeOfPausedPartition: Long, - startPaused: Boolean + startPaused: Boolean, + cooperativeRebalanceEnabled: Boolean ) object EventLoopConfig { @@ -269,7 +272,8 @@ object EventLoopConfig { highWatermark = 256, rebalanceListener = RebalanceListener.Empty, delayResumeOfPausedPartition = 0, - startPaused = false + startPaused = false, + cooperativeRebalanceEnabled = false ) } From ae1bb8f3311685ad2c6f3a2d076ac111be9a3dbb Mon Sep 17 00:00:00 2001 From: Leon Burdinov Date: Sun, 23 Apr 2023 10:53:08 +0300 Subject: [PATCH 10/52] code cleanup (#34171) GitOrigin-RevId: c2a4d2c2de669eff3c98b2277be8502d834de3e4 --- .../wixpress/dst/greyhound/core/consumer/EventLoop.scala | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala index 0d6be6ee..3c8940ac 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala @@ -260,8 +260,7 @@ case class EventLoopConfig( highWatermark: Int, rebalanceListener: RebalanceListener[Any], delayResumeOfPausedPartition: Long, - startPaused: Boolean, - cooperativeRebalanceEnabled: Boolean + startPaused: Boolean ) object EventLoopConfig { @@ -272,8 +271,7 @@ object EventLoopConfig { highWatermark = 256, rebalanceListener = RebalanceListener.Empty, delayResumeOfPausedPartition = 0, - startPaused = false, - cooperativeRebalanceEnabled = false + startPaused = false ) } From 57d215b002676dc91ff43ed9306bb7cfd9562404 Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Wed, 24 May 2023 14:08:59 +0300 Subject: [PATCH 11/52] [greyhound] parallel consumer implementation (#34061) GitOrigin-RevId: 5c9cc1ea4fcc5935c6f905a5269ac17f6b82c294 --- .../dst/greyhound/core/ConsumerIT.scala | 854 
++++++++++-------- .../dst/greyhound/core/parallel/BUILD.bazel | 27 + .../core/parallel/ParallelConsumerIT.scala | 325 +++++++ .../greyhound/core/OffsetAndMetadata.scala | 3 + .../greyhound/core/consumer/Consumer.scala | 53 +- .../greyhound/core/consumer/Dispatcher.scala | 193 +++- .../greyhound/core/consumer/EventLoop.scala | 150 ++- .../core/consumer/OffsetsAndGaps.scala | 63 +- .../core/consumer/OffsetsInitializer.scala | 62 +- .../core/consumer/RecordConsumer.scala | 19 +- .../core/consumer/ReportingConsumer.scala | 93 ++ .../core/consumer/EventLoopTest.scala | 15 +- .../consumer/OffsetsInitializerTest.scala | 39 +- .../consumer/batched/BatchEventLoopTest.scala | 18 +- .../consumer/dispatcher/DispatcherTest.scala | 44 + .../core/testkit/RecordMatchers.scala | 6 + 16 files changed, 1497 insertions(+), 467 deletions(-) create mode 100644 core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/BUILD.bazel create mode 100644 core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala diff --git a/core/src/it/scala/com/wixpress/dst/greyhound/core/ConsumerIT.scala b/core/src/it/scala/com/wixpress/dst/greyhound/core/ConsumerIT.scala index e91e3742..3f26b47e 100644 --- a/core/src/it/scala/com/wixpress/dst/greyhound/core/ConsumerIT.scala +++ b/core/src/it/scala/com/wixpress/dst/greyhound/core/ConsumerIT.scala @@ -19,6 +19,7 @@ import com.wixpress.dst.greyhound.core.zioutils.Gate import com.wixpress.dst.greyhound.testenv.ITEnv import com.wixpress.dst.greyhound.testenv.ITEnv.{clientId, _} import com.wixpress.dst.greyhound.testkit.ManagedKafka +import org.specs2.specification.core.Fragments import zio.Clock import zio.stm.{STM, TRef} import zio._ @@ -30,428 +31,505 @@ class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { sequential override def env = ITEnv.ManagedEnv - override def sharedEnv = ITEnv.testResources() - val resources = testResources() - "produce, consume and rebalance - " in + + s"subscribe to a pattern" in ZIO.scoped { for { + _ <- ZIO.debug(">>>> starting test: patternTest with parallel") + topic1 = "core-subscribe-pattern1-topic" + topic2 = "core-subscribe-pattern2-topic" r <- getShared TestResources(kafka, producer) = r - topic <- kafka.createRandomTopic(prefix = s"topic1-single1") - topic2 <- kafka.createRandomTopic(prefix = "topic2-single1") + _ <- kafka.createTopics(Seq(topic1, topic2).map(t => TopicConfig(t, 1, 1, delete)): _*) group <- randomGroup + probe <- Ref.make(Seq.empty[(Topic, Offset)]) + handler = RecordHandler { record: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => probe.update(_ :+ (record.topic, record.offset)) } - queue <- Queue.unbounded[ConsumerRecord[String, String]] - handler = RecordHandler((cr: ConsumerRecord[String, String]) => ZIO.succeed(println(s"***** Consumed: $cr")) *> queue.offer(cr)) - .withDeserializers(StringSerde, StringSerde) - .ignore - cId <- clientId - config = configFor(kafka, group, topic).copy(clientId = cId) - record = ProducerRecord(topic, "bar", Some("foo")) - - messages <- RecordConsumer.make(config, handler).flatMap { consumer => - producer.produce(record, StringSerde, StringSerde) *> sleep(3.seconds) *> - consumer.resubscribe(ConsumerSubscription.topics(topic, topic2)) *> - sleep(500.millis) *> // give the consumer some time to start polling topic2 - producer.produce(record.copy(topic = topic2, value = Some("BAR")), StringSerde, StringSerde) *> - (queue.take zip queue.take) - .timeout(20.seconds) - .tap(o => ZIO.when(o.isEmpty)(ZIO.debug("timeout waiting for messages!"))) - } - msgs <- 
ZIO.fromOption(messages).orElseFail(TimedOutWaitingForMessages) - } yield { - msgs._1 must - (beRecordWithKey("foo") and beRecordWithValue("bar")) and - (msgs._2 must (beRecordWithKey("foo") and beRecordWithValue("BAR"))) - } + _ <- makeConsumer(kafka, compile("core-subscribe-pattern1.*"), group, handler, 0, useParallelConsumer = false).flatMap { consumer => + val record = ProducerRecord(topic1, Chunk.empty, key = Option(Chunk.empty)) + producer.produce(record) *> eventuallyZ(probe.get)(_ == (topic1, 0) :: Nil) *> + consumer.resubscribe(TopicPattern(compile("core-subscribe-pattern2.*"))) *> + producer.produce(record.copy(topic = topic2)) *> + eventuallyZ(probe.get)(_ == (topic1, 0) :: (topic2, 0) :: Nil) + } + } yield ok } - "be able to resubscribe to same topics" in + s"subscribe to a pattern - using parallel consumer" in ZIO.scoped { for { + _ <- ZIO.debug(">>>> starting test: patternTest with parallel") + topic1 = "core-subscribe-parallel-pattern1-topic" + topic2 = "core-subscribe-parallel-pattern2-topic" r <- getShared TestResources(kafka, producer) = r - topic <- kafka.createRandomTopic(prefix = s"topic1-single1") + _ <- kafka.createTopics(Seq(topic1, topic2).map(t => TopicConfig(t, 1, 1, delete)): _*) group <- randomGroup + probe <- Ref.make(Seq.empty[(Topic, Offset)]) + handler = RecordHandler { record: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => probe.update(_ :+ (record.topic, record.offset)) } - queue <- Queue.unbounded[ConsumerRecord[String, String]] - handler = RecordHandler((cr: ConsumerRecord[String, String]) => ZIO.succeed(println(s"***** Consumed: $cr")) *> queue.offer(cr)) - .withDeserializers(StringSerde, StringSerde) - .ignore - cId <- clientId - config = configFor(kafka, group, topic, mutateEventLoop = _.copy(drainTimeout = 1.second)).copy(clientId = cId) - record = ProducerRecord(topic, "bar", Some("foo")) - - messages <- RecordConsumer.make(config, handler).flatMap { consumer => - producer.produce(record, StringSerde, StringSerde) *> sleep(3.seconds) *> - consumer.resubscribe(ConsumerSubscription.topics(topic)) *> - consumer.resubscribe(ConsumerSubscription.topics(topic)) *> - producer.produce(record.copy(topic = topic, value = Some("BAR")), StringSerde, StringSerde) *> - (queue.take zip queue.take) - .timeout(20.seconds) - .tap(o => ZIO.when(o.isEmpty)(ZIO.debug("timeout waiting for messages!"))) - } - msgs <- ZIO.fromOption(messages).orElseFail(TimedOutWaitingForMessages) - } yield { - msgs._1 must - (beRecordWithKey("foo") and beRecordWithValue("bar")) and - (msgs._2 must (beRecordWithKey("foo") and beRecordWithValue("BAR"))) - } + _ <- makeConsumer(kafka, compile("core-subscribe-parallel-pattern1.*"), group, handler, 0, useParallelConsumer = true).flatMap { + consumer => + val record = ProducerRecord(topic1, Chunk.empty, key = Option(Chunk.empty)) + producer.produce(record) *> + eventuallyZ(probe.get, timeout = 20.seconds)(_ == (topic1, 0) :: Nil) *> + consumer.resubscribe(TopicPattern(compile("core-subscribe-parallel-pattern2.*"))) *> + producer.produce(record.copy(topic = topic2)) *> + eventuallyZ(probe.get)(_ == (topic1, 0) :: (topic2, 0) :: Nil) + } + } yield ok } - "produce consume null values (tombstones)" in - ZIO.scoped { - for { - r <- getShared - TestResources(kafka, producer) = r - topic <- kafka.createRandomTopic(prefix = s"topic1-single1") - group <- randomGroup + Fragments.foreach(Seq(false, true)) { useParallelConsumer => + s"produce, consume and rebalance${parallelConsumerString(useParallelConsumer)}" in + ZIO.scoped { + for { + r <- getShared + 
TestResources(kafka, producer) = r + topic <- kafka.createRandomTopic(prefix = s"topic1-single1") + topic2 <- kafka.createRandomTopic(prefix = "topic2-single1") + group <- randomGroup + + queue <- Queue.unbounded[ConsumerRecord[String, String]] + handler = RecordHandler((cr: ConsumerRecord[String, String]) => ZIO.succeed(println(s"***** Consumed: $cr")) *> queue.offer(cr)) + .withDeserializers(StringSerde, StringSerde) + .ignore + cId <- clientId + config = + configFor(kafka, group, topic, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8)).copy(clientId = cId) + record = ProducerRecord(topic, "bar", Some("foo")) + + messages <- RecordConsumer.make(config, handler).flatMap { consumer => + producer.produce(record, StringSerde, StringSerde) *> sleep(3.seconds) *> + consumer.resubscribe(ConsumerSubscription.topics(topic, topic2)) *> + sleep(500.millis) *> // give the consumer some time to start polling topic2 + producer.produce(record.copy(topic = topic2, value = Some("BAR")), StringSerde, StringSerde) *> + (queue.take zip queue.take) + .timeout(20.seconds) + .tap(o => ZIO.when(o.isEmpty)(ZIO.debug("timeout waiting for messages!"))) + } + msgs <- ZIO.fromOption(messages).orElseFail(TimedOutWaitingForMessages) + } yield { + msgs._1 must + (beRecordWithKey("foo") and beRecordWithValue("bar")) and + (msgs._2 must (beRecordWithKey("foo") and beRecordWithValue("BAR"))) + } + } - queue <- Queue.unbounded[ConsumerRecord[String, String]] - handler = RecordHandler((cr: ConsumerRecord[String, String]) => ZIO.succeed(println(s"***** Consumed: $cr")) *> queue.offer(cr)) - .withDeserializers(StringSerde, StringSerde) - .ignore - cId <- clientId - config = configFor(kafka, group, topic).copy(clientId = cId) - record = ProducerRecord.tombstone(topic, Some("foo")) - message <- RecordConsumer.make(config, handler).flatMap { _ => - producer.produce(record, StringSerde, StringSerde) *> queue.take.timeoutFail(TimedOutWaitingForMessages)(10.seconds) - } - } yield { - message must (beRecordWithKey("foo") and beRecordWithValue(null)) + s"be able to resubscribe to same topics${parallelConsumerString(useParallelConsumer)}" in + ZIO.scoped { + for { + r <- getShared + TestResources(kafka, producer) = r + topic <- kafka.createRandomTopic(prefix = s"topic1-single1") + group <- randomGroup + + queue <- Queue.unbounded[ConsumerRecord[String, String]] + handler = RecordHandler((cr: ConsumerRecord[String, String]) => ZIO.succeed(println(s"***** Consumed: $cr")) *> queue.offer(cr)) + .withDeserializers(StringSerde, StringSerde) + .ignore + cId <- clientId + config = + configFor( + kafka, + group, + topic, + mutateEventLoop = _.copy(drainTimeout = 1.second, consumePartitionInParallel = useParallelConsumer, maxParallelism = 8) + ).copy(clientId = cId) + record = ProducerRecord(topic, "bar", Some("foo")) + + messages <- RecordConsumer.make(config, handler).flatMap { consumer => + producer.produce(record, StringSerde, StringSerde) *> sleep(3.seconds) *> + consumer.resubscribe(ConsumerSubscription.topics(topic)) *> + consumer.resubscribe(ConsumerSubscription.topics(topic)) *> + producer.produce(record.copy(topic = topic, value = Some("BAR")), StringSerde, StringSerde) *> + (queue.take zip queue.take) + .timeout(20.seconds) + .tap(o => ZIO.when(o.isEmpty)(ZIO.debug("timeout waiting for messages!"))) + } + msgs <- ZIO.fromOption(messages).orElseFail(TimedOutWaitingForMessages) + } yield { + msgs._1 must + (beRecordWithKey("foo") and beRecordWithValue("bar")) and + (msgs._2 must 
(beRecordWithKey("foo") and beRecordWithValue("BAR"))) + } } - } - "not lose any messages on a slow consumer (drives the message dispatcher to throttling)" in - ZIO.scoped { - for { - r <- getShared - TestResources(kafka, producer) = r - _ <- ZIO.debug(">>>> starting test: throttlingTest") - - topic <- kafka.createRandomTopic(partitions = 2, prefix = "core-not-lose") - group <- randomGroup - - messagesPerPartition = 500 // Exceeds the queue capacity - delayPartition1 <- Promise.make[Nothing, Unit] - handledPartition0 <- CountDownLatch.make(messagesPerPartition) - handledPartition1 <- CountDownLatch.make(messagesPerPartition) - handler = RecordHandler { record: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => - record.partition match { - case 0 => handledPartition0.countDown - case 1 => delayPartition1.await *> handledPartition1.countDown + s"produce consume null values (tombstones)${parallelConsumerString(useParallelConsumer)}" in + ZIO.scoped { + for { + r <- getShared + TestResources(kafka, producer) = r + topic <- kafka.createRandomTopic(prefix = s"topic1-single1") + group <- randomGroup + + queue <- Queue.unbounded[ConsumerRecord[String, String]] + handler = RecordHandler((cr: ConsumerRecord[String, String]) => ZIO.succeed(println(s"***** Consumed: $cr")) *> queue.offer(cr)) + .withDeserializers(StringSerde, StringSerde) + .ignore + cId <- clientId + config = + configFor(kafka, group, topic, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8)).copy(clientId = cId) + record = ProducerRecord.tombstone(topic, Some("foo")) + message <- RecordConsumer.make(config, handler).flatMap { _ => + producer.produce(record, StringSerde, StringSerde) *> queue.take.timeoutFail(TimedOutWaitingForMessages)(10.seconds) + } + } yield { + message must (beRecordWithKey("foo") and beRecordWithValue(null)) + } + } + + s"not lose any messages on a slow consumer (drives the message dispatcher to throttling)${parallelConsumerString(useParallelConsumer)}" in + ZIO.scoped { + for { + r <- getShared + TestResources(kafka, producer) = r + _ <- ZIO.debug(">>>> starting test: throttlingTest") + + topic <- kafka.createRandomTopic(partitions = 2, prefix = "core-not-lose") + group <- randomGroup + + messagesPerPartition = 500 // Exceeds the queue capacity + delayPartition1 <- Promise.make[Nothing, Unit] + handledPartition0 <- CountDownLatch.make(messagesPerPartition) + handledPartition1 <- CountDownLatch.make(messagesPerPartition) + handler = RecordHandler { record: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => + record.partition match { + case 0 => handledPartition0.countDown + case 1 => delayPartition1.await *> handledPartition1.countDown + } } - } - - test <- RecordConsumer.make(configFor(kafka, group, topic), handler).flatMap { _ => - val recordPartition0 = ProducerRecord(topic, Chunk.empty, partition = Some(0)) - val recordPartition1 = ProducerRecord(topic, Chunk.empty, partition = Some(1)) - for { - _ <- ZIO.foreachParDiscard(0 until messagesPerPartition) { _ => - producer.produce(recordPartition0) zipPar producer.produce(recordPartition1) - } - handledAllFromPartition0 <- handledPartition0.await.timeout(10.seconds) - _ <- delayPartition1.succeed(()) - handledAllFromPartition1 <- handledPartition1.await.timeout(10.seconds) - } yield { - (handledAllFromPartition0 must beSome) and (handledAllFromPartition1 must beSome) - } + + test <- + RecordConsumer + .make(configFor(kafka, group, topic, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 
8)), handler) + .flatMap { _ => + val recordPartition0 = ProducerRecord(topic, Chunk.empty, partition = Some(0)) + val recordPartition1 = ProducerRecord(topic, Chunk.empty, partition = Some(1)) + for { + _ <- ZIO.foreachParDiscard(0 until messagesPerPartition) { _ => + producer.produce(recordPartition0) zipPar producer.produce(recordPartition1) + } + handledAllFromPartition0 <- handledPartition0.await.timeout(10.seconds) + _ <- delayPartition1.succeed(()) + handledAllFromPartition1 <- handledPartition1.await.timeout(10.seconds) + } yield { + (handledAllFromPartition0 must beSome) and (handledAllFromPartition1 must beSome) } - } yield test - } + } + } yield test + } - "delay resuming a paused partition" in - ZIO.scoped { - for { - r <- getShared - TestResources(kafka, producer) = r - _ <- ZIO.debug(">>>> starting test: delay resuming a paused partition") - - topic <- kafka.createRandomTopic(partitions = 1, prefix = "core-not-lose") - group <- randomGroup - - messagesPerPartition = 500 // Exceeds the queue capacity - delayPartition <- Promise.make[Nothing, Unit] - handledPartition <- CountDownLatch.make(messagesPerPartition) - handler = RecordHandler { _: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => delayPartition.await *> handledPartition.countDown } - start <- Clock.currentTime(TimeUnit.MILLISECONDS) - test <- RecordConsumer - .make(configFor(kafka, group, topic, mutateEventLoop = _.copy(delayResumeOfPausedPartition = 3000)), handler) - .flatMap { _ => - val recordPartition = ProducerRecord(topic, Chunk.empty, partition = Some(0)) - for { - _ <- ZIO.foreachParDiscard(0 until messagesPerPartition) { _ => producer.produce(recordPartition) } - _ <- delayPartition.succeed(()).delay(1.seconds).fork - handledAllFromPartition <- handledPartition.await.timeout(10.seconds) - end <- Clock.currentTime(TimeUnit.MILLISECONDS) - - } yield { - (handledAllFromPartition aka "handledAllFromPartition" must beSome) and - (end - start aka "complete handling duration" must beGreaterThan(3000L)) + s"delay resuming a paused partition${parallelConsumerString(useParallelConsumer)}" in + ZIO.scoped { + for { + r <- getShared + TestResources(kafka, producer) = r + _ <- ZIO.debug(">>>> starting test: delay resuming a paused partition") + + topic <- kafka.createRandomTopic(partitions = 1, prefix = "core-not-lose") + group <- randomGroup + + messagesPerPartition = 500 // Exceeds the queue capacity + delayPartition <- Promise.make[Nothing, Unit] + handledPartition <- CountDownLatch.make(messagesPerPartition) + handler = RecordHandler { _: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => delayPartition.await *> handledPartition.countDown } + start <- Clock.currentTime(TimeUnit.MILLISECONDS) + test <- RecordConsumer + .make( + configFor( + kafka, + group, + topic, + mutateEventLoop = _.copy(delayResumeOfPausedPartition = 3000, consumePartitionInParallel = useParallelConsumer, maxParallelism = 8) + ), + handler + ) + .flatMap { _ => + val recordPartition = ProducerRecord(topic, Chunk.empty, partition = Some(0)) + for { + _ <- ZIO.foreachParDiscard(0 until messagesPerPartition) { _ => producer.produce(recordPartition) } + _ <- delayPartition.succeed(()).delay(1.seconds).fork + handledAllFromPartition <- handledPartition.await.timeout(10.seconds) + end <- Clock.currentTime(TimeUnit.MILLISECONDS) + + } yield { + (handledAllFromPartition aka "handledAllFromPartition" must beSome) and + (end - start aka "complete handling duration" must beGreaterThan(3000L)) + } } + } yield test + } + + s"pause and resume 
consumer${parallelConsumerString(useParallelConsumer)}" in + ZIO.scoped { + for { + r <- getShared + TestResources(kafka, producer) = r + _ <- ZIO.debug(">>>> starting test: pauseResumeTest") + + topic <- kafka.createRandomTopic(prefix = "core-pause-resume") + group <- randomGroup + + numberOfMessages = 32 + someMessages = 16 + restOfMessages = numberOfMessages - someMessages + handledSomeMessages <- CountDownLatch.make(someMessages) + handledAllMessages <- CountDownLatch.make(numberOfMessages) + handleCounter <- Ref.make[Int](0) + handler = RecordHandler { _: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => + handleCounter.update(_ + 1) *> handledSomeMessages.countDown zipParRight handledAllMessages.countDown } - } yield test - } - "pause and resume consumer" in - ZIO.scoped { - for { - r <- getShared - TestResources(kafka, producer) = r - _ <- ZIO.debug(">>>> starting test: pauseResumeTest") - - topic <- kafka.createRandomTopic(prefix = "core-pause-resume") - group <- randomGroup - - numberOfMessages = 32 - someMessages = 16 - restOfMessages = numberOfMessages - someMessages - handledSomeMessages <- CountDownLatch.make(someMessages) - handledAllMessages <- CountDownLatch.make(numberOfMessages) - handleCounter <- Ref.make[Int](0) - handler = RecordHandler { _: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => - handleCounter.update(_ + 1) *> handledSomeMessages.countDown zipParRight handledAllMessages.countDown - } - - test <- RecordConsumer.make(configFor(kafka, group, topic).copy(offsetReset = OffsetReset.Earliest), handler).flatMap { consumer => - val record = ProducerRecord(topic, Chunk.empty) - for { - _ <- ZIO.foreachParDiscard(0 until someMessages)(_ => producer.produce(record)) - _ <- handledSomeMessages.await - _ <- consumer.pause - _ <- ZIO.foreachParDiscard(0 until restOfMessages)(_ => producer.produce(record)) - a <- handledAllMessages.await.timeout(5.seconds) - handledAfterPause <- handleCounter.get - _ <- consumer.resume - b <- handledAllMessages.await.timeout(5.seconds) - } yield { - (handledAfterPause === someMessages) and (a must beNone) and (b must beSome) - } + test <- + RecordConsumer + .make( + configFor(kafka, group, topic, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8)) + .copy(offsetReset = OffsetReset.Earliest), + handler + ) + .flatMap { consumer => + val record = ProducerRecord(topic, Chunk.empty) + for { + _ <- ZIO.foreachParDiscard(0 until someMessages)(_ => producer.produce(record)) + _ <- handledSomeMessages.await + _ <- consumer.pause + _ <- ZIO.foreachParDiscard(0 until restOfMessages)(_ => producer.produce(record)) + a <- handledAllMessages.await.timeout(5.seconds) + handledAfterPause <- handleCounter.get + _ <- consumer.resume + b <- handledAllMessages.await.timeout(5.seconds) + } yield { + (handledAfterPause === someMessages) and (a must beNone) and (b must beSome) } - } yield test - } - - "wait until queues are drained" in { - for { - r <- getShared - TestResources(kafka, producer) = r - _ <- ZIO.debug(">>>> starting test: gracefulShutdownTest") - topic <- kafka.createRandomTopic(prefix = "core-wait-until") - group <- randomGroup - - ref <- Ref.make(0) - startedHandling <- Promise.make[Nothing, Unit] - handler: Handler[Any] = RecordHandler { _: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => - startedHandling.succeed(()) *> sleep(5.seconds) *> ref.update(_ + 1) - } - - _ <- ZIO.scoped(RecordConsumer.make(configFor(kafka, group, topic), handler).flatMap { _ => - producer.produce(ProducerRecord(topic, Chunk.empty)) *> 
startedHandling.await - }) - - handled <- ref.get - } yield { - handled must equalTo(1) - } - } + } + } yield test + } - "consumer from earliest offset" in - ZIO.scoped { + s"wait until queues are drained${parallelConsumerString(useParallelConsumer)}" in { for { r <- getShared TestResources(kafka, producer) = r - _ <- ZIO.debug(">>>> starting test: earliestTest") - topic <- kafka.createRandomTopic(prefix = "core-from-earliest") + _ <- ZIO.debug(">>>> starting test: gracefulShutdownTest") + topic <- kafka.createRandomTopic(prefix = "core-wait-until") group <- randomGroup - queue <- Queue.unbounded[ConsumerRecord[String, String]] - handler = RecordHandler(queue.offer(_: ConsumerRecord[String, String])) - .withDeserializers(StringSerde, StringSerde) - .ignore + ref <- Ref.make(0) + startedHandling <- Promise.make[Nothing, Unit] + handler: Handler[Any] = RecordHandler { _: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => + startedHandling.succeed(()) *> sleep(5.seconds) *> ref.update(_ + 1) + } - record = ProducerRecord(topic, "bar", Some("foo")) - _ <- producer.produce(record, StringSerde, StringSerde) + _ <- ZIO.scoped( + RecordConsumer + .make(configFor(kafka, group, topic, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8)), handler) + .flatMap { _ => producer.produce(ProducerRecord(topic, Chunk.empty)) *> startedHandling.await } + ) - message <- RecordConsumer - .make(configFor(kafka, group, topic).copy(offsetReset = Earliest), handler) - .flatMap { _ => queue.take } - .timeout(10.seconds) + handled <- ref.get } yield { - message.get must (beRecordWithKey("foo") and beRecordWithValue("bar")) + handled must equalTo(1) } } - "not lose messages while throttling after rebalance" in - ZIO.scoped { - for { - _ <- ZIO.debug(">>>> starting test: throttleWhileRebalancingTest") - r <- getShared - TestResources(kafka, producer) = r - partitions = 30 - topic <- kafka.createRandomTopic(partitions, prefix = "core-not-lose-while-throttling") - group <- randomGroup - probe <- Ref.make(Map.empty[Partition, Seq[Offset]]) - messagesPerPartition = 250 - handler = RecordHandler { record: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => - sleep(10.millis) *> - probe - .getAndUpdate(curr => curr + (record.partition -> (curr.getOrElse(record.partition, Nil) :+ record.offset))) - .flatMap(map => - ZIO.when(map.getOrElse(record.partition, Nil).contains(record.offset))( - ZIO.debug(OffsetWasAlreadyProcessed(record.partition, record.offset).toString) - ) - ) - } - createConsumerTask = (i: Int) => makeConsumer(kafka, topic, group, handler, i) - test <- createConsumerTask(0).flatMap { _ => - val record = ProducerRecord(topic, Chunk.empty, partition = Some(0)) - for { - env <- ZIO.environment[Env] - _ <- ZIO.foreachParDiscard(0 until partitions) { p => - ZIO.foreachDiscard(0 until messagesPerPartition)(_ => producer.produceAsync(record.copy(partition = Some(p)))) - } - _ <- createConsumerTask(1).provideEnvironment(env.add(Scope.global)).forkScoped // rebalance - _ <- createConsumerTask(2).provideEnvironment(env.add(Scope.global)).forkScoped // rebalance // rebalance - expected = (0 until partitions).map(p => (p, 0L until messagesPerPartition)).toMap - _ <- eventuallyTimeoutFail(probe.get)(m => - m.mapValues(_.lastOption).values.toSet == Set(Option(messagesPerPartition - 1L)) && m.size == partitions - )(120.seconds) - finalResult <- probe.get - _ <- ZIO.debug(finalResult.mapValues(_.size).mkString(",")) - } yield finalResult === expected - } - } yield test - } - - "subscribe to a 
pattern" in - ZIO.scoped { - for { - _ <- ZIO.debug(">>>> starting test: patternTest") - topic1 = "core-subscribe-pattern1-topic" - topic2 = "core-subscribe-pattern2-topic" - r <- getShared - TestResources(kafka, producer) = r - _ <- kafka.createTopics(Seq(topic1, topic2).map(t => TopicConfig(t, 1, 1, delete)): _*) - group <- randomGroup - probe <- Ref.make(Seq.empty[(Topic, Offset)]) - handler = RecordHandler { record: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => probe.update(_ :+ (record.topic, record.offset)) } - - _ <- makeConsumer(kafka, compile("core-subscribe-pattern1.*"), group, handler, 0).flatMap { consumer => - val record = ProducerRecord(topic1, Chunk.empty, key = Option(Chunk.empty)) + s"consumer from earliest offset${parallelConsumerString(useParallelConsumer)}" in + ZIO.scoped { + for { + r <- getShared + TestResources(kafka, producer) = r + _ <- ZIO.debug(">>>> starting test: earliestTest") + topic <- kafka.createRandomTopic(prefix = "core-from-earliest") + group <- randomGroup + + queue <- Queue.unbounded[ConsumerRecord[String, String]] + handler = RecordHandler(queue.offer(_: ConsumerRecord[String, String])) + .withDeserializers(StringSerde, StringSerde) + .ignore + + record = ProducerRecord(topic, "bar", Some("foo")) + _ <- producer.produce(record, StringSerde, StringSerde) + + message <- RecordConsumer + .make( + configFor(kafka, group, topic, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8)) + .copy(offsetReset = Earliest), + handler + ) + .flatMap { _ => queue.take } + .timeout(10.seconds) + } yield { + message.get must (beRecordWithKey("foo") and beRecordWithValue("bar")) + } + } - producer.produce(record) *> eventuallyZ(probe.get)(_ == (topic1, 0) :: Nil) *> - consumer.resubscribe(TopicPattern(compile("core-subscribe-pattern2.*"))) *> - producer.produce(record.copy(topic = topic2)) *> eventuallyZ(probe.get)(_ == (topic1, 0) :: (topic2, 0) :: Nil) - } - } yield ok - } + s"not lose messages while throttling after rebalance${parallelConsumerString(useParallelConsumer)}" in + ZIO.scoped { + for { + _ <- ZIO.debug(">>>> starting test: throttleWhileRebalancingTest") + r <- getShared + TestResources(kafka, producer) = r + partitions = 30 + topic <- kafka.createRandomTopic(partitions, prefix = "core-not-lose-while-throttling") + group <- randomGroup + probe <- Ref.make(Map.empty[Partition, Seq[Offset]]) + messagesPerPartition = 250 + handler = RecordHandler { record: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => + sleep(10.millis) *> + probe + .getAndUpdate(curr => curr + (record.partition -> (curr.getOrElse(record.partition, Nil) :+ record.offset))) + .flatMap(map => + ZIO.when(map.getOrElse(record.partition, Nil).contains(record.offset))( + ZIO.debug(OffsetWasAlreadyProcessed(record.partition, record.offset).toString) + ) + ) + } + createConsumerTask = + (i: Int) => + makeConsumer(kafka, topic, group, handler, i, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8)) + test <- createConsumerTask(0).flatMap { _ => + val record = ProducerRecord(topic, Chunk.empty, partition = Some(0)) + for { + env <- ZIO.environment[Env] + _ <- ZIO.foreachParDiscard(0 until partitions) { p => + ZIO.foreachDiscard(0 until messagesPerPartition)(_ => producer.produceAsync(record.copy(partition = Some(p)))) + } + _ <- createConsumerTask(1).provideEnvironment(env.add(Scope.global)).forkScoped // rebalance + _ <- createConsumerTask(2).provideEnvironment(env.add(Scope.global)).forkScoped // rebalance // rebalance 
+ expected = (0 until partitions).map(p => (p, 0L until messagesPerPartition)).toMap + _ <- eventuallyTimeoutFail(probe.get)(m => + m.mapValues(_.lastOption).values.toSet == Set(Option(messagesPerPartition - 1L)) && m.size == partitions + )(120.seconds) + finalResult <- probe.get + _ <- ZIO.debug(finalResult.mapValues(_.size).mkString(",")) + } yield finalResult === expected + } + } yield test + } - "consumer from a new partition is interrupted before commit (offsetReset = Latest)" in - ZIO.scoped { - for { - r <- getShared - TestResources(kafka, producer) = r - topic <- kafka.createRandomTopic(1) - group <- randomGroup - handlingStarted <- Promise.make[Nothing, Unit] - hangForever <- Promise.make[Nothing, Unit] - hangingHandler = RecordHandler { record: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => - handlingStarted.complete(ZIO.unit) *> hangForever.await - } - record <- aProducerRecord(topic) - recordValue = record.value.get - _ <- makeConsumer(kafka, topic, group, hangingHandler, 0, _.copy(drainTimeout = 200.millis)) - .flatMap { consumer => - producer.produce(record, StringSerde, StringSerde) *> handlingStarted.await *> - // unsubscribe to make sure partitions are released - consumer.resubscribe(ConsumerSubscription.topics()) + s"consumer from a new partition is interrupted before commit (offsetReset = Latest)${parallelConsumerString(useParallelConsumer)}" in + ZIO.scoped { + for { + r <- getShared + TestResources(kafka, producer) = r + topic <- kafka.createRandomTopic(1) + group <- randomGroup + handlingStarted <- Promise.make[Nothing, Unit] + hangForever <- Promise.make[Nothing, Unit] + hangingHandler = RecordHandler { record: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => + handlingStarted.complete(ZIO.unit) *> hangForever.await } - .disconnect - .timeoutFail(new TimeoutException("timed out waiting for consumer 0"))(10.seconds) - consumed <- AwaitableRef.make(Seq.empty[String]) - handler = RecordHandler { record: ConsumerRecord[String, String] => consumed.update(_ :+ record.value) } - .withDeserializers(StringSerde, StringSerde) - - consumedValues <- makeConsumer(kafka, topic, group, handler, 1, modifyConfig = _.copy(offsetReset = Latest)) - .flatMap { _ => consumed.await(_.nonEmpty, 5.seconds) } - .disconnect - .timeoutFail(new TimeoutException("timed out waiting for consumer 1"))(10.seconds) - } yield { - consumedValues must contain(recordValue) + record <- aProducerRecord(topic) + recordValue = record.value.get + _ <- + makeConsumer( + kafka, + topic, + group, + hangingHandler, + 0, + mutateEventLoop = _.copy(drainTimeout = 200.millis, consumePartitionInParallel = useParallelConsumer, maxParallelism = 8) + ) + .flatMap { consumer => + producer.produce(record, StringSerde, StringSerde) *> handlingStarted.await *> + // unsubscribe to make sure partitions are released + consumer.resubscribe(ConsumerSubscription.topics()) + } + .disconnect + .timeoutFail(new TimeoutException("timed out waiting for consumer 0"))(10.seconds) + consumed <- AwaitableRef.make(Seq.empty[String]) + handler = RecordHandler { record: ConsumerRecord[String, String] => consumed.update(_ :+ record.value) } + .withDeserializers(StringSerde, StringSerde) + + consumedValues <- makeConsumer( + kafka, + topic, + group, + handler, + 1, + mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8), + modifyConfig = _.copy(offsetReset = Latest) + ) + .flatMap { _ => consumed.await(_.nonEmpty, 5.seconds) } + .disconnect + .timeoutFail(new TimeoutException("timed out waiting for consumer 
1"))(10.seconds) + } yield { + consumedValues must contain(recordValue) + } } - } - "block until current tasks complete" in - ZIO.scoped { - for { - r <- getShared - TestResources(kafka, producer) = r - topic <- kafka.createRandomTopic(prefix = "block-until") - group <- randomGroup + s"block until current tasks complete${parallelConsumerString(useParallelConsumer)}" in + ZIO.scoped { + for { + r <- getShared + TestResources(kafka, producer) = r + topic <- kafka.createRandomTopic(prefix = "block-until") + group <- randomGroup + + waitForTasksDuration <- TRef.make[Duration](0.millis).commit + innerGate <- Gate.make(initiallyAllow = false) + outerGate <- Gate.make(initiallyAllow = false) + handler = RecordHandler((_: ConsumerRecord[String, String]) => outerGate.toggle(true) *> innerGate.await()) + .withDeserializers(StringSerde, StringSerde) + .ignore + config = configFor(kafka, group, topic, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8)) + record = ProducerRecord(topic, "bar", Some("foo")) + + test <- RecordConsumer.make(config, handler).flatMap { consumer => + for { + _ <- ZIO.foreach((0 until 100).toSet)(_ => producer.produce(record, StringSerde, StringSerde)) *> + outerGate.await() /* handler waiting on innerGate now */ *> + consumer.waitForCurrentRecordsCompletion.timed + .map(_._1) /* after 'delay' the consumer's innerGate opens and handler completes */ + .flatMap(d => waitForTasksDuration.set(d).commit) + .fork + delay = 1.second + tasksDurationFiber <- waitForTasksDuration.get.tap(d => STM.check(d > 0.millis)).commit.fork + _ <- innerGate.toggle(true).delay(delay) /*and now handler will complete */ + tasksDuration <- tasksDurationFiber.join + } yield tasksDuration must between(delay * 0.9, delay * 3) + } + } yield test + } - waitForTasksDuration <- TRef.make[Duration](0.millis).commit - innerGate <- Gate.make(initiallyAllow = false) - outerGate <- Gate.make(initiallyAllow = false) - handler = RecordHandler((_: ConsumerRecord[String, String]) => outerGate.toggle(true) *> innerGate.await()) - .withDeserializers(StringSerde, StringSerde) - .ignore - config = configFor(kafka, group, topic) - record = ProducerRecord(topic, "bar", Some("foo")) - - test <- RecordConsumer.make(config, handler).flatMap { consumer => - for { - _ <- ZIO.foreach((0 until 100).toSet)(_ => producer.produce(record, StringSerde, StringSerde)) *> - outerGate.await() /* handler waiting on innerGate now */ *> - consumer.waitForCurrentRecordsCompletion.timed - .map(_._1) /* after 'delay' the consumer's innerGate opens and handler completes */ - .flatMap(d => waitForTasksDuration.set(d).commit) - .fork - delay = 1.second - tasksDurationFiber <- waitForTasksDuration.get.tap(d => STM.check(d > 0.millis)).commit.fork - _ <- innerGate.toggle(true).delay(delay) /*and now handler will complete */ - tasksDuration <- tasksDurationFiber.join - } yield tasksDuration must between(delay * 0.9, delay * 3) - } - } yield test - } + s"rewind positions on poll failure${parallelConsumerString(useParallelConsumer)}" in + ZIO.scoped { + type BinaryRecord = ConsumerRecord[Chunk[Byte], Chunk[Byte]] + type BinaryDecryptor = Decryptor[Any, RuntimeException, Chunk[Byte], Chunk[Byte]] + for { + r <- getShared + TestResources(kafka, producer) = r + topic <- kafka.createRandomTopic(prefix = "poll-fail") + group <- randomGroup + failToDecrypt <- Ref.make(false) + messages <- AwaitableRef.make[Seq[ConsumerRecord[String, String]]](Nil) + handler = RecordHandler((cr: ConsumerRecord[String, String]) => 
ZIO.debug(s"***** Consumed: $cr") *> messages.update(_ :+ cr)) + .withDeserializers(StringSerde, StringSerde) + .ignore + cId <- clientId + decryptor: BinaryDecryptor = new BinaryDecryptor { + override def decrypt(record: ConsumerRecord[Chunk[Byte], Chunk[Byte]])( + implicit trace: Trace + ): ZIO[Any, RuntimeException, ConsumerRecord[Chunk[Byte], Chunk[Byte]]] = + ZIO.whenZIO(failToDecrypt.get)(ZIO.fail(new RuntimeException)).map(_ => record) + } - "rewind positions on poll failure" in - ZIO.scoped { - type BinaryRecord = ConsumerRecord[Chunk[Byte], Chunk[Byte]] - type BinaryDecryptor = Decryptor[Any, RuntimeException, Chunk[Byte], Chunk[Byte]] - for { - r <- getShared - TestResources(kafka, producer) = r - topic <- kafka.createRandomTopic(prefix = "poll-fail") - group <- randomGroup - failToDecrypt <- Ref.make(false) - messages <- AwaitableRef.make[Seq[ConsumerRecord[String, String]]](Nil) - handler = RecordHandler((cr: ConsumerRecord[String, String]) => ZIO.debug(s"***** Consumed: $cr") *> messages.update(_ :+ cr)) - .withDeserializers(StringSerde, StringSerde) - .ignore - cId <- clientId - decryptor: BinaryDecryptor = new BinaryDecryptor { - override def decrypt(record: ConsumerRecord[Chunk[Byte], Chunk[Byte]])( - implicit trace: Trace - ): ZIO[Any, RuntimeException, ConsumerRecord[Chunk[Byte], Chunk[Byte]]] = - ZIO.whenZIO(failToDecrypt.get)(ZIO.fail(new RuntimeException)).map(_ => record) - } - - pollFailedMetrics <- TestMetrics.queue - - config = configFor(kafka, group, topic).copy(clientId = cId, decryptor = decryptor) - aRecord = (i: Int) => ProducerRecord(topic, s"payload-$i", Some(s"key-$i")) - _ <- RecordConsumer.make(config, handler).flatMap { consumer => - val Seq(rec1, rec2) = (1 to 2) map aRecord - consumer.resubscribe(ConsumerSubscription.topics(topic)) *> producer.produce(rec1, StringSerde, StringSerde) *> - messages.await(_.exists(_.value == rec1.value.get)) *> failToDecrypt.set(true) *> - producer.produce(rec2, StringSerde, StringSerde) *> next[PollingFailed](pollFailedMetrics) *> failToDecrypt.set(false) *> - messages.await(_.exists(_.value == rec2.value.get), 5.seconds) - } - } yield ok - } + pollFailedMetrics <- TestMetrics.queue + + config = configFor(kafka, group, topic, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8)) + .copy(clientId = cId, decryptor = decryptor) + aRecord = (i: Int) => ProducerRecord(topic, s"payload-$i", Some(s"key-$i")) + _ <- RecordConsumer.make(config, handler).flatMap { consumer => + val Seq(rec1, rec2) = (1 to 2) map aRecord + consumer.resubscribe(ConsumerSubscription.topics(topic)) *> producer.produce(rec1, StringSerde, StringSerde) *> + messages.await(_.exists(_.value == rec1.value.get)) *> failToDecrypt.set(true) *> + producer.produce(rec2, StringSerde, StringSerde) *> next[PollingFailed](pollFailedMetrics) *> failToDecrypt.set(false) *> + messages.await(_.exists(_.value == rec2.value.get), 5.seconds) + } + } yield ok + } + } def next[A <: GreyhoundMetric](queue: Queue[GreyhoundMetric]) = queue.take.repeatUntil(_.isInstanceOf[A]) @@ -475,9 +553,13 @@ class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { pattern: Pattern, group: String, handler: RecordHandler[Any, Nothing, Chunk[Byte], Chunk[Byte]], - i: Int + i: Int, + useParallelConsumer: Boolean ) = - RecordConsumer.make(configFor(kafka, group, pattern).copy(clientId = s"client-$i", offsetReset = OffsetReset.Earliest), handler) + RecordConsumer.make( + configFor(kafka, group, pattern, useParallelConsumer).copy(clientId = 
s"client-$i", offsetReset = OffsetReset.Earliest), + handler + ) private def configFor(kafka: ManagedKafka, group: Group, topic: Topic, mutateEventLoop: EventLoopConfig => EventLoopConfig = identity) = RecordConsumerConfig( @@ -489,8 +571,19 @@ class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { eventLoopConfig = mutateEventLoop(EventLoopConfig.Default) ) - private def configFor(kafka: ManagedKafka, group: Group, pattern: Pattern) = - RecordConsumerConfig(kafka.bootstrapServers, group, TopicPattern(pattern), extraProperties = fastConsumerMetadataFetching) + private def configFor( + kafka: ManagedKafka, + group: Group, + pattern: Pattern, + useParallelConsumer: Boolean + ) = + RecordConsumerConfig( + kafka.bootstrapServers, + group, + TopicPattern(pattern), + extraProperties = fastConsumerMetadataFetching, + eventLoopConfig = EventLoopConfig.Default.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8) + ) private def fastConsumerMetadataFetching = Map("metadata.max.age.ms" -> "0") @@ -500,6 +593,9 @@ class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { payload <- randomId } yield ProducerRecord(topic, payload, Some(key)) + private def parallelConsumerString(isParallel: Boolean) = + if (isParallel) " - using parallel consumer" else "" + } object TimedOutWaitingForMessages extends RuntimeException diff --git a/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/BUILD.bazel b/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/BUILD.bazel new file mode 100644 index 00000000..f0c1bbb8 --- /dev/null +++ b/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/BUILD.bazel @@ -0,0 +1,27 @@ +package(default_visibility = ["//visibility:public"]) + +sources() + +specs2_ite2e_test( + name = "parallel", + srcs = [ + ":sources", + ], + deps = [ + "//core/src/it/resources", + "//core/src/it/scala/com/wixpress/dst/greyhound/testenv", + "//core/src/it/scala/com/wixpress/dst/greyhound/testkit", + "//core/src/main/scala/com/wixpress/dst/greyhound/core", + "//core/src/main/scala/com/wixpress/dst/greyhound/core/consumer", + "//core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/domain", + "//core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry", + "//core/src/main/scala/com/wixpress/dst/greyhound/core/metrics", + "//core/src/main/scala/com/wixpress/dst/greyhound/core/producer", + "//core/src/main/scala/com/wixpress/dst/greyhound/core/zioutils", + "//core/src/test/scala/com/wixpress/dst/greyhound/core/testkit", + "@dev_zio_izumi_reflect_2_12", + "@dev_zio_zio_2_12", + "@dev_zio_zio_managed_2_12", + "@org_apache_kafka_kafka_clients", + ], +) diff --git a/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala b/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala new file mode 100644 index 00000000..c5986d27 --- /dev/null +++ b/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala @@ -0,0 +1,325 @@ +package com.wixpress.dst.greyhound.core.parallel +import com.wixpress.dst.greyhound.core.Serdes.StringSerde +import com.wixpress.dst.greyhound.core.consumer.ConsumerMetric.SkippedGapsOnInitialization +import com.wixpress.dst.greyhound.core.consumer.domain.ConsumerSubscription.Topics +import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, ConsumerSubscription, RecordHandler} +import com.wixpress.dst.greyhound.core.consumer.{EventLoopConfig, RebalanceListener, RecordConsumer, RecordConsumerConfig} +import 
com.wixpress.dst.greyhound.core.producer.{ProducerRecord, ReportingProducer} +import com.wixpress.dst.greyhound.core.testkit.RecordMatchers.{beRecordWithKey, beRecordWithValue, beRecordsWithKeysAndValues} +import com.wixpress.dst.greyhound.core.testkit.{eventuallyZ, BaseTestWithSharedEnv, TestMetrics} +import com.wixpress.dst.greyhound.core.zioutils.CountDownLatch +import com.wixpress.dst.greyhound.core.{Group, Topic, TopicPartition} +import com.wixpress.dst.greyhound.testenv.ITEnv +import com.wixpress.dst.greyhound.testenv.ITEnv.{clientId, partitions, randomGroup, randomId, Env, ManagedKafkaOps, TestResources} +import com.wixpress.dst.greyhound.testkit.ManagedKafka +import zio.Clock.sleep +import zio.{Queue, ZIO, _} + +class ParallelConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { + sequential + + override def env = ITEnv.ManagedEnv + + override def sharedEnv = ITEnv.testResources() + + "consume messages correctly after rebalance" in { + ZIO.scoped { + for { + r <- getShared + TestResources(kafka, producer) = r + topic1 <- kafka.createRandomTopic(prefix = "topic1") + topic2 <- kafka.createRandomTopic(prefix = "topic2") + group <- randomGroup + + queue <- Queue.unbounded[ConsumerRecord[String, String]] + handler = RecordHandler((cr: ConsumerRecord[String, String]) => queue.offer(cr)).withDeserializers(StringSerde, StringSerde) + cId <- clientId + config = parallelConsumerConfig(kafka, topic1, group, cId) + records1 = producerRecords(topic1, "1", partitions, 10) + records2 = producerRecords(topic1, "2", partitions, 10) + numRecordsExpected = records1.size + records2.size + messagesOption <- for { + consumer <- RecordConsumer.make(config, handler) + _ <- produceRecords(producer, records1) + _ <- sleep(5.seconds) + _ <- consumer.resubscribe(ConsumerSubscription.topics(topic1, topic2)) // trigger rebalance + _ <- sleep(500.millis) + _ <- produceRecords(producer, records2) + maybeMessages <- queue + .takeBetween(numRecordsExpected, numRecordsExpected) + .timeout(60.seconds) + .tap(o => ZIO.when(o.isEmpty)(Console.printLine("timeout waiting for messages"))) + } yield maybeMessages + messages <- ZIO.fromOption(messagesOption).orElseFail(TimedOutWaitingForMessages) + } yield { + messages must beRecordsWithKeysAndValues(records1 ++ records2) + } + } + } + + "consume messages exactly once when processing following multiple consecutive polls" in { + ZIO.scoped { + for { + r <- getShared + TestResources(kafka, producer) = r + topic <- kafka.createRandomTopic() + group <- randomGroup + queue <- Queue.unbounded[ConsumerRecord[String, String]] + handlerWithSleep = + RecordHandler((cr: ConsumerRecord[String, String]) => { + (if (cr.partition == cr.offset) ZIO.sleep(2.seconds) // sleep to simulate long processing time and go through multiple polls + else ZIO.unit) *> queue.offer(cr) + }) + .withDeserializers(StringSerde, StringSerde) + cId <- clientId + config = parallelConsumerConfig(kafka, topic, group, cId) + records = producerRecords(topic, "1", partitions, 5) + messagesOption <- RecordConsumer.make(config, handlerWithSleep).flatMap { consumer => + produceRecords(producer, records) *> ZIO.sleep(3.seconds) *> + queue + .takeBetween(records.size, records.size) + .timeout(60.seconds) + .tap(o => ZIO.when(o.isEmpty)(Console.printLine("timeout waiting for messages!"))) + } + messages <- ZIO.fromOption(messagesOption).orElseFail(TimedOutWaitingForMessages) + } yield { + messages must + allOf( + records.map(r => beRecordWithKey(r.key.get) and beRecordWithValue(r.value.get)): _* + ) + } + } + } + 
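The tests in this file enable the new parallel mode through the parallelConsumerConfig helper defined near the bottom of the file. A minimal sketch of that wiring, assuming only the RecordConsumerConfig shape and the EventLoopConfig fields introduced in this patch (the bootstrap servers, group, topic and handler values are hypothetical placeholders):

import com.wixpress.dst.greyhound.core.consumer.domain.ConsumerSubscription.Topics
import com.wixpress.dst.greyhound.core.consumer.{EventLoopConfig, RecordConsumer, RecordConsumerConfig}

// Hypothetical placeholder values, for illustration only:
val bootstrapServers = "localhost:9092"
val group            = "my-group"
val topic            = "my-topic"

val parallelConfig = RecordConsumerConfig(
  bootstrapServers = bootstrapServers,
  group = group,
  initialSubscription = Topics(Set(topic)),
  eventLoopConfig = EventLoopConfig.Default.copy(
    consumePartitionInParallel = true, // flag added in this patch: records are submitted to the dispatcher as per-partition batches
    maxParallelism = 8                 // cap on concurrent key-groups handled per worker, as used in the tests above
  )
)
// RecordConsumer.make(parallelConfig, handler) is then used exactly like the sequential consumer;
// the dispatcher groups each polled batch by record key and handles the groups concurrently.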
+ "consume gaps after rebalance and skip already-consumed records" in { + ZIO.scoped { + for { + r <- getShared + TestResources(kafka, producer) = r + topic <- kafka.createRandomTopic() + group <- randomGroup + cId <- clientId + partition = 0 + allMessages = 10 + fastMessages = allMessages - 1 + drainTimeout = 5.seconds + + keyWithSlowHandling = "slow-key" + numProcessedMessges <- Ref.make[Int](0) + fastMessagesLatch <- CountDownLatch.make(fastMessages) + + randomKeys <- ZIO.foreach(1 to fastMessages)(i => randomKey(i.toString)).map(_.toSeq) + + fastRecords = randomKeys.map { key => recordWithKey(topic, key, partition) } + slowRecord = recordWithKey(topic, keyWithSlowHandling, partition) + + finishRebalance <- Promise.make[Nothing, Unit] + + // handler that sleeps only on the slow key + handler = RecordHandler { cr: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => + (cr.key match { + case Some(k) if k == Chunk.fromArray(keyWithSlowHandling.getBytes) => + // make sure the handler doesn't finish before the rebalance is done, including drain timeout + finishRebalance.await *> ZIO.sleep(drainTimeout + 1.second) + case _ => fastMessagesLatch.countDown + }) *> numProcessedMessges.update(_ + 1) + } + _ <- + for { + consumer <- makeParallelConsumer(handler, kafka, topic, group, cId, drainTimeout = drainTimeout, startPaused = true) + _ <- produceRecords(producer, Seq(slowRecord)) + _ <- produceRecords(producer, fastRecords) + // produce is done synchronously to make sure all records are produced before consumer starts, so all records are polled at once + _ <- consumer.resume + _ <- fastMessagesLatch.await + _ <- ZIO.sleep(2.second) // sleep to ensure commit is done before rebalance + // start another consumer to trigger a rebalance before slow handler is done + _ <- makeParallelConsumer( + handler, + kafka, + topic, + group, + cId, + drainTimeout = drainTimeout, + onAssigned = assigned => ZIO.when(assigned.nonEmpty)(finishRebalance.succeed()) + ) + } yield () + + _ <- eventuallyZ(numProcessedMessges.get, 20.seconds)(_ == allMessages) + } yield { + ok + } + } + } + + "migrate correctly from regular record consumer to parallel consumer - consume every record once" in { + ZIO.scoped { + for { + r <- getShared + TestResources(kafka, producer) = r + topic <- kafka.createRandomTopic() + group <- randomGroup + cId <- clientId + + regularConfig = configFor(kafka, group, Set(topic)) + parallelConfig = parallelConsumerConfig(kafka, topic, group, cId) // same group name for both consumers + queue <- Queue.unbounded[ConsumerRecord[String, String]] + handler = RecordHandler((cr: ConsumerRecord[String, String]) => queue.offer(cr)).withDeserializers(StringSerde, StringSerde) + + records1 = producerRecords(topic, "1", partitions, 3) + records2 = producerRecords(topic, "2", partitions, 3) + _ <- ZIO.debug(s"records1:\n${records1.mkString("\n")}\nrecords2:\n${records2.mkString("\n")}") + numMessages = records1.size + records2.size + + _ <- RecordConsumer.make(regularConfig, handler) + _ <- produceRecords(producer, records1) + _ <- ZIO.sleep(3.seconds) + _ <- RecordConsumer.make(parallelConfig, handler).delay(3.seconds) + _ <- produceRecords(producer, records2) + _ <- ZIO.sleep(3.seconds) + messagesOption <- RecordConsumer.make(parallelConfig, handler).flatMap { _ => + produceRecords(producer, records2) *> ZIO.sleep(3.seconds) *> + queue + .takeBetween(numMessages, numMessages) + .timeout(60.seconds) + .tap(o => ZIO.when(o.isEmpty)(Console.printLine("timeout waiting for messages!"))) + } + messages <- 
ZIO.fromOption(messagesOption).orElseFail(TimedOutWaitingForMessages) + } yield { + messages must beRecordsWithKeysAndValues(records1 ++ records2) + } + } + } + + "migrate from parallel consumer with gaps to regular consumer - consume from latest and report non-consumed gaps" in { + ZIO.scoped { + for { + r <- getShared + TestResources(kafka, producer) = r + topic <- kafka.createRandomTopic() + group <- randomGroup + cId <- clientId + partition = 0 + allMessages = 10 + fastMessages = allMessages - 1 + + skippedGaps <- Ref.make[Int](0) + metricsQueue <- TestMetrics.queue + + regularConfig = configFor(kafka, group, Set(topic)) + _ <- metricsQueue.take + .flatMap { + case m: SkippedGapsOnInitialization => + ZIO.debug(s">>> got SkippedGapsOnInitialization with gaps: ${m.gaps}") *> skippedGaps.update(_ + 1) + case _ => ZIO.unit + } + .repeat(Schedule.forever) + .fork + + keyWithSlowHandling = "slow-key" + numProcessedMessages <- Ref.make[Int](0) + fastMessagesLatch <- CountDownLatch.make(fastMessages) + + randomKeys <- ZIO.foreach(1 to fastMessages)(i => randomKey(i.toString)).map(_.toSeq) + + fastRecords = randomKeys.map { key => recordWithKey(topic, key, partition) } + slowRecord = recordWithKey(topic, keyWithSlowHandling, partition) + additionalRecords = producerRecords(topic, "additional", 1, 5) + + finishRebalance <- Promise.make[Nothing, Unit] + + // handler that sleeps forever on the slow key + parallelConsumerHandler = RecordHandler { cr: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => + (cr.key match { + case Some(k) if k == Chunk.fromArray(keyWithSlowHandling.getBytes) => + ZIO.sleep(Duration.Infinity) + case _ => fastMessagesLatch.countDown + }) *> numProcessedMessages.update(_ + 1) + } + + regularConsumerHandler = RecordHandler { _: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => numProcessedMessages.update(_ + 1) } + + parallelConsumer <- makeParallelConsumer(parallelConsumerHandler, kafka, topic, group, cId, startPaused = true) + _ <- produceRecords(producer, Seq(slowRecord)) + _ <- produceRecords(producer, fastRecords) + // produce is done synchronously to make sure all records are produced before consumer starts, so all records are polled at once + _ <- parallelConsumer.resume + _ <- fastMessagesLatch.await + _ <- ZIO.sleep(2.second) // sleep to ensure commit is done before rebalance + // migrate to regular fromLatest consumer while gap exists + _ <- parallelConsumer.shutdown() *> RecordConsumer.make(regularConfig, regularConsumerHandler) + _ <- produceRecords(producer, additionalRecords) + _ <- eventuallyZ(numProcessedMessages.get, 20.seconds)(_ == fastMessages + additionalRecords.size) + _ <- eventuallyZ(skippedGaps.get, 20.seconds)(_.must(beGreaterThanOrEqualTo(1))) + } yield { + ok + } + } + } + + private def configFor( + kafka: ManagedKafka, + group: Group, + topics: Set[Topic], + mutateEventLoop: EventLoopConfig => EventLoopConfig = identity, + extraProperties: Map[String, String] = Map.empty + ) = RecordConsumerConfig( + bootstrapServers = kafka.bootstrapServers, + group = group, + initialSubscription = Topics(topics), + eventLoopConfig = mutateEventLoop(EventLoopConfig.Default), + extraProperties = extraProperties + ) + + private def makeParallelConsumer( + handler: RecordHandler[Any, Nothing, Chunk[Byte], Chunk[Byte]], + kafka: ManagedKafka, + topic: String, + group: String, + cId: String, + drainTimeout: Duration = 20.seconds, + startPaused: Boolean = false, + onAssigned: Set[TopicPartition] => UIO[Any] = _ => ZIO.unit + ) = + RecordConsumer.make(parallelConsumerConfig(kafka, 
topic, group, cId, drainTimeout, startPaused, onAssigned), handler) + + private def parallelConsumerConfig( + kafka: ManagedKafka, + topic: String, + group: String, + cId: String, + drainTimeout: Duration = 20.seconds, + startPaused: Boolean = false, + onAssigned: Set[TopicPartition] => UIO[Any] = _ => ZIO.unit + ) = { + configFor( + kafka, + group, + Set(topic), + mutateEventLoop = _.copy( + consumePartitionInParallel = true, + maxParallelism = 10, + drainTimeout = drainTimeout, + startPaused = startPaused, + rebalanceListener = RebalanceListener(onAssigned = onAssigned) + ) + ) + .copy(clientId = cId) + } + + private def producerRecords(topic: String, tag: String, partitions: Int, recordsPerPartition: Int) = (0 until partitions).flatMap(p => + (0 until recordsPerPartition).map(i => ProducerRecord(topic, s"value-t$tag-p$p-$i", Some(s"key-t$tag-p$p-$i"), partition = Some(p))) + ) + + def produceRecords(producer: ReportingProducer[Any], records: Seq[ProducerRecord[String, String]]) = + ZIO + .foreach(records)(r => producer.produce(r, StringSerde, StringSerde)) + + private def recordWithKey(topic: String, key: String, partition: Int) = + ProducerRecord(topic, "", Some(key), partition = Some(partition)) + + private def randomKey(prefix: String) = + randomId.map(r => s"$prefix-$r") +} + +object TimedOutWaitingForMessages extends RuntimeException diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/OffsetAndMetadata.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/OffsetAndMetadata.scala index 6b49ec3a..d03516aa 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/OffsetAndMetadata.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/OffsetAndMetadata.scala @@ -10,5 +10,8 @@ object OffsetAndMetadata { def apply(offsetAndMetadata: KafkaOffsetAndMetadata): OffsetAndMetadata = OffsetAndMetadata(offsetAndMetadata.offset(), offsetAndMetadata.metadata()) + def apply(offset: Offset): OffsetAndMetadata = + OffsetAndMetadata(offset, NO_METADATA) + val NO_METADATA = "" } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala index 7592362e..64168a77 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala @@ -33,16 +33,24 @@ trait Consumer { def commit(offsets: Map[TopicPartition, Offset])(implicit trace: Trace): RIO[GreyhoundMetrics, Unit] + def commitWithMetadata(offsetsAndMetadata: Map[TopicPartition, OffsetAndMetadata])(implicit trace: Trace): RIO[GreyhoundMetrics, Unit] + def endOffsets(partitions: Set[TopicPartition])(implicit trace: Trace): RIO[Any, Map[TopicPartition, Offset]] def beginningOffsets(partitions: Set[TopicPartition])(implicit trace: Trace): RIO[Any, Map[TopicPartition, Offset]] def committedOffsets(partitions: Set[TopicPartition])(implicit trace: Trace): RIO[Any, Map[TopicPartition, Offset]] + def committedOffsetsAndMetadata(partitions: Set[TopicPartition])(implicit trace: Trace): RIO[Any, Map[TopicPartition, OffsetAndMetadata]] + def offsetsForTimes(topicPartitionsOnTimestamp: Map[TopicPartition, Long])(implicit trace: Trace): RIO[Any, Map[TopicPartition, Offset]] def commitOnRebalance(offsets: Map[TopicPartition, Offset])(implicit trace: Trace): RIO[GreyhoundMetrics, DelayedRebalanceEffect] + def commitWithMetadataOnRebalance(offsets: Map[TopicPartition, OffsetAndMetadata])( + implicit trace: Trace + ): 
RIO[GreyhoundMetrics, DelayedRebalanceEffect] + def pause(partitions: Set[TopicPartition])(implicit trace: Trace): ZIO[GreyhoundMetrics, IllegalStateException, Unit] def resume(partitions: Set[TopicPartition])(implicit trace: Trace): ZIO[GreyhoundMetrics, IllegalStateException, Unit] @@ -96,7 +104,8 @@ object Consumer { timeoutIfSeek = 10.seconds, initialSeek = cfg.initialSeek, rewindUncommittedOffsetsBy = cfg.rewindUncommittedOffsetsByMillis.millis, - offsetResetIsEarliest = cfg.offsetReset == OffsetReset.Earliest + offsetResetIsEarliest = cfg.offsetReset == OffsetReset.Earliest, + parallelConsumer = cfg.useParallelConsumer ) } yield { new Consumer { @@ -144,10 +153,22 @@ object Consumer { withConsumerBlocking(_.committed(kafkaPartitions(partitions))) .map(_.asScala.collect { case (tp: KafkaTopicPartition, o: KafkaOffsetAndMetadata) => (TopicPartition(tp), o.offset) }.toMap) + override def committedOffsetsAndMetadata( + partitions: NonEmptySet[TopicPartition] + )(implicit trace: Trace): RIO[Any, Map[TopicPartition, OffsetAndMetadata]] = + withConsumerBlocking(_.committed(kafkaPartitions(partitions))) + .map(_.asScala.collect { case (tp: KafkaTopicPartition, om: KafkaOffsetAndMetadata) => (TopicPartition(tp), OffsetAndMetadata(om.offset, om.metadata))}.toMap) + override def commit(offsets: Map[TopicPartition, Offset])(implicit trace: Trace): RIO[GreyhoundMetrics, Unit] = { withConsumerBlocking(_.commitSync(kafkaOffsetsAndMetaData(toOffsetsAndMetadata(offsets, cfg.commitMetadataString)))) } + override def commitWithMetadata( + offsetsAndMetadata: Map[TopicPartition, OffsetAndMetadata] + )(implicit trace: Trace): RIO[GreyhoundMetrics, Unit] = { + withConsumerBlocking(_.commitSync(kafkaOffsetsAndMetaData(offsetsAndMetadata))) + } + override def commitOnRebalance( offsets: Map[TopicPartition, Offset] )(implicit trace: Trace): RIO[GreyhoundMetrics, DelayedRebalanceEffect] = { @@ -157,6 +178,11 @@ object Consumer { ZIO.succeed(DelayedRebalanceEffect(consumer.commitSync(kOffsets))) } + override def commitWithMetadataOnRebalance( + offsets: Map[TopicPartition, OffsetAndMetadata] + )(implicit trace: Trace): RIO[GreyhoundMetrics, DelayedRebalanceEffect] = + ZIO.succeed(DelayedRebalanceEffect(consumer.commitSync(kafkaOffsetsAndMetaData(offsets)))) + override def pause(partitions: Set[TopicPartition])(implicit trace: Trace): ZIO[Any, IllegalStateException, Unit] = withConsumer(_.pause(kafkaPartitions(partitions))).refineOrDie { case e: IllegalStateException => e } @@ -285,7 +311,8 @@ case class ConsumerConfig( consumerAttributes: Map[String, String] = Map.empty, decryptor: Decryptor[Any, Throwable, Chunk[Byte], Chunk[Byte]] = new NoOpDecryptor, commitMetadataString: Metadata = OffsetAndMetadata.NO_METADATA, - rewindUncommittedOffsetsByMillis: Long = 0L + rewindUncommittedOffsetsByMillis: Long = 0L, + useParallelConsumer: Boolean = false ) extends CommonGreyhoundConfig { override def kafkaProps: Map[String, String] = Map( @@ -320,12 +347,16 @@ object OffsetReset { trait UnsafeOffsetOperations { def committed(partitions: Set[TopicPartition], timeout: zio.Duration): Map[TopicPartition, Offset] + def committedWithMetadata(partitions: Set[TopicPartition], timeout: zio.Duration): Map[TopicPartition, OffsetAndMetadata] + def beginningOffsets(partitions: Set[TopicPartition], timeout: zio.Duration): Map[TopicPartition, Offset] def position(partition: TopicPartition, timeout: zio.Duration): Offset def commit(offsets: Map[TopicPartition, Offset], timeout: Duration): Unit + def commitWithMetadata(offsets: 
Map[TopicPartition, OffsetAndMetadata], timeout: Duration): Unit + def seek(offsets: Map[TopicPartition, Offset]): Unit def endOffsets(partitions: Set[TopicPartition], timeout: Duration): Map[TopicPartition, Offset] @@ -357,6 +388,20 @@ object UnsafeOffsetOperations { } } + override def committedWithMetadata( + partitions: NonEmptySet[TopicPartition], + timeout: zio.Duration + ): Map[TopicPartition, OffsetAndMetadata] = { + consumer + .committed(partitions.map(_.asKafka).asJava, timeout) + .asScala + .toMap + .collect { + case (tp, ofm) if ofm != null => + TopicPartition(tp) -> OffsetAndMetadata(ofm.offset(), ofm.metadata()) + } + } + override def beginningOffsets(partitions: Set[TopicPartition], timeout: Duration): Map[TopicPartition, Offset] = consumer .beginningOffsets(partitions.map(_.asKafka).asJava, timeout) @@ -374,6 +419,10 @@ object UnsafeOffsetOperations { consumer.commitSync(kafkaOffsets(offsets), timeout) } + override def commitWithMetadata(offsets: Map[TopicPartition, OffsetAndMetadata], timeout: zio.Duration): Unit = { + consumer.commitSync(kafkaOffsetsAndMetaData(offsets), timeout) + } + override def seek(offsets: Map[TopicPartition, Offset]): Unit = offsets.foreach { case (tp, offset) => Try(consumer.seek(tp.asKafka, offset)) } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala index b618d0ba..a3d93217 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala @@ -1,7 +1,7 @@ package com.wixpress.dst.greyhound.core.consumer import java.util.concurrent.TimeUnit -import com.wixpress.dst.greyhound.core.consumer.Dispatcher.Record +import com.wixpress.dst.greyhound.core.consumer.Dispatcher.{Record, Records} import com.wixpress.dst.greyhound.core.consumer.DispatcherMetric._ import com.wixpress.dst.greyhound.core.consumer.RecordConsumer.Env import com.wixpress.dst.greyhound.core.consumer.SubmitResult._ @@ -20,6 +20,8 @@ import java.lang.System.currentTimeMillis trait Dispatcher[-R] { def submit(record: Record): URIO[R with Env, SubmitResult] + def submitBatch(records: Records): URIO[R with Env, SubmitResult] + def resumeablePartitions(paused: Set[TopicPartition]): URIO[Any, Set[TopicPartition]] def revoke(partitions: Set[TopicPartition]): URIO[GreyhoundMetrics, Unit] @@ -36,7 +38,8 @@ trait Dispatcher[-R] { } object Dispatcher { - type Record = ConsumerRecord[Chunk[Byte], Chunk[Byte]] + type Record = ConsumerRecord[Chunk[Byte], Chunk[Byte]] + type Records = Seq[Record] def make[R]( group: Group, @@ -48,7 +51,12 @@ object Dispatcher { delayResumeOfPausedPartition: Long = 0, consumerAttributes: Map[String, String] = Map.empty, workersShutdownRef: Ref[Map[TopicPartition, ShutdownPromise]], - startPaused: Boolean = false + startPaused: Boolean = false, + consumeInParallel: Boolean = false, + maxParallelism: Int = 1, + updateBatch: Chunk[Record] => UIO[Unit] = _ => ZIO.unit, + currentGaps: Set[TopicPartition] => ZIO[GreyhoundMetrics, Nothing, Map[TopicPartition, Option[OffsetAndGaps]]] = _ => + ZIO.succeed(Map.empty) )(implicit trace: Trace): UIO[Dispatcher[R]] = for { p <- Promise.make[Nothing, Unit] @@ -63,6 +71,22 @@ object Dispatcher { submitted <- worker.submit(record) } yield if (submitted) Submitted else Rejected + override def submitBatch(records: Records): URIO[R with Env, SubmitResult] = + for { + _ <- report(SubmittingRecordBatch(group, 
clientId, records.size, consumerAttributes)) + allSamePartition = records.map(r => RecordTopicPartition(r)).distinct.size == 1 + submitResult <- if (allSamePartition) { + val partition = RecordTopicPartition(records.head) + for { + worker <- workerFor(partition, records.head.offset) + submitted <- worker.submitBatch(records) + } yield submitted + } else ZIO.succeed(SubmitBatchResult(success = false, Some(records.minBy(_.offset)))) + + } yield + if (allSamePartition && submitResult.success) Submitted + else RejectedBatch(submitResult.firstRejected.getOrElse(records.minBy(_.offset))) + override def resumeablePartitions(paused: Set[TopicPartition]): URIO[Any, Set[TopicPartition]] = workers.get.flatMap { workers => ZIO.foldLeft(paused)(Set.empty[TopicPartition]) { (acc, partition) => @@ -130,7 +154,20 @@ object Dispatcher { case None => for { _ <- report(StartingWorker(group, clientId, partition, offset, consumerAttributes)) - worker <- Worker.make(state, handleWithMetrics, highWatermark, group, clientId, partition, drainTimeout, consumerAttributes) + worker <- Worker.make( + state, + handleWithMetrics, + highWatermark, + group, + clientId, + partition, + drainTimeout, + consumerAttributes, + consumeInParallel, + maxParallelism, + updateBatch, + currentGaps + ) _ <- workers.update(_ + (partition -> worker)) shutdownPromise <- AwaitShutdown.make _ <- workersShutdownRef.update(_.updated(partition, shutdownPromise)) @@ -180,6 +217,8 @@ object Dispatcher { trait Worker { def submit(record: Record): URIO[Any, Boolean] + def submitBatch(records: Records): URIO[Any, SubmitBatchResult] + def expose: URIO[Any, WorkerExposedState] def shutdown: URIO[Any, Unit] @@ -198,14 +237,22 @@ object Dispatcher { clientId: ClientId, partition: TopicPartition, drainTimeout: Duration, - consumerAttributes: Map[String, String] + consumerAttributes: Map[String, String], + consumeInParallel: Boolean, + maxParallelism: Int, + updateBatch: Chunk[Record] => UIO[Unit] = _ => ZIO.unit, + currentGaps: Set[TopicPartition] => ZIO[GreyhoundMetrics, Nothing, Map[TopicPartition, Option[OffsetAndGaps]]] )(implicit trace: Trace): URIO[R with Env, Worker] = for { queue <- Queue.dropping[Record](capacity) internalState <- TRef.make(WorkerInternalState.empty).commit fiber <- - (reportWorkerRunningInInterval(every = 60.seconds, internalState)(partition, group, clientId).forkDaemon *> - pollOnce(status, internalState, handle, queue, group, clientId, partition, consumerAttributes) - .repeatWhile(_ == true)).forkDaemon + (reportWorkerRunningInInterval(every = 60.seconds, internalState)(partition, group, clientId).forkDaemon *> + (if (consumeInParallel) + pollBatch(status, internalState, handle, queue, group, clientId, partition, consumerAttributes, maxParallelism, updateBatch, currentGaps) + else pollOnce(status, internalState, handle, queue, group, clientId, partition, consumerAttributes)) + .repeatWhile(_ == true)) + .interruptible + .forkDaemon } yield new Worker { override def submit(record: Record): URIO[Any, Boolean] = queue @@ -220,6 +267,25 @@ object Dispatcher { } ) + override def submitBatch( + records: Records + ): URIO[Any, SubmitBatchResult] = + queue + .offerAll(records) + .tap(notInserted => + ZIO.when(notInserted.nonEmpty) { + Clock + .currentTime(TimeUnit.MILLISECONDS) + .flatMap(now => + internalState.update(s => if (s.reachedHighWatermarkSince.nonEmpty) s else s.reachedHighWatermark(now)).commit + ) + } + ) + .map(rejected => { + val isSuccess = rejected.isEmpty + SubmitBatchResult(isSuccess, if (isSuccess) None else 
Some(rejected.minBy(_.offset))) + }) + override def expose: URIO[Any, WorkerExposedState] = (queue.size zip internalState.get.commit) .flatMap { case (queued, state) => @@ -278,6 +344,93 @@ object Dispatcher { case DispatcherState.ShuttingDown => ZIO.succeed(false) } + + private def pollBatch[R]( + state: Ref[DispatcherState], + internalState: TRef[WorkerInternalState], + handle: Record => URIO[R, Any], + queue: Queue[Record], + group: Group, + clientId: ClientId, + partition: TopicPartition, + consumerAttributes: Map[String, String], + maxParallelism: Int, + updateBatch: Chunk[Record] => UIO[Unit], + currentGaps: Set[TopicPartition] => ZIO[GreyhoundMetrics, Nothing, Map[TopicPartition, Option[OffsetAndGaps]]] + )(implicit trace: Trace): ZIO[R with GreyhoundMetrics, Any, Boolean] = + internalState.update(s => s.cleared).commit *> + state.get.flatMap { + case DispatcherState.Running => + queue.takeAll.flatMap { + case records if records.nonEmpty => + handleBatch( + records, + internalState, + handle, + group, + clientId, + partition, + consumerAttributes, + maxParallelism, + updateBatch, + currentGaps + ) + case _ => isActive(internalState).delay(5.millis) + } + case DispatcherState.Paused(resume) => + report(WorkerWaitingForResume(group, clientId, partition, consumerAttributes)) *> resume.await.timeout(30.seconds) *> + isActive(internalState) + case DispatcherState.ShuttingDown => + ZIO.succeed(false) + } + private def handleBatch[R]( + records: Chunk[Record], + internalState: TRef[WorkerInternalState], + handle: Record => URIO[R, Any], + group: Group, + clientId: ClientId, + partition: TopicPartition, + consumerAttributes: Map[ClientId, ClientId], + maxParallelism: RuntimeFlags, + updateBatch: Chunk[Record] => UIO[Unit], + currentGaps: Set[TopicPartition] => ZIO[GreyhoundMetrics, Nothing, Map[TopicPartition, Option[OffsetAndGaps]]] + ): ZIO[R with GreyhoundMetrics, Throwable, Boolean] = + for { + _ <- report(TookAllRecordsFromQueue(records.size, records, group, clientId, consumerAttributes)) + _ <- ZIO + .attempt(currentTimeMillis()) + .flatMap(t => internalState.updateAndGet(_.startedWith(t)).commit) + .tapBoth( + e => report(FailToUpdateParallelCurrentExecutionStarted(records.size, group, clientId, consumerAttributes, e)), + t => report(CurrentExecutionStartedEvent(partition, group, clientId, t.currentExecutionStarted)) + ) + groupedRecords = records.groupBy(_.key).values // todo: add sub-grouping for records without key + latestCommitGaps <- currentGaps(records.map(r => TopicPartition(r.topic, r.partition)).toSet) + _ <- ZIO + .foreachParDiscard(groupedRecords)(sameKeyRecords => + ZIO.foreach(sameKeyRecords) { record => + if (shouldRecordBeHandled(record, latestCommitGaps)) { + handle(record).interruptible.ignore *> updateBatch(sameKeyRecords).interruptible + } else + report(SkippedPreviouslyHandledRecord(record, group, clientId, consumerAttributes)) + + } + ) + .withParallelism(maxParallelism) + res <- isActive(internalState) + } yield res + } + + private def shouldRecordBeHandled(record: Record, maybeGaps: Map[TopicPartition, Option[OffsetAndGaps]]): Boolean = { + maybeGaps.get(TopicPartition(record.topic, record.partition)) match { + case Some(maybeOffsetAndGapsForPartition) => + maybeOffsetAndGapsForPartition match { + case Some(offsetAndGapsForPartition) if offsetAndGapsForPartition.gaps.nonEmpty => + record.offset > offsetAndGapsForPartition.offset || offsetAndGapsForPartition.gaps.exists(_.contains(record.offset)) + case _ => true + } + case None => true + } } private def 
reportWorkerRunningInInterval( @@ -328,8 +481,12 @@ object SubmitResult { case object Rejected extends SubmitResult + case class RejectedBatch(firstRejected: Record) extends SubmitResult + } +case class SubmitBatchResult(success: Boolean, firstRejected: Option[Record]) extends SubmitResult + sealed trait DispatcherMetric extends GreyhoundMetric object DispatcherMetric { @@ -354,6 +511,9 @@ object DispatcherMetric { case class SubmittingRecord[K, V](group: Group, clientId: ClientId, record: ConsumerRecord[K, V], attributes: Map[String, String]) extends DispatcherMetric + case class SubmittingRecordBatch[K, V](group: Group, clientId: ClientId, numRecords: Int, attributes: Map[String, String]) + extends DispatcherMetric + case class HandlingRecord[K, V]( group: Group, clientId: ClientId, @@ -371,6 +531,13 @@ object DispatcherMetric { ) extends DispatcherMetric case class TookRecordFromQueue(record: Record, group: Group, clientId: ClientId, attributes: Map[String, String]) extends DispatcherMetric + case class TookAllRecordsFromQueue( + numRecords: Int, + records: Chunk[Record], + group: Group, + clientId: ClientId, + attributes: Map[String, String] + ) extends DispatcherMetric case class FailToUpdateCurrentExecutionStarted( record: Record, group: Group, @@ -378,6 +545,13 @@ object DispatcherMetric { attributes: Map[String, String], e: Throwable ) extends DispatcherMetric + case class FailToUpdateParallelCurrentExecutionStarted( + numRecords: Int, + group: Group, + clientId: ClientId, + attributes: Map[String, String], + e: Throwable + ) extends DispatcherMetric case class WorkerWaitingForResume(group: Group, clientId: ClientId, partition: TopicPartition, attributes: Map[String, String]) extends DispatcherMetric @@ -389,6 +563,9 @@ object DispatcherMetric { currentExecutionStarted: Option[Long] ) extends DispatcherMetric + case class SkippedPreviouslyHandledRecord(record: Record, group: Group, clientId: ClientId, attributes: Map[String, String]) + extends DispatcherMetric + } case class DispatcherExposedState(workersState: Map[TopicPartition, WorkerExposedState], state: Dispatcher.DispatcherState) { diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala index 3c8940ac..0708c19a 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala @@ -1,9 +1,12 @@ package com.wixpress.dst.greyhound.core.consumer import com.wixpress.dst.greyhound.core._ +import com.wixpress.dst.greyhound.core.consumer.Consumer.Records +import com.wixpress.dst.greyhound.core.consumer.Dispatcher.Record import com.wixpress.dst.greyhound.core.consumer.EventLoopMetric._ import com.wixpress.dst.greyhound.core.consumer.EventLoopState.{Paused, Running, ShuttingDown} import com.wixpress.dst.greyhound.core.consumer.RecordConsumer.Env +import com.wixpress.dst.greyhound.core.consumer.SubmitResult.RejectedBatch import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerSubscription, RecordHandler} import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics.report import com.wixpress.dst.greyhound.core.metrics.{GreyhoundMetric, GreyhoundMetrics} @@ -37,7 +40,11 @@ object EventLoop { val start = for { _ <- report(StartingEventLoop(clientId, group, consumerAttributes)) offsets <- Offsets.make - handle = handler.andThen(offsets.update).handle(_) + offsetsAndGaps <- OffsetsAndGaps.make + handle = if 
(config.consumePartitionInParallel) { cr: Record => handler.handle(cr) } + else handler.andThen(offsets.update).handle(_) + updateBatch = { records: Chunk[Record] => offsetsAndGaps.update(records) } + currentGaps = { partitions: Set[TopicPartition] => currentGapsForPartitions(partitions, clientId)(consumer) } _ <- report(CreatingDispatcher(clientId, group, consumerAttributes, config.startPaused)) dispatcher <- Dispatcher.make( group, @@ -49,18 +56,33 @@ object EventLoop { config.delayResumeOfPausedPartition, consumerAttributes, workersShutdownRef, - config.startPaused + config.startPaused, + config.consumePartitionInParallel, + config.maxParallelism, + updateBatch, + currentGaps ) positionsRef <- Ref.make(Map.empty[TopicPartition, Offset]) pausedPartitionsRef <- Ref.make(Set.empty[TopicPartition]) partitionsAssigned <- Promise.make[Nothing, Unit] // TODO how to handle errors in subscribe? - rebalanceListener = listener(pausedPartitionsRef, config, dispatcher, partitionsAssigned, group, consumer, clientId, offsets) + rebalanceListener = listener( + pausedPartitionsRef, + config, + dispatcher, + partitionsAssigned, + group, + consumer, + clientId, + offsets, + offsetsAndGaps, + config.consumePartitionInParallel + ) _ <- report(SubscribingToInitialSubAndRebalanceListener(clientId, group, consumerAttributes)) _ <- subscribe(initialSubscription, rebalanceListener)(consumer) running <- Ref.make[EventLoopState](Running) _ <- report(CreatingPollOnceFiber(clientId, group, consumerAttributes)) - fiber <- pollOnce(running, consumer, dispatcher, pausedPartitionsRef, positionsRef, offsets, config, clientId, group) + fiber <- pollOnce(running, consumer, dispatcher, pausedPartitionsRef, positionsRef, offsets, config, clientId, group, offsetsAndGaps) .repeatWhile(_ == true) .forkDaemon _ <- report(AwaitingPartitionsAssignment(clientId, group, consumerAttributes)) @@ -138,7 +160,8 @@ object EventLoop { offsets: Offsets, config: EventLoopConfig, clientId: ClientId, - group: Group + group: Group, + offsetsAndGaps: OffsetsAndGaps ): URIO[R2 with Env, Boolean] = running.get.flatMap { case Running => @@ -146,7 +169,7 @@ object EventLoop { _ <- resumePartitions(consumer, clientId, group, dispatcher, paused) records <- pollAndHandle(consumer, dispatcher, paused, config) _ <- updatePositions(records, positionsRef, consumer, clientId) - _ <- commitOffsets(consumer, offsets) + _ <- if (config.consumePartitionInParallel) commitOffsetsAndGaps(consumer, offsetsAndGaps) else commitOffsets(consumer, offsets) _ <- ZIO.when(records.isEmpty)(ZIO.sleep(50.millis)) } yield true @@ -162,7 +185,9 @@ object EventLoop { group: Group, consumer0: Consumer, clientId: ClientId, - offsets: Offsets + offsets: Offsets, + offsetsAndGaps: OffsetsAndGaps, + useParallelConsumer: Boolean ) = { config.rebalanceListener *> new RebalanceListener[GreyhoundMetrics] { @@ -171,12 +196,13 @@ object EventLoop { partitions: Set[TopicPartition] )(implicit trace: Trace): URIO[GreyhoundMetrics, DelayedRebalanceEffect] = { for { - _ <- pausedPartitionsRef.update(_ -- partitions) - isRevokeTimedOut <- dispatcher.revoke(partitions).timeout(config.drainTimeout).map(_.isEmpty) - _ <- ZIO.when(isRevokeTimedOut)( - report(DrainTimeoutExceeded(clientId, group, config.drainTimeout.toMillis, consumer.config.consumerAttributes)) - ) - delayedRebalanceEffect <- commitOffsetsOnRebalance(consumer0, offsets) + _ <- pausedPartitionsRef.update(_ -- partitions) + isRevokeTimedOut <- dispatcher.revoke(partitions).timeout(config.drainTimeout).map(_.isEmpty) + _ <- 
ZIO.when(isRevokeTimedOut)( + report(DrainTimeoutExceeded(clientId, group, config.drainTimeout.toMillis, consumer.config.consumerAttributes)) + ) + delayedRebalanceEffect <- if (useParallelConsumer) commitOffsetsAndGapsOnRebalance(consumer0, offsetsAndGaps) + else commitOffsetsOnRebalance(consumer0, offsets) } yield delayedRebalanceEffect } @@ -214,24 +240,61 @@ object EventLoop { for { records <- consumer.poll(config.fetchTimeout).catchAll(_ => ZIO.succeed(Nil)) paused <- pausedRef.get - pausedTopics <- ZIO.foldLeft(records)(paused) { (acc, record) => - val partition = record.topicPartition - if (acc contains partition) - report(PartitionThrottled(partition, record.offset, consumer.config.consumerAttributes)).as(acc) - else - dispatcher.submit(record).flatMap { - case SubmitResult.Submitted => ZIO.succeed(acc) - case SubmitResult.Rejected => - report(HighWatermarkReached(partition, record.offset, consumer.config.consumerAttributes)) *> - consumer.pause(record).fold(_ => acc, _ => acc + partition) - } - } + pausedTopics <- if (config.consumePartitionInParallel) submitRecordsAsBatch(consumer, dispatcher, records, paused) + else submitRecordsSequentially(consumer, dispatcher, records, paused) _ <- pausedRef.update(_ => pausedTopics) } yield records + private def submitRecordsSequentially[R2, R1]( + consumer: Consumer, + dispatcher: Dispatcher[R2], + records: Records, + paused: Set[TopicPartition] + ): ZIO[R2 with Env, Nothing, Set[TopicPartition]] = { + ZIO.foldLeft(records)(paused) { (acc, record) => + val partition = record.topicPartition + if (acc contains partition) + report(PartitionThrottled(partition, record.offset, consumer.config.consumerAttributes)).as(acc) + else + dispatcher.submit(record).flatMap { + case SubmitResult.Submitted => ZIO.succeed(acc) + case SubmitResult.Rejected => + report(HighWatermarkReached(partition, record.offset, consumer.config.consumerAttributes)) *> + consumer.pause(record).fold(_ => acc, _ => acc + partition) + } + } + } + + private def submitRecordsAsBatch[R2, R1]( + consumer: Consumer, + dispatcher: Dispatcher[R2], + records: Records, + paused: Set[TopicPartition] + ): ZIO[R2 with Env, Nothing, Set[TopicPartition]] = { + val recordsByPartition = records.groupBy(_.topicPartition) + ZIO.foldLeft(recordsByPartition)(paused) { (acc, partitionToRecords) => + val partition = partitionToRecords._1 + if (acc contains partition) + report(PartitionThrottled(partition, partitionToRecords._2.map(_.offset).min, consumer.config.consumerAttributes)).as(acc) + else + dispatcher.submitBatch(partitionToRecords._2.toSeq).flatMap { + case SubmitResult.Submitted => ZIO.succeed(acc) + case RejectedBatch(firstRejected) => + report(HighWatermarkReached(partition, firstRejected.offset, consumer.config.consumerAttributes)) *> + consumer.pause(firstRejected).fold(_ => acc, _ => acc + partition) + } + } + } + private def commitOffsets(consumer: Consumer, offsets: Offsets): URIO[GreyhoundMetrics, Unit] = offsets.committable.flatMap { committable => consumer.commit(committable).catchAll { _ => offsets.update(committable) } } + private def commitOffsetsAndGaps(consumer: Consumer, offsetsAndGaps: OffsetsAndGaps): URIO[GreyhoundMetrics, Unit] = + offsetsAndGaps.getCommittableAndClear.flatMap { committable => + val offsetsAndMetadataToCommit = OffsetsAndGaps.toOffsetsAndMetadata(committable) + consumer.commitWithMetadata(offsetsAndMetadataToCommit).catchAll { _ => offsetsAndGaps.setCommittable(committable) } + } + private def commitOffsetsOnRebalance( consumer: Consumer, offsets: 
Offsets @@ -251,6 +314,32 @@ object EventLoop { } } + private def commitOffsetsAndGapsOnRebalance( + consumer: Consumer, + offsetsAndGaps: OffsetsAndGaps + ): URIO[GreyhoundMetrics, DelayedRebalanceEffect] = { + for { + committable <- offsetsAndGaps.getCommittableAndClear + tle <- consumer + .commitWithMetadataOnRebalance(OffsetsAndGaps.toOffsetsAndMetadata(committable)) + .catchAll { _ => offsetsAndGaps.setCommittable(committable) *> DelayedRebalanceEffect.zioUnit } + runtime <- ZIO.runtime[Any] + } yield tle.catchAll { _ => zio.Unsafe.unsafe { implicit s => + runtime.unsafe + .run(offsetsAndGaps.setCommittable(committable)) + .getOrThrowFiberFailure() + } + } + } + + private def currentGapsForPartitions(partitions: Set[TopicPartition], clientId: ClientId)( + consumer: Consumer + ): ZIO[GreyhoundMetrics, Nothing, Map[TopicPartition, Option[OffsetAndGaps]]] = + consumer + .committedOffsetsAndMetadata(partitions) + .map { committed => committed.mapValues(om => OffsetsAndGaps.parseGapsString(om.metadata)) } + .catchAll(t => report(FailedToFetchCommittedGaps(t, clientId, consumer.config.consumerAttributes)).as(Map.empty)) + } case class EventLoopConfig( @@ -260,7 +349,9 @@ case class EventLoopConfig( highWatermark: Int, rebalanceListener: RebalanceListener[Any], delayResumeOfPausedPartition: Long, - startPaused: Boolean + startPaused: Boolean, + consumePartitionInParallel: Boolean, + maxParallelism: Int ) object EventLoopConfig { @@ -271,7 +362,9 @@ object EventLoopConfig { highWatermark = 256, rebalanceListener = RebalanceListener.Empty, delayResumeOfPausedPartition = 0, - startPaused = false + startPaused = false, + consumePartitionInParallel = false, + maxParallelism = 1 ) } @@ -305,6 +398,9 @@ object EventLoopMetric { case class FailedToUpdatePositions(t: Throwable, clientId: ClientId, attributes: Map[String, String] = Map.empty) extends EventLoopMetric + case class FailedToFetchCommittedGaps(t: Throwable, clientId: ClientId, attributes: Map[String, String] = Map.empty) + extends EventLoopMetric + case class CreatingDispatcher(clientId: ClientId, group: Group, attributes: Map[String, String], startPaused: Boolean) extends EventLoopMetric diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala index f5718555..2704b5d6 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala @@ -1,6 +1,9 @@ package com.wixpress.dst.greyhound.core.consumer -import com.wixpress.dst.greyhound.core.{Offset, TopicPartition} +import com.wixpress.dst.greyhound.core.consumer.Gap.GAP_SEPARATOR +import com.wixpress.dst.greyhound.core.consumer.OffsetAndGaps.{GAPS_STRING_SEPARATOR, LAST_HANDLED_OFFSET_SEPARATOR} +import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, RecordTopicPartition} +import com.wixpress.dst.greyhound.core.{Offset, OffsetAndMetadata, TopicPartition} import zio._ trait OffsetsAndGaps { @@ -10,6 +13,16 @@ trait OffsetsAndGaps { def update(partition: TopicPartition, batch: Seq[Offset]): UIO[Unit] + def update(record: ConsumerRecord[_, _]): UIO[Unit] = + update(RecordTopicPartition(record), Seq(record.offset)) + + def update(records: Chunk[ConsumerRecord[_, _]]): UIO[Unit] = { + val sortedBatch = records.sortBy(_.offset) + update(RecordTopicPartition(sortedBatch.head), sortedBatch.map(_.offset) ++ Seq(sortedBatch.last.offset + 1)) + } + + 
def setCommittable(offsets: Map[TopicPartition, OffsetAndGaps]): UIO[Unit] + def contains(partition: TopicPartition, offset: Offset): UIO[Boolean] } @@ -48,6 +61,9 @@ object OffsetsAndGaps { override def contains(partition: TopicPartition, offset: Offset): UIO[Boolean] = ref.get.map(_.get(partition).fold(false)(_.contains(offset))) + override def setCommittable(offsets: Map[TopicPartition, OffsetAndGaps]): UIO[Unit] = + ref.update { _ => offsets } + private def gapsInBatch(batch: Seq[Offset], prevLastOffset: Offset): Seq[Gap] = batch.sorted .foldLeft(Seq.empty[Gap], prevLastOffset) { @@ -73,20 +89,65 @@ object OffsetsAndGaps { } } } + + def toOffsetsAndMetadata(offsetsAndGaps: Map[TopicPartition, OffsetAndGaps]): Map[TopicPartition, OffsetAndMetadata] = + offsetsAndGaps.mapValues(offsetAndGaps => + OffsetAndMetadata(offsetAndGaps.offset, offsetAndGaps.gapsString) + ) // todo: add encoding and compression to plain gaps string + + def parseGapsString(offsetAndGapsString: String): Option[OffsetAndGaps] = { + val lastHandledOffsetSeparatorIndex = offsetAndGapsString.indexOf(LAST_HANDLED_OFFSET_SEPARATOR) + if (lastHandledOffsetSeparatorIndex < 0) + None + else { + val lastHandledOffset = offsetAndGapsString.substring(0, lastHandledOffsetSeparatorIndex).toLong + val gaps = offsetAndGapsString + .substring(lastHandledOffsetSeparatorIndex + 1) + .split(GAPS_STRING_SEPARATOR) + .map(_.split(GAP_SEPARATOR)) + .collect { case Array(start, end) => Gap(start.toLong, end.toLong) } + .toSeq + .sortBy(_.start) + Some(OffsetAndGaps(lastHandledOffset, gaps)) + } + } + + def firstGapOffset(gapsString: String): Option[Offset] = { + val maybeOffsetAndGaps = parseGapsString(gapsString) + maybeOffsetAndGaps match { + case Some(offsetAndGaps) if offsetAndGaps.gaps.nonEmpty => Some(offsetAndGaps.gaps.minBy(_.start).start) + case _ => None + } + } } case class Gap(start: Offset, end: Offset) { def contains(offset: Offset): Boolean = start <= offset && offset <= end def size: Long = end - start + 1 + + override def toString: String = s"$start$GAP_SEPARATOR$end" +} + +object Gap { + val GAP_SEPARATOR = "_" } case class OffsetAndGaps(offset: Offset, gaps: Seq[Gap], committable: Boolean = true) { def contains(offset: Offset): Boolean = gaps.exists(_.contains(offset)) def markCommitted: OffsetAndGaps = copy(committable = false) + + def gapsString: String = { + if (gaps.isEmpty) "" + else + s"${offset.toString}${LAST_HANDLED_OFFSET_SEPARATOR}${gaps.sortBy(_.start).mkString(GAPS_STRING_SEPARATOR)}" + } } object OffsetAndGaps { + val GAPS_STRING_SEPARATOR = "$" + val LAST_HANDLED_OFFSET_SEPARATOR = "#" + def apply(offset: Offset): OffsetAndGaps = OffsetAndGaps(offset, Seq.empty[Gap]) } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializer.scala index e684eef7..234e7ba9 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializer.scala @@ -1,12 +1,11 @@ package com.wixpress.dst.greyhound.core.consumer import java.time.Clock -import com.wixpress.dst.greyhound.core.consumer.ConsumerMetric.{CommittedMissingOffsets, CommittedMissingOffsetsFailed} -import com.wixpress.dst.greyhound.core.{ClientId, Group, Offset, TopicPartition} +import com.wixpress.dst.greyhound.core.consumer.ConsumerMetric.{CommittedMissingOffsets, CommittedMissingOffsetsFailed, SkippedGapsOnInitialization} 
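// Illustrative sketch (hypothetical example object), assuming the Gap / OffsetAndGaps
// definitions added in OffsetsAndGaps.scala above: committed gap metadata is a plain
// string of the form "<lastHandledOffset>#<gapStart>_<gapEnd>$<gapStart>_<gapEnd>...",
// and an empty gap list is encoded as an empty string.
object GapsMetadataFormatExample {
  import com.wixpress.dst.greyhound.core.consumer.{Gap, OffsetAndGaps, OffsetsAndGaps}

  val og: OffsetAndGaps = OffsetAndGaps(30L, Seq(Gap(10L, 12L)))

  val encoded: String = og.gapsString                            // "30#10_12"
  val decoded         = OffsetsAndGaps.parseGapsString(encoded)  // Some(OffsetAndGaps(30, Seq(Gap(10, 12))))
  val firstGap        = OffsetsAndGaps.firstGapOffset(encoded)   // Some(10)
}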
+import com.wixpress.dst.greyhound.core.consumer.OffsetsAndGaps.{firstGapOffset, parseGapsString} +import com.wixpress.dst.greyhound.core.{ClientId, Group, Offset, OffsetAndMetadata, TopicPartition} import com.wixpress.dst.greyhound.core.metrics.{GreyhoundMetric, GreyhoundMetrics} - import zio.{URIO, ZIO} - import zio._ /** @@ -24,17 +23,19 @@ class OffsetsInitializer( initialSeek: InitialOffsetsSeek, rewindUncommittedOffsetsBy: Duration, clock: Clock = Clock.systemUTC, - offsetResetIsEarliest: Boolean + offsetResetIsEarliest: Boolean, + parallelConsumer: Boolean ) { def initializeOffsets(partitions: Set[TopicPartition]): Unit = { val hasSeek = initialSeek != InitialOffsetsSeek.default val effectiveTimeout = if (hasSeek) timeoutIfSeek else timeout withReporting(partitions, rethrow = hasSeek) { - val committed = offsetOperations.committed(partitions, effectiveTimeout) + val committed = offsetOperations.committedWithMetadata(partitions, effectiveTimeout) val beginning = offsetOperations.beginningOffsets(partitions, effectiveTimeout) val endOffsets = offsetOperations.endOffsets(partitions, effectiveTimeout) - val PartitionActions(toOffsets, toPause) = calculateTargetOffsets(partitions, beginning, committed, endOffsets, effectiveTimeout) + val PartitionActions(toOffsets, toPause) = + calculateTargetOffsets(partitions, beginning, committed, endOffsets, effectiveTimeout, parallelConsumer) val notCommitted = partitions -- committed.keySet -- toOffsets.keySet offsetOperations.pause(toPause) @@ -46,43 +47,60 @@ class OffsetsInitializer( .map { case (tp, maybeRewindedOffset) => (tp, maybeRewindedOffset.orElse(endOffsets.get(tp)).getOrElse(0L)) } val positions = - notCommitted.map(tp => tp -> offsetOperations.position(tp, effectiveTimeout)).toMap ++ toOffsets ++ rewindUncommittedOffsets + notCommitted.map(tp => tp -> offsetOperations.position(tp, effectiveTimeout)).toMap.mapValues(OffsetAndMetadata.apply) ++ + toOffsets ++ rewindUncommittedOffsets.mapValues(OffsetAndMetadata.apply) if ((toOffsets ++ rewindUncommittedOffsets).nonEmpty) { - offsetOperations.seek(toOffsets ++ rewindUncommittedOffsets) + offsetOperations.seek(toOffsets.mapValues(_.offset) ++ rewindUncommittedOffsets) } if (positions.nonEmpty) { - offsetOperations.commit(positions, effectiveTimeout) + offsetOperations.commitWithMetadata(positions, effectiveTimeout) } - positions + positions.mapValues(_.offset) } } - case class PartitionActions(offsetSeeks: Map[TopicPartition, Offset], partitionsToPause: Set[TopicPartition]) + case class PartitionActions(offsetSeeks: Map[TopicPartition, OffsetAndMetadata], partitionsToPause: Set[TopicPartition]) private def calculateTargetOffsets( partitions: Set[TopicPartition], beginning: Map[TopicPartition, Offset], - committed: Map[TopicPartition, Offset], + committed: Map[TopicPartition, OffsetAndMetadata], endOffsets: Map[TopicPartition, Offset], - timeout: Duration + timeout: Duration, + parallelConsumer: Boolean ): PartitionActions = { + val currentCommittedOffsets = partitions.map((_, None)).toMap ++ committed.mapValues(Some.apply) val seekTo: Map[TopicPartition, SeekTo] = initialSeek.seekOffsetsFor( assignedPartitions = partitions, beginningOffsets = partitions.map((_, None)).toMap ++ beginning.mapValues(Some.apply), endOffsets = partitions.map((_, None)).toMap ++ endOffsets.mapValues(Some.apply), - currentCommittedOffsets = partitions.map((_, None)).toMap ++ committed.mapValues(Some.apply) + currentCommittedOffsets = currentCommittedOffsets.mapValues(_.map(_.offset)) ) - val seekToOffsets = 
seekTo.collect { case (k, v: SeekTo.SeekToOffset) => k -> v.offset } + val seekToOffsets = seekTo.collect { case (k, v: SeekTo.SeekToOffset) => k -> OffsetAndMetadata(v.offset) } val seekToEndPartitions = seekTo.collect { case (k, SeekTo.SeekToEnd) => k }.toSet val toPause = seekTo.collect { case (k, SeekTo.Pause) => k } - val seekToEndOffsets = fetchEndOffsets(seekToEndPartitions, timeout) - val toOffsets = seekToOffsets ++ seekToEndOffsets - + val seekToEndOffsets = fetchEndOffsets(seekToEndPartitions, timeout).mapValues(OffsetAndMetadata.apply) + val gapsSmallestOffsets = currentCommittedOffsets + .collect { case (tp, Some(om)) => tp -> om } + .map(tpom => tpom._1 -> (firstGapOffset(tpom._2.metadata), tpom._2.metadata)) + .collect { case (tp, (Some(offset), metadata)) => tp -> OffsetAndMetadata(offset, metadata) } + val seekToGapsOffsets = if (parallelConsumer) gapsSmallestOffsets else Map.empty + val toOffsets = seekToOffsets ++ seekToEndOffsets ++ seekToGapsOffsets + + if (!parallelConsumer && gapsSmallestOffsets.nonEmpty) reportSkippedGaps(currentCommittedOffsets) PartitionActions(offsetSeeks = toOffsets, partitionsToPause = toPause.toSet) } + private def reportSkippedGaps(currentCommittedOffsets: Map[TopicPartition, Option[OffsetAndMetadata]]) = { + val skippedGaps = currentCommittedOffsets + .collect { case (tp, Some(om)) => tp -> om } + .map(tpom => tpom._1 -> parseGapsString(tpom._2.metadata)) + .collect { case (tp, Some(gaps)) => tp -> gaps } + reporter(SkippedGapsOnInitialization(clientId, group, skippedGaps)) + } + private def fetchEndOffsets(seekToEndPartitions: Set[TopicPartition], timeout: Duration) = { if (seekToEndPartitions.nonEmpty) { offsetOperations.endOffsets(seekToEndPartitions, timeout) @@ -151,7 +169,8 @@ object OffsetsInitializer { initialSeek: InitialOffsetsSeek, clock: Clock = Clock.systemUTC, rewindUncommittedOffsetsBy: Duration, - offsetResetIsEarliest: Boolean + offsetResetIsEarliest: Boolean, + parallelConsumer: Boolean )(implicit trace: Trace): URIO[GreyhoundMetrics, OffsetsInitializer] = for { metrics <- ZIO.environment[GreyhoundMetrics].map(_.get) runtime <- ZIO.runtime[Any] @@ -165,6 +184,7 @@ object OffsetsInitializer { initialSeek: InitialOffsetsSeek, rewindUncommittedOffsetsBy, clock, - offsetResetIsEarliest + offsetResetIsEarliest, + parallelConsumer ) } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala index 1b426d60..92c7faf5 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala @@ -79,15 +79,17 @@ object RecordConsumer { (initialSubscription, topicsToCreate) = config.retryConfig.fold((config.initialSubscription, Set.empty[Topic]))(policy => maybeAddRetryTopics(policy, config, nonBlockingRetryHelper) ) - _ <- ZIO.when(config.createRetryTopics)(AdminClient - .make(AdminClientConfig(config.bootstrapServers, config.kafkaAuthProperties), config.consumerAttributes) - .tap(client => - client.createTopics( - topicsToCreate.map(topic => - TopicConfig(topic, partitions = 1, replicationFactor = 1, cleanupPolicy = CleanupPolicy.Delete(86400000L)) + _ <- ZIO.when(config.createRetryTopics)( + AdminClient + .make(AdminClientConfig(config.bootstrapServers, config.kafkaAuthProperties), config.consumerAttributes) + .tap(client => + client.createTopics( + topicsToCreate.map(topic => + TopicConfig(topic, 
partitions = 1, replicationFactor = 1, cleanupPolicy = CleanupPolicy.Delete(86400000L)) + ) ) ) - )) + ) blockingState <- Ref.make[Map[BlockingTarget, BlockingState]](Map.empty) blockingStateResolver = BlockingStateResolver(blockingState) workersShutdownRef <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) @@ -206,7 +208,8 @@ object RecordConsumer { config.consumerAttributes, config.decryptor, config.commitMetadataString, - config.rewindUncommittedOffsetsBy.toMillis + config.rewindUncommittedOffsetsBy.toMillis, + config.eventLoopConfig.consumePartitionInParallel ) } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala index d0f93a00..6202220d 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala @@ -109,6 +109,41 @@ case class ReportingConsumer(clientId: ClientId, group: Group, internal: Consume } else DelayedRebalanceEffect.zioUnit } + override def commitWithMetadataOnRebalance(offsets: Map[TopicPartition, OffsetAndMetadata] + )(implicit trace: Trace): RIO[GreyhoundMetrics, DelayedRebalanceEffect] = + ZIO.runtime[GreyhoundMetrics].flatMap { runtime => + if (offsets.nonEmpty) { + report(CommittingOffsetsWithMetadata(clientId, group, offsets, calledOnRebalance = true, attributes = config.consumerAttributes)) *> + internal + .commitWithMetadataOnRebalance(offsets) + .tapError { error => + report(CommitWithMetadataFailed(clientId, group, error, offsets, calledOnRebalance = true, attributes = config.consumerAttributes)) + } + .map( + _.tapError { error => // handle commit errors in ThreadLockedEffect + zio.Unsafe.unsafe { implicit s => + runtime.unsafe + .run( + report( + CommitWithMetadataFailed(clientId, group, error, offsets, calledOnRebalance = true, attributes = config.consumerAttributes) + ) + ) + .getOrThrowFiberFailure() + } + } *> + DelayedRebalanceEffect( + zio.Unsafe.unsafe { implicit s => + runtime.unsafe + .run( + report(CommittedOffsetsWithMetadata(clientId, group, offsets, calledOnRebalance = true, attributes = config.consumerAttributes)) + ) + .getOrThrowFiberFailure() + } + ) + ) + } else DelayedRebalanceEffect.zioUnit + } + override def commit(offsets: Map[TopicPartition, Offset])(implicit trace: Trace): RIO[GreyhoundMetrics, Unit] = { ZIO .when(offsets.nonEmpty) { @@ -122,6 +157,34 @@ case class ReportingConsumer(clientId: ClientId, group: Group, internal: Consume .unit } + override def commitWithMetadata(offsetsAndMetadata: Map[TopicPartition, OffsetAndMetadata])( + implicit trace: Trace + ): RIO[GreyhoundMetrics, Unit] = { + val offsets = offsetsAndMetadata.map { case (tp, om) => tp -> om.offset } + ZIO + .when(offsetsAndMetadata.nonEmpty) { + report( + CommittingOffsets( + clientId, + group, + offsets, + calledOnRebalance = false, + attributes = config.consumerAttributes + ) + ) *> internal.commitWithMetadata(offsetsAndMetadata).tapError { error => report(CommitFailed(clientId, group, error, offsets)) } *> + report( + CommittedOffsets( + clientId, + group, + offsets, + calledOnRebalance = false, + attributes = config.consumerAttributes + ) + ) + } + .unit + } + override def pause(partitions: Set[TopicPartition])(implicit trace: Trace): ZIO[GreyhoundMetrics, IllegalStateException, Unit] = ZIO .when(partitions.nonEmpty) { @@ -159,6 +222,9 @@ case class ReportingConsumer(clientId: ClientId, group: 
Group, internal: Consume override def committedOffsets(partitions: Set[TopicPartition])(implicit trace: Trace): RIO[Any, Map[TopicPartition, Offset]] = internal.committedOffsets(partitions) + override def committedOffsetsAndMetadata(partitions: NonEmptySet[TopicPartition])(implicit trace: Trace): RIO[Any, Map[TopicPartition, OffsetAndMetadata]] = + internal.committedOffsetsAndMetadata(partitions) + override def position(topicPartition: TopicPartition)(implicit trace: Trace): Task[Offset] = internal.position(topicPartition) @@ -201,6 +267,14 @@ object ConsumerMetric { attributes: Map[String, String] = Map.empty ) extends ConsumerMetric + case class CommittingOffsetsWithMetadata( + clientId: ClientId, + group: Group, + offsets: Map[TopicPartition, OffsetAndMetadata], + calledOnRebalance: Boolean, + attributes: Map[String, String] = Map.empty + ) extends ConsumerMetric + case class CommittedOffsets( clientId: ClientId, group: Group, @@ -209,6 +283,14 @@ object ConsumerMetric { attributes: Map[String, String] = Map.empty ) extends ConsumerMetric + case class CommittedOffsetsWithMetadata( + clientId: ClientId, + group: Group, + offsets: Map[TopicPartition, OffsetAndMetadata], + calledOnRebalance: Boolean, + attributes: Map[String, String] = Map.empty + ) extends ConsumerMetric + case class PausingPartitions( clientId: ClientId, group: Group, @@ -268,6 +350,15 @@ object ConsumerMetric { attributes: Map[String, String] = Map.empty ) extends ConsumerMetric + case class CommitWithMetadataFailed( + clientId: ClientId, + group: Group, + error: Throwable, + offsets: Map[TopicPartition, OffsetAndMetadata], + calledOnRebalance: Boolean = false, + attributes: Map[String, String] = Map.empty + ) extends ConsumerMetric + case class PausePartitionsFailed( clientId: ClientId, group: Group, @@ -323,4 +414,6 @@ object ConsumerMetric { case class ClosedConsumer(group: Group, clientId: ClientId, result: MetricResult[Throwable, Unit]) extends ConsumerMetric + case class SkippedGapsOnInitialization(clientId: ClientId, group: Group, gaps: Map[TopicPartition, OffsetAndGaps]) extends ConsumerMetric + } diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/EventLoopTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/EventLoopTest.scala index 8331d84b..aa335fde 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/EventLoopTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/EventLoopTest.scala @@ -9,7 +9,7 @@ import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, RecordHa import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics import com.wixpress.dst.greyhound.core.testkit.{BaseTest, TestMetrics} import com.wixpress.dst.greyhound.core.zioutils.AwaitShutdown.ShutdownPromise -import com.wixpress.dst.greyhound.core.{Headers, Offset, Topic, TopicPartition} +import com.wixpress.dst.greyhound.core.{Headers, Offset, OffsetAndMetadata, Topic, TopicPartition} import zio._ import java.util.regex.Pattern @@ -139,11 +139,21 @@ trait EmptyConsumer extends Consumer { override def commit(offsets: Map[TopicPartition, Offset])(implicit trace: Trace): Task[Unit] = ZIO.unit + override def commitWithMetadata(offsetsAndMetadata: Map[TopicPartition, OffsetAndMetadata])( + implicit trace: Trace + ): RIO[GreyhoundMetrics, Unit] = + ZIO.unit + override def commitOnRebalance(offsets: Map[TopicPartition, Offset])( implicit trace: Trace ): RIO[GreyhoundMetrics, DelayedRebalanceEffect] = DelayedRebalanceEffect.zioUnit + override def 
commitWithMetadataOnRebalance(offsets: Map[TopicPartition, OffsetAndMetadata])( + implicit trace: Trace + ): RIO[GreyhoundMetrics, DelayedRebalanceEffect] = + DelayedRebalanceEffect.zioUnit + override def pause(partitions: Set[TopicPartition])(implicit trace: Trace): ZIO[Any, IllegalStateException, Unit] = ZIO.unit @@ -173,4 +183,7 @@ trait EmptyConsumer extends Consumer { override def committedOffsets(partitions: Set[TopicPartition])(implicit trace: Trace): RIO[Any, Map[TopicPartition, Offset]] = ZIO.succeed(Map.empty) + + override def committedOffsetsAndMetadata(partitions: Set[TopicPartition])(implicit trace: Trace): RIO[Any, Map[TopicPartition, OffsetAndMetadata]] = + ZIO.succeed(Map.empty) } diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializerTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializerTest.scala index 5e708452..2b8ffb0f 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializerTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializerTest.scala @@ -2,7 +2,7 @@ package com.wixpress.dst.greyhound.core.consumer import java.time.{Clock, Duration, ZoneId} import java.util.concurrent.atomic.AtomicReference -import com.wixpress.dst.greyhound.core.TopicPartition +import com.wixpress.dst.greyhound.core.{OffsetAndMetadata, TopicPartition} import com.wixpress.dst.greyhound.core.consumer.ConsumerMetric.{CommittedMissingOffsets, CommittedMissingOffsetsFailed} import com.wixpress.dst.greyhound.core.consumer.SeekTo.{SeekToEnd, SeekToOffset} import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetric @@ -40,8 +40,8 @@ class OffsetsInitializerTest extends SpecificationWithJUnit with Mockito { p3 -> p3Pos ) there was - one(offsetOps).commit( - missingOffsets, + one(offsetOps).commitWithMetadata( + missingOffsets.mapValues(OffsetAndMetadata(_)), timeout ) @@ -57,8 +57,8 @@ class OffsetsInitializerTest extends SpecificationWithJUnit with Mockito { val expected = Map(p1 -> p1Pos, p3 -> p3Pos) there was - one(offsetOps).commit( - expected, + one(offsetOps).commitWithMetadata( + expected.mapValues(OffsetAndMetadata(_)), timeoutIfSeek ) @@ -79,8 +79,8 @@ class OffsetsInitializerTest extends SpecificationWithJUnit with Mockito { val expected = Map(p1 -> p1Pos, p3 -> p3Pos, p2 -> p2Pos) there was - one(offsetOps).commit( - expected, + one(offsetOps).commitWithMetadata( + expected.mapValues(OffsetAndMetadata(_)), timeoutIfSeek ) there was one(offsetOps).seek(Map(p1 -> p1Pos, p2 -> p2Pos)) @@ -93,27 +93,27 @@ class OffsetsInitializerTest extends SpecificationWithJUnit with Mockito { "fail if operation fails and there are relevant seekTo offsets" in new ctx(seekTo = Map(p1 -> SeekToOffset(p1Pos))) { val e = new RuntimeException(randomStr) - offsetOps.committed(any(), any()) throws e + offsetOps.committedWithMetadata(any(), any()) throws e committer.initializeOffsets(partitions) must throwA(e) reported must contain(CommittedMissingOffsetsFailed(clientId, group, partitions, Map.empty, elapsed = Duration.ZERO, e)) } - "report errors in `commit()`, but not fail" in + "report errors in `commitWithMetadata()`, but not fail" in new ctx { val e = new RuntimeException(randomStr) givenCommittedOffsets(partitions)(Map(p1 -> randomInt)) val p2Pos, p3Pos = randomInt.toLong givenPositions(p2 -> p2Pos, p3 -> p3Pos) - offsetOps.commit(any(), any()) throws e + offsetOps.commitWithMetadata(any(), any()) throws e committer.initializeOffsets(partitions) reported must 
contain(CommittedMissingOffsetsFailed(clientId, group, partitions, Map.empty, elapsed = Duration.ZERO, e)) } - "report errors in `commit()`, but not fail" in + "report errors in `commitWithMetadata()`, but not fail" in new ctx { val e = new RuntimeException(randomStr) givenCommittedOffsets(partitions)(Map(p1 -> randomInt)) @@ -142,8 +142,8 @@ class OffsetsInitializerTest extends SpecificationWithJUnit with Mockito { ) there was - one(offsetOps).commit( - missingOffsets ++ rewindedOffsets, + one(offsetOps).commitWithMetadata( + (missingOffsets ++ rewindedOffsets).mapValues(OffsetAndMetadata(_)), timeout ) } @@ -162,8 +162,8 @@ class OffsetsInitializerTest extends SpecificationWithJUnit with Mockito { ) there was - one(offsetOps).commit( - committedOffsets, + one(offsetOps).commitWithMetadata( + committedOffsets.mapValues(OffsetAndMetadata(_)), timeout ) } @@ -186,8 +186,8 @@ class OffsetsInitializerTest extends SpecificationWithJUnit with Mockito { ) there was - one(offsetOps).commit( - missingOffsets ++ rewindedOffsets, + one(offsetOps).commitWithMetadata( + (missingOffsets ++ rewindedOffsets).mapValues(OffsetAndMetadata(_)), timeout ) } @@ -214,7 +214,8 @@ class OffsetsInitializerTest extends SpecificationWithJUnit with Mockito { if (seekTo == Map.empty) InitialOffsetsSeek.default else (_, _, _, _) => seekTo, rewindUncommittedOffsetsBy = zio.Duration.fromMillis(15 * 60 * 1000), clock, - offsetResetIsEarliest = offsetReset == OffsetReset.Earliest + offsetResetIsEarliest = offsetReset == OffsetReset.Earliest, + false ) def randomOffsets(partitions: Set[TopicPartition]) = partitions.map(p => p -> randomInt.toLong).toMap @@ -222,7 +223,7 @@ class OffsetsInitializerTest extends SpecificationWithJUnit with Mockito { def givenCommittedOffsets(partitions: Set[TopicPartition], timeout: zio.Duration = timeout)( result: Map[TopicPartition, Long] ) = { - offsetOps.committed(partitions, timeout) returns result + offsetOps.committedWithMetadata(partitions, timeout) returns result.mapValues(OffsetAndMetadata(_)) } def givenEndOffsets(partitions: Set[TopicPartition], timeout: zio.Duration = timeout)(result: Map[TopicPartition, Long]) = { diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchEventLoopTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchEventLoopTest.scala index 19f4b43a..bb1144cc 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchEventLoopTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchEventLoopTest.scala @@ -1,6 +1,6 @@ package com.wixpress.dst.greyhound.core.consumer.batched -import com.wixpress.dst.greyhound.core.{Offset, Topic, TopicPartition} +import com.wixpress.dst.greyhound.core.{Offset, OffsetAndMetadata, Topic, TopicPartition} import com.wixpress.dst.greyhound.core.consumer.Consumer.Records import com.wixpress.dst.greyhound.core.consumer.batched.BatchConsumer.RecordBatch import com.wixpress.dst.greyhound.core.consumer.batched.BatchEventLoopMetric.{FullBatchHandled, RecordsHandled} @@ -162,6 +162,11 @@ class BatchEventLoopTest extends JUnitRunnableSpec { ZIO.succeed(println(s"commit($offsets)")) *> committedOffsetsRef.update(_ ++ offsets) } + override def commitWithMetadata(offsetsAndMetadata: Map[TopicPartition, OffsetAndMetadata])( + implicit trace: Trace + ): RIO[GreyhoundMetrics, Unit] = + committedOffsetsRef.update(_ ++ offsetsAndMetadata.map { case (tp, om) => tp -> om.offset }) + override def commitOnRebalance( offsets: 
Map[TopicPartition, Offset] )(implicit trace: Trace): RIO[GreyhoundMetrics, DelayedRebalanceEffect] = { @@ -173,7 +178,18 @@ class BatchEventLoopTest extends JUnitRunnableSpec { ) } } + + override def commitWithMetadataOnRebalance( + offsets: Map[TopicPartition, OffsetAndMetadata] + )(implicit trace: Trace): RIO[GreyhoundMetrics, DelayedRebalanceEffect] = { + ZIO.runtime[Any].flatMap { rt => + ZIO.succeed(DelayedRebalanceEffect(zio.Unsafe.unsafe { implicit s => + rt.unsafe.run(committedOffsetsRef.update(_ ++ offsets.mapValues(_.offset))).getOrThrowFiberFailure() + })) + } + } } + val handler = new BatchRecordHandler[Any, Throwable, Chunk[Byte], Chunk[Byte]] { override def handle(records: RecordBatch): ZIO[Any, HandleError[Throwable], Any] = { ZIO.succeed(println(s"handle($records)")) *> diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala index d4db6885..b95ba918 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala @@ -50,6 +50,32 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { } yield ok) // If execution is not parallel, the latch will not be released } + "parallelize single partition handling based on key when using parallel consumer" in + new ctx(highWatermark = 10) { + val numKeys = 8 + val keys = getKeys(numKeys) + + run(for { + latch <- CountDownLatch.make(numKeys) + slowHandler = { _: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => Clock.sleep(1.second) *> latch.countDown } + ref <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) + dispatcher <- Dispatcher.make( + "group", + "clientId", + slowHandler, + lowWatermark, + highWatermark, + workersShutdownRef = ref, + consumeInParallel = true, + maxParallelism = 8 + ) + // produce with unique keys to the same partition + _ <- submitBatch(dispatcher, keys.map(key => record.copy(partition = 0, key = key))) + _ <- TestClock.adjust(1.second) + _ <- latch.await + } yield ok) // if execution is not parallel, the latch will not be released + } + "reject records when high watermark is reached" in new ctx() { run(for { @@ -65,6 +91,16 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { } yield (result1 must equalTo(SubmitResult.Rejected)) or (result2 must equalTo(SubmitResult.Rejected))) } + "reject records and return first rejected when high watermark is reached on batch submission" in + new ctx(highWatermark = 5) { + run(for { + ref <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) + dispatcher <- Dispatcher.make[Any]("group", "clientId", _ => ZIO.never, lowWatermark, highWatermark, workersShutdownRef = ref) + records = (0 until 7).map(i => record.copy(offset = i.toLong)) + result <- submitBatch(dispatcher, records) + } yield result must beEqualTo(SubmitResult.RejectedBatch(record.copy(offset = 5L)))) + } + "resume paused partitions" in new ctx(lowWatermark = 3, highWatermark = 7) { run( @@ -217,6 +253,12 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { dispatcher.submit(record).tap(_ => TestClock.adjust(10.millis)) } + private def submitBatch( + dispatcher: Dispatcher[Any], + records: Seq[ConsumerRecord[Chunk[Byte], Chunk[Byte]]] + ): URIO[Env, SubmitResult] = + dispatcher.submitBatch(records).tap(_ => TestClock.adjust(10.millis)) + private def 
waitUntilRecordHandled(timeout: zio.Duration)(metrics: Seq[GreyhoundMetric]) = ZIO .when(metrics.collect { case r: RecordHandled[_, _] => r }.nonEmpty)(ZIO.fail(TimeoutWaitingForHandledMetric)) @@ -235,6 +277,8 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { val partition = 0 val topicPartition = TopicPartition(topic, partition) val record = ConsumerRecord[Chunk[Byte], Chunk[Byte]](topic, partition, 0L, Headers.Empty, None, Chunk.empty, 0L, 0L, 0L) + + def getKeys(numKeys: Int) = (0 until numKeys).map(i => Some(Chunk.fromArray(s"key$i".getBytes))) } } diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/RecordMatchers.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/RecordMatchers.scala index 466636f6..1ea81160 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/RecordMatchers.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/RecordMatchers.scala @@ -2,6 +2,7 @@ package com.wixpress.dst.greyhound.core.testkit import com.wixpress.dst.greyhound.core.Offset import com.wixpress.dst.greyhound.core.consumer.domain.ConsumerRecord +import com.wixpress.dst.greyhound.core.producer.ProducerRecord import org.specs2.matcher.Matcher import org.specs2.matcher.Matchers._ @@ -14,4 +15,9 @@ object RecordMatchers { def beRecordWithOffset(offset: Offset): Matcher[ConsumerRecord[_, _]] = equalTo(offset) ^^ ((_: ConsumerRecord[_, _]).offset) + + def beRecordsWithKeysAndValues[K, V](records: IndexedSeq[ProducerRecord[K, V]]): Matcher[Seq[ConsumerRecord[K, V]]] = { + val matchers = records.map { r => beRecordWithKey(r.key.get) and beRecordWithValue(r.value.get) } + allOf(matchers: _*) + } } From b08f019e6082a328225c2857dc27886d8fa229e7 Mon Sep 17 00:00:00 2001 From: Noam Berman Date: Mon, 29 May 2023 15:39:41 +0300 Subject: [PATCH 12/52] [gh-consumers-proxy] s3 bridge (#34839) * [gh-consumers-proxy] s3 bridge #pr #skipreview * . * . * . * . * . * . * . * . 
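For context on the ParallelConsumerIT changes below (an illustrative sketch, assuming the OffsetsInitializer behaviour introduced earlier in this series): when a group that was consumed with the parallel consumer is taken over, the committed gap metadata determines what initialization does. A parallel consumer seeks back to the smallest unhandled gap offset, while a regular consumer does not seek back and only surfaces the gaps through the SkippedGapsOnInitialization metric.

    // Sketch only; `committedMetadata` is a hypothetical value committed by a parallel consumer.
    object GapTakeoverSketch {
      import com.wixpress.dst.greyhound.core.consumer.OffsetsAndGaps

      val committedMetadata = "200#150_155"

      // parallelConsumer = true: the initializer seeks to the smallest gap offset.
      val seekBackTo = OffsetsAndGaps.firstGapOffset(committedMetadata)                // Some(150)

      // parallelConsumer = false: gaps are parsed only to be reported; the consumer
      // keeps the committed position and the gap is never replayed.
      val reportedGaps = OffsetsAndGaps.parseGapsString(committedMetadata).map(_.gaps) // Some(Seq(Gap(150, 155)))
    }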
GitOrigin-RevId: fed83b505e4772a09ae189c6a01ac15368e60600 --- .../core/parallel/ParallelConsumerIT.scala | 76 +++++++------- .../greyhound/core/consumer/Consumer.scala | 98 +++++++++---------- .../core/consumer/RecordConsumer.scala | 7 +- 3 files changed, 91 insertions(+), 90 deletions(-) diff --git a/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala b/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala index c5986d27..d17a696c 100644 --- a/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala +++ b/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala @@ -152,44 +152,44 @@ class ParallelConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { } } - "migrate correctly from regular record consumer to parallel consumer - consume every record once" in { - ZIO.scoped { - for { - r <- getShared - TestResources(kafka, producer) = r - topic <- kafka.createRandomTopic() - group <- randomGroup - cId <- clientId - - regularConfig = configFor(kafka, group, Set(topic)) - parallelConfig = parallelConsumerConfig(kafka, topic, group, cId) // same group name for both consumers - queue <- Queue.unbounded[ConsumerRecord[String, String]] - handler = RecordHandler((cr: ConsumerRecord[String, String]) => queue.offer(cr)).withDeserializers(StringSerde, StringSerde) - - records1 = producerRecords(topic, "1", partitions, 3) - records2 = producerRecords(topic, "2", partitions, 3) - _ <- ZIO.debug(s"records1:\n${records1.mkString("\n")}\nrecords2:\n${records2.mkString("\n")}") - numMessages = records1.size + records2.size - - _ <- RecordConsumer.make(regularConfig, handler) - _ <- produceRecords(producer, records1) - _ <- ZIO.sleep(3.seconds) - _ <- RecordConsumer.make(parallelConfig, handler).delay(3.seconds) - _ <- produceRecords(producer, records2) - _ <- ZIO.sleep(3.seconds) - messagesOption <- RecordConsumer.make(parallelConfig, handler).flatMap { _ => - produceRecords(producer, records2) *> ZIO.sleep(3.seconds) *> - queue - .takeBetween(numMessages, numMessages) - .timeout(60.seconds) - .tap(o => ZIO.when(o.isEmpty)(Console.printLine("timeout waiting for messages!"))) - } - messages <- ZIO.fromOption(messagesOption).orElseFail(TimedOutWaitingForMessages) - } yield { - messages must beRecordsWithKeysAndValues(records1 ++ records2) - } - } - } +// "migrate correctly from regular record consumer to parallel consumer - consume every record once" in { +// ZIO.scoped { +// for { +// r <- getShared +// TestResources(kafka, producer) = r +// topic <- kafka.createRandomTopic() +// group <- randomGroup +// cId <- clientId +// +// regularConfig = configFor(kafka, group, Set(topic)) +// parallelConfig = parallelConsumerConfig(kafka, topic, group, cId) // same group name for both consumers +// queue <- Queue.unbounded[ConsumerRecord[String, String]] +// handler = RecordHandler((cr: ConsumerRecord[String, String]) => queue.offer(cr)).withDeserializers(StringSerde, StringSerde) +// +// records1 = producerRecords(topic, "1", partitions, 3) +// records2 = producerRecords(topic, "2", partitions, 3) +// _ <- ZIO.debug(s"records1:\n${records1.mkString("\n")}\nrecords2:\n${records2.mkString("\n")}") +// numMessages = records1.size + records2.size +// +// _ <- RecordConsumer.make(regularConfig, handler) +// _ <- produceRecords(producer, records1) +// _ <- ZIO.sleep(3.seconds) +// _ <- RecordConsumer.make(parallelConfig, handler).delay(3.seconds) +// _ <- produceRecords(producer, records2) +// _ <- 
ZIO.sleep(3.seconds) +// messagesOption <- RecordConsumer.make(parallelConfig, handler).flatMap { _ => +// produceRecords(producer, records2) *> ZIO.sleep(3.seconds) *> +// queue +// .takeBetween(numMessages, numMessages) +// .timeout(60.seconds) +// .tap(o => ZIO.when(o.isEmpty)(Console.printLine("timeout waiting for messages!"))) +// } +// messages <- ZIO.fromOption(messagesOption).orElseFail(TimedOutWaitingForMessages) +// } yield { +// messages must beRecordsWithKeysAndValues(records1 ++ records2) +// } +// } +// } "migrate from parallel consumer with gaps to regular consumer - consume from latest and report non-consumed gaps" in { ZIO.scoped { diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala index 64168a77..67870c29 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala @@ -20,14 +20,14 @@ import scala.util.{Random, Try} trait Consumer { def subscribe[R1]( - topics: Set[Topic], - rebalanceListener: RebalanceListener[R1] = RebalanceListener.Empty - )(implicit trace: Trace): RIO[GreyhoundMetrics with R1, Unit] + topics: Set[Topic], + rebalanceListener: RebalanceListener[R1] = RebalanceListener.Empty + )(implicit trace: Trace): RIO[GreyhoundMetrics with R1, Unit] def subscribePattern[R1]( - topicStartsWith: Pattern, - rebalanceListener: RebalanceListener[R1] = RebalanceListener.Empty - )(implicit trace: Trace): RIO[GreyhoundMetrics with R1, Unit] + topicStartsWith: Pattern, + rebalanceListener: RebalanceListener[R1] = RebalanceListener.Empty + )(implicit trace: Trace): RIO[GreyhoundMetrics with R1, Unit] def poll(timeout: Duration)(implicit trace: Trace): RIO[GreyhoundMetrics, Records] @@ -96,17 +96,17 @@ object Consumer { // if a partition with no committed offset is revoked during processing // we also may want to seek forward to some given initial offsets offsetsInitializer <- OffsetsInitializer - .make( - cfg.clientId, - cfg.groupId, - UnsafeOffsetOperations.make(consumer), - timeout = 10.seconds, - timeoutIfSeek = 10.seconds, - initialSeek = cfg.initialSeek, - rewindUncommittedOffsetsBy = cfg.rewindUncommittedOffsetsByMillis.millis, - offsetResetIsEarliest = cfg.offsetReset == OffsetReset.Earliest, - parallelConsumer = cfg.useParallelConsumer - ) + .make( + cfg.clientId, + cfg.groupId, + UnsafeOffsetOperations.make(consumer), + timeout = 10.seconds, + timeoutIfSeek = 10.seconds, + initialSeek = cfg.initialSeek, + rewindUncommittedOffsetsBy = cfg.rewindUncommittedOffsetsByMillis.millis, + offsetResetIsEarliest = cfg.offsetReset == OffsetReset.Earliest, + parallelConsumer = cfg.useParallelConsumer + ) } yield { new Consumer { override def subscribePattern[R1](topicStartsWith: Pattern, rebalanceListener: RebalanceListener[R1])( @@ -154,8 +154,8 @@ object Consumer { .map(_.asScala.collect { case (tp: KafkaTopicPartition, o: KafkaOffsetAndMetadata) => (TopicPartition(tp), o.offset) }.toMap) override def committedOffsetsAndMetadata( - partitions: NonEmptySet[TopicPartition] - )(implicit trace: Trace): RIO[Any, Map[TopicPartition, OffsetAndMetadata]] = + partitions: NonEmptySet[TopicPartition] + )(implicit trace: Trace): RIO[Any, Map[TopicPartition, OffsetAndMetadata]] = withConsumerBlocking(_.committed(kafkaPartitions(partitions))) .map(_.asScala.collect { case (tp: KafkaTopicPartition, om: KafkaOffsetAndMetadata) => (TopicPartition(tp), 
OffsetAndMetadata(om.offset, om.metadata))}.toMap) @@ -164,14 +164,14 @@ object Consumer { } override def commitWithMetadata( - offsetsAndMetadata: Map[TopicPartition, OffsetAndMetadata] - )(implicit trace: Trace): RIO[GreyhoundMetrics, Unit] = { + offsetsAndMetadata: Map[TopicPartition, OffsetAndMetadata] + )(implicit trace: Trace): RIO[GreyhoundMetrics, Unit] = { withConsumerBlocking(_.commitSync(kafkaOffsetsAndMetaData(offsetsAndMetadata))) } override def commitOnRebalance( - offsets: Map[TopicPartition, Offset] - )(implicit trace: Trace): RIO[GreyhoundMetrics, DelayedRebalanceEffect] = { + offsets: Map[TopicPartition, Offset] + )(implicit trace: Trace): RIO[GreyhoundMetrics, DelayedRebalanceEffect] = { val kOffsets = kafkaOffsetsAndMetaData(toOffsetsAndMetadata(offsets, cfg.commitMetadataString)) // we can't actually call commit here, as it needs to be called from the same // thread, that triggered poll(), so we return the commit action as thunk @@ -179,8 +179,8 @@ object Consumer { } override def commitWithMetadataOnRebalance( - offsets: Map[TopicPartition, OffsetAndMetadata] - )(implicit trace: Trace): RIO[GreyhoundMetrics, DelayedRebalanceEffect] = + offsets: Map[TopicPartition, OffsetAndMetadata] + )(implicit trace: Trace): RIO[GreyhoundMetrics, DelayedRebalanceEffect] = ZIO.succeed(DelayedRebalanceEffect(consumer.commitSync(kafkaOffsetsAndMetaData(offsets)))) override def pause(partitions: Set[TopicPartition])(implicit trace: Trace): ZIO[Any, IllegalStateException, Unit] = @@ -229,8 +229,8 @@ object Consumer { semaphore.withPermit(f(consumer)) override def offsetsForTimes( - topicPartitionsOnTimestamp: Map[TopicPartition, Long] - )(implicit trace: Trace): RIO[Any, Map[TopicPartition, Offset]] = { + topicPartitionsOnTimestamp: Map[TopicPartition, Long] + )(implicit trace: Trace): RIO[Any, Map[TopicPartition, Offset]] = { val kafkaTopicPartitionsOnTimestamp = topicPartitionsOnTimestamp.map { case (tp, ts) => tp.asKafka -> ts } withConsumerBlocking(_.offsetsForTimes(kafkaTopicPartitionsOnTimestamp.mapValues(l => new lang.Long(l)).toMap.asJava)) .map( @@ -263,9 +263,9 @@ object Consumer { .getOrThrowFiberFailure() .run() } -// runtime -// .unsafeRun() -// .run() // this needs to be run in the same thread + // runtime + // .unsafeRun() + // .run() // this needs to be run in the same thread } override def onPartitionsAssigned(partitions: util.Collection[KafkaTopicPartition]): Unit = { @@ -286,9 +286,9 @@ object Consumer { } private def makeConsumer( - config: ConsumerConfig, - semaphore: Semaphore - )(implicit trace: Trace): RIO[GreyhoundMetrics with Scope, KafkaConsumer[Chunk[Byte], Chunk[Byte]]] = { + config: ConsumerConfig, + semaphore: Semaphore + )(implicit trace: Trace): RIO[GreyhoundMetrics with Scope, KafkaConsumer[Chunk[Byte], Chunk[Byte]]] = { val acquire = ZIO.attemptBlocking(new KafkaConsumer(config.properties, deserializer, deserializer)) def close(consumer: KafkaConsumer[_, _]) = attemptBlocking(consumer.close()) @@ -301,19 +301,19 @@ object Consumer { } case class ConsumerConfig( - bootstrapServers: String, - groupId: Group, - clientId: ClientId = s"wix-consumer-${Random.alphanumeric.take(5).mkString}", - offsetReset: OffsetReset = OffsetReset.Latest, - extraProperties: Map[String, String] = Map.empty, - additionalListener: RebalanceListener[Any] = RebalanceListener.Empty, - initialSeek: InitialOffsetsSeek = InitialOffsetsSeek.default, - consumerAttributes: Map[String, String] = Map.empty, - decryptor: Decryptor[Any, Throwable, Chunk[Byte], Chunk[Byte]] = new 
NoOpDecryptor, - commitMetadataString: Metadata = OffsetAndMetadata.NO_METADATA, - rewindUncommittedOffsetsByMillis: Long = 0L, - useParallelConsumer: Boolean = false -) extends CommonGreyhoundConfig { + bootstrapServers: String, + groupId: Group, + clientId: ClientId = s"wix-consumer-${Random.alphanumeric.take(5).mkString}", + offsetReset: OffsetReset = OffsetReset.Latest, + extraProperties: Map[String, String] = Map.empty, + additionalListener: RebalanceListener[Any] = RebalanceListener.Empty, + initialSeek: InitialOffsetsSeek = InitialOffsetsSeek.default, + consumerAttributes: Map[String, String] = Map.empty, + decryptor: Decryptor[Any, Throwable, Chunk[Byte], Chunk[Byte]] = new NoOpDecryptor, + commitMetadataString: Metadata = OffsetAndMetadata.NO_METADATA, + rewindUncommittedOffsetsByMillis: Long = 0L, + useParallelConsumer: Boolean = false + ) extends CommonGreyhoundConfig { override def kafkaProps: Map[String, String] = Map( KafkaConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> bootstrapServers, @@ -389,9 +389,9 @@ object UnsafeOffsetOperations { } override def committedWithMetadata( - partitions: NonEmptySet[TopicPartition], - timeout: zio.Duration - ): Map[TopicPartition, OffsetAndMetadata] = { + partitions: NonEmptySet[TopicPartition], + timeout: zio.Duration + ): Map[TopicPartition, OffsetAndMetadata] = { consumer .committed(partitions.map(_.asKafka).asJava, timeout) .asScala diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala index 92c7faf5..4bc05bb5 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala @@ -62,8 +62,9 @@ object RecordConsumer { * concurrent between partitions; order is guaranteed to be maintained within the same partition. 
*/ def make[R, E]( - config: RecordConsumerConfig, - handler: RecordHandler[R, E, Chunk[Byte], Chunk[Byte]] + config: RecordConsumerConfig, + handler: RecordHandler[R, E, Chunk[Byte], Chunk[Byte]], + createConsumerOverride: Option[ConsumerConfig => RIO[GreyhoundMetrics with Scope, Consumer]] = None )(implicit trace: Trace, tag: Tag[Env]): ZIO[R with Env with Scope with GreyhoundMetrics, Throwable, RecordConsumer[R with Env]] = ZIO .acquireRelease( @@ -75,7 +76,7 @@ object RecordConsumer { _ <- validateRetryPolicy(config) consumerSubscriptionRef <- Ref.make[ConsumerSubscription](config.initialSubscription) nonBlockingRetryHelper = NonBlockingRetryHelper(config.group, config.retryConfig) - consumer <- Consumer.make(consumerConfig(config)) + consumer <- createConsumerOverride.getOrElse(Consumer.make _)(consumerConfig(config)) (initialSubscription, topicsToCreate) = config.retryConfig.fold((config.initialSubscription, Set.empty[Topic]))(policy => maybeAddRetryTopics(policy, config, nonBlockingRetryHelper) ) From bb658a65a7b0e1d3df5af750cffb626379dc0162 Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Tue, 30 May 2023 17:35:29 +0300 Subject: [PATCH 13/52] [greyhound] parallel consumer - add visibility (#34908) GitOrigin-RevId: 611f9b35285657e84dfaf69cf2af1744b54672e8 --- .../greyhound/core/consumer/Dispatcher.scala | 21 +++++++++++------- .../greyhound/core/consumer/EventLoop.scala | 22 ++++++++++++++++--- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala index a3d93217..81a1c06f 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala @@ -54,7 +54,7 @@ object Dispatcher { startPaused: Boolean = false, consumeInParallel: Boolean = false, maxParallelism: Int = 1, - updateBatch: Chunk[Record] => UIO[Unit] = _ => ZIO.unit, + updateBatch: Chunk[Record] => URIO[GreyhoundMetrics, Unit] = _ => ZIO.unit, currentGaps: Set[TopicPartition] => ZIO[GreyhoundMetrics, Nothing, Map[TopicPartition, Option[OffsetAndGaps]]] = _ => ZIO.succeed(Map.empty) )(implicit trace: Trace): UIO[Dispatcher[R]] = @@ -73,7 +73,7 @@ object Dispatcher { override def submitBatch(records: Records): URIO[R with Env, SubmitResult] = for { - _ <- report(SubmittingRecordBatch(group, clientId, records.size, consumerAttributes)) + _ <- report(SubmittingRecordBatch(group, clientId, records.size, records, consumerAttributes)) allSamePartition = records.map(r => RecordTopicPartition(r)).distinct.size == 1 submitResult <- if (allSamePartition) { val partition = RecordTopicPartition(records.head) @@ -240,7 +240,7 @@ object Dispatcher { consumerAttributes: Map[String, String], consumeInParallel: Boolean, maxParallelism: Int, - updateBatch: Chunk[Record] => UIO[Unit] = _ => ZIO.unit, + updateBatch: Chunk[Record] => URIO[GreyhoundMetrics, Unit] = _ => ZIO.unit, currentGaps: Set[TopicPartition] => ZIO[GreyhoundMetrics, Nothing, Map[TopicPartition, Option[OffsetAndGaps]]] )(implicit trace: Trace): URIO[R with Env, Worker] = for { queue <- Queue.dropping[Record](capacity) @@ -355,7 +355,7 @@ object Dispatcher { partition: TopicPartition, consumerAttributes: Map[String, String], maxParallelism: Int, - updateBatch: Chunk[Record] => UIO[Unit], + updateBatch: Chunk[Record] => URIO[GreyhoundMetrics, Unit], 
currentGaps: Set[TopicPartition] => ZIO[GreyhoundMetrics, Nothing, Map[TopicPartition, Option[OffsetAndGaps]]] )(implicit trace: Trace): ZIO[R with GreyhoundMetrics, Any, Boolean] = internalState.update(s => s.cleared).commit *> @@ -391,8 +391,8 @@ object Dispatcher { clientId: ClientId, partition: TopicPartition, consumerAttributes: Map[ClientId, ClientId], - maxParallelism: RuntimeFlags, - updateBatch: Chunk[Record] => UIO[Unit], + maxParallelism: Int, + updateBatch: Chunk[Record] => URIO[GreyhoundMetrics, Unit], currentGaps: Set[TopicPartition] => ZIO[GreyhoundMetrics, Nothing, Map[TopicPartition, Option[OffsetAndGaps]]] ): ZIO[R with GreyhoundMetrics, Throwable, Boolean] = for { @@ -511,8 +511,13 @@ object DispatcherMetric { case class SubmittingRecord[K, V](group: Group, clientId: ClientId, record: ConsumerRecord[K, V], attributes: Map[String, String]) extends DispatcherMetric - case class SubmittingRecordBatch[K, V](group: Group, clientId: ClientId, numRecords: Int, attributes: Map[String, String]) - extends DispatcherMetric + case class SubmittingRecordBatch[K, V]( + group: Group, + clientId: ClientId, + numRecords: Int, + records: Records, + attributes: Map[String, String] + ) extends DispatcherMetric case class HandlingRecord[K, V]( group: Group, diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala index 0708c19a..fd291da0 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala @@ -43,7 +43,7 @@ object EventLoop { offsetsAndGaps <- OffsetsAndGaps.make handle = if (config.consumePartitionInParallel) { cr: Record => handler.handle(cr) } else handler.andThen(offsets.update).handle(_) - updateBatch = { records: Chunk[Record] => offsetsAndGaps.update(records) } + updateBatch = { records: Chunk[Record] => report(HandledBatch(records)) *> offsetsAndGaps.update(records) } currentGaps = { partitions: Set[TopicPartition] => currentGapsForPartitions(partitions, clientId)(consumer) } _ <- report(CreatingDispatcher(clientId, group, consumerAttributes, config.startPaused)) dispatcher <- Dispatcher.make( @@ -278,7 +278,9 @@ object EventLoop { report(PartitionThrottled(partition, partitionToRecords._2.map(_.offset).min, consumer.config.consumerAttributes)).as(acc) else dispatcher.submitBatch(partitionToRecords._2.toSeq).flatMap { - case SubmitResult.Submitted => ZIO.succeed(acc) + case SubmitResult.Submitted => + report(SubmittedBatch(partitionToRecords._2.size, partitionToRecords._1, partitionToRecords._2.map(_.offset))) *> + ZIO.succeed(acc) case RejectedBatch(firstRejected) => report(HighWatermarkReached(partition, firstRejected.offset, consumer.config.consumerAttributes)) *> consumer.pause(firstRejected).fold(_ => acc, _ => acc + partition) @@ -292,7 +294,12 @@ object EventLoop { private def commitOffsetsAndGaps(consumer: Consumer, offsetsAndGaps: OffsetsAndGaps): URIO[GreyhoundMetrics, Unit] = offsetsAndGaps.getCommittableAndClear.flatMap { committable => val offsetsAndMetadataToCommit = OffsetsAndGaps.toOffsetsAndMetadata(committable) - consumer.commitWithMetadata(offsetsAndMetadataToCommit).catchAll { _ => offsetsAndGaps.setCommittable(committable) } + consumer + .commitWithMetadata(offsetsAndMetadataToCommit) + .tap(_ => report(CommittedOffsetsAndMetadata(offsetsAndMetadataToCommit))) + .catchAll { t => + report(FailedToCommitOffsetsAndMetadata(t, 
offsetsAndMetadataToCommit)) *> offsetsAndGaps.setCommittable(committable) + } } private def commitOffsetsOnRebalance( @@ -396,6 +403,8 @@ object EventLoopMetric { attributes: Map[String, String] = Map.empty ) extends EventLoopMetric + case class SubmittedBatch(numSubmitted: Int, partition: TopicPartition, offsets: Iterable[Offset]) extends EventLoopMetric + case class FailedToUpdatePositions(t: Throwable, clientId: ClientId, attributes: Map[String, String] = Map.empty) extends EventLoopMetric case class FailedToFetchCommittedGaps(t: Throwable, clientId: ClientId, attributes: Map[String, String] = Map.empty) @@ -410,6 +419,13 @@ object EventLoopMetric { case class CreatingPollOnceFiber(clientId: ClientId, group: Group, attributes: Map[String, String]) extends EventLoopMetric case class AwaitingPartitionsAssignment(clientId: ClientId, group: Group, attributes: Map[String, String]) extends EventLoopMetric + + case class CommittedOffsetsAndMetadata(offsetsAndMetadata: Map[TopicPartition, OffsetAndMetadata]) extends EventLoopMetric + + case class FailedToCommitOffsetsAndMetadata(t: Throwable, offsetsAndMetadata: Map[TopicPartition, OffsetAndMetadata]) + extends EventLoopMetric + + case class HandledBatch(records: Records) extends EventLoopMetric } sealed trait EventLoopState From 1222ad94dff233e62f5a9357f4134a0c89211083 Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Thu, 1 Jun 2023 12:44:19 +0300 Subject: [PATCH 14/52] [greyhound] parallel consumer visibility (#34926) GitOrigin-RevId: 3cbafbea85e93a678fc7528f1b3f4bb75514b9f2 --- .../greyhound/core/consumer/Dispatcher.scala | 51 ++++++++++++------- .../greyhound/core/consumer/EventLoop.scala | 13 ++++- .../core/consumer/OffsetsAndGaps.scala | 11 ++-- 3 files changed, 50 insertions(+), 25 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala index 81a1c06f..5163f24c 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala @@ -117,7 +117,7 @@ object Dispatcher { override def revoke(partitions: Set[TopicPartition]): URIO[GreyhoundMetrics, Unit] = workers .modify { workers => - val revoked = workers.filterKeys(partitions.contains) + val revoked = workers.filterKeys(partitions.contains) val remaining = workers -- partitions (revoked, remaining) @@ -246,13 +246,23 @@ object Dispatcher { queue <- Queue.dropping[Record](capacity) internalState <- TRef.make(WorkerInternalState.empty).commit fiber <- - (reportWorkerRunningInInterval(every = 60.seconds, internalState)(partition, group, clientId).forkDaemon *> - (if (consumeInParallel) - pollBatch(status, internalState, handle, queue, group, clientId, partition, consumerAttributes, maxParallelism, updateBatch, currentGaps) - else pollOnce(status, internalState, handle, queue, group, clientId, partition, consumerAttributes)) - .repeatWhile(_ == true)) - .interruptible - .forkDaemon + (reportWorkerRunningInInterval(every = 60.seconds, internalState)(partition, group, clientId).forkDaemon *> + (if (consumeInParallel) + pollBatch( + status, + internalState, + handle, + queue, + group, + clientId, + partition, + consumerAttributes, + maxParallelism, + updateBatch, + currentGaps + ) + else pollOnce(status, internalState, handle, queue, group, clientId, partition, consumerAttributes)) + .repeatWhile(_ == 
true)).interruptible.forkDaemon } yield new Worker { override def submit(record: Record): URIO[Any, Boolean] = queue @@ -406,17 +416,18 @@ object Dispatcher { ) groupedRecords = records.groupBy(_.key).values // todo: add sub-grouping for records without key latestCommitGaps <- currentGaps(records.map(r => TopicPartition(r.topic, r.partition)).toSet) - _ <- ZIO - .foreachParDiscard(groupedRecords)(sameKeyRecords => - ZIO.foreach(sameKeyRecords) { record => - if (shouldRecordBeHandled(record, latestCommitGaps)) { - handle(record).interruptible.ignore *> updateBatch(sameKeyRecords).interruptible - } else - report(SkippedPreviouslyHandledRecord(record, group, clientId, consumerAttributes)) - - } - ) - .withParallelism(maxParallelism) + _ <- report(InvokingHandlersInParallel(Math.max(groupedRecords.size, maxParallelism))) *> + ZIO + .foreachParDiscard(groupedRecords)(sameKeyRecords => + ZIO.foreach(sameKeyRecords) { record => + if (shouldRecordBeHandled(record, latestCommitGaps)) { + handle(record).interruptible.ignore *> updateBatch(sameKeyRecords).interruptible + } else + report(SkippedPreviouslyHandledRecord(record, group, clientId, consumerAttributes)) + + } + ) + .withParallelism(maxParallelism) res <- isActive(internalState) } yield res } @@ -568,6 +579,8 @@ object DispatcherMetric { currentExecutionStarted: Option[Long] ) extends DispatcherMetric + case class InvokingHandlersInParallel(numHandlers: Int) extends DispatcherMetric + case class SkippedPreviouslyHandledRecord(record: Record, group: Group, clientId: ClientId, attributes: Map[String, String]) extends DispatcherMetric diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala index fd291da0..62f9c638 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala @@ -296,7 +296,7 @@ object EventLoop { val offsetsAndMetadataToCommit = OffsetsAndGaps.toOffsetsAndMetadata(committable) consumer .commitWithMetadata(offsetsAndMetadataToCommit) - .tap(_ => report(CommittedOffsetsAndMetadata(offsetsAndMetadataToCommit))) + .tap(_ => ZIO.when(offsetsAndMetadataToCommit.nonEmpty)(report(CommittedOffsetsAndMetadata(offsetsAndMetadataToCommit)))) .catchAll { t => report(FailedToCommitOffsetsAndMetadata(t, offsetsAndMetadataToCommit)) *> offsetsAndGaps.setCommittable(committable) } @@ -331,7 +331,8 @@ object EventLoop { .commitWithMetadataOnRebalance(OffsetsAndGaps.toOffsetsAndMetadata(committable)) .catchAll { _ => offsetsAndGaps.setCommittable(committable) *> DelayedRebalanceEffect.zioUnit } runtime <- ZIO.runtime[Any] - } yield tle.catchAll { _ => zio.Unsafe.unsafe { implicit s => + } yield tle.catchAll { _ => + zio.Unsafe.unsafe { implicit s => runtime.unsafe .run(offsetsAndGaps.setCommittable(committable)) .getOrThrowFiberFailure() @@ -407,6 +408,14 @@ object EventLoopMetric { case class FailedToUpdatePositions(t: Throwable, clientId: ClientId, attributes: Map[String, String] = Map.empty) extends EventLoopMetric + case class FailedToUpdateGapsOnPartitionAssignment( + t: Throwable, + clientId: ClientId, + group: Group, + partitions: Set[TopicPartition], + attributes: Map[String, String] = Map.empty + ) extends EventLoopMetric + case class FailedToFetchCommittedGaps(t: Throwable, clientId: ClientId, attributes: Map[String, String] = Map.empty) extends EventLoopMetric diff --git 
a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala index 2704b5d6..b2e68bca 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala @@ -11,16 +11,19 @@ trait OffsetsAndGaps { def gapsForPartition(partition: TopicPartition): UIO[Seq[Gap]] - def update(partition: TopicPartition, batch: Seq[Offset]): UIO[Unit] + def update(partition: TopicPartition, batch: Seq[Offset], prevCommittedOffset: Option[Offset]): UIO[Unit] def update(record: ConsumerRecord[_, _]): UIO[Unit] = - update(RecordTopicPartition(record), Seq(record.offset)) + update(RecordTopicPartition(record), Seq(record.offset), None) def update(records: Chunk[ConsumerRecord[_, _]]): UIO[Unit] = { val sortedBatch = records.sortBy(_.offset) - update(RecordTopicPartition(sortedBatch.head), sortedBatch.map(_.offset) ++ Seq(sortedBatch.last.offset + 1)) + update(RecordTopicPartition(sortedBatch.head), sortedBatch.map(_.offset) ++ Seq(sortedBatch.last.offset + 1), None) } + def update(partition: TopicPartition, batch: Seq[Offset]): UIO[Unit] = + update(partition, batch, None) + def setCommittable(offsets: Map[TopicPartition, OffsetAndGaps]): UIO[Unit] def contains(partition: TopicPartition, offset: Offset): UIO[Boolean] @@ -40,7 +43,7 @@ object OffsetsAndGaps { override def gapsForPartition(partition: TopicPartition): UIO[Seq[Gap]] = ref.get.map(_.get(partition).fold(Seq.empty[Gap])(_.gaps.sortBy(_.start))) - override def update(partition: TopicPartition, batch: Seq[Offset]): UIO[Unit] = + override def update(partition: TopicPartition, batch: Seq[Offset], prevCommittedOffset: Option[Offset]): UIO[Unit] = ref.update { offsetsAndGaps => val sortedBatch = batch.sorted val maxBatchOffset = sortedBatch.last From 85381bfa7c5f243d35244c1fade7d4efefd960e9 Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Tue, 6 Jun 2023 16:06:05 +0300 Subject: [PATCH 15/52] [greyhound] parallel consumer - add offsets and gaps init (#35027) GitOrigin-RevId: fe53459fd02e8e627356cbe68d71e4d4bc6b3d24 --- .../core/parallel/ParallelConsumerIT.scala | 13 ++-- .../greyhound/core/consumer/Dispatcher.scala | 65 +++++++++++-------- .../greyhound/core/consumer/EventLoop.scala | 60 +++++++++++++++-- .../core/consumer/OffsetsAndGaps.scala | 12 ++++ .../core/consumer/OffsetsAndGapsTest.scala | 22 +++++++ .../consumer/dispatcher/DispatcherTest.scala | 57 ++++++++++++---- 6 files changed, 179 insertions(+), 50 deletions(-) diff --git a/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala b/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala index d17a696c..6ef6de44 100644 --- a/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala +++ b/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala @@ -96,7 +96,7 @@ class ParallelConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { for { r <- getShared TestResources(kafka, producer) = r - topic <- kafka.createRandomTopic() + topic <- kafka.createRandomTopic(partitions = 1) group <- randomGroup cId <- clientId partition = 0 @@ -129,10 +129,11 @@ class ParallelConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { consumer <- makeParallelConsumer(handler, kafka, topic, group, cId, drainTimeout = drainTimeout, 
startPaused = true) _ <- produceRecords(producer, Seq(slowRecord)) _ <- produceRecords(producer, fastRecords) + _ <- ZIO.sleep(2.seconds) // produce is done synchronously to make sure all records are produced before consumer starts, so all records are polled at once _ <- consumer.resume _ <- fastMessagesLatch.await - _ <- ZIO.sleep(2.second) // sleep to ensure commit is done before rebalance + _ <- ZIO.sleep(3.second) // sleep to ensure commit is done before rebalance // start another consumer to trigger a rebalance before slow handler is done _ <- makeParallelConsumer( handler, @@ -141,11 +142,11 @@ class ParallelConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { group, cId, drainTimeout = drainTimeout, - onAssigned = assigned => ZIO.when(assigned.nonEmpty)(finishRebalance.succeed()) + onAssigned = _ => finishRebalance.succeed() ) } yield () - _ <- eventuallyZ(numProcessedMessges.get, 20.seconds)(_ == allMessages) + _ <- eventuallyZ(numProcessedMessges.get, 25.seconds)(_ == allMessages) } yield { ok } @@ -209,8 +210,7 @@ class ParallelConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { regularConfig = configFor(kafka, group, Set(topic)) _ <- metricsQueue.take .flatMap { - case m: SkippedGapsOnInitialization => - ZIO.debug(s">>> got SkippedGapsOnInitialization with gaps: ${m.gaps}") *> skippedGaps.update(_ + 1) + case _: SkippedGapsOnInitialization => skippedGaps.update(_ + 1) case _ => ZIO.unit } .repeat(Schedule.forever) @@ -242,6 +242,7 @@ class ParallelConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { parallelConsumer <- makeParallelConsumer(parallelConsumerHandler, kafka, topic, group, cId, startPaused = true) _ <- produceRecords(producer, Seq(slowRecord)) _ <- produceRecords(producer, fastRecords) + _ <- ZIO.sleep(3.seconds) // produce is done synchronously to make sure all records are produced before consumer starts, so all records are polled at once _ <- parallelConsumer.resume _ <- fastMessagesLatch.await diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala index 5163f24c..daf2a6c2 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala @@ -55,13 +55,15 @@ object Dispatcher { consumeInParallel: Boolean = false, maxParallelism: Int = 1, updateBatch: Chunk[Record] => URIO[GreyhoundMetrics, Unit] = _ => ZIO.unit, - currentGaps: Set[TopicPartition] => ZIO[GreyhoundMetrics, Nothing, Map[TopicPartition, Option[OffsetAndGaps]]] = _ => - ZIO.succeed(Map.empty) + currentGaps: Set[TopicPartition] => ZIO[GreyhoundMetrics, Nothing, Map[TopicPartition, OffsetAndGaps]] = _ => ZIO.succeed(Map.empty), + init: Promise[Nothing, Unit] )(implicit trace: Trace): UIO[Dispatcher[R]] = for { - p <- Promise.make[Nothing, Unit] - state <- Ref.make[DispatcherState](if (startPaused) DispatcherState.Paused(p) else DispatcherState.Running) - workers <- Ref.make(Map.empty[TopicPartition, Worker]) + p <- Promise.make[Nothing, Unit] + state <- Ref.make[DispatcherState](if (startPaused) DispatcherState.Paused(p) else DispatcherState.Running) + initState <- + Ref.make[DispatcherInitState](if (consumeInParallel) DispatcherInitState.NotInitialized else DispatcherInitState.Initialized) + workers <- Ref.make(Map.empty[TopicPartition, Worker]) } yield new Dispatcher[R] { override def submit(record: Record): URIO[R with Env, SubmitResult] = for { @@ 
-73,15 +75,20 @@ object Dispatcher { override def submitBatch(records: Records): URIO[R with Env, SubmitResult] = for { - _ <- report(SubmittingRecordBatch(group, clientId, records.size, records, consumerAttributes)) - allSamePartition = records.map(r => RecordTopicPartition(r)).distinct.size == 1 - submitResult <- if (allSamePartition) { - val partition = RecordTopicPartition(records.head) - for { - worker <- workerFor(partition, records.head.offset) - submitted <- worker.submitBatch(records) - } yield submitted - } else ZIO.succeed(SubmitBatchResult(success = false, Some(records.minBy(_.offset)))) + _ <- report(SubmittingRecordBatch(group, clientId, records.size, records, consumerAttributes)) + currentInitState <- initState.get + _ <- currentInitState match { + case DispatcherInitState.NotInitialized => init.await *> initState.set(DispatcherInitState.Initialized) + case _ => ZIO.unit + } + allSamePartition = records.map(r => RecordTopicPartition(r)).distinct.size == 1 + submitResult <- if (allSamePartition) { + val partition = RecordTopicPartition(records.head) + for { + worker <- workerFor(partition, records.head.offset) + submitted <- worker.submitBatch(records) + } yield submitted + } else ZIO.succeed(SubmitBatchResult(success = false, Some(records.minBy(_.offset)))) } yield if (allSamePartition && submitResult.success) Submitted @@ -212,6 +219,16 @@ object Dispatcher { } + sealed trait DispatcherInitState + + object DispatcherInitState { + + case object NotInitialized extends DispatcherInitState + + case object Initialized extends DispatcherInitState + + } + case class Task(record: Record, complete: UIO[Unit]) trait Worker { @@ -241,7 +258,7 @@ object Dispatcher { consumeInParallel: Boolean, maxParallelism: Int, updateBatch: Chunk[Record] => URIO[GreyhoundMetrics, Unit] = _ => ZIO.unit, - currentGaps: Set[TopicPartition] => ZIO[GreyhoundMetrics, Nothing, Map[TopicPartition, Option[OffsetAndGaps]]] + currentGaps: Set[TopicPartition] => ZIO[GreyhoundMetrics, Nothing, Map[TopicPartition, OffsetAndGaps]] )(implicit trace: Trace): URIO[R with Env, Worker] = for { queue <- Queue.dropping[Record](capacity) internalState <- TRef.make(WorkerInternalState.empty).commit @@ -366,7 +383,7 @@ object Dispatcher { consumerAttributes: Map[String, String], maxParallelism: Int, updateBatch: Chunk[Record] => URIO[GreyhoundMetrics, Unit], - currentGaps: Set[TopicPartition] => ZIO[GreyhoundMetrics, Nothing, Map[TopicPartition, Option[OffsetAndGaps]]] + currentGaps: Set[TopicPartition] => ZIO[GreyhoundMetrics, Nothing, Map[TopicPartition, OffsetAndGaps]] )(implicit trace: Trace): ZIO[R with GreyhoundMetrics, Any, Boolean] = internalState.update(s => s.cleared).commit *> state.get.flatMap { @@ -403,7 +420,7 @@ object Dispatcher { consumerAttributes: Map[ClientId, ClientId], maxParallelism: Int, updateBatch: Chunk[Record] => URIO[GreyhoundMetrics, Unit], - currentGaps: Set[TopicPartition] => ZIO[GreyhoundMetrics, Nothing, Map[TopicPartition, Option[OffsetAndGaps]]] + currentGaps: Set[TopicPartition] => ZIO[GreyhoundMetrics, Nothing, Map[TopicPartition, OffsetAndGaps]] ): ZIO[R with GreyhoundMetrics, Throwable, Boolean] = for { _ <- report(TookAllRecordsFromQueue(records.size, records, group, clientId, consumerAttributes)) @@ -432,15 +449,11 @@ object Dispatcher { } yield res } - private def shouldRecordBeHandled(record: Record, maybeGaps: Map[TopicPartition, Option[OffsetAndGaps]]): Boolean = { - maybeGaps.get(TopicPartition(record.topic, record.partition)) match { - case Some(maybeOffsetAndGapsForPartition) 
=> - maybeOffsetAndGapsForPartition match { - case Some(offsetAndGapsForPartition) if offsetAndGapsForPartition.gaps.nonEmpty => - record.offset > offsetAndGapsForPartition.offset || offsetAndGapsForPartition.gaps.exists(_.contains(record.offset)) - case _ => true - } - case None => true + private def shouldRecordBeHandled(record: Record, gaps: Map[TopicPartition, OffsetAndGaps]): Boolean = { + gaps.get(TopicPartition(record.topic, record.partition)) match { + case Some(offsetAndGapsForPartition) if offsetAndGapsForPartition.gaps.nonEmpty => + record.offset > offsetAndGapsForPartition.offset || offsetAndGapsForPartition.gaps.exists(_.contains(record.offset)) + case _ => true } } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala index 62f9c638..f91efa61 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala @@ -43,9 +43,10 @@ object EventLoop { offsetsAndGaps <- OffsetsAndGaps.make handle = if (config.consumePartitionInParallel) { cr: Record => handler.handle(cr) } else handler.andThen(offsets.update).handle(_) - updateBatch = { records: Chunk[Record] => report(HandledBatch(records)) *> offsetsAndGaps.update(records) } - currentGaps = { partitions: Set[TopicPartition] => currentGapsForPartitions(partitions, clientId)(consumer) } + updateBatch = { records: Chunk[Record] => report(HandledBatch(records)) *> updateGapsByBatch(records, offsetsAndGaps) } + currentGaps = { partitions: Set[TopicPartition] => offsetsAndGaps.offsetsAndGapsForPartitions(partitions) } _ <- report(CreatingDispatcher(clientId, group, consumerAttributes, config.startPaused)) + offsetsAndGapsInit <- Promise.make[Nothing, Unit] dispatcher <- Dispatcher.make( group, clientId, @@ -60,11 +61,12 @@ object EventLoop { config.consumePartitionInParallel, config.maxParallelism, updateBatch, - currentGaps + currentGaps, + offsetsAndGapsInit ) positionsRef <- Ref.make(Map.empty[TopicPartition, Offset]) pausedPartitionsRef <- Ref.make(Set.empty[TopicPartition]) - partitionsAssigned <- Promise.make[Nothing, Unit] + partitionsAssigned <- Promise.make[Nothing, Set[TopicPartition]] // TODO how to handle errors in subscribe? 
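Editor's note on the handshake wired here: `partitionsAssigned` now completes with the assigned partition set so that, for the parallel consumer, the event loop can seed `OffsetsAndGaps` from the committed offsets and metadata before the dispatcher handles its first batch, while the dispatcher blocks on the `init` promise until that seeding is done. A standalone sketch of the same promise handshake in plain ZIO; the object name and printed messages are illustrative only:

    import zio._

    object InitHandshakeSketch extends ZIOAppDefault {
      def run =
        for {
          init       <- Promise.make[Nothing, Unit]
          // stands in for the dispatcher: waits for init before handling its first batch
          dispatcher <- (init.await *> Console.printLine("handling first batch")).fork
          // stands in for the event loop: seed offsets and gaps, then release the dispatcher
          _          <- Console.printLine("seeding OffsetsAndGaps from committed metadata")
          _          <- init.succeed(())
          _          <- dispatcher.join
        } yield ()
    }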
rebalanceListener = listener( pausedPartitionsRef, @@ -86,7 +88,20 @@ object EventLoop { .repeatWhile(_ == true) .forkDaemon _ <- report(AwaitingPartitionsAssignment(clientId, group, consumerAttributes)) - _ <- partitionsAssigned.await + partitions <- partitionsAssigned.await + _ <- if (config.consumePartitionInParallel) { + report(AwaitingOffsetsAndGapsInit(clientId, group, consumerAttributes)) *> + initializeOffsetsAndGaps( // we must preform init in the main thread ant not in the rebalance listener as it involves calling SDK + offsetsAndGaps, + partitions, + consumer, + clientId, + group, + consumerAttributes, + offsetsAndGapsInit + ) *> offsetsAndGapsInit.await + + } else offsetsAndGapsInit.succeed() env <- ZIO.environment[Env] } yield (dispatcher, fiber, offsets, positionsRef, running, rebalanceListener.provideEnvironment(env)) @@ -181,7 +196,7 @@ object EventLoop { pausedPartitionsRef: Ref[Set[TopicPartition]], config: EventLoopConfig, dispatcher: Dispatcher[_], - partitionsAssigned: Promise[Nothing, Unit], + partitionsAssigned: Promise[Nothing, Set[TopicPartition]], group: Group, consumer0: Consumer, clientId: ClientId, @@ -207,7 +222,7 @@ object EventLoop { } override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])(implicit trace: Trace): UIO[Any] = - partitionsAssigned.succeed(()) + partitionsAssigned.succeed(partitions) } } @@ -245,6 +260,25 @@ object EventLoop { _ <- pausedRef.update(_ => pausedTopics) } yield records + private def initializeOffsetsAndGaps( + offsetsAndGaps: OffsetsAndGaps, + partitions: Set[TopicPartition], + consumer: Consumer, + clientId: ClientId, + group: Group, + attributes: Map[String, String], + offsetsAndGapsInit: Promise[Nothing, Unit] + ) = for { + committedOffsetsAndMetadata <- consumer.committedOffsetsAndMetadata(partitions) + initialOffsetsAndGaps = + committedOffsetsAndMetadata.mapValues(om => + OffsetsAndGaps.parseGapsString(om.metadata).fold(OffsetAndGaps(om.offset - 1, committable = false))(identity) + ) + _ <- offsetsAndGaps.init(initialOffsetsAndGaps) + _ <- report(InitializedOffsetsAndGaps(clientId, group, initialOffsetsAndGaps, attributes)) + _ <- offsetsAndGapsInit.succeed(()) + } yield () + private def submitRecordsSequentially[R2, R1]( consumer: Consumer, dispatcher: Dispatcher[R2], @@ -340,6 +374,9 @@ object EventLoop { } } + private def updateGapsByBatch(records: Chunk[Record], offsetsAndGaps: OffsetsAndGaps) = + offsetsAndGaps.update(records) + private def currentGapsForPartitions(partitions: Set[TopicPartition], clientId: ClientId)( consumer: Consumer ): ZIO[GreyhoundMetrics, Nothing, Map[TopicPartition, Option[OffsetAndGaps]]] = @@ -429,6 +466,15 @@ object EventLoopMetric { case class AwaitingPartitionsAssignment(clientId: ClientId, group: Group, attributes: Map[String, String]) extends EventLoopMetric + case class AwaitingOffsetsAndGapsInit(clientId: ClientId, group: Group, attributes: Map[String, String]) extends EventLoopMetric + + case class InitializedOffsetsAndGaps( + clientId: ClientId, + group: Group, + initial: Map[TopicPartition, OffsetAndGaps], + attributes: Map[String, String] + ) extends EventLoopMetric + case class CommittedOffsetsAndMetadata(offsetsAndMetadata: Map[TopicPartition, OffsetAndMetadata]) extends EventLoopMetric case class FailedToCommitOffsetsAndMetadata(t: Throwable, offsetsAndMetadata: Map[TopicPartition, OffsetAndMetadata]) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala 
b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala index b2e68bca..5dc24f0f 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala @@ -7,10 +7,14 @@ import com.wixpress.dst.greyhound.core.{Offset, OffsetAndMetadata, TopicPartitio import zio._ trait OffsetsAndGaps { + def init(committedOffsets: Map[TopicPartition, OffsetAndGaps]): UIO[Unit] + def getCommittableAndClear: UIO[Map[TopicPartition, OffsetAndGaps]] def gapsForPartition(partition: TopicPartition): UIO[Seq[Gap]] + def offsetsAndGapsForPartitions(partitions: Set[TopicPartition]): UIO[Map[TopicPartition, OffsetAndGaps]] + def update(partition: TopicPartition, batch: Seq[Offset], prevCommittedOffset: Option[Offset]): UIO[Unit] def update(record: ConsumerRecord[_, _]): UIO[Unit] = @@ -33,6 +37,9 @@ object OffsetsAndGaps { def make: UIO[OffsetsAndGaps] = Ref.make(Map.empty[TopicPartition, OffsetAndGaps]).map { ref => new OffsetsAndGaps { + override def init(committedOffsets: Map[TopicPartition, OffsetAndGaps]): UIO[Unit] = + ref.update(_ => committedOffsets) + override def getCommittableAndClear: UIO[Map[TopicPartition, OffsetAndGaps]] = ref.modify(offsetsAndGaps => { val committable = offsetsAndGaps.filter(_._2.committable) @@ -43,6 +50,9 @@ object OffsetsAndGaps { override def gapsForPartition(partition: TopicPartition): UIO[Seq[Gap]] = ref.get.map(_.get(partition).fold(Seq.empty[Gap])(_.gaps.sortBy(_.start))) + override def offsetsAndGapsForPartitions(partitions: Set[TopicPartition]): UIO[Map[TopicPartition, OffsetAndGaps]] = + ref.get.map(_.filterKeys(partitions.contains)) + override def update(partition: TopicPartition, batch: Seq[Offset], prevCommittedOffset: Option[Offset]): UIO[Unit] = ref.update { offsetsAndGaps => val sortedBatch = batch.sorted @@ -153,4 +163,6 @@ object OffsetAndGaps { val LAST_HANDLED_OFFSET_SEPARATOR = "#" def apply(offset: Offset): OffsetAndGaps = OffsetAndGaps(offset, Seq.empty[Gap]) + + def apply(offset: Offset, committable: Boolean): OffsetAndGaps = OffsetAndGaps(offset, Seq.empty[Gap], committable) } diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGapsTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGapsTest.scala index df765ce5..2a3837ca 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGapsTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGapsTest.scala @@ -1,6 +1,8 @@ package com.wixpress.dst.greyhound.core.consumer import com.wixpress.dst.greyhound.core.TopicPartition +import com.wixpress.dst.greyhound.core.consumer.Gap.GAP_SEPARATOR +import com.wixpress.dst.greyhound.core.consumer.OffsetAndGaps.LAST_HANDLED_OFFSET_SEPARATOR import com.wixpress.dst.greyhound.core.consumer.OffsetGapsTest._ import com.wixpress.dst.greyhound.core.testkit.BaseTestNoEnv @@ -54,6 +56,26 @@ class OffsetsAndGapsTestGapsTest extends BaseTestNoEnv { } yield current must havePairs(partition0 -> OffsetAndGaps(1L, Seq()), partition1 -> OffsetAndGaps(0L, Seq())) } + "init with given offsets and calculate subsequent gaps accordingly" in { + val partition0 = TopicPartition(topic, 0) + val partition1 = TopicPartition(topic, 1) + val initialCommittedOffsets = + Map(partition0 -> OffsetAndGaps(100L, committable = false), partition1 -> OffsetAndGaps(200L, committable = false)) + for { + offsetGaps <- OffsetsAndGaps.make + _ <- 
offsetGaps.init(initialCommittedOffsets) + _ <- offsetGaps.update(partition0, Seq(101L, 102L)) + _ <- offsetGaps.update(partition1, Seq(203L, 204L)) + current <- offsetGaps.getCommittableAndClear + } yield current must havePairs(partition0 -> OffsetAndGaps(102L, Seq()), partition1 -> OffsetAndGaps(204L, Seq(Gap(201L, 202L)))) + } + + "parse gaps from string" in { + val gaps = Seq(s"10${LAST_HANDLED_OFFSET_SEPARATOR}0${GAP_SEPARATOR}1", s"10${LAST_HANDLED_OFFSET_SEPARATOR}", "") + val expected = Seq(Some(OffsetAndGaps(10, Seq(Gap(0, 1)))), Some(OffsetAndGaps(10, Seq())), None) + gaps.map(OffsetsAndGaps.parseGapsString).must(beEqualTo(expected)) + } + } object OffsetGapsTest { diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala index b95ba918..b83ccc90 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala @@ -29,7 +29,9 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { run(for { promise <- Promise.make[Nothing, Record] ref <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) - dispatcher <- Dispatcher.make("group", "clientId", promise.succeed, lowWatermark, highWatermark, workersShutdownRef = ref) + init <- getInit + dispatcher <- + Dispatcher.make("group", "clientId", promise.succeed, lowWatermark, highWatermark, workersShutdownRef = ref, init = init) _ <- submit(dispatcher, record) handled <- promise.await } yield handled must equalTo(record)) @@ -43,7 +45,9 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { latch <- CountDownLatch.make(partitions) slowHandler = { _: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => Clock.sleep(1.second) *> latch.countDown } ref <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) - dispatcher <- Dispatcher.make("group", "clientId", slowHandler, lowWatermark, highWatermark, workersShutdownRef = ref) + init <- getInit + dispatcher <- + Dispatcher.make("group", "clientId", slowHandler, lowWatermark, highWatermark, workersShutdownRef = ref, init = init) _ <- ZIO.foreachDiscard(0 until partitions) { partition => submit(dispatcher, record.copy(partition = partition)) } _ <- TestClock.adjust(1.second) _ <- latch.await @@ -59,6 +63,7 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { latch <- CountDownLatch.make(numKeys) slowHandler = { _: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => Clock.sleep(1.second) *> latch.countDown } ref <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) + init <- getInit dispatcher <- Dispatcher.make( "group", "clientId", @@ -67,7 +72,8 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { highWatermark, workersShutdownRef = ref, consumeInParallel = true, - maxParallelism = 8 + maxParallelism = 8, + init = init ) // produce with unique keys to the same partition _ <- submitBatch(dispatcher, keys.map(key => record.copy(partition = 0, key = key))) @@ -80,7 +86,10 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { new ctx() { run(for { ref <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) - dispatcher <- Dispatcher.make[Any]("group", "clientId", _ => ZIO.never, lowWatermark, highWatermark, workersShutdownRef = ref) + init <- getInit + dispatcher <- + Dispatcher + .make[Any]("group", "clientId", _ => ZIO.never, 
lowWatermark, highWatermark, workersShutdownRef = ref, init = init) _ <- submit(dispatcher, record.copy(offset = 0L)) // Will be polled _ <- submit(dispatcher, record.copy(offset = 1L)) _ <- submit(dispatcher, record.copy(offset = 2L)) @@ -95,7 +104,10 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { new ctx(highWatermark = 5) { run(for { ref <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) - dispatcher <- Dispatcher.make[Any]("group", "clientId", _ => ZIO.never, lowWatermark, highWatermark, workersShutdownRef = ref) + init <- getInit + dispatcher <- + Dispatcher + .make[Any]("group", "clientId", _ => ZIO.never, lowWatermark, highWatermark, workersShutdownRef = ref, init = init) records = (0 until 7).map(i => record.copy(offset = i.toLong)) result <- submitBatch(dispatcher, records) } yield result must beEqualTo(SubmitResult.RejectedBatch(record.copy(offset = 5L)))) @@ -107,13 +119,15 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { for { queue <- Queue.bounded[Record](1) ref <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) + init <- getInit dispatcher <- Dispatcher.make[Any]( "group", "clientId", record => queue.offer(record).flatMap(result => ZIO.succeed(println(s"queue.offer result: ${result}"))), lowWatermark, highWatermark, - workersShutdownRef = ref + workersShutdownRef = ref, + init = init ) _ <- ZIO.foreachDiscard(0 to (highWatermark + 1)) { offset => submit( @@ -138,6 +152,7 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { for { queue <- Queue.bounded[Record](1) ref <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) + init <- getInit dispatcher <- Dispatcher.make[TestClock]( "group", "clientId", @@ -148,7 +163,8 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { lowWatermark, highWatermark, delayResumeOfPausedPartition = 6500, - workersShutdownRef = ref + workersShutdownRef = ref, + init = init ) _ <- ZIO.foreachDiscard(0 to (highWatermark + 1)) { offset => submit( @@ -202,9 +218,18 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { run(for { ref <- Ref.make(0) workersShutdownRef <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) + init <- getInit dispatcher <- Dispatcher - .make[Any]("group", "clientId", _ => ref.update(_ + 1), lowWatermark, highWatermark, workersShutdownRef = workersShutdownRef) + .make[Any]( + "group", + "clientId", + _ => ref.update(_ + 1), + lowWatermark, + highWatermark, + workersShutdownRef = workersShutdownRef, + init = init + ) _ <- pause(dispatcher) _ <- submit(dispatcher, record) // Will be queued invocations <- ref.get @@ -218,8 +243,10 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { workersShutdownRef <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) promise <- Promise.make[Nothing, Unit] handler = { _: Record => Clock.sleep(1.second) *> ref.update(_ + 1) *> promise.succeed(()) } - dispatcher <- Dispatcher - .make[Any]("group", "clientId", handler, lowWatermark, highWatermark, workersShutdownRef = workersShutdownRef) + init <- getInit + dispatcher <- + Dispatcher + .make[Any]("group", "clientId", handler, lowWatermark, highWatermark, workersShutdownRef = workersShutdownRef, init = init) _ <- submit(dispatcher, record) // Will be handled _ <- TestMetrics.reported.flatMap(waitUntilRecordHandled(3.seconds)) _ <- pause(dispatcher) @@ -237,7 +264,10 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { workersShutdownRef <- Ref.make[Map[TopicPartition, 
ShutdownPromise]](Map.empty) promise <- Promise.make[Nothing, Unit] handler = { _: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => ref.update(_ + 1) *> promise.succeed(()) } - dispatcher <- Dispatcher.make("group", "clientId", handler, lowWatermark, highWatermark, workersShutdownRef = workersShutdownRef) + init <- getInit + dispatcher <- + Dispatcher + .make("group", "clientId", handler, lowWatermark, highWatermark, workersShutdownRef = workersShutdownRef, init = init) _ <- pause(dispatcher) _ <- submit(dispatcher, record) _ <- resume(dispatcher) @@ -279,6 +309,11 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { val record = ConsumerRecord[Chunk[Byte], Chunk[Byte]](topic, partition, 0L, Headers.Empty, None, Chunk.empty, 0L, 0L, 0L) def getKeys(numKeys: Int) = (0 until numKeys).map(i => Some(Chunk.fromArray(s"key$i".getBytes))) + + def getInit() = for { + init <- Promise.make[Nothing, Unit] + _ <- init.succeed(()) + } yield init } } From dd1aee1faf805d4f8a89ca4ed1f96af21d531069 Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Sun, 11 Jun 2023 18:52:20 +0300 Subject: [PATCH 16/52] [greyhound] parallel consumer - add grouping for no-key records (#35071) GitOrigin-RevId: 9e6dc2bb61a9b56aa55640567702ecb66c67563a --- .../dst/greyhound/core/ConsumerIT.scala | 77 ++++++++++++++----- .../greyhound/core/consumer/Dispatcher.scala | 18 ++++- .../consumer/dispatcher/DispatcherTest.scala | 26 +++++++ 3 files changed, 97 insertions(+), 24 deletions(-) diff --git a/core/src/it/scala/com/wixpress/dst/greyhound/core/ConsumerIT.scala b/core/src/it/scala/com/wixpress/dst/greyhound/core/ConsumerIT.scala index 3f26b47e..d644c9aa 100644 --- a/core/src/it/scala/com/wixpress/dst/greyhound/core/ConsumerIT.scala +++ b/core/src/it/scala/com/wixpress/dst/greyhound/core/ConsumerIT.scala @@ -30,9 +30,9 @@ import zio.managed._ class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { sequential - override def env = ITEnv.ManagedEnv + override def env = ITEnv.ManagedEnv override def sharedEnv = ITEnv.testResources() - val resources = testResources() + val resources = testResources() s"subscribe to a pattern" in ZIO.scoped { @@ -51,8 +51,7 @@ class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { val record = ProducerRecord(topic1, Chunk.empty, key = Option(Chunk.empty)) producer.produce(record) *> eventuallyZ(probe.get)(_ == (topic1, 0) :: Nil) *> consumer.resubscribe(TopicPattern(compile("core-subscribe-pattern2.*"))) *> - producer.produce(record.copy(topic = topic2)) *> - eventuallyZ(probe.get)(_ == (topic1, 0) :: (topic2, 0) :: Nil) + producer.produce(record.copy(topic = topic2)) *> eventuallyZ(probe.get)(_ == (topic1, 0) :: (topic2, 0) :: Nil) } } yield ok } @@ -73,11 +72,9 @@ class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { _ <- makeConsumer(kafka, compile("core-subscribe-parallel-pattern1.*"), group, handler, 0, useParallelConsumer = true).flatMap { consumer => val record = ProducerRecord(topic1, Chunk.empty, key = Option(Chunk.empty)) - producer.produce(record) *> - eventuallyZ(probe.get, timeout = 20.seconds)(_ == (topic1, 0) :: Nil) *> + producer.produce(record) *> eventuallyZ(probe.get, timeout = 20.seconds)(_ == (topic1, 0) :: Nil) *> consumer.resubscribe(TopicPattern(compile("core-subscribe-parallel-pattern2.*"))) *> - producer.produce(record.copy(topic = topic2)) *> - eventuallyZ(probe.get)(_ == (topic1, 0) :: (topic2, 0) :: Nil) + producer.produce(record.copy(topic = topic2)) *> 
eventuallyZ(probe.get)(_ == (topic1, 0) :: (topic2, 0) :: Nil) } } yield ok } @@ -98,7 +95,8 @@ class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { .ignore cId <- clientId config = - configFor(kafka, group, topic, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8)).copy(clientId = cId) + configFor(kafka, group, topic, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8)) + .copy(clientId = cId) record = ProducerRecord(topic, "bar", Some("foo")) messages <- RecordConsumer.make(config, handler).flatMap { consumer => @@ -171,7 +169,8 @@ class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { .ignore cId <- clientId config = - configFor(kafka, group, topic, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8)).copy(clientId = cId) + configFor(kafka, group, topic, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8)) + .copy(clientId = cId) record = ProducerRecord.tombstone(topic, Some("foo")) message <- RecordConsumer.make(config, handler).flatMap { _ => producer.produce(record, StringSerde, StringSerde) *> queue.take.timeoutFail(TimedOutWaitingForMessages)(10.seconds) @@ -204,7 +203,15 @@ class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { test <- RecordConsumer - .make(configFor(kafka, group, topic, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8)), handler) + .make( + configFor( + kafka, + group, + topic, + mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8) + ), + handler + ) .flatMap { _ => val recordPartition0 = ProducerRecord(topic, Chunk.empty, partition = Some(0)) val recordPartition1 = ProducerRecord(topic, Chunk.empty, partition = Some(1)) @@ -243,7 +250,8 @@ class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { kafka, group, topic, - mutateEventLoop = _.copy(delayResumeOfPausedPartition = 3000, consumePartitionInParallel = useParallelConsumer, maxParallelism = 8) + mutateEventLoop = + _.copy(delayResumeOfPausedPartition = 3000, consumePartitionInParallel = useParallelConsumer, maxParallelism = 8) ), handler ) @@ -286,7 +294,12 @@ class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { test <- RecordConsumer .make( - configFor(kafka, group, topic, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8)) + configFor( + kafka, + group, + topic, + mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8) + ) .copy(offsetReset = OffsetReset.Earliest), handler ) @@ -324,7 +337,15 @@ class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { _ <- ZIO.scoped( RecordConsumer - .make(configFor(kafka, group, topic, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8)), handler) + .make( + configFor( + kafka, + group, + topic, + mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8) + ), + handler + ) .flatMap { _ => producer.produce(ProducerRecord(topic, Chunk.empty)) *> startedHandling.await } ) @@ -353,7 +374,12 @@ class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { message <- RecordConsumer .make( - configFor(kafka, group, topic, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8)) + configFor( + kafka, + group, + topic, + mutateEventLoop 
= _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8) + ) .copy(offsetReset = Earliest), handler ) @@ -387,7 +413,14 @@ class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { } createConsumerTask = (i: Int) => - makeConsumer(kafka, topic, group, handler, i, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8)) + makeConsumer( + kafka, + topic, + group, + handler, + i, + mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8) + ) test <- createConsumerTask(0).flatMap { _ => val record = ProducerRecord(topic, Chunk.empty, partition = Some(0)) for { @@ -399,11 +432,11 @@ class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { _ <- createConsumerTask(2).provideEnvironment(env.add(Scope.global)).forkScoped // rebalance // rebalance expected = (0 until partitions).map(p => (p, 0L until messagesPerPartition)).toMap _ <- eventuallyTimeoutFail(probe.get)(m => - m.mapValues(_.lastOption).values.toSet == Set(Option(messagesPerPartition - 1L)) && m.size == partitions + m.mapValues(_.max.toOption).values.toSet == Set(Option(messagesPerPartition - 1L)) && m.size == partitions )(120.seconds) finalResult <- probe.get _ <- ZIO.debug(finalResult.mapValues(_.size).mkString(",")) - } yield finalResult === expected + } yield finalResult.mapValues(_.sorted) === expected } } yield test } @@ -473,7 +506,8 @@ class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { handler = RecordHandler((_: ConsumerRecord[String, String]) => outerGate.toggle(true) *> innerGate.await()) .withDeserializers(StringSerde, StringSerde) .ignore - config = configFor(kafka, group, topic, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8)) + config = + configFor(kafka, group, topic, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8)) record = ProducerRecord(topic, "bar", Some("foo")) test <- RecordConsumer.make(config, handler).flatMap { consumer => @@ -517,8 +551,9 @@ class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { pollFailedMetrics <- TestMetrics.queue - config = configFor(kafka, group, topic, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8)) - .copy(clientId = cId, decryptor = decryptor) + config = + configFor(kafka, group, topic, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8)) + .copy(clientId = cId, decryptor = decryptor) aRecord = (i: Int) => ProducerRecord(topic, s"payload-$i", Some(s"key-$i")) _ <- RecordConsumer.make(config, handler).flatMap { consumer => val Seq(rec1, rec2) = (1 to 2) map aRecord diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala index daf2a6c2..45aeb3e3 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala @@ -431,9 +431,9 @@ object Dispatcher { e => report(FailToUpdateParallelCurrentExecutionStarted(records.size, group, clientId, consumerAttributes, e)), t => report(CurrentExecutionStartedEvent(partition, group, clientId, t.currentExecutionStarted)) ) - groupedRecords = records.groupBy(_.key).values // todo: add sub-grouping for records without key + groupedRecords = groupRecordsForParallelHandling(records, maxParallelism) 
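Editor's sketch of the grouping rule that `groupRecordsForParallelHandling` introduces above: records that share a key stay in a single group (preserving per-key ordering), while key-less records are split across whatever parallelism the keyed groups leave unused. The standalone version below works on plain `(key, offset)` pairs and mirrors the logic shown in this hunk; the numbers in the usage comment are illustrative:

    import zio.Chunk

    def groupForParallelHandling(
      records: Chunk[(Option[String], Long)],
      maxParallelism: Int
    ): Iterable[Chunk[(Option[String], Long)]] = {
      val byKey             = records.groupBy { case (key, _) => key }
      val withKeyGroups     = byKey.collect { case (Some(_), recs) => recs }
      val unusedParallelism = maxParallelism - withKeyGroups.size
      val noKeyGroups = byKey.get(None) match {
        case Some(recs) if unusedParallelism > 0 =>
          recs.grouped(math.max(recs.size / unusedParallelism, 1)).toIterable
        case Some(recs) => Iterable(recs)
        case None       => Iterable.empty
      }
      withKeyGroups ++ noKeyGroups
    }

    // e.g. two records with distinct keys plus four key-less records, with maxParallelism = 4,
    // yield one group per key and two groups of two key-less records (unusedParallelism = 2).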
latestCommitGaps <- currentGaps(records.map(r => TopicPartition(r.topic, r.partition)).toSet) - _ <- report(InvokingHandlersInParallel(Math.max(groupedRecords.size, maxParallelism))) *> + _ <- report(InvokingHandlersInParallel(partition, numHandlers = Math.min(groupedRecords.size, maxParallelism))) *> ZIO .foreachParDiscard(groupedRecords)(sameKeyRecords => ZIO.foreach(sameKeyRecords) { record => @@ -457,6 +457,18 @@ object Dispatcher { } } + private def groupRecordsForParallelHandling(records: Chunk[Record], maxParallelism: Int): Iterable[Chunk[Record]] = { + val recordsByKey = records.groupBy(_.key) + val withKeyGroups = recordsByKey.collect { case (Some(_), records) => records } + val unusedParallelism = maxParallelism - withKeyGroups.size + val noKeyGroups = recordsByKey.get(None) match { + case Some(records) => + if (unusedParallelism > 0) records.grouped(Math.max(records.size / unusedParallelism, 1)).toIterable else Iterable(records) + case None => Chunk.empty + } + withKeyGroups ++ noKeyGroups + } + private def reportWorkerRunningInInterval( every: zio.Duration, state: TRef[WorkerInternalState] @@ -592,7 +604,7 @@ object DispatcherMetric { currentExecutionStarted: Option[Long] ) extends DispatcherMetric - case class InvokingHandlersInParallel(numHandlers: Int) extends DispatcherMetric + case class InvokingHandlersInParallel(partition: TopicPartition, numHandlers: Int) extends DispatcherMetric case class SkippedPreviouslyHandledRecord(record: Record, group: Group, clientId: ClientId, attributes: Map[String, String]) extends DispatcherMetric diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala index b83ccc90..5a89a1b7 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala @@ -82,6 +82,32 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { } yield ok) // if execution is not parallel, the latch will not be released } + "parallelize single partition handling of no-key records when using parallel consumer" in + new ctx(highWatermark = 10) { + val numRecords = 8 + + run(for { + latch <- CountDownLatch.make(numRecords) + slowHandler = { _: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => Clock.sleep(1.second) *> latch.countDown } + ref <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) + init <- getInit + dispatcher <- Dispatcher.make( + "group", + "clientId", + slowHandler, + lowWatermark, + highWatermark, + workersShutdownRef = ref, + consumeInParallel = true, + maxParallelism = numRecords, + init = init + ) + _ <- submitBatch(dispatcher, (1 to numRecords).map(_ => record.copy(partition = 0, key = None))) + _ <- TestClock.adjust(1.second) + _ <- latch.await + } yield ok) // if execution is not parallel, the latch will not be released + } + "reject records when high watermark is reached" in new ctx() { run(for { From cab79d413453c74ade9cd64187370365498728cc Mon Sep 17 00:00:00 2001 From: Natan Silnitsky Date: Tue, 13 Jun 2023 18:22:32 +0300 Subject: [PATCH 17/52] make GreyhoundNG RetryConfig serializable (so can be sent over gRpc and persisted) (#35207) * make GreyhoundNG RetryConfig serializable (so can be sent over gRpc and persisted) #pr * fix lp build * full cross-repo * Initial cross-repo check * Updating cross-repo check * Updating cross-repo check * Updating 
cross-repo check * Updating cross-repo check * CR changes --------- Co-authored-by: wixapiregistry <58037308+wixapiregistry@users.noreply.github.com> GitOrigin-RevId: feaf45b3c77f9074a3445bffdd6f748d3df77056 --- .../dst/greyhound/core/retry/RetryIT.scala | 17 +- .../retry/BlockingRetryRecordHandler.scala | 25 +-- .../core/consumer/retry/RetryAttempt.scala | 43 +++-- .../core/consumer/retry/RetryConfig.scala | 158 ++++++++++++++++-- .../RetryConsumerRecordHandlerTest.scala | 12 +- .../consumer/retry/ZRetryConfigTest.scala | 8 +- .../scala/GreyhoundConsumersBuilder.scala | 10 +- .../greyhound/scala/RetryConfigBuilder.scala | 2 +- 8 files changed, 204 insertions(+), 71 deletions(-) diff --git a/core/src/it/scala/com/wixpress/dst/greyhound/core/retry/RetryIT.scala b/core/src/it/scala/com/wixpress/dst/greyhound/core/retry/RetryIT.scala index ed382807..834be9f0 100644 --- a/core/src/it/scala/com/wixpress/dst/greyhound/core/retry/RetryIT.scala +++ b/core/src/it/scala/com/wixpress/dst/greyhound/core/retry/RetryIT.scala @@ -6,6 +6,7 @@ import com.wixpress.dst.greyhound.core.consumer._ import com.wixpress.dst.greyhound.core.consumer.domain.ConsumerSubscription.{TopicPattern, Topics} import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, RecordHandler} import com.wixpress.dst.greyhound.core.consumer.retry.NonBlockingRetryHelper.fixedRetryTopic +import com.wixpress.dst.greyhound.core.consumer.retry.RetryConfigForTopic.{finiteBlockingRetryConfigForTopic, nonBlockingRetryConfigForTopic} import com.wixpress.dst.greyhound.core.consumer.retry._ import com.wixpress.dst.greyhound.core.producer.{Encryptor, ProducerRecord} import com.wixpress.dst.greyhound.core.testkit.{eventuallyZ, AwaitableRef, BaseTestWithSharedEnv} @@ -40,8 +41,8 @@ class RetryIT extends BaseTestWithSharedEnv[Env, TestResources] { done <- Promise.make[Nothing, ConsumerRecord[String, String]] retryConfig = ZRetryConfig .perTopicRetries { - case `topic` => RetryConfigForTopic(() => Nil, NonBlockingBackoffPolicy(1.second :: Nil)) - case `anotherTopic` => RetryConfigForTopic(() => Nil, NonBlockingBackoffPolicy(1.second :: Nil)) + case `topic` => nonBlockingRetryConfigForTopic(1.second :: Nil) + case `anotherTopic` => nonBlockingRetryConfigForTopic(1.second :: Nil) } .copy(produceEncryptor = _ => ZIO.succeed(dummyEncryptor)) @@ -76,7 +77,7 @@ class RetryIT extends BaseTestWithSharedEnv[Env, TestResources] { retryConfig = ZRetryConfig .finiteBlockingRetry(100.millis, 100.millis) - .withCustomRetriesFor { case `topic2` => RetryConfigForTopic(() => 300.millis :: Nil, NonBlockingBackoffPolicy.empty) } + .withCustomRetriesFor { case `topic2` => finiteBlockingRetryConfigForTopic(300.millis :: Nil) } retryHandler = failingBlockingRecordHandlerWith(consumedValuesRef, Set(topic, topic2)).withDeserializers(StringSerde, StringSerde) _ <- RecordConsumer.make(configFor(kafka, group, retryConfig, topic, topic2), retryHandler).flatMap { _ => producer.produce(ProducerRecord(topic, "bar", Some("foo")), StringSerde, StringSerde) *> Clock.sleep(2.seconds) *> @@ -225,7 +226,7 @@ class RetryIT extends BaseTestWithSharedEnv[Env, TestResources] { invocations <- Ref.make(0) done <- Promise.make[Nothing, ConsumerRecord[String, String]] retryConfig = ZRetryConfig.retryForPattern( - RetryConfigForTopic(() => Nil, NonBlockingBackoffPolicy(Seq(1.second, 1.second, 1.seconds))) + nonBlockingRetryConfigForTopic(List(1.second, 1.second, 1.seconds)) ) retryHandler = failingRecordHandler(invocations, done).withDeserializers(StringSerde, StringSerde) success <- 
RecordConsumer @@ -258,7 +259,9 @@ class RetryIT extends BaseTestWithSharedEnv[Env, TestResources] { group <- randomGroup originalTopicCallCount <- Ref.make[Int](0) retryTopicCallCount <- Ref.make[Int](0) - retryConfig = ZRetryConfig.blockingFollowedByNonBlockingRetry(List(1.second), NonBlockingBackoffPolicy(List(1.seconds))) + retryConfig = + ZRetryConfig + .blockingFollowedByNonBlockingRetry(FiniteBlockingBackoffPolicy(List(1.second)), NonBlockingBackoffPolicy(List(1.seconds))) retryHandler = failingBlockingNonBlockingRecordHandler(originalTopicCallCount, retryTopicCallCount, topic).withDeserializers( StringSerde, StringSerde @@ -281,9 +284,7 @@ class RetryIT extends BaseTestWithSharedEnv[Env, TestResources] { for { r <- getShared TestResources(kafka, _) = r - retryConfig = ZRetryConfig.retryForPattern( - RetryConfigForTopic(() => Nil, NonBlockingBackoffPolicy(Seq(1.second, 1.second, 1.seconds))) - ) + retryConfig = ZRetryConfig.retryForPattern(nonBlockingRetryConfigForTopic(List(1.second, 1.second, 1.seconds))) handler = RecordHandler { _: ConsumerRecord[String, String] => ZIO.unit }.withDeserializers(StringSerde, StringSerde) _ <- RecordConsumer .make(configFor(kafka, "group", retryConfig, "topic"), handler) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/BlockingRetryRecordHandler.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/BlockingRetryRecordHandler.scala index 2f5d59a9..5544e200 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/BlockingRetryRecordHandler.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/BlockingRetryRecordHandler.scala @@ -3,7 +3,7 @@ package com.wixpress.dst.greyhound.core.consumer.retry import java.util.concurrent.TimeUnit import com.wixpress.dst.greyhound.core.{Group, TopicPartition} import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, RecordHandler} -import com.wixpress.dst.greyhound.core.consumer.retry.BlockingState.{Blocked, IgnoringOnce, Blocking => InternalBlocking} +import com.wixpress.dst.greyhound.core.consumer.retry.BlockingState.{Blocked, Blocking => InternalBlocking, IgnoringOnce} import com.wixpress.dst.greyhound.core.consumer.retry.RetryRecordHandlerMetric.{BlockingRetryHandlerInvocationFailed, DoneBlockingBeforeRetry, NoRetryOnNonRetryableFailure} import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics.report @@ -30,7 +30,11 @@ private[retry] object BlockingRetryRecordHandler { override def handle(record: ConsumerRecord[K, V])(implicit trace: Trace): ZIO[GreyhoundMetrics with R, Nothing, LastHandleResult] = { val topicPartition = TopicPartition(record.topic, record.partition) - def pollBlockingStateWithSuspensions(record: ConsumerRecord[K, V], interval: Duration, start: Long): URIO[GreyhoundMetrics, PollResult] = { + def pollBlockingStateWithSuspensions( + record: ConsumerRecord[K, V], + interval: Duration, + start: Long + ): URIO[GreyhoundMetrics, PollResult] = { for { shouldBlock <- blockingStateResolver.resolve(record) shouldPollAgain <- @@ -73,7 +77,8 @@ private[retry] object BlockingRetryRecordHandler { case error => interval .map { interval => - report(BlockingRetryHandlerInvocationFailed(topicPartition, record.offset, error.toString)) *> blockOnErrorFor(record, interval) + report(BlockingRetryHandlerInvocationFailed(topicPartition, record.offset, error.toString)) *> + blockOnErrorFor(record, interval) } 
.getOrElse(ZIO.succeed(LastHandleResult(lastHandleSucceeded = false, shouldContinue = false))) } @@ -94,12 +99,13 @@ private[retry] object BlockingRetryRecordHandler { if (nonBlockingHandler.isHandlingRetryTopicMessage(group, record)) { ZIO.succeed(LastHandleResult(lastHandleSucceeded = false, shouldContinue = false)) } else { - val durationsIncludingForInvocationWithNoErrorHandling = retryConfig.blockingBackoffs(record.topic)().map(Some(_)) :+ None + val durations = retryConfig.blockingBackoffs(record.topic) + val durationsIncludingForInvocationWithNoErrorHandling = durations.map(Some(_)) :+ None for { result <- retryEvery(record, durationsIncludingForInvocationWithNoErrorHandling) { (rec, interval) => - handleAndMaybeBlockOnErrorFor(rec, interval) - } - _ <- maybeBackToStateBlocking + handleAndMaybeBlockOnErrorFor(rec, interval) + } + _ <- maybeBackToStateBlocking } yield result } } @@ -111,7 +117,7 @@ private[retry] object BlockingRetryRecordHandler { ZIO.succeed(as.iterator).flatMap { i => def loop(retryAttempt: Option[RetryAttempt]): ZIO[R, E, LastHandleResult] = if (i.hasNext) { - val nextDelay = i.next + val nextDelay = i.next val recordWithAttempt = retryAttempt.fold(record) { attempt => record.copy(headers = record.headers ++ RetryAttempt.toHeaders(attempt)) } @@ -127,8 +133,7 @@ private[retry] object BlockingRetryRecordHandler { } else ZIO.succeed(result) } - } - else ZIO.succeed(LastHandleResult(lastHandleSucceeded = false, shouldContinue = false)) + } else ZIO.succeed(LastHandleResult(lastHandleSucceeded = false, shouldContinue = false)) loop(None) } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala index 3fa3cba8..9de958e8 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala @@ -11,7 +11,8 @@ import java.time.Instant /** * Description of a retry attempt - * @param attempt contains which attempt is it, starting from 0 including blocking and non-blocking attempts + * @param attempt + * contains which attempt is it, starting from 0 including blocking and non-blocking attempts */ case class RetryAttempt( originalTopic: Topic, @@ -33,10 +34,10 @@ object RetryAttempt { private def toChunk(str: String): Chunk[Byte] = Chunk.fromArray(str.getBytes) def toHeaders(attempt: RetryAttempt): Headers = Headers( - RetryHeader.Submitted -> toChunk(attempt.submittedAt.toEpochMilli.toString), - RetryHeader.Backoff -> toChunk(attempt.backoff.toMillis.toString), + RetryHeader.Submitted -> toChunk(attempt.submittedAt.toEpochMilli.toString), + RetryHeader.Backoff -> toChunk(attempt.backoff.toMillis.toString), RetryHeader.OriginalTopic -> toChunk(attempt.originalTopic), - RetryHeader.RetryAttempt -> toChunk(attempt.attempt.toString), + RetryHeader.RetryAttempt -> toChunk(attempt.attempt.toString) ) private case class RetryAttemptHeaders( @@ -49,14 +50,14 @@ object RetryAttempt { private def fromHeaders(headers: Headers): Task[RetryAttemptHeaders] = for { submitted <- headers.get(RetryHeader.Submitted, instantDeserializer) - backoff <- headers.get(RetryHeader.Backoff, durationDeserializer) - topic <- headers.get[String](RetryHeader.OriginalTopic, StringSerde) - attempt <- headers.get(RetryHeader.RetryAttempt, longDeserializer) + backoff <- headers.get(RetryHeader.Backoff, durationDeserializer) + topic <- 
headers.get[String](RetryHeader.OriginalTopic, StringSerde) + attempt <- headers.get(RetryHeader.RetryAttempt, longDeserializer) } yield RetryAttemptHeaders(topic, attempt.map(_.toInt), submitted, backoff) /** @return None on infinite blocking retries */ def maxBlockingAttempts(topic: Topic, retryConfig: Option[RetryConfig]): Option[Int] = - retryConfig.map(_.blockingBackoffs(topic)()).fold(Option(0)) { + retryConfig.map(_.blockingBackoffs(topic)).fold(Option(0)) { case finite if finite.hasDefiniteSize => Some(finite.size) case _ => None } @@ -68,31 +69,29 @@ object RetryAttempt { } def extract( - headers: Headers, - topic: Topic, - group: Group, - subscription: ConsumerSubscription, - retryConfig: Option[RetryConfig], + headers: Headers, + topic: Topic, + group: Group, + subscription: ConsumerSubscription, + retryConfig: Option[RetryConfig] )(implicit trace: Trace): UIO[Option[RetryAttempt]] = { def maybeNonBlockingAttempt(hs: RetryAttemptHeaders): Option[RetryAttempt] = for { - submitted <- hs.submittedAt - backoff <- hs.backoff + submitted <- hs.submittedAt + backoff <- hs.backoff TopicAttempt(originalTopic, attempt) <- attemptNumberFromTopic(subscription, topic, hs.originalTopic, group) - blockingRetries = maxBlockingAttempts(originalTopic, retryConfig).getOrElse(0) + blockingRetries = maxBlockingAttempts(originalTopic, retryConfig).getOrElse(0) } yield RetryAttempt(originalTopic, blockingRetries + attempt, submitted, backoff) def maybeBlockingAttempt(hs: RetryAttemptHeaders): Option[RetryAttempt] = for { - submitted <- hs.submittedAt - backoff <- hs.backoff + submitted <- hs.submittedAt + backoff <- hs.backoff originalTopic <- hs.originalTopic if originalTopic == topic - attempt <- hs.attempt + attempt <- hs.attempt } yield RetryAttempt(originalTopic, attempt, submitted, backoff) - fromHeaders(headers).map { hs => - maybeNonBlockingAttempt(hs) orElse maybeBlockingAttempt(hs) - } + fromHeaders(headers).map { hs => maybeNonBlockingAttempt(hs) orElse maybeBlockingAttempt(hs) } }.catchAll(_ => ZIO.none) } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConfig.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConfig.scala index 5632f042..96dacd2c 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConfig.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConfig.scala @@ -15,8 +15,24 @@ case class RetryConfig( produceRetryBackoff: Duration = 5.seconds, produceEncryptor: ConsumerRecord[_, _] => UIO[Encryptor] = _ => ZIO.succeed(NoOpEncryptor)(zio.Trace.empty) ) { - def blockingBackoffs(topic: Topic) = - get(topic)(_.blockingBackoffs)(ifEmpty = () => Nil) + def blockingBackoffs(topic: Topic): Seq[ZDuration] = + get(topic) { + case RetryConfigForTopic(FiniteBlockingBackoffPolicy(intervals), _, _) if intervals.nonEmpty => intervals + case RetryConfigForTopic(_, InfiniteFixedBackoff(fixed), _) => Stream.continually(fixed) + case RetryConfigForTopic(_, expMax: InfiniteExponentialBackoffsMaxInterval, _) => + exponentialBackoffs(expMax.initialInterval, expMax.maximalInterval, expMax.backOffMultiplier, infiniteRetryMaxInteval = true) + case RetryConfigForTopic(_, expMult: InfiniteExponentialBackoffsMaxMultiplication, _) => + exponentialBackoffs( + expMult.initialInterval, + expMult.maxMultiplications, + expMult.backOffMultiplier, + infiniteRetryMaxInterval = true + ) + case _ => Nil + }(ifEmpty = Nil) + + def finiteBlockingBackoffs(topic: Topic) = + 
get(topic)(_.finiteBlockingBackoffs)(ifEmpty = FiniteBlockingBackoffPolicy.empty) def retryType(originalTopic: Topic): RetryType = get(originalTopic)(_.retryType)(ifEmpty = NoRetries) @@ -43,11 +59,46 @@ case class RetryConfig( else ifEmpty ) +} + +trait InfiniteBlockingBackoffPolicy { + def nonEmpty: Boolean +} + +case object EmptyInfiniteBlockingBackoffPolicy extends InfiniteBlockingBackoffPolicy { + override def nonEmpty: Boolean = false +} + +sealed case class InfiniteFixedBackoff(interval: ZDuration) extends InfiniteBlockingBackoffPolicy { + override def nonEmpty: Boolean = true +} + +sealed case class InfiniteExponentialBackoffsMaxInterval( + initialInterval: ZDuration, + maximalInterval: ZDuration, + backOffMultiplier: Float +) extends InfiniteBlockingBackoffPolicy { + override def nonEmpty: Boolean = true +} + +sealed case class InfiniteExponentialBackoffsMaxMultiplication( + initialInterval: ZDuration, + maxMultiplications: Int, + backOffMultiplier: Float +) extends InfiniteBlockingBackoffPolicy { + override def nonEmpty: Boolean = true +} +case class FiniteBlockingBackoffPolicy(intervals: List[ZDuration]) { + def nonEmpty = intervals.nonEmpty +} + +object FiniteBlockingBackoffPolicy { + val empty = FiniteBlockingBackoffPolicy(Nil) } case class NonBlockingBackoffPolicy( - intervals: Seq[ZDuration], + intervals: List[ZDuration], recordMutate: ProducerRecord[Chunk[Byte], Chunk[Byte]] => ProducerRecord[Chunk[Byte], Chunk[Byte]] = identity ) { def nonEmpty = intervals.nonEmpty @@ -59,22 +110,48 @@ object NonBlockingBackoffPolicy { val empty = NonBlockingBackoffPolicy(Nil) } -case class RetryConfigForTopic(blockingBackoffs: () => Seq[ZDuration], nonBlockingBackoffs: NonBlockingBackoffPolicy) { - def nonEmpty: Boolean = blockingBackoffs().nonEmpty || nonBlockingBackoffs.nonEmpty +case class RetryConfigForTopic( + finiteBlockingBackoffs: FiniteBlockingBackoffPolicy, + infiniteBlockingBackoffs: InfiniteBlockingBackoffPolicy, + nonBlockingBackoffs: NonBlockingBackoffPolicy +) { + def nonEmpty: Boolean = finiteBlockingBackoffs.nonEmpty || infiniteBlockingBackoffs.nonEmpty || nonBlockingBackoffs.nonEmpty def retryType: RetryType = - if (blockingBackoffs.apply().nonEmpty) { + if (finiteBlockingBackoffs.nonEmpty) { if (nonBlockingBackoffs.nonEmpty) BlockingFollowedByNonBlocking else Blocking - } else { + } else if (infiniteBlockingBackoffs.nonEmpty) + Blocking + else { NonBlocking } } object RetryConfigForTopic { - val empty = RetryConfigForTopic(() => Nil, NonBlockingBackoffPolicy.empty) + val empty = RetryConfigForTopic(FiniteBlockingBackoffPolicy.empty, EmptyInfiniteBlockingBackoffPolicy, NonBlockingBackoffPolicy.empty) + + def nonBlockingRetryConfigForTopic(intervals: List[ZDuration]) = + RetryConfigForTopic(FiniteBlockingBackoffPolicy.empty, EmptyInfiniteBlockingBackoffPolicy, NonBlockingBackoffPolicy(intervals)) + + def finiteBlockingRetryConfigForTopic(intervals: List[ZDuration]) = + RetryConfigForTopic(FiniteBlockingBackoffPolicy(intervals), EmptyInfiniteBlockingBackoffPolicy, NonBlockingBackoffPolicy.empty) + + def infiniteBlockingRetryConfigForTopic(interval: ZDuration) = + RetryConfigForTopic(FiniteBlockingBackoffPolicy.empty, InfiniteFixedBackoff(interval), NonBlockingBackoffPolicy.empty) + + def infiniteBlockingRetryConfigForTopic( + initialInterval: ZDuration, + maxInterval: ZDuration, + backOffMultiplier: Float + ) = + RetryConfigForTopic( + FiniteBlockingBackoffPolicy.empty, + InfiniteExponentialBackoffsMaxInterval(initialInterval, maxInterval, backOffMultiplier), + 
NonBlockingBackoffPolicy.empty + ) } object ZRetryConfig { @@ -82,18 +159,27 @@ object ZRetryConfig { forAllTopics( RetryConfigForTopic( nonBlockingBackoffs = NonBlockingBackoffPolicy(firstRetry :: otherRetries.toList), - blockingBackoffs = () => List.empty + finiteBlockingBackoffs = FiniteBlockingBackoffPolicy.empty, + infiniteBlockingBackoffs = EmptyInfiniteBlockingBackoffPolicy ) ) def finiteBlockingRetry(firstRetry: ZDuration, otherRetries: ZDuration*): RetryConfig = forAllTopics( - RetryConfigForTopic(blockingBackoffs = () => firstRetry :: otherRetries.toList, nonBlockingBackoffs = NonBlockingBackoffPolicy.empty) + RetryConfigForTopic( + finiteBlockingBackoffs = FiniteBlockingBackoffPolicy(firstRetry :: otherRetries.toList), + infiniteBlockingBackoffs = EmptyInfiniteBlockingBackoffPolicy, + nonBlockingBackoffs = NonBlockingBackoffPolicy.empty + ) ) def infiniteBlockingRetry(interval: ZDuration): RetryConfig = forAllTopics( - RetryConfigForTopic(blockingBackoffs = () => Stream.continually(interval), nonBlockingBackoffs = NonBlockingBackoffPolicy.empty) + RetryConfigForTopic( + infiniteBlockingBackoffs = InfiniteFixedBackoff(interval), + finiteBlockingBackoffs = FiniteBlockingBackoffPolicy.empty, + nonBlockingBackoffs = NonBlockingBackoffPolicy.empty + ) ) def exponentialBackoffBlockingRetry( @@ -101,32 +187,68 @@ object ZRetryConfig { maximalInterval: ZDuration, backOffMultiplier: Float, infiniteRetryMaxInterval: Boolean - ): RetryConfig = + ): RetryConfig = { + val (finite, infinite) = if (infiniteRetryMaxInterval) { + ( + FiniteBlockingBackoffPolicy.empty, + InfiniteExponentialBackoffsMaxInterval(initialInterval, maximalInterval, backOffMultiplier) + ) + } else { + ( + FiniteBlockingBackoffPolicy( + exponentialBackoffs(initialInterval, maximalInterval, backOffMultiplier, infiniteRetryMaxInterval).toList + ), + EmptyInfiniteBlockingBackoffPolicy + ) + } forAllTopics( RetryConfigForTopic( - blockingBackoffs = () => exponentialBackoffs(initialInterval, maximalInterval, backOffMultiplier, infiniteRetryMaxInterval), + finiteBlockingBackoffs = finite, + infiniteBlockingBackoffs = infinite, nonBlockingBackoffs = NonBlockingBackoffPolicy.empty ) ) + } def exponentialBackoffBlockingRetry( initialInterval: ZDuration, maxMultiplications: Int, backOffMultiplier: Float, infiniteRetryMaxInterval: Boolean - ): RetryConfig = + ): RetryConfig = { + val (finite, infinite) = if (infiniteRetryMaxInterval) { + ( + FiniteBlockingBackoffPolicy.empty, + InfiniteExponentialBackoffsMaxMultiplication(initialInterval, maxMultiplications, backOffMultiplier) + ) + } else { + ( + FiniteBlockingBackoffPolicy( + exponentialBackoffs(initialInterval, maxMultiplications, backOffMultiplier, infiniteRetryMaxInterval).toList + ), + EmptyInfiniteBlockingBackoffPolicy + ) + } forAllTopics( RetryConfigForTopic( - blockingBackoffs = () => exponentialBackoffs(initialInterval, maxMultiplications, backOffMultiplier, infiniteRetryMaxInterval), + finiteBlockingBackoffs = finite, + infiniteBlockingBackoffs = infinite, nonBlockingBackoffs = NonBlockingBackoffPolicy.empty ) ) + } def blockingFollowedByNonBlockingRetry( - blockingBackoffs: NonEmptyList[ZDuration], + blockingBackoffs: FiniteBlockingBackoffPolicy, nonBlockingBackoffs: NonBlockingBackoffPolicy ): RetryConfig = - forAllTopics(RetryConfigForTopic(blockingBackoffs = () => blockingBackoffs, nonBlockingBackoffs = nonBlockingBackoffs)) + forAllTopics( + RetryConfigForTopic( + finiteBlockingBackoffs = blockingBackoffs, + nonBlockingBackoffs = nonBlockingBackoffs, + 
infiniteBlockingBackoffs = EmptyInfiniteBlockingBackoffPolicy + ) + ) def perTopicRetries(configs: PartialFunction[Topic, RetryConfigForTopic]) = RetryConfig(configs, None) @@ -178,7 +300,7 @@ object RetryConfig { def blockingFollowedByNonBlockingRetry(blockingBackoffs: NonEmptyList[Duration], nonBlockingBackoffs: List[Duration]): RetryConfig = ZRetryConfig.blockingFollowedByNonBlockingRetry( - blockingBackoffs = blockingBackoffs.map(ZDuration.fromScala), + blockingBackoffs = FiniteBlockingBackoffPolicy(blockingBackoffs.map(ZDuration.fromScala)), nonBlockingBackoffs = NonBlockingBackoffPolicy(nonBlockingBackoffs.map(ZDuration.fromScala)) ) } diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConsumerRecordHandlerTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConsumerRecordHandlerTest.scala index 0797ba25..ef9fccb1 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConsumerRecordHandlerTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConsumerRecordHandlerTest.scala @@ -6,6 +6,7 @@ import com.wixpress.dst.greyhound.core._ import com.wixpress.dst.greyhound.core.consumer.domain.ConsumerSubscription.Topics import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, RecordHandler} import com.wixpress.dst.greyhound.core.consumer.retry.BlockingState.{Blocked, Blocking => InternalBlocking, IgnoringAll, IgnoringOnce} +import com.wixpress.dst.greyhound.core.consumer.retry.RetryConfigForTopic.nonBlockingRetryConfigForTopic import com.wixpress.dst.greyhound.core.consumer.retry.RetryConsumerRecordHandlerTest.{offset, partition, _} import com.wixpress.dst.greyhound.core.consumer.retry.RetryRecordHandlerMetric.{BlockingIgnoredForAllFor, BlockingIgnoredOnceFor, BlockingRetryHandlerInvocationFailed, NoRetryOnNonRetryableFailure} import com.wixpress.dst.greyhound.core.producer.{ProducerError, ProducerRecord} @@ -311,7 +312,10 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics group, failingHandlerWith(handleCountRef), ZRetryConfig - .blockingFollowedByNonBlockingRetry(List(10.millis, 500.millis), NonBlockingBackoffPolicy(List(1.second))), + .blockingFollowedByNonBlockingRetry( + FiniteBlockingBackoffPolicy(List(10.millis, 500.millis)), + NonBlockingBackoffPolicy(List(1.second)) + ), producer, Topics(Set(topic)), blockingState, @@ -336,9 +340,7 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics topic <- randomTopicName otherTopic <- randomTopicName blockingState <- Ref.make[Map[BlockingTarget, BlockingState]](Map.empty) - policy = ZRetryConfig.perTopicRetries { - case `otherTopic` => RetryConfigForTopic(() => Nil, NonBlockingBackoffPolicy(1.second :: Nil)) - } + policy = ZRetryConfig.perTopicRetries { case `otherTopic` => nonBlockingRetryConfigForTopic(1.second :: Nil) } retryHandler = RetryRecordHandler.withRetries( group, failingHandler, @@ -410,7 +412,7 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics retryHelper, awaitShutdown = _ => ZIO.succeed(awaitShutdown) ) - val headers = RetryAttempt.toHeaders(RetryAttempt(topic, 0, now, 3.seconds)) + val headers = RetryAttempt.toHeaders(RetryAttempt(topic, 0, now, 3.seconds)) for { key <- bytes value <- bytes diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/ZRetryConfigTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/ZRetryConfigTest.scala index 
ac6dca03..6535207d 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/ZRetryConfigTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/ZRetryConfigTest.scala @@ -29,12 +29,12 @@ class ZRetryConfigTest extends SpecificationWithJUnit { val absMult = abs(params.mult) val safeInit = if (init < 10) 10 else init - for (i <- 0 to max) yield { backoffs()(i) } mustEqual (pow(1 + absMult, i) * safeInit).toLong.millis + for (i <- 0 to max) yield { backoffs(i) } mustEqual (pow(1 + absMult, i) * safeInit).toLong.millis val maxMult = math.max(0, max) val lastDurationToCheck = abs(max + 1) * 2 val firstDurationToCheck = math.max(0, max + 1) for (i <- firstDurationToCheck to lastDurationToCheck) - yield backoffs()(i) mustEqual (pow(1 + absMult, maxMult) * safeInit).toLong.millis + yield backoffs(i) mustEqual (pow(1 + absMult, maxMult) * safeInit).toLong.millis } } @@ -57,11 +57,11 @@ class ZRetryConfigTest extends SpecificationWithJUnit { val absMult = abs(params.mult) val safeInit = if (init < 10) 10 else init - for (i <- 0 to maxMult) yield { backoffs()(i) } mustEqual (pow(1 + absMult, i) * safeInit).toLong.millis + for (i <- 0 to maxMult) yield { backoffs(i) } mustEqual (pow(1 + absMult, i) * safeInit).toLong.millis val lastDurationToCheck = abs(maxMult + 1) * 2 val firstDurationToCheck = math.max(0, maxMult + 1) for (i <- firstDurationToCheck to lastDurationToCheck) - yield backoffs()(i) mustEqual (pow(1 + absMult, maxMult) * safeInit).toLong.millis + yield backoffs(i) mustEqual (pow(1 + absMult, maxMult) * safeInit).toLong.millis } } } diff --git a/java-interop/src/main/java/com/wixpress/dst/greyhound/scala/GreyhoundConsumersBuilder.scala b/java-interop/src/main/java/com/wixpress/dst/greyhound/scala/GreyhoundConsumersBuilder.scala index f3924695..484acc95 100644 --- a/java-interop/src/main/java/com/wixpress/dst/greyhound/scala/GreyhoundConsumersBuilder.scala +++ b/java-interop/src/main/java/com/wixpress/dst/greyhound/scala/GreyhoundConsumersBuilder.scala @@ -5,9 +5,9 @@ import com.wixpress.dst.greyhound.core import com.wixpress.dst.greyhound.core.consumer.batched.{BatchConsumer, BatchConsumerConfig, BatchEventLoopConfig} import com.wixpress.dst.greyhound.core.consumer.domain.ConsumerSubscription.Topics import com.wixpress.dst.greyhound.core.consumer.domain.{RecordHandler => CoreRecordHandler} -import com.wixpress.dst.greyhound.core.consumer.retry.{NonBlockingBackoffPolicy, RetryConfig => CoreRetryConfig, RetryConfigForTopic} +import com.wixpress.dst.greyhound.core.consumer.retry.{EmptyInfiniteBlockingBackoffPolicy, FiniteBlockingBackoffPolicy, InfiniteBlockingBackoffPolicy, NonBlockingBackoffPolicy, RetryConfigForTopic, RetryConfig => CoreRetryConfig} import com.wixpress.dst.greyhound.core.consumer.{RecordConsumer, RecordConsumerConfig} -import com.wixpress.dst.greyhound.core.{consumer, Group, NonEmptySet, Topic} +import com.wixpress.dst.greyhound.core.{Group, NonEmptySet, Topic, consumer} import com.wixpress.dst.greyhound.future.GreyhoundRuntime import com.wixpress.dst.greyhound.future.GreyhoundRuntime.Env import zio._ @@ -138,7 +138,11 @@ class GreyhoundConsumersBuilder(val config: GreyhoundConfig) { retryConfig.map(config => { val forTopic = - RetryConfigForTopic(() => config.blockingBackoffs().asScala, NonBlockingBackoffPolicy(config.nonBlockingBackoffs.asScala)) + RetryConfigForTopic( + FiniteBlockingBackoffPolicy(config.blockingBackoffs().asScala.toList), + EmptyInfiniteBlockingBackoffPolicy, + 
NonBlockingBackoffPolicy(config.nonBlockingBackoffs.asScala.toList) + ) CoreRetryConfig({ case _ => forTopic }, None) }) diff --git a/java-interop/src/main/java/com/wixpress/dst/greyhound/scala/RetryConfigBuilder.scala b/java-interop/src/main/java/com/wixpress/dst/greyhound/scala/RetryConfigBuilder.scala index bd0f5538..795844b1 100644 --- a/java-interop/src/main/java/com/wixpress/dst/greyhound/scala/RetryConfigBuilder.scala +++ b/java-interop/src/main/java/com/wixpress/dst/greyhound/scala/RetryConfigBuilder.scala @@ -44,7 +44,7 @@ object RetryConfigBuilder { } private def fromCoreRetryConfig(coreRetryConfig: com.wixpress.dst.greyhound.core.consumer.retry.RetryConfig): RetryConfig = { - val blocking: util.List[Duration] = seqAsJavaList(coreRetryConfig.blockingBackoffs("").apply) + val blocking: util.List[Duration] = seqAsJavaList(coreRetryConfig.blockingBackoffs("")) val nonBlocking: util.List[Duration] = seqAsJavaList(coreRetryConfig.nonBlockingBackoffs("").intervals) new RetryConfig(blocking, nonBlocking) } From a6d1ebed6045f7086da1a815e46cf4d91dff6d97 Mon Sep 17 00:00:00 2001 From: Noam Berman Date: Sun, 18 Jun 2023 00:45:50 +0300 Subject: [PATCH 18/52] multi-tenant consumer proxy redesign - initial commit (#35244) GitOrigin-RevId: e7a2532372964be70c34350951ca99bfc52f7684 --- .../dst/greyhound/core/TopicPartition.scala | 2 + .../greyhound/core/admin/AdminClient.scala | 262 +++++++++++------- .../greyhound/core/consumer/Consumer.scala | 7 +- .../core/consumer/domain/ConsumerRecord.scala | 14 +- .../core/consumer/EventLoopTest.scala | 2 +- .../core/consumer/batched/TestSupport.scala | 3 +- .../consumer/dispatcher/DispatcherTest.scala | 14 +- .../retry/BlockingStateResolverTest.scala | 22 +- .../RetryConsumerRecordHandlerTest.scala | 38 +-- .../dst/greyhound/core/testkit/Maker.scala | 2 +- 10 files changed, 217 insertions(+), 149 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/TopicPartition.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/TopicPartition.scala index 5e4f0cb4..db6f0531 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/TopicPartition.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/TopicPartition.scala @@ -7,6 +7,8 @@ case class TopicPartition(topic: Topic, partition: Partition) { } object TopicPartition { + def fromKafka(topicPartition: KafkaTopicPartition): TopicPartition = + TopicPartition(topicPartition.topic, topicPartition.partition) def apply(topicPartition: KafkaTopicPartition): TopicPartition = TopicPartition(topicPartition.topic, topicPartition.partition) } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/admin/AdminClient.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/admin/AdminClient.scala index 763cf966..f5ac837f 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/admin/AdminClient.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/admin/AdminClient.scala @@ -9,7 +9,7 @@ import com.wixpress.dst.greyhound.core.zioutils.KafkaFutures._ import com.wixpress.dst.greyhound.core.{CommonGreyhoundConfig, GHThrowable, Group, GroupTopicPartition, OffsetAndMetadata, Topic, TopicConfig, TopicPartition} import org.apache.kafka.clients.admin.AlterConfigOp.OpType import org.apache.kafka.clients.admin.ConfigEntry.ConfigSource -import org.apache.kafka.clients.admin.{AdminClient => KafkaAdminClient, AdminClientConfig => KafkaAdminClientConfig, AlterConfigOp, Config, ConfigEntry, ListConsumerGroupOffsetsOptions, NewPartitions, NewTopic, TopicDescription} 
+import org.apache.kafka.clients.admin.{AlterConfigOp, Config, ConfigEntry, ListConsumerGroupOffsetsOptions, ListConsumerGroupOffsetsSpec, NewPartitions, NewTopic, TopicDescription, AdminClient => KafkaAdminClient, AdminClientConfig => KafkaAdminClientConfig} import org.apache.kafka.common.config.ConfigResource import org.apache.kafka.common.config.ConfigResource.Type.TOPIC import org.apache.kafka.common.errors.{InvalidTopicException, TopicExistsException, UnknownTopicOrPartitionException} @@ -32,26 +32,34 @@ trait AdminClient { def topicsExist(topics: Set[Topic])(implicit trace: Trace): ZIO[Any, Throwable, Map[Topic, Boolean]] def createTopics( - configs: Set[TopicConfig], - ignoreErrors: Throwable => Boolean = isTopicExistsError + configs: Set[TopicConfig], + ignoreErrors: Throwable => Boolean = isTopicExistsError )(implicit trace: Trace): RIO[GreyhoundMetrics, Map[String, Option[Throwable]]] def numberOfBrokers(implicit trace: Trace): RIO[Any, Int] def propertiesFor(topics: Set[Topic])(implicit trace: Trace): RIO[Any, Map[Topic, TopicPropertiesResult]] + def commit(group: Group, commits: Map[TopicPartition, OffsetAndMetadata])(implicit trace: Trace): ZIO[Any, Throwable, Unit] + def listGroups()(implicit trace: Trace): RIO[Any, Set[String]] - def groupOffsets(groups: Set[String])(implicit trace: Trace): RIO[Any, Map[GroupTopicPartition, PartitionOffset]] + def groupOffsets(groups: Set[Group])(implicit trace: Trace): RIO[Any, Map[GroupTopicPartition, PartitionOffset]] + + def groupOffsetsSpecific(requestedTopicPartitions: Map[Group, Set[TopicPartition]])( + implicit trace: Trace + ): RIO[Any, Map[GroupTopicPartition, PartitionOffset]] - def groupState(groups: Set[String])(implicit trace: Trace): RIO[Any, Map[String, GroupState]] +// def groupOffsetsSpecific(requestedTopicPartitions: Map[Group, Set[TopicPartition]])(implicit trace: Trace): RIO[Any, Map[GroupTopicPartition, PartitionOffset]] + + def groupState(groups: Set[Group])(implicit trace: Trace): RIO[Any, Map[String, GroupState]] def deleteTopic(topic: Topic)(implicit trace: Trace): RIO[Any, Unit] def describeConsumerGroups(groupIds: Set[Group])(implicit trace: Trace): RIO[Any, Map[Group, ConsumerGroupDescription]] def consumerGroupOffsets(groupId: Group, onlyPartitions: Option[Set[TopicPartition]] = None)( - implicit trace: Trace + implicit trace: Trace ): RIO[Any, Map[TopicPartition, OffsetAndMetadata]] def increasePartitions(topic: Topic, newCount: Int)(implicit trace: Trace): RIO[Any with GreyhoundMetrics, Unit] @@ -62,9 +70,9 @@ trait AdminClient { * true, use the deprecated non incremental alter */ def updateTopicConfigProperties( - topic: Topic, - configProperties: Map[String, ConfigPropOp], - useNonIncrementalAlter: Boolean = false + topic: Topic, + configProperties: Map[String, ConfigPropOp], + useNonIncrementalAlter: Boolean = false )(implicit trace: Trace): RIO[GreyhoundMetrics, Unit] def attributes: Map[String, String] @@ -134,14 +142,13 @@ object AdminClient { .values() .asScala .headOption - .map { - case (_, topicResult) => - topicResult.asZio.either.flatMap { - case Right(_) => ZIO.succeed(true) - case Left(_: UnknownTopicOrPartitionException) => ZIO.succeed(false) - case Left(_: InvalidTopicException) => ZIO.succeed(false) - case Left(ex) => ZIO.fail(ex) - } + .map { case (_, topicResult) => + topicResult.asZio.either.flatMap { + case Right(_) => ZIO.succeed(true) + case Left(_: UnknownTopicOrPartitionException) => ZIO.succeed(false) + case Left(_: InvalidTopicException) => ZIO.succeed(false) + case Left(ex) => 
ZIO.fail(ex) + } } .getOrElse(ZIO.succeed(false)) } @@ -149,32 +156,35 @@ object AdminClient { override def topicsExist(topics: Set[Topic])(implicit trace: Trace): ZIO[Any, Throwable, Map[Topic, Boolean]] = attemptBlocking(client.describeTopics(topics.asJava)).flatMap { result => ZIO - .foreach(result.values().asScala.toSeq) { - case (topic, topicResult) => - topicResult.asZio.either.flatMap { - case Right(_) => ZIO.succeed(topic -> true) - case Left(_: UnknownTopicOrPartitionException) => ZIO.succeed(topic -> false) - case Left(ex) => ZIO.fail(ex) - } + .foreach(result.values().asScala.toSeq) { case (topic, topicResult) => + topicResult.asZio.either.flatMap { + case Right(_) => ZIO.succeed(topic -> true) + case Left(_: UnknownTopicOrPartitionException) => ZIO.succeed(topic -> false) + case Left(ex) => ZIO.fail(ex) + } } .map(_.toMap) } override def createTopics( - configs: Set[TopicConfig], - ignoreErrors: Throwable => Boolean = isTopicExistsError + configs: Set[TopicConfig], + ignoreErrors: Throwable => Boolean = isTopicExistsError )(implicit trace: Trace): RIO[GreyhoundMetrics, Map[String, Option[Throwable]]] = { val configsByTopic = configs.map(c => c.name -> c).toMap attemptBlocking(client.createTopics(configs.map(toNewTopic).asJava)).flatMap { result => ZIO - .foreach(result.values.asScala.toSeq) { - case (topic, topicResult) => - topicResult.asZio.unit - .reporting(res => - TopicCreated(topic, configsByTopic(topic).partitions, attributes, res.mapExit(fromExit(isTopicExistsError))) + .foreach(result.values.asScala.toSeq) { case (topic, topicResult) => + topicResult.asZio.unit + .reporting(res => + TopicCreated( + topic, + configsByTopic(topic).partitions, + attributes, + res.mapExit(fromExit(isTopicExistsError)) ) - .either - .map(topic -> _.left.toOption.filterNot(ignoreErrors)) + ) + .either + .map(topic -> _.left.toOption.filterNot(ignoreErrors)) } .map(_.toMap) } @@ -184,15 +194,20 @@ object AdminClient { attemptBlocking(client.describeCluster()) .flatMap(_.nodes().asZio.map(_.size)) - override def propertiesFor(topics: Set[Topic])(implicit trace: Trace): RIO[Any, Map[Topic, TopicPropertiesResult]] = + override def propertiesFor( + topics: Set[Topic] + )(implicit trace: Trace): RIO[Any, Map[Topic, TopicPropertiesResult]] = (describeConfigs(client, topics) zipPar describePartitions(client, topics)).map { case (configsPerTopic, partitionsAndReplicationPerTopic) => partitionsAndReplicationPerTopic .map(pair => pair -> configsPerTopic.getOrElse(pair._1, TopicPropertiesResult.TopicDoesnExist(pair._1))) .map { - case ((topic, TopicProperties(_, partitions, _, replication)), TopicProperties(_, _, propertiesMap, _)) => + case ( + (topic, TopicProperties(_, partitions, _, replication)), + TopicProperties(_, _, propertiesMap, _) + ) => topic -> TopicPropertiesResult(topic, partitions, propertiesMap, replication) - case ((topic, _), _) => topic -> TopicPropertiesResult.TopicDoesnExist(topic) + case ((topic, _), _) => topic -> TopicPropertiesResult.TopicDoesnExist(topic) } } @@ -210,33 +225,68 @@ object AdminClient { groups <- result.valid().asZio } yield groups.asScala.map(_.groupId()).toSet - override def groupOffsets(groups: Set[String])(implicit trace: Trace): RIO[Any, Map[GroupTopicPartition, PartitionOffset]] = + override def commit(group: Group, commits: Map[TopicPartition, OffsetAndMetadata])(implicit trace: Trace): ZIO[Any, Throwable, Unit] = + attemptBlocking(client.alterConsumerGroupOffsets(group, + commits.map { case (tp, offset) => (tp.asKafka, offset.asKafka) }.asJava)).unit + 
+ override def groupOffsetsSpecific( + requestedTopicPartitions: Map[Group, Set[TopicPartition]] + )(implicit trace: Trace): RIO[Any, Map[GroupTopicPartition, PartitionOffset]] = for { - result <- ZIO.foreach(groups)(group => attemptBlocking(group -> client.listConsumerGroupOffsets(group))) + result <- ZIO.flatten( + ZIO + .attemptBlocking( + client.listConsumerGroupOffsets( + requestedTopicPartitions + .mapValues(tps => + new ListConsumerGroupOffsetsSpec().topicPartitions(tps.map(_.asKafka).asJavaCollection) + ) + .asJava + ) + ) + .map(_.all.asZio) + ) + rawOffsets = result.asScala.toMap.mapValues(_.asScala.toMap) + offset = + rawOffsets.map { case (group, offsets) => + offsets.map{case (tp, offset) => + (GroupTopicPartition(group, TopicPartition.fromKafka(tp)), PartitionOffset(Option(offset).map(_.offset()).getOrElse(0L))) + } + } + groupOffsets = offset.foldLeft(Map.empty[GroupTopicPartition, PartitionOffset])((x, y) => x ++ y) + } yield groupOffsets + + override def groupOffsets( + groups: Set[String] + )(implicit trace: Trace): RIO[Any, Map[GroupTopicPartition, PartitionOffset]] = + for { + result <- ZIO.foreach(groups)(group => attemptBlocking(group -> client.listConsumerGroupOffsets(group))) // TODO: remove ._1 , ._2 rawOffsetsEffects = result.toMap.mapValues(_.partitionsToOffsetAndMetadata().asZio) - offsetsEffects = + offsetsEffects = rawOffsetsEffects.map(offset => offset._2.map(f => - f.asScala.map(p => p.copy(GroupTopicPartition(offset._1, core.TopicPartition(p._1)), PartitionOffset(p._2.offset()))) + f.asScala.map(p => + p.copy(GroupTopicPartition(offset._1, core.TopicPartition(p._1)), PartitionOffset(p._2.offset())) + ) ) ) - offsetsMapSets <- ZIO.collectAll(offsetsEffects) - groupOffsets = offsetsMapSets.foldLeft(Map.empty[GroupTopicPartition, PartitionOffset])((x, y) => x ++ y) + offsetsMapSets <- ZIO.collectAll(offsetsEffects) + groupOffsets = offsetsMapSets.foldLeft(Map.empty[GroupTopicPartition, PartitionOffset])((x, y) => x ++ y) } yield groupOffsets override def groupState(groups: Set[String])(implicit trace: Trace): RIO[Any, Map[String, GroupState]] = for { - result <- attemptBlocking(client.describeConsumerGroups(groups.asJava)) + result <- attemptBlocking(client.describeConsumerGroups(groups.asJava)) groupEffects = result.describedGroups().asScala.mapValues(_.asZio).toMap - groupsList <- ZIO.collectAll(groupEffects.values) - membersMap = groupsList.groupBy(_.groupId()).mapValues(_.flatMap(_.members().asScala)).toMap - groupState = membersMap - .mapValues(members => { - val topicPartitionsMap = members.flatMap(_.assignment().topicPartitions().asScala) - GroupState(topicPartitionsMap.map(TopicPartition(_)).toSet) - }) - .toMap + groupsList <- ZIO.collectAll(groupEffects.values) + membersMap = groupsList.groupBy(_.groupId()).mapValues(_.flatMap(_.members().asScala)).toMap + groupState = membersMap + .mapValues(members => { + val topicPartitionsMap = members.flatMap(_.assignment().topicPartitions().asScala) + GroupState(topicPartitionsMap.map(TopicPartition(_)).toSet) + }) + .toMap } yield groupState override def deleteTopic(topic: Topic)(implicit trace: Trace): RIO[Any, Unit] = { @@ -245,7 +295,9 @@ object AdminClient { .unit } - override def describeConsumerGroups(groupIds: Set[Group])(implicit trace: Trace): RIO[Any, Map[Group, ConsumerGroupDescription]] = { + override def describeConsumerGroups( + groupIds: Set[Group] + )(implicit trace: Trace): RIO[Any, Map[Group, ConsumerGroupDescription]] = { for { desc <- 
attemptBlocking(client.describeConsumerGroups(groupIds.asJava).all()) all <- desc.asZio @@ -253,19 +305,22 @@ object AdminClient { } override def consumerGroupOffsets( - groupId: Group, - onlyPartitions: Option[Set[TopicPartition]] = None + groupId: Group, + onlyPartitions: Option[Set[TopicPartition]] = None )(implicit trace: Trace): RIO[Any, Map[TopicPartition, OffsetAndMetadata]] = { val maybePartitions: util.List[common.TopicPartition] = onlyPartitions.map(_.map(_.asKafka).toList.asJava).orNull for { desc <- attemptBlocking( - client.listConsumerGroupOffsets(groupId, new ListConsumerGroupOffsetsOptions().topicPartitions(maybePartitions)) - ) - res <- attemptBlocking(desc.partitionsToOffsetAndMetadata().get()) + client + .listConsumerGroupOffsets(groupId, new ListConsumerGroupOffsetsOptions().topicPartitions(maybePartitions)) + ) + res <- attemptBlocking(desc.partitionsToOffsetAndMetadata().get()) } yield res.asScala.toMap.map { case (tp, om) => (TopicPartition(tp), OffsetAndMetadata(om)) } } - override def increasePartitions(topic: Topic, newCount: Int)(implicit trace: Trace): RIO[GreyhoundMetrics, Unit] = { + override def increasePartitions(topic: Topic, newCount: Int)( + implicit trace: Trace + ): RIO[GreyhoundMetrics, Unit] = { attemptBlocking(client.createPartitions(Map(topic -> NewPartitions.increaseTo(newCount)).asJava)) .flatMap(_.all().asZio) .unit @@ -273,9 +328,9 @@ object AdminClient { } override def updateTopicConfigProperties( - topic: Topic, - configProperties: Map[String, ConfigPropOp], - useNonIncrementalAlter: Boolean = false + topic: Topic, + configProperties: Map[String, ConfigPropOp], + useNonIncrementalAlter: Boolean = false )(implicit trace: Trace): RIO[GreyhoundMetrics, Unit] = { if (useNonIncrementalAlter) updateTopicConfigUsingAlter(topic, configProperties) else updateTopicConfigIncremental(topic, configProperties) @@ -290,25 +345,24 @@ object AdminClient { described <- describeConfigs(client, Set(topic)) beforeProps <- described.values.head.getOrFail beforeConfig = beforeProps.propertiesThat(_.isTopicSpecific) - configToSet = configProperties.foldLeft(beforeConfig) { - case (acc, (key, ConfigPropOp.Delete)) => acc - key - case (acc, (key, ConfigPropOp.Set(value))) => acc + (key -> value) - } - configJava = new Config(configToSet.map { case (k, v) => new ConfigEntry(k, v) }.toList.asJava) - _ <- attemptBlocking(client.alterConfigs(Map(resource -> configJava).asJava)) - .flatMap(_.all().asZio) + configToSet = configProperties.foldLeft(beforeConfig) { + case (acc, (key, ConfigPropOp.Delete)) => acc - key + case (acc, (key, ConfigPropOp.Set(value))) => acc + (key -> value) + } + configJava = new Config(configToSet.map { case (k, v) => new ConfigEntry(k, v) }.toList.asJava) + _ <- attemptBlocking(client.alterConfigs(Map(resource -> configJava).asJava)) + .flatMap(_.all().asZio) } yield () ).reporting(TopicConfigUpdated(topic, configProperties, incremental = false, attributes, _)) } private def updateTopicConfigIncremental(topic: Topic, configProperties: Map[String, ConfigPropOp]) = { val resource = new ConfigResource(ConfigResource.Type.TOPIC, topic) - val ops = configProperties.map { - case (key, value) => - value match { - case ConfigPropOp.Delete => new AlterConfigOp(new ConfigEntry(key, null), OpType.DELETE) - case ConfigPropOp.Set(value) => new AlterConfigOp(new ConfigEntry(key, value), OpType.SET) - } + val ops = configProperties.map { case (key, value) => + value match { + case ConfigPropOp.Delete => new AlterConfigOp(new ConfigEntry(key, null), 
OpType.DELETE) + case ConfigPropOp.Set(value) => new AlterConfigOp(new ConfigEntry(key, value), OpType.SET) + } }.asJavaCollection attemptBlocking(client.incrementalAlterConfigs(Map(resource -> ops).asJava)) .flatMap(_.all().asZio) @@ -320,11 +374,11 @@ object AdminClient { } private def describeConfigs(client: KafkaAdminClient, topics: Set[Topic]): RIO[Any, Map[Topic, TopicPropertiesResult]] = - attemptBlocking(client.describeConfigs(topics.map(t => new ConfigResource(TOPIC, t)).asJavaCollection)) flatMap { result => - ZIO - .collectAll( - result.values.asScala.toMap.map { - case (resource, kf) => + attemptBlocking(client.describeConfigs(topics.map(t => new ConfigResource(TOPIC, t)).asJavaCollection)) flatMap { + result => + ZIO + .collectAll( + result.values.asScala.toMap.map { case (resource, kf) => kf.asZio .map { config => resource.name -> @@ -335,35 +389,36 @@ object AdminClient { 0 ) } - .catchSome { - case _: UnknownTopicOrPartitionException => - ZIO.succeed(resource.name -> TopicPropertiesResult.TopicDoesnExist(resource.name)) + .catchSome { case _: UnknownTopicOrPartitionException => + ZIO.succeed(resource.name -> TopicPropertiesResult.TopicDoesnExist(resource.name)) } - } - ) - .map(_.toMap) + } + ) + .map(_.toMap) } - private def describePartitions(client: KafkaAdminClient, topics: Set[Topic]): RIO[Any, Map[Topic, TopicPropertiesResult]] = + private def describePartitions( + client: KafkaAdminClient, + topics: Set[Topic] + ): RIO[Any, Map[Topic, TopicPropertiesResult]] = attemptBlocking(client.describeTopics(topics.asJavaCollection)) .flatMap { result => ZIO - .collectAll(result.values.asScala.toMap.map { - case (topic, kf) => - kf.asZio - .map { desc => - val replication = desc.partitions.asScala.map(_.replicas.size).sorted.headOption.getOrElse(0) - topic -> - TopicPropertiesResult.TopicProperties( - topic, - desc.partitions.size, - Seq.empty, - replication - ) - } - .catchSome { - case _: UnknownTopicOrPartitionException => ZIO.succeed(topic -> TopicPropertiesResult.TopicDoesnExist(topic)) - } + .collectAll(result.values.asScala.toMap.map { case (topic, kf) => + kf.asZio + .map { desc => + val replication = desc.partitions.asScala.map(_.replicas.size).sorted.headOption.getOrElse(0) + topic -> + TopicPropertiesResult.TopicProperties( + topic, + desc.partitions.size, + Seq.empty, + replication + ) + } + .catchSome { case _: UnknownTopicOrPartitionException => + ZIO.succeed(topic -> TopicPropertiesResult.TopicDoesnExist(topic)) + } }) .map(_.toMap) } @@ -372,7 +427,8 @@ object AdminClient { Option(e.getCause).exists(_.isInstanceOf[TopicExistsException]) } -case class AdminClientConfig(bootstrapServers: String, extraProperties: Map[String, String] = Map.empty) extends CommonGreyhoundConfig { +case class AdminClientConfig(bootstrapServers: String, extraProperties: Map[String, String] = Map.empty) + extends CommonGreyhoundConfig { override def kafkaProps: Map[String, String] = Map(KafkaAdminClientConfig.BOOTSTRAP_SERVERS_CONFIG -> bootstrapServers) ++ extraProperties diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala index 67870c29..8103b68e 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala @@ -69,6 +69,8 @@ trait Consumer { def assignment(implicit trace: Trace): Task[Set[TopicPartition]] + def assign(tps: Set[TopicPartition])(implicit trace: 
Trace): Task[Unit] = ZIO.fail(new IllegalStateException("Not implemented")) + def config(implicit trace: Trace): ConsumerConfig def listTopics(implicit trace: Trace): RIO[Any, Map[Topic, List[PartitionInfo]]] @@ -124,7 +126,7 @@ object Consumer { override def poll(timeout: Duration)(implicit trace: Trace): RIO[Any, Records] = withConsumerM { c => rewindPositionsOnError(c) { - attemptBlocking(c.poll(time.Duration.ofMillis(timeout.toMillis)).asScala.map(ConsumerRecord(_))) + attemptBlocking(c.poll(time.Duration.ofMillis(timeout.toMillis)).asScala.map(rec => ConsumerRecord(rec, config.groupId))) .flatMap(ZIO.foreach(_)(cfg.decryptor.decrypt)) } } @@ -208,6 +210,9 @@ object Consumer { withConsumer(_.assignment().asScala.toSet.map(TopicPartition.apply(_: org.apache.kafka.common.TopicPartition))) } + override def assign(tps: Set[TopicPartition])(implicit trace: Trace): Task[Unit] = + withConsumer(_.assign(kafkaPartitions(tps))) + private def allPositionsUnsafe = attemptBlocking { consumer .assignment() diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/domain/ConsumerRecord.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/domain/ConsumerRecord.scala index ff2bfabc..1ec2e077 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/domain/ConsumerRecord.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/domain/ConsumerRecord.scala @@ -13,7 +13,8 @@ case class ConsumerRecord[+K, +V]( value: V, pollTime: Long, bytesTotal: Long, - producedTimestamp: Long + producedTimestamp: Long, + consumerGroupId: ConsumerGroupId ) { def id: String = s"$topic:$partition:$offset" @@ -30,7 +31,8 @@ case class ConsumerRecord[+K, +V]( value = fv(value), pollTime = pollTime, bytesTotal = bytesTotal, - producedTimestamp = producedTimestamp + producedTimestamp = producedTimestamp, + consumerGroupId = consumerGroupId ) def bimapM[R, E, K2, V2](fk: K => ZIO[R, E, K2], fv: V => ZIO[R, E, V2]): ZIO[R, E, ConsumerRecord[K2, V2]] = @@ -46,7 +48,8 @@ case class ConsumerRecord[+K, +V]( value = value2, pollTime = pollTime, bytesTotal = bytesTotal, - producedTimestamp = producedTimestamp + producedTimestamp = producedTimestamp, + consumerGroupId = consumerGroupId ) def mapKey[K2](f: K => K2): ConsumerRecord[K2, V] = bimap(f, identity) @@ -56,7 +59,7 @@ case class ConsumerRecord[+K, +V]( } object ConsumerRecord { - def apply[K, V](record: KafkaConsumerRecord[K, V]): ConsumerRecord[K, V] = + def apply[K, V](record: KafkaConsumerRecord[K, V], consumerGroupId: ConsumerGroupId): ConsumerRecord[K, V] = ConsumerRecord( topic = record.topic, partition = record.partition, @@ -67,6 +70,7 @@ object ConsumerRecord { pollTime = System.currentTimeMillis, producedTimestamp = record.timestamp, bytesTotal = record.serializedValueSize() + record.serializedKeySize() + - record.headers().toArray.map(h => h.key.length + h.value.length).sum + record.headers().toArray.map(h => h.key.length + h.value.length).sum, + consumerGroupId = consumerGroupId ) } diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/EventLoopTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/EventLoopTest.scala index aa335fde..15b4b1f3 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/EventLoopTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/EventLoopTest.scala @@ -114,7 +114,7 @@ object EventLoopTest { val partition = 0 val offset = 0L val record: ConsumerRecord[Chunk[Byte], Chunk[Byte]] = - 
ConsumerRecord(topic, partition, offset, Headers.Empty, None, Chunk.empty, 0L, 0L, 0L) + ConsumerRecord(topic, partition, offset, Headers.Empty, None, Chunk.empty, 0L, 0L, 0L, "") val exception = new RuntimeException("oops") def recordsFrom(records: ConsumerRecord[Chunk[Byte], Chunk[Byte]]*): Records = { diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/batched/TestSupport.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/batched/TestSupport.scala index 906e4b02..969d74c4 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/batched/TestSupport.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/batched/TestSupport.scala @@ -19,7 +19,8 @@ object TestSupport { Chunk.fromArray(payload.getBytes), 0L, payload.getBytes.length, - 0L + 0L, + "" ) def records(topicCount: Int = 4, partitions: Int = 4, perPartition: Int = 3, hint: String = "") = { diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala index 5a89a1b7..d2b8484f 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala @@ -158,12 +158,12 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { _ <- ZIO.foreachDiscard(0 to (highWatermark + 1)) { offset => submit( dispatcher, - ConsumerRecord[Chunk[Byte], Chunk[Byte]](topic, partition, offset, Headers.Empty, None, Chunk.empty, 0L, 0L, 0L) + ConsumerRecord[Chunk[Byte], Chunk[Byte]](topic, partition, offset, Headers.Empty, None, Chunk.empty, 0L, 0L, 0L, "") ) } _ <- submit( dispatcher, - ConsumerRecord[Chunk[Byte], Chunk[Byte]](topic, partition, 6L, Headers.Empty, None, Chunk.empty, 0L, 0L, 0L) + ConsumerRecord[Chunk[Byte], Chunk[Byte]](topic, partition, 6L, Headers.Empty, None, Chunk.empty, 0L, 0L, 0L, "") ) // Will be dropped _ <- eventuallyZ(dispatcher.resumeablePartitions(Set(topicPartition)))(_.isEmpty) _ <- ZIO.foreachDiscard(1 to 4)(_ => queue.take) @@ -195,13 +195,13 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { _ <- ZIO.foreachDiscard(0 to (highWatermark + 1)) { offset => submit( dispatcher, - ConsumerRecord[Chunk[Byte], Chunk[Byte]](topic, partition, offset, Headers.Empty, None, Chunk.empty, 0L, 0L, 0L) + ConsumerRecord[Chunk[Byte], Chunk[Byte]](topic, partition, offset, Headers.Empty, None, Chunk.empty, 0L, 0L, 0L, "") ) } overCapacitySubmitResult <- submit( dispatcher, - ConsumerRecord[Chunk[Byte], Chunk[Byte]](topic, partition, 6L, Headers.Empty, None, Chunk.empty, 0L, 0L, 0L) + ConsumerRecord[Chunk[Byte], Chunk[Byte]](topic, partition, 6L, Headers.Empty, None, Chunk.empty, 0L, 0L, 0L, "") ) // Will be dropped resumeablePartitionsWhenInHighWatermark <- dispatcher.resumeablePartitions(Set(topicPartition)) _ <- ZIO.foreachDiscard(1 to 4)(_ => queue.take) @@ -212,13 +212,13 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { _ <- ZIO.foreachDiscard(0 to 3) { offset => submit( dispatcher, - ConsumerRecord[Chunk[Byte], Chunk[Byte]](topic, partition, offset, Headers.Empty, None, Chunk.empty, 0L, 0L, 0L) + ConsumerRecord[Chunk[Byte], Chunk[Byte]](topic, partition, offset, Headers.Empty, None, Chunk.empty, 0L, 0L, 0L, "") ) } overCapacitySubmitResult2 <- submit( dispatcher, - ConsumerRecord[Chunk[Byte], Chunk[Byte]](topic, partition, 16L, Headers.Empty, 
None, Chunk.empty, 0L, 0L, 0L) + ConsumerRecord[Chunk[Byte], Chunk[Byte]](topic, partition, 16L, Headers.Empty, None, Chunk.empty, 0L, 0L, 0L, "") ) // Will be dropped _ <- ZIO.foreachDiscard(1 to 4)(_ => queue.take) _ <- TestClock.adjust(1.second) @@ -332,7 +332,7 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { val topic = "topic" val partition = 0 val topicPartition = TopicPartition(topic, partition) - val record = ConsumerRecord[Chunk[Byte], Chunk[Byte]](topic, partition, 0L, Headers.Empty, None, Chunk.empty, 0L, 0L, 0L) + val record = ConsumerRecord[Chunk[Byte], Chunk[Byte]](topic, partition, 0L, Headers.Empty, None, Chunk.empty, 0L, 0L, 0L, "") def getKeys(numKeys: Int) = (0 until numKeys).map(i => Some(Chunk.fromArray(s"key$i".getBytes))) diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/BlockingStateResolverTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/BlockingStateResolverTest.scala index 3455e336..9450e35b 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/BlockingStateResolverTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/BlockingStateResolverTest.scala @@ -31,7 +31,7 @@ class BlockingStateResolverTest extends BaseTest[TestEnvironment with GreyhoundM resolver = BlockingStateResolver(blockingState) _ <- blockingState.set(Map(TopicPartitionTarget(TopicPartition(topic, partition)) -> state)) - shouldBlock <- resolver.resolve(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L)) + shouldBlock <- resolver.resolve(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L, "")) } yield shouldBlock === expectedShouldBlock } } @@ -48,7 +48,7 @@ class BlockingStateResolverTest extends BaseTest[TestEnvironment with GreyhoundM resolver = BlockingStateResolver(blockingState) _ <- blockingState.set(Map(TopicTarget(topic) -> state)) - shouldBlock <- resolver.resolve(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L)) + shouldBlock <- resolver.resolve(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L, "")) } yield shouldBlock === expectedShouldBlock } } @@ -62,7 +62,7 @@ class BlockingStateResolverTest extends BaseTest[TestEnvironment with GreyhoundM resolver = BlockingStateResolver(blockingState) - shouldBlock <- resolver.resolve(ConsumerRecord(missingTopic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L)) + shouldBlock <- resolver.resolve(ConsumerRecord(missingTopic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L, "")) } yield shouldBlock === true } @@ -77,7 +77,7 @@ class BlockingStateResolverTest extends BaseTest[TestEnvironment with GreyhoundM resolver = BlockingStateResolver(blockingState) - record = ConsumerRecord(topic, partition, offset, headers, Some(key), value, 0L, 0L, 0L) + record = ConsumerRecord(topic, partition, offset, headers, Some(key), value, 0L, 0L, 0L, "") shouldBlock <- resolver.resolve(record) updatedStateMap <- blockingState.get updatedState = updatedStateMap(TopicPartitionTarget(tpartition)) @@ -95,7 +95,7 @@ class BlockingStateResolverTest extends BaseTest[TestEnvironment with GreyhoundM resolver = BlockingStateResolver(blockingState) - record = ConsumerRecord(topic, partition, offset, headers, Some(key), value, 0L, 0L, 0L) + record = ConsumerRecord(topic, partition, offset, headers, Some(key), value, 0L, 0L, 0L, "") shouldBlock <- resolver.resolve(record) 
updatedStateMap <- blockingState.get updatedState = updatedStateMap(TopicPartitionTarget(tpartition)) @@ -111,7 +111,7 @@ class BlockingStateResolverTest extends BaseTest[TestEnvironment with GreyhoundM resolver = BlockingStateResolver(blockingState) - shouldBlock <- resolver.resolve(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L)) + shouldBlock <- resolver.resolve(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L, "")) updatedStateMap <- blockingState.get updatedState = updatedStateMap(TopicTarget(topic)) } yield shouldBlock === true and updatedState === InternalBlocking @@ -150,7 +150,7 @@ class BlockingStateResolverTest extends BaseTest[TestEnvironment with GreyhoundM ) ) - shouldBlock <- resolver.resolve(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L)) + shouldBlock <- resolver.resolve(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L, "")) } yield shouldBlock === expectedShouldBlock } } @@ -170,8 +170,8 @@ class BlockingStateResolverTest extends BaseTest[TestEnvironment with GreyhoundM resolver = BlockingStateResolver(blockingState) - record = ConsumerRecord(topic, partition, offset, headers, Some(key), value, 0L, 0L, 0L) - record2 = ConsumerRecord(anotherTopic, partition, offset, headers, Some(key), value, 0L, 0L, 0L) + record = ConsumerRecord(topic, partition, offset, headers, Some(key), value, 0L, 0L, 0L, "") + record2 = ConsumerRecord(anotherTopic, partition, offset, headers, Some(key), value, 0L, 0L, 0L, "") shouldBlockBefore <- resolver.resolve(record) shouldBlockBefore2 <- resolver.resolve(record2) _ <- resolver.setBlockingState(BlockErrors(topic)) @@ -219,7 +219,7 @@ class BlockingStateResolverTest extends BaseTest[TestEnvironment with GreyhoundM resolver = BlockingStateResolver(blockingState) - record = ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L) + record = ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L, "") shouldBlock <- resolver.resolve(record) updatedStateMap <- blockingState.get updatedStateTopic = updatedStateMap(TopicTarget(topic)) @@ -228,7 +228,7 @@ class BlockingStateResolverTest extends BaseTest[TestEnvironment with GreyhoundM } } - final val BlockedMessageState = Blocked(ConsumerRecord("", 0, 0, Headers.Empty, None, "", 0L, 0L, 0L)) + final val BlockedMessageState = Blocked(ConsumerRecord("", 0, 0, Headers.Empty, None, "", 0L, 0L, 0L, "")) } case class Foo(message: String) diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConsumerRecordHandlerTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConsumerRecordHandlerTest.scala index ef9fccb1..39018b94 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConsumerRecordHandlerTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryConsumerRecordHandlerTest.scala @@ -48,7 +48,7 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics ) key <- bytes value <- bytes - _ <- retryHandler.handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L)) + _ <- retryHandler.handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L, "")) record <- producer.records.take now <- currentTime } yield { @@ -84,7 +84,7 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with 
TestMetrics value <- bytes begin <- currentTime headers = RetryAttempt.toHeaders(RetryAttempt(topic, attempt, begin, 1.second)) - _ <- retryHandler.handle(ConsumerRecord(retryTopic, partition, offset, headers, None, value, 0L, 0L, 0L)).fork + _ <- retryHandler.handle(ConsumerRecord(retryTopic, partition, offset, headers, None, value, 0L, 0L, 0L, "")).fork _ <- TestClock.adjust(1.second).repeat(Schedule.once) end <- executionTime.await.disconnect.timeoutFail(TimeoutWaitingForAssertion)(5.seconds) } yield end must beBetween(begin.plusSeconds(1), begin.plusSeconds(3)) @@ -108,7 +108,7 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics ) key <- bytes value <- bytes - record = ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L) + record = ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L, "") _ <- retryHandler.handle(record).fork _ <- adjustTestClockFor(100.millis) _ <- eventuallyZ(blockingState.get)(_.get(TopicPartitionTarget(tpartition)).contains(Blocked(record))) @@ -140,7 +140,7 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics ) key <- bytes value <- bytes - _ <- retryHandler.handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L)).fork + _ <- retryHandler.handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L, "")).fork _ <- adjustTestClockFor(4.seconds) _ <- eventuallyZ(TestClock.adjust(100.millis) *> TestMetrics.reported)( _.contains(NoRetryOnNonRetryableFailure(tpartition, offset, cause)) @@ -167,7 +167,7 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics ) key <- bytes value <- bytes - _ <- retryHandler.handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L)).fork + _ <- retryHandler.handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L, "")).fork _ <- adjustTestClockFor(1.second, 1.2) metrics <- TestMetrics.reported _ <- eventuallyZ(handleCountRef.get)(_ >= 10) @@ -194,7 +194,7 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics ) key <- bytes value <- bytes - record = ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L) + record = ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L, "") fiber <- retryHandler.handle(record).fork _ <- adjustTestClockFor(retryDurations.head, 0.5) _ <- eventuallyZ(TestMetrics.reported)(metrics => @@ -206,7 +206,7 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics _ <- adjustTestClockFor(retryDurations.head) _ <- fiber.join _ <- eventuallyZ(TestMetrics.reported)(_.contains(BlockingIgnoredOnceFor(tpartition, offset))) - _ <- retryHandler.handle(ConsumerRecord(topic, partition, offset + 1, Headers.Empty, Some(key), value, 0L, 0L, 0L)).fork + _ <- retryHandler.handle(ConsumerRecord(topic, partition, offset + 1, Headers.Empty, Some(key), value, 0L, 0L, 0L, "")).fork _ <- adjustTestClockFor(retryDurations.head, 1.5) _ <- eventuallyZ(TestMetrics.reported)(metrics => !metrics.contains(BlockingIgnoredOnceFor(tpartition, offset + 1)) && @@ -234,11 +234,11 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics key <- bytes value <- bytes _ <- blockingState.set(Map(TopicPartitionTarget(tpartition) -> IgnoringOnce)) - fiber <- retryHandler.handle(ConsumerRecord(topic, partition, offset, 
Headers.Empty, Some(key), value, 0L, 0L, 0L)).fork + fiber <- retryHandler.handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L, "")).fork _ <- adjustTestClockFor(50.millis) _ <- eventuallyZ(TestMetrics.reported)(_.contains(BlockingIgnoredOnceFor(tpartition, offset))) _ <- fiber.join - _ <- retryHandler.handle(ConsumerRecord(topic, partition, offset + 1, Headers.Empty, Some(key), value, 0L, 0L, 0L)).fork + _ <- retryHandler.handle(ConsumerRecord(topic, partition, offset + 1, Headers.Empty, Some(key), value, 0L, 0L, 0L, "")).fork _ <- adjustTestClockFor(50.millis, 1.5) _ <- eventuallyZ(TestMetrics.reported)(metrics => !metrics.contains(BlockingIgnoredOnceFor(tpartition, offset + 1)) && @@ -276,7 +276,7 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics ) key <- bytes value <- bytes - fiber <- retryHandler.handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L)).fork + fiber <- retryHandler.handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L, "")).fork _ <- adjustTestClockFor(retryDurations.head, 0.5) _ <- eventuallyZ(TestMetrics.reported)(list => !list.contains(BlockingIgnoredForAllFor(tpartition, offset)) && @@ -286,12 +286,12 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics _ <- adjustTestClockFor(retryDurations.head) _ <- fiber.join _ <- eventuallyZ(TestMetrics.reported)(_.contains(BlockingIgnoredForAllFor(tpartition, offset))) - _ <- retryHandler.handle(ConsumerRecord(topic, partition, offset + 1, Headers.Empty, Some(key), value, 0L, 0L, 0L)) + _ <- retryHandler.handle(ConsumerRecord(topic, partition, offset + 1, Headers.Empty, Some(key), value, 0L, 0L, 0L, "")) _ <- eventuallyZ(TestMetrics.reported)(_.contains(BlockingIgnoredForAllFor(tpartition, offset + 1))) _ <- blockingState.set(Map(target(tpartition) -> InternalBlocking)) _ <- handleCountRef.set(0) - _ <- retryHandler.handle(ConsumerRecord(topic, partition, offset + 2, Headers.Empty, Some(key), value, 0L, 0L, 0L)).fork + _ <- retryHandler.handle(ConsumerRecord(topic, partition, offset + 2, Headers.Empty, Some(key), value, 0L, 0L, 0L, "")).fork _ <- adjustTestClockFor(retryDurations.head * 1.2) _ <- eventuallyZ(TestMetrics.reported)(_.contains(BlockingRetryHandlerInvocationFailed(tpartition, offset + 2, "RetriableError"))) _ <- adjustTestClockFor(retryDurations(1) * 1.2) @@ -323,7 +323,7 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics ) key <- bytes value <- bytes - _ <- retryHandler.handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L)).fork + _ <- retryHandler.handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L, "")).fork _ <- adjustTestClockFor(4.seconds) _ <- eventuallyZ(TestClock.adjust(100.millis) *> TestMetrics.reported)( _.contains(BlockingRetryHandlerInvocationFailed(tpartition, offset, "RetriableError")) @@ -353,8 +353,8 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics key <- bytes value <- bytes value2 <- bytes - _ <- retryHandler.handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L)) // no retry - _ <- retryHandler.handle(ConsumerRecord(otherTopic, partition, offset, Headers.Empty, Some(key), value2, 0L, 0L, 0L)) // with retry + _ <- retryHandler.handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L, "")) // no retry 
+ _ <- retryHandler.handle(ConsumerRecord(otherTopic, partition, offset, Headers.Empty, Some(key), value2, 0L, 0L, 0L, "")) // with retry producedRecords <- producer.records.takeAll } yield { producedRecords.map(_.value.get) === value2 :: Nil @@ -381,7 +381,7 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics ) key <- bytes value <- bytes - handling <- retryHandler.handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L)).forkDaemon + handling <- retryHandler.handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L, "")).forkDaemon _ <- TestClock.adjust(201.millis) _ <- producer.records.takeN(3) _ <- producerFails.set(false) @@ -417,7 +417,7 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics key <- bytes value <- bytes handling <- retryHandler - .handle(ConsumerRecord(topic, partition, offset, headers, Some(key), value, 0L, 0L, 0L)) + .handle(ConsumerRecord(topic, partition, offset, headers, Some(key), value, 0L, 0L, 0L, "")) .forkDaemon } yield handling } @@ -451,7 +451,7 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics key <- bytes value <- bytes handling <- retryHandler - .handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L)) + .handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L, "")) .forkDaemon } yield handling } @@ -486,7 +486,7 @@ class RetryConsumerRecordHandlerTest extends BaseTest[TestClock with TestMetrics key <- bytes value <- bytes handling <- retryHandler - .handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L)) + .handle(ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L, "")) .forkDaemon } yield handling } diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/Maker.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/Maker.scala index 6b291d0e..1af6be7b 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/Maker.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/testkit/Maker.scala @@ -28,5 +28,5 @@ object Maker { topic <- randomTopicName key <- bytes value <- bytes - } yield new ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L) + } yield new ConsumerRecord(topic, partition, offset, Headers.Empty, Some(key), value, 0L, 0L, 0L, "") } From ad13e688856f94cb570661be7f3523e2d310e300 Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Tue, 20 Jun 2023 17:51:44 +0300 Subject: [PATCH 19/52] [greyhound] parallel consumer - add gaps limit (#35313) GitOrigin-RevId: af1fd4bede9f00146454b2f28657bc1c845ae682 --- .../greyhound/core/consumer/Dispatcher.scala | 99 +++++++++++++++---- .../greyhound/core/consumer/EventLoop.scala | 7 +- .../core/consumer/OffsetsAndGaps.scala | 3 + .../consumer/dispatcher/DispatcherTest.scala | 23 +++++ 4 files changed, 110 insertions(+), 22 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala index 45aeb3e3..d35b6788 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala @@ -56,6 +56,7 @@ object Dispatcher { maxParallelism: Int = 1, updateBatch: 
Chunk[Record] => URIO[GreyhoundMetrics, Unit] = _ => ZIO.unit, currentGaps: Set[TopicPartition] => ZIO[GreyhoundMetrics, Nothing, Map[TopicPartition, OffsetAndGaps]] = _ => ZIO.succeed(Map.empty), + gapsSizeLimit: Int = 256, init: Promise[Nothing, Unit] )(implicit trace: Trace): UIO[Dispatcher[R]] = for { @@ -85,8 +86,9 @@ object Dispatcher { submitResult <- if (allSamePartition) { val partition = RecordTopicPartition(records.head) for { - worker <- workerFor(partition, records.head.offset) - submitted <- worker.submitBatch(records) + worker <- workerFor(partition, records.head.offset) + currentGaps <- currentGaps(Set(partition)) + submitted <- worker.submitBatch(records, currentGaps) } yield submitted } else ZIO.succeed(SubmitBatchResult(success = false, Some(records.minBy(_.offset)))) @@ -172,6 +174,7 @@ object Dispatcher { consumerAttributes, consumeInParallel, maxParallelism, + gapsSizeLimit, updateBatch, currentGaps ) @@ -234,7 +237,7 @@ object Dispatcher { trait Worker { def submit(record: Record): URIO[Any, Boolean] - def submitBatch(records: Records): URIO[Any, SubmitBatchResult] + def submitBatch(records: Records, currentGaps: Map[TopicPartition, OffsetAndGaps]): URIO[Env, SubmitBatchResult] def expose: URIO[Any, WorkerExposedState] @@ -257,6 +260,7 @@ object Dispatcher { consumerAttributes: Map[String, String], consumeInParallel: Boolean, maxParallelism: Int, + gapsSizeLimit: Int, updateBatch: Chunk[Record] => URIO[GreyhoundMetrics, Unit] = _ => ZIO.unit, currentGaps: Set[TopicPartition] => ZIO[GreyhoundMetrics, Nothing, Map[TopicPartition, OffsetAndGaps]] )(implicit trace: Trace): URIO[R with Env, Worker] = for { @@ -295,23 +299,13 @@ object Dispatcher { ) override def submitBatch( - records: Records - ): URIO[Any, SubmitBatchResult] = - queue - .offerAll(records) - .tap(notInserted => - ZIO.when(notInserted.nonEmpty) { - Clock - .currentTime(TimeUnit.MILLISECONDS) - .flatMap(now => - internalState.update(s => if (s.reachedHighWatermarkSince.nonEmpty) s else s.reachedHighWatermark(now)).commit - ) - } - ) - .map(rejected => { - val isSuccess = rejected.isEmpty - SubmitBatchResult(isSuccess, if (isSuccess) None else Some(rejected.minBy(_.offset))) - }) + records: Records, + currentGaps: Map[TopicPartition, OffsetAndGaps] + ): URIO[Env, SubmitBatchResult] = { + val gapsSize = OffsetAndGaps.gapsSize(currentGaps) + if (gapsSize + records.size <= gapsSizeLimit) submitBatchToQueue(queue, records, internalState) + else submitBatchPartially(group, clientId, partition, consumerAttributes, gapsSizeLimit, queue, internalState, records, gapsSize) + } override def expose: URIO[Any, WorkerExposedState] = (queue.size zip internalState.get.commit) .flatMap { @@ -340,6 +334,61 @@ object Dispatcher { internalState.get.flatMap(state => STM.check(state.currentExecutionStarted.isEmpty)).commit } + private def submitBatchPartially[R]( + group: Group, + clientId: ClientId, + partition: TopicPartition, + consumerAttributes: Map[ClientId, ClientId], + gapsSizeLimit: Int, + queue: Queue[Record], + internalState: TRef[WorkerInternalState], + records: Records, + gapsSize: Int + ) = { + if (gapsSize == gapsSizeLimit) { // no records can be submitted + report( + DroppedRecordsDueToGapsSizeLimit(records.size, records.minBy(_.offset).offset, group, partition, clientId, consumerAttributes) + ) *> ZIO.succeed(SubmitBatchResult(success = false, firstRejected = Some(records.minBy(_.offset)))) + } else { + val sortedRecords = records.sortBy(_.offset) + val recordsToSubmit = sortedRecords.take(gapsSizeLimit - 
gapsSize) + val firstNotSubmitted = + sortedRecords + .take(gapsSizeLimit - gapsSize + 1) + .last // flow control in the calling function ensures this is safe, since records.size > gapsSizeLimit - gapsSize + report( + DroppedRecordsDueToGapsSizeLimit(recordsToSubmit.size, firstNotSubmitted.offset, group, partition, clientId, consumerAttributes) + ) *> + submitBatchToQueue(queue, recordsToSubmit, internalState).flatMap { + case SubmitBatchResult(true, _) => + ZIO.succeed(SubmitBatchResult(success = false, firstRejected = Some(firstNotSubmitted))) + case SubmitBatchResult(false, firstRejected) => + ZIO.succeed(SubmitBatchResult(success = false, firstRejected = firstRejected)) + } + } + } + + private def submitBatchToQueue[R]( + queue: Queue[Record], + records: Records, + internalState: TRef[WorkerInternalState] + ): URIO[Any, SubmitBatchResult] = + queue + .offerAll(records) + .tap(notInserted => + ZIO.when(notInserted.nonEmpty) { + Clock + .currentTime(TimeUnit.MILLISECONDS) + .flatMap(now => + internalState.update(s => if (s.reachedHighWatermarkSince.nonEmpty) s else s.reachedHighWatermark(now)).commit + ) + } + ) + .map(rejected => { + val isSuccess = rejected.isEmpty + SubmitBatchResult(isSuccess, if (isSuccess) None else Some(rejected.minBy(_.offset))) + }) + private def pollOnce[R]( state: Ref[DispatcherState], internalState: TRef[WorkerInternalState], @@ -579,6 +628,16 @@ object DispatcherMetric { clientId: ClientId, attributes: Map[String, String] ) extends DispatcherMetric + + case class DroppedRecordsDueToGapsSizeLimit( + numRecords: Int, + firstDroppedOffset: Long, + group: Group, + partition: TopicPartition, + clientId: ClientId, + attributes: Map[String, String] + ) extends DispatcherMetric + case class FailToUpdateCurrentExecutionStarted( record: Record, group: Group, diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala index f91efa61..afe474a5 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala @@ -62,6 +62,7 @@ object EventLoop { config.maxParallelism, updateBatch, currentGaps, + config.gapsSizeLimit, offsetsAndGapsInit ) positionsRef <- Ref.make(Map.empty[TopicPartition, Offset]) @@ -396,7 +397,8 @@ case class EventLoopConfig( delayResumeOfPausedPartition: Long, startPaused: Boolean, consumePartitionInParallel: Boolean, - maxParallelism: Int + maxParallelism: Int, + gapsSizeLimit: Int ) object EventLoopConfig { @@ -409,7 +411,8 @@ object EventLoopConfig { delayResumeOfPausedPartition = 0, startPaused = false, consumePartitionInParallel = false, - maxParallelism = 1 + maxParallelism = 1, + gapsSizeLimit = 256 // todo: calculate actual gaps limit based on the max metadata size ) } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala index 5dc24f0f..9255bcbb 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala @@ -165,4 +165,7 @@ object OffsetAndGaps { def apply(offset: Offset): OffsetAndGaps = OffsetAndGaps(offset, Seq.empty[Gap]) def apply(offset: Offset, committable: Boolean): OffsetAndGaps = OffsetAndGaps(offset, Seq.empty[Gap], committable) + + def gapsSize(gaps: Map[TopicPartition, 
OffsetAndGaps]): Int = + gaps.values.flatMap(_.gaps).map(_.size.toInt).sum } diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala index d2b8484f..88bc7d4f 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala @@ -139,6 +139,29 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { } yield result must beEqualTo(SubmitResult.RejectedBatch(record.copy(offset = 5L)))) } + "reject records and return first rejected when gaps limit is reached" in + new ctx(highWatermark = 20) { + val gapsSizeLimit = 5 + run(for { + ref <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) + init <- getInit + dispatcher <- + Dispatcher + .make[Any]( + "group", + "clientId", + _ => ZIO.never, + lowWatermark, + highWatermark, + workersShutdownRef = ref, + init = init, + gapsSizeLimit = gapsSizeLimit + ) + records = (0 until 7).map(i => record.copy(offset = i.toLong)) + result <- submitBatch(dispatcher, records) + } yield result must beEqualTo(SubmitResult.RejectedBatch(record.copy(offset = 5L)))) + } + "resume paused partitions" in new ctx(lowWatermark = 3, highWatermark = 7) { run( From 6e2186aa2ec6dbff3601525357af80316187734a Mon Sep 17 00:00:00 2001 From: Leon Burdinov Date: Wed, 21 Jun 2023 15:12:05 +0300 Subject: [PATCH 20/52] Fixes in preparation for ZIO 2.0.15 (#35320) * batch event loop fix * fix worker timeout interruption. cleanup code GitOrigin-RevId: 31023194ccc982dfe8720620bfbfe26dc274918c --- .../core/parallel/ParallelConsumerIT.scala | 70 +++++++++---------- .../greyhound/core/consumer/Dispatcher.scala | 52 ++++++++------ .../consumer/batched/BatchEventLoop.scala | 29 ++++---- .../consumer/batched/BatchEventLoopTest.scala | 29 ++++---- 4 files changed, 96 insertions(+), 84 deletions(-) diff --git a/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala b/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala index 6ef6de44..e9651995 100644 --- a/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala +++ b/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala @@ -104,49 +104,46 @@ class ParallelConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { fastMessages = allMessages - 1 drainTimeout = 5.seconds - keyWithSlowHandling = "slow-key" - numProcessedMessges <- Ref.make[Int](0) - fastMessagesLatch <- CountDownLatch.make(fastMessages) + numProcessedMessages <- Ref.make[Int](0) + fastMessagesLatch <- CountDownLatch.make(fastMessages) randomKeys <- ZIO.foreach(1 to fastMessages)(i => randomKey(i.toString)).map(_.toSeq) fastRecords = randomKeys.map { key => recordWithKey(topic, key, partition) } - slowRecord = recordWithKey(topic, keyWithSlowHandling, partition) + slowRecord = recordWithoutKey(topic, partition) finishRebalance <- Promise.make[Nothing, Unit] // handler that sleeps only on the slow key - handler = RecordHandler { cr: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => - (cr.key match { - case Some(k) if k == Chunk.fromArray(keyWithSlowHandling.getBytes) => - // make sure the handler doesn't finish before the rebalance is done, including drain timeout - finishRebalance.await *> ZIO.sleep(drainTimeout + 1.second) - case _ => fastMessagesLatch.countDown - }) *> 
numProcessedMessges.update(_ + 1) - } - _ <- - for { - consumer <- makeParallelConsumer(handler, kafka, topic, group, cId, drainTimeout = drainTimeout, startPaused = true) - _ <- produceRecords(producer, Seq(slowRecord)) - _ <- produceRecords(producer, fastRecords) - _ <- ZIO.sleep(2.seconds) - // produce is done synchronously to make sure all records are produced before consumer starts, so all records are polled at once - _ <- consumer.resume - _ <- fastMessagesLatch.await - _ <- ZIO.sleep(3.second) // sleep to ensure commit is done before rebalance - // start another consumer to trigger a rebalance before slow handler is done - _ <- makeParallelConsumer( - handler, - kafka, - topic, - group, - cId, - drainTimeout = drainTimeout, - onAssigned = _ => finishRebalance.succeed() - ) - } yield () - - _ <- eventuallyZ(numProcessedMessges.get, 25.seconds)(_ == allMessages) + handler = RecordHandler { cr: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => + (cr.key match { + case Some(_) => + fastMessagesLatch.countDown + case None => + // make sure the handler doesn't finish before the rebalance is done, including drain timeout + finishRebalance.await *> ZIO.sleep(drainTimeout + 5.second) + }) *> numProcessedMessages.update(_ + 1) + } + consumer <- makeParallelConsumer(handler, kafka, topic, group, cId, drainTimeout = drainTimeout, startPaused = true) + _ <- produceRecords(producer, Seq(slowRecord)) + _ <- produceRecords(producer, fastRecords) + _ <- ZIO.sleep(2.seconds) + // produce is done synchronously to make sure all records are produced before consumer starts, so all records are polled at once + _ <- consumer.resume + _ <- fastMessagesLatch.await + _ <- ZIO.sleep(3.second) // sleep to ensure commit is done before rebalance + // start another consumer to trigger a rebalance before slow handler is done + _ <- makeParallelConsumer( + handler, + kafka, + topic, + group, + cId, + drainTimeout = drainTimeout, + onAssigned = _ => finishRebalance.succeed() + ) + + _ <- eventuallyZ(numProcessedMessages.get, 25.seconds)(_ == allMessages) } yield { ok } @@ -319,6 +316,9 @@ class ParallelConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { private def recordWithKey(topic: String, key: String, partition: Int) = ProducerRecord(topic, "", Some(key), partition = Some(partition)) + private def recordWithoutKey(topic: String, partition: Int) = + ProducerRecord(topic, "", None, partition = Some(partition)) + private def randomKey(prefix: String) = randomId.map(r => s"$prefix-$r") } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala index d35b6788..44441099 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala @@ -196,15 +196,18 @@ object Dispatcher { ZIO .foreachParDiscard(workers) { case (partition, worker) => - report(StoppingWorker(group, clientId, partition, drainTimeout.toMillis, consumerAttributes)) *> - workersShutdownRef.get.flatMap(_.get(partition).fold(ZIO.unit)(promise => promise.onShutdown.shuttingDown)) *> - worker.shutdown - .catchSomeCause { - case _: Cause[InterruptedException] => ZIO.unit - } // happens on revoke - must not fail on it so we have visibility to worker completion - .timed - .map(_._1) - .flatMap(duration => report(WorkerStopped(group, clientId, partition, duration.toMillis, consumerAttributes))) + for { + _ <- report(StoppingWorker(group, 
clientId, partition, drainTimeout.toMillis, consumerAttributes)) + workersShutdownMap <- workersShutdownRef.get + _ <- workersShutdownMap.get(partition).fold(ZIO.unit)(promise => promise.onShutdown.shuttingDown) + duration <- worker.shutdown + .catchSomeCause { + case _: Cause[InterruptedException] => ZIO.unit + } // happens on revoke - must not fail on it so we have visibility to worker completion + .timed + .map(_._1) + _ <- report(WorkerStopped(group, clientId, partition, duration.toMillis, consumerAttributes)) + } yield () } .resurrect .ignore @@ -324,7 +327,7 @@ object Dispatcher { override def shutdown: URIO[Any, Unit] = for { _ <- internalState.update(_.shutdown).commit - timeout <- fiber.join.ignore.disconnect.timeout(drainTimeout) + timeout <- fiber.join.ignore.interruptible.timeout(drainTimeout) _ <- ZIO.when(timeout.isEmpty)(fiber.interruptFork) } yield () @@ -404,19 +407,26 @@ object Dispatcher { case DispatcherState.Running => queue.poll.flatMap { case Some(record) => - report(TookRecordFromQueue(record, group, clientId, consumerAttributes)) *> - ZIO - .attempt(currentTimeMillis()) - .flatMap(t => internalState.updateAndGet(_.startedWith(t)).commit) - .tapBoth( - e => report(FailToUpdateCurrentExecutionStarted(record, group, clientId, consumerAttributes, e)), - t => report(CurrentExecutionStartedEvent(partition, group, clientId, t.currentExecutionStarted)) - ) *> handle(record).interruptible.ignore *> isActive(internalState) - case None => isActive(internalState).delay(5.millis) + for { + _ <- report(TookRecordFromQueue(record, group, clientId, consumerAttributes)) + clock <- ZIO.clock + executionStartTime <- clock.currentTime(TimeUnit.MILLISECONDS) + _ <- internalState + .updateAndGet(_.startedWith(executionStartTime)) + .commit + _ <- report(CurrentExecutionStartedEvent(partition, group, clientId, Some(executionStartTime))) + _ <- handle(record).interruptible.ignore + active <- isActive(internalState) + } yield active + case None => + isActive(internalState).delay(5.millis) } case DispatcherState.Paused(resume) => - report(WorkerWaitingForResume(group, clientId, partition, consumerAttributes)) *> resume.await.timeout(30.seconds) *> - isActive(internalState) + for { + _ <- report(WorkerWaitingForResume(group, clientId, partition, consumerAttributes)) + _ <- resume.await.timeout(30.seconds) + active <- isActive(internalState) + } yield active case DispatcherState.ShuttingDown => ZIO.succeed(false) } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchEventLoop.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchEventLoop.scala index 4c453446..02c3564c 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchEventLoop.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchEventLoop.scala @@ -99,22 +99,21 @@ private[greyhound] class BatchEventLoopImpl[R]( ) ) - private def pollAndHandle()(implicit trace: Trace): URIO[R, Unit] = for { - _ <- pauseAndResume().provide(ZLayer.succeed(capturedR)) - records <- - consumer - .poll(config.fetchTimeout) - .provide(ZLayer.succeed(capturedR)) - .catchAll(_ => ZIO.succeed(Nil)) - .flatMap(records => seekRequests.get.map(seeks => records.filterNot(record => seeks.keys.toSet.contains(record.topicPartition)))) - _ <- handleRecords(records).timed - .tap { case (duration, _) => report(FullBatchHandled(clientId, group, records.toSeq, duration, consumerAttributes)) } + private def pollAndHandle()(implicit trace: Trace): 
URIO[R with GreyhoundMetrics, Unit] = for { + _ <- pauseAndResume().ignore + allRecords <- consumer + .poll(config.fetchTimeout) + .catchAll(_ => ZIO.succeed(Nil)) + seeks <- seekRequests.get.map(_.keySet) + records = allRecords.filterNot(record => seeks.contains(record.topicPartition)) + _ <- handleRecords(records).timed + .tap { case (duration, _) => report(FullBatchHandled(clientId, group, records.toSeq, duration, consumerAttributes)) } } yield () private def pauseAndResume()(implicit trace: Trace) = for { pr <- elState.shouldPauseAndResume() - _ <- ZIO.when(pr.toPause.nonEmpty)((consumer.pause(pr.toPause) *> elState.partitionsPaused(pr.toPause)).ignore) - _ <- ZIO.when(pr.toResume.nonEmpty)((consumer.resume(pr.toResume) *> elState.partitionsResumed(pr.toResume)).ignore) + _ <- ZIO.when(pr.toPause.nonEmpty)(consumer.pause(pr.toPause) *> elState.partitionsPaused(pr.toPause)) + _ <- ZIO.when(pr.toResume.nonEmpty)(consumer.resume(pr.toResume) *> elState.partitionsResumed(pr.toResume)) } yield () private def handleRecords(polled: Records)(implicit trace: Trace): ZIO[R, Nothing, Unit] = { @@ -512,10 +511,10 @@ private[greyhound] class BatchEventLoopState( partitionsPaused(pauseResume.toPause) *> partitionsResumed(pauseResume.toResume) def shouldPauseAndResume[R]()(implicit trace: Trace): URIO[R, PauseResume] = for { - pending <- allPending + pending <- allPending.map(_.keySet) paused <- pausedPartitions - toPause = pending.keySet -- paused - toResume = paused -- pending.keySet + toPause = pending -- paused + toResume = paused -- pending } yield PauseResume(toPause, toResume) def appendPending(records: Consumer.Records)(implicit trace: Trace): UIO[Unit] = { diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchEventLoopTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchEventLoopTest.scala index bb1144cc..fda607f6 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchEventLoopTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchEventLoopTest.scala @@ -52,7 +52,7 @@ class BatchEventLoopTest extends JUnitRunnableSpec { ZIO.scoped(BatchEventLoop.make(group, ConsumerSubscription.topics(topics: _*), consumer, handler, clientId, retry).flatMap { loop => for { - _ <- ZIO.succeed(println(s"Should not retry for retry: $retry, cause: $cause")) + _ <- ZIO.debug(s"Should not retry for retry: $retry, cause: $cause") _ <- givenHandleError(failOnPartition(0, cause)) _ <- givenRecords(consumerRecords) handledRecords <- handled.await(_.nonEmpty) @@ -79,7 +79,7 @@ class BatchEventLoopTest extends JUnitRunnableSpec { ZIO.scoped(BatchEventLoop.make(group, ConsumerSubscription.topics(topics: _*), consumer, handler, clientId, Some(retry)).flatMap { loop => for { - _ <- ZIO.succeed(println(s"Should retry for cause: $cause")) + _ <- ZIO.debug(s"Should retry for cause: $cause") _ <- givenHandleError(failOnPartition(0, cause)) _ <- givenRecords(consumerRecords) handled1 <- handled.await(_.nonEmpty) @@ -153,13 +153,14 @@ class BatchEventLoopTest extends JUnitRunnableSpec { val consumer = new EmptyConsumer { override def poll(timeout: Duration)(implicit trace: Trace): Task[Records] = - queue.take + queue.take.interruptible .timeout(timeout) .map(_.getOrElse(Iterable.empty)) - .tap(r => ZIO.succeed(println(s"poll($timeout): $r"))) + .tap(r => ZIO.debug(s"poll($timeout): $r")) + override def commit(offsets: Map[TopicPartition, Offset])(implicit trace: Trace): Task[Unit] = { - 
ZIO.succeed(println(s"commit($offsets)")) *> committedOffsetsRef.update(_ ++ offsets) + ZIO.debug(s"commit($offsets)") *> committedOffsetsRef.update(_ ++ offsets) } override def commitWithMetadata(offsetsAndMetadata: Map[TopicPartition, OffsetAndMetadata])( @@ -190,14 +191,16 @@ class BatchEventLoopTest extends JUnitRunnableSpec { } } - val handler = new BatchRecordHandler[Any, Throwable, Chunk[Byte], Chunk[Byte]] { - override def handle(records: RecordBatch): ZIO[Any, HandleError[Throwable], Any] = { - ZIO.succeed(println(s"handle($records)")) *> - (handlerErrorsRef.get.flatMap(he => he(records.records).fold(ZIO.unit: IO[HandleError[Throwable], Unit])(ZIO.failCause(_))) *> - handled.update(_ :+ records.records)) - .tapErrorCause(e => ZIO.succeed(println(s"handle failed with $e, records: $records"))) - .tap(_ => ZIO.succeed(println(s"handled $records"))) - } + val handler = new BatchRecordHandler[Any, Throwable, Chunk[Byte], Chunk[Byte]] { + override def handle(records: RecordBatch): ZIO[Any, HandleError[Throwable], Any] = for { + _ <- ZIO.debug(s"handle($records)") + he <- handlerErrorsRef.get + _ <- he(records.records).fold(ZIO.unit: IO[HandleError[Throwable], Unit])(ZIO.failCause(_)) + _ <- handled + .update(_ :+ records.records) + .tapErrorCause(e => ZIO.debug(s"handle failed with $e, records: $records")) + .tap(_ => ZIO.debug(s"handled $records")) + } yield () } def givenRecords(records: Seq[Consumer.Record]) = queue.offer(records) From 5561154fa3bc71ee68121508ada89255a9d9ab3e Mon Sep 17 00:00:00 2001 From: Noam Berman Date: Thu, 22 Jun 2023 18:46:02 +0300 Subject: [PATCH 21/52] [greyhound-consumer-proxy] start from latest offset when group doesn't exist (temporary until full config is done) (#35428) * [greyhound-consumer-proxy] start from latest offset when group doesn't exist (temporary until full config is done) #automerge #skipreview * . 
GitOrigin-RevId: 34b953997eee3d167bb2beb23d50045daf03460f --- .../greyhound/core/admin/AdminClient.scala | 44 +++++++++++++++---- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/admin/AdminClient.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/admin/AdminClient.scala index f5ac837f..cb7d7054 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/admin/AdminClient.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/admin/AdminClient.scala @@ -6,10 +6,10 @@ import com.wixpress.dst.greyhound.core.admin.AdminClient.isTopicExistsError import com.wixpress.dst.greyhound.core.admin.TopicPropertiesResult.{TopicDoesnExistException, TopicProperties} import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics import com.wixpress.dst.greyhound.core.zioutils.KafkaFutures._ -import com.wixpress.dst.greyhound.core.{CommonGreyhoundConfig, GHThrowable, Group, GroupTopicPartition, OffsetAndMetadata, Topic, TopicConfig, TopicPartition} +import com.wixpress.dst.greyhound.core.{CommonGreyhoundConfig, GHThrowable, Group, GroupTopicPartition, Offset, OffsetAndMetadata, Topic, TopicConfig, TopicPartition} import org.apache.kafka.clients.admin.AlterConfigOp.OpType import org.apache.kafka.clients.admin.ConfigEntry.ConfigSource -import org.apache.kafka.clients.admin.{AlterConfigOp, Config, ConfigEntry, ListConsumerGroupOffsetsOptions, ListConsumerGroupOffsetsSpec, NewPartitions, NewTopic, TopicDescription, AdminClient => KafkaAdminClient, AdminClientConfig => KafkaAdminClientConfig} +import org.apache.kafka.clients.admin.{AlterConfigOp, Config, ConfigEntry, ListConsumerGroupOffsetsOptions, ListConsumerGroupOffsetsSpec, ListOffsetsOptions, ListOffsetsResult, NewPartitions, NewTopic, OffsetSpec, TopicDescription, AdminClient => KafkaAdminClient, AdminClientConfig => KafkaAdminClientConfig} import org.apache.kafka.common.config.ConfigResource import org.apache.kafka.common.config.ConfigResource.Type.TOPIC import org.apache.kafka.common.errors.{InvalidTopicException, TopicExistsException, UnknownTopicOrPartitionException} @@ -27,6 +27,10 @@ trait AdminClient { def listTopics()(implicit trace: Trace): RIO[Any, Set[String]] + def listEndOffsets( + tps: Set[TopicPartition] + )(implicit trace: Trace): RIO[Any, Map[TopicPartition, Offset]] + def topicExists(topic: String)(implicit trace: Trace): RIO[Any, Boolean] def topicsExist(topics: Set[Topic])(implicit trace: Trace): ZIO[Any, Throwable, Map[Topic, Boolean]] @@ -40,7 +44,9 @@ trait AdminClient { def propertiesFor(topics: Set[Topic])(implicit trace: Trace): RIO[Any, Map[Topic, TopicPropertiesResult]] - def commit(group: Group, commits: Map[TopicPartition, OffsetAndMetadata])(implicit trace: Trace): ZIO[Any, Throwable, Unit] + def commit(group: Group, commits: Map[TopicPartition, OffsetAndMetadata])( + implicit trace: Trace + ): ZIO[Any, Throwable, Unit] def listGroups()(implicit trace: Trace): RIO[Any, Set[String]] @@ -216,6 +222,20 @@ object AdminClient { topics <- result.names().asZio } yield topics.asScala.toSet + override def listEndOffsets( + tps: Set[TopicPartition] + )(implicit trace: Trace): RIO[Any, Map[TopicPartition, Offset]] = { + val j: java.util.Map[org.apache.kafka.common.TopicPartition, OffsetSpec] = + tps.map { tp => (tp.asKafka, OffsetSpec.latest()) }.toMap.asJava + + for { + result <- attemptBlocking(client.listOffsets(j)) + results <- result.all.asZio.map(_.asScala.toMap.map { case (tp, offset) => + (TopicPartition.fromKafka(tp), offset.offset()) + }) 
+ } yield results + } + private def toNewTopic(config: TopicConfig): NewTopic = new NewTopic(config.name, config.partitions, config.replicationFactor.toShort) .configs(config.propertiesMap.asJava) @@ -225,9 +245,13 @@ object AdminClient { groups <- result.valid().asZio } yield groups.asScala.map(_.groupId()).toSet - override def commit(group: Group, commits: Map[TopicPartition, OffsetAndMetadata])(implicit trace: Trace): ZIO[Any, Throwable, Unit] = - attemptBlocking(client.alterConsumerGroupOffsets(group, - commits.map { case (tp, offset) => (tp.asKafka, offset.asKafka) }.asJava)).unit + override def commit(group: Group, commits: Map[TopicPartition, OffsetAndMetadata])( + implicit trace: Trace + ): ZIO[Any, Throwable, Unit] = + attemptBlocking( + client + .alterConsumerGroupOffsets(group, commits.map { case (tp, offset) => (tp.asKafka, offset.asKafka) }.asJava) + ).unit override def groupOffsetsSpecific( requestedTopicPartitions: Map[Group, Set[TopicPartition]] @@ -249,9 +273,13 @@ object AdminClient { rawOffsets = result.asScala.toMap.mapValues(_.asScala.toMap) offset = rawOffsets.map { case (group, offsets) => - offsets.map{case (tp, offset) => - (GroupTopicPartition(group, TopicPartition.fromKafka(tp)), PartitionOffset(Option(offset).map(_.offset()).getOrElse(0L))) + offsets.map { case (tp, offset) => + ( + GroupTopicPartition(group, TopicPartition.fromKafka(tp)), + PartitionOffset(Option(offset).map(_.offset()).getOrElse(-1L)) + ) } + .filter{case (_, o) => o.offset >= 0} } groupOffsets = offset.foldLeft(Map.empty[GroupTopicPartition, PartitionOffset])((x, y) => x ++ y) } yield groupOffsets From 11861471d029acfae7a3cfdddfb1bd1785735add Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Sun, 25 Jun 2023 18:17:27 +0300 Subject: [PATCH 22/52] [greyhound] parallel consumer - compression and encoding for gaps limit (#35423) GitOrigin-RevId: 10b761cb8fcd926178aebf586e99ed1a8829c734 --- .../greyhound/core/compression/BUILD.bazel | 14 +++++++ .../core/compression/GzipCompression.scala | 39 +++++++++++++++++++ .../dst/greyhound/core/consumer/BUILD.bazel | 1 + .../core/consumer/OffsetsAndGaps.scala | 24 +++++++----- .../greyhound/core/compression/BUILD.bazel | 15 +++++++ .../compression/GzipCompressionTest.scala | 11 ++++++ .../core/consumer/OffsetsAndGapsTest.scala | 11 +++--- 7 files changed, 101 insertions(+), 14 deletions(-) create mode 100644 core/src/main/scala/com/wixpress/dst/greyhound/core/compression/BUILD.bazel create mode 100644 core/src/main/scala/com/wixpress/dst/greyhound/core/compression/GzipCompression.scala create mode 100644 core/src/test/scala/com/wixpress/dst/greyhound/core/compression/BUILD.bazel create mode 100644 core/src/test/scala/com/wixpress/dst/greyhound/core/compression/GzipCompressionTest.scala diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/compression/BUILD.bazel b/core/src/main/scala/com/wixpress/dst/greyhound/core/compression/BUILD.bazel new file mode 100644 index 00000000..dedf42f3 --- /dev/null +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/compression/BUILD.bazel @@ -0,0 +1,14 @@ +package(default_visibility = ["//visibility:public"]) + +# visibility is extended to allow packaging a jar to deploy to maven central +sources(["//core:__subpackages__"]) + +scala_library( + name = "compression", + srcs = [ + ":sources", + ], + deps = [ + "@org_apache_commons_commons_compress", + ], +) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/compression/GzipCompression.scala 
b/core/src/main/scala/com/wixpress/dst/greyhound/core/compression/GzipCompression.scala new file mode 100644 index 00000000..cc8c1191 --- /dev/null +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/compression/GzipCompression.scala @@ -0,0 +1,39 @@ +package com.wixpress.dst.greyhound.core.compression + +import org.apache.commons.compress.utils.IOUtils + +import java.io.{ByteArrayInputStream, ByteArrayOutputStream} +import java.util.zip.{GZIPInputStream, GZIPOutputStream} +import scala.util.Try + +object GzipCompression { + def compress(input: Array[Byte]): Array[Byte] = { + val bos = new ByteArrayOutputStream(input.length) + val gzip = new GZIPOutputStream(bos) + gzip.write(input) + gzip.close() + val compressed = bos.toByteArray + bos.close() + compressed + } + + def decompress(compressed: Array[Byte]): Option[Array[Byte]] = { + val byteStream = new ByteArrayInputStream(compressed) + Try(new GZIPInputStream(byteStream)) + .flatMap(gzipStream => + Try { + val result = IOUtils.toByteArray(gzipStream) + gzipStream.close() + byteStream.close() + result + }.recover { + case e: Throwable => + Try(gzipStream.close()) + Try(byteStream.close()) + e.printStackTrace() + throw e + } + ) + .toOption + } +} diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/BUILD.bazel b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/BUILD.bazel index 007b00c7..584724fb 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/BUILD.bazel +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/BUILD.bazel @@ -14,6 +14,7 @@ scala_library( "@dev_zio_zio_stacktracer_2_12", "//core/src/main/scala/com/wixpress/dst/greyhound/core", "//core/src/main/scala/com/wixpress/dst/greyhound/core/admin", + "//core/src/main/scala/com/wixpress/dst/greyhound/core/compression", "//core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/domain", "//core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry", "//core/src/main/scala/com/wixpress/dst/greyhound/core/metrics", diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala index 9255bcbb..d6070e9d 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala @@ -1,11 +1,14 @@ package com.wixpress.dst.greyhound.core.consumer +import com.wixpress.dst.greyhound.core.compression.GzipCompression import com.wixpress.dst.greyhound.core.consumer.Gap.GAP_SEPARATOR import com.wixpress.dst.greyhound.core.consumer.OffsetAndGaps.{GAPS_STRING_SEPARATOR, LAST_HANDLED_OFFSET_SEPARATOR} import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, RecordTopicPartition} import com.wixpress.dst.greyhound.core.{Offset, OffsetAndMetadata, TopicPartition} import zio._ +import java.util.Base64 + trait OffsetsAndGaps { def init(committedOffsets: Map[TopicPartition, OffsetAndGaps]): UIO[Unit] @@ -104,11 +107,15 @@ object OffsetsAndGaps { } def toOffsetsAndMetadata(offsetsAndGaps: Map[TopicPartition, OffsetAndGaps]): Map[TopicPartition, OffsetAndMetadata] = - offsetsAndGaps.mapValues(offsetAndGaps => - OffsetAndMetadata(offsetAndGaps.offset, offsetAndGaps.gapsString) - ) // todo: add encoding and compression to plain gaps string - - def parseGapsString(offsetAndGapsString: String): Option[OffsetAndGaps] = { + offsetsAndGaps.mapValues(offsetAndGaps => 
OffsetAndMetadata(offsetAndGaps.offset, offsetAndGaps.gapsString)) + + def parseGapsString(rawOffsetAndGapsString: String): Option[OffsetAndGaps] = { + val offsetAndGapsString = + if (rawOffsetAndGapsString.nonEmpty) + new String( + GzipCompression.decompress(Base64.getDecoder.decode(rawOffsetAndGapsString)).getOrElse(Array.empty) + ) + else "" val lastHandledOffsetSeparatorIndex = offsetAndGapsString.indexOf(LAST_HANDLED_OFFSET_SEPARATOR) if (lastHandledOffsetSeparatorIndex < 0) None @@ -152,9 +159,8 @@ case class OffsetAndGaps(offset: Offset, gaps: Seq[Gap], committable: Boolean = def markCommitted: OffsetAndGaps = copy(committable = false) def gapsString: String = { - if (gaps.isEmpty) "" - else - s"${offset.toString}${LAST_HANDLED_OFFSET_SEPARATOR}${gaps.sortBy(_.start).mkString(GAPS_STRING_SEPARATOR)}" + val plainGapsString = s"${offset.toString}${LAST_HANDLED_OFFSET_SEPARATOR}${gaps.sortBy(_.start).mkString(GAPS_STRING_SEPARATOR)}" + Base64.getEncoder.encodeToString(GzipCompression.compress(plainGapsString.getBytes())) } } @@ -167,5 +173,5 @@ object OffsetAndGaps { def apply(offset: Offset, committable: Boolean): OffsetAndGaps = OffsetAndGaps(offset, Seq.empty[Gap], committable) def gapsSize(gaps: Map[TopicPartition, OffsetAndGaps]): Int = - gaps.values.flatMap(_.gaps).map(_.size.toInt).sum + gaps.values.map(_.gaps.size).sum } diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/compression/BUILD.bazel b/core/src/test/scala/com/wixpress/dst/greyhound/core/compression/BUILD.bazel new file mode 100644 index 00000000..5447e093 --- /dev/null +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/compression/BUILD.bazel @@ -0,0 +1,15 @@ +package(default_visibility = ["//visibility:public"]) + +sources() + +specs2_unit_test( + name = "compression", + srcs = [ + ":sources", + ], + deps = [ + "//core/src/main/scala/com/wixpress/dst/greyhound/core", + "//core/src/main/scala/com/wixpress/dst/greyhound/core/compression", + "//core/src/test/scala/com/wixpress/dst/greyhound/core/testkit", + ], +) diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/compression/GzipCompressionTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/compression/GzipCompressionTest.scala new file mode 100644 index 00000000..f8409feb --- /dev/null +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/compression/GzipCompressionTest.scala @@ -0,0 +1,11 @@ +package com.wixpress.dst.greyhound.core.compression + +import com.wixpress.dst.greyhound.core.testkit.BaseTestNoEnv + +class GzipCompressionTest extends BaseTestNoEnv { + "GZIPCompressor" should { + "return None for bad input" in { + GzipCompression.decompress("not a gzip".toCharArray.map(_.toByte)) must beNone + } + } +} diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGapsTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGapsTest.scala index 2a3837ca..f21c194b 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGapsTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGapsTest.scala @@ -70,15 +70,16 @@ class OffsetsAndGapsTestGapsTest extends BaseTestNoEnv { } yield current must havePairs(partition0 -> OffsetAndGaps(102L, Seq()), partition1 -> OffsetAndGaps(204L, Seq(Gap(201L, 202L)))) } - "parse gaps from string" in { - val gaps = Seq(s"10${LAST_HANDLED_OFFSET_SEPARATOR}0${GAP_SEPARATOR}1", s"10${LAST_HANDLED_OFFSET_SEPARATOR}", "") - val expected = Seq(Some(OffsetAndGaps(10, Seq(Gap(0, 
1)))), Some(OffsetAndGaps(10, Seq())), None) - gaps.map(OffsetsAndGaps.parseGapsString).must(beEqualTo(expected)) + "parse offsets and gaps correctly" in { + val offsetsAndGaps = Seq(OffsetAndGaps(10, Seq(Gap(0, 1))), OffsetAndGaps(10, Seq())) + val gapsStringsToTest = offsetsAndGaps.map(_.gapsString) ++ Seq("") // use gapsString method to get compressed and encoded strings + val expected = Seq(Some(OffsetAndGaps(10, Seq(Gap(0, 1)))), Some(OffsetAndGaps(10, Seq())), None) + gapsStringsToTest.map(OffsetsAndGaps.parseGapsString).must(beEqualTo(expected)) } } object OffsetGapsTest { - val topic = "some-topic" + val topic = "some-topic" val topicPartition = TopicPartition(topic, 0) } From e1373a5f23da4055ed3462155dfeeea81e45b0ef Mon Sep 17 00:00:00 2001 From: Natan Silnitsky Date: Thu, 29 Jun 2023 19:28:01 +0300 Subject: [PATCH 23/52] greyhound proxy non blocking retries (#35456) * greyhound proxy retry: API + failing test + gh side * extract and unit test current state * remainder * fix UT * fix ITs * try with increasing eventually GitOrigin-RevId: e60cbb04d6d6b603e22954bbe4a8658ef594712a --- .../core/consumer/retry/RetryAttempt.scala | 34 ++++++++++--------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala index 9de958e8..6b740690 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala @@ -28,6 +28,23 @@ object RetryHeader { val RetryAttempt = "GH_RetryAttempt" } +case class RetryAttemptHeaders( + originalTopic: Option[Topic], + attempt: Option[RetryAttemptNumber], + submittedAt: Option[Instant], + backoff: Option[Duration] +) + +object RetryAttemptHeaders { + def fromHeaders(headers: Headers): Task[RetryAttemptHeaders] = + for { + submitted <- headers.get(RetryHeader.Submitted, instantDeserializer) + backoff <- headers.get(RetryHeader.Backoff, durationDeserializer) + topic <- headers.get[String](RetryHeader.OriginalTopic, StringSerde) + attempt <- headers.get(RetryHeader.RetryAttempt, longDeserializer) + } yield RetryAttemptHeaders(topic, attempt.map(_.toInt), submitted, backoff) +} + object RetryAttempt { type RetryAttemptNumber = Int @@ -40,21 +57,6 @@ object RetryAttempt { RetryHeader.RetryAttempt -> toChunk(attempt.attempt.toString) ) - private case class RetryAttemptHeaders( - originalTopic: Option[Topic], - attempt: Option[RetryAttemptNumber], - submittedAt: Option[Instant], - backoff: Option[Duration] - ) - - private def fromHeaders(headers: Headers): Task[RetryAttemptHeaders] = - for { - submitted <- headers.get(RetryHeader.Submitted, instantDeserializer) - backoff <- headers.get(RetryHeader.Backoff, durationDeserializer) - topic <- headers.get[String](RetryHeader.OriginalTopic, StringSerde) - attempt <- headers.get(RetryHeader.RetryAttempt, longDeserializer) - } yield RetryAttemptHeaders(topic, attempt.map(_.toInt), submitted, backoff) - /** @return None on infinite blocking retries */ def maxBlockingAttempts(topic: Topic, retryConfig: Option[RetryConfig]): Option[Int] = retryConfig.map(_.blockingBackoffs(topic)).fold(Option(0)) { @@ -92,6 +94,6 @@ object RetryAttempt { attempt <- hs.attempt } yield RetryAttempt(originalTopic, attempt, submitted, backoff) - fromHeaders(headers).map { hs => maybeNonBlockingAttempt(hs) orElse maybeBlockingAttempt(hs) } + 
RetryAttemptHeaders.fromHeaders(headers).map { hs => maybeNonBlockingAttempt(hs) orElse maybeBlockingAttempt(hs) } }.catchAll(_ => ZIO.none) } From 797de70a68794ab23405d4d90ed2254c882576e6 Mon Sep 17 00:00:00 2001 From: Noam Berman Date: Sun, 2 Jul 2023 20:22:43 +0300 Subject: [PATCH 24/52] new kafka monitor server for proxy (currently) (#35598) * new kafka monitor server for proxy (currently) #automerge #skipreview * . * . * . GitOrigin-RevId: 156ff1cbde2b11cfb35259600de00b47e1012367 --- .../greyhound/core/admin/AdminClient.scala | 66 ++++++++++++------- 1 file changed, 44 insertions(+), 22 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/admin/AdminClient.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/admin/AdminClient.scala index cb7d7054..17ab8b07 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/admin/AdminClient.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/admin/AdminClient.scala @@ -1,26 +1,26 @@ package com.wixpress.dst.greyhound.core.admin -import java.util import com.wixpress.dst.greyhound.core import com.wixpress.dst.greyhound.core.admin.AdminClient.isTopicExistsError +import com.wixpress.dst.greyhound.core.admin.AdminClientMetric.TopicCreateResult.fromExit +import com.wixpress.dst.greyhound.core.admin.AdminClientMetric.{TopicConfigUpdated, TopicCreated, TopicPartitionsIncreased} import com.wixpress.dst.greyhound.core.admin.TopicPropertiesResult.{TopicDoesnExistException, TopicProperties} import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics +import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics._ import com.wixpress.dst.greyhound.core.zioutils.KafkaFutures._ -import com.wixpress.dst.greyhound.core.{CommonGreyhoundConfig, GHThrowable, Group, GroupTopicPartition, Offset, OffsetAndMetadata, Topic, TopicConfig, TopicPartition} +import com.wixpress.dst.greyhound.core._ import org.apache.kafka.clients.admin.AlterConfigOp.OpType import org.apache.kafka.clients.admin.ConfigEntry.ConfigSource -import org.apache.kafka.clients.admin.{AlterConfigOp, Config, ConfigEntry, ListConsumerGroupOffsetsOptions, ListConsumerGroupOffsetsSpec, ListOffsetsOptions, ListOffsetsResult, NewPartitions, NewTopic, OffsetSpec, TopicDescription, AdminClient => KafkaAdminClient, AdminClientConfig => KafkaAdminClientConfig} +import org.apache.kafka.clients.admin.{AlterConfigOp, Config, ConfigEntry, ListConsumerGroupOffsetsOptions, ListConsumerGroupOffsetsSpec, NewPartitions, NewTopic, OffsetSpec, AdminClient => KafkaAdminClient, AdminClientConfig => KafkaAdminClientConfig} +import org.apache.kafka.common import org.apache.kafka.common.config.ConfigResource import org.apache.kafka.common.config.ConfigResource.Type.TOPIC import org.apache.kafka.common.errors.{InvalidTopicException, TopicExistsException, UnknownTopicOrPartitionException} +import zio.ZIO.attemptBlocking import zio.{IO, RIO, Scope, Trace, ZIO} -import GreyhoundMetrics._ -import com.wixpress.dst.greyhound.core.admin.AdminClientMetric.TopicCreateResult.fromExit -import com.wixpress.dst.greyhound.core.admin.AdminClientMetric.{TopicConfigUpdated, TopicCreated, TopicPartitionsIncreased} -import org.apache.kafka.common +import java.util import scala.collection.JavaConverters._ -import zio.ZIO.attemptBlocking trait AdminClient { def shutdown(implicit trace: Trace): RIO[Any, Unit] @@ -56,8 +56,6 @@ trait AdminClient { implicit trace: Trace ): RIO[Any, Map[GroupTopicPartition, PartitionOffset]] -// def groupOffsetsSpecific(requestedTopicPartitions: Map[Group, 
Set[TopicPartition]])(implicit trace: Trace): RIO[Any, Map[GroupTopicPartition, PartitionOffset]] - def groupState(groups: Set[Group])(implicit trace: Trace): RIO[Any, Map[String, GroupState]] def deleteTopic(topic: Topic)(implicit trace: Trace): RIO[Any, Unit] @@ -68,13 +66,12 @@ trait AdminClient { implicit trace: Trace ): RIO[Any, Map[TopicPartition, OffsetAndMetadata]] + def consumerGroupsOffsets( + groups: Map[Group, Option[Set[TopicPartition]]] + )(implicit trace: Trace): RIO[Any, Map[Group, Map[TopicPartition, OffsetAndMetadata]]] + def increasePartitions(topic: Topic, newCount: Int)(implicit trace: Trace): RIO[Any with GreyhoundMetrics, Unit] - /** - * @param useNonIncrementalAlter - * \- [[org.apache.kafka.clients.admin.AdminClient.incrementalAlterConfigs()]] is not supported by older brokers (< 2.3), so if this is - * true, use the deprecated non incremental alter - */ def updateTopicConfigProperties( topic: Topic, configProperties: Map[String, ConfigPropOp], @@ -273,13 +270,14 @@ object AdminClient { rawOffsets = result.asScala.toMap.mapValues(_.asScala.toMap) offset = rawOffsets.map { case (group, offsets) => - offsets.map { case (tp, offset) => - ( - GroupTopicPartition(group, TopicPartition.fromKafka(tp)), - PartitionOffset(Option(offset).map(_.offset()).getOrElse(-1L)) - ) - } - .filter{case (_, o) => o.offset >= 0} + offsets + .map { case (tp, offset) => + ( + GroupTopicPartition(group, TopicPartition.fromKafka(tp)), + PartitionOffset(Option(offset).map(_.offset()).getOrElse(-1L)) + ) + } + .filter { case (_, o) => o.offset >= 0 } } groupOffsets = offset.foldLeft(Map.empty[GroupTopicPartition, PartitionOffset])((x, y) => x ++ y) } yield groupOffsets @@ -346,6 +344,30 @@ object AdminClient { } yield res.asScala.toMap.map { case (tp, om) => (TopicPartition(tp), OffsetAndMetadata(om)) } } + override def consumerGroupsOffsets( + groups: Map[Group, Option[Set[TopicPartition]]] + )(implicit trace: Trace): RIO[Any, Map[Group, Map[TopicPartition, OffsetAndMetadata]]] = + for { + desc <- attemptBlocking( + client + .listConsumerGroupOffsets( + groups + .mapValues(tps => + new ListConsumerGroupOffsetsSpec().topicPartitions(tps.map(_.map(_.asKafka).toList.asJava).orNull) + ) + .asJava + ) + ) + res <- attemptBlocking(groups.map(g => (g._1, desc.partitionsToOffsetAndMetadata(g._1).get()))) + } yield res.map { case (group, o) => + ( + group, + o.asScala.toSeq + .map(om => (TopicPartition.fromKafka(om._1), OffsetAndMetadata(om._2.offset(), om._2.metadata()))) + .toMap + ) + } + override def increasePartitions(topic: Topic, newCount: Int)( implicit trace: Trace ): RIO[GreyhoundMetrics, Unit] = { From d7f277d0ab0e38d6bd5a14d370b7f648c43f4b06 Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Thu, 6 Jul 2023 11:54:45 +0300 Subject: [PATCH 25/52] [greyhound] fix OffsetsInitializer metadata bug (#35684) GitOrigin-RevId: d9758275c502278ed9fd845b3781cd92b7152f8a --- .../core/consumer/OffsetsAndGaps.scala | 17 ++++++++----- .../core/consumer/OffsetsInitializer.scala | 8 ++----- .../consumer/OffsetsInitializerTest.scala | 24 +++++++++++++++++++ 3 files changed, 37 insertions(+), 12 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala index d6070e9d..a062ccd5 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala +++ 
b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala @@ -8,6 +8,7 @@ import com.wixpress.dst.greyhound.core.{Offset, OffsetAndMetadata, TopicPartitio import zio._ import java.util.Base64 +import scala.util.Try trait OffsetsAndGaps { def init(committedOffsets: Map[TopicPartition, OffsetAndGaps]): UIO[Unit] @@ -111,11 +112,9 @@ object OffsetsAndGaps { def parseGapsString(rawOffsetAndGapsString: String): Option[OffsetAndGaps] = { val offsetAndGapsString = - if (rawOffsetAndGapsString.nonEmpty) - new String( - GzipCompression.decompress(Base64.getDecoder.decode(rawOffsetAndGapsString)).getOrElse(Array.empty) - ) - else "" + if (rawOffsetAndGapsString.nonEmpty) { + Try(new String(GzipCompression.decompress(Base64.getDecoder.decode(rawOffsetAndGapsString)).getOrElse(Array.empty))).getOrElse("") + } else "" val lastHandledOffsetSeparatorIndex = offsetAndGapsString.indexOf(LAST_HANDLED_OFFSET_SEPARATOR) if (lastHandledOffsetSeparatorIndex < 0) None @@ -132,13 +131,19 @@ object OffsetsAndGaps { } } - def firstGapOffset(gapsString: String): Option[Offset] = { + private def firstGapOffset(gapsString: String): Option[Offset] = { val maybeOffsetAndGaps = parseGapsString(gapsString) maybeOffsetAndGaps match { case Some(offsetAndGaps) if offsetAndGaps.gaps.nonEmpty => Some(offsetAndGaps.gaps.minBy(_.start).start) case _ => None } } + + def gapsSmallestOffsets(offsets: Map[TopicPartition, Option[OffsetAndMetadata]]): Map[TopicPartition, OffsetAndMetadata] = + offsets + .collect { case (tp, Some(om)) => tp -> om } + .map(tpom => tpom._1 -> (firstGapOffset(tpom._2.metadata), tpom._2.metadata)) + .collect { case (tp, (Some(offset), metadata)) => tp -> OffsetAndMetadata(offset, metadata) } } case class Gap(start: Offset, end: Offset) { diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializer.scala index 234e7ba9..31b5d2bd 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializer.scala @@ -2,7 +2,6 @@ package com.wixpress.dst.greyhound.core.consumer import java.time.Clock import com.wixpress.dst.greyhound.core.consumer.ConsumerMetric.{CommittedMissingOffsets, CommittedMissingOffsetsFailed, SkippedGapsOnInitialization} -import com.wixpress.dst.greyhound.core.consumer.OffsetsAndGaps.{firstGapOffset, parseGapsString} import com.wixpress.dst.greyhound.core.{ClientId, Group, Offset, OffsetAndMetadata, TopicPartition} import com.wixpress.dst.greyhound.core.metrics.{GreyhoundMetric, GreyhoundMetrics} import zio.{URIO, ZIO} @@ -82,10 +81,7 @@ class OffsetsInitializer( val seekToEndPartitions = seekTo.collect { case (k, SeekTo.SeekToEnd) => k }.toSet val toPause = seekTo.collect { case (k, SeekTo.Pause) => k } val seekToEndOffsets = fetchEndOffsets(seekToEndPartitions, timeout).mapValues(OffsetAndMetadata.apply) - val gapsSmallestOffsets = currentCommittedOffsets - .collect { case (tp, Some(om)) => tp -> om } - .map(tpom => tpom._1 -> (firstGapOffset(tpom._2.metadata), tpom._2.metadata)) - .collect { case (tp, (Some(offset), metadata)) => tp -> OffsetAndMetadata(offset, metadata) } + val gapsSmallestOffsets = OffsetsAndGaps.gapsSmallestOffsets(currentCommittedOffsets) val seekToGapsOffsets = if (parallelConsumer) gapsSmallestOffsets else Map.empty val toOffsets = seekToOffsets ++ seekToEndOffsets ++ seekToGapsOffsets @@ -96,7 +92,7 @@ class 
OffsetsInitializer( private def reportSkippedGaps(currentCommittedOffsets: Map[TopicPartition, Option[OffsetAndMetadata]]) = { val skippedGaps = currentCommittedOffsets .collect { case (tp, Some(om)) => tp -> om } - .map(tpom => tpom._1 -> parseGapsString(tpom._2.metadata)) + .map(tpom => tpom._1 -> OffsetsAndGaps.parseGapsString(tpom._2.metadata)) .collect { case (tp, Some(gaps)) => tp -> gaps } reporter(SkippedGapsOnInitialization(clientId, group, skippedGaps)) } diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializerTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializerTest.scala index 2b8ffb0f..32aeb11a 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializerTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializerTest.scala @@ -192,6 +192,24 @@ class OffsetsInitializerTest extends SpecificationWithJUnit with Mockito { ) } + "commit offsets even when unknown metadata string is present in committed offsets" in + new ctx() { + givenCommittedOffsetsAndMetadata(partitions)(Map(p1 -> OffsetAndMetadata(randomInt, randomStr))) + givenPositions(p2 -> p2Pos, p3 -> p3Pos) + + committer.initializeOffsets(partitions) + + val missingOffsets = Map( + p2 -> p2Pos, + p3 -> p3Pos + ) + there was + one(offsetOps).commitWithMetadata( + missingOffsets.mapValues(OffsetAndMetadata(_)), + timeout + ) + } + class ctx(val seekTo: Map[TopicPartition, SeekTo] = Map.empty, offsetReset: OffsetReset = OffsetReset.Latest) extends Scope { private val metricsLogRef = new AtomicReference(Seq.empty[GreyhoundMetric]) def reported = metricsLogRef.get @@ -226,6 +244,12 @@ class OffsetsInitializerTest extends SpecificationWithJUnit with Mockito { offsetOps.committedWithMetadata(partitions, timeout) returns result.mapValues(OffsetAndMetadata(_)) } + def givenCommittedOffsetsAndMetadata(partitions: Set[TopicPartition], timeout: zio.Duration = timeout)( + result: Map[TopicPartition, OffsetAndMetadata] + ) = { + offsetOps.committedWithMetadata(partitions, timeout) returns result + } + def givenEndOffsets(partitions: Set[TopicPartition], timeout: zio.Duration = timeout)(result: Map[TopicPartition, Long]) = { offsetOps.endOffsets(partitions, timeout) returns result } From 975c135b4b9830ae0bb12081fb43342ee51d326f Mon Sep 17 00:00:00 2001 From: Natan Silnitsky Date: Thu, 13 Jul 2023 11:37:52 +0300 Subject: [PATCH 26/52] ConsumerIT: add a test: allow to override offsetReset with autoResetOffset from extra properties (#35845) ConsumerIT: add a test: allow to override offsetReset with autoResetOffset from extra properties #automerge GitOrigin-RevId: a9159b9c15445daa243e84a0b4bcaeae8db8b29d --- .../dst/greyhound/core/ConsumerIT.scala | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/core/src/it/scala/com/wixpress/dst/greyhound/core/ConsumerIT.scala b/core/src/it/scala/com/wixpress/dst/greyhound/core/ConsumerIT.scala index d644c9aa..107810f1 100644 --- a/core/src/it/scala/com/wixpress/dst/greyhound/core/ConsumerIT.scala +++ b/core/src/it/scala/com/wixpress/dst/greyhound/core/ConsumerIT.scala @@ -390,6 +390,41 @@ class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { } } + s"allow to override offsetReset with autoResetOffset from extra properties${parallelConsumerString(useParallelConsumer)}" in + ZIO.scoped { + for { + r <- getShared + TestResources(kafka, producer) = r + _ <- ZIO.debug(">>>> starting test: earliestTest") + topic <- 
kafka.createRandomTopic(prefix = "core-from-earliest") + group <- randomGroup + + queue <- Queue.unbounded[ConsumerRecord[String, String]] + handler = RecordHandler(queue.offer(_: ConsumerRecord[String, String])) + .withDeserializers(StringSerde, StringSerde) + .ignore + + record = ProducerRecord(topic, "bar", Some("foo")) + _ <- producer.produce(record, StringSerde, StringSerde) + + message <- RecordConsumer + .make( + configFor( + kafka, + group, + topic, + mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8) + ) + .copy(offsetReset = Latest, extraProperties = Map("auto.offset.reset" -> "earliest")), + handler + ) + .flatMap { _ => queue.take } + .timeout(10.seconds) + } yield { + message.get must (beRecordWithKey("foo") and beRecordWithValue("bar")) + } + } + s"not lose messages while throttling after rebalance${parallelConsumerString(useParallelConsumer)}" in ZIO.scoped { for { From b69b93bca41f6f7ea7bd2fd981dd35700a420cb7 Mon Sep 17 00:00:00 2001 From: Natan Silnitsky Date: Thu, 13 Jul 2023 12:35:08 +0300 Subject: [PATCH 27/52] gh RecordConsumer - add visibility to extra properties setup with auto.offset.reset (#35848) GitOrigin-RevId: 7b878399acd1e25003ccae071e817ed22b2fba21 --- .../greyhound/core/consumer/RecordConsumer.scala | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala index 4bc05bb5..19c05ee1 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala @@ -62,16 +62,23 @@ object RecordConsumer { * concurrent between partitions; order is guaranteed to be maintained within the same partition. 
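   * A minimal usage sketch (illustrative only): the broker address, group and topic are
   * placeholders, and the positional `RecordConsumerConfig` parameters and the
   * `ConsumerSubscription.Topics` constructor are assumptions rather than part of this change.
   * {{{
   *   // handler and serdes follow the same pattern as the ConsumerIT tests above
   *   val handler =
   *     RecordHandler((record: ConsumerRecord[String, String]) => ZIO.debug(record.value))
   *       .withDeserializers(StringSerde, StringSerde)
   *       .ignore
   *
   *   ZIO.scoped {
   *     RecordConsumer.make(
   *       RecordConsumerConfig("localhost:9092", "my-group", ConsumerSubscription.Topics(Set("my-topic"))),
   *       handler
   *     ) *> ZIO.never // keep the consumer running; interrupt to shut it down
   *   }
   * }}}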
*/ def make[R, E]( - config: RecordConsumerConfig, - handler: RecordHandler[R, E, Chunk[Byte], Chunk[Byte]], - createConsumerOverride: Option[ConsumerConfig => RIO[GreyhoundMetrics with Scope, Consumer]] = None + config: RecordConsumerConfig, + handler: RecordHandler[R, E, Chunk[Byte], Chunk[Byte]], + createConsumerOverride: Option[ConsumerConfig => RIO[GreyhoundMetrics with Scope, Consumer]] = None )(implicit trace: Trace, tag: Tag[Env]): ZIO[R with Env with Scope with GreyhoundMetrics, Throwable, RecordConsumer[R with Env]] = ZIO .acquireRelease( for { consumerShutdown <- AwaitShutdown.make _ <- GreyhoundMetrics - .report(CreatingConsumer(config.clientId, config.group, config.bootstrapServers, config.consumerAttributes)) + .report( + CreatingConsumer( + config.clientId, + config.group, + config.bootstrapServers, + config.consumerAttributes ++ config.extraProperties + ) + ) _ <- validateRetryPolicy(config) consumerSubscriptionRef <- Ref.make[ConsumerSubscription](config.initialSubscription) From b3e9658fc1b2222618c48b595a6bfdfc2cfb9322 Mon Sep 17 00:00:00 2001 From: Natan Silnitsky Date: Sun, 16 Jul 2023 11:28:02 +0300 Subject: [PATCH 28/52] gh RecordConsumer - allow to override offsetReset with autoResetOffset from extra properties taking into account a non-zero rewindUncommittedOffsetsBy (#35863) * gh RecordConsumer - allow to override offsetReset with autoResetOffset from extra properties taking into account a non-zero rewindUncommittedOffsetsBy #automerge * fix tests depending on com.wixpress.greyhound.GreyhoundTestingSupport * try another tactic to fix tests * fix build file #automerge GitOrigin-RevId: dcf84b5740a3f99e1a637fca84c0ef6059b0e1a9 --- .../dst/greyhound/core/ConsumerIT.scala | 63 +++++------ .../greyhound/core/consumer/Consumer.scala | 104 ++++++++++-------- 2 files changed, 89 insertions(+), 78 deletions(-) diff --git a/core/src/it/scala/com/wixpress/dst/greyhound/core/ConsumerIT.scala b/core/src/it/scala/com/wixpress/dst/greyhound/core/ConsumerIT.scala index 107810f1..23f0293d 100644 --- a/core/src/it/scala/com/wixpress/dst/greyhound/core/ConsumerIT.scala +++ b/core/src/it/scala/com/wixpress/dst/greyhound/core/ConsumerIT.scala @@ -1,8 +1,5 @@ package com.wixpress.dst.greyhound.core -import java.util.concurrent.{TimeUnit, TimeoutException} -import java.util.regex.Pattern -import java.util.regex.Pattern.compile import com.wixpress.dst.greyhound.core.Serdes._ import com.wixpress.dst.greyhound.core.consumer.ConsumerMetric.PollingFailed import com.wixpress.dst.greyhound.core.consumer.EventLoop.Handler @@ -14,18 +11,18 @@ import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetric import com.wixpress.dst.greyhound.core.producer.ProducerRecord import com.wixpress.dst.greyhound.core.testkit.RecordMatchers._ import com.wixpress.dst.greyhound.core.testkit.{eventuallyTimeoutFail, eventuallyZ, AwaitableRef, BaseTestWithSharedEnv, TestMetrics} -import com.wixpress.dst.greyhound.core.zioutils.CountDownLatch -import com.wixpress.dst.greyhound.core.zioutils.Gate +import com.wixpress.dst.greyhound.core.zioutils.{CountDownLatch, Gate} import com.wixpress.dst.greyhound.testenv.ITEnv import com.wixpress.dst.greyhound.testenv.ITEnv.{clientId, _} import com.wixpress.dst.greyhound.testkit.ManagedKafka import org.specs2.specification.core.Fragments -import zio.Clock -import zio.stm.{STM, TRef} -import zio._ -import zio.{Clock, Console, _} import zio.Clock.sleep -import zio.managed._ +import zio.{Clock, _} +import zio.stm.{STM, TRef} + +import java.util.concurrent.{TimeUnit, TimeoutException} 
+import java.util.regex.Pattern +import java.util.regex.Pattern.compile class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { sequential @@ -390,36 +387,40 @@ class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { } } - s"allow to override offsetReset with autoResetOffset from extra properties${parallelConsumerString(useParallelConsumer)}" in + s"allow to override offsetReset with autoResetOffset from extra properties taking into account a non-zero rewindUncommittedOffsetsBy${parallelConsumerString(useParallelConsumer)}" in ZIO.scoped { for { - r <- getShared + r <- getShared TestResources(kafka, producer) = r - _ <- ZIO.debug(">>>> starting test: earliestTest") - topic <- kafka.createRandomTopic(prefix = "core-from-earliest") - group <- randomGroup + _ <- ZIO.debug(">>>> starting test: earliestTest") + topic <- kafka.createRandomTopic(prefix = "core-from-earliest") + group <- randomGroup - queue <- Queue.unbounded[ConsumerRecord[String, String]] + queue <- Queue.unbounded[ConsumerRecord[String, String]] handler = RecordHandler(queue.offer(_: ConsumerRecord[String, String])) - .withDeserializers(StringSerde, StringSerde) - .ignore + .withDeserializers(StringSerde, StringSerde) + .ignore record = ProducerRecord(topic, "bar", Some("foo")) - _ <- producer.produce(record, StringSerde, StringSerde) + _ <- producer.produce(record, StringSerde, StringSerde) message <- RecordConsumer - .make( - configFor( - kafka, - group, - topic, - mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8) - ) - .copy(offsetReset = Latest, extraProperties = Map("auto.offset.reset" -> "earliest")), - handler - ) - .flatMap { _ => queue.take } - .timeout(10.seconds) + .make( + configFor( + kafka, + group, + topic, + mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8) + ) + .copy( + offsetReset = Latest, // "default" + extraProperties = Map("auto.offset.reset" -> "earliest"), // overriden by "custom properties" + rewindUncommittedOffsetsBy = 1.millis // non-zero so that we can check that it's taken into account + ), + handler + ) + .flatMap { _ => queue.take } + .timeout(10.seconds) } yield { message.get must (beRecordWithKey("foo") and beRecordWithValue("bar")) } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala index 8103b68e..480994af 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala @@ -20,14 +20,14 @@ import scala.util.{Random, Try} trait Consumer { def subscribe[R1]( - topics: Set[Topic], - rebalanceListener: RebalanceListener[R1] = RebalanceListener.Empty - )(implicit trace: Trace): RIO[GreyhoundMetrics with R1, Unit] + topics: Set[Topic], + rebalanceListener: RebalanceListener[R1] = RebalanceListener.Empty + )(implicit trace: Trace): RIO[GreyhoundMetrics with R1, Unit] def subscribePattern[R1]( - topicStartsWith: Pattern, - rebalanceListener: RebalanceListener[R1] = RebalanceListener.Empty - )(implicit trace: Trace): RIO[GreyhoundMetrics with R1, Unit] + topicStartsWith: Pattern, + rebalanceListener: RebalanceListener[R1] = RebalanceListener.Empty + )(implicit trace: Trace): RIO[GreyhoundMetrics with R1, Unit] def poll(timeout: Duration)(implicit trace: Trace): RIO[GreyhoundMetrics, Records] @@ -98,17 +98,17 @@ object Consumer { // if a partition with no committed 
offset is revoked during processing // we also may want to seek forward to some given initial offsets offsetsInitializer <- OffsetsInitializer - .make( - cfg.clientId, - cfg.groupId, - UnsafeOffsetOperations.make(consumer), - timeout = 10.seconds, - timeoutIfSeek = 10.seconds, - initialSeek = cfg.initialSeek, - rewindUncommittedOffsetsBy = cfg.rewindUncommittedOffsetsByMillis.millis, - offsetResetIsEarliest = cfg.offsetReset == OffsetReset.Earliest, - parallelConsumer = cfg.useParallelConsumer - ) + .make( + cfg.clientId, + cfg.groupId, + UnsafeOffsetOperations.make(consumer), + timeout = 10.seconds, + timeoutIfSeek = 10.seconds, + initialSeek = cfg.initialSeek, + rewindUncommittedOffsetsBy = cfg.rewindUncommittedOffsetsByMillis.millis, + offsetResetIsEarliest = cfg.offsetResetIsEarliest, + parallelConsumer = cfg.useParallelConsumer + ) } yield { new Consumer { override def subscribePattern[R1](topicStartsWith: Pattern, rebalanceListener: RebalanceListener[R1])( @@ -156,24 +156,31 @@ object Consumer { .map(_.asScala.collect { case (tp: KafkaTopicPartition, o: KafkaOffsetAndMetadata) => (TopicPartition(tp), o.offset) }.toMap) override def committedOffsetsAndMetadata( - partitions: NonEmptySet[TopicPartition] - )(implicit trace: Trace): RIO[Any, Map[TopicPartition, OffsetAndMetadata]] = + partitions: NonEmptySet[TopicPartition] + )(implicit trace: Trace): RIO[Any, Map[TopicPartition, OffsetAndMetadata]] = withConsumerBlocking(_.committed(kafkaPartitions(partitions))) - .map(_.asScala.collect { case (tp: KafkaTopicPartition, om: KafkaOffsetAndMetadata) => (TopicPartition(tp), OffsetAndMetadata(om.offset, om.metadata))}.toMap) + .map( + _.asScala + .collect { + case (tp: KafkaTopicPartition, om: KafkaOffsetAndMetadata) => + (TopicPartition(tp), OffsetAndMetadata(om.offset, om.metadata)) + } + .toMap + ) override def commit(offsets: Map[TopicPartition, Offset])(implicit trace: Trace): RIO[GreyhoundMetrics, Unit] = { withConsumerBlocking(_.commitSync(kafkaOffsetsAndMetaData(toOffsetsAndMetadata(offsets, cfg.commitMetadataString)))) } override def commitWithMetadata( - offsetsAndMetadata: Map[TopicPartition, OffsetAndMetadata] - )(implicit trace: Trace): RIO[GreyhoundMetrics, Unit] = { + offsetsAndMetadata: Map[TopicPartition, OffsetAndMetadata] + )(implicit trace: Trace): RIO[GreyhoundMetrics, Unit] = { withConsumerBlocking(_.commitSync(kafkaOffsetsAndMetaData(offsetsAndMetadata))) } override def commitOnRebalance( - offsets: Map[TopicPartition, Offset] - )(implicit trace: Trace): RIO[GreyhoundMetrics, DelayedRebalanceEffect] = { + offsets: Map[TopicPartition, Offset] + )(implicit trace: Trace): RIO[GreyhoundMetrics, DelayedRebalanceEffect] = { val kOffsets = kafkaOffsetsAndMetaData(toOffsetsAndMetadata(offsets, cfg.commitMetadataString)) // we can't actually call commit here, as it needs to be called from the same // thread, that triggered poll(), so we return the commit action as thunk @@ -181,8 +188,8 @@ object Consumer { } override def commitWithMetadataOnRebalance( - offsets: Map[TopicPartition, OffsetAndMetadata] - )(implicit trace: Trace): RIO[GreyhoundMetrics, DelayedRebalanceEffect] = + offsets: Map[TopicPartition, OffsetAndMetadata] + )(implicit trace: Trace): RIO[GreyhoundMetrics, DelayedRebalanceEffect] = ZIO.succeed(DelayedRebalanceEffect(consumer.commitSync(kafkaOffsetsAndMetaData(offsets)))) override def pause(partitions: Set[TopicPartition])(implicit trace: Trace): ZIO[Any, IllegalStateException, Unit] = @@ -234,8 +241,8 @@ object Consumer { semaphore.withPermit(f(consumer)) 
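          // KafkaConsumer is not safe for concurrent use from multiple threads, so the helper
          // above guards every call into the underlying consumer with `semaphore.withPermit`.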
override def offsetsForTimes( - topicPartitionsOnTimestamp: Map[TopicPartition, Long] - )(implicit trace: Trace): RIO[Any, Map[TopicPartition, Offset]] = { + topicPartitionsOnTimestamp: Map[TopicPartition, Long] + )(implicit trace: Trace): RIO[Any, Map[TopicPartition, Offset]] = { val kafkaTopicPartitionsOnTimestamp = topicPartitionsOnTimestamp.map { case (tp, ts) => tp.asKafka -> ts } withConsumerBlocking(_.offsetsForTimes(kafkaTopicPartitionsOnTimestamp.mapValues(l => new lang.Long(l)).toMap.asJava)) .map( @@ -291,9 +298,9 @@ object Consumer { } private def makeConsumer( - config: ConsumerConfig, - semaphore: Semaphore - )(implicit trace: Trace): RIO[GreyhoundMetrics with Scope, KafkaConsumer[Chunk[Byte], Chunk[Byte]]] = { + config: ConsumerConfig, + semaphore: Semaphore + )(implicit trace: Trace): RIO[GreyhoundMetrics with Scope, KafkaConsumer[Chunk[Byte], Chunk[Byte]]] = { val acquire = ZIO.attemptBlocking(new KafkaConsumer(config.properties, deserializer, deserializer)) def close(consumer: KafkaConsumer[_, _]) = attemptBlocking(consumer.close()) @@ -306,19 +313,19 @@ object Consumer { } case class ConsumerConfig( - bootstrapServers: String, - groupId: Group, - clientId: ClientId = s"wix-consumer-${Random.alphanumeric.take(5).mkString}", - offsetReset: OffsetReset = OffsetReset.Latest, - extraProperties: Map[String, String] = Map.empty, - additionalListener: RebalanceListener[Any] = RebalanceListener.Empty, - initialSeek: InitialOffsetsSeek = InitialOffsetsSeek.default, - consumerAttributes: Map[String, String] = Map.empty, - decryptor: Decryptor[Any, Throwable, Chunk[Byte], Chunk[Byte]] = new NoOpDecryptor, - commitMetadataString: Metadata = OffsetAndMetadata.NO_METADATA, - rewindUncommittedOffsetsByMillis: Long = 0L, - useParallelConsumer: Boolean = false - ) extends CommonGreyhoundConfig { + bootstrapServers: String, + groupId: Group, + clientId: ClientId = s"wix-consumer-${Random.alphanumeric.take(5).mkString}", + offsetReset: OffsetReset = OffsetReset.Latest, + extraProperties: Map[String, String] = Map.empty, + additionalListener: RebalanceListener[Any] = RebalanceListener.Empty, + initialSeek: InitialOffsetsSeek = InitialOffsetsSeek.default, + consumerAttributes: Map[String, String] = Map.empty, + decryptor: Decryptor[Any, Throwable, Chunk[Byte], Chunk[Byte]] = new NoOpDecryptor, + commitMetadataString: Metadata = OffsetAndMetadata.NO_METADATA, + rewindUncommittedOffsetsByMillis: Long = 0L, + useParallelConsumer: Boolean = false +) extends CommonGreyhoundConfig { override def kafkaProps: Map[String, String] = Map( KafkaConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> bootstrapServers, @@ -334,6 +341,9 @@ case class ConsumerConfig( KafkaConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> "false" ) ++ extraProperties + def offsetResetIsEarliest: Boolean = + extraProperties.get("auto.offset.reset").map(_ == "earliest").getOrElse(offsetReset == OffsetReset.Earliest) + def withExtraProperties(props: (String, String)*) = copy(extraProperties = extraProperties ++ props) @@ -394,9 +404,9 @@ object UnsafeOffsetOperations { } override def committedWithMetadata( - partitions: NonEmptySet[TopicPartition], - timeout: zio.Duration - ): Map[TopicPartition, OffsetAndMetadata] = { + partitions: NonEmptySet[TopicPartition], + timeout: zio.Duration + ): Map[TopicPartition, OffsetAndMetadata] = { consumer .committed(partitions.map(_.asKafka).asJava, timeout) .asScala From c19bee0a5848575bc678ea0fc25cf63daa5617e5 Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: 
Sun, 16 Jul 2023 12:44:01 +0300 Subject: [PATCH 29/52] [greyhound] gaps limit calculation script (#35898) [greyhound] gaps limit calculation script #automerge GitOrigin-RevId: 75a060f20589036570bad2077d3e906fa8db9bcd --- .../wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala index a062ccd5..7e68e153 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala @@ -167,6 +167,8 @@ case class OffsetAndGaps(offset: Offset, gaps: Seq[Gap], committable: Boolean = val plainGapsString = s"${offset.toString}${LAST_HANDLED_OFFSET_SEPARATOR}${gaps.sortBy(_.start).mkString(GAPS_STRING_SEPARATOR)}" Base64.getEncoder.encodeToString(GzipCompression.compress(plainGapsString.getBytes())) } + + def plainGapsString: String = s"${offset.toString}${LAST_HANDLED_OFFSET_SEPARATOR}${gaps.sortBy(_.start).mkString(GAPS_STRING_SEPARATOR)}" } object OffsetAndGaps { From ee3e9501deab9ef42686e7f49eba18fd558c3367 Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Sun, 16 Jul 2023 13:57:06 +0300 Subject: [PATCH 30/52] [greyhound] parallel consumer - update gaps limit (#35902) [greyhound] parallel consumer - update gaps limit #automerge GitOrigin-RevId: bebbfb8b314cb96c4f7ce09d79686f2163bc7a6d --- .../com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala | 2 +- .../com/wixpress/dst/greyhound/core/consumer/EventLoop.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala index 44441099..5558ef39 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala @@ -56,7 +56,7 @@ object Dispatcher { maxParallelism: Int = 1, updateBatch: Chunk[Record] => URIO[GreyhoundMetrics, Unit] = _ => ZIO.unit, currentGaps: Set[TopicPartition] => ZIO[GreyhoundMetrics, Nothing, Map[TopicPartition, OffsetAndGaps]] = _ => ZIO.succeed(Map.empty), - gapsSizeLimit: Int = 256, + gapsSizeLimit: Int = 500, init: Promise[Nothing, Unit] )(implicit trace: Trace): UIO[Dispatcher[R]] = for { diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala index afe474a5..d5aaa3b5 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala @@ -412,7 +412,7 @@ object EventLoopConfig { startPaused = false, consumePartitionInParallel = false, maxParallelism = 1, - gapsSizeLimit = 256 // todo: calculate actual gaps limit based on the max metadata size + gapsSizeLimit = 500 ) } From 03122e3d857eba411ce1fe73a4a0a180a16a4441 Mon Sep 17 00:00:00 2001 From: Noam Berman Date: Mon, 17 Jul 2023 00:32:34 +0300 Subject: [PATCH 31/52] [gh-consumer-proxy] fix: if position is lower than beginning offset - move forward to first offset (#35924) GitOrigin-RevId: 881fe31e29499ec28bfbd5cd6f526feac6b4d363 --- .../greyhound/core/admin/AdminClient.scala | 338 ++++++++++-------- 1 file 
changed, 181 insertions(+), 157 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/admin/AdminClient.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/admin/AdminClient.scala index 17ab8b07..5e762468 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/admin/AdminClient.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/admin/AdminClient.scala @@ -11,7 +11,7 @@ import com.wixpress.dst.greyhound.core.zioutils.KafkaFutures._ import com.wixpress.dst.greyhound.core._ import org.apache.kafka.clients.admin.AlterConfigOp.OpType import org.apache.kafka.clients.admin.ConfigEntry.ConfigSource -import org.apache.kafka.clients.admin.{AlterConfigOp, Config, ConfigEntry, ListConsumerGroupOffsetsOptions, ListConsumerGroupOffsetsSpec, NewPartitions, NewTopic, OffsetSpec, AdminClient => KafkaAdminClient, AdminClientConfig => KafkaAdminClientConfig} +import org.apache.kafka.clients.admin.{AdminClient => KafkaAdminClient, AdminClientConfig => KafkaAdminClientConfig, AlterConfigOp, Config, ConfigEntry, ListConsumerGroupOffsetsOptions, ListConsumerGroupOffsetsSpec, NewPartitions, NewTopic, OffsetSpec} import org.apache.kafka.common import org.apache.kafka.common.config.ConfigResource import org.apache.kafka.common.config.ConfigResource.Type.TOPIC @@ -28,7 +28,11 @@ trait AdminClient { def listTopics()(implicit trace: Trace): RIO[Any, Set[String]] def listEndOffsets( - tps: Set[TopicPartition] + tps: Set[TopicPartition] + )(implicit trace: Trace): RIO[Any, Map[TopicPartition, Offset]] + + def listBeginningOffsets( + tps: Set[TopicPartition] )(implicit trace: Trace): RIO[Any, Map[TopicPartition, Offset]] def topicExists(topic: String)(implicit trace: Trace): RIO[Any, Boolean] @@ -36,8 +40,8 @@ trait AdminClient { def topicsExist(topics: Set[Topic])(implicit trace: Trace): ZIO[Any, Throwable, Map[Topic, Boolean]] def createTopics( - configs: Set[TopicConfig], - ignoreErrors: Throwable => Boolean = isTopicExistsError + configs: Set[TopicConfig], + ignoreErrors: Throwable => Boolean = isTopicExistsError )(implicit trace: Trace): RIO[GreyhoundMetrics, Map[String, Option[Throwable]]] def numberOfBrokers(implicit trace: Trace): RIO[Any, Int] @@ -45,7 +49,7 @@ trait AdminClient { def propertiesFor(topics: Set[Topic])(implicit trace: Trace): RIO[Any, Map[Topic, TopicPropertiesResult]] def commit(group: Group, commits: Map[TopicPartition, OffsetAndMetadata])( - implicit trace: Trace + implicit trace: Trace ): ZIO[Any, Throwable, Unit] def listGroups()(implicit trace: Trace): RIO[Any, Set[String]] @@ -53,7 +57,7 @@ trait AdminClient { def groupOffsets(groups: Set[Group])(implicit trace: Trace): RIO[Any, Map[GroupTopicPartition, PartitionOffset]] def groupOffsetsSpecific(requestedTopicPartitions: Map[Group, Set[TopicPartition]])( - implicit trace: Trace + implicit trace: Trace ): RIO[Any, Map[GroupTopicPartition, PartitionOffset]] def groupState(groups: Set[Group])(implicit trace: Trace): RIO[Any, Map[String, GroupState]] @@ -63,19 +67,19 @@ trait AdminClient { def describeConsumerGroups(groupIds: Set[Group])(implicit trace: Trace): RIO[Any, Map[Group, ConsumerGroupDescription]] def consumerGroupOffsets(groupId: Group, onlyPartitions: Option[Set[TopicPartition]] = None)( - implicit trace: Trace + implicit trace: Trace ): RIO[Any, Map[TopicPartition, OffsetAndMetadata]] def consumerGroupsOffsets( - groups: Map[Group, Option[Set[TopicPartition]]] + groups: Map[Group, Option[Set[TopicPartition]]] )(implicit trace: Trace): RIO[Any, Map[Group, Map[TopicPartition, 
OffsetAndMetadata]]] def increasePartitions(topic: Topic, newCount: Int)(implicit trace: Trace): RIO[Any with GreyhoundMetrics, Unit] def updateTopicConfigProperties( - topic: Topic, - configProperties: Map[String, ConfigPropOp], - useNonIncrementalAlter: Boolean = false + topic: Topic, + configProperties: Map[String, ConfigPropOp], + useNonIncrementalAlter: Boolean = false )(implicit trace: Trace): RIO[GreyhoundMetrics, Unit] def attributes: Map[String, String] @@ -145,13 +149,14 @@ object AdminClient { .values() .asScala .headOption - .map { case (_, topicResult) => - topicResult.asZio.either.flatMap { - case Right(_) => ZIO.succeed(true) - case Left(_: UnknownTopicOrPartitionException) => ZIO.succeed(false) - case Left(_: InvalidTopicException) => ZIO.succeed(false) - case Left(ex) => ZIO.fail(ex) - } + .map { + case (_, topicResult) => + topicResult.asZio.either.flatMap { + case Right(_) => ZIO.succeed(true) + case Left(_: UnknownTopicOrPartitionException) => ZIO.succeed(false) + case Left(_: InvalidTopicException) => ZIO.succeed(false) + case Left(ex) => ZIO.fail(ex) + } } .getOrElse(ZIO.succeed(false)) } @@ -159,35 +164,37 @@ object AdminClient { override def topicsExist(topics: Set[Topic])(implicit trace: Trace): ZIO[Any, Throwable, Map[Topic, Boolean]] = attemptBlocking(client.describeTopics(topics.asJava)).flatMap { result => ZIO - .foreach(result.values().asScala.toSeq) { case (topic, topicResult) => - topicResult.asZio.either.flatMap { - case Right(_) => ZIO.succeed(topic -> true) - case Left(_: UnknownTopicOrPartitionException) => ZIO.succeed(topic -> false) - case Left(ex) => ZIO.fail(ex) - } + .foreach(result.values().asScala.toSeq) { + case (topic, topicResult) => + topicResult.asZio.either.flatMap { + case Right(_) => ZIO.succeed(topic -> true) + case Left(_: UnknownTopicOrPartitionException) => ZIO.succeed(topic -> false) + case Left(ex) => ZIO.fail(ex) + } } .map(_.toMap) } override def createTopics( - configs: Set[TopicConfig], - ignoreErrors: Throwable => Boolean = isTopicExistsError + configs: Set[TopicConfig], + ignoreErrors: Throwable => Boolean = isTopicExistsError )(implicit trace: Trace): RIO[GreyhoundMetrics, Map[String, Option[Throwable]]] = { val configsByTopic = configs.map(c => c.name -> c).toMap attemptBlocking(client.createTopics(configs.map(toNewTopic).asJava)).flatMap { result => ZIO - .foreach(result.values.asScala.toSeq) { case (topic, topicResult) => - topicResult.asZio.unit - .reporting(res => - TopicCreated( - topic, - configsByTopic(topic).partitions, - attributes, - res.mapExit(fromExit(isTopicExistsError)) + .foreach(result.values.asScala.toSeq) { + case (topic, topicResult) => + topicResult.asZio.unit + .reporting(res => + TopicCreated( + topic, + configsByTopic(topic).partitions, + attributes, + res.mapExit(fromExit(isTopicExistsError)) + ) ) - ) - .either - .map(topic -> _.left.toOption.filterNot(ignoreErrors)) + .either + .map(topic -> _.left.toOption.filterNot(ignoreErrors)) } .map(_.toMap) } @@ -198,7 +205,7 @@ object AdminClient { .flatMap(_.nodes().asZio.map(_.size)) override def propertiesFor( - topics: Set[Topic] + topics: Set[Topic] )(implicit trace: Trace): RIO[Any, Map[Topic, TopicPropertiesResult]] = (describeConfigs(client, topics) zipPar describePartitions(client, topics)).map { case (configsPerTopic, partitionsAndReplicationPerTopic) => @@ -219,17 +226,33 @@ object AdminClient { topics <- result.names().asZio } yield topics.asScala.toSet + override def listBeginningOffsets( + tps: Set[TopicPartition] + )(implicit trace: Trace): 
RIO[Any, Map[TopicPartition, Offset]] = { + val j: java.util.Map[org.apache.kafka.common.TopicPartition, OffsetSpec] = + tps.map { tp => (tp.asKafka, OffsetSpec.earliest()) }.toMap.asJava + + for { + result <- attemptBlocking(client.listOffsets(j)) + results <- result.all.asZio.map(_.asScala.toMap.map { + case (tp, offset) => + (TopicPartition.fromKafka(tp), offset.offset()) + }) + } yield results + } + override def listEndOffsets( - tps: Set[TopicPartition] + tps: Set[TopicPartition] )(implicit trace: Trace): RIO[Any, Map[TopicPartition, Offset]] = { val j: java.util.Map[org.apache.kafka.common.TopicPartition, OffsetSpec] = tps.map { tp => (tp.asKafka, OffsetSpec.latest()) }.toMap.asJava for { - result <- attemptBlocking(client.listOffsets(j)) - results <- result.all.asZio.map(_.asScala.toMap.map { case (tp, offset) => - (TopicPartition.fromKafka(tp), offset.offset()) - }) + result <- attemptBlocking(client.listOffsets(j)) + results <- result.all.asZio.map(_.asScala.toMap.map { + case (tp, offset) => + (TopicPartition.fromKafka(tp), offset.offset()) + }) } yield results } @@ -243,7 +266,7 @@ object AdminClient { } yield groups.asScala.map(_.groupId()).toSet override def commit(group: Group, commits: Map[TopicPartition, OffsetAndMetadata])( - implicit trace: Trace + implicit trace: Trace ): ZIO[Any, Throwable, Unit] = attemptBlocking( client @@ -251,68 +274,66 @@ object AdminClient { ).unit override def groupOffsetsSpecific( - requestedTopicPartitions: Map[Group, Set[TopicPartition]] + requestedTopicPartitions: Map[Group, Set[TopicPartition]] )(implicit trace: Trace): RIO[Any, Map[GroupTopicPartition, PartitionOffset]] = for { - result <- ZIO.flatten( - ZIO - .attemptBlocking( - client.listConsumerGroupOffsets( - requestedTopicPartitions - .mapValues(tps => - new ListConsumerGroupOffsetsSpec().topicPartitions(tps.map(_.asKafka).asJavaCollection) - ) - .asJava - ) - ) - .map(_.all.asZio) - ) - rawOffsets = result.asScala.toMap.mapValues(_.asScala.toMap) - offset = - rawOffsets.map { case (group, offsets) => - offsets - .map { case (tp, offset) => - ( - GroupTopicPartition(group, TopicPartition.fromKafka(tp)), - PartitionOffset(Option(offset).map(_.offset()).getOrElse(-1L)) - ) - } - .filter { case (_, o) => o.offset >= 0 } + result <- ZIO.flatten( + ZIO + .attemptBlocking( + client.listConsumerGroupOffsets( + requestedTopicPartitions + .mapValues(tps => new ListConsumerGroupOffsetsSpec().topicPartitions(tps.map(_.asKafka).asJavaCollection)) + .asJava + ) + ) + .map(_.all.asZio) + ) + rawOffsets = result.asScala.toMap.mapValues(_.asScala.toMap) + offset = + rawOffsets.map { + case (group, offsets) => + offsets + .map { + case (tp, offset) => + ( + GroupTopicPartition(group, TopicPartition.fromKafka(tp)), + PartitionOffset(Option(offset).map(_.offset()).getOrElse(-1L)) + ) + } + .filter { case (_, o) => o.offset >= 0 } } groupOffsets = offset.foldLeft(Map.empty[GroupTopicPartition, PartitionOffset])((x, y) => x ++ y) } yield groupOffsets override def groupOffsets( - groups: Set[String] + groups: Set[String] )(implicit trace: Trace): RIO[Any, Map[GroupTopicPartition, PartitionOffset]] = for { - result <- ZIO.foreach(groups)(group => attemptBlocking(group -> client.listConsumerGroupOffsets(group))) + result <- ZIO.foreach(groups)(group => attemptBlocking(group -> client.listConsumerGroupOffsets(group))) // TODO: remove ._1 , ._2 rawOffsetsEffects = result.toMap.mapValues(_.partitionsToOffsetAndMetadata().asZio) - offsetsEffects = + offsetsEffects = rawOffsetsEffects.map(offset => 
offset._2.map(f => - f.asScala.map(p => - p.copy(GroupTopicPartition(offset._1, core.TopicPartition(p._1)), PartitionOffset(p._2.offset())) - ) + f.asScala.map(p => p.copy(GroupTopicPartition(offset._1, core.TopicPartition(p._1)), PartitionOffset(p._2.offset()))) ) ) - offsetsMapSets <- ZIO.collectAll(offsetsEffects) - groupOffsets = offsetsMapSets.foldLeft(Map.empty[GroupTopicPartition, PartitionOffset])((x, y) => x ++ y) + offsetsMapSets <- ZIO.collectAll(offsetsEffects) + groupOffsets = offsetsMapSets.foldLeft(Map.empty[GroupTopicPartition, PartitionOffset])((x, y) => x ++ y) } yield groupOffsets override def groupState(groups: Set[String])(implicit trace: Trace): RIO[Any, Map[String, GroupState]] = for { - result <- attemptBlocking(client.describeConsumerGroups(groups.asJava)) + result <- attemptBlocking(client.describeConsumerGroups(groups.asJava)) groupEffects = result.describedGroups().asScala.mapValues(_.asZio).toMap - groupsList <- ZIO.collectAll(groupEffects.values) - membersMap = groupsList.groupBy(_.groupId()).mapValues(_.flatMap(_.members().asScala)).toMap - groupState = membersMap - .mapValues(members => { - val topicPartitionsMap = members.flatMap(_.assignment().topicPartitions().asScala) - GroupState(topicPartitionsMap.map(TopicPartition(_)).toSet) - }) - .toMap + groupsList <- ZIO.collectAll(groupEffects.values) + membersMap = groupsList.groupBy(_.groupId()).mapValues(_.flatMap(_.members().asScala)).toMap + groupState = membersMap + .mapValues(members => { + val topicPartitionsMap = members.flatMap(_.assignment().topicPartitions().asScala) + GroupState(topicPartitionsMap.map(TopicPartition(_)).toSet) + }) + .toMap } yield groupState override def deleteTopic(topic: Topic)(implicit trace: Trace): RIO[Any, Unit] = { @@ -322,7 +343,7 @@ object AdminClient { } override def describeConsumerGroups( - groupIds: Set[Group] + groupIds: Set[Group] )(implicit trace: Trace): RIO[Any, Map[Group, ConsumerGroupDescription]] = { for { desc <- attemptBlocking(client.describeConsumerGroups(groupIds.asJava).all()) @@ -331,45 +352,45 @@ object AdminClient { } override def consumerGroupOffsets( - groupId: Group, - onlyPartitions: Option[Set[TopicPartition]] = None + groupId: Group, + onlyPartitions: Option[Set[TopicPartition]] = None )(implicit trace: Trace): RIO[Any, Map[TopicPartition, OffsetAndMetadata]] = { val maybePartitions: util.List[common.TopicPartition] = onlyPartitions.map(_.map(_.asKafka).toList.asJava).orNull for { desc <- attemptBlocking( - client - .listConsumerGroupOffsets(groupId, new ListConsumerGroupOffsetsOptions().topicPartitions(maybePartitions)) - ) - res <- attemptBlocking(desc.partitionsToOffsetAndMetadata().get()) + client + .listConsumerGroupOffsets(groupId, new ListConsumerGroupOffsetsOptions().topicPartitions(maybePartitions)) + ) + res <- attemptBlocking(desc.partitionsToOffsetAndMetadata().get()) } yield res.asScala.toMap.map { case (tp, om) => (TopicPartition(tp), OffsetAndMetadata(om)) } } override def consumerGroupsOffsets( - groups: Map[Group, Option[Set[TopicPartition]]] + groups: Map[Group, Option[Set[TopicPartition]]] )(implicit trace: Trace): RIO[Any, Map[Group, Map[TopicPartition, OffsetAndMetadata]]] = for { - desc <- attemptBlocking( - client - .listConsumerGroupOffsets( - groups - .mapValues(tps => - new ListConsumerGroupOffsetsSpec().topicPartitions(tps.map(_.map(_.asKafka).toList.asJava).orNull) - ) - .asJava - ) - ) - res <- attemptBlocking(groups.map(g => (g._1, desc.partitionsToOffsetAndMetadata(g._1).get()))) - } yield res.map { case (group, o) 
=> - ( - group, - o.asScala.toSeq - .map(om => (TopicPartition.fromKafka(om._1), OffsetAndMetadata(om._2.offset(), om._2.metadata()))) - .toMap - ) + desc <- + attemptBlocking( + client + .listConsumerGroupOffsets( + groups + .mapValues(tps => new ListConsumerGroupOffsetsSpec().topicPartitions(tps.map(_.map(_.asKafka).toList.asJava).orNull)) + .asJava + ) + ) + res <- attemptBlocking(groups.map(g => (g._1, desc.partitionsToOffsetAndMetadata(g._1).get()))) + } yield res.map { + case (group, o) => + ( + group, + o.asScala.toSeq + .map(om => (TopicPartition.fromKafka(om._1), OffsetAndMetadata(om._2.offset(), om._2.metadata()))) + .toMap + ) } override def increasePartitions(topic: Topic, newCount: Int)( - implicit trace: Trace + implicit trace: Trace ): RIO[GreyhoundMetrics, Unit] = { attemptBlocking(client.createPartitions(Map(topic -> NewPartitions.increaseTo(newCount)).asJava)) .flatMap(_.all().asZio) @@ -378,9 +399,9 @@ object AdminClient { } override def updateTopicConfigProperties( - topic: Topic, - configProperties: Map[String, ConfigPropOp], - useNonIncrementalAlter: Boolean = false + topic: Topic, + configProperties: Map[String, ConfigPropOp], + useNonIncrementalAlter: Boolean = false )(implicit trace: Trace): RIO[GreyhoundMetrics, Unit] = { if (useNonIncrementalAlter) updateTopicConfigUsingAlter(topic, configProperties) else updateTopicConfigIncremental(topic, configProperties) @@ -395,24 +416,25 @@ object AdminClient { described <- describeConfigs(client, Set(topic)) beforeProps <- described.values.head.getOrFail beforeConfig = beforeProps.propertiesThat(_.isTopicSpecific) - configToSet = configProperties.foldLeft(beforeConfig) { - case (acc, (key, ConfigPropOp.Delete)) => acc - key - case (acc, (key, ConfigPropOp.Set(value))) => acc + (key -> value) - } - configJava = new Config(configToSet.map { case (k, v) => new ConfigEntry(k, v) }.toList.asJava) - _ <- attemptBlocking(client.alterConfigs(Map(resource -> configJava).asJava)) - .flatMap(_.all().asZio) + configToSet = configProperties.foldLeft(beforeConfig) { + case (acc, (key, ConfigPropOp.Delete)) => acc - key + case (acc, (key, ConfigPropOp.Set(value))) => acc + (key -> value) + } + configJava = new Config(configToSet.map { case (k, v) => new ConfigEntry(k, v) }.toList.asJava) + _ <- attemptBlocking(client.alterConfigs(Map(resource -> configJava).asJava)) + .flatMap(_.all().asZio) } yield () ).reporting(TopicConfigUpdated(topic, configProperties, incremental = false, attributes, _)) } private def updateTopicConfigIncremental(topic: Topic, configProperties: Map[String, ConfigPropOp]) = { val resource = new ConfigResource(ConfigResource.Type.TOPIC, topic) - val ops = configProperties.map { case (key, value) => - value match { - case ConfigPropOp.Delete => new AlterConfigOp(new ConfigEntry(key, null), OpType.DELETE) - case ConfigPropOp.Set(value) => new AlterConfigOp(new ConfigEntry(key, value), OpType.SET) - } + val ops = configProperties.map { + case (key, value) => + value match { + case ConfigPropOp.Delete => new AlterConfigOp(new ConfigEntry(key, null), OpType.DELETE) + case ConfigPropOp.Set(value) => new AlterConfigOp(new ConfigEntry(key, value), OpType.SET) + } }.asJavaCollection attemptBlocking(client.incrementalAlterConfigs(Map(resource -> ops).asJava)) .flatMap(_.all().asZio) @@ -424,11 +446,11 @@ object AdminClient { } private def describeConfigs(client: KafkaAdminClient, topics: Set[Topic]): RIO[Any, Map[Topic, TopicPropertiesResult]] = - attemptBlocking(client.describeConfigs(topics.map(t => new ConfigResource(TOPIC, 
t)).asJavaCollection)) flatMap { - result => - ZIO - .collectAll( - result.values.asScala.toMap.map { case (resource, kf) => + attemptBlocking(client.describeConfigs(topics.map(t => new ConfigResource(TOPIC, t)).asJavaCollection)) flatMap { result => + ZIO + .collectAll( + result.values.asScala.toMap.map { + case (resource, kf) => kf.asZio .map { config => resource.name -> @@ -439,36 +461,39 @@ object AdminClient { 0 ) } - .catchSome { case _: UnknownTopicOrPartitionException => - ZIO.succeed(resource.name -> TopicPropertiesResult.TopicDoesnExist(resource.name)) + .catchSome { + case _: UnknownTopicOrPartitionException => + ZIO.succeed(resource.name -> TopicPropertiesResult.TopicDoesnExist(resource.name)) } - } - ) - .map(_.toMap) + } + ) + .map(_.toMap) } private def describePartitions( - client: KafkaAdminClient, - topics: Set[Topic] + client: KafkaAdminClient, + topics: Set[Topic] ): RIO[Any, Map[Topic, TopicPropertiesResult]] = attemptBlocking(client.describeTopics(topics.asJavaCollection)) .flatMap { result => ZIO - .collectAll(result.values.asScala.toMap.map { case (topic, kf) => - kf.asZio - .map { desc => - val replication = desc.partitions.asScala.map(_.replicas.size).sorted.headOption.getOrElse(0) - topic -> - TopicPropertiesResult.TopicProperties( - topic, - desc.partitions.size, - Seq.empty, - replication - ) - } - .catchSome { case _: UnknownTopicOrPartitionException => - ZIO.succeed(topic -> TopicPropertiesResult.TopicDoesnExist(topic)) - } + .collectAll(result.values.asScala.toMap.map { + case (topic, kf) => + kf.asZio + .map { desc => + val replication = desc.partitions.asScala.map(_.replicas.size).sorted.headOption.getOrElse(0) + topic -> + TopicPropertiesResult.TopicProperties( + topic, + desc.partitions.size, + Seq.empty, + replication + ) + } + .catchSome { + case _: UnknownTopicOrPartitionException => + ZIO.succeed(topic -> TopicPropertiesResult.TopicDoesnExist(topic)) + } }) .map(_.toMap) } @@ -477,8 +502,7 @@ object AdminClient { Option(e.getCause).exists(_.isInstanceOf[TopicExistsException]) } -case class AdminClientConfig(bootstrapServers: String, extraProperties: Map[String, String] = Map.empty) - extends CommonGreyhoundConfig { +case class AdminClientConfig(bootstrapServers: String, extraProperties: Map[String, String] = Map.empty) extends CommonGreyhoundConfig { override def kafkaProps: Map[String, String] = Map(KafkaAdminClientConfig.BOOTSTRAP_SERVERS_CONFIG -> bootstrapServers) ++ extraProperties From 7434dd72817cd404d2825dbf7db1f8a63639b431 Mon Sep 17 00:00:00 2001 From: Natan Silnitsky Date: Tue, 18 Jul 2023 12:00:18 +0300 Subject: [PATCH 32/52] fix build for greyhound publish to maven (#35951) fix build for greyhound publish to maven #automerge GitOrigin-RevId: 65676f4cc39274c75267220d7b90e904c9d1d5d8 --- core/BUILD.bazel | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/BUILD.bazel b/core/BUILD.bazel index cdc03814..7220c083 100644 --- a/core/BUILD.bazel +++ b/core/BUILD.bazel @@ -7,6 +7,7 @@ scala_library( srcs = [ "//core/src/main/scala/com/wixpress/dst/greyhound/core:sources", "//core/src/main/scala/com/wixpress/dst/greyhound/core/admin:sources", + "//core/src/main/scala/com/wixpress/dst/greyhound/core/compression:sources", "//core/src/main/scala/com/wixpress/dst/greyhound/core/consumer:sources", "//core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/batched:sources", "//core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/domain:sources", @@ -26,6 +27,7 @@ scala_library( "@dev_zio_zio_managed_2_12", 
"@dev_zio_zio_stacktracer_2_12", "@dev_zio_zio_streams_2_12", + "@org_apache_commons_commons_compress", "@org_apache_kafka_kafka_clients", "@org_slf4j_slf4j_api", ], From cb30bb4a98e41347d13ce67c7657297ba8289a0a Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Thu, 20 Jul 2023 15:21:44 +0300 Subject: [PATCH 33/52] [greyhound] parallel consumer - improve logging (#36009) * [greyhound] parallel consumer - improve logging * dummy commit GitOrigin-RevId: 60e02e82d457076a06b0c0f2a575130051db8c69 --- .../dst/greyhound/core/consumer/OffsetsInitializer.scala | 8 +++++--- .../dst/greyhound/core/consumer/ReportingConsumer.scala | 7 ++++++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializer.scala index 31b5d2bd..a5f0a98d 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializer.scala @@ -90,11 +90,13 @@ class OffsetsInitializer( } private def reportSkippedGaps(currentCommittedOffsets: Map[TopicPartition, Option[OffsetAndMetadata]]) = { - val skippedGaps = currentCommittedOffsets + val committedOffsetsAndGaps = currentCommittedOffsets .collect { case (tp, Some(om)) => tp -> om } .map(tpom => tpom._1 -> OffsetsAndGaps.parseGapsString(tpom._2.metadata)) - .collect { case (tp, Some(gaps)) => tp -> gaps } - reporter(SkippedGapsOnInitialization(clientId, group, skippedGaps)) + .collect { case (tp, Some(offsetAndGaps)) => tp -> offsetAndGaps } + val skippedGaps = committedOffsetsAndGaps.collect { case (tp, offsetAndGaps) if offsetAndGaps.gaps.nonEmpty => tp -> offsetAndGaps } + + reporter(SkippedGapsOnInitialization(clientId, group, skippedGaps, committedOffsetsAndGaps)) } private def fetchEndOffsets(seekToEndPartitions: Set[TopicPartition], timeout: Duration) = { diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala index 6202220d..da62f294 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala @@ -414,6 +414,11 @@ object ConsumerMetric { case class ClosedConsumer(group: Group, clientId: ClientId, result: MetricResult[Throwable, Unit]) extends ConsumerMetric - case class SkippedGapsOnInitialization(clientId: ClientId, group: Group, gaps: Map[TopicPartition, OffsetAndGaps]) extends ConsumerMetric + case class SkippedGapsOnInitialization( + clientId: ClientId, + group: Group, + skippedGaps: Map[TopicPartition, OffsetAndGaps], + currentCommittedOffsetsAndGaps: Map[TopicPartition, OffsetAndGaps] + ) extends ConsumerMetric } From 9b676c9a03575716faea490429ad96f0b37f6b12 Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Mon, 24 Jul 2023 13:42:11 +0300 Subject: [PATCH 34/52] [greyhound] parallel consumer - add logs (#36082) [greyhound] parllel consumer - add logs GitOrigin-RevId: 2d3421c1d110e20b68222e195cd82227b772dc93 --- .../dst/greyhound/core/consumer/OffsetsInitializer.scala | 9 ++++++--- .../dst/greyhound/core/consumer/ReportingConsumer.scala | 6 ++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git 
a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializer.scala index a5f0a98d..d0229ae2 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsInitializer.scala @@ -1,7 +1,7 @@ package com.wixpress.dst.greyhound.core.consumer import java.time.Clock -import com.wixpress.dst.greyhound.core.consumer.ConsumerMetric.{CommittedMissingOffsets, CommittedMissingOffsetsFailed, SkippedGapsOnInitialization} +import com.wixpress.dst.greyhound.core.consumer.ConsumerMetric.{CommittedMissingOffsets, CommittedMissingOffsetsFailed, FoundGapsOnInitialization, SkippedGapsOnInitialization} import com.wixpress.dst.greyhound.core.{ClientId, Group, Offset, OffsetAndMetadata, TopicPartition} import com.wixpress.dst.greyhound.core.metrics.{GreyhoundMetric, GreyhoundMetrics} import zio.{URIO, ZIO} @@ -82,8 +82,11 @@ class OffsetsInitializer( val toPause = seekTo.collect { case (k, SeekTo.Pause) => k } val seekToEndOffsets = fetchEndOffsets(seekToEndPartitions, timeout).mapValues(OffsetAndMetadata.apply) val gapsSmallestOffsets = OffsetsAndGaps.gapsSmallestOffsets(currentCommittedOffsets) - val seekToGapsOffsets = if (parallelConsumer) gapsSmallestOffsets else Map.empty - val toOffsets = seekToOffsets ++ seekToEndOffsets ++ seekToGapsOffsets + + if (gapsSmallestOffsets.nonEmpty) reporter(FoundGapsOnInitialization(clientId, group, gapsSmallestOffsets)) + + val seekToGapsOffsets = if (parallelConsumer) gapsSmallestOffsets else Map.empty + val toOffsets = seekToOffsets ++ seekToEndOffsets ++ seekToGapsOffsets if (!parallelConsumer && gapsSmallestOffsets.nonEmpty) reportSkippedGaps(currentCommittedOffsets) PartitionActions(offsetSeeks = toOffsets, partitionsToPause = toPause.toSet) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala index da62f294..d6f122a8 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala @@ -421,4 +421,10 @@ object ConsumerMetric { currentCommittedOffsetsAndGaps: Map[TopicPartition, OffsetAndGaps] ) extends ConsumerMetric + case class FoundGapsOnInitialization( + clientId: ClientId, + group: Group, + gapsSmallestOffsets: Map[TopicPartition, OffsetAndMetadata] + ) extends ConsumerMetric + } From 118de100a870af3aea0705ac8594ae1c472786bb Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Wed, 26 Jul 2023 14:44:32 +0300 Subject: [PATCH 35/52] [greyhound] parallel consumer - fix update bug (#36161) GitOrigin-RevId: 5b75a27782dec7e534f72f57d3b3ba38c4194d1e --- .../core/parallel/ParallelConsumerIT.scala | 72 +++++++++---------- .../greyhound/core/consumer/Dispatcher.scala | 12 ++-- 2 files changed, 40 insertions(+), 44 deletions(-) diff --git a/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala b/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala index e9651995..eaa52c02 100644 --- a/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala +++ b/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala @@ -119,7 +119,7 @@ class ParallelConsumerIT 
extends BaseTestWithSharedEnv[Env, TestResources] { (cr.key match { case Some(_) => fastMessagesLatch.countDown - case None => + case None => // make sure the handler doesn't finish before the rebalance is done, including drain timeout finishRebalance.await *> ZIO.sleep(drainTimeout + 5.second) }) *> numProcessedMessages.update(_ + 1) @@ -150,44 +150,38 @@ class ParallelConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { } } -// "migrate correctly from regular record consumer to parallel consumer - consume every record once" in { -// ZIO.scoped { -// for { -// r <- getShared -// TestResources(kafka, producer) = r -// topic <- kafka.createRandomTopic() -// group <- randomGroup -// cId <- clientId -// -// regularConfig = configFor(kafka, group, Set(topic)) -// parallelConfig = parallelConsumerConfig(kafka, topic, group, cId) // same group name for both consumers -// queue <- Queue.unbounded[ConsumerRecord[String, String]] -// handler = RecordHandler((cr: ConsumerRecord[String, String]) => queue.offer(cr)).withDeserializers(StringSerde, StringSerde) -// -// records1 = producerRecords(topic, "1", partitions, 3) -// records2 = producerRecords(topic, "2", partitions, 3) -// _ <- ZIO.debug(s"records1:\n${records1.mkString("\n")}\nrecords2:\n${records2.mkString("\n")}") -// numMessages = records1.size + records2.size -// -// _ <- RecordConsumer.make(regularConfig, handler) -// _ <- produceRecords(producer, records1) -// _ <- ZIO.sleep(3.seconds) -// _ <- RecordConsumer.make(parallelConfig, handler).delay(3.seconds) -// _ <- produceRecords(producer, records2) -// _ <- ZIO.sleep(3.seconds) -// messagesOption <- RecordConsumer.make(parallelConfig, handler).flatMap { _ => -// produceRecords(producer, records2) *> ZIO.sleep(3.seconds) *> -// queue -// .takeBetween(numMessages, numMessages) -// .timeout(60.seconds) -// .tap(o => ZIO.when(o.isEmpty)(Console.printLine("timeout waiting for messages!"))) -// } -// messages <- ZIO.fromOption(messagesOption).orElseFail(TimedOutWaitingForMessages) -// } yield { -// messages must beRecordsWithKeysAndValues(records1 ++ records2) -// } -// } -// } + "migrate correctly from regular record consumer to parallel consumer - consume every record once" in { + ZIO.scoped { + for { + r <- getShared + TestResources(kafka, producer) = r + topic <- kafka.createRandomTopic() + group <- randomGroup + cId <- clientId + + regularConfig = configFor(kafka, group, Set(topic)) + parallelConfig = parallelConsumerConfig(kafka, topic, group, cId) // same group name for both consumers + queue <- Queue.unbounded[ConsumerRecord[String, String]] + regularHandler = RecordHandler((cr: ConsumerRecord[String, String]) => queue.offer(cr)).withDeserializers(StringSerde, StringSerde) + longRunningHandler = RecordHandler((cr: ConsumerRecord[String, String]) => + (if (cr.offset % 2 == 0) ZIO.sleep(2.seconds) else ZIO.unit) *> queue.offer(cr) + ).withDeserializers(StringSerde, StringSerde) + + records1 = producerRecords(topic, "1", partitions, 20) + records2 = producerRecords(topic, "2", partitions, 20) + numMessages = records1.size + records2.size + + _ <- RecordConsumer.make(regularConfig, regularHandler) + _ <- produceRecords(producer, records1) + _ <- eventuallyZ(queue.size)(_ == records1.size) + _ <- ZIO.sleep(10.seconds) + _ <- RecordConsumer.make(parallelConfig, longRunningHandler).delay(3.seconds) + _ <- produceRecords(producer, records2) + _ <- ZIO.sleep(10.seconds) + _ <- eventuallyZ(queue.size, timeout = 20.seconds)(_ == numMessages) + } yield ok + } + } "migrate from parallel 
consumer with gaps to regular consumer - consume from latest and report non-consumed gaps" in { ZIO.scoped { diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala index 5558ef39..08034fec 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala @@ -497,10 +497,9 @@ object Dispatcher { .foreachParDiscard(groupedRecords)(sameKeyRecords => ZIO.foreach(sameKeyRecords) { record => if (shouldRecordBeHandled(record, latestCommitGaps)) { - handle(record).interruptible.ignore *> updateBatch(sameKeyRecords).interruptible + handle(record).interruptible.ignore *> updateBatch(Chunk(record)).interruptible } else - report(SkippedPreviouslyHandledRecord(record, group, clientId, consumerAttributes)) - + report(SkippedPreviouslyHandledRecord(record, group, clientId)) } ) .withParallelism(maxParallelism) @@ -675,8 +674,11 @@ object DispatcherMetric { case class InvokingHandlersInParallel(partition: TopicPartition, numHandlers: Int) extends DispatcherMetric - case class SkippedPreviouslyHandledRecord(record: Record, group: Group, clientId: ClientId, attributes: Map[String, String]) - extends DispatcherMetric + case class SkippedPreviouslyHandledRecord( + record: Record, + group: Group, + clientId: ClientId + ) extends DispatcherMetric } From e57a8bd454bcba658b31278df728f01f42dfa54f Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Thu, 27 Jul 2023 13:58:31 +0300 Subject: [PATCH 36/52] [greyhound] change onPartitionsAssigned result to DelayedRebalanceEffect (#36206) GitOrigin-RevId: 43388b3fc6c16782373261553029c86e5111286d --- .../greyhound/core/consumer/Consumer.scala | 1 + .../greyhound/core/consumer/EventLoop.scala | 6 +++-- .../core/consumer/RebalanceListener.scala | 27 ++++++++++++------- .../core/consumer/RecordConsumer.scala | 4 +-- .../core/consumer/ReportingConsumer.scala | 2 +- .../core/consumer/batched/BatchConsumer.scala | 10 ++++--- .../consumer/batched/BatchEventLoop.scala | 4 +-- .../core/consumer/RebalanceListenerTest.scala | 4 +-- 8 files changed, 36 insertions(+), 22 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala index 480994af..70fe937c 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala @@ -289,6 +289,7 @@ object Consumer { rebalanceListener.onPartitionsAssigned(consumer, assigned) ) .getOrThrowFiberFailure() + .run() } } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala index d5aaa3b5..7c81308e 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala @@ -222,8 +222,10 @@ object EventLoop { } yield delayedRebalanceEffect } - override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])(implicit trace: Trace): UIO[Any] = - partitionsAssigned.succeed(partitions) + override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])( + implicit trace: Trace + ): UIO[DelayedRebalanceEffect] = + 
partitionsAssigned.succeed(partitions).as(DelayedRebalanceEffect.unit) } } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RebalanceListener.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RebalanceListener.scala index e08433ca..00bfd8cf 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RebalanceListener.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RebalanceListener.scala @@ -5,7 +5,7 @@ import zio.{Tag, Trace, UIO, URIO, ZEnvironment, ZIO, ZLayer} trait RebalanceListener[-R] { self => def onPartitionsRevoked(consumer: Consumer, partitions: Set[TopicPartition])(implicit trace: Trace): URIO[R, DelayedRebalanceEffect] - def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])(implicit trace: Trace): URIO[R, Any] + def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])(implicit trace: Trace): URIO[R, DelayedRebalanceEffect] def *>[R1](other: RebalanceListener[R1]) = new RebalanceListener[R with R1] { override def onPartitionsRevoked(consumer: Consumer, partitions: Set[TopicPartition])( @@ -16,7 +16,9 @@ trait RebalanceListener[-R] { self => ef2 <- other.onPartitionsRevoked(consumer, partitions) } yield ef1 *> ef2 - override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])(implicit trace: Trace): URIO[R with R1, Any] = + override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])( + implicit trace: Trace + ): URIO[R with R1, DelayedRebalanceEffect] = self.onPartitionsAssigned(consumer, partitions) *> other.onPartitionsAssigned(consumer, partitions) } @@ -25,7 +27,9 @@ trait RebalanceListener[-R] { self => implicit trace: Trace ): URIO[Any, DelayedRebalanceEffect] = self.onPartitionsRevoked(consumer, partitions).provide(ZLayer.succeed(r)) - override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])(implicit trace: Trace): URIO[Any, Any] = + override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])( + implicit trace: Trace + ): URIO[Any, DelayedRebalanceEffect] = self.onPartitionsAssigned(consumer, partitions).provide(ZLayer.succeed(r)) } @@ -34,7 +38,9 @@ trait RebalanceListener[-R] { self => implicit trace: Trace ): URIO[Any, DelayedRebalanceEffect] = self.onPartitionsRevoked(consumer, partitions).provideEnvironment(r) - override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])(implicit trace: Trace): URIO[Any, Any] = + override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])( + implicit trace: Trace + ): URIO[Any, DelayedRebalanceEffect] = self.onPartitionsAssigned(consumer, partitions).provideEnvironment(r) } } @@ -89,8 +95,10 @@ object RebalanceListener { implicit trace: Trace ): URIO[R, DelayedRebalanceEffect] = onRevoked(consumer, partitions).as(DelayedRebalanceEffect.unit) - override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])(implicit trace: Trace): URIO[R, Any] = - onAssigned(consumer, partitions) + override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])( + implicit trace: Trace + ): URIO[R, DelayedRebalanceEffect] = + onAssigned(consumer, partitions).as(DelayedRebalanceEffect.unit) } def apply( @@ -102,8 +110,9 @@ object RebalanceListener { implicit trace: Trace ): UIO[DelayedRebalanceEffect] = onRevoked(partitions).as(DelayedRebalanceEffect.unit) - override def onPartitionsAssigned(consumer: Consumer, 
partitions: Set[TopicPartition])(implicit trace: Trace): UIO[Any] = onAssigned( - partitions - ) + override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])( + implicit trace: Trace + ): UIO[DelayedRebalanceEffect] = + onAssigned(partitions).as(DelayedRebalanceEffect.unit) } } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala index 19c05ee1..9ea37db3 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala @@ -168,12 +168,12 @@ object RecordConsumer { override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])( implicit trace: Trace - ): URIO[R1, Any] = + ): URIO[R1, DelayedRebalanceEffect] = for { allAssigned <- assigned.updateAndGet(_ => partitions) _ <- consumerSubscriptionRef.set(subscription) _ <- promise.succeed(allAssigned) - } yield () + } yield DelayedRebalanceEffect.unit } _ <- subscribe[R1](subscription, rebalanceListener)(consumer) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala index d6f122a8..8a260a9a 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala @@ -50,7 +50,7 @@ case class ReportingConsumer(clientId: ClientId, group: Group, internal: Consume } .map(_._2)).provideEnvironment(r) - override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])(implicit trace: Trace): UIO[Any] = + override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])(implicit trace: Trace): UIO[DelayedRebalanceEffect] = (report(PartitionsAssigned(clientId, group, partitions, config.consumerAttributes)) *> rebalanceListener.onPartitionsAssigned(consumer, partitions)).provideEnvironment(r) } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchConsumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchConsumer.scala index 053439a6..3d92e85a 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchConsumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchConsumer.scala @@ -91,12 +91,12 @@ object BatchConsumer { override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])( implicit trace: Trace - ): URIO[R1, Any] = + ): URIO[R1, DelayedRebalanceEffect] = for { allAssigned <- assigned.updateAndGet(_ => partitions) _ <- consumerSubscriptionRef.set(subscription) _ <- promise.succeed(allAssigned) - } yield () + } yield DelayedRebalanceEffect.unit } _ <- subscribe[R1](subscription, rebalanceListener)(consumer) resubscribeTimeout = config.resubscribeTimeout @@ -157,8 +157,10 @@ object BatchConsumer { ): URIO[Any, DelayedRebalanceEffect] = assignments.update(_ -- partitions).as(DelayedRebalanceEffect.unit) - override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])(implicit trace: Trace): URIO[Any, Any] = - assignments.update(_ ++ partitions) + override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])( + implicit trace: Trace + ): URIO[Any, DelayedRebalanceEffect] = + 
assignments.update(_ ++ partitions).as(DelayedRebalanceEffect.unit) } } } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchEventLoop.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchEventLoop.scala index 02c3564c..b166fbf4 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchEventLoop.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchEventLoop.scala @@ -243,8 +243,8 @@ object BatchEventLoop { state.partitionsRevoked(partitions).as(DelayedRebalanceEffect.unit) } - override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])(implicit trace: Trace): UIO[Any] = - partitionsAssigned.succeed(()) + override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])(implicit trace: Trace): UIO[DelayedRebalanceEffect] = + partitionsAssigned.succeed(()).as(DelayedRebalanceEffect.unit) } } diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/RebalanceListenerTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/RebalanceListenerTest.scala index d1ad0e44..3e028df3 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/RebalanceListenerTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/RebalanceListenerTest.scala @@ -24,8 +24,8 @@ class RebalanceListenerTest extends JUnitRunnableSpec { log(s"$id.revoke $partitions").as(DelayedRebalanceEffect(unsafeLog(s"$id.revoke.tle $partitions"))) override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])( implicit trace: Trace - ): URIO[Any, Any] = - log(s"$id.assigned $partitions") + ): URIO[Any, DelayedRebalanceEffect] = + log(s"$id.assigned $partitions").as(DelayedRebalanceEffect(unsafeLog(s"$id.assigned.tle $partitions"))) } l1l2 = listener("l1") *> listener("l2") partitions = Set(TopicPartition("topic", 0)) From 00d10db17479f1cd62bf37fb3839bcd995436c31 Mon Sep 17 00:00:00 2001 From: Leon Burdinov Date: Sun, 30 Jul 2023 12:47:55 +0300 Subject: [PATCH 37/52] fix batch consumer test timeout interruption (#36246) GitOrigin-RevId: 3332f5e1405eba5397bfcc93174b0415f2220703 --- .../wixpress/dst/greyhound/core/batched/BatchedConsumerIT.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/it/scala/com/wixpress/dst/greyhound/core/batched/BatchedConsumerIT.scala b/core/src/it/scala/com/wixpress/dst/greyhound/core/batched/BatchedConsumerIT.scala index e059aecf..6cd9e01e 100644 --- a/core/src/it/scala/com/wixpress/dst/greyhound/core/batched/BatchedConsumerIT.scala +++ b/core/src/it/scala/com/wixpress/dst/greyhound/core/batched/BatchedConsumerIT.scala @@ -212,6 +212,7 @@ class BatchedConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { ) *> barrier .offer(()) + .interruptible // we can't block here, otherwise rebalance won't happen - so we just fail .timeoutFail(new RuntimeException("queue full"))(1.second) .tapError(_ => ZIO.succeed(println(s"[$id] timed out waiting on barrier"))) From e2fdcb559c421b0b8f88ce3e0e94fc7331521603 Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Mon, 31 Jul 2023 17:51:13 +0300 Subject: [PATCH 38/52] [greyhound] parallel consumer - fix skip logic (#36281) GitOrigin-RevId: 8fe84370c72e4bf864708f42ed0e6fe435cddc92 --- .../greyhound/core/consumer/Dispatcher.scala | 2 +- .../consumer/dispatcher/DispatcherTest.scala | 32 ++++++++++++++++++- 2 files changed, 32 insertions(+), 2 
deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala index 08034fec..eeeec8d6 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala @@ -510,7 +510,7 @@ object Dispatcher { private def shouldRecordBeHandled(record: Record, gaps: Map[TopicPartition, OffsetAndGaps]): Boolean = { gaps.get(TopicPartition(record.topic, record.partition)) match { case Some(offsetAndGapsForPartition) if offsetAndGapsForPartition.gaps.nonEmpty => - record.offset > offsetAndGapsForPartition.offset || offsetAndGapsForPartition.gaps.exists(_.contains(record.offset)) + record.offset >= offsetAndGapsForPartition.offset || offsetAndGapsForPartition.gaps.exists(_.contains(record.offset)) case _ => true } } diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala index 88bc7d4f..aa2fb6db 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala @@ -5,7 +5,7 @@ import com.wixpress.dst.greyhound.core.consumer.DispatcherMetric.RecordHandled import com.wixpress.dst.greyhound.core.consumer.RecordConsumer.Env import com.wixpress.dst.greyhound.core.consumer.SubmitResult.Rejected import com.wixpress.dst.greyhound.core.consumer.domain.ConsumerRecord -import com.wixpress.dst.greyhound.core.consumer.{Dispatcher, SubmitResult} +import com.wixpress.dst.greyhound.core.consumer.{Dispatcher, Gap, OffsetAndGaps, SubmitResult} import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetric import com.wixpress.dst.greyhound.core.testkit._ import com.wixpress.dst.greyhound.core.zioutils.AwaitShutdown.ShutdownPromise @@ -108,6 +108,36 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { } yield ok) // if execution is not parallel, the latch will not be released } + "consume records with parallel consumer when prior committed offset and gaps exist" in + new ctx { + val recordOffset2 = ConsumerRecord[Chunk[Byte], Chunk[Byte]](topic, partition, 2L, Headers.Empty, None, Chunk.empty, 0L, 0L, 0L, "") + val recordOffset3 = ConsumerRecord[Chunk[Byte], Chunk[Byte]](topic, partition, 3L, Headers.Empty, None, Chunk.empty, 0L, 0L, 0L, "") + + val existingOffsetAndGap = Map( + TopicPartition(topic, partition) -> OffsetAndGaps(2L, Seq(Gap(0, 0))) + ) // simulate following situation: only offset 1 was consumed, so 0 is a gap + + run(for { + handled <- Ref.make[Int](0) + ref <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) + init <- getInit + + dispatcher <- Dispatcher.make( + "group", + "clientId", + _ => handled.update(_ + 1), + lowWatermark, + highWatermark, + workersShutdownRef = ref, + consumeInParallel = true, + currentGaps = _ => ZIO.succeed(existingOffsetAndGap), + init = init + ) + _ <- submitBatch(dispatcher, Seq(recordOffset2, recordOffset3)) + numHandled <- handled.get + } yield numHandled must equalTo(2)) + } + "reject records when high watermark is reached" in new ctx() { run(for { From d8420ab072e7699e953cdc4df9fe1fb660b57fb1 Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Thu, 3 Aug 2023 18:54:14 +0300 Subject: [PATCH 39/52] 
[greyhound] parallel consumer - improve visibility (#36346) GitOrigin-RevId: 8f75ec7018fc2b8723e22f70b42c1bf7f8404b99 --- .../com/wixpress/dst/greyhound/core/consumer/EventLoop.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala index 7c81308e..254b83b3 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala @@ -333,7 +333,7 @@ object EventLoop { val offsetsAndMetadataToCommit = OffsetsAndGaps.toOffsetsAndMetadata(committable) consumer .commitWithMetadata(offsetsAndMetadataToCommit) - .tap(_ => ZIO.when(offsetsAndMetadataToCommit.nonEmpty)(report(CommittedOffsetsAndMetadata(offsetsAndMetadataToCommit)))) + .tap(_ => ZIO.when(offsetsAndMetadataToCommit.nonEmpty)(report(CommittedOffsetsAndGaps(committable)))) .catchAll { t => report(FailedToCommitOffsetsAndMetadata(t, offsetsAndMetadataToCommit)) *> offsetsAndGaps.setCommittable(committable) } @@ -480,7 +480,7 @@ object EventLoopMetric { attributes: Map[String, String] ) extends EventLoopMetric - case class CommittedOffsetsAndMetadata(offsetsAndMetadata: Map[TopicPartition, OffsetAndMetadata]) extends EventLoopMetric + case class CommittedOffsetsAndGaps(offsetsAndGaps: Map[TopicPartition, OffsetAndGaps]) extends EventLoopMetric case class FailedToCommitOffsetsAndMetadata(t: Throwable, offsetsAndMetadata: Map[TopicPartition, OffsetAndMetadata]) extends EventLoopMetric From 3af93b91afb991b6d9c549ef61637b19ef68c73f Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Wed, 9 Aug 2023 11:05:21 +0300 Subject: [PATCH 40/52] [greyhound] parallel consumer - init gaps on every rebalance (#36375) GitOrigin-RevId: 5b55ff311413a0ca90f0c66b7b5be0f12c8d3582 --- .../core/parallel/ParallelConsumerIT.scala | 45 +++++++----- .../greyhound/core/consumer/Consumer.scala | 16 +++++ .../greyhound/core/consumer/Dispatcher.scala | 20 +----- .../greyhound/core/consumer/EventLoop.scala | 70 ++++++++----------- .../core/consumer/RecordConsumer.scala | 11 +++ .../core/consumer/ReportingConsumer.scala | 11 ++- .../core/consumer/EventLoopTest.scala | 4 ++ .../consumer/dispatcher/DispatcherTest.scala | 50 ++++--------- 8 files changed, 111 insertions(+), 116 deletions(-) diff --git a/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala b/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala index eaa52c02..b7391d0e 100644 --- a/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala +++ b/core/src/it/scala/com/wixpress/dst/greyhound/core/parallel/ParallelConsumerIT.scala @@ -155,30 +155,43 @@ class ParallelConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { for { r <- getShared TestResources(kafka, producer) = r - topic <- kafka.createRandomTopic() + topic <- kafka.createRandomTopic(partitions = 1) group <- randomGroup cId <- clientId + tp = TopicPartition(topic, 0) regularConfig = configFor(kafka, group, Set(topic)) parallelConfig = parallelConsumerConfig(kafka, topic, group, cId) // same group name for both consumers - queue <- Queue.unbounded[ConsumerRecord[String, String]] - regularHandler = RecordHandler((cr: ConsumerRecord[String, String]) => queue.offer(cr)).withDeserializers(StringSerde, StringSerde) + handledOffsets <- 
Ref.make[Seq[Long]](Seq.empty) // keep track of handled offsets to make sure no duplicates are processed + regularHandler = RecordHandler((cr: ConsumerRecord[String, String]) => handledOffsets.update(_ :+ cr.offset)) + .withDeserializers(StringSerde, StringSerde) longRunningHandler = RecordHandler((cr: ConsumerRecord[String, String]) => - (if (cr.offset % 2 == 0) ZIO.sleep(2.seconds) else ZIO.unit) *> queue.offer(cr) + (if (cr.offset % 2 == 0) ZIO.sleep(2.seconds) else ZIO.unit) *> handledOffsets.update(_ :+ cr.offset) ).withDeserializers(StringSerde, StringSerde) - records1 = producerRecords(topic, "1", partitions, 20) - records2 = producerRecords(topic, "2", partitions, 20) - numMessages = records1.size + records2.size - - _ <- RecordConsumer.make(regularConfig, regularHandler) - _ <- produceRecords(producer, records1) - _ <- eventuallyZ(queue.size)(_ == records1.size) - _ <- ZIO.sleep(10.seconds) - _ <- RecordConsumer.make(parallelConfig, longRunningHandler).delay(3.seconds) - _ <- produceRecords(producer, records2) - _ <- ZIO.sleep(10.seconds) - _ <- eventuallyZ(queue.size, timeout = 20.seconds)(_ == numMessages) + records1 = producerRecords(topic, "1", 1, 10) + records2 = producerRecords(topic, "2", 1, 10) + records3 = producerRecords(topic, "3", 1, 10) + records4 = producerRecords(topic, "4", 1, 10) + + regularConsumer1 <- RecordConsumer.make(regularConfig, regularHandler) + _ <- produceRecords(producer, records1) + _ <- eventuallyZ(handledOffsets.get)(_.sorted == records1.indices.map(_.toLong).sorted) + _ <- regularConsumer1.shutdown() + parallelConsumer1 <- RecordConsumer.make(parallelConfig, longRunningHandler) + parallelConsumer2 <- RecordConsumer.make(parallelConfig, longRunningHandler) + _ <- produceRecords(producer, records2) + _ <- eventuallyZ(handledOffsets.get, timeout = 10.seconds)(_.sorted == (records1 ++ records2).indices.map(_.toLong).sorted) + parallelConsumer3 <- RecordConsumer.make(parallelConfig, longRunningHandler).delay(5.seconds) + _ <- parallelConsumer1.shutdown() zipPar parallelConsumer2.shutdown() + _ <- produceRecords(producer, records3) + _ <- + eventuallyZ(handledOffsets.get, timeout = 10.seconds)(_.sorted == (records1 ++ records2 ++ records3).indices.map(_.toLong).sorted) + _ <- produceRecords(producer, records4) + _ <- eventuallyZ(handledOffsets.get, timeout = 10.seconds)( + _.sorted == (records1 ++ records2 ++ records3 ++ records4).indices.map(_.toLong).sorted + ) + _ <- eventuallyZ(parallelConsumer3.committedOffsetsAndGaps(Set(tp)), timeout = 10.seconds)(_.get(tp).exists(_.gaps.isEmpty)) } yield ok } } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala index 70fe937c..909682b7 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala @@ -47,6 +47,8 @@ trait Consumer { def commitOnRebalance(offsets: Map[TopicPartition, Offset])(implicit trace: Trace): RIO[GreyhoundMetrics, DelayedRebalanceEffect] + def committedOffsetsAndMetadataOnRebalance(partitions: Set[TopicPartition])(implicit trace: Trace): Map[TopicPartition, OffsetAndMetadata] + def commitWithMetadataOnRebalance(offsets: Map[TopicPartition, OffsetAndMetadata])( implicit trace: Trace ): RIO[GreyhoundMetrics, DelayedRebalanceEffect] @@ -187,6 +189,20 @@ object Consumer { ZIO.succeed(DelayedRebalanceEffect(consumer.commitSync(kOffsets))) } + override def 
committedOffsetsAndMetadataOnRebalance(partitions: Set[TopicPartition])( + implicit trace: Trace + ): Map[TopicPartition, OffsetAndMetadata] = { + // unsafe function - should only be called from a RebalanceListener + consumer + .committed(kafkaPartitions(partitions)) + .asScala + .collect { + case (tp: KafkaTopicPartition, om: KafkaOffsetAndMetadata) => + (TopicPartition(tp), OffsetAndMetadata(om.offset, om.metadata)) + } + .toMap + } + override def commitWithMetadataOnRebalance( offsets: Map[TopicPartition, OffsetAndMetadata] )(implicit trace: Trace): RIO[GreyhoundMetrics, DelayedRebalanceEffect] = diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala index eeeec8d6..e377b35c 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala @@ -56,14 +56,11 @@ object Dispatcher { maxParallelism: Int = 1, updateBatch: Chunk[Record] => URIO[GreyhoundMetrics, Unit] = _ => ZIO.unit, currentGaps: Set[TopicPartition] => ZIO[GreyhoundMetrics, Nothing, Map[TopicPartition, OffsetAndGaps]] = _ => ZIO.succeed(Map.empty), - gapsSizeLimit: Int = 500, - init: Promise[Nothing, Unit] + gapsSizeLimit: Int = 500 )(implicit trace: Trace): UIO[Dispatcher[R]] = for { p <- Promise.make[Nothing, Unit] state <- Ref.make[DispatcherState](if (startPaused) DispatcherState.Paused(p) else DispatcherState.Running) - initState <- - Ref.make[DispatcherInitState](if (consumeInParallel) DispatcherInitState.NotInitialized else DispatcherInitState.Initialized) workers <- Ref.make(Map.empty[TopicPartition, Worker]) } yield new Dispatcher[R] { override def submit(record: Record): URIO[R with Env, SubmitResult] = @@ -77,11 +74,6 @@ object Dispatcher { override def submitBatch(records: Records): URIO[R with Env, SubmitResult] = for { _ <- report(SubmittingRecordBatch(group, clientId, records.size, records, consumerAttributes)) - currentInitState <- initState.get - _ <- currentInitState match { - case DispatcherInitState.NotInitialized => init.await *> initState.set(DispatcherInitState.Initialized) - case _ => ZIO.unit - } allSamePartition = records.map(r => RecordTopicPartition(r)).distinct.size == 1 submitResult <- if (allSamePartition) { val partition = RecordTopicPartition(records.head) @@ -225,16 +217,6 @@ object Dispatcher { } - sealed trait DispatcherInitState - - object DispatcherInitState { - - case object NotInitialized extends DispatcherInitState - - case object Initialized extends DispatcherInitState - - } - case class Task(record: Record, complete: UIO[Unit]) trait Worker { diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala index 254b83b3..520acdc1 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala @@ -46,7 +46,6 @@ object EventLoop { updateBatch = { records: Chunk[Record] => report(HandledBatch(records)) *> updateGapsByBatch(records, offsetsAndGaps) } currentGaps = { partitions: Set[TopicPartition] => offsetsAndGaps.offsetsAndGapsForPartitions(partitions) } _ <- report(CreatingDispatcher(clientId, group, consumerAttributes, config.startPaused)) - offsetsAndGapsInit <- Promise.make[Nothing, Unit] dispatcher <- Dispatcher.make( group, clientId, @@ -62,8 +61,7 
@@ object EventLoop { config.maxParallelism, updateBatch, currentGaps, - config.gapsSizeLimit, - offsetsAndGapsInit + config.gapsSizeLimit ) positionsRef <- Ref.make(Map.empty[TopicPartition, Offset]) pausedPartitionsRef <- Ref.make(Set.empty[TopicPartition]) @@ -90,19 +88,6 @@ object EventLoop { .forkDaemon _ <- report(AwaitingPartitionsAssignment(clientId, group, consumerAttributes)) partitions <- partitionsAssigned.await - _ <- if (config.consumePartitionInParallel) { - report(AwaitingOffsetsAndGapsInit(clientId, group, consumerAttributes)) *> - initializeOffsetsAndGaps( // we must preform init in the main thread ant not in the rebalance listener as it involves calling SDK - offsetsAndGaps, - partitions, - consumer, - clientId, - group, - consumerAttributes, - offsetsAndGapsInit - ) *> offsetsAndGapsInit.await - - } else offsetsAndGapsInit.succeed() env <- ZIO.environment[Env] } yield (dispatcher, fiber, offsets, positionsRef, running, rebalanceListener.provideEnvironment(env)) @@ -224,8 +209,17 @@ object EventLoop { override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])( implicit trace: Trace - ): UIO[DelayedRebalanceEffect] = - partitionsAssigned.succeed(partitions).as(DelayedRebalanceEffect.unit) + ): URIO[GreyhoundMetrics, DelayedRebalanceEffect] = { + for { + delayedRebalanceEffect <- + if (useParallelConsumer) + initOffsetsAndGapsOnRebalance(partitions, consumer0, offsetsAndGaps).catchAll { t => + report(FailedToUpdateGapsOnPartitionAssignment(partitions, t)).as(DelayedRebalanceEffect.unit) + } + else DelayedRebalanceEffect.zioUnit + _ <- partitionsAssigned.succeed(partitions) + } yield delayedRebalanceEffect + } } } @@ -263,24 +257,24 @@ object EventLoop { _ <- pausedRef.update(_ => pausedTopics) } yield records - private def initializeOffsetsAndGaps( - offsetsAndGaps: OffsetsAndGaps, + private def initOffsetsAndGapsOnRebalance( partitions: Set[TopicPartition], consumer: Consumer, - clientId: ClientId, - group: Group, - attributes: Map[String, String], - offsetsAndGapsInit: Promise[Nothing, Unit] - ) = for { - committedOffsetsAndMetadata <- consumer.committedOffsetsAndMetadata(partitions) - initialOffsetsAndGaps = - committedOffsetsAndMetadata.mapValues(om => - OffsetsAndGaps.parseGapsString(om.metadata).fold(OffsetAndGaps(om.offset - 1, committable = false))(identity) - ) - _ <- offsetsAndGaps.init(initialOffsetsAndGaps) - _ <- report(InitializedOffsetsAndGaps(clientId, group, initialOffsetsAndGaps, attributes)) - _ <- offsetsAndGapsInit.succeed(()) - } yield () + offsetsAndGaps: OffsetsAndGaps + ): RIO[GreyhoundMetrics, DelayedRebalanceEffect] = { + ZIO.runtime[GreyhoundMetrics].map { rt => + DelayedRebalanceEffect { + val committed = committedOffsetsAndGaps(consumer, partitions) + zio.Unsafe.unsafe { implicit s => rt.unsafe.run(offsetsAndGaps.init(committed)) } + } + } + } + + private def committedOffsetsAndGaps(consumer: Consumer, partitions: Set[TopicPartition]): Map[TopicPartition, OffsetAndGaps] = { + consumer + .committedOffsetsAndMetadataOnRebalance(partitions) + .mapValues(om => OffsetsAndGaps.parseGapsString(om.metadata).fold(OffsetAndGaps(om.offset - 1, committable = false))(identity)) + } private def submitRecordsSequentially[R2, R1]( consumer: Consumer, @@ -450,13 +444,7 @@ object EventLoopMetric { case class FailedToUpdatePositions(t: Throwable, clientId: ClientId, attributes: Map[String, String] = Map.empty) extends EventLoopMetric - case class FailedToUpdateGapsOnPartitionAssignment( - t: Throwable, - clientId: ClientId, - group: 
Group, - partitions: Set[TopicPartition], - attributes: Map[String, String] = Map.empty - ) extends EventLoopMetric + case class FailedToUpdateGapsOnPartitionAssignment(partitions: Set[TopicPartition], t: Throwable) extends EventLoopMetric case class FailedToFetchCommittedGaps(t: Throwable, clientId: ClientId, attributes: Map[String, String] = Map.empty) extends EventLoopMetric diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala index 9ea37db3..ed3028f0 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala @@ -46,6 +46,8 @@ trait RecordConsumer[-R] extends Resource[R] with RecordConsumerProperties[Recor def committedOffsets(partitions: Set[TopicPartition]): RIO[Env, Map[TopicPartition, Offset]] + def committedOffsetsAndGaps(partitions: Set[TopicPartition]): RIO[Env, Map[TopicPartition, OffsetAndGaps]] + def waitForCurrentRecordsCompletion: URIO[Any, Unit] def offsetsForTimes(topicPartitionsOnTimestamp: Map[TopicPartition, Long]): RIO[Any, Map[TopicPartition, Offset]] @@ -139,6 +141,15 @@ object RecordConsumer { override def committedOffsets(partitions: Set[TopicPartition]): RIO[Env, Map[TopicPartition, Offset]] = consumer.committedOffsets(partitions) + override def committedOffsetsAndGaps(partitions: Set[TopicPartition]): RIO[Env, Map[TopicPartition, OffsetAndGaps]] = + consumer + .committedOffsetsAndMetadata(partitions) + .map( + _.mapValues(om => + OffsetsAndGaps.parseGapsString(om.metadata).fold(OffsetAndGaps(om.offset - 1, committable = false))(identity) + ) + ) + override def waitForCurrentRecordsCompletion: URIO[Any, Unit] = eventLoop.waitForCurrentRecordsCompletion override def state(implicit trace: Trace): UIO[RecordConsumerExposedState] = for { diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala index 8a260a9a..178d2132 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/ReportingConsumer.scala @@ -50,7 +50,9 @@ case class ReportingConsumer(clientId: ClientId, group: Group, internal: Consume } .map(_._2)).provideEnvironment(r) - override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])(implicit trace: Trace): UIO[DelayedRebalanceEffect] = + override def onPartitionsAssigned(consumer: Consumer, partitions: Set[TopicPartition])( + implicit trace: Trace + ): UIO[DelayedRebalanceEffect] = (report(PartitionsAssigned(clientId, group, partitions, config.consumerAttributes)) *> rebalanceListener.onPartitionsAssigned(consumer, partitions)).provideEnvironment(r) } @@ -109,7 +111,12 @@ case class ReportingConsumer(clientId: ClientId, group: Group, internal: Consume } else DelayedRebalanceEffect.zioUnit } - override def commitWithMetadataOnRebalance(offsets: Map[TopicPartition, OffsetAndMetadata] + override def committedOffsetsAndMetadataOnRebalance(partitions: NonEmptySet[TopicPartition])( + implicit trace: Trace + ): Map[TopicPartition, OffsetAndMetadata] = internal.committedOffsetsAndMetadataOnRebalance(partitions) + + override def commitWithMetadataOnRebalance( + offsets: Map[TopicPartition, OffsetAndMetadata] )(implicit trace: Trace): RIO[GreyhoundMetrics, 
DelayedRebalanceEffect] = ZIO.runtime[GreyhoundMetrics].flatMap { runtime => if (offsets.nonEmpty) { diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/EventLoopTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/EventLoopTest.scala index 15b4b1f3..8168a498 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/EventLoopTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/EventLoopTest.scala @@ -149,6 +149,10 @@ trait EmptyConsumer extends Consumer { ): RIO[GreyhoundMetrics, DelayedRebalanceEffect] = DelayedRebalanceEffect.zioUnit + override def committedOffsetsAndMetadataOnRebalance(partitions: Set[TopicPartition])( + implicit trace: Trace + ): Map[TopicPartition, OffsetAndMetadata] = Map.empty + override def commitWithMetadataOnRebalance(offsets: Map[TopicPartition, OffsetAndMetadata])( implicit trace: Trace ): RIO[GreyhoundMetrics, DelayedRebalanceEffect] = diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala index aa2fb6db..51b05d03 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala @@ -29,9 +29,8 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { run(for { promise <- Promise.make[Nothing, Record] ref <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) - init <- getInit dispatcher <- - Dispatcher.make("group", "clientId", promise.succeed, lowWatermark, highWatermark, workersShutdownRef = ref, init = init) + Dispatcher.make("group", "clientId", promise.succeed, lowWatermark, highWatermark, workersShutdownRef = ref) _ <- submit(dispatcher, record) handled <- promise.await } yield handled must equalTo(record)) @@ -45,9 +44,8 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { latch <- CountDownLatch.make(partitions) slowHandler = { _: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => Clock.sleep(1.second) *> latch.countDown } ref <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) - init <- getInit dispatcher <- - Dispatcher.make("group", "clientId", slowHandler, lowWatermark, highWatermark, workersShutdownRef = ref, init = init) + Dispatcher.make("group", "clientId", slowHandler, lowWatermark, highWatermark, workersShutdownRef = ref) _ <- ZIO.foreachDiscard(0 until partitions) { partition => submit(dispatcher, record.copy(partition = partition)) } _ <- TestClock.adjust(1.second) _ <- latch.await @@ -63,7 +61,6 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { latch <- CountDownLatch.make(numKeys) slowHandler = { _: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => Clock.sleep(1.second) *> latch.countDown } ref <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) - init <- getInit dispatcher <- Dispatcher.make( "group", "clientId", @@ -72,8 +69,7 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { highWatermark, workersShutdownRef = ref, consumeInParallel = true, - maxParallelism = 8, - init = init + maxParallelism = 8 ) // produce with unique keys to the same partition _ <- submitBatch(dispatcher, keys.map(key => record.copy(partition = 0, key = key))) @@ -90,7 +86,6 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { latch <- CountDownLatch.make(numRecords) slowHandler = { _: 
ConsumerRecord[Chunk[Byte], Chunk[Byte]] => Clock.sleep(1.second) *> latch.countDown } ref <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) - init <- getInit dispatcher <- Dispatcher.make( "group", "clientId", @@ -99,8 +94,7 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { highWatermark, workersShutdownRef = ref, consumeInParallel = true, - maxParallelism = numRecords, - init = init + maxParallelism = numRecords ) _ <- submitBatch(dispatcher, (1 to numRecords).map(_ => record.copy(partition = 0, key = None))) _ <- TestClock.adjust(1.second) @@ -120,8 +114,6 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { run(for { handled <- Ref.make[Int](0) ref <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) - init <- getInit - dispatcher <- Dispatcher.make( "group", "clientId", @@ -130,8 +122,7 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { highWatermark, workersShutdownRef = ref, consumeInParallel = true, - currentGaps = _ => ZIO.succeed(existingOffsetAndGap), - init = init + currentGaps = _ => ZIO.succeed(existingOffsetAndGap) ) _ <- submitBatch(dispatcher, Seq(recordOffset2, recordOffset3)) numHandled <- handled.get @@ -142,10 +133,9 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { new ctx() { run(for { ref <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) - init <- getInit dispatcher <- Dispatcher - .make[Any]("group", "clientId", _ => ZIO.never, lowWatermark, highWatermark, workersShutdownRef = ref, init = init) + .make[Any]("group", "clientId", _ => ZIO.never, lowWatermark, highWatermark, workersShutdownRef = ref) _ <- submit(dispatcher, record.copy(offset = 0L)) // Will be polled _ <- submit(dispatcher, record.copy(offset = 1L)) _ <- submit(dispatcher, record.copy(offset = 2L)) @@ -160,10 +150,9 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { new ctx(highWatermark = 5) { run(for { ref <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) - init <- getInit dispatcher <- Dispatcher - .make[Any]("group", "clientId", _ => ZIO.never, lowWatermark, highWatermark, workersShutdownRef = ref, init = init) + .make[Any]("group", "clientId", _ => ZIO.never, lowWatermark, highWatermark, workersShutdownRef = ref) records = (0 until 7).map(i => record.copy(offset = i.toLong)) result <- submitBatch(dispatcher, records) } yield result must beEqualTo(SubmitResult.RejectedBatch(record.copy(offset = 5L)))) @@ -174,7 +163,6 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { val gapsSizeLimit = 5 run(for { ref <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) - init <- getInit dispatcher <- Dispatcher .make[Any]( @@ -184,7 +172,6 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { lowWatermark, highWatermark, workersShutdownRef = ref, - init = init, gapsSizeLimit = gapsSizeLimit ) records = (0 until 7).map(i => record.copy(offset = i.toLong)) @@ -198,15 +185,13 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { for { queue <- Queue.bounded[Record](1) ref <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) - init <- getInit dispatcher <- Dispatcher.make[Any]( "group", "clientId", record => queue.offer(record).flatMap(result => ZIO.succeed(println(s"queue.offer result: ${result}"))), lowWatermark, highWatermark, - workersShutdownRef = ref, - init = init + workersShutdownRef = ref ) _ <- ZIO.foreachDiscard(0 to (highWatermark + 1)) { offset => submit( @@ -231,7 +216,6 @@ class DispatcherTest 
extends BaseTest[TestMetrics with TestClock] { for { queue <- Queue.bounded[Record](1) ref <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) - init <- getInit dispatcher <- Dispatcher.make[TestClock]( "group", "clientId", @@ -242,8 +226,7 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { lowWatermark, highWatermark, delayResumeOfPausedPartition = 6500, - workersShutdownRef = ref, - init = init + workersShutdownRef = ref ) _ <- ZIO.foreachDiscard(0 to (highWatermark + 1)) { offset => submit( @@ -297,7 +280,6 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { run(for { ref <- Ref.make(0) workersShutdownRef <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) - init <- getInit dispatcher <- Dispatcher .make[Any]( @@ -306,8 +288,7 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { _ => ref.update(_ + 1), lowWatermark, highWatermark, - workersShutdownRef = workersShutdownRef, - init = init + workersShutdownRef = workersShutdownRef ) _ <- pause(dispatcher) _ <- submit(dispatcher, record) // Will be queued @@ -322,10 +303,9 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { workersShutdownRef <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) promise <- Promise.make[Nothing, Unit] handler = { _: Record => Clock.sleep(1.second) *> ref.update(_ + 1) *> promise.succeed(()) } - init <- getInit dispatcher <- Dispatcher - .make[Any]("group", "clientId", handler, lowWatermark, highWatermark, workersShutdownRef = workersShutdownRef, init = init) + .make[Any]("group", "clientId", handler, lowWatermark, highWatermark, workersShutdownRef = workersShutdownRef) _ <- submit(dispatcher, record) // Will be handled _ <- TestMetrics.reported.flatMap(waitUntilRecordHandled(3.seconds)) _ <- pause(dispatcher) @@ -343,10 +323,9 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { workersShutdownRef <- Ref.make[Map[TopicPartition, ShutdownPromise]](Map.empty) promise <- Promise.make[Nothing, Unit] handler = { _: ConsumerRecord[Chunk[Byte], Chunk[Byte]] => ref.update(_ + 1) *> promise.succeed(()) } - init <- getInit dispatcher <- Dispatcher - .make("group", "clientId", handler, lowWatermark, highWatermark, workersShutdownRef = workersShutdownRef, init = init) + .make("group", "clientId", handler, lowWatermark, highWatermark, workersShutdownRef = workersShutdownRef) _ <- pause(dispatcher) _ <- submit(dispatcher, record) _ <- resume(dispatcher) @@ -388,11 +367,6 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { val record = ConsumerRecord[Chunk[Byte], Chunk[Byte]](topic, partition, 0L, Headers.Empty, None, Chunk.empty, 0L, 0L, 0L, "") def getKeys(numKeys: Int) = (0 until numKeys).map(i => Some(Chunk.fromArray(s"key$i".getBytes))) - - def getInit() = for { - init <- Promise.make[Nothing, Unit] - _ <- init.succeed(()) - } yield init } } From 2abfa3b4c00186cae2465de1a728597bfaaaa0f0 Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Wed, 9 Aug 2023 16:38:02 +0300 Subject: [PATCH 41/52] [greyhound] parallel cosnumer - add init log (#36421) [greyhound] parallel cosnumer - add init log #automerge GitOrigin-RevId: 8e6b0c2f3a877fe4b9805f60a98a814a144b2ae6 --- .../dst/greyhound/core/consumer/EventLoop.scala | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala 
b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala index 520acdc1..52ca661c 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala @@ -213,7 +213,7 @@ object EventLoop { for { delayedRebalanceEffect <- if (useParallelConsumer) - initOffsetsAndGapsOnRebalance(partitions, consumer0, offsetsAndGaps).catchAll { t => + initOffsetsAndGapsOnRebalance(partitions, consumer0, offsetsAndGaps, clientId, group).catchAll { t => report(FailedToUpdateGapsOnPartitionAssignment(partitions, t)).as(DelayedRebalanceEffect.unit) } else DelayedRebalanceEffect.zioUnit @@ -260,12 +260,19 @@ object EventLoop { private def initOffsetsAndGapsOnRebalance( partitions: Set[TopicPartition], consumer: Consumer, - offsetsAndGaps: OffsetsAndGaps + offsetsAndGaps: OffsetsAndGaps, + clientId: ClientId, + group: Group ): RIO[GreyhoundMetrics, DelayedRebalanceEffect] = { ZIO.runtime[GreyhoundMetrics].map { rt => DelayedRebalanceEffect { val committed = committedOffsetsAndGaps(consumer, partitions) - zio.Unsafe.unsafe { implicit s => rt.unsafe.run(offsetsAndGaps.init(committed)) } + zio.Unsafe.unsafe { implicit s => + rt.unsafe.run( + offsetsAndGaps.init(committed) *> + report(InitializedOffsetsAndGaps(clientId, group, committed, consumer.config.consumerAttributes)) + ) + } } } } @@ -458,9 +465,7 @@ object EventLoopMetric { case class CreatingPollOnceFiber(clientId: ClientId, group: Group, attributes: Map[String, String]) extends EventLoopMetric case class AwaitingPartitionsAssignment(clientId: ClientId, group: Group, attributes: Map[String, String]) extends EventLoopMetric - - case class AwaitingOffsetsAndGapsInit(clientId: ClientId, group: Group, attributes: Map[String, String]) extends EventLoopMetric - + case class InitializedOffsetsAndGaps( clientId: ClientId, group: Group, From a27d84ee003ff225b8cbacc4fbab15ff553e81c5 Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Tue, 15 Aug 2023 17:43:28 +0300 Subject: [PATCH 42/52] [greyhound] parallel consumer - fix shutdown (#36489) GitOrigin-RevId: eb0dc817fa50494422fba10994dd4382f1a58585 --- .../dst/greyhound/core/ConsumerIT.scala | 14 +++++++++----- .../dst/greyhound/core/consumer/EventLoop.scala | 17 +++++++++++------ .../consumer/dispatcher/DispatcherTest.scala | 2 +- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/core/src/it/scala/com/wixpress/dst/greyhound/core/ConsumerIT.scala b/core/src/it/scala/com/wixpress/dst/greyhound/core/ConsumerIT.scala index 23f0293d..d2318e82 100644 --- a/core/src/it/scala/com/wixpress/dst/greyhound/core/ConsumerIT.scala +++ b/core/src/it/scala/com/wixpress/dst/greyhound/core/ConsumerIT.scala @@ -1,7 +1,7 @@ package com.wixpress.dst.greyhound.core import com.wixpress.dst.greyhound.core.Serdes._ -import com.wixpress.dst.greyhound.core.consumer.ConsumerMetric.PollingFailed +import com.wixpress.dst.greyhound.core.consumer.ConsumerMetric.{CommittedOffsets, PollingFailed} import com.wixpress.dst.greyhound.core.consumer.EventLoop.Handler import com.wixpress.dst.greyhound.core.consumer.OffsetReset.{Earliest, Latest} import com.wixpress.dst.greyhound.core.consumer._ @@ -318,13 +318,15 @@ class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { } yield test } - s"wait until queues are drained${parallelConsumerString(useParallelConsumer)}" in { + s"wait until queues are drained and commit on 
shutdown${parallelConsumerString(useParallelConsumer)}" in { for { r <- getShared TestResources(kafka, producer) = r _ <- ZIO.debug(">>>> starting test: gracefulShutdownTest") - topic <- kafka.createRandomTopic(prefix = "core-wait-until") + topic <- kafka.createRandomTopic(partitions = 1, prefix = "core-wait-until") group <- randomGroup + cId <- clientId + tp = TopicPartition(topic, 0) ref <- Ref.make(0) startedHandling <- Promise.make[Nothing, Unit] @@ -340,15 +342,17 @@ class ConsumerIT extends BaseTestWithSharedEnv[Env, TestResources] { group, topic, mutateEventLoop = _.copy(consumePartitionInParallel = useParallelConsumer, maxParallelism = 8) - ), + ).copy(clientId = cId), handler ) .flatMap { _ => producer.produce(ProducerRecord(topic, Chunk.empty)) *> startedHandling.await } ) handled <- ref.get + metrics <- TestMetrics.reported } yield { - handled must equalTo(1) + (handled must equalTo(1)) and + (metrics must contain(CommittedOffsets(cId, group, Map(tp -> 1L), calledOnRebalance = false, Map.empty))) } } diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala index 52ca661c..9fd1bba1 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala @@ -89,15 +89,15 @@ object EventLoop { _ <- report(AwaitingPartitionsAssignment(clientId, group, consumerAttributes)) partitions <- partitionsAssigned.await env <- ZIO.environment[Env] - } yield (dispatcher, fiber, offsets, positionsRef, running, rebalanceListener.provideEnvironment(env)) + } yield (dispatcher, fiber, offsets, offsetsAndGaps, positionsRef, running, rebalanceListener.provideEnvironment(env)) start .map { - case (dispatcher, fiber, offsets, positionsRef, running, listener) => + case (dispatcher, fiber, offsets, offsetsAndGaps, positionsRef, running, listener) => new EventLoop[GreyhoundMetrics] { override def stop: URIO[GreyhoundMetrics, Any] = - stopLoop(group, consumer, clientId, consumerAttributes, config, running, fiber, offsets, dispatcher) + stopLoop(group, consumer, clientId, consumerAttributes, config, running, fiber, offsets, offsetsAndGaps, dispatcher) override def pause(implicit trace: Trace): URIO[GreyhoundMetrics, Unit] = (report(PausingEventLoop(clientId, group, consumerAttributes)) *> running.set(Paused) *> dispatcher.pause).unit @@ -132,6 +132,7 @@ object EventLoop { running: Ref[EventLoopState], fiber: Fiber.Runtime[Nothing, Boolean], offsets: Offsets, + offsetsAndGaps: OffsetsAndGaps, dispatcher: Dispatcher[R] ) = for { @@ -139,7 +140,7 @@ object EventLoop { _ <- running.set(ShuttingDown) drained <- (fiber.join *> dispatcher.shutdown).timeout(config.drainTimeout) _ <- ZIO.when(drained.isEmpty)(report(DrainTimeoutExceeded(clientId, group, config.drainTimeout.toMillis, consumerAttributes))) - _ <- commitOffsets(consumer, offsets) + _ <- if (config.consumePartitionInParallel) commitOffsetsAndGaps(consumer, offsetsAndGaps) else commitOffsets(consumer, offsets) } yield () private def updatePositions( @@ -327,7 +328,9 @@ object EventLoop { } private def commitOffsets(consumer: Consumer, offsets: Offsets): URIO[GreyhoundMetrics, Unit] = - offsets.committable.flatMap { committable => consumer.commit(committable).catchAll { _ => offsets.update(committable) } } + offsets.committable.flatMap { committable => + consumer.commit(committable).catchAll { t => report(FailedToCommitOffsets(t, committable)) *> 
offsets.update(committable) } + } private def commitOffsetsAndGaps(consumer: Consumer, offsetsAndGaps: OffsetsAndGaps): URIO[GreyhoundMetrics, Unit] = offsetsAndGaps.getCommittableAndClear.flatMap { committable => @@ -465,7 +468,7 @@ object EventLoopMetric { case class CreatingPollOnceFiber(clientId: ClientId, group: Group, attributes: Map[String, String]) extends EventLoopMetric case class AwaitingPartitionsAssignment(clientId: ClientId, group: Group, attributes: Map[String, String]) extends EventLoopMetric - + case class InitializedOffsetsAndGaps( clientId: ClientId, group: Group, @@ -478,6 +481,8 @@ object EventLoopMetric { case class FailedToCommitOffsetsAndMetadata(t: Throwable, offsetsAndMetadata: Map[TopicPartition, OffsetAndMetadata]) extends EventLoopMetric + case class FailedToCommitOffsets(t: Throwable, offsets: Map[TopicPartition, Offset]) extends EventLoopMetric + case class HandledBatch(records: Records) extends EventLoopMetric } diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala index 51b05d03..bce795a4 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/dispatcher/DispatcherTest.scala @@ -204,7 +204,7 @@ class DispatcherTest extends BaseTest[TestMetrics with TestClock] { ConsumerRecord[Chunk[Byte], Chunk[Byte]](topic, partition, 6L, Headers.Empty, None, Chunk.empty, 0L, 0L, 0L, "") ) // Will be dropped _ <- eventuallyZ(dispatcher.resumeablePartitions(Set(topicPartition)))(_.isEmpty) - _ <- ZIO.foreachDiscard(1 to 4)(_ => queue.take) + _ <- ZIO.foreachDiscard(1 to 5)(_ => queue.take) _ <- eventuallyZ(dispatcher.resumeablePartitions(Set(topicPartition)))(_ == Set(TopicPartition(topic, partition))) } yield ok ) From f61ce82b6e24ce976ca36b7753f0e6a87bec9564 Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Wed, 16 Aug 2023 12:56:02 +0300 Subject: [PATCH 43/52] [greyhound] add shutdown visibility (#36571) [greyhound] add shutdown visibility #automerge GitOrigin-RevId: a65aaecff072d8a19e9e70f68d4da713971bd538 --- .../greyhound/core/consumer/Dispatcher.scala | 14 ++++--- .../greyhound/core/consumer/EventLoop.scala | 37 ++++++++++++++++++- 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala index e377b35c..b4c70f48 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala @@ -138,11 +138,13 @@ object Dispatcher { case state => (ZIO.unit, state) }.flatten - override def shutdown: URIO[GreyhoundMetrics, Unit] = - state.modify(state => (state, DispatcherState.ShuttingDown)).flatMap { - case DispatcherState.Paused(resume) => resume.succeed(()).unit - case _ => ZIO.unit - } *> workers.get.flatMap(shutdownWorkers).ignore + override def shutdown: URIO[GreyhoundMetrics, Unit] = { + report(ShuttingDownDispatcher(group, clientId, consumerAttributes)) *> + state.modify(state => (state, DispatcherState.ShuttingDown)).flatMap { + case DispatcherState.Paused(resume) => resume.succeed(()).unit + case _ => ZIO.unit + } *> workers.get.flatMap(shutdownWorkers).ignore + } /** * This implementation is not 
fiber-safe. Since the worker is used per partition, and all operations performed on a single partition @@ -581,6 +583,8 @@ object DispatcherMetric { attributes: Map[String, String] ) extends DispatcherMetric + case class ShuttingDownDispatcher(group: Group, clientId: ClientId, attributes: Map[String, String]) extends DispatcherMetric + case class WorkerStopped(group: Group, clientId: ClientId, partition: TopicPartition, durationMs: Long, attributes: Map[String, String]) extends DispatcherMetric diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala index 9fd1bba1..b7f251c5 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala @@ -138,11 +138,35 @@ object EventLoop { for { _ <- report(StoppingEventLoop(clientId, group, consumerAttributes)) _ <- running.set(ShuttingDown) - drained <- (fiber.join *> dispatcher.shutdown).timeout(config.drainTimeout) + drained <- + (joinFiberAndReport(group, clientId, consumerAttributes, fiber).interruptible *> + shutdownDispatcherAndReport(group, clientId, consumerAttributes, dispatcher)) + .timeout(config.drainTimeout) _ <- ZIO.when(drained.isEmpty)(report(DrainTimeoutExceeded(clientId, group, config.drainTimeout.toMillis, consumerAttributes))) _ <- if (config.consumePartitionInParallel) commitOffsetsAndGaps(consumer, offsetsAndGaps) else commitOffsets(consumer, offsets) + _ <- report(StoppedEventLoop(clientId, group, consumerAttributes)) } yield () + private def shutdownDispatcherAndReport[R]( + group: Group, + clientId: ClientId, + consumerAttributes: Map[Group, Group], + dispatcher: Dispatcher[R] + ) = + dispatcher.shutdown.timed + .map(_._1) + .flatMap(duration => report(DispatcherStopped(clientId, group, duration.toMillis, consumerAttributes))) + + private def joinFiberAndReport[R]( + group: Group, + clientId: ClientId, + consumerAttributes: Map[Group, Group], + fiber: Fiber.Runtime[Nothing, Boolean] + ) = + fiber.join.timed + .map(_._1) + .flatMap(duration => report(JoinedPollOnceFiberBeforeDispatcherShutdown(clientId, group, duration.toMillis, consumerAttributes))) + private def updatePositions( records: Consumer.Records, positionsRef: Ref[Map[TopicPartition, Offset]], @@ -434,6 +458,17 @@ object EventLoopMetric { case class StoppingEventLoop(clientId: ClientId, group: Group, attributes: Map[String, String] = Map.empty) extends EventLoopMetric + case class StoppedEventLoop(clientId: ClientId, group: Group, attributes: Map[String, String] = Map.empty) extends EventLoopMetric + + case class JoinedPollOnceFiberBeforeDispatcherShutdown( + clientId: ClientId, + group: Group, + durationMs: Long, + attributes: Map[String, String] = Map.empty + ) extends EventLoopMetric + + case class DispatcherStopped(group: Group, clientId: ClientId, durationMs: Long, attributes: Map[String, String]) extends EventLoopMetric + case class DrainTimeoutExceeded(clientId: ClientId, group: Group, timeoutMs: Long, attributes: Map[String, String] = Map.empty) extends EventLoopMetric From 94e8131f3764c3bcf8eac2488db214b21789fefe Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Sun, 20 Aug 2023 13:45:48 +0300 Subject: [PATCH 44/52] [greyhound] parallel consumer - interrupt fiber on shutdown timeout (#36642) GitOrigin-RevId: aaccb23bdc18584d42b2240e5b5e63c7d9432838 --- 
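Editor's note — the sketch below is an illustrative condensation, not the library's code: it shows the drain-then-commit shape the shutdown path converges on across patches 42-45 in this series. The parameters stand in for the real EventLoop pieces (the pollOnce loop fiber, Dispatcher.shutdown, commitOffsetsAndGaps and the event-loop config's drainTimeout); the patch below adds the explicit fiber interrupt on a timed-out drain, and the one after it adds the .disconnect.

import zio._

// Give records that were already polled up to drainTimeout to finish handling,
// then commit whatever became committable, even if the drain timed out.
def drainAndCommit(
  pollFiber: Fiber.Runtime[Nothing, Boolean], // the pollOnce loop, forked as a daemon
  shutdownDispatcher: UIO[Unit],              // waits for in-flight records, then stops the workers
  commit: UIO[Unit],                          // commits offsets (and gaps, for the parallel consumer)
  drainTimeout: Duration
): UIO[Unit] =
  for {
    drained <- (pollFiber.join *> shutdownDispatcher).disconnect.interruptible.timeout(drainTimeout)
    // a timed-out drain must not leave the poll loop running behind the event loop
    _       <- ZIO.when(drained.isEmpty)(pollFiber.interruptFork)
    _       <- commit
  } yield ()

The .interruptible/.disconnect pair is what lets the timeout give up on a stuck drain promptly instead of blocking on it forever.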
.../greyhound/core/consumer/EventLoop.scala | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala index b7f251c5..1384792f 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala @@ -85,6 +85,7 @@ object EventLoop { _ <- report(CreatingPollOnceFiber(clientId, group, consumerAttributes)) fiber <- pollOnce(running, consumer, dispatcher, pausedPartitionsRef, positionsRef, offsets, config, clientId, group, offsetsAndGaps) .repeatWhile(_ == true) + .interruptible .forkDaemon _ <- report(AwaitingPartitionsAssignment(clientId, group, consumerAttributes)) partitions <- partitionsAssigned.await @@ -142,7 +143,9 @@ object EventLoop { (joinFiberAndReport(group, clientId, consumerAttributes, fiber).interruptible *> shutdownDispatcherAndReport(group, clientId, consumerAttributes, dispatcher)) .timeout(config.drainTimeout) - _ <- ZIO.when(drained.isEmpty)(report(DrainTimeoutExceeded(clientId, group, config.drainTimeout.toMillis, consumerAttributes))) + _ <- ZIO.when(drained.isEmpty)( + report(DrainTimeoutExceeded(clientId, group, config.drainTimeout.toMillis, consumerAttributes)) *> fiber.interruptFork + ) _ <- if (config.consumePartitionInParallel) commitOffsetsAndGaps(consumer, offsetsAndGaps) else commitOffsets(consumer, offsets) _ <- report(StoppedEventLoop(clientId, group, consumerAttributes)) } yield () @@ -356,16 +359,18 @@ object EventLoop { consumer.commit(committable).catchAll { t => report(FailedToCommitOffsets(t, committable)) *> offsets.update(committable) } } - private def commitOffsetsAndGaps(consumer: Consumer, offsetsAndGaps: OffsetsAndGaps): URIO[GreyhoundMetrics, Unit] = + private def commitOffsetsAndGaps(consumer: Consumer, offsetsAndGaps: OffsetsAndGaps): URIO[GreyhoundMetrics, Unit] = { offsetsAndGaps.getCommittableAndClear.flatMap { committable => val offsetsAndMetadataToCommit = OffsetsAndGaps.toOffsetsAndMetadata(committable) - consumer - .commitWithMetadata(offsetsAndMetadataToCommit) - .tap(_ => ZIO.when(offsetsAndMetadataToCommit.nonEmpty)(report(CommittedOffsetsAndGaps(committable)))) - .catchAll { t => - report(FailedToCommitOffsetsAndMetadata(t, offsetsAndMetadataToCommit)) *> offsetsAndGaps.setCommittable(committable) - } + report(CommittingOffsetsAndGaps(consumer.config.groupId, committable)) *> + consumer + .commitWithMetadata(offsetsAndMetadataToCommit) + .tap(_ => ZIO.when(offsetsAndMetadataToCommit.nonEmpty)(report(CommittedOffsetsAndGaps(committable)))) + .catchAll { t => + report(FailedToCommitOffsetsAndMetadata(t, offsetsAndMetadataToCommit)) *> offsetsAndGaps.setCommittable(committable) + } } + } private def commitOffsetsOnRebalance( consumer: Consumer, @@ -460,6 +465,12 @@ object EventLoopMetric { case class StoppedEventLoop(clientId: ClientId, group: Group, attributes: Map[String, String] = Map.empty) extends EventLoopMetric + case class CommittingOffsetsAndGaps( + groupId: Group, + offsetsAndGaps: Map[TopicPartition, OffsetAndGaps], + attributes: Map[String, String] = Map.empty + ) extends EventLoopMetric + case class JoinedPollOnceFiberBeforeDispatcherShutdown( clientId: ClientId, group: Group, From 9468545eb9bf753fec261fce16cf05f5eb777fda Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Mon, 21 Aug 2023 
14:17:23 +0300 Subject: [PATCH 45/52] [greyhound] parallel consumer - add disconnect on shutdown (#36660) GitOrigin-RevId: 84bff3d0d5156c9cd3315423becf3811fe374776 --- .../greyhound/core/consumer/EventLoop.scala | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala index 1384792f..7867c35d 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala @@ -139,9 +139,10 @@ object EventLoop { for { _ <- report(StoppingEventLoop(clientId, group, consumerAttributes)) _ <- running.set(ShuttingDown) + _ <- running.get.flatMap(currentState => report(EventLoopStateOnShutdown(clientId, group, currentState, consumerAttributes))) drained <- (joinFiberAndReport(group, clientId, consumerAttributes, fiber).interruptible *> - shutdownDispatcherAndReport(group, clientId, consumerAttributes, dispatcher)) + shutdownDispatcherAndReport(group, clientId, consumerAttributes, dispatcher)).disconnect.interruptible .timeout(config.drainTimeout) _ <- ZIO.when(drained.isEmpty)( report(DrainTimeoutExceeded(clientId, group, config.drainTimeout.toMillis, consumerAttributes)) *> fiber.interruptFork @@ -202,8 +203,8 @@ object EventLoop { _ <- ZIO.when(records.isEmpty)(ZIO.sleep(50.millis)) } yield true - case ShuttingDown => ZIO.succeed(false) - case Paused => ZIO.sleep(100.millis).as(true) + case ShuttingDown => report(PollOnceFiberShuttingDown(clientId, group, consumer.config.consumerAttributes)) *> ZIO.succeed(false) + case Paused => report(PollOnceFiberPaused(clientId, group, consumer.config.consumerAttributes)) *> ZIO.sleep(100.millis).as(true) } private def listener( @@ -463,6 +464,18 @@ object EventLoopMetric { case class StoppingEventLoop(clientId: ClientId, group: Group, attributes: Map[String, String] = Map.empty) extends EventLoopMetric + case class EventLoopStateOnShutdown( + clientId: ClientId, + group: Group, + eventLoopState: EventLoopState, + attributes: Map[String, String] = Map.empty + ) extends EventLoopMetric + + case class PollOnceFiberShuttingDown(clientId: ClientId, group: Group, attributes: Map[String, String] = Map.empty) + extends EventLoopMetric + + case class PollOnceFiberPaused(clientId: ClientId, group: Group, attributes: Map[String, String] = Map.empty) extends EventLoopMetric + case class StoppedEventLoop(clientId: ClientId, group: Group, attributes: Map[String, String] = Map.empty) extends EventLoopMetric case class CommittingOffsetsAndGaps( From 461ccbadc6cb7b53f1767eb9858f5ac64d2ff103 Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Tue, 22 Aug 2023 18:03:15 +0300 Subject: [PATCH 46/52] [greyhound] parallel consumer - add OffsetsAndGaps visibility (#36685) GitOrigin-RevId: cfb918350d137d551a9ca88d2e88a207e4b00fc6 --- .../greyhound/core/consumer/Dispatcher.scala | 27 ++++++--- .../greyhound/core/consumer/EventLoop.scala | 55 +++++++++++++------ .../core/consumer/OffsetsAndGaps.scala | 10 ++-- .../core/consumer/OffsetsAndGapsTest.scala | 8 +-- 4 files changed, 65 insertions(+), 35 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala index b4c70f48..25c7f6be 100644 --- 
a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Dispatcher.scala @@ -115,15 +115,17 @@ object Dispatcher { override def waitForCurrentRecordsCompletion: UIO[Unit] = workers.get.flatMap(workers => ZIO.foreach(workers.values)(_.waitForCurrentExecutionCompletion)).unit - override def revoke(partitions: Set[TopicPartition]): URIO[GreyhoundMetrics, Unit] = - workers - .modify { workers => - val revoked = workers.filterKeys(partitions.contains) - val remaining = workers -- partitions - - (revoked, remaining) - } - .flatMap(shutdownWorkers) + override def revoke(partitions: Set[TopicPartition]): URIO[GreyhoundMetrics, Unit] = { + report(DispatcherRevokingPartitions(clientId, group, partitions, consumerAttributes)) *> + workers + .modify { workers => + val revoked = workers.filterKeys(partitions.contains) + val remaining = workers -- partitions + + (revoked, remaining) + } + .flatMap(shutdownWorkers) + } override def pause: URIO[GreyhoundMetrics, Unit] = for { resume <- Promise.make[Nothing, Unit] @@ -583,6 +585,13 @@ object DispatcherMetric { attributes: Map[String, String] ) extends DispatcherMetric + case class DispatcherRevokingPartitions( + clientId: ClientId, + group: Group, + partitions: Set[TopicPartition], + attributes: Map[String, String] + ) extends DispatcherMetric + case class ShuttingDownDispatcher(group: Group, clientId: ClientId, attributes: Map[String, String]) extends DispatcherMetric case class WorkerStopped(group: Group, clientId: ClientId, partition: TopicPartition, durationMs: Long, attributes: Map[String, String]) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala index 7867c35d..60db841f 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala @@ -145,7 +145,8 @@ object EventLoop { shutdownDispatcherAndReport(group, clientId, consumerAttributes, dispatcher)).disconnect.interruptible .timeout(config.drainTimeout) _ <- ZIO.when(drained.isEmpty)( - report(DrainTimeoutExceeded(clientId, group, config.drainTimeout.toMillis, consumerAttributes)) *> fiber.interruptFork + report(DrainTimeoutExceeded(clientId, group, config.drainTimeout.toMillis, onShutdown = true, consumerAttributes)) *> + fiber.interruptFork ) _ <- if (config.consumePartitionInParallel) commitOffsetsAndGaps(consumer, offsetsAndGaps) else commitOffsets(consumer, offsets) _ <- report(StoppedEventLoop(clientId, group, consumerAttributes)) @@ -229,7 +230,15 @@ object EventLoop { _ <- pausedPartitionsRef.update(_ -- partitions) isRevokeTimedOut <- dispatcher.revoke(partitions).timeout(config.drainTimeout).map(_.isEmpty) _ <- ZIO.when(isRevokeTimedOut)( - report(DrainTimeoutExceeded(clientId, group, config.drainTimeout.toMillis, consumer.config.consumerAttributes)) + report( + DrainTimeoutExceeded( + clientId, + group, + config.drainTimeout.toMillis, + onShutdown = false, + consumer.config.consumerAttributes + ) + ) ) delayedRebalanceEffect <- if (useParallelConsumer) commitOffsetsAndGapsOnRebalance(consumer0, offsetsAndGaps) else commitOffsetsOnRebalance(consumer0, offsets) @@ -361,15 +370,16 @@ object EventLoop { } private def commitOffsetsAndGaps(consumer: Consumer, offsetsAndGaps: OffsetsAndGaps): URIO[GreyhoundMetrics, Unit] = { - offsetsAndGaps.getCommittableAndClear.flatMap { 
committable => - val offsetsAndMetadataToCommit = OffsetsAndGaps.toOffsetsAndMetadata(committable) - report(CommittingOffsetsAndGaps(consumer.config.groupId, committable)) *> - consumer - .commitWithMetadata(offsetsAndMetadataToCommit) - .tap(_ => ZIO.when(offsetsAndMetadataToCommit.nonEmpty)(report(CommittedOffsetsAndGaps(committable)))) - .catchAll { t => - report(FailedToCommitOffsetsAndMetadata(t, offsetsAndMetadataToCommit)) *> offsetsAndGaps.setCommittable(committable) - } + offsetsAndGaps.getCommittableAndClear.flatMap { + case (committable, offsetsAndGapsBefore, offsetsAndGapsAfter) => + val offsetsAndMetadataToCommit = OffsetsAndGaps.toOffsetsAndMetadata(committable) + report(CommittingOffsetsAndGaps(consumer.config.groupId, committable, offsetsAndGapsBefore, offsetsAndGapsAfter)) *> + consumer + .commitWithMetadata(offsetsAndMetadataToCommit) + .tap(_ => ZIO.when(offsetsAndMetadataToCommit.nonEmpty)(report(CommittedOffsetsAndGaps(committable)))) + .catchAll { t => + report(FailedToCommitOffsetsAndMetadata(t, offsetsAndMetadataToCommit)) *> offsetsAndGaps.setCommittable(committable) + } } } @@ -397,11 +407,13 @@ object EventLoop { offsetsAndGaps: OffsetsAndGaps ): URIO[GreyhoundMetrics, DelayedRebalanceEffect] = { for { - committable <- offsetsAndGaps.getCommittableAndClear - tle <- consumer - .commitWithMetadataOnRebalance(OffsetsAndGaps.toOffsetsAndMetadata(committable)) - .catchAll { _ => offsetsAndGaps.setCommittable(committable) *> DelayedRebalanceEffect.zioUnit } - runtime <- ZIO.runtime[Any] + committableResult <- offsetsAndGaps.getCommittableAndClear + (committable, offsetsAndGapsBefore, offsetsAndGapsAfter) = committableResult + _ <- report(CommittingOffsetsAndGaps(consumer.config.groupId, committable, offsetsAndGapsBefore, offsetsAndGapsAfter)) + tle <- consumer + .commitWithMetadataOnRebalance(OffsetsAndGaps.toOffsetsAndMetadata(committable)) + .catchAll { _ => offsetsAndGaps.setCommittable(committable) *> DelayedRebalanceEffect.zioUnit } + runtime <- ZIO.runtime[Any] } yield tle.catchAll { _ => zio.Unsafe.unsafe { implicit s => runtime.unsafe @@ -481,6 +493,8 @@ object EventLoopMetric { case class CommittingOffsetsAndGaps( groupId: Group, offsetsAndGaps: Map[TopicPartition, OffsetAndGaps], + offsetsAndGapsBefore: Map[TopicPartition, OffsetAndGaps], + offsetsAndGapsAfter: Map[TopicPartition, OffsetAndGaps], attributes: Map[String, String] = Map.empty ) extends EventLoopMetric @@ -493,8 +507,13 @@ object EventLoopMetric { case class DispatcherStopped(group: Group, clientId: ClientId, durationMs: Long, attributes: Map[String, String]) extends EventLoopMetric - case class DrainTimeoutExceeded(clientId: ClientId, group: Group, timeoutMs: Long, attributes: Map[String, String] = Map.empty) - extends EventLoopMetric + case class DrainTimeoutExceeded( + clientId: ClientId, + group: Group, + timeoutMs: Long, + onShutdown: Boolean, + attributes: Map[String, String] = Map.empty + ) extends EventLoopMetric case class HighWatermarkReached(partition: TopicPartition, onOffset: Offset, attributes: Map[String, String] = Map.empty) extends EventLoopMetric diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala index 7e68e153..97cf42da 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala @@ -13,7 +13,8 @@ import scala.util.Try trait 
OffsetsAndGaps { def init(committedOffsets: Map[TopicPartition, OffsetAndGaps]): UIO[Unit] - def getCommittableAndClear: UIO[Map[TopicPartition, OffsetAndGaps]] + def getCommittableAndClear + : UIO[(Map[TopicPartition, OffsetAndGaps], Map[TopicPartition, OffsetAndGaps], Map[TopicPartition, OffsetAndGaps])] def gapsForPartition(partition: TopicPartition): UIO[Seq[Gap]] @@ -44,11 +45,12 @@ object OffsetsAndGaps { override def init(committedOffsets: Map[TopicPartition, OffsetAndGaps]): UIO[Unit] = ref.update(_ => committedOffsets) - override def getCommittableAndClear: UIO[Map[TopicPartition, OffsetAndGaps]] = + override def getCommittableAndClear + : UIO[(Map[TopicPartition, OffsetAndGaps], Map[TopicPartition, OffsetAndGaps], Map[TopicPartition, OffsetAndGaps])] = ref.modify(offsetsAndGaps => { val committable = offsetsAndGaps.filter(_._2.committable) - val updated = offsetsAndGaps.mapValues(_.markCommitted) - (committable, updated) + val updated = offsetsAndGaps.map { case (tp, og) => tp -> og.markCommitted } + ((committable, offsetsAndGaps, updated), updated) }) override def gapsForPartition(partition: TopicPartition): UIO[Seq[Gap]] = diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGapsTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGapsTest.scala index f21c194b..c3142aeb 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGapsTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGapsTest.scala @@ -21,7 +21,7 @@ class OffsetsAndGapsTestGapsTest extends BaseTestNoEnv { offsetGaps <- OffsetsAndGaps.make _ <- offsetGaps.update(topicPartition, Seq(1L, 3L, 7L)) _ <- offsetGaps.update(topicPartition, Seq(2L, 5L)) - getCommittableAndClear <- offsetGaps.getCommittableAndClear + getCommittableAndClear <- offsetGaps.getCommittableAndClear.map(_._1) } yield getCommittableAndClear must havePair(topicPartition -> OffsetAndGaps(7L, Seq(Gap(0L, 0L), Gap(4L, 4L), Gap(6L, 6L)))) } @@ -30,7 +30,7 @@ class OffsetsAndGapsTestGapsTest extends BaseTestNoEnv { offsetGaps <- OffsetsAndGaps.make _ <- offsetGaps.update(topicPartition, Seq(1L, 3L, 7L)) _ <- offsetGaps.getCommittableAndClear - getCommittableAndClear <- offsetGaps.getCommittableAndClear + getCommittableAndClear <- offsetGaps.getCommittableAndClear.map(_._1) } yield getCommittableAndClear must beEmpty } @@ -52,7 +52,7 @@ class OffsetsAndGapsTestGapsTest extends BaseTestNoEnv { _ <- offsetGaps.update(partition0, Seq(1L)) _ <- offsetGaps.update(partition0, Seq(0L)) _ <- offsetGaps.update(partition1, Seq(0L)) - current <- offsetGaps.getCommittableAndClear + current <- offsetGaps.getCommittableAndClear.map(_._1) } yield current must havePairs(partition0 -> OffsetAndGaps(1L, Seq()), partition1 -> OffsetAndGaps(0L, Seq())) } @@ -66,7 +66,7 @@ class OffsetsAndGapsTestGapsTest extends BaseTestNoEnv { _ <- offsetGaps.init(initialCommittedOffsets) _ <- offsetGaps.update(partition0, Seq(101L, 102L)) _ <- offsetGaps.update(partition1, Seq(203L, 204L)) - current <- offsetGaps.getCommittableAndClear + current <- offsetGaps.getCommittableAndClear.map(_._1) } yield current must havePairs(partition0 -> OffsetAndGaps(102L, Seq()), partition1 -> OffsetAndGaps(204L, Seq(Gap(201L, 202L)))) } From 41ade44fd4ae31d98ba103840c1732a7dc3dcbdb Mon Sep 17 00:00:00 2001 From: Leon Burdinov Date: Wed, 23 Aug 2023 14:39:13 +0300 Subject: [PATCH 47/52] enrich commit metadata with pod and timestamp (#36679) * enrich commit metadata with pod and 
timestamp GitOrigin-RevId: 2669ad3b6fdb31749768eda02cf4cf0872933e74 --- .../greyhound/core/consumer/Consumer.scala | 32 ++++++++++++++----- .../core/consumer/RecordConsumer.scala | 6 ++-- .../core/consumer/batched/BatchConsumer.scala | 6 ++-- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala index 909682b7..54f21aa6 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala @@ -6,7 +6,7 @@ import com.wixpress.dst.greyhound.core.consumer.ConsumerMetric.ClosedConsumer import com.wixpress.dst.greyhound.core.consumer.domain.{ConsumerRecord, Decryptor, NoOpDecryptor, RecordTopicPartition} import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics import com.wixpress.dst.greyhound.core.metrics.GreyhoundMetrics._ -import org.apache.kafka.clients.consumer.{ConsumerConfig => KafkaConsumerConfig, ConsumerRebalanceListener, KafkaConsumer, OffsetAndMetadata => KafkaOffsetAndMetadata} +import org.apache.kafka.clients.consumer.{ConsumerRebalanceListener, KafkaConsumer, ConsumerConfig => KafkaConsumerConfig, OffsetAndMetadata => KafkaOffsetAndMetadata} import org.apache.kafka.common.serialization.Deserializer import org.apache.kafka.common.{TopicPartition => KafkaTopicPartition} import zio.ZIO.attemptBlocking @@ -99,11 +99,12 @@ object Consumer { // we commit missing offsets to current position on assign - otherwise messages may be lost, in case of `OffsetReset.Latest`, // if a partition with no committed offset is revoked during processing // we also may want to seek forward to some given initial offsets + unsafeOffsetOperations = UnsafeOffsetOperations.make(consumer) offsetsInitializer <- OffsetsInitializer .make( cfg.clientId, cfg.groupId, - UnsafeOffsetOperations.make(consumer), + unsafeOffsetOperations, timeout = 10.seconds, timeoutIfSeek = 10.seconds, initialSeek = cfg.initialSeek, @@ -116,13 +117,13 @@ object Consumer { override def subscribePattern[R1](topicStartsWith: Pattern, rebalanceListener: RebalanceListener[R1])( implicit trace: Trace ): RIO[GreyhoundMetrics with R1, Unit] = - listener(this, offsetsInitializer.initializeOffsets, config.additionalListener *> rebalanceListener) + listener(this, offsetsInitializer.initializeOffsets, config.additionalListener *> rebalanceListener, unsafeOffsetOperations) .flatMap(lis => withConsumer(_.subscribe(topicStartsWith, lis))) override def subscribe[R1](topics: Set[Topic], rebalanceListener: RebalanceListener[R1])( implicit trace: Trace ): RIO[GreyhoundMetrics with R1, Unit] = - listener(this, offsetsInitializer.initializeOffsets, config.additionalListener *> rebalanceListener) + listener(this, offsetsInitializer.initializeOffsets, config.additionalListener *> rebalanceListener, unsafeOffsetOperations) .flatMap(lis => withConsumerBlocking(_.subscribe(topics.asJava, lis))) override def poll(timeout: Duration)(implicit trace: Trace): RIO[Any, Records] = @@ -170,8 +171,14 @@ object Consumer { .toMap ) + import java.time.format.DateTimeFormatter + import java.time.LocalDateTime + private val podName = sys.env.get("POD_NAME") + private val dtf = DateTimeFormatter.ofPattern("yyyy/MM/dd HH:mm:ss") + def metadata: Option[String] = if (config.enrichMetadata) podName.map(name => s">>> pod: $name, ts: ${dtf.format(LocalDateTime.now())}") else None + override def commit(offsets: 
Map[TopicPartition, Offset])(implicit trace: Trace): RIO[GreyhoundMetrics, Unit] = { - withConsumerBlocking(_.commitSync(kafkaOffsetsAndMetaData(toOffsetsAndMetadata(offsets, cfg.commitMetadataString)))) + withConsumerBlocking(_.commitSync(kafkaOffsetsAndMetaData(toOffsetsAndMetadata(offsets, metadata.getOrElse(cfg.commitMetadataString))))) } override def commitWithMetadata( @@ -279,9 +286,17 @@ object Consumer { } } - private def listener[R1](consumer: Consumer, onAssignFirstDo: Set[TopicPartition] => Unit, rebalanceListener: RebalanceListener[R1]) = + case class InitialOffsetsAndMetadata(offsetsAndMetadata: Map[TopicPartition, OffsetAndMetadata]) extends com.wixpress.dst.greyhound.core.metrics.GreyhoundMetric + + private def listener[R1](consumer: Consumer, onAssignFirstDo: Set[TopicPartition] => Unit, rebalanceListener: RebalanceListener[R1], unsafeOffsetOperations: UnsafeOffsetOperations) = ZIO.runtime[R1].map { runtime => new ConsumerRebalanceListener { + + def reportInitialOffsetsAndMetadata(partitions: Set[TopicPartition]) = { + val offsetsAndMetadata = unsafeOffsetOperations.committedWithMetadata(partitions, 10.seconds) + report(InitialOffsetsAndMetadata(offsetsAndMetadata)).provide(GreyhoundMetrics.liveLayer) + } + override def onPartitionsRevoked(partitions: util.Collection[KafkaTopicPartition]): Unit = { zio.Unsafe.unsafe { implicit s => runtime.unsafe @@ -302,7 +317,7 @@ object Consumer { zio.Unsafe.unsafe { implicit s => runtime.unsafe .run( - rebalanceListener.onPartitionsAssigned(consumer, assigned) + reportInitialOffsetsAndMetadata(assigned) *> rebalanceListener.onPartitionsAssigned(consumer, assigned) ) .getOrThrowFiberFailure() .run() @@ -341,7 +356,8 @@ case class ConsumerConfig( decryptor: Decryptor[Any, Throwable, Chunk[Byte], Chunk[Byte]] = new NoOpDecryptor, commitMetadataString: Metadata = OffsetAndMetadata.NO_METADATA, rewindUncommittedOffsetsByMillis: Long = 0L, - useParallelConsumer: Boolean = false + useParallelConsumer: Boolean = false, + enrichMetadata: Boolean = false ) extends CommonGreyhoundConfig { override def kafkaProps: Map[String, String] = Map( diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala index ed3028f0..d788c8ae 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala @@ -228,7 +228,8 @@ object RecordConsumer { config.decryptor, config.commitMetadataString, config.rewindUncommittedOffsetsBy.toMillis, - config.eventLoopConfig.consumePartitionInParallel + config.eventLoopConfig.consumePartitionInParallel, + config.enrichMetadata ) } @@ -344,7 +345,8 @@ case class RecordConsumerConfig( retryProducerAttributes: Map[String, String] = Map.empty, commitMetadataString: Metadata = OffsetAndMetadata.NO_METADATA, rewindUncommittedOffsetsBy: Duration = 0.millis, - createRetryTopics: Boolean = true + createRetryTopics: Boolean = true, + enrichMetadata: Boolean = true ) extends CommonGreyhoundConfig { override def kafkaProps: Map[String, String] = extraProperties diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchConsumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchConsumer.scala index 3d92e85a..ba414929 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchConsumer.scala +++ 
b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchConsumer.scala @@ -146,7 +146,8 @@ object BatchConsumer { config.consumerAttributes, config.decryptor, config.commitMetadataString, - config.rewindUncommittedOffsetsBy.toMillis + config.rewindUncommittedOffsetsBy.toMillis, + enrichMetadata = config.enrichMetadata ) } @@ -184,7 +185,8 @@ case class BatchConsumerConfig( consumerAttributes: Map[String, String] = Map.empty, decryptor: Decryptor[Any, Throwable, Chunk[Byte], Chunk[Byte]] = new NoOpDecryptor, commitMetadataString: Metadata = OffsetAndMetadata.NO_METADATA, - rewindUncommittedOffsetsBy: Duration = Duration.ZERO + rewindUncommittedOffsetsBy: Duration = Duration.ZERO, + enrichMetadata: Boolean = false ) object BatchConsumerConfig { From 785d0b9b3f3a678d62d35a4aeb35cf128c0959a0 Mon Sep 17 00:00:00 2001 From: Leon Burdinov Date: Sun, 27 Aug 2023 15:04:30 +0300 Subject: [PATCH 48/52] Metadata to commit log (#36749) * enrich commit metadata with pod and timestamp * fix for metrics layer GitOrigin-RevId: 8f4f026e95f2008cfe9e958a518381f31630e6c1 --- .../com/wixpress/dst/greyhound/core/consumer/Consumer.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala index 54f21aa6..f9c07e29 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala @@ -289,12 +289,12 @@ object Consumer { case class InitialOffsetsAndMetadata(offsetsAndMetadata: Map[TopicPartition, OffsetAndMetadata]) extends com.wixpress.dst.greyhound.core.metrics.GreyhoundMetric private def listener[R1](consumer: Consumer, onAssignFirstDo: Set[TopicPartition] => Unit, rebalanceListener: RebalanceListener[R1], unsafeOffsetOperations: UnsafeOffsetOperations) = - ZIO.runtime[R1].map { runtime => + ZIO.runtime[R1 with GreyhoundMetrics].map { runtime => new ConsumerRebalanceListener { def reportInitialOffsetsAndMetadata(partitions: Set[TopicPartition]) = { val offsetsAndMetadata = unsafeOffsetOperations.committedWithMetadata(partitions, 10.seconds) - report(InitialOffsetsAndMetadata(offsetsAndMetadata)).provide(GreyhoundMetrics.liveLayer) + report(InitialOffsetsAndMetadata(offsetsAndMetadata)) } override def onPartitionsRevoked(partitions: util.Collection[KafkaTopicPartition]): Unit = { From c9d6404c5a8316f39c919d8b15fca3e48b5c2369 Mon Sep 17 00:00:00 2001 From: Leon Burdinov Date: Mon, 28 Aug 2023 14:27:58 +0300 Subject: [PATCH 49/52] GH commit metadata ts is in UTC (#36774) GH commit metadata ts is in UTC #pr #skipreview GitOrigin-RevId: 2efd9f9a307f625f4112ca172cd53e256bcd24f7 --- .../com/wixpress/dst/greyhound/core/consumer/Consumer.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala index f9c07e29..d8d0c15d 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala @@ -173,9 +173,10 @@ object Consumer { import java.time.format.DateTimeFormatter import java.time.LocalDateTime + import java.time.ZoneOffset private val podName = sys.env.get("POD_NAME") private val dtf = DateTimeFormatter.ofPattern("yyyy/MM/dd HH:mm:ss") - def metadata: 
Option[String] = if (config.enrichMetadata) podName.map(name => s">>> pod: $name, ts: ${dtf.format(LocalDateTime.now())}") else None + def metadata: Option[String] = if (config.enrichMetadata) podName.map(name => s">>> pod: $name, ts: ${dtf.format(LocalDateTime.now(ZoneOffset.UTC))}") else None override def commit(offsets: Map[TopicPartition, Offset])(implicit trace: Trace): RIO[GreyhoundMetrics, Unit] = { withConsumerBlocking(_.commitSync(kafkaOffsetsAndMetaData(toOffsetsAndMetadata(offsets, metadata.getOrElse(cfg.commitMetadataString))))) From cee1cacabe03ddc6e17556c03969f92e04be6cb9 Mon Sep 17 00:00:00 2001 From: Leon Burdinov Date: Mon, 28 Aug 2023 17:19:12 +0300 Subject: [PATCH 50/52] moved enrich metadata out of GH core. add it to commit on rebalance (#36783) moved enrich metadata out of GH core. add it to commit on rebalance #pr #skipreview GitOrigin-RevId: bfdcccbc4c5210870118a1d049063db157d9f771 --- .../dst/greyhound/core/consumer/Consumer.scala | 14 +++----------- .../greyhound/core/consumer/RecordConsumer.scala | 4 +--- .../core/consumer/batched/BatchConsumer.scala | 6 ++---- 3 files changed, 6 insertions(+), 18 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala index d8d0c15d..b1c872bb 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/Consumer.scala @@ -171,15 +171,8 @@ object Consumer { .toMap ) - import java.time.format.DateTimeFormatter - import java.time.LocalDateTime - import java.time.ZoneOffset - private val podName = sys.env.get("POD_NAME") - private val dtf = DateTimeFormatter.ofPattern("yyyy/MM/dd HH:mm:ss") - def metadata: Option[String] = if (config.enrichMetadata) podName.map(name => s">>> pod: $name, ts: ${dtf.format(LocalDateTime.now(ZoneOffset.UTC))}") else None - override def commit(offsets: Map[TopicPartition, Offset])(implicit trace: Trace): RIO[GreyhoundMetrics, Unit] = { - withConsumerBlocking(_.commitSync(kafkaOffsetsAndMetaData(toOffsetsAndMetadata(offsets, metadata.getOrElse(cfg.commitMetadataString))))) + withConsumerBlocking(_.commitSync(kafkaOffsetsAndMetaData(toOffsetsAndMetadata(offsets, cfg.commitMetadataString())))) } override def commitWithMetadata( @@ -191,7 +184,7 @@ object Consumer { override def commitOnRebalance( offsets: Map[TopicPartition, Offset] )(implicit trace: Trace): RIO[GreyhoundMetrics, DelayedRebalanceEffect] = { - val kOffsets = kafkaOffsetsAndMetaData(toOffsetsAndMetadata(offsets, cfg.commitMetadataString)) + val kOffsets = kafkaOffsetsAndMetaData(toOffsetsAndMetadata(offsets, cfg.commitMetadataString())) // we can't actually call commit here, as it needs to be called from the same // thread, that triggered poll(), so we return the commit action as thunk ZIO.succeed(DelayedRebalanceEffect(consumer.commitSync(kOffsets))) @@ -355,10 +348,9 @@ case class ConsumerConfig( initialSeek: InitialOffsetsSeek = InitialOffsetsSeek.default, consumerAttributes: Map[String, String] = Map.empty, decryptor: Decryptor[Any, Throwable, Chunk[Byte], Chunk[Byte]] = new NoOpDecryptor, - commitMetadataString: Metadata = OffsetAndMetadata.NO_METADATA, + commitMetadataString: Unit => Metadata = _ => OffsetAndMetadata.NO_METADATA, rewindUncommittedOffsetsByMillis: Long = 0L, useParallelConsumer: Boolean = false, - enrichMetadata: Boolean = false ) extends CommonGreyhoundConfig { override def kafkaProps: Map[String, String] = 
Map( diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala index d788c8ae..6078a784 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/RecordConsumer.scala @@ -229,7 +229,6 @@ object RecordConsumer { config.commitMetadataString, config.rewindUncommittedOffsetsBy.toMillis, config.eventLoopConfig.consumePartitionInParallel, - config.enrichMetadata ) } @@ -343,10 +342,9 @@ case class RecordConsumerConfig( consumerAttributes: Map[String, String] = Map.empty, decryptor: Decryptor[Any, Throwable, Chunk[Byte], Chunk[Byte]] = new NoOpDecryptor, retryProducerAttributes: Map[String, String] = Map.empty, - commitMetadataString: Metadata = OffsetAndMetadata.NO_METADATA, + commitMetadataString: Unit => Metadata = _ => OffsetAndMetadata.NO_METADATA, rewindUncommittedOffsetsBy: Duration = 0.millis, createRetryTopics: Boolean = true, - enrichMetadata: Boolean = true ) extends CommonGreyhoundConfig { override def kafkaProps: Map[String, String] = extraProperties diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchConsumer.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchConsumer.scala index ba414929..7d2839b3 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchConsumer.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/batched/BatchConsumer.scala @@ -146,8 +146,7 @@ object BatchConsumer { config.consumerAttributes, config.decryptor, config.commitMetadataString, - config.rewindUncommittedOffsetsBy.toMillis, - enrichMetadata = config.enrichMetadata + config.rewindUncommittedOffsetsBy.toMillis ) } @@ -184,9 +183,8 @@ case class BatchConsumerConfig( initialOffsetsSeek: InitialOffsetsSeek = InitialOffsetsSeek.default, consumerAttributes: Map[String, String] = Map.empty, decryptor: Decryptor[Any, Throwable, Chunk[Byte], Chunk[Byte]] = new NoOpDecryptor, - commitMetadataString: Metadata = OffsetAndMetadata.NO_METADATA, + commitMetadataString: Unit => Metadata = _ => OffsetAndMetadata.NO_METADATA, rewindUncommittedOffsetsBy: Duration = Duration.ZERO, - enrichMetadata: Boolean = false ) object BatchConsumerConfig { From 10ffeb231c8c20f17c3a10903e35050cffb96547 Mon Sep 17 00:00:00 2001 From: Ben Wattelman <82799628+ben-wattelman@users.noreply.github.com> Date: Mon, 28 Aug 2023 18:57:07 +0300 Subject: [PATCH 51/52] [greyhound] parallel consumer - remove noisy logs (#36777) GitOrigin-RevId: 67e32052558deb48f4373262a724d1ccd1751b82 --- .../greyhound/core/consumer/EventLoop.scala | 36 ++++++++----------- .../core/consumer/OffsetsAndGaps.scala | 8 ++--- .../core/consumer/OffsetsAndGapsTest.scala | 8 ++--- 3 files changed, 22 insertions(+), 30 deletions(-) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala index 60db841f..10ab0640 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/EventLoop.scala @@ -204,8 +204,8 @@ object EventLoop { _ <- ZIO.when(records.isEmpty)(ZIO.sleep(50.millis)) } yield true - case ShuttingDown => report(PollOnceFiberShuttingDown(clientId, group, consumer.config.consumerAttributes)) *> ZIO.succeed(false) - case Paused => 
report(PollOnceFiberPaused(clientId, group, consumer.config.consumerAttributes)) *> ZIO.sleep(100.millis).as(true) + case ShuttingDown => ZIO.succeed(false) + case Paused => ZIO.sleep(100.millis).as(true) } private def listener( @@ -370,16 +370,14 @@ object EventLoop { } private def commitOffsetsAndGaps(consumer: Consumer, offsetsAndGaps: OffsetsAndGaps): URIO[GreyhoundMetrics, Unit] = { - offsetsAndGaps.getCommittableAndClear.flatMap { - case (committable, offsetsAndGapsBefore, offsetsAndGapsAfter) => - val offsetsAndMetadataToCommit = OffsetsAndGaps.toOffsetsAndMetadata(committable) - report(CommittingOffsetsAndGaps(consumer.config.groupId, committable, offsetsAndGapsBefore, offsetsAndGapsAfter)) *> - consumer - .commitWithMetadata(offsetsAndMetadataToCommit) - .tap(_ => ZIO.when(offsetsAndMetadataToCommit.nonEmpty)(report(CommittedOffsetsAndGaps(committable)))) - .catchAll { t => - report(FailedToCommitOffsetsAndMetadata(t, offsetsAndMetadataToCommit)) *> offsetsAndGaps.setCommittable(committable) - } + offsetsAndGaps.getCommittableAndClear.flatMap { committable => + val offsetsAndMetadataToCommit = OffsetsAndGaps.toOffsetsAndMetadata(committable) + consumer + .commitWithMetadata(offsetsAndMetadataToCommit) + .tap(_ => ZIO.when(offsetsAndMetadataToCommit.nonEmpty)(report(CommittedOffsetsAndGaps(committable)))) + .catchAll { t => + report(FailedToCommitOffsetsAndMetadata(t, offsetsAndMetadataToCommit)) *> offsetsAndGaps.setCommittable(committable) + } } } @@ -407,13 +405,11 @@ object EventLoop { offsetsAndGaps: OffsetsAndGaps ): URIO[GreyhoundMetrics, DelayedRebalanceEffect] = { for { - committableResult <- offsetsAndGaps.getCommittableAndClear - (committable, offsetsAndGapsBefore, offsetsAndGapsAfter) = committableResult - _ <- report(CommittingOffsetsAndGaps(consumer.config.groupId, committable, offsetsAndGapsBefore, offsetsAndGapsAfter)) - tle <- consumer - .commitWithMetadataOnRebalance(OffsetsAndGaps.toOffsetsAndMetadata(committable)) - .catchAll { _ => offsetsAndGaps.setCommittable(committable) *> DelayedRebalanceEffect.zioUnit } - runtime <- ZIO.runtime[Any] + committable <- offsetsAndGaps.getCommittableAndClear + tle <- consumer + .commitWithMetadataOnRebalance(OffsetsAndGaps.toOffsetsAndMetadata(committable)) + .catchAll { _ => offsetsAndGaps.setCommittable(committable) *> DelayedRebalanceEffect.zioUnit } + runtime <- ZIO.runtime[Any] } yield tle.catchAll { _ => zio.Unsafe.unsafe { implicit s => runtime.unsafe @@ -493,8 +489,6 @@ object EventLoopMetric { case class CommittingOffsetsAndGaps( groupId: Group, offsetsAndGaps: Map[TopicPartition, OffsetAndGaps], - offsetsAndGapsBefore: Map[TopicPartition, OffsetAndGaps], - offsetsAndGapsAfter: Map[TopicPartition, OffsetAndGaps], attributes: Map[String, String] = Map.empty ) extends EventLoopMetric diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala index 97cf42da..92d23a84 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGaps.scala @@ -13,8 +13,7 @@ import scala.util.Try trait OffsetsAndGaps { def init(committedOffsets: Map[TopicPartition, OffsetAndGaps]): UIO[Unit] - def getCommittableAndClear - : UIO[(Map[TopicPartition, OffsetAndGaps], Map[TopicPartition, OffsetAndGaps], Map[TopicPartition, OffsetAndGaps])] + def getCommittableAndClear: UIO[Map[TopicPartition, OffsetAndGaps]] def 
gapsForPartition(partition: TopicPartition): UIO[Seq[Gap]] @@ -45,12 +44,11 @@ object OffsetsAndGaps { override def init(committedOffsets: Map[TopicPartition, OffsetAndGaps]): UIO[Unit] = ref.update(_ => committedOffsets) - override def getCommittableAndClear - : UIO[(Map[TopicPartition, OffsetAndGaps], Map[TopicPartition, OffsetAndGaps], Map[TopicPartition, OffsetAndGaps])] = + override def getCommittableAndClear: UIO[Map[TopicPartition, OffsetAndGaps]] = ref.modify(offsetsAndGaps => { val committable = offsetsAndGaps.filter(_._2.committable) val updated = offsetsAndGaps.map { case (tp, og) => tp -> og.markCommitted } - ((committable, offsetsAndGaps, updated), updated) + (committable, updated) }) override def gapsForPartition(partition: TopicPartition): UIO[Seq[Gap]] = diff --git a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGapsTest.scala b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGapsTest.scala index c3142aeb..f21c194b 100644 --- a/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGapsTest.scala +++ b/core/src/test/scala/com/wixpress/dst/greyhound/core/consumer/OffsetsAndGapsTest.scala @@ -21,7 +21,7 @@ class OffsetsAndGapsTestGapsTest extends BaseTestNoEnv { offsetGaps <- OffsetsAndGaps.make _ <- offsetGaps.update(topicPartition, Seq(1L, 3L, 7L)) _ <- offsetGaps.update(topicPartition, Seq(2L, 5L)) - getCommittableAndClear <- offsetGaps.getCommittableAndClear.map(_._1) + getCommittableAndClear <- offsetGaps.getCommittableAndClear } yield getCommittableAndClear must havePair(topicPartition -> OffsetAndGaps(7L, Seq(Gap(0L, 0L), Gap(4L, 4L), Gap(6L, 6L)))) } @@ -30,7 +30,7 @@ class OffsetsAndGapsTestGapsTest extends BaseTestNoEnv { offsetGaps <- OffsetsAndGaps.make _ <- offsetGaps.update(topicPartition, Seq(1L, 3L, 7L)) _ <- offsetGaps.getCommittableAndClear - getCommittableAndClear <- offsetGaps.getCommittableAndClear.map(_._1) + getCommittableAndClear <- offsetGaps.getCommittableAndClear } yield getCommittableAndClear must beEmpty } @@ -52,7 +52,7 @@ class OffsetsAndGapsTestGapsTest extends BaseTestNoEnv { _ <- offsetGaps.update(partition0, Seq(1L)) _ <- offsetGaps.update(partition0, Seq(0L)) _ <- offsetGaps.update(partition1, Seq(0L)) - current <- offsetGaps.getCommittableAndClear.map(_._1) + current <- offsetGaps.getCommittableAndClear } yield current must havePairs(partition0 -> OffsetAndGaps(1L, Seq()), partition1 -> OffsetAndGaps(0L, Seq())) } @@ -66,7 +66,7 @@ class OffsetsAndGapsTestGapsTest extends BaseTestNoEnv { _ <- offsetGaps.init(initialCommittedOffsets) _ <- offsetGaps.update(partition0, Seq(101L, 102L)) _ <- offsetGaps.update(partition1, Seq(203L, 204L)) - current <- offsetGaps.getCommittableAndClear.map(_._1) + current <- offsetGaps.getCommittableAndClear } yield current must havePairs(partition0 -> OffsetAndGaps(102L, Seq()), partition1 -> OffsetAndGaps(204L, Seq(Gap(201L, 202L)))) } From d723223252f9a88c48ed4ef5e9c4ed0894cd769b Mon Sep 17 00:00:00 2001 From: Noam Berman Date: Mon, 4 Sep 2023 02:22:07 +0300 Subject: [PATCH 52/52] [consumer-proxy] fix retry bug (#36902) fix retry bug - was taking all records in batch once the first record was finished backing off, resulting in too-soon retries for the rest of the batch (up to 2000 records) GitOrigin-RevId: 0a0abfd8c905ddb7f89696655162a92021a6fcd6 --- .../dst/greyhound/core/consumer/retry/RetryAttempt.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala 
b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala index 6b740690..ae5109fd 100644 --- a/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala +++ b/core/src/main/scala/com/wixpress/dst/greyhound/core/consumer/retry/RetryAttempt.scala @@ -26,6 +26,7 @@ object RetryHeader { val Backoff = DelayHeaders.Backoff val OriginalTopic = "GH_OriginalTopic" val RetryAttempt = "GH_RetryAttempt" + val allHeaders = Set(Submitted, Backoff, OriginalTopic, RetryAttempt) } case class RetryAttemptHeaders(