From bbaa76ac4b0667271db1c6393eaea4ef6e800e14 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20Poniedzia=C5=82ek?=
Date: Mon, 4 Dec 2023 14:51:58 +0100
Subject: [PATCH] Add retry config

---
 config/config.azure.reference.hocon            |  6 +++
 config/config.kinesis.reference.hocon          |  6 +++
 config/config.pubsub.reference.hocon           | 19 +++-------
 .../core/src/main/resources/reference.conf     |  4 ++
 .../AppHealth.scala                            |  2 +-
 .../Config.scala                               |  4 ++
 .../Environment.scala                          |  6 +--
 .../processing/ChannelProvider.scala           | 12 ++++--
 .../processing/SnowflakeHealth.scala           |  2 +-
 .../processing/SnowflakeRetrying.scala         | 38 ++++++++-----------
 .../processing/TableManager.scala              |  5 ++-
 11 files changed, 57 insertions(+), 47 deletions(-)

diff --git a/config/config.azure.reference.hocon b/config/config.azure.reference.hocon
index d4b15cf..a663b20 100644
--- a/config/config.azure.reference.hocon
+++ b/config/config.azure.reference.hocon
@@ -78,6 +78,12 @@
       # - How many batches can we send simultaneously over the network to Snowflake.
       "uploadConcurrency": 1
     }
+
+    # Retry configuration for Snowflake operation failures
+    "retries": {
+      # Starting backoff period
+      "backoff": "30 seconds"
+    }
 
     # -- Schemas that won't be loaded to Snowflake. Optional, default value []
     "skipSchemas": [
diff --git a/config/config.kinesis.reference.hocon b/config/config.kinesis.reference.hocon
index a9d5c19..d71975f 100644
--- a/config/config.kinesis.reference.hocon
+++ b/config/config.kinesis.reference.hocon
@@ -99,6 +99,12 @@
       # - How many batches can we send simultaneously over the network to Snowflake.
       "uploadConcurrency": 1
     }
+
+    # Retry configuration for Snowflake operation failures
+    "retries": {
+      # Starting backoff period
+      "backoff": "30 seconds"
+    }
 
     # -- Schemas that won't be loaded to Snowflake. Optional, default value []
     "skipSchemas": [
diff --git a/config/config.pubsub.reference.hocon b/config/config.pubsub.reference.hocon
index 3f879f7..af26952 100644
--- a/config/config.pubsub.reference.hocon
+++ b/config/config.pubsub.reference.hocon
@@ -79,19 +79,12 @@
       # - How many batches can we send simultaneously over the network to Snowflake.
       "uploadConcurrency": 1
     }
-
-    "batching": {
-
-      # - Events are emitted to Snowflake when the batch reaches this size in bytes
-      "maxBytes": 16000000
-
-      # - Events are emitted to Snowflake for a maximum of this duration, even if the `maxBytes` size has not been reached
-      "maxDelay": "1 second"
-
-      # - How many batches can we send simultaneously over the network to Snowflake.
-      # - The Snowflake SDK dictates this should be kept below the number of available runtime processors (cpu)
-      "uploadConcurrency": 1
-    }
+
+    # Retry configuration for Snowflake operation failures
+    "retries": {
+      # Starting backoff period
+      "backoff": "30 seconds"
+    }
 
     # -- Schemas that won't be loaded to Snowflake. Optional, default value []
     "skipSchemas": [
diff --git a/modules/core/src/main/resources/reference.conf b/modules/core/src/main/resources/reference.conf
index 2a8f11b..eaabbe9 100644
--- a/modules/core/src/main/resources/reference.conf
+++ b/modules/core/src/main/resources/reference.conf
@@ -16,6 +16,10 @@
     "uploadConcurrency": 3
   }
 
+  "retries": {
+    "backoff": "30 seconds"
+  }
+
   "skipSchemas": []
 
   "monitoring": {
diff --git a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/AppHealth.scala b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/AppHealth.scala
index 34cead9..6447757 100644
--- a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/AppHealth.scala
+++ b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/AppHealth.scala
@@ -31,7 +31,7 @@
     case (Healthy, Healthy)                    => Healthy
     case (Healthy, unhealthy)                  => unhealthy
     case (unhealthy, Healthy)                  => unhealthy
-    case (Unhealthy(first), Unhealthy(second)) => Unhealthy(reason = s"$first, $second") // TODO do we want to combine reasons like that...?
+    case (Unhealthy(first), Unhealthy(second)) => Unhealthy(reason = s"$first, $second")
   }
 
   private implicit val healthMonoid: Monoid[HealthProbe.Status] = Monoid.instance(Healthy, combineHealth)
diff --git a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/Config.scala b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/Config.scala
index ce23c0b..fe5203c 100644
--- a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/Config.scala
+++ b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/Config.scala
@@ -26,6 +26,7 @@ case class Config[+Source, +Sink](
   input: Source,
   output: Config.Output[Sink],
   batching: Config.Batching,
+  retries: Config.Retries,
   skipSchemas: List[SchemaCriterion],
   telemetry: Telemetry.Config,
   monitoring: Config.Monitoring
@@ -75,6 +76,8 @@
     healthProbe: HealthProbe
   )
 
+  case class Retries(backoff: FiniteDuration)
+
   implicit def decoder[Source: Decoder, Sink: Decoder]: Decoder[Config[Source, Sink]] = {
     implicit val configuration = Configuration.default.withDiscriminator("type")
     implicit val urlDecoder = Decoder.decodeString.emapTry { str =>
@@ -93,6 +96,7 @@
     implicit val metricsDecoder = deriveConfiguredDecoder[Metrics]
     implicit val healthProbeDecoder = deriveConfiguredDecoder[HealthProbe]
     implicit val monitoringDecoder = deriveConfiguredDecoder[Monitoring]
+    implicit val retriesDecoder = deriveConfiguredDecoder[Retries]
    deriveConfiguredDecoder[Config[Source, Sink]]
   }
 
diff --git a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/Environment.scala b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/Environment.scala
index 502225d..15cdd92 100644
--- a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/Environment.scala
+++ b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/Environment.scala
@@ -46,9 +46,9 @@
       badSink <- toSink(config.output.bad)
       metrics <- Resource.eval(Metrics.build(config.monitoring.metrics))
       xa <- Resource.eval(SQLUtils.transactor[F](config.output.good))
-      _ <- Resource.eval(SnowflakeRetrying.retryIndefinitely(snowflakeHealth)(SQLUtils.createTable(config.output.good, xa)))
-      tblManager = TableManager.fromTransactor(config.output.good, snowflakeHealth, xa)
-      channelProvider <- ChannelProvider.make(config.output.good, snowflakeHealth, config.batching)
+      _ <- Resource.eval(SnowflakeRetrying.retryIndefinitely(snowflakeHealth, config.retries)(SQLUtils.createTable(config.output.good, xa)))
+      tblManager = TableManager.fromTransactor(config.output.good, xa, snowflakeHealth, config.retries)
+      channelProvider <- ChannelProvider.make(config.output.good, snowflakeHealth, config.batching, config.retries)
       sourceAndAck <- Resource.eval(toSource(config.input))
       _ <- HealthProbe.resource(
              config.monitoring.healthProbe.port,
diff --git a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/ChannelProvider.scala b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/ChannelProvider.scala
index 7313953..1e0ec1e 100644
--- a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/ChannelProvider.scala
+++ b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/ChannelProvider.scala
@@ -110,7 +110,8 @@
   def make[F[_]: Async](
     config: Config.Snowflake,
     snowflakeHealth: SnowflakeHealth[F],
-    batchingConfig: Config.Batching
+    batchingConfig: Config.Batching,
+    retriesConfig: Config.Retries
   ): Resource[F, ChannelProvider[F]] =
     for {
      client <- createClient(config, batchingConfig)
@@ -118,14 +119,15 @@
       channel <- Resource.eval(hs.swap(createChannel(config, client)))
       ref <- Resource.eval(Ref[F].of(channel))
       sem <- Resource.eval(Semaphore[F](allAvailablePermits))
-    } yield impl(ref, hs, sem, createChannel(config, client), snowflakeHealth)
+    } yield impl(ref, hs, sem, createChannel(config, client), snowflakeHealth, retriesConfig)
 
   private def impl[F[_]: Async](
     ref: Ref[F, SnowflakeStreamingIngestChannel],
     hs: Hotswap[F, SnowflakeStreamingIngestChannel],
     sem: Semaphore[F],
     next: Resource[F, SnowflakeStreamingIngestChannel],
-    snowflakeHealth: SnowflakeHealth[F]
+    snowflakeHealth: SnowflakeHealth[F],
+    retriesConfig: Config.Retries
   ): ChannelProvider[F] = new ChannelProvider[F] {
 
     def reset: F[Unit] =
@@ -164,7 +166,9 @@
         for {
           channel <- ref.get
           response <-
-            SnowflakeRetrying.retryIndefinitely(snowflakeHealth)(Sync[F].blocking(channel.insertRows(rows.map(_.asJava).asJava, null)))
+            SnowflakeRetrying.retryIndefinitely(snowflakeHealth, retriesConfig)(
+              Sync[F].blocking(channel.insertRows(rows.map(_.asJava).asJava, null))
+            )
           _ <- flushChannel[F](channel)
           isValid <- Sync[F].delay(channel.isValid)
         } yield if (isValid) WriteResult.WriteFailures(parseResponse(response)) else WriteResult.ChannelIsInvalid
diff --git a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/SnowflakeHealth.scala b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/SnowflakeHealth.scala
index 335d629..8bdfe1c 100644
--- a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/SnowflakeHealth.scala
+++ b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/SnowflakeHealth.scala
@@ -5,7 +5,7 @@
 import cats.implicits._
 import com.snowplowanalytics.snowplow.runtime.HealthProbe
 
 final case class SnowflakeHealth[F[_]](state: Ref[F, HealthProbe.Status]) {
-  def setUnhealthy(): F[Unit] = state.set(HealthProbe.Unhealthy("Snowflake is not healthy")) // TODO do we need more details here?
+  def setUnhealthy(): F[Unit] = state.set(HealthProbe.Unhealthy("Snowflake is not healthy"))
   def setHealthy(): F[Unit] = state.set(HealthProbe.Healthy)
 }
diff --git a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/SnowflakeRetrying.scala b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/SnowflakeRetrying.scala
index a7564f3..fa8feee 100644
--- a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/SnowflakeRetrying.scala
+++ b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/SnowflakeRetrying.scala
@@ -2,38 +2,30 @@ package com.snowplowanalytics.snowplow.snowflake.processing
 
 import cats.effect.Sync
 import cats.implicits._
-import net.snowflake.client.jdbc.SnowflakeSQLException
+import com.snowplowanalytics.snowplow.snowflake.Config
 import org.typelevel.log4cats.Logger
 import org.typelevel.log4cats.slf4j.Slf4jLogger
 import retry._
 import retry.implicits.retrySyntaxError
 
-import scala.concurrent.duration.DurationInt
-
 object SnowflakeRetrying {
 
   private implicit def logger[F[_]: Sync] = Slf4jLogger.getLogger[F]
 
-  def retryIndefinitely[F[_]: Sync: Sleep, A](snowflakeHealth: SnowflakeHealth[F])(action: F[A]): F[A] =
-    retryUntilSuccessful(snowflakeHealth, action) <*
+  def retryIndefinitely[F[_]: Sync: Sleep, A](snowflakeHealth: SnowflakeHealth[F], config: Config.Retries)(action: F[A]): F[A] =
+    retryUntilSuccessful(snowflakeHealth, config, action) <*
       snowflakeHealth.setHealthy()
 
-  private def retryUntilSuccessful[F[_]: Sync: Sleep, A](snowflakeHealth: SnowflakeHealth[F], action: F[A]) =
-    action.retryingOnSomeErrors(
-      isWorthRetrying = isWorthRetrying(_).pure[F],
-      policy = RetryPolicies.exponentialBackoff[F](baseDelay = 1.minute), // TODO make it configurable
-      onError = (ex, _) => handleError(snowflakeHealth, ex)
-    )
-
-  private def handleError[F[_]: Sync](snowflakeHealth: SnowflakeHealth[F], ex: Throwable): F[Unit] =
-    snowflakeHealth.setUnhealthy() *>
-      Logger[F].error(ex)("Executing Snowflake command failed")
-
-  private def isWorthRetrying(error: Throwable): Boolean = error match {
-    // TODO Retry for some specific error codes or simply for any snowflake exception?
-    case se: SnowflakeSQLException if se.getErrorCode === 2222 =>
-      true
-    case _ => true
-  }
-
+  private def retryUntilSuccessful[F[_]: Sync: Sleep, A](
+    snowflakeHealth: SnowflakeHealth[F],
+    config: Config.Retries,
+    action: F[A]
+  ) =
+    action
+      .onError(_ => snowflakeHealth.setUnhealthy())
+      .retryingOnAllErrors(
+        policy = RetryPolicies.exponentialBackoff[F](config.backoff),
+        onError = (error, retryDetails) =>
+          Logger[F].error(error)(s"Executing Snowflake command failed. Retries so far: ${retryDetails.retriesSoFar}")
+      )
 }
diff --git a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/TableManager.scala b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/TableManager.scala
index 0d08e7d..ca103c1 100644
--- a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/TableManager.scala
+++ b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.snowflake/processing/TableManager.scala
@@ -31,11 +31,12 @@
   def fromTransactor[F[_]: Async](
     config: Config.Snowflake,
+    xa: Transactor[F],
     snowflakeHealth: SnowflakeHealth[F],
-    xa: Transactor[F]
+    retriesConfig: Config.Retries
   ): TableManager[F] = new TableManager[F] {
 
-    def addColumns(columns: List[String]): F[Unit] = SnowflakeRetrying.retryIndefinitely(snowflakeHealth) {
+    def addColumns(columns: List[String]): F[Unit] = SnowflakeRetrying.retryIndefinitely(snowflakeHealth, retriesConfig) {
       Logger[F].info(s"Altering table to add columns [${columns.mkString(", ")}]") *>
         xa.rawTrans.apply {
          columns.traverse_ { col =>
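
---

Note on the retry semantics introduced above. The reworked SnowflakeRetrying marks the app unhealthy on every error via .onError, retries on all errors under an uncapped exponential backoff starting at the configured "backoff" (30 seconds by default, so delays grow as 30s, 60s, 120s, ...), and flips the health probe back to healthy only once the action finally succeeds. The old retryingOnSomeErrors filter is dropped; its isWorthRetrying returned true for every error anyway. Below is a minimal standalone sketch of the same cats-retry pattern, runnable outside this codebase: RetryDemo and its deliberately failing action are hypothetical, and the base delay is shortened from the 30-second default so the demo finishes quickly.

import cats.effect.{IO, IOApp}
import cats.syntax.all._
import retry._
import retry.implicits._
import scala.concurrent.duration.DurationInt

object RetryDemo extends IOApp.Simple {
  def run: IO[Unit] =
    for {
      counter <- IO.ref(0)
      // Hypothetical failing action: raises on the first three attempts, succeeds on the fourth
      action = counter.updateAndGet(_ + 1).flatMap { n =>
                 IO.raiseError(new RuntimeException(s"attempt $n failed")).whenA(n < 4)
               }
      _ <- action
             .onError { case _ => IO.println("the loader would call snowflakeHealth.setUnhealthy() here") }
             .retryingOnAllErrors(
               // Same policy shape as the patch: 10ms, 20ms, 40ms, ... with no upper bound
               policy = RetryPolicies.exponentialBackoff[IO](10.millis),
               onError = (error, details) =>
                 IO.println(s"Executing command failed. Retries so far: ${details.retriesSoFar} (${error.getMessage})")
             )
      _ <- IO.println("success; the loader would call snowflakeHealth.setHealthy() here")
    } yield ()
}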
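Two design points worth flagging for review: exponentialBackoff on its own never gives up, which matches the contract implied by the name retryIndefinitely, but it means a permanently broken Snowflake connection keeps the loader retrying while the health probe reports unhealthy; if a bound were ever wanted, cats-retry policies compose (for example limitRetries(5) join exponentialBackoff(...)). And since reference.conf only supplies the default, deployments can override it in their own config, e.g. "retries": { "backoff": "1 minute" }.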