Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Add configuration of the backoff algorithm for the federation client #15754

Merged
merged 17 commits into from
Aug 3, 2023
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/15754.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Allow for the configuration of the backoff algorithm for federation destinations.
11 changes: 11 additions & 0 deletions docs/usage/configuration/config_documentation.md
Original file line number Diff line number Diff line change
Expand Up @@ -1212,6 +1212,14 @@ like sending a federation transaction.
* `max_short_retries`: maximum number of retries for the short retry algo. Default to 3 attempts.
* `max_long_retries`: maximum number of retries for the long retry algo. Default to 10 attempts.

The following options control the retry logic when communicating with a specific homeserver destination.
Unlike the previous configuration options, these values apply across all requests
for a given destination and the state of the backoff is stored in the database.

* `destination_min_retry_interval`: the initial backoff, after the first request fails. Defaults to 10m.
* `destination_retry_multiplier`: how much we multiply the backoff by after each subsequent fail. Defaults to 2.
* `destination_max_retry_interval`: a cap on the backoff. Defaults to a week.

Example configuration:
```yaml
federation:
MatMaul marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -1220,6 +1228,9 @@ federation:
max_long_retry_delay: 100s
max_short_retries: 5
max_long_retries: 20
destination_min_retry_interval: 30s
destination_retry_multiplier: 5
destination_max_retry_interval: 12h
```
---
## Caching
Expand Down
14 changes: 14 additions & 0 deletions synapse/config/federation.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,5 +65,19 @@ def read_config(self, config: JsonDict, **kwargs: Any) -> None:
self.max_long_retries = federation_config.get("max_long_retries", 10)
self.max_short_retries = federation_config.get("max_short_retries", 3)

# Allow for the configuration of the backoff algorithm used
# when trying to reach an unavailable destination.
# Unlike previous configuration those values applies across
# multiple requests and the state of the backoff is stored on DB.
self.destination_min_retry_interval_ms = Config.parse_duration(
federation_config.get("destination_min_retry_interval", "10m")
)
self.destination_retry_multiplier = federation_config.get(
"destination_retry_multiplier", 2
)
self.destination_max_retry_interval_ms = Config.parse_duration(
federation_config.get("destination_max_retry_interval", "7d")
)
MatMaul marked this conversation as resolved.
Show resolved Hide resolved


_METRICS_FOR_DOMAINS_SCHEMA = {"type": "array", "items": {"type": "string"}}
29 changes: 16 additions & 13 deletions synapse/util/retryutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,6 @@

logger = logging.getLogger(__name__)

# the initial backoff, after the first transaction fails
MIN_RETRY_INTERVAL = 10 * 60 * 1000

# how much we multiply the backoff by after each subsequent fail
RETRY_MULTIPLIER = 5

# a cap on the backoff. (Essentially none)
MAX_RETRY_INTERVAL = 2**62


class NotRetryingDestination(Exception):
def __init__(self, retry_last_ts: int, retry_interval: int, destination: str):
Expand Down Expand Up @@ -169,6 +160,16 @@ def __init__(
self.notifier = notifier
self.replication_client = replication_client

self.destination_min_retry_interval_ms = (
self.store.hs.config.federation.destination_min_retry_interval_ms
)
self.destination_retry_multiplier = (
self.store.hs.config.federation.destination_retry_multiplier
)
self.destination_max_retry_interval_ms = (
self.store.hs.config.federation.destination_max_retry_interval_ms
)

def __enter__(self) -> None:
pass

Expand Down Expand Up @@ -220,13 +221,15 @@ def __exit__(
# We couldn't connect.
if self.retry_interval:
self.retry_interval = int(
self.retry_interval * RETRY_MULTIPLIER * random.uniform(0.8, 1.4)
self.retry_interval
* self.destination_retry_multiplier
* random.uniform(0.8, 1.4)
)

if self.retry_interval >= MAX_RETRY_INTERVAL:
self.retry_interval = MAX_RETRY_INTERVAL
if self.retry_interval >= self.destination_max_retry_interval_ms:
self.retry_interval = self.destination_max_retry_interval_ms
else:
self.retry_interval = MIN_RETRY_INTERVAL
self.retry_interval = self.destination_min_retry_interval_ms

logger.info(
"Connection to %s was unsuccessful (%s(%s)); backoff now %i",
Expand Down
6 changes: 4 additions & 2 deletions tests/storage/test_transactions.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
from synapse.server import HomeServer
from synapse.storage.databases.main.transactions import DestinationRetryTimings
from synapse.util import Clock
from synapse.util.retryutils import MAX_RETRY_INTERVAL

from tests.unittest import HomeserverTestCase

Expand Down Expand Up @@ -57,8 +56,11 @@ def test_initial_set_transactions(self) -> None:
self.get_success(d)

def test_large_destination_retry(self) -> None:
max_retry_interval = (
MatMaul marked this conversation as resolved.
Show resolved Hide resolved
self.hs.config.federation.destination_max_retry_interval_ms * 1000
MatMaul marked this conversation as resolved.
Show resolved Hide resolved
)
d = self.store.set_destination_retry_timings(
"example.com", MAX_RETRY_INTERVAL, MAX_RETRY_INTERVAL, MAX_RETRY_INTERVAL
"example.com", max_retry_interval, max_retry_interval, max_retry_interval
)
self.get_success(d)

Expand Down
20 changes: 9 additions & 11 deletions tests/util/test_retryutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from synapse.util.retryutils import (
MIN_RETRY_INTERVAL,
RETRY_MULTIPLIER,
NotRetryingDestination,
get_retry_limiter,
)
from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter

from tests.unittest import HomeserverTestCase

Expand All @@ -42,6 +37,9 @@ def test_limiter(self) -> None:

limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))

min_retry_interval = self.hs.config.federation.destination_min_retry_interval_ms
MatMaul marked this conversation as resolved.
Show resolved Hide resolved
retry_multiplier = self.hs.config.federation.destination_retry_multiplier

self.pump(1)
try:
with limiter:
Expand All @@ -57,7 +55,7 @@ def test_limiter(self) -> None:
assert new_timings is not None
self.assertEqual(new_timings.failure_ts, failure_ts)
self.assertEqual(new_timings.retry_last_ts, failure_ts)
self.assertEqual(new_timings.retry_interval, MIN_RETRY_INTERVAL)
self.assertEqual(new_timings.retry_interval, min_retry_interval)

# now if we try again we should get a failure
self.get_failure(
Expand All @@ -68,7 +66,7 @@ def test_limiter(self) -> None:
# advance the clock and try again
#

self.pump(MIN_RETRY_INTERVAL)
self.pump(min_retry_interval)
limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))

self.pump(1)
Expand All @@ -87,16 +85,16 @@ def test_limiter(self) -> None:
self.assertEqual(new_timings.failure_ts, failure_ts)
self.assertEqual(new_timings.retry_last_ts, retry_ts)
self.assertGreaterEqual(
new_timings.retry_interval, MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 0.5
new_timings.retry_interval, min_retry_interval * retry_multiplier * 0.5
)
self.assertLessEqual(
new_timings.retry_interval, MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0
new_timings.retry_interval, min_retry_interval * retry_multiplier * 2.0
)

#
# one more go, with success
#
self.reactor.advance(MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0)
self.reactor.advance(min_retry_interval * retry_multiplier * 2.0)
limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))

self.pump(1)
Expand Down