Skip to content

Commit

Permalink
Ratelimiting of remote media downloads (#17256)
Browse files Browse the repository at this point in the history
  • Loading branch information
H-Shay authored Jun 5, 2024
1 parent aabf577 commit fcbc79b
Show file tree
Hide file tree
Showing 12 changed files with 372 additions and 14 deletions.
1 change: 1 addition & 0 deletions changelog.d/17256.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Improve ratelimiting in Synapse (#17256).
18 changes: 18 additions & 0 deletions docs/usage/configuration/config_documentation.md
Original file line number Diff line number Diff line change
Expand Up @@ -1946,6 +1946,24 @@ Example configuration:
max_image_pixels: 35M
```
---
### `remote_media_download_burst_count`

Remote media downloads are ratelimited using a [leaky bucket algorithm](https://en.wikipedia.org/wiki/Leaky_bucket), where a given "bucket" is keyed to the IP address of the requester when requesting remote media downloads. This configuration option sets the size of the bucket against which the size in bytes of downloads are penalized - if the bucket is full, ie a given number of bytes have already been downloaded, further downloads will be denied until the bucket drains. Defaults to 500MiB. See also `remote_media_download_per_second` which determines the rate at which the "bucket" is emptied and thus has available space to authorize new requests.

Example configuration:
```yaml
remote_media_download_burst_count: 200M
```
---
### `remote_media_download_per_second`

Works in conjunction with `remote_media_download_burst_count` to ratelimit remote media downloads - this configuration option determines the rate at which the "bucket" (see above) leaks in bytes per second. As requests are made to download remote media, the size of those requests in bytes is added to the bucket, and once the bucket has reached it's capacity, no more requests will be allowed until a number of bytes has "drained" from the bucket. This setting determines the rate at which bytes drain from the bucket, with the practical effect that the larger the number, the faster the bucket leaks, allowing for more bytes downloaded over a shorter period of time. Defaults to 87KiB per second. See also `remote_media_download_burst_count`.

Example configuration:
```yaml
remote_media_download_per_second: 40K
```
---
### `prevent_media_downloads_from`

A list of domains to never download media from. Media from these
Expand Down
10 changes: 10 additions & 0 deletions synapse/config/ratelimiting.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,3 +218,13 @@ def read_config(self, config: JsonDict, **kwargs: Any) -> None:
"rc_media_create",
defaults={"per_second": 10, "burst_count": 50},
)

self.remote_media_downloads = RatelimitSettings(
key="rc_remote_media_downloads",
per_second=self.parse_size(
config.get("remote_media_download_per_second", "87K")
),
burst_count=self.parse_size(
config.get("remote_media_download_burst_count", "500M")
),
)
7 changes: 7 additions & 0 deletions synapse/federation/federation_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
SynapseError,
UnsupportedRoomVersionError,
)
from synapse.api.ratelimiting import Ratelimiter
from synapse.api.room_versions import (
KNOWN_ROOM_VERSIONS,
EventFormatVersions,
Expand Down Expand Up @@ -1877,6 +1878,8 @@ async def download_media(
output_stream: BinaryIO,
max_size: int,
max_timeout_ms: int,
download_ratelimiter: Ratelimiter,
ip_address: str,
) -> Tuple[int, Dict[bytes, List[bytes]]]:
try:
return await self.transport_layer.download_media_v3(
Expand All @@ -1885,6 +1888,8 @@ async def download_media(
output_stream=output_stream,
max_size=max_size,
max_timeout_ms=max_timeout_ms,
download_ratelimiter=download_ratelimiter,
ip_address=ip_address,
)
except HttpResponseException as e:
# If an error is received that is due to an unrecognised endpoint,
Expand All @@ -1905,6 +1910,8 @@ async def download_media(
output_stream=output_stream,
max_size=max_size,
max_timeout_ms=max_timeout_ms,
download_ratelimiter=download_ratelimiter,
ip_address=ip_address,
)


Expand Down
9 changes: 9 additions & 0 deletions synapse/federation/transport/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@

from synapse.api.constants import Direction, Membership
from synapse.api.errors import Codes, HttpResponseException, SynapseError
from synapse.api.ratelimiting import Ratelimiter
from synapse.api.room_versions import RoomVersion
from synapse.api.urls import (
FEDERATION_UNSTABLE_PREFIX,
Expand Down Expand Up @@ -819,6 +820,8 @@ async def download_media_r0(
output_stream: BinaryIO,
max_size: int,
max_timeout_ms: int,
download_ratelimiter: Ratelimiter,
ip_address: str,
) -> Tuple[int, Dict[bytes, List[bytes]]]:
path = f"/_matrix/media/r0/download/{destination}/{media_id}"

Expand All @@ -834,6 +837,8 @@ async def download_media_r0(
"allow_remote": "false",
"timeout_ms": str(max_timeout_ms),
},
download_ratelimiter=download_ratelimiter,
ip_address=ip_address,
)

async def download_media_v3(
Expand All @@ -843,6 +848,8 @@ async def download_media_v3(
output_stream: BinaryIO,
max_size: int,
max_timeout_ms: int,
download_ratelimiter: Ratelimiter,
ip_address: str,
) -> Tuple[int, Dict[bytes, List[bytes]]]:
path = f"/_matrix/media/v3/download/{destination}/{media_id}"

Expand All @@ -862,6 +869,8 @@ async def download_media_v3(
"allow_redirect": "true",
},
follow_redirects=True,
download_ratelimiter=download_ratelimiter,
ip_address=ip_address,
)


Expand Down
55 changes: 51 additions & 4 deletions synapse/http/matrixfederationclient.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@
from twisted.internet.task import Cooperator
from twisted.web.client import ResponseFailed
from twisted.web.http_headers import Headers
from twisted.web.iweb import IAgent, IBodyProducer, IResponse
from twisted.web.iweb import UNKNOWN_LENGTH, IAgent, IBodyProducer, IResponse

import synapse.metrics
import synapse.util.retryutils
Expand All @@ -68,6 +68,7 @@
RequestSendFailed,
SynapseError,
)
from synapse.api.ratelimiting import Ratelimiter
from synapse.crypto.context_factory import FederationPolicyForHTTPS
from synapse.http import QuieterFileBodyProducer
from synapse.http.client import (
Expand Down Expand Up @@ -1411,9 +1412,11 @@ async def get_file(
destination: str,
path: str,
output_stream: BinaryIO,
download_ratelimiter: Ratelimiter,
ip_address: str,
max_size: int,
args: Optional[QueryParams] = None,
retry_on_dns_fail: bool = True,
max_size: Optional[int] = None,
ignore_backoff: bool = False,
follow_redirects: bool = False,
) -> Tuple[int, Dict[bytes, List[bytes]]]:
Expand All @@ -1422,6 +1425,10 @@ async def get_file(
destination: The remote server to send the HTTP request to.
path: The HTTP path to GET.
output_stream: File to write the response body to.
download_ratelimiter: a ratelimiter to limit remote media downloads, keyed to
requester IP
ip_address: IP address of the requester
max_size: maximum allowable size in bytes of the file
args: Optional dictionary used to create the query string.
ignore_backoff: true to ignore the historical backoff data
and try the request anyway.
Expand All @@ -1441,11 +1448,27 @@ async def get_file(
federation whitelist
RequestSendFailed: If there were problems connecting to the
remote, due to e.g. DNS failures, connection timeouts etc.
SynapseError: If the requested file exceeds ratelimits
"""
request = MatrixFederationRequest(
method="GET", destination=destination, path=path, query=args
)

# check for a minimum balance of 1MiB in ratelimiter before initiating request
send_req, _ = await download_ratelimiter.can_do_action(
requester=None, key=ip_address, n_actions=1048576, update=False
)

if not send_req:
msg = "Requested file size exceeds ratelimits"
logger.warning(
"{%s} [%s] %s",
request.txn_id,
request.destination,
msg,
)
raise SynapseError(HTTPStatus.TOO_MANY_REQUESTS, msg, Codes.LIMIT_EXCEEDED)

response = await self._send_request(
request,
retry_on_dns_fail=retry_on_dns_fail,
Expand All @@ -1455,12 +1478,36 @@ async def get_file(

headers = dict(response.headers.getAllRawHeaders())

expected_size = response.length
# if we don't get an expected length then use the max length
if expected_size == UNKNOWN_LENGTH:
expected_size = max_size
logger.debug(
f"File size unknown, assuming file is max allowable size: {max_size}"
)

read_body, _ = await download_ratelimiter.can_do_action(
requester=None,
key=ip_address,
n_actions=expected_size,
)
if not read_body:
msg = "Requested file size exceeds ratelimits"
logger.warning(
"{%s} [%s] %s",
request.txn_id,
request.destination,
msg,
)
raise SynapseError(HTTPStatus.TOO_MANY_REQUESTS, msg, Codes.LIMIT_EXCEEDED)

try:
d = read_body_with_max_size(response, output_stream, max_size)
# add a byte of headroom to max size as function errs at >=
d = read_body_with_max_size(response, output_stream, expected_size + 1)
d.addTimeout(self.default_timeout_seconds, self.reactor)
length = await make_deferred_yieldable(d)
except BodyExceededMaxSize:
msg = "Requested file is too large > %r bytes" % (max_size,)
msg = "Requested file is too large > %r bytes" % (expected_size,)
logger.warning(
"{%s} [%s] %s",
request.txn_id,
Expand Down
43 changes: 38 additions & 5 deletions synapse/media/media_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
SynapseError,
cs_error,
)
from synapse.api.ratelimiting import Ratelimiter
from synapse.config.repository import ThumbnailRequirement
from synapse.http.server import respond_with_json
from synapse.http.site import SynapseRequest
Expand Down Expand Up @@ -111,6 +112,12 @@ def __init__(self, hs: "HomeServer"):
)
self.prevent_media_downloads_from = hs.config.media.prevent_media_downloads_from

self.download_ratelimiter = Ratelimiter(
store=hs.get_storage_controllers().main,
clock=hs.get_clock(),
cfg=hs.config.ratelimiting.remote_media_downloads,
)

# List of StorageProviders where we should search for media and
# potentially upload to.
storage_providers = []
Expand Down Expand Up @@ -464,6 +471,7 @@ async def get_remote_media(
media_id: str,
name: Optional[str],
max_timeout_ms: int,
ip_address: str,
) -> None:
"""Respond to requests for remote media.
Expand All @@ -475,6 +483,7 @@ async def get_remote_media(
the filename in the Content-Disposition header of the response.
max_timeout_ms: the maximum number of milliseconds to wait for the
media to be uploaded.
ip_address: the IP address of the requester
Returns:
Resolves once a response has successfully been written to request
Expand All @@ -500,7 +509,11 @@ async def get_remote_media(
key = (server_name, media_id)
async with self.remote_media_linearizer.queue(key):
responder, media_info = await self._get_remote_media_impl(
server_name, media_id, max_timeout_ms
server_name,
media_id,
max_timeout_ms,
self.download_ratelimiter,
ip_address,
)

# We deliberately stream the file outside the lock
Expand All @@ -517,7 +530,7 @@ async def get_remote_media(
respond_404(request)

async def get_remote_media_info(
self, server_name: str, media_id: str, max_timeout_ms: int
self, server_name: str, media_id: str, max_timeout_ms: int, ip_address: str
) -> RemoteMedia:
"""Gets the media info associated with the remote file, downloading
if necessary.
Expand All @@ -527,6 +540,7 @@ async def get_remote_media_info(
media_id: The media ID of the content (as defined by the remote server).
max_timeout_ms: the maximum number of milliseconds to wait for the
media to be uploaded.
ip_address: IP address of the requester
Returns:
The media info of the file
Expand All @@ -542,7 +556,11 @@ async def get_remote_media_info(
key = (server_name, media_id)
async with self.remote_media_linearizer.queue(key):
responder, media_info = await self._get_remote_media_impl(
server_name, media_id, max_timeout_ms
server_name,
media_id,
max_timeout_ms,
self.download_ratelimiter,
ip_address,
)

# Ensure we actually use the responder so that it releases resources
Expand All @@ -553,7 +571,12 @@ async def get_remote_media_info(
return media_info

async def _get_remote_media_impl(
self, server_name: str, media_id: str, max_timeout_ms: int
self,
server_name: str,
media_id: str,
max_timeout_ms: int,
download_ratelimiter: Ratelimiter,
ip_address: str,
) -> Tuple[Optional[Responder], RemoteMedia]:
"""Looks for media in local cache, if not there then attempt to
download from remote server.
Expand All @@ -564,6 +587,9 @@ async def _get_remote_media_impl(
remote server).
max_timeout_ms: the maximum number of milliseconds to wait for the
media to be uploaded.
download_ratelimiter: a ratelimiter limiting remote media downloads, keyed to
requester IP.
ip_address: the IP address of the requester
Returns:
A tuple of responder and the media info of the file.
Expand Down Expand Up @@ -596,7 +622,7 @@ async def _get_remote_media_impl(

try:
media_info = await self._download_remote_file(
server_name, media_id, max_timeout_ms
server_name, media_id, max_timeout_ms, download_ratelimiter, ip_address
)
except SynapseError:
raise
Expand Down Expand Up @@ -630,6 +656,8 @@ async def _download_remote_file(
server_name: str,
media_id: str,
max_timeout_ms: int,
download_ratelimiter: Ratelimiter,
ip_address: str,
) -> RemoteMedia:
"""Attempt to download the remote file from the given server name,
using the given file_id as the local id.
Expand All @@ -641,6 +669,9 @@ async def _download_remote_file(
locally generated.
max_timeout_ms: the maximum number of milliseconds to wait for the
media to be uploaded.
download_ratelimiter: a ratelimiter limiting remote media downloads, keyed to
requester IP
ip_address: the IP address of the requester
Returns:
The media info of the file.
Expand All @@ -658,6 +689,8 @@ async def _download_remote_file(
output_stream=f,
max_size=self.max_upload_size,
max_timeout_ms=max_timeout_ms,
download_ratelimiter=download_ratelimiter,
ip_address=ip_address,
)
except RequestSendFailed as e:
logger.warning(
Expand Down
6 changes: 4 additions & 2 deletions synapse/media/thumbnailer.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,9 +359,10 @@ async def select_or_generate_remote_thumbnail(
desired_method: str,
desired_type: str,
max_timeout_ms: int,
ip_address: str,
) -> None:
media_info = await self.media_repo.get_remote_media_info(
server_name, media_id, max_timeout_ms
server_name, media_id, max_timeout_ms, ip_address
)
if not media_info:
respond_404(request)
Expand Down Expand Up @@ -422,12 +423,13 @@ async def respond_remote_thumbnail(
method: str,
m_type: str,
max_timeout_ms: int,
ip_address: str,
) -> None:
# TODO: Don't download the whole remote file
# We should proxy the thumbnail from the remote server instead of
# downloading the remote file and generating our own thumbnails.
media_info = await self.media_repo.get_remote_media_info(
server_name, media_id, max_timeout_ms
server_name, media_id, max_timeout_ms, ip_address
)
if not media_info:
return
Expand Down
Loading

0 comments on commit fcbc79b

Please sign in to comment.