Protection against memory spikes when downloading assets (#31)

torchbox · Jul 29, 2024 · 53327fe · 53327fe
1 parent 9e291e1
commit 53327fe
Show file tree

Hide file tree

Showing 7 changed files with 225 additions and 27 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [unreleased]
 
+### Added
+
+- BYNDER_MAX_DOCUMENT_FILE_SIZE and BYNDER_MAX_IMAGE_FILE_SIZE settings to guard against memory spikes when downloading asset files ([#31]https://github.com/torchbox/wagtail-bynder/pull/31) @ababic
+
 ## [0.5.1] - 2024-07-29
 
 ### Fixed

diff --git a/README.md b/README.md
@@ -64,7 +64,6 @@ When communicating with Bynder about configuring a new instance for compatibilit
 2. A custom derivative must to be configured for image assets
 3. A couple of custom derivatives must be configured for video assets
 
-
 ### Why are custom derivatives needed?
 
 It is common for assets to be uploaded to a DAMS in formats that preserve as much quality as possible. For example, high-resolution uncompressed TIFF images are common for digital photography. While great for print and other media, such formats are simply overkill for most websites. Not only are images likely to be shown at much smaller dimensions in a web browser, but they are also likely to be converted to more web-friendly formats like AVIF or WebP by Wagtail, where the image quality of an uncompressed TIFF is unlikely to shine through.
@@ -253,6 +252,30 @@ WARNING: It's important to get this right, because if the specified derivative i
 image for any reason, the ORIGINAL will be downloaded - which will lead to slow chooser response times and higher memory
 usage when generating renditions.
 
+### `BYNDER_MAX_DOCUMENT_FILE_SIZE`
+
+Example: `10485760`
+
+Default: `5242880`
+
+The maximum acceptable file size (in Bytes) when downloading a 'Document' asset from Bynder. This is safety measure to protect projects against memory spikes when file contents is loaded into memory, and can be tweaked on a project/environment basis to reflect:
+
+- How much RAM is available in the host infrastructure
+- How large the documents are that editors want to feature in content
+- Whether you are doing anything particularly memory intensive with document files in your project (e.g. text/content analysis)
+
+### `BYNDER_MAX_IMAGE_FILE_SIZE`
+
+Example: `10485760`
+
+Default: `5242880`
+
+The maximum acceptable file size (in Bytes) when downloading an 'Image' asset from Bynder. This is safety measure to protect projects against memory spikes when file contents is loaded into memory.
+
+This setting is provided separately to `BYNDER_MAX_DOCUMENT_FILE_SIZE`, because it often needs to be set to a lower value, even if enough RAM is available to hold the orignal file in memory. This is because server-size image libraries have to understand the individual pixel values of the image, which often requires much more memory than that of the original contents.
+
+As with `BYNDER_MAX_DOCUMENT_FILE_SIZE`, this can be tweaked for individual projects/environments to reflect how much RAM is available in the host infrastructure.
+
 ### `BYNDER_VIDEO_MODEL`
 
 Example: `"video.Video"`

diff --git a/src/wagtail_bynder/exceptions.py b/src/wagtail_bynder/exceptions.py
@@ -3,3 +3,10 @@ class BynderAssetDataError(Exception):
     Raised when values expected to be present in an asset API representation
     from Bynder are missing.
     """
+
+
+class BynderAssetFileTooLarge(Exception):
+    """
+    Raised when an asset file being downloaded from Bynder is found to be
+    larger than specified in ``settings.BYNDER_MAX_DOWNLOAD_FILE_SIZE``
+    """
diff --git a/src/wagtail_bynder/models.py b/src/wagtail_bynder/models.py
@@ -6,6 +6,7 @@
 from typing import Any
 
 from django.conf import settings
+from django.core.files.uploadedfile import UploadedFile
 from django.db import models
 from django.utils.functional import cached_property
 from django.utils.translation import gettext_lazy as _
@@ -155,7 +156,10 @@ def asset_file_has_changed(self, asset_data: dict[str, Any]) -> bool:
 
     def update_file(self, asset_data: dict[str, Any]) -> None:
         source_url = self.extract_file_source(asset_data)
-        self.file = utils.download_asset(source_url)
+        file = self.download_file(source_url)
+        processed_file = self.process_downloaded_file(file, asset_data)
+
+        self.file = processed_file if processed_file is not None else file
 
         # Used to trigger additional updates on save()
         self._file_changed = True
@@ -164,6 +168,27 @@ def update_file(self, asset_data: dict[str, Any]) -> None:
         self.source_filename = utils.filename_from_url(source_url)
         self.original_filesize = int(asset_data["fileSize"])
 
+    def download_file(self, source_url: str) -> UploadedFile:
+        raise NotImplementedError
+
+    def process_downloaded_file(
+        self, file: UploadedFile, asset_data: dict[str, Any]
+    ) -> UploadedFile:
+        """
+        A hook to allow subclasses to apply additional analysis/customisation
+        of asset files downloaded from Bynder BEFORE they are used to set the
+        object's ``file`` field value. The return value is an ``UploadedFile``
+        object that should be used to set the new field value.
+
+        The provided `file` object is considered mutable, so may be modified
+        directly and returned, or used purely as source data to create and
+        return an entirely new ``UploadedFile`` instance.
+
+        By default, the provided ``file`` is returned as is, without taking
+        any further action.
+        """
+        return file
+
 
 class BynderSyncedImage(BynderAssetWithFileMixin, AbstractImage):
     admin_form_fields = (
@@ -238,6 +263,9 @@ def update_file(self, asset_data: dict[str, Any]) -> None:
         self.original_height = int(asset_data["height"])
         return super().update_file(asset_data)
 
+    def download_file(self, source_url: str) -> UploadedFile:
+        return utils.download_image(source_url)
+
     def set_focal_area_from_focus_point(
         self, x: int, y: int, original_height: int, original_width: int
     ) -> None:
@@ -324,6 +352,9 @@ def extract_file_source(asset_data: dict[str, Any]) -> str:
                 "'original' asset URL in order to download and save its own copy."
             ) from e
 
+    def download_file(self, source_url: str) -> UploadedFile:
+        return utils.download_document(source_url)
+
 
 class BynderSyncedVideo(
     BynderAssetMixin, CollectionMember, index.Indexed, models.Model

diff --git a/src/wagtail_bynder/utils.py b/src/wagtail_bynder/utils.py
@@ -9,40 +9,56 @@
 from bynder_sdk import BynderClient
 from django.conf import settings
 from django.core.files.uploadedfile import InMemoryUploadedFile
+from django.template.defaultfilters import filesizeformat
 from wagtail.models import Collection
 
+from .exceptions import BynderAssetFileTooLarge
 
-_DEFAULT_COLLECTION = Local()
 
+_DEFAULT_COLLECTION = Local()
 
-class DownloadedFile(BytesIO):
-    name: str
-    size: int
-    content_type: str | None
-    charset: str | None
 
+def download_file(
+    url: str, max_filesize: int, max_filesize_setting_name: str
+) -> InMemoryUploadedFile:
+    name = os.path.basename(url)
 
-def download_file(url: str) -> DownloadedFile:
-    raw_bytes = requests.get(url, timeout=20).content
-    f = DownloadedFile(raw_bytes)
-    f.name = os.path.basename(url)
-    f.size = len(raw_bytes)
-    f.content_type, f.charset = mimetypes.guess_type(f.name)
-    return f
+    # Stream file to memory
+    file = BytesIO()
+    for line in requests.get(url, timeout=20, stream=True):
+        file.write(line)
+        if file.tell() > max_filesize:
+            file.truncate(0)
+            raise BynderAssetFileTooLarge(
+                f"File '{name}' exceeded the size limit enforced by the {max_filesize_setting_name} setting, which is currently set to {filesizeformat(max_filesize)}."
+            )
 
+    size = file.tell()
+    file.seek(0)
 
-def download_asset(url: str) -> InMemoryUploadedFile:
-    f = download_file(url)
+    content_type, charset = mimetypes.guess_type(name)
     return InMemoryUploadedFile(
-        f,
-        name=f.name,
+        file,
         field_name="file",
-        size=f.size,
-        charset=f.charset,
-        content_type=f.content_type,
+        name=name,
+        content_type=content_type,
+        size=size,
+        charset=charset,
     )
 
 
+def download_document(url: str) -> InMemoryUploadedFile:
+    max_filesize_setting_name = "BYNDER_MAX_DOCUMENT_FILE_SIZE"
+    max_filesize = getattr(settings, max_filesize_setting_name, 5242880)
+    return download_file(url, max_filesize, max_filesize_setting_name)
+
+
+def download_image(url: str) -> InMemoryUploadedFile:
+    max_filesize_setting_name = "BYNDER_MAX_IMAGE_FILE_SIZE"
+    max_filesize = getattr(settings, max_filesize_setting_name, 5242880)
+    return download_file(url, max_filesize, max_filesize_setting_name)
+
+
 def filename_from_url(url: str) -> str:
     return os.path.basename(url)
 

diff --git a/tests/test_models.py b/tests/test_models.py
@@ -9,7 +9,11 @@
 from wagtail_bynder.exceptions import BynderAssetDataError
 from wagtail_bynder.utils import filename_from_url
 
-from .utils import get_test_asset_data
+from .utils import (
+    get_fake_downloaded_document,
+    get_fake_downloaded_image,
+    get_test_asset_data,
+)
 
 
 class BynderSyncedDocumentTests(SimpleTestCase):
@@ -65,11 +69,23 @@ def test_update_file(self):
         self.obj.original_filesize = None
         self.assertFalse(hasattr(self.obj, "_file_changed"))
 
-        with mock.patch(
-            "wagtail_bynder.models.utils.download_asset", return_value=None
+        fake_document = get_fake_downloaded_document()
+
+        with (
+            mock.patch.object(
+                self.obj, "download_file", return_value=fake_document
+            ) as download_file_mock,
+            mock.patch.object(
+                self.obj, "process_downloaded_file", return_value=fake_document
+            ) as process_downloaded_file_mock,
         ):
             self.obj.update_file(self.asset_data)
 
+        download_file_mock.assert_called_once_with(self.asset_data["original"])
+        process_downloaded_file_mock.assert_called_once_with(
+            fake_document, self.asset_data
+        )
+
         self.assertTrue(self.obj._file_changed)
         self.assertEqual(
             self.obj.source_filename, filename_from_url(self.asset_data["original"])
@@ -175,11 +191,24 @@ def test_update_file(self):
         self.obj.original_width = None
         self.assertFalse(hasattr(self.obj, "_file_changed"))
 
-        with mock.patch(
-            "wagtail_bynder.models.utils.download_asset", return_value=None
+        fake_image = get_fake_downloaded_image()
+        with (
+            mock.patch.object(
+                self.obj, "download_file", return_value=fake_image
+            ) as download_file_mock,
+            mock.patch.object(
+                self.obj, "process_downloaded_file", return_value=fake_image
+            ) as process_downloaded_file_mock,
         ):
             self.obj.update_file(self.asset_data)
 
+        download_file_mock.assert_called_once_with(
+            self.asset_data["thumbnails"]["WagtailSource"]
+        )
+        process_downloaded_file_mock.assert_called_once_with(
+            fake_image, self.asset_data
+        )
+
         self.assertTrue(self.obj._file_changed)
         self.assertEqual(
             self.obj.source_filename,

diff --git a/tests/utils.py b/tests/utils.py
@@ -1,11 +1,46 @@
+import io
+import os
+
 from datetime import datetime
+from enum import StrEnum
+from random import choice
+from string import ascii_uppercase
 
+from django.core.files.uploadedfile import InMemoryUploadedFile
 from django.utils.text import slugify
+from PIL import Image
 
 
 TEST_ASSET_ID = "1A7BA172-97B9-44A4-8C0AA41D9E8AE6A2"
 
 
+class ImageFormat(StrEnum):
+    JPEG = "JPEG"
+    GIF = "GIF"
+    PNG = "PNG"
+    BMP = "BMP"
+    WEBP = "WebP"
+    TIFF = "TIFF"
+
+
+IMAGE_EXTENSION_TO_FORMAT = {
+    ".jpg": (ImageFormat.JPEG, "image/jpeg"),
+    ".jpeg": (ImageFormat.JPEG, "image/jpeg"),
+    ".gif": (ImageFormat.GIF, "image/gif"),
+    ".png": (ImageFormat.PNG, "image/png"),
+    ".webp": (ImageFormat.WEBP, "image/webp"),
+    ".bmp": (ImageFormat.BMP, "image/bmp"),
+    ".tif": (ImageFormat.TIFF, "image/tiff"),
+    ".tiff": (ImageFormat.TIFF, "image/tiff"),
+}
+
+DOCUMENT_EXTENSION_TO_FORMAT = {
+    ".pdf": "application/pdf",
+    ".txt": "text/plain",
+    ".xml": "text/xml",
+}
+
+
 def get_test_asset_data(
     id: str = TEST_ASSET_ID,
     name: str = "Test asset",
@@ -86,3 +121,56 @@ def get_test_asset_data(
         ]
 
     return data
+
+
+def get_fake_image(
+    width: int = 100, height: int = 100, image_format: ImageFormat = ImageFormat.JPEG
+) -> io.BytesIO:
+    thumb_io = io.BytesIO()
+    with Image.new("RGB", (width, height), "blue") as thumb:
+        thumb.save(thumb_io, format=image_format)
+    return thumb_io
+
+
+def get_fake_document(size: int = 1024) -> io.BytesIO:
+    contents = "".join(choice(ascii_uppercase) for i in range(size))  # noqa: S311
+    return io.BytesIO(contents.encode("utf-8"))
+
+
+def get_fake_downloaded_image(
+    name: str = "fake.jpg", width: int = 100, height: int = 100
+) -> InMemoryUploadedFile:
+    _, ext = os.path.splitext(name.lower())
+    if ext not in IMAGE_EXTENSION_TO_FORMAT:
+        raise ValueError(
+            f"{ext} is not supported image file extension. Try one of the following: {list(IMAGE_EXTENSION_TO_FORMAT.keys())}"
+        )
+    image_format, content_type = IMAGE_EXTENSION_TO_FORMAT[ext]
+
+    return InMemoryUploadedFile(
+        get_fake_image(width, height, image_format),
+        field_name="file",
+        name=name,
+        content_type=content_type,
+        size=1048576,
+        charset="utf-8",
+    )
+
+
+def get_fake_downloaded_document(
+    name: str = "fake.pdf", size: int = 1024
+) -> InMemoryUploadedFile:
+    _, ext = os.path.splitext(name.lower())
+    if ext not in DOCUMENT_EXTENSION_TO_FORMAT:
+        raise ValueError(
+            f"{ext} is not supported document file extension. Try one of the following: {list(DOCUMENT_EXTENSION_TO_FORMAT.keys())}"
+        )
+
+    return InMemoryUploadedFile(
+        get_fake_document(size),
+        field_name="file",
+        name=name,
+        content_type=DOCUMENT_EXTENSION_TO_FORMAT[ext],
+        size=size,
+        charset="utf-8",
+    )