From 475a1e85864727dfcbaeac984c998aa99376ccd5 Mon Sep 17 00:00:00 2001 From: Sven Marnach Date: Wed, 10 Jul 2024 13:05:07 +0200 Subject: [PATCH] bug-1906959: Allow GCS buckets to be behind a CDN. --- docker-compose.yml | 10 ++++++++++ docker/config/local_dev.env | 4 ++-- docker/images/gcs-cdn/Dockerfile | 2 ++ docker/images/gcs-cdn/default.conf | 8 ++++++++ tecken/download/views.py | 5 +++-- tecken/ext/gcs/storage.py | 16 +++++++++++++--- tecken/libstorage.py | 10 +++++++--- tecken/tests/conftest.py | 15 +++++++++------ tecken/tests/test_download.py | 6 ++---- tecken/tests/test_storage_backends.py | 13 ++++++++----- tecken/tests/test_symbolstorage.py | 4 +--- tecken/tests/utils.py | 12 +++++++++++- 12 files changed, 76 insertions(+), 29 deletions(-) create mode 100644 docker/images/gcs-cdn/Dockerfile create mode 100644 docker/images/gcs-cdn/default.conf diff --git a/docker-compose.yml b/docker-compose.yml index 5dfa284a6..68f39a020 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -168,6 +168,16 @@ services: interval: 1s timeout: 3s retries: 5 + depends_on: + - gcs-cdn + + # nginx as a reverse proxy simulating a CDN in front of the GCS emulator. + gcs-cdn: + build: + context: docker/images/gcs-cdn + image: local/tecken_gcs_cdn + ports: + - "${EXPOSE_CDN_PORT:-8002}:8002" # https://hub.docker.com/r/localstack/localstack/ # localstack running a fake AWS S3 diff --git a/docker/config/local_dev.env b/docker/config/local_dev.env index 1472f28ff..8d964eb4c 100644 --- a/docker/config/local_dev.env +++ b/docker/config/local_dev.env @@ -32,8 +32,8 @@ AWS_ENDPOINT_URL=http://localstack:4566/ DEBUG=true LOCAL_DEV_ENV=true SYMBOL_URLS= -UPLOAD_DEFAULT_URL=http://gcs-emulator:8001/publicbucket/ -UPLOAD_TRY_SYMBOLS_URL=http://gcs-emulator:8001/publicbucket/try/ +UPLOAD_DEFAULT_URL=http://gcs-emulator:8001/publicbucket?public_url=http://gcs-cdn:8002/publicbucket +UPLOAD_TRY_SYMBOLS_URL=http://gcs-emulator:8001/publicbucket/try?public_url=http://gcs-cdn:8002/publicbucket/try # Default to the test oidcprovider container for Open ID Connect # diff --git a/docker/images/gcs-cdn/Dockerfile b/docker/images/gcs-cdn/Dockerfile new file mode 100644 index 000000000..7ab8af11e --- /dev/null +++ b/docker/images/gcs-cdn/Dockerfile @@ -0,0 +1,2 @@ +FROM nginx:1.27-alpine +COPY default.conf /etc/nginx/conf.d/default.conf diff --git a/docker/images/gcs-cdn/default.conf b/docker/images/gcs-cdn/default.conf new file mode 100644 index 000000000..6bc668a85 --- /dev/null +++ b/docker/images/gcs-cdn/default.conf @@ -0,0 +1,8 @@ +server { + listen 8002; + server_name cdn; + + location / { + proxy_pass http://gcs-emulator:8001; + } +} diff --git a/tecken/download/views.py b/tecken/download/views.py index 97a0ddf07..230ec69cb 100644 --- a/tecken/download/views.py +++ b/tecken/download/views.py @@ -156,13 +156,14 @@ def download_symbol(request, debugfilename, debugid, filename, try_symbols=False ) if metadata: url = metadata.download_url - if "http://localstack:4566" in url and request.get_host() == "localhost:8000": + if request.get_host() == "localhost:8000": # If doing local development, with Docker, you're most likely running # localstack as a fake S3. It runs on its own hostname that is only # available from other Docker containers. But to make it really convenient, # for testing symbol download we'll rewrite the URL to one that is possible # to reach from the host. - url = url.replace("localstack:4566", "localhost:4566") + url = url.replace("http://gcs-cdn:8002/", "http://localhost:8002/") + url = url.replace("http://localstack:4566/", "http://localhost:4566/") response = http.HttpResponseRedirect(url) if request._request_debug: response["Debug-Time"] = elapsed_time diff --git a/tecken/ext/gcs/storage.py b/tecken/ext/gcs/storage.py index 6d68ab761..00264c355 100644 --- a/tecken/ext/gcs/storage.py +++ b/tecken/ext/gcs/storage.py @@ -5,7 +5,7 @@ from io import BufferedReader import threading from typing import Optional -from urllib.parse import urlparse +from urllib.parse import quote, urlparse from django.conf import settings @@ -23,9 +23,15 @@ class GCSStorage(StorageBackend): accepted_hostnames = (".googleapis.com", "gcs-emulator", "gcs.example.com") - def __init__(self, url: str, try_symbols: bool = False): + def __init__( + self, url: str, try_symbols: bool = False, public_url: Optional[str] = None + ): url = url.removesuffix("/") self.url = url + if public_url: + self.public_url = public_url.removesuffix("/") + else: + self.public_url = None parsed_url = urlparse(url) self.endpoint_url = f"{parsed_url.scheme}://{parsed_url.netloc}" self.name, _, self.prefix = parsed_url.path[1:].partition("/") @@ -93,8 +99,12 @@ def get_object_metadata(self, key: str) -> Optional[ObjectMetadata]: original_content_length = int(original_content_length) except ValueError: original_content_length = None + if self.public_url: + download_url = f"{self.public_url}/v1/{quote(key)}" + else: + download_url = blob.public_url metadata = ObjectMetadata( - download_url=blob.public_url, + download_url=download_url, content_type=blob.content_type, content_length=blob.size, content_encoding=blob.content_encoding, diff --git a/tecken/libstorage.py b/tecken/libstorage.py index f25e49562..a36cfc56f 100644 --- a/tecken/libstorage.py +++ b/tecken/libstorage.py @@ -6,7 +6,7 @@ import datetime from io import BufferedReader from typing import Optional -from urllib.parse import urlparse +from urllib import parse @dataclass @@ -60,10 +60,14 @@ def new(cls, url: str, try_symbols: bool = False) -> "StorageBackend": # _BACKEND_CLASSES is fully populated before proceeding. The redundant alias pacifies ruff. from tecken import ext as ext - hostname = urlparse(url).hostname.lower().removesuffix(".") + parsed_url = parse.urlparse(url) + hostname = parsed_url.hostname.lower().removesuffix(".") + query_params = parse.parse_qs(parsed_url.query, max_num_fields=1) + kwargs = {k: v for k, [v] in query_params.items()} + url = parse.urlunparse(parsed_url._replace(query="")) for backend_class in _BACKEND_CLASSES: if hostname.endswith(backend_class.accepted_hostnames): - return backend_class(url, try_symbols) + return backend_class(url=url, try_symbols=try_symbols, **kwargs) raise NoMatchingBackend(url) def __repr__(self): diff --git a/tecken/tests/conftest.py b/tecken/tests/conftest.py index 373a1dd18..19d31923c 100644 --- a/tecken/tests/conftest.py +++ b/tecken/tests/conftest.py @@ -150,21 +150,24 @@ def get_test_storage_url(bucket_name): """Return a function to generate unique test storage URLs for the current test.""" def _get_test_storage_url( - kind: Literal["gcs", "s3"], try_symbols: bool = False + kind: Literal["gcs", "gcs-cdn", "s3"], try_symbols: bool = False ) -> str: + path = bucket_name + if try_symbols: + path += "/try" match kind: case "gcs": - url = f"http://gcs-emulator:8001/{bucket_name}" + url = f"http://gcs-emulator:8001/{path}" + case "gcs-cdn": + url = f"http://gcs-emulator:8001/{path}?public_url=http://gcs-cdn:8002/{path}" case "s3": - url = f"http://localstack:4566/{bucket_name}" - if try_symbols: - url += "/try" + url = f"http://localstack:4566/{path}" return url return _get_test_storage_url -@pytest.fixture(params=["gcs", "s3"]) +@pytest.fixture(params=["gcs", "gcs-cdn", "s3"]) def symbol_storage_no_create(request, settings, get_test_storage_url): """Replace the global SymbolStorage instance with a new instance. diff --git a/tecken/tests/test_download.py b/tecken/tests/test_download.py index c0dde6d70..7a061a2d7 100644 --- a/tecken/tests/test_download.py +++ b/tecken/tests/test_download.py @@ -27,10 +27,7 @@ def test_client_happy_path(client, db, symbol_storage, metricsmock): response = client.get(url) assert response.status_code == 302 - assert ( - response.headers["location"] - == f"{upload.backend.url}/v1/xul.pdb/44E4EC8C2F41492B9369D6B9A059577C2/xul.sym" - ) + assert response.headers["location"] == upload.download_url metricsmock.assert_histogram_once( "tecken.symboldownloader.file_age_days", value=0, @@ -65,6 +62,7 @@ def test_client_try_download(client, db, symbol_storage, metricsmock): ) response = client.get(try_url) assert response.status_code == 302 + assert response.headers["location"] == upload.download_url # Also note that the headers are the same as for regular downloads assert response["Access-Control-Allow-Origin"] == "*" metricsmock.assert_histogram_once( diff --git a/tecken/tests/test_storage_backends.py b/tecken/tests/test_storage_backends.py index e0927e79d..4d2030722 100644 --- a/tecken/tests/test_storage_backends.py +++ b/tecken/tests/test_storage_backends.py @@ -13,10 +13,13 @@ from tecken.tests.utils import Upload, UPLOADS +@pytest.mark.parametrize("try_storage", [False, True]) @pytest.mark.parametrize("upload", UPLOADS.values(), ids=UPLOADS.keys()) -@pytest.mark.parametrize("storage_kind", ["gcs", "s3"]) -def test_upload_and_download(get_test_storage_url, storage_kind: str, upload: Upload): - url = get_test_storage_url(storage_kind) +@pytest.mark.parametrize("storage_kind", ["gcs", "gcs-cdn", "s3"]) +def test_upload_and_download( + get_test_storage_url, storage_kind: str, upload: Upload, try_storage: bool +): + url = get_test_storage_url(storage_kind, try_storage) backend = StorageBackend.new(url) backend.clear() assert backend.exists() @@ -37,7 +40,7 @@ def test_upload_and_download(get_test_storage_url, storage_kind: str, upload: Up assert metadata.original_md5_sum == upload.metadata.original_md5_sum -@pytest.mark.parametrize("storage_kind", ["gcs", "s3"]) +@pytest.mark.parametrize("storage_kind", ["gcs", "gcs-cdn", "s3"]) def test_non_exsiting_bucket(get_test_storage_url, storage_kind: str): url = get_test_storage_url(storage_kind) backend = StorageBackend.new(url) @@ -72,7 +75,7 @@ def test_storage_backend_new(url, kind, name, prefix): assert isinstance(backend, S3Storage) -@pytest.mark.parametrize("storage_kind", ["gcs", "s3"]) +@pytest.mark.parametrize("storage_kind", ["gcs", "gcs-cdn", "s3"]) def test_storageerror_msg(get_test_storage_url, storage_kind: str): url = get_test_storage_url(storage_kind) backend = StorageBackend.new(url) diff --git a/tecken/tests/test_symbolstorage.py b/tecken/tests/test_symbolstorage.py index e478eb523..7f774c3d5 100644 --- a/tecken/tests/test_symbolstorage.py +++ b/tecken/tests/test_symbolstorage.py @@ -38,9 +38,7 @@ def test_get_metadata(symbol_storage, lower_case_debug_id): if lower_case_debug_id: debug_id = debug_id.lower() metadata = symbol_storage.get_metadata("xul.pdb", debug_id, "xul.sym") - assert metadata.download_url == ( - f"{upload.backend.url}/v1/xul.pdb/44E4EC8C2F41492B9369D6B9A059577C2/xul.sym" - ) + assert metadata.download_url == upload.download_url assert not symbol_storage.get_metadata( "xxx.pdb", "44E4EC8C2F41492B9369D6B9A059577C2", "xxx.sym" ) diff --git a/tecken/tests/utils.py b/tecken/tests/utils.py index e73e9ab9d..fb18188dd 100644 --- a/tecken/tests/utils.py +++ b/tecken/tests/utils.py @@ -7,6 +7,7 @@ from hashlib import md5 from io import BytesIO from typing import Optional +from urllib.parse import quote from tecken.base.symbolstorage import SymbolStorage from tecken.libstorage import ObjectMetadata, StorageBackend @@ -23,9 +24,18 @@ class Upload: backend: Optional[StorageBackend] = None @property - def key(self): + def key(self) -> str: return SymbolStorage.make_key(self.debug_file, self.debug_id, self.sym_file) + @property + def download_url(self) -> Optional[str]: + if not self.backend: + return None + base_url = getattr(self.backend, "public_url", None) + if not base_url: + base_url = self.backend.url + return f"{base_url}/v1/{quote(self.key)}" + @classmethod def uncompressed( cls, debug_file: str, debug_id: str, sym_file: str, body: bytes