From 2bc530e9354b1d8026a83870f0be13b939391ab4 Mon Sep 17 00:00:00 2001
From: Etienne Jodry
Date: Tue, 5 Nov 2024 12:58:01 +0100
Subject: [PATCH] Small improvements: stricter file upload validation,
 credentials cast as secrets, ...

---
 docs/developer_manual/s3conf.rst               | 18 ++++-
 .../components/controllers/s3controller.py     |  5 +-
 src/biodm/components/services/dbservice.py     |  4 +-
 src/biodm/components/services/s3service.py     | 81 ++++++++++++++-----
 src/biodm/config.py                            | 47 +++++------
 src/biodm/error.py                             |  3 +-
 src/biodm/exceptions.py                        |  4 +
 src/biodm/managers/k8smanager.py               |  2 +-
 src/biodm/managers/kcmanager.py                |  9 ++-
 src/biodm/managers/s3manager.py                |  9 ++-
 src/biodm/utils/utils.py                       | 19 ++++-
 src/example/entities/tables/file.py            |  9 ++-
 src/tests/integration/s3/test_files.py         | 29 +++++--
 13 files changed, 174 insertions(+), 65 deletions(-)

diff --git a/docs/developer_manual/s3conf.rst b/docs/developer_manual/s3conf.rst
index b73786e..86cf824 100644
--- a/docs/developer_manual/s3conf.rst
+++ b/docs/developer_manual/s3conf.rst
@@ -38,6 +38,20 @@ can be done by overloading ``key_salt`` with a ``hybrid_property`` in the follow
         session = self.__dict__.pop('session')
         # Use session to fetch what you need.
         await session.refresh(self, ['dataset'])
-        await session.refresh(self.dataset, ['project'])
         # Build your custom prefix.
-        return f"{self.dataset.project.name}_{self.dataset.name}"
+        return f"{self.dataset.name}"
+
+    __table_args__ = (
+        UniqueConstraint(
+            "filename",
+            "extension",
+            "dataset_id",
+            name="uc_file_in_dataset"
+        ),
+    )
+
+.. note::
+
+    The default ``uuid4`` prefix guarantees key uniqueness for all practical
+    purposes. If you replace it with a prefix of your own, as above, you
+    should also add a unique constraint on the table to enforce uniqueness.
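For readers coming from this doc change: a full ``key_salt`` override in the spirit of the snippet above might look like the following — a minimal sketch, assuming a ``dataset`` relationship on the file table (the session hand-off mirrors what ``S3Service.gen_key`` does in this patch):

    from sqlalchemy.ext.hybrid import hybrid_property

    class File(S3File, Base):
        # ... columns, plus __table_args__ with the unique constraint above ...

        @hybrid_property
        async def key_salt(self) -> str:
            # S3Service injects the current session before computing the key.
            session = self.__dict__.pop('session')
            # Fetch the related dataset and prefix keys with its name.
            await session.refresh(self, ['dataset'])
            return f"{self.dataset.name}"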
diff --git a/src/biodm/components/controllers/s3controller.py b/src/biodm/components/controllers/s3controller.py
index 2069b6d..e627bd0 100644
--- a/src/biodm/components/controllers/s3controller.py
+++ b/src/biodm/components/controllers/s3controller.py
@@ -54,7 +54,7 @@ def routes(self, **_) -> List[Mount | Route] | List[Mount] | List[BaseRoute]:
             Route(f'{prefix}complete_multipart', self.complete_multipart, methods=[HttpMethod.PUT]),
             PublicRoute(f'{prefix}post_success', self.post_success, methods=[HttpMethod.GET]),
         ]
-        self.post_upload_callback = Path(file_routes[-1].path)
+        self.svc.post_upload_callback = Path(file_routes[-1].path)
 
         return file_routes + super().routes()
 
@@ -94,6 +94,9 @@ async def post_success(self, request: Request):
         """
         await self.svc.post_success(
             pk_val=self._extract_pk_val(request),
+            bucket=request.query_params.get('bucket', ''),
+            key=request.query_params.get('key', ''),
+            etag=request.query_params.get('etag', '').strip('"'),
         )
         return json_response("Uploaded.", status_code=201)

diff --git a/src/biodm/components/services/dbservice.py b/src/biodm/components/services/dbservice.py
index c7f432d..1f2336c 100644
--- a/src/biodm/components/services/dbservice.py
+++ b/src/biodm/components/services/dbservice.py
@@ -17,7 +17,7 @@
 from biodm.component import ApiService
 from biodm.components import Base
 from biodm.exceptions import (
-    DataError, EndpointError, FailedRead, FailedDelete, ReleaseVersionError, UpdateVersionedError, UnauthorizedError
+    DataError, EndpointError, FailedCreate, FailedRead, FailedDelete, ReleaseVersionError, UpdateVersionedError, UnauthorizedError
 )
 from biodm.managers import DatabaseManager
 from biodm.tables import ListGroup, Group
@@ -59,7 +59,7 @@ async def _insert(
                 raise UpdateVersionedError(
                     "Attempt at updating versioned resources."
                 )
-            raise
+            raise FailedCreate(str(ie))
 
     @DatabaseManager.in_session
     async def _insert_list(

diff --git a/src/biodm/components/services/s3service.py b/src/biodm/components/services/s3service.py
index 2f8282a..b192c23 100644
--- a/src/biodm/components/services/s3service.py
+++ b/src/biodm/components/services/s3service.py
@@ -1,15 +1,16 @@
 from asyncio import iscoroutine
 from math import ceil
+from pathlib import Path
 from typing import List, Any, Sequence, Dict
 
 from sqlalchemy import Insert
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from biodm.components.table import Base, S3File
-from biodm.exceptions import FileNotUploadedError, FileTooLargeError
+from biodm.exceptions import FileNotUploadedError, FileTooLargeError, FileUploadSuccessError
 from biodm.managers import DatabaseManager, S3Manager
 from biodm.tables import Upload, UploadPart
-from biodm.utils.utils import utcnow, classproperty
+from biodm.utils.utils import utcnow, classproperty, check_hash
 from biodm.utils.security import UserInfo
 from .dbservice import CompositeEntityService
 
@@ -19,6 +20,8 @@
 class S3Service(CompositeEntityService):
     """Class for automatic management of S3 bucket transactions for file resources."""
 
+    post_upload_callback: Path  # set by S3Controller.
+
     @classproperty
     def s3(cls) -> S3Manager:
         return cls.app.s3
@@ -29,21 +32,34 @@ def post_callback(self, item) -> str:
             for key in self.table.pk
         }
 
-        route = str(self.table.ctrl.post_upload_callback) # TODO: svc argument ?
+        route = str(self.post_upload_callback)
         for key, val in mapping.items():
             route = route.replace("{" + f"{key}" + "}", str(val))
 
         srv = self.app.server_endpoint.strip('/')
         return f"{srv}{route}"
 
-    async def gen_key(self, item, session: AsyncSession):
-        """Generate a unique bucket key from file elements."""
-        await session.refresh(item, ['filename', 'extension'])
-        version = ""
-        if self.table.is_versioned:
-            await session.refresh(item, ['version'])
-            version = "_v" + str(item.version)
+    @property
+    def key_fields(self) -> List[str]:
+        """List of fields used to compute the key."""
+        return ['filename', 'extension'] + (['version'] if self.table.is_versioned else [])
+
+    async def gen_key(self, item: S3File, session: AsyncSession, refresh=True) -> str:
+        """Generate a unique bucket key from file elements.
+
+        :param item: file
+        :type item: S3File
+        :param session: current sqla session
+        :type session: AsyncSession
+        :param refresh: whether to fetch key fields first - skipping saves a db request, defaults to True
+        :type refresh: bool, optional
+        :return: unique s3 bucket file key
+        :rtype: str
+        """
+        if refresh:
+            await session.refresh(item, self.key_fields)
+
+        version = "_v" + str(item.version) if self.table.is_versioned else ""
         key_salt = await getattr(item.awaitable_attrs, 'key_salt')
         if iscoroutine(key_salt):
             item.__dict__['session'] = session
@@ -69,7 +85,7 @@ async def gen_upload_form(self, file: S3File, session: AsyncSession):
         await session.flush()
         parts = await getattr(file.upload.awaitable_attrs, 'parts')
 
-        key = await self.gen_key(file, session=session)
+        key = await self.gen_key(file, session=session, refresh=False)
         n_chunks = ceil(file.size / CHUNK_SIZE)
 
         if n_chunks > 1:
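A quick worked example of the chunking arithmetic used by ``gen_upload_form`` — a sketch, the 100 MiB value for CHUNK_SIZE being an assumption for illustration:

    from math import ceil

    CHUNK_SIZE = 100 * 1024**2          # assumed value, for illustration only
    size = 250 * 1024**2                # a 250 MiB file
    n_chunks = ceil(size / CHUNK_SIZE)  # -> 3: two full parts plus a 50 MiB tail

Files that fit in a single chunk get one presigned form; anything larger takes the multipart branch handled by ``complete_multipart`` below.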
@@ -127,8 +143,31 @@ async def _insert_list(
         return files
 
     @DatabaseManager.in_session
-    async def post_success(self, pk_val: List[Any], session: AsyncSession):
-        file = await self.read(pk_val, fields=['ready', 'upload'], session=session)
+    async def post_success(
+        self,
+        pk_val: List[Any],
+        bucket: str,
+        key: str,
+        etag: str,
+        session: AsyncSession
+    ):
+        file = await self.read(
+            pk_val,
+            fields=['ready', 'upload'] + self.key_fields,
+            session=session
+        )
+
+        # Weak check. TODO: [prio not urgent] can be improved?
+        indb_key = await self.gen_key(file, session=session, refresh=False)
+        if (
+            not check_hash(etag) or
+            bucket != self.s3.bucket_name or
+            indb_key != key
+        ):
+            raise FileUploadSuccessError(
+                "Critical: Manual attempt at validating file detected!"
+            )
+
         file.validated_at = utcnow()
         file.ready = True
         file.upload_id, file.upload = None, None
@@ -141,10 +180,14 @@ async def complete_multipart(
         session: AsyncSession
     ):
         # parts should take the form of [{'PartNumber': part_number, 'ETag': etag}, ...]
-        file = await self.read(pk_val, fields=['ready', 'upload'], session=session)
+        file = await self.read(
+            pk_val,
+            fields=['ready', 'upload'] + self.key_fields,
+            session=session
+        )
         upload = await getattr(file.awaitable_attrs, 'upload')
         complete = self.s3.complete_multipart_upload(
-            object_name=await self.gen_key(file, session=session),
+            object_name=await self.gen_key(file, session=session, refresh=False),
             upload_id=upload.s3_uploadId,
             parts=parts
         )
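For context, a client finishing a multipart upload reports the collected part ETags back through the ``complete_multipart`` route declared in S3Controller. A hedged sketch — the endpoint shape comes from the route above and the list layout from the comment, while the host, resource id and exact payload framing are assumptions:

    import requests

    # One entry per uploaded part, with the ETag S3 returned for it.
    parts = [
        {"PartNumber": 1, "ETag": "9bb58f26192e4ba00f01e2e7b136bbd8"},
        {"PartNumber": 2, "ETag": "5d41402abc4b2a76b9719d911017c592"},
    ]

    # The server then finalizes the upload against the bucket.
    requests.put("http://localhost:8000/files/1/complete_multipart", json=parts)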
@@ -176,7 +219,7 @@ async def download(
         :rtype: str
         """
         # File management.
-        fields = ['filename', 'extension', 'dl_count', 'ready']
+        fields = ['dl_count', 'ready'] + self.key_fields
         # Also fetch foreign keys, as some may be necessary for permission check.
         fields += list(c.name for c in self.table.__table__.columns if c.foreign_keys)
         file = await self.read(pk_val, fields=fields, session=session)
@@ -188,7 +231,9 @@
         if not file.ready:
             raise FileNotUploadedError("File exists but has not been uploaded yet.")
 
-        url = self.s3.create_presigned_download_url(await self.gen_key(file, session=session))
+        url = self.s3.create_presigned_download_url(
+            await self.gen_key(file, session=session, refresh=False)
+        )
         file.dl_count += 1
         return url
 
@@ -196,7 +241,6 @@
     async def release(
         self,
         pk_val: List[Any],
-        fields: List[str],
         update: Dict[str, Any],
         session: AsyncSession,
         user_info: UserInfo | None = None,
     ):
         # Bumps version.
         file = await super().release(
             pk_val=pk_val,
-            fields=fields,
             update=update,
             session=session,
             user_info=user_info
         )

diff --git a/src/biodm/config.py b/src/biodm/config.py
index c733e44..d99c04a 100644
--- a/src/biodm/config.py
+++ b/src/biodm/config.py
@@ -1,4 +1,5 @@
 from starlette.config import Config
+from starlette.datastructures import Secret
 from databases import DatabaseURL
 
 try:
@@ -27,33 +28,33 @@
 CACHE_MAX_AGE = config('CACHE_MAX_AGE', cast=int, default=600)
 
 # DB.
-DATABASE_URL = config("DATABASE_URL", cast=str, default="sqlite:///:memory:")
+DATABASE_URL = config("DATABASE_URL", cast=DatabaseURL, default="sqlite:///:memory:")
 
 # S3 Bucket.
-S3_ENDPOINT_URL = config('S3_ENDPOINT_URL', cast=str, default=None)
-S3_BUCKET_NAME = config('S3_BUCKET_NAME', cast=str, default=None)
-S3_ACCESS_KEY_ID = config('S3_ACCESS_KEY_ID', cast=str, default=None)
-S3_SECRET_ACCESS_KEY = config('S3_SECRET_ACCESS_KEY', cast=str, default=None)
-S3_URL_EXPIRATION = config('S3_URL_EXPIRATION', cast=int, default=3600)
-S3_PENDING_EXPIRATION = config('S3_PENDING_EXPIRATION', cast=int, default=3600 * 24)
-S3_REGION_NAME = config('S3_REGION_NAME', cast=str, default="us-east-1")
-S3_FILE_SIZE_LIMIT = config('S3_FILE_SIZE_LIMIT', cast=int, default=100)
+S3_ENDPOINT_URL       = config('S3_ENDPOINT_URL',       cast=str,    default=None)
+S3_BUCKET_NAME        = config('S3_BUCKET_NAME',        cast=str,    default=None)
+S3_ACCESS_KEY_ID      = config('S3_ACCESS_KEY_ID',      cast=Secret, default=None)
+S3_SECRET_ACCESS_KEY  = config('S3_SECRET_ACCESS_KEY',  cast=Secret, default=None)
+S3_URL_EXPIRATION     = config('S3_URL_EXPIRATION',     cast=int,    default=3600)
+S3_PENDING_EXPIRATION = config('S3_PENDING_EXPIRATION', cast=int,    default=3600 * 24)
+S3_REGION_NAME        = config('S3_REGION_NAME',        cast=str,    default="us-east-1")
+S3_FILE_SIZE_LIMIT    = config('S3_FILE_SIZE_LIMIT',    cast=int,    default=100)
 
 # Keycloak.
-KC_HOST = config("KC_HOST", cast=str, default=None)
-KC_REALM = config("KC_REALM", cast=str, default=None)
-KC_PUBLIC_KEY = config("KC_PUBLIC_KEY", cast=str, default=None)
-KC_ADMIN = config("KC_ADMIN", cast=str, default=None)
-KC_ADMIN_PASSWORD = config("KC_ADMIN_PASSWORD", cast=str, default=None)
-KC_CLIENT_ID = config("KC_CLIENT_ID", cast=str, default=None)
-KC_CLIENT_SECRET = config("KC_CLIENT_SECRET", cast=str, default=None)
-KC_JWT_OPTIONS = config("KC_JWT_OPTIONS", cast=dict, default={'verify_exp': False,
-                                                              'verify_aud': False})
+KC_HOST           = config("KC_HOST",           cast=str,    default=None)
+KC_REALM          = config("KC_REALM",          cast=str,    default=None)
+KC_PUBLIC_KEY     = config("KC_PUBLIC_KEY",     cast=str,    default=None)
+KC_ADMIN          = config("KC_ADMIN",          cast=str,    default=None)
+KC_ADMIN_PASSWORD = config("KC_ADMIN_PASSWORD", cast=Secret, default=None)
+KC_CLIENT_ID      = config("KC_CLIENT_ID",      cast=str,    default=None)
+KC_CLIENT_SECRET  = config("KC_CLIENT_SECRET",  cast=Secret, default=None)
+KC_JWT_OPTIONS    = config("KC_JWT_OPTIONS",    cast=dict,   default={'verify_exp': False,
+                                                                      'verify_aud': False})
 
 # Kubernetes.
-K8_IP = config("K8_IP", cast=str, default=None)
-K8_PORT = config("K8_PORT", cast=str, default="8443")
-K8_HOST = config("K8_HOST", cast=str, default=None)
-K8_CERT = config("K8_CERT", cast=str, default=None)
-K8_TOKEN = config("K8_TOKEN", cast=str, default=None)
-K8_NAMESPACE = config("K8_NAMESPACE", cast=str, default="default")
+K8_IP        = config("K8_IP",        cast=str,    default=None)
+K8_PORT      = config("K8_PORT",      cast=str,    default="8443")
+K8_HOST      = config("K8_HOST",      cast=str,    default=None)
+K8_CERT      = config("K8_CERT",      cast=str,    default=None)
+K8_TOKEN     = config("K8_TOKEN",     cast=Secret, default=None)
+K8_NAMESPACE = config("K8_NAMESPACE", cast=str,    default="default")
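The point of the ``Secret`` casts above: starlette's ``Secret`` wrapper keeps credential values out of ``repr()`` output (tracebacks, logged config dumps), and an explicit ``str()`` unwraps them at the single point of use — which is exactly what the manager changes below do:

    from starlette.datastructures import Secret

    password = Secret("hunter2")
    repr(password)  # "Secret('**********')" - safe to log
    str(password)   # "hunter2" - deliberate unwrap where the raw value is required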
diff --git a/src/biodm/error.py b/src/biodm/error.py
index 383e3bb..df75477 100644
--- a/src/biodm/error.py
+++ b/src/biodm/error.py
@@ -5,6 +5,7 @@
 from .exceptions import (
     EndpointError,
     FailedUpdate,
+    FileUploadSuccessError,
     PayloadJSONDecodingError,
     RequestError,
     FailedDelete,
@@ -50,7 +51,7 @@ async def onerror(_, exc):
     )
 
     match exc:
-        case FileTooLargeError():
+        case FileTooLargeError() | FileUploadSuccessError():
             status = 400
         case DataError() | EndpointError() | PayloadJSONDecodingError():
             status = 400

diff --git a/src/biodm/exceptions.py b/src/biodm/exceptions.py
index cc41c9d..3d3a0ec 100644
--- a/src/biodm/exceptions.py
+++ b/src/biodm/exceptions.py
@@ -65,6 +65,10 @@ class FileTooLargeError(RequestError):
     """Raised when trying to create a too large file."""
 
 
+class FileUploadSuccessError(RequestError):
+    """Raised when the file upload success route is visited with incorrect information."""
+
+
 class DataError(RequestError):
     """Raised when input data is incorrect."""

diff --git a/src/biodm/managers/k8smanager.py b/src/biodm/managers/k8smanager.py
index 6f2bb79..7aefa34 100644
--- a/src/biodm/managers/k8smanager.py
+++ b/src/biodm/managers/k8smanager.py
@@ -44,7 +44,7 @@ def __init__(
     def authenticate(self, token, host, cert):
         """Set configuration with the credentials and certificate."""
-        self._config.api_key["authorization"] = token
+        self._config.api_key["authorization"] = str(token)
         self._config.api_key_prefix['authorization'] = 'Bearer'
         self._config.host = host
         self._config.ssl_ca_cert = cert

diff --git a/src/biodm/managers/kcmanager.py b/src/biodm/managers/kcmanager.py
index d2ca4a0..f29f702 100644
--- a/src/biodm/managers/kcmanager.py
+++ b/src/biodm/managers/kcmanager.py
@@ -5,6 +5,7 @@
 from keycloak.openid_connection import KeycloakOpenIDConnection
 from keycloak.keycloak_openid import KeycloakOpenID
 from keycloak.exceptions import KeycloakError, KeycloakDeleteError, KeycloakGetError
+from starlette.datastructures import Secret
 
 from biodm.component import ApiManager
 from biodm.exceptions import (
@@ -26,9 +27,9 @@ def __init__(
         realm: str,
         public_key: str,
         admin: str,
-        admin_password: str,
+        admin_password: Secret,
         client_id: str,
-        client_secret: str,
+        client_secret: Secret,
         jwt_options: dict
     ) -> None:
         super().__init__(app=app)
@@ -44,14 +45,14 @@
                 user_realm_name="master",
                 realm_name=realm,
                 username=admin,
-                password=admin_password,
+                password=str(admin_password),
                 verify=True,
             )
             self._openid = KeycloakOpenID(
                 server_url=host,
                 realm_name=realm,
                 client_id=client_id,
-                client_secret_key=client_secret,
+                client_secret_key=str(client_secret),
             )
         except KeycloakError as e:
             raise KeycloakUnavailableError(
diff --git a/src/biodm/managers/s3manager.py b/src/biodm/managers/s3manager.py
index f940705..71ae0e4 100644
--- a/src/biodm/managers/s3manager.py
+++ b/src/biodm/managers/s3manager.py
@@ -4,6 +4,7 @@
 from boto3 import client
 from botocore.config import Config
 from botocore.exceptions import ClientError
+from starlette.datastructures import Secret
 
 from biodm.component import ApiManager
 from biodm.utils.utils import utcnow
@@ -19,8 +20,8 @@ def __init__(
         app: Api,
         endpoint_url: str,
         bucket_name: str,
-        access_key_id: str,
-        secret_access_key: str,
+        access_key_id: Secret,
+        secret_access_key: Secret,
         url_expiration: int,
         pending_expiration: int,
         region_name: str,
@@ -36,8 +37,8 @@
         self.s3_client = client(
             's3',
             endpoint_url=endpoint_url,
-            aws_access_key_id=access_key_id,
-            aws_secret_access_key=secret_access_key,
+            aws_access_key_id=str(access_key_id),
+            aws_secret_access_key=str(secret_access_key),
             region_name=self.region_name,
             config=Config(
                 signature_version='s3'

diff --git a/src/biodm/utils/utils.py b/src/biodm/utils/utils.py
index 43bb7f7..2f3a3b7 100644
--- a/src/biodm/utils/utils.py
+++ b/src/biodm/utils/utils.py
@@ -4,6 +4,7 @@
 from functools import reduce, update_wrapper
 import operator
 from os import path, utime
+import re
 from typing import (
     Any, List, Callable, Tuple, TypeVar, Dict, Iterator, Self, Generic, MutableSet,
     Iterable, Sequence
@@ -141,8 +142,6 @@ def coalesce_dicts(ls: List[Dict[_T, _U]]) -> Dict[_T, _U]:
     return reduce(operator.or_, ls, {})
 
 
-
-
 class OrderedSet(MutableSet[_T]):
     """A set that preserves insertion order by internally using a dict.
 
@@ -171,3 +170,19 @@ def __str__(self):
 
     def __repr__(self):
         return f"<OrderedSet {list(self._d)}>"
+
+
+_hash_regexp = re.compile(r'^[0-9a-f]{32}$', re.IGNORECASE)
+
+
+def check_hash(s: str) -> bool:
+    """Check if the input string looks like an MD5 hash,
+
+    i.e. a string of exactly 32 hexadecimal characters.
+
+    :param s: string to match
+    :type s: str
+    :return: string matches flag
+    :rtype: bool
+    """
+    return bool(_hash_regexp.match(s))
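On why the docstring now says MD5: 32 hexadecimal characters is the length of an MD5 digest (SHA-1 is 40, SHA-256 is 64), and S3 sets the ETag of a single-part upload to the MD5 of the object bytes. That is what ``post_success`` receives, and it is also why its comparison is only a "weak check" — multipart ETags carry a ``-<part count>`` suffix and would not match. A small sketch:

    import hashlib

    from biodm.utils.utils import check_hash

    etag = hashlib.md5(b"file contents").hexdigest()
    assert len(etag) == 32           # MD5 digests are 32 hex characters
    assert check_hash(etag)          # what the success route accepts
    assert check_hash(etag.upper())  # re.IGNORECASE: case does not matter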
diff --git a/src/example/entities/tables/file.py b/src/example/entities/tables/file.py
index 0240726..ef86940 100644
--- a/src/example/entities/tables/file.py
+++ b/src/example/entities/tables/file.py
@@ -1,7 +1,7 @@
 from typing import TYPE_CHECKING
 import uuid
 
-from sqlalchemy import Column, Integer, ForeignKey, Boolean, String, ForeignKeyConstraint, SmallInteger
+from sqlalchemy import Column, Integer, ForeignKey, Boolean, String, ForeignKeyConstraint, SmallInteger, UniqueConstraint
 from sqlalchemy.orm import Mapped, relationship, mapped_column
 from sqlalchemy.ext.hybrid import hybrid_property
 from sqlalchemy.ext.asyncio import AsyncSession
@@ -27,6 +27,13 @@ class File(S3File, Base):
             ["DATASET.id", "DATASET.version"],
             name="fk_file_dataset",
         ),
+        UniqueConstraint(
+            "filename",
+            "extension",
+            "dataset_id",
+            "dataset_version",
+            name="uc_file_in_dataset"
+        )
     )
 
     @hybrid_property

diff --git a/src/tests/integration/s3/test_files.py b/src/tests/integration/s3/test_files.py
index 5d24431..814dbdd 100644
--- a/src/tests/integration/s3/test_files.py
+++ b/src/tests/integration/s3/test_files.py
@@ -90,10 +90,9 @@ def test_file_upload():
     assert response.status_code == 201
 
 
-@pytest.mark.dependency(name="test_create_file")
-def test_create_oversized_file(srv_endpoint, utils, tmpdir):
+def test_create_oversized_file(srv_endpoint, utils):
     small_file = {
-        "filename": small_file_path.name.split('.')[0],
+        "filename": small_file_path.name.split('.')[0] + "_os1",
         "extension": small_file_path.name.split('.')[1],
         "size": 1000*1024**3,  # 1000GB
         "dataset_id": "1",
@@ -105,11 +104,10 @@
     assert "File exceeding 100 GB" in response.text
 
 
-@pytest.mark.dependency(name="test_create_file")
 def test_create_and_upload_oversized_file(srv_endpoint, utils):
     # create, with lower size.
     small_file = {
-        "filename": small_file_path.name.split('.')[0],
+        "filename": small_file_path.name.split('.')[0] + "_os2",
         "extension": small_file_path.name.split('.')[1],
         "size": small_file_path.stat().st_size - 10,
         "dataset_id": "1",
@@ -253,3 +251,24 @@ def test_download_large_file(srv_endpoint, tmp_path):
         f.write(response.content)
 
     assert filecmp.cmp(tmp_file, big_file_path)
+
+
+def test_create_file_and_manually_visit_success_route(srv_endpoint, utils):
+    file = {
+        "filename": "test",
+        "extension": "xyz",
+        "size": CHUNK_SIZE // 10,
+        "dataset_id": "1",
+        "dataset_version": "1",
+    }
+
+    response = requests.post(f"{srv_endpoint}/files", data=utils.json_bytes(file))
+    assert response.status_code == 201
+
+    json_file = json.loads(response.text)
+    file_id = json_file['id']
+
+    # Hit success route ourselves.
+    response = requests.get(f"{srv_endpoint}/files/{file_id}/post_success")
+    assert response.status_code == 400
+    assert "Critical: Manual attempt at validating file detected!" in response.text
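Putting the pieces together, the success callback now carries everything the service cross-checks. A sketch of the request that S3's post-upload redirect produces, with hypothetical values — the real key depends on ``key_salt``/``gen_key`` and the bucket on S3_BUCKET_NAME:

    import requests

    # S3 quotes the etag in the redirect; the controller strips the quotes
    # before handing it to S3Service.post_success.
    callback = (
        "http://localhost:8000/files/1/post_success"
        "?bucket=my-bucket"
        "&key=somesalt_test_v1.xyz"  # hypothetical gen_key output
        "&etag=%229bb58f26192e4ba00f01e2e7b136bbd8%22"
    )
    response = requests.get(callback)
    # 201 "Uploaded." when bucket, key and etag all check out;
    # 400 FileUploadSuccessError otherwise - as the last test asserts.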