From d4f144ea85446e808d69336fda871a26ad2d4510 Mon Sep 17 00:00:00 2001
From: David Michaels
Date: Wed, 1 May 2024 16:13:02 -0400
Subject: [PATCH] Added compute_file_md5 and compute_file_etag to file_utils.

---
 CHANGELOG.rst           |  1 +
 dcicutils/file_utils.py | 72 +++++++++++++++++++++++++++++++++++++++++
 pyproject.toml          |  2 +-
 3 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index fede69185..ac289040d 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -20,6 +20,7 @@
 - Added get_app_specific_directory, get_os_name, get_cpu_architecture_name, short_uuid to misc_utils.
 - Added normalize_file_path, are_files_equal, and create_random_file to file_utils.
 - Added create_temporary_file_name and remove_temporary_file to tmpfile_utils.
+- Added compute_file_md5 and compute_file_etag to file_utils.
 - Minor fix to misc_utils.create_dict (a property is omitted only if its value is None).

diff --git a/dcicutils/file_utils.py b/dcicutils/file_utils.py
index f6dbbc918..22adc32e3 100644
--- a/dcicutils/file_utils.py
+++ b/dcicutils/file_utils.py
@@ -1,4 +1,6 @@
 import glob
+import hashlib
+import io
 import os
 import pathlib
 from datetime import datetime
@@ -103,6 +105,76 @@ def are_files_equal(filea: str, fileb: str) -> bool:
         return False
 
 
+def compute_file_md5(file: str) -> str:
+    """
+    Returns the md5 checksum for the given file.
+    """
+    if not isinstance(file, str):
+        return ""
+    try:
+        md5 = hashlib.md5()
+        with open(file, "rb") as f:
+            for chunk in iter(lambda: f.read(4096), b""):
+                md5.update(chunk)
+        return md5.hexdigest()
+    except Exception:
+        return ""
+
+
+def compute_file_etag(file: str) -> Optional[str]:
+    """
+    Returns the AWS S3 "etag" for the given file; this value is md5-like but
+    not the same as a normal md5. We use this to check whether a file in S3
+    appears to be exactly the same file as a local file.
+    """
+    try:
+        with io.open(file, "rb") as f:
+            return _compute_file_etag(f)
+    except Exception:
+        return None
+
+
+def _compute_file_etag(f: io.BufferedReader) -> str:
+    # See: https://stackoverflow.com/questions/75723647/calculate-md5-from-aws-s3-etag
+    MULTIPART_THRESHOLD = 8388608
+    MULTIPART_CHUNKSIZE = 8388608
+    # BUFFER_SIZE = 1048576
+    # Verify some assumptions are correct:
+    # assert(MULTIPART_CHUNKSIZE >= MULTIPART_THRESHOLD)
+    # assert((MULTIPART_THRESHOLD % BUFFER_SIZE) == 0)
+    # assert((MULTIPART_CHUNKSIZE % BUFFER_SIZE) == 0)
+    hash = hashlib.md5()
+    read = 0
+    chunks = None
+    while True:
+        # Read a block; if we're at the end, stop reading.
+        bits = f.read(1048576)
+        if len(bits) == 0:
+            break
+        read += len(bits)
+        hash.update(bits)
+        if chunks is None:
+            # Once we have read past the multipart threshold, switch to
+            # calculating hashes of each chunk.
+            if read >= MULTIPART_THRESHOLD:
+                chunks = b''
+        if chunks is not None:
+            if (read % MULTIPART_CHUNKSIZE) == 0:
+                # Done with a chunk; add its hash to the list of hashes to hash later.
+                chunks += hash.digest()
+                hash = hashlib.md5()
+    if chunks is None:
+        # Normal (single-part) upload; just output the MD5 hash.
+        etag = hash.hexdigest()
+    else:
+        # Multipart upload; need to output the hash of the per-chunk hashes.
+        if (read % MULTIPART_CHUNKSIZE) != 0:
+            # Add the last part if we have a partial chunk.
+            chunks += hash.digest()
+        etag = hashlib.md5(chunks).hexdigest() + "-" + str(len(chunks) // 16)
+    return etag
+
+
 def create_random_file(file: Optional[str] = None, prefix: Optional[str] = None, suffix: Optional[str] = None,
                        nbytes: int = 1024, binary: bool = False, line_length: Optional[int] = None) -> str:
     """
diff --git a/pyproject.toml b/pyproject.toml
index 55921f4cb..093017ff3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dcicutils"
-version = "8.8.4.1b15" # TODO: To become 8.8.5
+version = "8.8.4.1b16" # TODO: To become 8.8.5
 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
 authors = ["4DN-DCIC Team "]
 license = "MIT"
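
Notes: a minimal usage sketch of the two new helpers (the file path and the
behavior noted in the comments follow the patch above; the path itself is
hypothetical, for illustration only):

    from dcicutils.file_utils import compute_file_md5, compute_file_etag

    # Plain md5 of the file contents; returns "" on any error.
    md5 = compute_file_md5("/tmp/example.bin")

    # S3-style etag; for files at or above the 8 MB multipart threshold this is
    # an md5 of the per-part md5s plus a "-<number-of-parts>" suffix, not a
    # plain md5; returns None on any error.
    etag = compute_file_etag("/tmp/example.bin")
    print(md5, etag)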
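And a sketch of the intended comparison against S3, assuming boto3 and a
hypothetical bucket and key; note that S3 returns the ETag wrapped in double
quotes, and that a multipart ETag only matches if the uploader used the same
8 MB part size that _compute_file_etag assumes:

    import boto3
    from dcicutils.file_utils import compute_file_etag

    s3 = boto3.client("s3")
    # head_object returns the object's metadata, including its quoted ETag.
    remote_etag = s3.head_object(Bucket="my-bucket", Key="my-key")["ETag"].strip('"')
    if compute_file_etag("/tmp/example.bin") == remote_etag:
        print("Local file appears to match the S3 object.")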