From d4f144ea85446e808d69336fda871a26ad2d4510 Mon Sep 17 00:00:00 2001
From: David Michaels
Date: Wed, 1 May 2024 16:13:02 -0400
Subject: [PATCH] Added compute_file_md5 and compute_file_etag to file_utils.

---
 CHANGELOG.rst           |  1 +
 dcicutils/file_utils.py | 72 +++++++++++++++++++++++++++++++++++++++++
 pyproject.toml          |  2 +-
 3 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index fede69185..ac289040d 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -20,6 +20,7 @@
 - Added get_app_specific_directory, get_os_name, get_cpu_architecture_name, short_uuid to misc_utils.
 - Added normalize_file_path, are_files_equal, and create_random_file to file_utils.
 - Added create_temporary_file_name and remove_temporary_file to tmpfile_utils.
+- Added compute_file_md5 and compute_file_etag to file_utils.
 - Minor fix to misc_utils.create_dict (a property is omitted only if its value is None).

diff --git a/dcicutils/file_utils.py b/dcicutils/file_utils.py
index f6dbbc918..22adc32e3 100644
--- a/dcicutils/file_utils.py
+++ b/dcicutils/file_utils.py
@@ -1,4 +1,6 @@
 import glob
+import hashlib
+import io
 import os
 import pathlib
 from datetime import datetime
@@ -103,6 +105,76 @@ def are_files_equal(filea: str, fileb: str) -> bool:
         return False
 
 
+def compute_file_md5(file: str) -> str:
+    """
+    Returns the md5 checksum for the given file.
+    """
+    if not isinstance(file, str):
+        return ""
+    try:
+        md5 = hashlib.md5()
+        with open(file, "rb") as f:
+            for chunk in iter(lambda: f.read(4096), b""):
+                md5.update(chunk)
+        return md5.hexdigest()
+    except Exception:
+        return ""
+
+
+def compute_file_etag(file: str) -> Optional[str]:
+    """
+    Returns the AWS S3 "etag" for the given file; this value is md5-like but
+    not the same as a normal md5. We use this to check whether a file in S3
+    appears to be exactly the same file as a local file.
+    """
+    try:
+        with io.open(file, "rb") as f:
+            return _compute_file_etag(f)
+    except Exception:
+        return None
+
+
+def _compute_file_etag(f: io.BufferedReader) -> str:
+    # See: https://stackoverflow.com/questions/75723647/calculate-md5-from-aws-s3-etag
+    MULTIPART_THRESHOLD = 8388608
+    MULTIPART_CHUNKSIZE = 8388608
+    # BUFFER_SIZE = 1048576
+    # Verify some assumptions are correct:
+    # assert(MULTIPART_CHUNKSIZE >= MULTIPART_THRESHOLD)
+    # assert((MULTIPART_THRESHOLD % BUFFER_SIZE) == 0)
+    # assert((MULTIPART_CHUNKSIZE % BUFFER_SIZE) == 0)
+    hash = hashlib.md5()
+    read = 0
+    chunks = None
+    while True:
+        # Read a block; if we're at the end, stop reading.
+        bits = f.read(1048576)
+        if len(bits) == 0:
+            break
+        read += len(bits)
+        hash.update(bits)
+        if chunks is None:
+            # Once we have read past the multipart threshold, switch to
+            # calculating hashes of each chunk.
+            if read >= MULTIPART_THRESHOLD:
+                chunks = b''
+        if chunks is not None:
+            if (read % MULTIPART_CHUNKSIZE) == 0:
+                # Done with a chunk; add its hash to the list of hashes to hash later.
+                chunks += hash.digest()
+                hash = hashlib.md5()
+    if chunks is None:
+        # Normal (single-part) upload; just output the MD5 hash.
+        etag = hash.hexdigest()
+    else:
+        # Multipart upload; need to output the hash of the per-chunk hashes.
+        if (read % MULTIPART_CHUNKSIZE) != 0:
+            # Add the last part if we have a partial chunk.
+            chunks += hash.digest()
+        etag = hashlib.md5(chunks).hexdigest() + "-" + str(len(chunks) // 16)
+    return etag
+
+
 def create_random_file(file: Optional[str] = None, prefix: Optional[str] = None, suffix: Optional[str] = None,
                        nbytes: int = 1024, binary: bool = False, line_length: Optional[int] = None) -> str:
     """
diff --git a/pyproject.toml b/pyproject.toml
index 55921f4cb..093017ff3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dcicutils"
-version = "8.8.4.1b15" # TODO: To become 8.8.5
+version = "8.8.4.1b16" # TODO: To become 8.8.5
 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
 authors = ["4DN-DCIC Team "]
 license = "MIT"
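
Notes: a minimal usage sketch of the two new helpers (the file path and the
behavior noted in the comments follow the patch above; the path itself is
hypothetical, for illustration only):

    from dcicutils.file_utils import compute_file_md5, compute_file_etag

    # Plain md5 of the file contents; returns "" on any error.
    md5 = compute_file_md5("/tmp/example.bin")

    # S3-style etag; for files at or above the 8 MB multipart threshold this is
    # an md5 of the per-part md5s plus a "-<number-of-parts>" suffix, not a
    # plain md5; returns None on any error.
    etag = compute_file_etag("/tmp/example.bin")
    print(md5, etag)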
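And a sketch of the intended comparison against S3, assuming boto3 and a
hypothetical bucket and key; note that S3 returns the ETag wrapped in double
quotes, and that a multipart ETag only matches if the uploader used the same
8 MB part size that _compute_file_etag assumes:

    import boto3
    from dcicutils.file_utils import compute_file_etag

    s3 = boto3.client("s3")
    # head_object returns the object's metadata, including its quoted ETag.
    remote_etag = s3.head_object(Bucket="my-bucket", Key="my-key")["ETag"].strip('"')
    if compute_file_etag("/tmp/example.bin") == remote_etag:
        print("Local file appears to match the S3 object.")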