Skip to content

Commit

Permalink
Added compute_file_md5 and compute_file_etag to file_utils.
Browse files Browse the repository at this point in the history
  • Loading branch information
dmichaels-harvard committed May 1, 2024
1 parent 2004f66 commit d4f144e
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ Change Log
- Added get_app_specific_directory, get_os_name, get_cpu_architecture_name, short_uuid to misc_utils.
- Added normalize_file_path, are_files_equal, and create_random_file to file_utils.
- Added create_temporary_file_name and remove_temporary_file to tmpfile_utils.
- Added compute_file_md5 and compute_file_etag to file_utils.
- Minor fix to misc_utils.create_dict (do not create property only if its value is None).


Expand Down
72 changes: 72 additions & 0 deletions dcicutils/file_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import glob
import hashlib
import io
import os
import pathlib
from datetime import datetime
Expand Down Expand Up @@ -103,6 +105,76 @@ def are_files_equal(filea: str, fileb: str) -> bool:
return False


def compute_file_md5(file: str) -> str:
    """
    Returns the md5 checksum (hex digest string) for the given file.
    Returns an empty string if the argument is not a string or if the
    file cannot be opened/read (best-effort: never raises).
    """
    if not isinstance(file, str):
        return ""
    try:
        md5 = hashlib.md5()
        # Read in fixed-size chunks so arbitrarily large files do not
        # need to fit in memory. Note: do NOT shadow the 'file' parameter
        # with the file handle (the original did: 'as file').
        with open(file, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                md5.update(chunk)
        return md5.hexdigest()
    except Exception:
        # Contract is best-effort: any I/O problem yields "".
        return ""


def compute_file_etag(file: str) -> Optional[str]:
    """
    Returns the AWS S3 "etag" for the given file, or None if the file
    cannot be read. The etag is md5-like but not a plain md5; it is used
    to check whether a file in S3 appears to be exactly the same as a
    local file.
    """
    try:
        with io.open(file, "rb") as opened:
            return _compute_file_etag(opened)
    except Exception:
        # Best-effort: unreadable/missing file (or bad argument) -> None.
        return None


def _compute_file_etag(f: io.BufferedReader) -> str:
    """
    Computes the AWS S3 etag for the content of the open binary file f.

    For content below MULTIPART_THRESHOLD bytes this is simply the md5
    hex digest. For larger content it mimics S3's multipart-upload etag:
    the md5 of the concatenated (binary) md5 digests of each
    MULTIPART_CHUNKSIZE-sized part, followed by "-" and the part count.
    """
    # See: https://stackoverflow.com/questions/75723647/calculate-md5-from-aws-s3-etag
    MULTIPART_THRESHOLD = 8388608
    MULTIPART_CHUNKSIZE = 8388608
    # BUFFER_SIZE = 1048576
    # Verify some assumptions are correct
    # assert(MULTIPART_CHUNKSIZE >= MULTIPART_THRESHOLD)
    # assert((MULTIPART_THRESHOLD % BUFFER_SIZE) == 0)
    # assert((MULTIPART_CHUNKSIZE % BUFFER_SIZE) == 0)
    hash = hashlib.md5()
    read = 0
    chunks = None  # None => so far treating this as a single-part (plain md5) upload
    while True:
        # Read some from the file; if we're at the end, stop reading.
        # NOTE: the 1048576 read size must evenly divide MULTIPART_CHUNKSIZE
        # so that chunk boundaries align with read boundaries (see the
        # commented-out assertions above).
        bits = f.read(1048576)
        if len(bits) == 0:
            break
        read += len(bits)
        hash.update(bits)
        if chunks is None:
            # We're handling a multi-part upload, so switch to calculating
            # hashes of each chunk
            if read >= MULTIPART_THRESHOLD:
                chunks = b''
        if chunks is not None:
            if (read % MULTIPART_CHUNKSIZE) == 0:
                # Done with a chunk: append its (binary) digest to the
                # running concatenation of per-chunk hashes, then start a
                # fresh md5 for the next chunk.
                chunks += hash.digest()
                hash = hashlib.md5()
    if chunks is None:
        # Normal (single-part) upload: just output the plain md5 hex digest.
        etag = hash.hexdigest()
    else:
        # Multipart upload: the etag is the hash of the per-chunk hashes.
        if (read % MULTIPART_CHUNKSIZE) != 0:
            # Add the last part if we have a partial (short) final chunk.
            chunks += hash.digest()
        # Each md5 digest is 16 bytes, so len(chunks) // 16 is the part count.
        etag = hashlib.md5(chunks).hexdigest() + "-" + str(len(chunks) // 16)
    return etag


def create_random_file(file: Optional[str] = None, prefix: Optional[str] = None, suffix: Optional[str] = None,
nbytes: int = 1024, binary: bool = False, line_length: Optional[int] = None) -> str:
"""
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dcicutils"
version = "8.8.4.1b15" # TODO: To become 8.8.5
version = "8.8.4.1b16" # TODO: To become 8.8.5
description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
authors = ["4DN-DCIC Team <support@4dnucleome.org>"]
license = "MIT"
Expand Down

0 comments on commit d4f144e

Please sign in to comment.