chore: add hash to upload data
smotornyuk committed Mar 4, 2024
1 parent 96051d4 commit 4d5dbc3
Showing 3 changed files with 38 additions and 6 deletions.
@@ -42,7 +42,7 @@ def upgrade():
     op.execute(
         sa.update(table)
         .values(
-            atime=last_access, extras=dict(extras, filename=os.path.basename(path))
+            atime=last_access, extras=dict(extras or {}, filename=os.path.basename(path))
         )
         .where(table.c.id == id)
     )
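The only functional change in this migration hunk is the or {} fallback: dict() cannot take None as its first argument, so a row whose extras column is empty would crash the upgrade. A standalone sketch of the failure mode (the path is made up for illustration):

    import os

    extras = None  # a row with no stored extras

    # dict(extras, filename=...) would raise:
    #   TypeError: 'NoneType' object is not iterable
    merged = dict(extras or {}, filename=os.path.basename("/data/uploads/report.csv"))
    print(merged)  # {'filename': 'report.csv'}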
ckanext/files/storage/fs.py (22 additions & 3 deletions)
@@ -1,14 +1,14 @@
 import logging
 import os
+import magic
 import uuid

+import hashlib
 import six
 from werkzeug.datastructures import FileStorage

 import ckan.plugins.toolkit as tk

 from ckanext.files import exceptions, utils
 from ckanext.files.model import file

 from .base import Capability, Manager, Storage, Uploader

@@ -17,6 +17,7 @@


 log = logging.getLogger(__name__)
+CHUNK_SIZE = 16_384


 class FileSystemUploader(Uploader):
@@ -30,13 +31,20 @@ def upload(self, name, upload, extras):  # pragma: no cover
         filename = str(uuid.uuid4())
         filepath = os.path.join(self.storage.settings["path"], filename)

+        md5 = hashlib.md5()
         with open(filepath, "wb") as dest:
-            upload.save(dest)
+            while True:
+                chunk = upload.stream.read(CHUNK_SIZE)
+                if not chunk:
+                    break
+                md5.update(chunk)
+                dest.write(chunk)

         return {
             "filename": filename,
             "content_type": upload.content_type,
             "size": os.path.getsize(filepath),
+            "hash": md5.hexdigest(),
         }

     def initialize_multipart_upload(self, name, extras):
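Replacing upload.save(dest) with a manual read loop lets the uploader compute the MD5 digest in the same pass that writes the file, holding at most CHUNK_SIZE bytes in memory. A self-contained sketch of the pattern (file path and sample data are made up):

    import hashlib
    import io

    CHUNK_SIZE = 16_384

    def write_and_hash(stream, filepath):
        # Stream to disk while feeding the same bytes to MD5, so the
        # whole file is never held in memory at once.
        md5 = hashlib.md5()
        with open(filepath, "wb") as dest:
            while True:
                chunk = stream.read(CHUNK_SIZE)
                if not chunk:
                    break
                md5.update(chunk)
                dest.write(chunk)
        return md5.hexdigest()

    print(write_and_hash(io.BytesIO(b"hello world"), "/tmp/example.bin"))
    # 5eb63bbbe01eeed093cb22bb8f5acdc3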
@@ -110,6 +118,17 @@ def complete_multipart_upload(self, upload_data, extras):
                 }
             )

+        md5 = hashlib.md5()
+        with open(filepath, "rb") as src:
+            chunk = src.read(CHUNK_SIZE)
+            content_type = magic.from_buffer(chunk, True)
+
+            while chunk:
+                md5.update(chunk)
+                chunk = src.read(CHUNK_SIZE)
+
+        upload_data["hash"] = md5.hexdigest()
+        upload_data["content_type"] = content_type
         return upload_data
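Here the first CHUNK_SIZE bytes do double duty: they seed the MD5 digest and are passed to libmagic, which only needs the leading bytes of a file to sniff its MIME type. A minimal sketch of the sniffing call, assuming the python-magic package (sample buffers are illustrative):

    import magic

    # The second argument is python-magic's mime flag; passing True
    # positionally is equivalent to mime=True and returns a MIME type
    # instead of a human-readable description.
    print(magic.from_buffer(b"%PDF-1.7 ...", True))           # application/pdf
    print(magic.from_buffer(b"just some plain text", True))   # text/plain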


ckanext/files/storage/google_cloud.py (15 additions & 2 deletions)
@@ -1,7 +1,7 @@
 import os
 import re
 import uuid

+import base64
 import requests
 import six
 from google.cloud.storage import Client
@@ -41,7 +41,13 @@ def upload(self, name, upload, extras):  # pragma: no cover
         client = self.storage.client
         blob = client.bucket(self.storage.settings["bucket"]).blob(filepath)
         blob.upload_from_file(upload.stream)
-        return {"filename": filename, "content_type": upload.content_type}
+        filehash = base64.decodebytes(blob.md5_hash.encode()).hex()
+        return {
+            "filename": filename,
+            "content_type": upload.content_type,
+            "hash": filehash,
+            "size": blob.size,
+        }

     def initialize_multipart_upload(self, name, extras):
         # type: (str, dict[str, Any]) -> dict[str, Any]
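google-cloud-storage reports blob.md5_hash as a base64-encoded digest, whereas the filesystem backend above stores hexdigest() output, so the value is decoded to raw bytes and re-encoded as hex to keep the two backends comparable. A standalone sketch of the conversion (the payload is illustrative):

    import base64
    import hashlib

    payload = b"hello world"
    hex_digest = hashlib.md5(payload).hexdigest()

    # GCS would report the same digest base64-encoded:
    b64_digest = base64.b64encode(hashlib.md5(payload).digest()).decode()

    # Decode base64 to raw bytes, then hex-encode, to recover the
    # familiar hex form:
    assert base64.decodebytes(b64_digest.encode()).hex() == hex_digest
    print(hex_digest)  # 5eb63bbbe01eeed093cb22bb8f5acdc3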
@@ -99,6 +105,9 @@ def update_multipart_upload(self, upload_data, extras):
         if last_byte >= size:
             raise exceptions.UploadOutOfBoundError(last_byte, size)

+        if upload.content_length < 256 * 1024 and last_byte < size - 1:
+            raise tk.ValidationError({"upload": ["Cannot be smaller than 256KiB"]})
+
         resp = requests.put(
             upload_data["session_url"],
             data=upload.stream.read(),
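The new guard mirrors a constraint of Google Cloud Storage resumable uploads: non-final chunks must be sized in multiples of 256 KiB, and this check enforces the lower bound of that rule before the PUT is even attempted. The same check in isolation (the numbers are made up):

    GCS_MIN_CHUNK = 256 * 1024  # resumable-upload chunk granularity

    def check_chunk(content_length, last_byte, size):
        # A short chunk is only legal when it is the final one, i.e.
        # when its last byte is the last byte of the whole object.
        if content_length < GCS_MIN_CHUNK and last_byte < size - 1:
            raise ValueError("Cannot be smaller than 256KiB")

    check_chunk(256 * 1024, 262_143, 1_000_000)  # ok: aligned chunk
    check_chunk(100, 999_999, 1_000_000)         # ok: final chunk
    check_chunk(100, 99, 1_000_000)              # raises ValueError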
@@ -134,10 +143,14 @@ def complete_multipart_upload(self, upload_data, extras):
                 ]
             }
         )
+
+        filehash = base64.decodebytes(upload_data["result"]["md5Hash"].encode()).hex()
+
         return {
             "filename": os.path.relpath(
                 upload_data["result"]["name"], self.storage.settings["path"]
             ),
+            "hash": filehash,
             "content_type": upload_data["result"]["contentType"],
             "size": upload_data["size"],
         }
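The finalizing request returns the object resource from the GCS JSON API, whose md5Hash field is base64-encoded just like blob.md5_hash in the upload path, so the same decode yields a hex digest. A hypothetical caller-side check built on the new "hash" key (the local path and the commented call are made up for illustration):

    import hashlib

    def verify_upload(local_path, returned_hash):
        # Recompute the digest of the local copy and compare it with
        # the hex hash recorded by the storage backend.
        md5 = hashlib.md5()
        with open(local_path, "rb") as src:
            for chunk in iter(lambda: src.read(16_384), b""):
                md5.update(chunk)
        return md5.hexdigest() == returned_hash

    # result = storage.complete_multipart_upload(upload_data, {})
    # assert verify_upload("/tmp/local_copy.bin", result["hash"])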