diff --git a/packages/openneuro-server/src/datalad/files.ts b/packages/openneuro-server/src/datalad/files.ts
index 86b37d3f7..ef34a2575 100644
--- a/packages/openneuro-server/src/datalad/files.ts
+++ b/packages/openneuro-server/src/datalad/files.ts
@@ -1,4 +1,3 @@
-import request from "superagent"
 import { redis } from "../libs/redis"
 import CacheItem, { CacheType } from "../cache/item"
 import { getDatasetWorker } from "../libs/datalad-service"
@@ -86,36 +85,33 @@ export const computeTotalSize = (files: [DatasetFile]): number =>
  * @param {string} datasetId - Dataset accession number
  * @param {string} treeish - Git treeish hexsha
  */
-export const getFiles = (datasetId, treeish): Promise<[DatasetFile]> => {
+export const getFiles = (datasetId, treeish): Promise<[DatasetFile?]> => {
   const cache = new CacheItem(redis, CacheType.commitFiles, [
     datasetId,
     treeish.substring(0, 7),
   ])
   return cache.get(
-    (doNotCache) =>
-      request
-        .get(
-          `${
-            getDatasetWorker(
-              datasetId,
-            )
-          }/datasets/${datasetId}/tree/${treeish}`,
+    async (doNotCache): Promise<[DatasetFile?]> => {
+      const response = await fetch(`http://${
+        getDatasetWorker(
+          datasetId,
         )
-        .set("Accept", "application/json")
-        .then((response) => {
-          if (response.status === 200) {
-            const {
-              body: { files },
-            } = response
-            for (const f of files) {
-              // Skip caching this tree if it doesn't contain S3 URLs - likely still exporting
-              if (!f.directory && !f.urls[0].includes("s3.amazonaws.com")) {
-                doNotCache(true)
-                break
-              }
-            }
-            return files as [DatasetFile]
-          }
-        }) as Promise<[DatasetFile]>,
+      }/datasets/${datasetId}/tree/${treeish}`)
+      const body = await response.json()
+      const files = body?.files
+      if (files) {
+        for (const f of files) {
+          // Skip caching this tree if it doesn't contain S3 URLs - likely still exporting
+          if (!f.directory && !f.urls[0].includes("s3.amazonaws.com")) {
+            doNotCache(true)
+            break
+          }
+        }
+        return files
+      } else {
+        // Possible to have zero files here, return an empty array
+        return []
+      }
+    },
   )
 }
diff --git a/packages/openneuro-server/src/handlers/datalad.ts b/packages/openneuro-server/src/handlers/datalad.ts
index 0e84f7e78..6208d94bf 100644
--- a/packages/openneuro-server/src/handlers/datalad.ts
+++ b/packages/openneuro-server/src/handlers/datalad.ts
@@ -44,12 +44,13 @@ export const getFile = async (req, res) => {
     .then((r) => {
       // Set the content length (allow clients to catch HTTP issues better)
       res.setHeader("Content-Length", Number(r.headers.get("content-length")))
-      return r.body
+      if (r.status === 404) {
+        res.status(404).send("Requested dataset or file cannot be found")
+      } else {
+        // @ts-expect-error
+        Readable.fromWeb(r.body, { highWaterMark: 4194304 }).pipe(res)
+      }
     })
-    .then((stream) =>
-      // @ts-expect-error
-      Readable.fromWeb(stream, { highWaterMark: 4194304 }).pipe(res)
-    )
     .catch((err) => {
       console.error(err)
       res.status(500).send("Internal error transferring requested file")
diff --git a/services/datalad/datalad_service/handlers/files.py b/services/datalad/datalad_service/handlers/files.py
index 554bc1d9d..1b7d505e3 100644
--- a/services/datalad/datalad_service/handlers/files.py
+++ b/services/datalad/datalad_service/handlers/files.py
@@ -2,6 +2,7 @@
 import os
 
 import falcon
+import pygit2
 
 from datalad_service.common.git import git_show
 from datalad_service.common.user import get_user_info
@@ -18,24 +19,28 @@ def __init__(self, store):
     def on_get(self, req, resp, dataset, filename, snapshot='HEAD'):
         ds_path = self.store.get_dataset_path(dataset)
         try:
-            file_content = git_show(ds_path, snapshot, filename)
-            # If the file begins with an annex path, return that path
-            if file_content[0:4096].find('.git/annex') != -1:
-                # Resolve absolute path for annex target
-                target_path = os.path.normpath(os.path.join(
-                    ds_path, os.path.dirname(filename), file_content))
-                # Verify the annex path is within the dataset dir
-                if ds_path == os.path.commonpath((ds_path, target_path)):
-                    fd = open(target_path, 'rb')
-                    resp.stream = fd
-                    resp.stream_len = os.fstat(fd.fileno()).st_size
-                    resp.status = falcon.HTTP_OK
+            try:
+                file_content = git_show(ds_path, snapshot, filename)
+                # If the file begins with an annex path, return that path
+                if file_content[0:4096].find('.git/annex') != -1:
+                    # Resolve absolute path for annex target
+                    target_path = os.path.normpath(os.path.join(
+                        ds_path, os.path.dirname(filename), file_content))
+                    # Verify the annex path is within the dataset dir
+                    if ds_path == os.path.commonpath((ds_path, target_path)):
+                        fd = open(target_path, 'rb')
+                        resp.stream = fd
+                        resp.stream_len = os.fstat(fd.fileno()).st_size
+                        resp.status = falcon.HTTP_OK
+                    else:
+                        resp.media = {'error': 'file not found in git tree'}
+                        resp.status = falcon.HTTP_NOT_FOUND
                 else:
-                    resp.media = {'error': 'file not found in git tree'}
-                    resp.status = falcon.HTTP_NOT_FOUND
-            else:
-                resp.body = file_content
-                resp.status = falcon.HTTP_OK
+                    resp.body = file_content
+                    resp.status = falcon.HTTP_OK
+            except pygit2.GitError:
+                resp.status = falcon.HTTP_NOT_FOUND
+                resp.media = {'error': 'dataset repository does not exist or could not be opened'}
        except KeyError:
             # File is not present in tree
             resp.media = {'error': 'file not found in git tree'}
diff --git a/services/datalad/datalad_service/handlers/tree.py b/services/datalad/datalad_service/handlers/tree.py
index 08dc3ff22..2a5b561db 100644
--- a/services/datalad/datalad_service/handlers/tree.py
+++ b/services/datalad/datalad_service/handlers/tree.py
@@ -15,6 +15,11 @@ def on_get(self, req, resp, dataset, tree):
         # Request for index of files
         # Return a list of file objects
         # {name, path, size}
-        files = get_tree(self.store, dataset, tree)
-        files.sort(key=dataset_sort)
-        resp.media = {'files': files}
+        try:
+            files = get_tree(self.store, dataset, tree)
+            files.sort(key=dataset_sort)
+            resp.status = falcon.HTTP_OK
+            resp.media = {'files': files}
+        except FileNotFoundError:
+            resp.status = falcon.HTTP_NOT_FOUND
+            resp.media = {'error': 'Dataset does not exist'}
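
Note for reviewers: below is a minimal sketch (not part of the patch) of how the new tree-endpoint 404 contract can be exercised with falcon's test client. The `datalad_service.app` import path and the accession number are assumptions, not names taken from this diff.

```python
# Illustrative sketch: verify the new 404 contract for missing datasets
# on the /datasets/{dataset}/tree/{tree} route.
from falcon import testing

from datalad_service.app import app  # assumed application entry point


def test_tree_missing_dataset_returns_404():
    client = testing.TestClient(app)
    # Accession number assumed not to exist, so get_tree raises
    # FileNotFoundError and the handler maps it to 404
    result = client.simulate_get('/datasets/ds999999/tree/HEAD')
    assert result.status_code == 404
    assert result.json == {'error': 'Dataset does not exist'}
```

On the Node side, `getFiles` treats any worker response without a `files` key as an empty tree, so a worker 404 now yields `[]` for GraphQL consumers instead of a superagent rejection, while `getFile` forwards the 404 to HTTP clients directly.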