Merge pull request #2959 from OpenNeuroOrg/fix/missing-files-socket-closed-bugs

Fix API crash caused by socket closed errors
nellh authored Dec 13, 2023
2 parents 19245a2 + 7ce531f commit 48fb0d8
Showing 4 changed files with 57 additions and 50 deletions.
46 changes: 21 additions & 25 deletions packages/openneuro-server/src/datalad/files.ts
@@ -1,4 +1,3 @@
-import request from "superagent"
 import { redis } from "../libs/redis"
 import CacheItem, { CacheType } from "../cache/item"
 import { getDatasetWorker } from "../libs/datalad-service"
@@ -86,36 +85,33 @@ export const computeTotalSize = (files: [DatasetFile]): number =>
  * @param {string} datasetId - Dataset accession number
  * @param {string} treeish - Git treeish hexsha
  */
-export const getFiles = (datasetId, treeish): Promise<[DatasetFile]> => {
+export const getFiles = (datasetId, treeish): Promise<[DatasetFile?]> => {
   const cache = new CacheItem(redis, CacheType.commitFiles, [
     datasetId,
     treeish.substring(0, 7),
   ])
   return cache.get(
-    (doNotCache) =>
-      request
-        .get(
-          `${
-            getDatasetWorker(
-              datasetId,
-            )
-          }/datasets/${datasetId}/tree/${treeish}`,
-        )
-        .set("Accept", "application/json")
-        .then((response) => {
-          if (response.status === 200) {
-            const {
-              body: { files },
-            } = response
-            for (const f of files) {
-              // Skip caching this tree if it doesn't contain S3 URLs - likely still exporting
-              if (!f.directory && !f.urls[0].includes("s3.amazonaws.com")) {
-                doNotCache(true)
-                break
-              }
-            }
-            return files as [DatasetFile]
-          }
-        }) as Promise<[DatasetFile]>,
+    async (doNotCache): Promise<[DatasetFile?]> => {
+      const response = await fetch(`http://${
+        getDatasetWorker(
+          datasetId,
+        )
+      }/datasets/${datasetId}/tree/${treeish}`)
+      const body = await response.json()
+      const files = body?.files
+      if (files) {
+        for (const f of files) {
+          // Skip caching this tree if it doesn't contain S3 URLs - likely still exporting
+          if (!f.directory && !f.urls[0].includes("s3.amazonaws.com")) {
+            doNotCache(true)
+            break
+          }
+        }
+        return files
+      } else {
+        // Possible to have zero files here, return an empty array
+        return []
+      }
+    },
   )
 }
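For context, a standalone sketch of the new fetch flow; fetchTree, workerBase, and the DatasetFile shape below are illustrative stand-ins, not the actual module exports. The rewrite keeps every failure, including the socket closed errors named in the commit message, inside an async callback whose rejection the cache layer can handle, and it resolves a missing file list to an empty array rather than undefined.

// Sketch only (assumes Node 18+ global fetch); mirrors the shape of
// the new getFiles cache callback.
interface DatasetFile {
  directory: boolean
  urls: string[]
}

async function fetchTree(
  workerBase: string, // stand-in for getDatasetWorker(datasetId)
  datasetId: string,
  treeish: string,
): Promise<DatasetFile[]> {
  const response = await fetch(
    `http://${workerBase}/datasets/${datasetId}/tree/${treeish}`,
  )
  // A connection dropped mid-body rejects this await, so the failure
  // propagates as an ordinary promise rejection the caller can catch.
  const body = await response.json()
  // Zero files or an error body yields [], never undefined.
  return body?.files ?? []
}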
11 changes: 6 additions & 5 deletions packages/openneuro-server/src/handlers/datalad.ts
@@ -44,12 +44,13 @@ export const getFile = async (req, res) => {
     .then((r) => {
       // Set the content length (allow clients to catch HTTP issues better)
       res.setHeader("Content-Length", Number(r.headers.get("content-length")))
-      return r.body
+      if (r.status === 404) {
+        res.status(404).send("Requested dataset or file cannot be found")
+      } else {
+        // @ts-expect-error
+        Readable.fromWeb(r.body, { highWaterMark: 4194304 }).pipe(res)
+      }
     })
-    .then((stream) =>
-      // @ts-expect-error
-      Readable.fromWeb(stream, { highWaterMark: 4194304 }).pipe(res)
-    )
     .catch((err) => {
       console.error(err)
       res.status(500).send("Internal error transferring requested file")
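A self-contained sketch of the streaming half of this change; proxyFile and upstreamUrl are hypothetical names, and res is a plain Node response. Readable.fromWeb bridges the WHATWG stream returned by fetch into a Node Readable, the 4 MiB highWaterMark enlarges the transfer buffer, and checking the status before piping keeps an upstream 404 from streaming out as a 200 body.

import { Readable } from "node:stream"
import type { ServerResponse } from "node:http"

// Hypothetical proxy sketch, not the actual getFile handler.
async function proxyFile(upstreamUrl: string, res: ServerResponse) {
  const r = await fetch(upstreamUrl)
  if (r.status === 404 || !r.body) {
    res.statusCode = 404
    res.end("Requested dataset or file cannot be found")
    return
  }
  // Let clients detect truncated transfers
  res.setHeader("Content-Length", Number(r.headers.get("content-length")))
  // Bridge web stream -> Node stream with a 4 MiB buffer; the cast
  // stands in for the original's @ts-expect-error on the DOM vs
  // node:stream/web ReadableStream type mismatch.
  const web = r.body as unknown as import("node:stream/web").ReadableStream
  Readable.fromWeb(web, { highWaterMark: 4194304 }).pipe(res)
}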
39 changes: 22 additions & 17 deletions services/datalad/datalad_service/handlers/files.py
@@ -2,6 +2,7 @@
 import os
 
 import falcon
+import pygit2
 
 from datalad_service.common.git import git_show
 from datalad_service.common.user import get_user_info
@@ -18,24 +19,28 @@ def __init__(self, store):
     def on_get(self, req, resp, dataset, filename, snapshot='HEAD'):
         ds_path = self.store.get_dataset_path(dataset)
         try:
-            file_content = git_show(ds_path, snapshot, filename)
-            # If the file begins with an annex path, return that path
-            if file_content[0:4096].find('.git/annex') != -1:
-                # Resolve absolute path for annex target
-                target_path = os.path.normpath(os.path.join(
-                    ds_path, os.path.dirname(filename), file_content))
-                # Verify the annex path is within the dataset dir
-                if ds_path == os.path.commonpath((ds_path, target_path)):
-                    fd = open(target_path, 'rb')
-                    resp.stream = fd
-                    resp.stream_len = os.fstat(fd.fileno()).st_size
-                    resp.status = falcon.HTTP_OK
-                else:
-                    resp.media = {'error': 'file not found in git tree'}
-                    resp.status = falcon.HTTP_NOT_FOUND
-            else:
-                resp.body = file_content
-                resp.status = falcon.HTTP_OK
+            try:
+                file_content = git_show(ds_path, snapshot, filename)
+                # If the file begins with an annex path, return that path
+                if file_content[0:4096].find('.git/annex') != -1:
+                    # Resolve absolute path for annex target
+                    target_path = os.path.normpath(os.path.join(
+                        ds_path, os.path.dirname(filename), file_content))
+                    # Verify the annex path is within the dataset dir
+                    if ds_path == os.path.commonpath((ds_path, target_path)):
+                        fd = open(target_path, 'rb')
+                        resp.stream = fd
+                        resp.stream_len = os.fstat(fd.fileno()).st_size
+                        resp.status = falcon.HTTP_OK
+                    else:
+                        resp.media = {'error': 'file not found in git tree'}
+                        resp.status = falcon.HTTP_NOT_FOUND
+                else:
+                    resp.body = file_content
+                    resp.status = falcon.HTTP_OK
+            except pygit2.GitError:
+                resp.status = falcon.HTTP_NOT_FOUND
+                resp.media = {'error': 'dataset repository does not exist or could not be opened'}
         except KeyError:
             # File is not present in tree
             resp.media = {'error': 'file not found in git tree'}
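The handler now distinguishes a repository that cannot be opened (pygit2.GitError) from a file missing in the tree (KeyError) and answers both with a well-formed 404 JSON body instead of failing mid-request. The same mapping, sketched in TypeScript for the Node side of the stack; all names here are hypothetical stand-ins, not OpenNeuro code.

import type { ServerResponse } from "node:http"

// Hypothetical stand-ins for pygit2.GitError and KeyError.
class RepositoryMissingError extends Error {}
class FileMissingError extends Error {}

function sendJsonError(res: ServerResponse, status: number, message: string) {
  res.statusCode = status
  res.setHeader("Content-Type", "application/json")
  res.end(JSON.stringify({ error: message }))
}

// Both failure modes become complete 404 responses with distinct
// bodies, rather than an aborted socket the API layer cannot parse.
function handleReadFailure(res: ServerResponse, err: unknown) {
  if (err instanceof RepositoryMissingError) {
    sendJsonError(res, 404, "dataset repository does not exist or could not be opened")
  } else if (err instanceof FileMissingError) {
    sendJsonError(res, 404, "file not found in git tree")
  } else {
    sendJsonError(res, 500, "internal error")
  }
}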
11 changes: 8 additions & 3 deletions services/datalad/datalad_service/handlers/tree.py
@@ -15,6 +15,11 @@ def on_get(self, req, resp, dataset, tree):
         # Request for index of files
         # Return a list of file objects
         # {name, path, size}
-        files = get_tree(self.store, dataset, tree)
-        files.sort(key=dataset_sort)
-        resp.media = {'files': files}
+        try:
+            files = get_tree(self.store, dataset, tree)
+            files.sort(key=dataset_sort)
+            resp.status = falcon.HTTP_OK
+            resp.media = {'files': files}
+        except FileNotFoundError:
+            resp.status = falcon.HTTP_NOT_FOUND
+            resp.media = {'error': 'Dataset does not exist'}
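Combined with the getFiles change above, the /tree contract is now: 200 with {files: [...]} on success, 404 with {error: 'Dataset does not exist'} when the repository is gone, and in either case a complete JSON response. A hypothetical consumer-side smoke check; workerBase is a stand-in, and Node 18+ fetch is assumed.

// Sketch of a check against the new /tree contract.
async function checkTreeContract(
  workerBase: string,
  datasetId: string,
  treeish: string,
): Promise<void> {
  const r = await fetch(`${workerBase}/datasets/${datasetId}/tree/${treeish}`)
  const body = await r.json() // both branches now return parseable JSON
  if (r.status === 404) {
    console.log(`missing dataset: ${body.error}`)
  } else {
    console.log(`tree contains ${body.files.length} entries`)
  }
}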
