From 6c93cac64f2068a723bf24604ab70eb1c68a5b66 Mon Sep 17 00:00:00 2001 From: Riva Kepych Date: Fri, 22 Mar 2024 04:35:50 +0100 Subject: [PATCH] APPS-2549 Bulk describe optimization in scatter-collect to cover files in arrays, structs (#129) * flattenDataObjectsFromJson WIP * recurse for array * fixing compilation errors * fixing compilation errors * recursion for array * recursion for object * developers list * flattenDxFileObjectsFromJson * Move bulk describe logging here and make it consistent (after ids --> set, so the numbers should match) * Logging - uniqueFileIds.size * Revert "Logging - uniqueFileIds.size" This reverts commit d57efcb4c982e90bac97315e1830cefe8579a252. * Revert "Move bulk describe logging here and make it consistent (after ids --> set, so the numbers should match)" This reverts commit e68a639c1db090b68a078aad265854f6ca3f6043. * Add logging back * naming; tests WIP * unit tests - fix syntax * changelog --- api/RELEASE_NOTES.md | 1 + api/src/main/scala/dx/api/DxApi.scala | 27 +++++++++++++ api/src/test/scala/dx/api/DxApiTest.scala | 47 +++++++++++++++++++++++ build.sbt | 9 +---- 4 files changed, 76 insertions(+), 8 deletions(-) diff --git a/api/RELEASE_NOTES.md b/api/RELEASE_NOTES.md index b1e16d2..5bff78c 100644 --- a/api/RELEASE_NOTES.md +++ b/api/RELEASE_NOTES.md @@ -3,6 +3,7 @@ ## in develop * Change to make the user-agent string for dxScala more distinctive. +* Support for collecting data objects / files nested inside JSON objects and arrays. 
## 0.13.9 (2024-02-29) * adds `headJobOnDemand` attribute to jobNew call diff --git a/api/src/main/scala/dx/api/DxApi.scala b/api/src/main/scala/dx/api/DxApi.scala index 89c35d5..3d8cde8 100644 --- a/api/src/main/scala/dx/api/DxApi.scala +++ b/api/src/main/scala/dx/api/DxApi.scala @@ -214,6 +214,31 @@ case class DxApi(version: String = "1.0.0", dxEnv: DXEnvironment = DxApi.default } } + def flattenDxDataObjectsFromJson(jsValue: JsValue): Vector[DxDataObject] = { + try { + val obj = dataObjectFromJson(jsValue) + Vector(obj) + } catch { + case scala.util.control.NonFatal(_) => { // fatal JVM errors propagate; otherwise recurse + jsValue match { + case JsObject(fields) => + fields.values.toVector.flatMap(flattenDxDataObjectsFromJson) + case JsArray(elements) => + elements.flatMap(flattenDxDataObjectsFromJson) + case _ => + // Not an object, array, or recognized dx data object + Vector.empty + } + } + } + } + + def flattenDxFileObjectsFromJson(jsValue: JsValue): Vector[DxFile] = { + flattenDxDataObjectsFromJson(jsValue).collect { + case obj: DxFile => obj + } + } + def dataObjectFromJson(jsValue: JsValue): DxDataObject = { val link = jsValue match { case JsObject(fields) if fields.contains(DxUtils.DxLinkKey) => @@ -926,6 +951,7 @@ case class DxApi(version: String = "1.0.0", dxEnv: DXEnvironment = DxApi.default // to limit the number of objects in one API request. DxFindDataObjects // caches the desc on the DxFile object, so we only need to return the DxFile. 
def submitRequest(ids: Set[String], project: Option[DxProject]): Vector[DxFile] = { + logger.trace(s"Bulk describing ${ids.size} unique file ids") ids .grouped(limit) .flatMap { chunk => @@ -990,6 +1016,7 @@ case class DxApi(version: String = "1.0.0", dxEnv: DXEnvironment = DxApi.default } } + logger.trace(s"Successfully bulk described ${allResults.size} files") allResults } diff --git a/api/src/test/scala/dx/api/DxApiTest.scala b/api/src/test/scala/dx/api/DxApiTest.scala index d92f398..692694a 100644 --- a/api/src/test/scala/dx/api/DxApiTest.scala +++ b/api/src/test/scala/dx/api/DxApiTest.scala @@ -251,4 +251,51 @@ class DxApiTest extends AnyFlatSpec with Matchers with BeforeAndAfterAll with Mo app.describe().name shouldBe "bam_to_fastq" app.describe().version shouldBe "1.0.0" } + + it should "flat collect files from JSON - nested array" in { + // given: two file links, each wrapped in its own inner array + val jsNestedArrayWithFiles = { + val file1 = JsObject( + "$dnanexus_link" -> JsString("file-GgyF7P00q9pQGXZKkKBF5xpP") + ) + val file2 = JsObject( + "$dnanexus_link" -> JsString("file-GgyF7P00q9pyV2qbgYxk30J5") + ) + JsArray(Vector(JsArray(Vector(file1)), JsArray(Vector(file2)))) + } + + // when + val result = dxApi.flattenDxFileObjectsFromJson(jsNestedArrayWithFiles) + + // then: both files are expected in the flattened result + result.size shouldBe 2 + } + + it should "flat collect files from JSON - nested object" in { + // given: two file links (one nested a level deeper) plus one record link + val jsNestedObjectWithDataObjects = { + val file1 = JsObject( + "$dnanexus_link" -> JsString("file-GgyF7P00q9pQGXZKkKBF5xpP") + ) + val file2 = JsObject( + "$dnanexus_link" -> JsString("file-GgyF7P00q9pyV2qbgYxk30J5") + ) + val record1 = JsObject( + "$dnanexus_link" -> JsString("record-Fgk7V7j0f9JfkYK55P7k3jGY") + ) + JsObject( + "a" -> file1, + "b" -> record1, + "c" -> JsObject( + "d" -> file2 + ) + ) + } + + // when + val result = dxApi.flattenDxFileObjectsFromJson(jsNestedObjectWithDataObjects) + + // then: only the two files are expected; the record link should not be counted + result.size shouldBe 2 + } } diff --git a/build.sbt b/build.sbt index 010df80..12080e9 100644 --- a/build.sbt +++ 
b/build.sbt @@ -10,16 +10,9 @@ name := "dxScala" ThisBuild / organization := "com.dnanexus" ThisBuild / scalaVersion := "2.13.7" ThisBuild / developers := List( - Developer("commandlinegirl", - "Ola Zalcman", - "azalcman@dnanexus.com", - url("https://github.com/dnanexus")), Developer("Gvaihir", "Gvaihir", "aogrodnikov@dnanexus.com", url("https://github.com/dnanexus")), Developer("mhrvol", "Marek Hrvol", "mhrvol@dnanexus.com", url("https://github.com/dnanexus")), - Developer("r-i-v-a", - "Riva Nathans", - "rnathans@dnanexus.com", - url("https://github.com/dnanexus")), + Developer("r-i-v-a", "Riva Kepych", "rkepych@dnanexus.com", url("https://github.com/dnanexus")), Developer("YuxinShi0423", "Yuxin Shi", "yshi@dnanexus.com", url("https://github.com/dnanexus")), ) ThisBuild / homepage := Some(url("https://github.com/dnanexus/dxScala"))