Skip to content

Commit

Permalink
APPS-2549 Bulk describe optimization in scatter-collect to cover file…
Browse files Browse the repository at this point in the history
…s in arrays, structs (#129)

* flattenDataObjectsFromJson WIP

* recurse for array

* fixing compilation errors

* fixing compilation errors

* recursion for array

* recursion for object

* developers list

* flattenDxFileObjectsFromJson

* Move bulk describe logging here and make it consistent (after ids --> set, so the numbers should match)

* Logging - uniqueFileIds.size

* Revert "Logging - uniqueFileIds.size"

This reverts commit d57efcb.

* Revert "Move bulk describe logging here and make it consistent (after ids --> set, so the numbers should match)"

This reverts commit e68a639.

* Add logging back

* naming; tests WIP

* unit tests - fix syntax

* changelog
  • Loading branch information
r-i-v-a authored Mar 22, 2024
1 parent 85ca788 commit 6c93cac
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 8 deletions.
1 change: 1 addition & 0 deletions api/RELEASE_NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## in develop

* Change to make the user-agent string for dxScala more distinctive.
* Support for collecting data objects / files nested inside JSON objects and arrays.

## 0.13.9 (2024-02-29)
* adds `headJobOnDemand` attribute to jobNew call
Expand Down
27 changes: 27 additions & 0 deletions api/src/main/scala/dx/api/DxApi.scala
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,31 @@ case class DxApi(version: String = "1.0.0", dxEnv: DXEnvironment = DxApi.default
}
}

/**
  * Recursively collects every data object found anywhere inside a JSON value.
  *
  * The value is first handed to `dataObjectFromJson`. If that call succeeds,
  * the single resulting object is returned. If it throws a non-fatal exception,
  * the value is treated as a container: the field values of a `JsObject` and
  * the elements of a `JsArray` are searched recursively, and any other kind of
  * value contributes nothing.
  *
  * @param jsValue the JSON value to search
  * @return all data objects found, in traversal order; empty if there are none
  */
def flattenDxDataObjectsFromJson(jsValue: JsValue): Vector[DxDataObject] = {
  try {
    Vector(dataObjectFromJson(jsValue))
  } catch {
    // Only non-fatal failures mean "this value is not a data object link".
    // Fatal JVM errors (OutOfMemoryError, StackOverflowError on pathologically
    // deep input, InterruptedException, ...) must propagate rather than being
    // swallowed and followed by yet more recursion.
    case scala.util.control.NonFatal(_) =>
      jsValue match {
        case JsObject(fields) =>
          fields.values.toVector.flatMap(flattenDxDataObjectsFromJson)
        case JsArray(elements) =>
          elements.flatMap(flattenDxDataObjectsFromJson)
        case _ =>
          // Not an object, array, or recognized dx data object
          Vector.empty
      }
  }
}

/**
  * Collects only the file objects found anywhere inside a JSON value.
  *
  * Delegates the traversal to `flattenDxDataObjectsFromJson` and keeps the
  * results that are `DxFile` instances; every other data object is dropped.
  *
  * @param jsValue the JSON value to search
  * @return the files found, in traversal order; empty if there are none
  */
def flattenDxFileObjectsFromJson(jsValue: JsValue): Vector[DxFile] = {
  val dataObjects = flattenDxDataObjectsFromJson(jsValue)
  dataObjects.collect { case file: DxFile => file }
}

def dataObjectFromJson(jsValue: JsValue): DxDataObject = {
val link = jsValue match {
case JsObject(fields) if fields.contains(DxUtils.DxLinkKey) =>
Expand Down Expand Up @@ -926,6 +951,7 @@ case class DxApi(version: String = "1.0.0", dxEnv: DXEnvironment = DxApi.default
// to limit the number of objects in one API request. DxFindDataObjects
// caches the desc on the DxFile object, so we only need to return the DxFile.
def submitRequest(ids: Set[String], project: Option[DxProject]): Vector[DxFile] = {
logger.trace(s"Bulk describing ${ids.size} unique file ids")
ids
.grouped(limit)
.flatMap { chunk =>
Expand Down Expand Up @@ -990,6 +1016,7 @@ case class DxApi(version: String = "1.0.0", dxEnv: DXEnvironment = DxApi.default
}
}

logger.trace(s"Successfully bulk described ${allResults.size} files")
allResults
}

Expand Down
47 changes: 47 additions & 0 deletions api/src/test/scala/dx/api/DxApiTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -251,4 +251,51 @@ class DxApiTest extends AnyFlatSpec with Matchers with BeforeAndAfterAll with Mo
app.describe().name shouldBe "bam_to_fastq"
app.describe().version shouldBe "1.0.0"
}

it should "flat collect files from JSON - nested array" in {
  // Builds a dx link object for the given id
  def dxLink(id: String): JsObject = JsObject("$dnanexus_link" -> JsString(id))

  // given: an outer array holding two inner arrays, each with a single file link
  val firstInner = JsArray(Vector(dxLink("file-GgyF7P00q9pQGXZKkKBF5xpP")))
  val secondInner = JsArray(Vector(dxLink("file-GgyF7P00q9pyV2qbgYxk30J5")))
  val nestedArray = JsArray(Vector(firstInner, secondInner))

  // when
  val files = dxApi.flattenDxFileObjectsFromJson(nestedArray)

  // then: both files are found despite the extra level of nesting
  files.size shouldBe 2
}

it should "flat collect files from JSON - nested object" in {
  // Builds a dx link object for the given id
  def dxLink(id: String): JsObject = JsObject("$dnanexus_link" -> JsString(id))

  // given: one file at the top level, one file inside a nested object, and a
  // record link that must not be counted as a file
  val innerObject = JsObject(
      "d" -> dxLink("file-GgyF7P00q9pyV2qbgYxk30J5")
  )
  val nestedObject = JsObject(
      "a" -> dxLink("file-GgyF7P00q9pQGXZKkKBF5xpP"),
      "b" -> dxLink("record-Fgk7V7j0f9JfkYK55P7k3jGY"),
      "c" -> innerObject
  )

  // when
  val files = dxApi.flattenDxFileObjectsFromJson(nestedObject)

  // then: only the two file links are returned
  files.size shouldBe 2
}
}
9 changes: 1 addition & 8 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,9 @@ name := "dxScala"
ThisBuild / organization := "com.dnanexus"
ThisBuild / scalaVersion := "2.13.7"
// Developer entries declared for the whole build (ThisBuild scope); each entry
// is Developer(id, name, email, url).
// NOTE(review): "r-i-v-a" is listed twice below with different name/email values.
// This looks like old and new lines of a diff shown together - confirm which
// entry is current before relying on this list.
ThisBuild / developers := List(
Developer("commandlinegirl",
"Ola Zalcman",
"azalcman@dnanexus.com",
url("https://github.com/dnanexus")),
Developer("Gvaihir", "Gvaihir", "aogrodnikov@dnanexus.com", url("https://github.com/dnanexus")),
Developer("mhrvol", "Marek Hrvol", "mhrvol@dnanexus.com", url("https://github.com/dnanexus")),
Developer("r-i-v-a",
"Riva Nathans",
"rnathans@dnanexus.com",
url("https://github.com/dnanexus")),
Developer("r-i-v-a", "Riva Kepych", "rkepych@dnanexus.com", url("https://github.com/dnanexus")),
Developer("YuxinShi0423", "Yuxin Shi", "yshi@dnanexus.com", url("https://github.com/dnanexus")),
)
ThisBuild / homepage := Some(url("https://github.com/dnanexus/dxScala"))
Expand Down

0 comments on commit 6c93cac

Please sign in to comment.