Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

APPS-2549 Bulk describe optimization in scatter-collect to cover files in arrays, structs #129

Merged
merged 16 commits into from
Mar 22, 2024
Merged
1 change: 1 addition & 0 deletions api/RELEASE_NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## in develop

* Change to make the user-agent string for dxScala more distinctive.
* Support for collecting data objects / files nested inside JSON objects and arrays.

## 0.13.9 (2024-02-29)
* adds `headJobOnDemand` attribute to jobNew call
Expand Down
27 changes: 27 additions & 0 deletions api/src/main/scala/dx/api/DxApi.scala
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,31 @@ case class DxApi(version: String = "1.0.0", dxEnv: DXEnvironment = DxApi.default
}
}

/**
  * Recursively collects the data objects referenced inside a JSON value.
  *
  * The value is first handed to `dataObjectFromJson`. If that call succeeds, the
  * resulting object is returned as a single-element Vector and the value is not
  * searched any further. If it fails with a non-fatal error, the value is treated
  * as a container: the field values of a `JsObject` and the elements of a `JsArray`
  * are searched recursively and the results are concatenated. Any other value
  * contributes nothing.
  *
  * @param jsValue the JSON value to search
  * @return every data object found, in traversal order; empty if none was found
  */
def flattenDxDataObjectsFromJson(jsValue: JsValue): Vector[DxDataObject] = {
  try {
    Vector(dataObjectFromJson(jsValue))
  } catch {
    // Only non-fatal failures mean "this value is not itself a data object".
    // Fatal errors (OutOfMemoryError, InterruptedException, LinkageError, ...)
    // must propagate instead of being silently turned into a recursive descent.
    case scala.util.control.NonFatal(_) =>
      jsValue match {
        case JsObject(fields) =>
          fields.values.toVector.flatMap(flattenDxDataObjectsFromJson)
        case JsArray(elements) =>
          elements.flatMap(flattenDxDataObjectsFromJson)
        case _ =>
          // Not an object, array, or recognized dx data object
          Vector.empty
      }
  }
}

/**
  * Collects only the `DxFile` objects referenced inside a JSON value.
  *
  * Delegates the traversal to `flattenDxDataObjectsFromJson` and then drops every
  * result that is not a `DxFile`.
  *
  * @param jsValue the JSON value to search
  * @return the files found, in the order the traversal produced them
  */
def flattenDxFileObjectsFromJson(jsValue: JsValue): Vector[DxFile] = {
  val allDataObjects = flattenDxDataObjectsFromJson(jsValue)
  allDataObjects.flatMap {
    case file: DxFile => Some(file)
    case _            => None
  }
}

def dataObjectFromJson(jsValue: JsValue): DxDataObject = {
val link = jsValue match {
case JsObject(fields) if fields.contains(DxUtils.DxLinkKey) =>
Expand Down Expand Up @@ -926,6 +951,7 @@ case class DxApi(version: String = "1.0.0", dxEnv: DXEnvironment = DxApi.default
// to limit the number of objects in one API request. DxFindDataObjects
// caches the desc on the DxFile object, so we only need to return the DxFile.
def submitRequest(ids: Set[String], project: Option[DxProject]): Vector[DxFile] = {
logger.trace(s"Bulk describing ${ids.size} unique file ids")
ids
.grouped(limit)
.flatMap { chunk =>
Expand Down Expand Up @@ -990,6 +1016,7 @@ case class DxApi(version: String = "1.0.0", dxEnv: DXEnvironment = DxApi.default
}
}

logger.trace(s"Successfully bulk described ${allResults.size} files")
allResults
}

Expand Down
47 changes: 47 additions & 0 deletions api/src/test/scala/dx/api/DxApiTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -251,4 +251,51 @@ class DxApiTest extends AnyFlatSpec with Matchers with BeforeAndAfterAll with Mo
app.describe().name shouldBe "bam_to_fastq"
app.describe().version shouldBe "1.0.0"
}

it should "flat collect files from JSON - nested array" in {
  // given: two file links, each wrapped in its own inner array
  val fileIds = Vector(
      "file-GgyF7P00q9pQGXZKkKBF5xpP",
      "file-GgyF7P00q9pyV2qbgYxk30J5"
  )
  val innerArrays: Vector[JsValue] = fileIds.map { fileId =>
    JsArray(Vector(JsObject("$dnanexus_link" -> JsString(fileId))))
  }
  val nestedArrayJson = JsArray(innerArrays)

  // when
  val collectedFiles = dxApi.flattenDxFileObjectsFromJson(nestedArrayJson)

  // then: both files are found despite the extra level of nesting
  collectedFiles.size shouldBe 2
}

it should "flat collect files from JSON - nested object" in {
  // given: an object holding a file, a record, and a nested object with a second file
  def dxLink(objectId: String): JsObject =
    JsObject("$dnanexus_link" -> JsString(objectId))

  val innerObject = JsObject("d" -> dxLink("file-GgyF7P00q9pyV2qbgYxk30J5"))
  val nestedObjectJson = JsObject(
      "a" -> dxLink("file-GgyF7P00q9pQGXZKkKBF5xpP"),
      "b" -> dxLink("record-Fgk7V7j0f9JfkYK55P7k3jGY"),
      "c" -> innerObject
  )

  // when
  val collectedFiles = dxApi.flattenDxFileObjectsFromJson(nestedObjectJson)

  // then: only the two files are expected in the result
  collectedFiles.size shouldBe 2
}
}
9 changes: 1 addition & 8 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,9 @@ name := "dxScala"
ThisBuild / organization := "com.dnanexus"
ThisBuild / scalaVersion := "2.13.7"
ThisBuild / developers := List(
Developer("commandlinegirl",
"Ola Zalcman",
"azalcman@dnanexus.com",
url("https://github.com/dnanexus")),
Developer("Gvaihir", "Gvaihir", "aogrodnikov@dnanexus.com", url("https://github.com/dnanexus")),
Developer("mhrvol", "Marek Hrvol", "mhrvol@dnanexus.com", url("https://github.com/dnanexus")),
Developer("r-i-v-a",
"Riva Nathans",
"rnathans@dnanexus.com",
url("https://github.com/dnanexus")),
Developer("r-i-v-a", "Riva Kepych", "rkepych@dnanexus.com", url("https://github.com/dnanexus")),
Developer("YuxinShi0423", "Yuxin Shi", "yshi@dnanexus.com", url("https://github.com/dnanexus")),
)
ThisBuild / homepage := Some(url("https://github.com/dnanexus/dxScala"))
Expand Down
Loading