From 63b5f11a41877501a8627e4363441755892d8e98 Mon Sep 17 00:00:00 2001 From: Hayco de Jong Date: Tue, 26 Nov 2024 11:14:42 +0100 Subject: [PATCH] nested-complex-facets (#68) * make 'keyword' the default index type * allow for 'nested' fields in config * fix /indices to deal with nested config * translate nested queries to ES * make multiFacet aux queries 'nested' compliant * accept new style order/size in aggregations * translate nested aggregations query, including new style order/size params * remove redundant semicolon * remove unused WIP classes * Abstract deep nested ES result to bitesize :broccoli: * inline var (logging no longer needed) * Fix use map entry, not its size (!) * Del spurious newline * Bump version * Remove debug toString() * cleanup * Fix (re-)ignore 'curTerm' when doing aux queries * Bump version * Upgrade base image (fix several CVEs) * Make multiFacetCountQueries work with nested facets * multiFacetCountQuery should only request aggs for its own term * bump version * simplify predicate lambda/let * bump version (partial fix) * Fix multiFacetCountQueries to work with nested terms * Handle ES nested multiFacetCountQueries aggregation counts * bump version * remove obsolete mondriaan ignore * WIP: untangle configuration-derived mess to prep for ES query building * WIP: build 'main query' part of ES query for logical facets * WIP: build 'main query' part of ES query for logical facets * WIP: "filters" part of aggregation works, on to "aggs" part * WIP: "aggs" portion works, left to do: the "size" and "order" spec * WIP: add size+order spec * WIP: refactor>extract code clone * WIP: rename sortSpec * WIP: return path from ES: start parsing result * WIP: halfway extracting buckets; time to piece together the facetName * WIP: fix some typing issues * WIP: first working version of ES return mapping logical facet aggregations * WIP: cleanup some debug prints * WIP: make it work for configs without fixed field * version bump (0.40-xxx-7a) * Fix building queries for facets with no fixed value * version bump (0.40-xxx-7b) * Use deep value_count aggregation to get 'document' count for sorting nested facets by count * bump version (7c) * FROM and AS require same casing to satisfy linter * WIP: escalate 'size' to largest found in aggSpecs * WIP: tidy scope merge code a bit * cull aggregation results when less requested than returned by ES * bump version * nested-facets-8b: remove debug print * WIP: add multiple sort expressions to ES query when logical facets mapping to the same nested facet require different sorts TODO: extract the correct portions on ES return * WIP: suppress another unchecked cast * WIP: extract ES results according to query desires * Remove dev prints and bump version * don't recompute aggs for logical facets themselves * Use latest republic AR container --- .gitignore | 1 - config.yml | 148 +++++--- k8s/broccoli-server/Dockerfile | 4 +- pom.xml | 16 +- .../nl/knaw/huc/broccoli/api/Constants.kt | 4 + .../nl/knaw/huc/broccoli/api/ElasticQuery.kt | 164 ++++++++- .../nl/knaw/huc/broccoli/api/IndexQuery.kt | 4 +- .../broccoli/config/BroccoliConfiguration.kt | 85 ++++- .../huc/broccoli/core/ElasticQueryBuilder.kt | 348 +++++++++++++----- .../resources/brinta/BrintaResource.kt | 24 +- .../resources/projects/ProjectsResource.kt | 39 +- .../nl/knaw/huc/broccoli/service/Util.kt | 111 +++++- .../service/mock/MockIIIFStoreTest.kt | 36 -- 13 files changed, 753 insertions(+), 231 deletions(-) delete mode 100644 src/test/kotlin/nl/knaw/huc/broccoli/service/mock/MockIIIFStoreTest.kt diff --git a/.gitignore b/.gitignore index 8a11242..eb37e87 100644 --- a/.gitignore +++ b/.gitignore @@ -19,5 +19,4 @@ target/ *.log globalise.http /globalise.http -/mondriaan.http /config.yml- diff --git a/config.yml b/config.yml index 5b420bc..64ad48e 100644 --- a/config.yml +++ b/config.yml @@ -58,13 +58,10 @@ projects: fields: - name: bodyType path: "$.body.type" - type: keyword - name: invNr path: "$.body.metadata.inventoryNumber" - type: keyword - name: document path: "$.body.metadata.document" - type: keyword textRepo: uri: https://globalise.tt.di.huc.knaw.nl @@ -112,40 +109,28 @@ projects: fields: - name: bodyType path: "$.body.type" - type: keyword - name: lang path: "$.body.metadata.lang" - type: keyword - name: type path: "$.body.metadata.type" - type: keyword - name: anno path: "$.body.metadata.anno" - type: keyword - name: country path: "$.body.metadata.country" - type: keyword - name: institution path: "$.body.metadata.institution" - type: keyword - name: msid path: "$.body.metadata.msid" - type: keyword - name: period path: "$.body.metadata.period" - type: keyword - name: periodLong path: "$.body.metadata.periodlong" - type: keyword - name: letterId path: "$.body.metadata.letterid" - type: keyword - name: correspondent path: "$.body.metadata.correspondent" - type: keyword - name: location path: "$.body.metadata.location" - type: keyword annoRepo: containerName: 'mondriaan-0.9.0' uri: https://mondriaan.annorepo.dev.clariah.nl @@ -160,18 +145,81 @@ projects: deleteKey: 'republic-dev-mag-weg' joinSeparator: " " indices: - - name: 'republic-2024.06.18' + - name: 'rep-2024.11.18' bodyTypes: [ Resolution ] fields: + - name: attendantId + logical: + scope: attendants + path: ".id" + - name: attendantName + logical: + scope: attendants + path: ".name" + - name: locationName + logical: + scope: entities + path: ".name" + fixed: + path: ".category" + value: LOC + - name: locationLabels + logical: + scope: entities + path: ".labels" + fixed: + path: ".category" + value: LOC + - name: organisationName + logical: + scope: entities + path: ".name" + fixed: + path: ".category" + value: ORG + - name: organisationLabels + logical: + scope: entities + path: ".labels" + fixed: + path: ".category" + value: ORG + - name: personName + logical: + scope: entities + path: ".name" + fixed: + path: ".category" + value: PERS + - name: personLabels + logical: + scope: entities + path: ".labels" + fixed: + path: ".category" + value: PERS + - name: roleName + logical: + scope: entities + path: ".name" + fixed: + path: ".category" + value: HOE + - name: roleLabels + logical: + scope: entities + path: ".labels" + fixed: + path: ".category" + value: HOE + - name: bodyType + path: "$.body.type" - name: propositionType path: "$.body.metadata.propositionType" - type: keyword - name: resolutionType path: "$.body.metadata.resolutionType" - type: keyword - name: textType path: "$.body.metadata.textType" - type: keyword - name: sessionDate path: "$.body.metadata.sessionDate" type: date @@ -184,32 +232,38 @@ projects: - name: sessionYear path: "$.body.metadata.sessionYear" type: short - - name: delegateId - path: "$.body.metadata.delegateId" - type: keyword - - name: delegateName - path: "$.body.metadata.delegateName" - type: keyword - - name: entityCategory - path: "$.body.metadata.category" - type: keyword - - name: entityId - path: "$.body.metadata.entityId" - type: keyword - - name: entityLabels - path: "$.body.metadata.entityLabels" - type: keyword - - name: entityName - path: "$.body.metadata.name" - type: keyword + - name: attendants + type: nested + nested: + from: [ Attendant ] + fields: + - name: id + path: "$.body.metadata.delegateId" + - name: name + path: "$.body.metadata.delegateName" + with: + - equal: "$.body.metadata.sessionID" + - name: entities + type: nested + nested: + from: [ Entity ] + fields: + - name: category + path: "$.body.metadata.category" + - name: id + path: "$.body.metadata.entityId" + - name: labels + path: "$.body.metadata.entityLabels" + - name: name + path: "$.body.metadata.name" + with: + - overlap: LogicalText - name: bodyType path: "$.body.type" - type: keyword - name: sessionWeekday path: "$.body.metadata.sessionWeekday" - type: keyword annoRepo: - containerName: republic-2024.06.18 + containerName: republic-2024.11.18 uri: https://annorepo.republic-caf.diginfra.org textRepo: uri: https://textrepo.republic-caf.diginfra.org @@ -255,28 +309,21 @@ projects: fields: - name: bodyType path: "$.body.type" - type: keyword - name: date path: "$.body.metadata.date" type: date - name: recipient path: "$.body.metadata.recipient" - type: keyword - name: recipientLoc path: "$.body.metadata.recipientLoc" - type: keyword - name: sender path: "$.body.metadata.sender" - type: keyword - name: senderLoc path: "$.body.metadata.senderLoc" - type: keyword - name: editorNotes path: "$.body.metadata.editorNotes" - type: keyword - name: shelfmark path: "$.body.metadata.shelfmark" - type: keyword - name: summary path: "$.body.metadata.summary" type: text @@ -299,7 +346,6 @@ projects: fields: - name: bodyType path: "$.body.type" - type: keyword textRepo: uri: https://brieven-van-hooft.tt.di.huc.knaw.nl @@ -328,22 +374,16 @@ projects: fields: - name: correspondent path: "$.body.metadata.correspondent" - type: keyword - name: institution path: "$.body.metadata.institution" - type: keyword - name: location path: "$.body.metadata.location" - type: keyword - name: msid path: "$.body.metadata.msid" - type: keyword - name: period path: "$.body.metadata.period" - type: keyword - name: periodLong path: "$.body.metadata.periodLong" - type: keyword annoRepo: containerName: 'vangogh-0.2.0' uri: https://vangogh.annorepo.dev.clariah.nl diff --git a/k8s/broccoli-server/Dockerfile b/k8s/broccoli-server/Dockerfile index 1eb2d2c..775ccdd 100644 --- a/k8s/broccoli-server/Dockerfile +++ b/k8s/broccoli-server/Dockerfile @@ -1,11 +1,11 @@ -FROM maven:3.8.5 as builder +FROM maven:3.8.5 AS builder WORKDIR /build/ COPY ./src /build/src COPY ./pom.xml /build/ RUN mvn --no-transfer-progress --batch-mode --update-snapshots --also-make package -FROM openjdk:20-slim +FROM openjdk:24-jdk-slim RUN apt-get update && apt-get install -y curl jq WORKDIR /apps/broccoli diff --git a/pom.xml b/pom.xml index eaeb978..923dced 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ nl.knaw.huc broccoli - 0.39.0 + 0.40-nested-facets-8d jar @@ -18,7 +18,7 @@ yyyy-MM-dd'T'HH:mm:ss'Z' official - 1.9.25 + 2.0.20 17 ${java.version} ${java.version} @@ -104,8 +104,10 @@ true - - + + ${mainClass} @@ -142,8 +144,10 @@ - - + + ${mainClass} diff --git a/src/main/kotlin/nl/knaw/huc/broccoli/api/Constants.kt b/src/main/kotlin/nl/knaw/huc/broccoli/api/Constants.kt index 94b120c..a81cb7a 100644 --- a/src/main/kotlin/nl/knaw/huc/broccoli/api/Constants.kt +++ b/src/main/kotlin/nl/knaw/huc/broccoli/api/Constants.kt @@ -29,4 +29,8 @@ object Constants { } const val TEXT_TOKEN_COUNT = "text.tokenCount" + + const val NO_FILTERS = "no_filters" + + const val DOC_COUNT = "doc_count" } diff --git a/src/main/kotlin/nl/knaw/huc/broccoli/api/ElasticQuery.kt b/src/main/kotlin/nl/knaw/huc/broccoli/api/ElasticQuery.kt index def6109..baf124a 100644 --- a/src/main/kotlin/nl/knaw/huc/broccoli/api/ElasticQuery.kt +++ b/src/main/kotlin/nl/knaw/huc/broccoli/api/ElasticQuery.kt @@ -4,7 +4,12 @@ import com.fasterxml.jackson.annotation.JsonAnyGetter import com.fasterxml.jackson.annotation.JsonIgnore import com.fasterxml.jackson.annotation.JsonInclude import com.fasterxml.jackson.annotation.JsonProperty +import nl.knaw.huc.broccoli.api.Constants.NO_FILTERS import nl.knaw.huc.broccoli.api.Constants.TEXT_TOKEN_COUNT +import nl.knaw.huc.broccoli.core.ElasticQueryBuilder.LogicalAggregationBuilder.LogicalFilterScope +import nl.knaw.huc.broccoli.core.ElasticQueryBuilder.LogicalAggregationBuilder.LogicalFilterSpec +import nl.knaw.huc.broccoli.core.ElasticQueryBuilder.LogicalQueryBuilder.FixedTypeKey +import nl.knaw.huc.broccoli.service.commonPrefix @JsonInclude(JsonInclude.Include.NON_NULL) data class ElasticQuery( @@ -46,8 +51,58 @@ data class RangeQuery( ) } +data class NestedQuery( + @JsonIgnore val fieldName: String, + @JsonIgnore val constraints: Map> +) : BaseQuery() { + @JsonAnyGetter + fun toJson() = mapOf( + "nested" to mapOf( + "path" to fieldName, + "query" to mapOf( + "bool" to mapOf( + "filter" to mutableListOf>>>( + ).apply { + constraints.forEach { (nestedFieldName, allowedValues) -> + add( + mapOf("terms" to mapOf("$fieldName.$nestedFieldName" to allowedValues)) + ) + } + } + ) + ) + ) + ) +} + +data class LogicalQuery( + @JsonIgnore val scopeName: String, + @JsonIgnore val fixed: FixedTypeKey?, + @JsonIgnore val values: Map> +) : BaseQuery() { + @JsonAnyGetter + fun toJson() = mapOf( + "nested" to mapOf( + "path" to scopeName, + "query" to mapOf( + "bool" to mapOf( + "filter" to mutableListOf>>>( + ).apply { + fixed?.let { + add(mapOf("terms" to mapOf(scopeName + fixed.path to listOf(fixed.value)))) + } + values.forEach { (path: String, vals: List) -> + add(mapOf("terms" to mapOf(scopeName + path to vals))) + } + } + ) + ) + ) + ) +} + data class TermsQuery( - val terms: Map> + val terms: Map ) : BaseQuery() data class FullTextQuery( @@ -87,11 +142,13 @@ data class HighlightTerm( ) } -data class Aggregations( - @JsonIgnore val aggs: List -) { +data class Aggregations(private val elements: List) { + private val aggs = elements.toMutableList() + @JsonAnyGetter fun toJson() = aggs.associate { it.name to it.toJson() } + + fun addAll(elements: List) = apply { aggs.addAll(elements) } } abstract class Aggregation(val name: String) { @@ -119,4 +176,103 @@ class TermAggregation( sortOrder?.let { put("order", it) } } ) + + override fun toString(): String = buildString { + append(name).append('|') + numResults?.let { append(it).append('|') } + sortOrder?.let { append(it).append('|') } + append("json:") + } +} + +class LogicalAggregation( + private val scope: LogicalFilterScope, + private val filterSpec: LogicalFilterSpec +) : Aggregation(scope.name) { + override fun toJson(): Map> = mapOf( + "nested" to mapOf("path" to name), + "aggregations" to mapOf( + "filter" to mapOf( // freely configurable here: could also be, e.g., 'name' or "filter_${name}" + "filters" to mapOf( + "filters" to mutableMapOf>>().apply { + if (filterSpec.values.isEmpty()) { + this[NO_FILTERS] = mapOf("match_all" to emptyMap()) + } else { + filterSpec.values.forEach { (fixedValue, names) -> + this[names.commonPrefix()] = mapOf( + "term" to mapOf( + "${name}${filterSpec.path}" to fixedValue + ) + ) + } + } + } + ), + "aggs" to mutableMapOf().apply { + scope.spec.forEach { (name, sortSpecs) -> + sortSpecs.forEach { sortSpec -> + val order = sortSpec["order"] as String + this["$name|$order"] = mapOf( + "terms" to mutableMapOf("field" to "${scope.name}${name}") + .withSortSpec(sortSpec, "documents>count"), + "aggregations" to mapOf>>( + "documents" to mutableMapOf>( + "reverse_nested" to emptyMap() + ).apply { + if (order == "countDesc") { + put( + "aggs", mapOf( + "count" to mapOf( + "value_count" to mapOf( + "field" to "bodyType" + ) + ) + ) + ) + } + } + ) + ) + } + } + } + ) + ) + ) +} + +class NestedAggregation( + name: String, + private val fields: Map> +) : Aggregation(name) { + override fun toJson(): Map> = mapOf( + "nested" to mapOf("path" to name), + "aggregations" to mutableMapOf().apply { + fields.forEach { (nestedFieldName, sortSpec) -> + put( + nestedFieldName, mapOf( + "terms" to mutableMapOf("field" to "$name.$nestedFieldName") + .withSortSpec(sortSpec), + "aggregations" to mapOf>>( + "documents" to mapOf( + "reverse_nested" to emptyMap() + ) + ) + ) + ) + } + } + ) +} + +fun MutableMap.withSortSpec(sortSpec: Map, countSpec: String = "_count"): Map { + sortSpec["size"]?.let { this["size"] = it } + sortSpec["order"]?.let { + this["order"] = when (it) { + "keyAsc" -> mapOf("_key" to "asc") + "keyDesc" -> mapOf("_key" to "desc") + else -> mapOf(countSpec to "desc") + } + } + return this } diff --git a/src/main/kotlin/nl/knaw/huc/broccoli/api/IndexQuery.kt b/src/main/kotlin/nl/knaw/huc/broccoli/api/IndexQuery.kt index db94d01..602adf8 100644 --- a/src/main/kotlin/nl/knaw/huc/broccoli/api/IndexQuery.kt +++ b/src/main/kotlin/nl/knaw/huc/broccoli/api/IndexQuery.kt @@ -9,7 +9,7 @@ data class IndexQuery( val range: IndexRange?, @JsonProperty("aggs") - val aggregations: List? = null + val aggregations: Map>? = null ) { override fun toString(): String = buildString { text?.let { append(it).append('|') } @@ -19,7 +19,7 @@ data class IndexQuery( } } -typealias IndexTerms = Map> +typealias IndexTerms = Map data class IndexRange(val name: String, val from: String?, val to: String?) { override fun toString(): String = "$name:[$from,$to]" diff --git a/src/main/kotlin/nl/knaw/huc/broccoli/config/BroccoliConfiguration.kt b/src/main/kotlin/nl/knaw/huc/broccoli/config/BroccoliConfiguration.kt index 7fd6981..fd2855b 100644 --- a/src/main/kotlin/nl/knaw/huc/broccoli/config/BroccoliConfiguration.kt +++ b/src/main/kotlin/nl/knaw/huc/broccoli/config/BroccoliConfiguration.kt @@ -185,9 +185,90 @@ class IndexFieldConfiguration { @Valid @JsonProperty - val path: String = "$.body.id" + var type: String = "keyword" @Valid @JsonProperty - val type: String? = null + // mutually exclusive with 'logical', 'nested' + val path: String? = null + + @Valid + @JsonProperty + // mutually exclusive with 'path', 'nested' + val logical: LogicalIndexFieldConfiguration? = null + + class LogicalIndexFieldConfiguration { + @Valid + @NotNull + @JsonProperty + val scope: String = "" + + @Valid + @NotNull + @JsonProperty + val path: String = "" + + @Valid + @JsonProperty + val fixed: FixedIndexFieldConfiguration? = null + + class FixedIndexFieldConfiguration { + @Valid + @NotNull + @JsonProperty + val path: String = "" + + @Valid + @NotNull + @JsonProperty + val value: String = "" + } + } + + @Valid + @JsonProperty + // mutually exclusive with 'path', 'logical' + val nested: NestedIndexFieldConfiguration? = null + + class NestedIndexFieldConfiguration { + @Valid + @NotNull + @JsonProperty + val from: List = ArrayList() + + @Valid + @NotNull + @JsonProperty + val fields: List = ArrayList() + + @Valid + @NotNull + @JsonProperty + val with: List = ArrayList() + + class NestedIndexFieldConstraint { + @Valid + @JsonProperty + val equal: String? = null + + @Valid + @JsonProperty + val overlap: String? = null + } + + class PathIndexFieldConfiguration { + @Valid + @NotNull + @JsonProperty + val name: String = "" + + @Valid + @JsonProperty + val path: String? = null + + @Valid + @JsonProperty + val type: String = "keyword" + } + } } diff --git a/src/main/kotlin/nl/knaw/huc/broccoli/core/ElasticQueryBuilder.kt b/src/main/kotlin/nl/knaw/huc/broccoli/core/ElasticQueryBuilder.kt index 5ef633f..c718f75 100644 --- a/src/main/kotlin/nl/knaw/huc/broccoli/core/ElasticQueryBuilder.kt +++ b/src/main/kotlin/nl/knaw/huc/broccoli/core/ElasticQueryBuilder.kt @@ -1,5 +1,6 @@ package nl.knaw.huc.broccoli.core +import jakarta.ws.rs.BadRequestException import nl.knaw.huc.broccoli.api.* import nl.knaw.huc.broccoli.config.IndexConfiguration import org.slf4j.LoggerFactory @@ -25,6 +26,167 @@ class ElasticQueryBuilder(private val index: IndexConfiguration) { fun query(query: IndexQuery) = apply { this.query = normalizeQuery(query) } + fun toElasticQuery(): ElasticQuery { + val logicalAggregationBuilder = LogicalAggregationBuilder(index) + + val query = ElasticQuery( + from = from, + size = size, + sort = Sort(sortBy, sortOrder), + query = buildMainQuery(), + highlight = query.text?.let { queryText -> + HighlightTerm( + text = queryText, + fragmentSize = fragmentSize, + extraFields = index.fields.filter { it.type == "text" }.map { it.name } + ) + }, + aggregations = Aggregations( + (query.aggregations?.keys ?: configuredFieldNames()).mapNotNull { aggName -> + query.aggregations + ?.get(aggName) + ?.let { aggSpec -> + when (configuredFieldType(aggName)) { + "byte", "keyword", "short" -> + TermAggregation( + name = aggName, + numResults = aggSpec["size"] as Int, + sortOrder = orderParams[aggSpec["order"]] + ) + + "date" -> DateAggregation(aggName) + + "logical" -> { + logicalAggregationBuilder.add(aggName, aggSpec) + null // defer adding to Aggregations we are currently building + } + + "nested" -> { + @Suppress("UNCHECKED_CAST") + val nestedAggSpec = aggSpec as Map> + NestedAggregation(name = aggName, fields = nestedAggSpec) + } + + else -> null + } + } + } + ).addAll(logicalAggregationBuilder.toAggregations()) + ) + + return query + } + + class LogicalAggregationBuilder(private val index: IndexConfiguration) { + private val aggSpecs = mutableMapOf>() + + fun add(aggName: String, aggSpec: Map) { + aggSpecs[aggName] = aggSpec + } + + fun toAggregations(): List { + val scopes: MutableMap>>> = mutableMapOf() + + aggSpecs.forEach { (aggName, aggSpec) -> + val field = index.fields.find { it.name == aggName } + ?: throw BadRequestException("Unknown field '$aggName'") + + val logical = field.logical + ?: throw BadRequestException("field $aggName lacks 'logical' configuration") + + // create a copy as we may have to update its size, yet we don't want to fubar the original queryString + val copy = aggSpec.toMutableMap() + + // logical.scope, e.g., "entities" + scopes.merge(logical.scope, mutableMapOf(logical.path to mutableListOf(copy))) { scope, _ -> + scope[logical.path] = + if (scope[logical.path] == null) + mutableListOf(copy) + else { + var found = false + @Suppress("UNCHECKED_CAST") + (scope[logical.path] as MutableList>).onEach { curSpec: MutableMap -> + if (curSpec["order"] == copy["order"]) { + copy["size"]?.let { newSize -> + if (newSize as Int > curSpec["size"] as Int) { + curSpec["size"] = newSize // in place update of 'size', requires copy + } + } + found = true + } + }.apply { + if (!found) add(copy) + } + } + scope + } + } + + return scopes.map { (scope, spec) -> + var fixedField = "" + val values = LinkedHashMap>() // preserve order from config + index.fields.filter { it.logical?.scope == scope } + .forEach { field -> + field.logical!!.fixed?.let { fixed -> + fixedField = fixed.path + values.putIfAbsent(fixed.value, mutableListOf()) + values[fixed.value]!!.add(field.name) + } + } + LogicalAggregation(LogicalFilterScope(scope, spec), LogicalFilterSpec(fixedField, values)) + } + } + + data class LogicalFilterScope( + val name: String, // "entities" + val spec: Map>> + ) + + data class LogicalFilterSpec( + val path: String, // ".category" + val values: Map> // {LOC=[locationName, locationLabels], PERS=[personName, personLabels], HOE=[roleName, roleLabels]} + ) + } + + fun toMultiFacetCountQueries() = mutableListOf() + .apply { + query.terms + ?.filterNot { configuredFieldType(it.key) == "logical" } + ?.forEach { term -> + add( + ElasticQuery( + from = from, + size = size, + sort = Sort(sortBy, sortOrder), + query = buildMainQuery { it.key != term.key }, + aggregations = Aggregations( + query.aggregations + ?.get(term.key) + ?.let { termAgg -> + if (configuredFieldType(term.key) == "nested") { + @Suppress("UNCHECKED_CAST") + val spec = termAgg as MutableMap> + NestedAggregation( + name = term.key, + fields = spec.filterKeys { (term.value as Map<*, *>).containsKey(it) } + ) + } else { + TermAggregation( + name = term.key, + numResults = termAgg["size"] as Int, + sortOrder = orderParams[termAgg["order"]] + ) + } + } + ?.let { listOf(it) } + ?: emptyList() + ) + ) + ) + } + } + private fun normalizeQuery(query: IndexQuery): IndexQuery { return IndexQuery( date = query.date, @@ -48,117 +210,117 @@ class ElasticQueryBuilder(private val index: IndexConfiguration) { ) } - fun toElasticQuery() = ElasticQuery( - from = from, - size = size, - sort = Sort(sortBy, sortOrder), + private fun buildMainQuery(predicate: ((Map.Entry) -> Boolean) = { true }) = ComplexQuery( + bool = BoolQuery( + must = mutableListOf().apply { + /* + * entities -> + * (".category", "HOE") -> + * (roleName -> ["koning"]) + * (roleLabels -> ["Adel & Vorsten"] + * (".category", "PERS") -> + * (personName -> ["frankrijk"]) + * (personLabels -> ["ongelabeld"]) + * attendants -> + * [...] + */ + val logicalQueryBuilder = LogicalQueryBuilder(index) + query.terms + ?.filter(predicate) + ?.forEach { termsQuery -> + when (termsQuery.value) { + is List<*> -> { + val field = index.fields.find { it.name == termsQuery.key } + if (field?.logical != null) { + @Suppress("UNCHECKED_CAST") + logicalQueryBuilder.add(field.name, termsQuery.value as MutableList) + } else { + add(TermsQuery(mapOf(termsQuery.key to (termsQuery.value as List<*>)))) + } + } - query = ComplexQuery( - bool = BoolQuery( - must = mutableListOf().apply { - query.terms?.forEach { - add(TermsQuery(mapOf(it.key to it.value))) - } - query.date?.let { - add(RangeQuery(it.name, it.from, it.to, relation = "within")) - } - query.range?.let { - add(RangeQuery(it.name, it.from, it.to)) - } - query.text?.let { - add(FullTextQuery(QueryString(it))) + is Map<*, *> -> { + @Suppress("UNCHECKED_CAST") + add(NestedQuery(termsQuery.key, termsQuery.value as Map>)) + } + } } + addAll(logicalQueryBuilder.toQueries()) + query.date?.let { + add(RangeQuery(it.name, it.from, it.to, relation = "within")) } - ) - ), - - highlight = query.text?.let { queryText -> - HighlightTerm( - text = queryText, - fragmentSize = fragmentSize, - extraFields = index.fields.filter { it.type == "text" }.map { it.name } - ) - }, - - aggregations = (query.aggregations ?: index.fields.map { it.name }) - .map { parseAggregationParameters(it) } - .also { logger.atDebug().addArgument(it).log("parsed aggregation params: {}") } - .mapNotNull { params -> - when (index.fields.find { it.name == params.fieldName }?.type) { - "keyword", "short", "byte" -> - TermAggregation( - name = params.fieldName, - numResults = params.numResults, - sortOrder = params.sortOrder - ) - - "date" -> DateAggregation(params.fieldName) - else -> null + query.range?.let { + add(RangeQuery(it.name, it.from, it.to)) + } + query.text?.let { + add(FullTextQuery(QueryString(it))) } } - .let { Aggregations(it) } + ) ) - fun toMultiFacetCountQueries() = mutableListOf().apply { - query.terms?.forEach { curTerm -> - add(ElasticQuery( - from = from, - size = size, - sort = Sort(sortBy, sortOrder), - query = ComplexQuery( - bool = BoolQuery( - must = mutableListOf().apply { - query.terms?.forEach { - if (it.key != curTerm.key) add(TermsQuery(mapOf(it.key to it.value))) - } - query.date?.let { - add(RangeQuery(it.name, it.from, it.to, relation = "within")) - } - query.range?.let { - add(RangeQuery(it.name, it.from, it.to)) - } - query.text?.let { - add(FullTextQuery(QueryString(it))) - } - } - ) - ), - aggregations = Aggregations(listOf( - // use aggregation sort order / count, if specified - query.aggregations?.find { it.startsWith(curTerm.key + ':') }?.let { - parseAggregationParameters(it).let { params -> - TermAggregation(params.fieldName, params.numResults, params.sortOrder) - } - } ?: TermAggregation(curTerm.key)) - ))) + class LogicalQueryBuilder(private val index: IndexConfiguration) { + private val scopes = mutableMapOf() + + data class FixedTypeKey(val path: String, val value: String) + + class LogicalTypeScope { + val fixedValueTypes = mutableMapOf>>() + + fun update(key: FixedTypeKey?, logicalPath: String, values: MutableList) = apply { + fixedValueTypes.merge(key, mutableMapOf(logicalPath to values)) { soFar, _ -> + soFar[logicalPath] = values; soFar + } + } + + override fun toString() = buildString { + append("LogicalTypeScope(fixedValueTypes=") + append(fixedValueTypes.toString()) + append(')') + } + } + + fun add(fieldName: String, values: MutableList) { + val field = index.fields.find { it.name == fieldName } + ?: throw BadRequestException("Unknown field: $fieldName") + val logical = field.logical + ?: throw BadRequestException("Missing 'logical:' section in field: $fieldName") + val key = logical.fixed?.let { FixedTypeKey(it.path, it.value) } + scopes.compute(logical.scope) { _, oldValue -> + (oldValue ?: LogicalTypeScope()).update(key, logical.path, values) + } } - }.toList() + + fun toQueries(): List = + mutableListOf().apply { + scopes.forEach { (scopeName: String, scope: LogicalTypeScope) -> + scope.fixedValueTypes.forEach { (key: FixedTypeKey?, values: Map>) -> + add(LogicalQuery(scopeName, key, values)) + } + } + } + } + + private fun configuredFieldNames() = index.fields.map { it.name } + + private fun configuredFieldType(name: String) = + index.fields.find { it.name == name } + ?.let { + if (it.logical != null) "logical" + else if (it.nested != null) "nested" + else it.type + } + ?: "unknown" companion object { private val logger = LoggerFactory.getLogger(ElasticQueryBuilder::class.java) private val ES_FIELD_PREFIX = """^[a-zA-Z]*:""".toRegex() - private data class ParsedAggParams( - val fieldName: String, - var numResults: Int? = null, - var sortOrder: Map? = null - ) - private val orderParams = mapOf( "keyAsc" to mapOf("_key" to "asc"), "keyDesc" to mapOf("_key" to "desc"), "countDesc" to mapOf("_count" to "desc") ) - - private fun parseAggregationParameters(aggName: String): ParsedAggParams = - ParsedAggParams(aggName.substringBeforeLast(delimiter = ':')).apply { - aggName.substringAfterLast(delimiter = ':', missingDelimiterValue = "") - .split(',') - .forEach { param -> - param.toIntOrNull()?.let { numResults = it } - orderParams[param]?.let { sortOrder = it } - } - } } } diff --git a/src/main/kotlin/nl/knaw/huc/broccoli/resources/brinta/BrintaResource.kt b/src/main/kotlin/nl/knaw/huc/broccoli/resources/brinta/BrintaResource.kt index 1a0a92a..8fd8fc1 100644 --- a/src/main/kotlin/nl/knaw/huc/broccoli/resources/brinta/BrintaResource.kt +++ b/src/main/kotlin/nl/knaw/huc/broccoli/resources/brinta/BrintaResource.kt @@ -50,7 +50,7 @@ class BrintaResource( ), ) index.fields.forEach { field -> - field.type?.let { type -> properties[field.name] = mapOf("type" to type) } + field.type.let { type -> properties[field.name] = mapOf("type" to type) } } val mappings = mapOf("properties" to properties) @@ -92,7 +92,11 @@ class BrintaResource( @Path("indices") fun getIndices(@PathParam("projectId") projectId: String): Response = getProject(projectId).brinta.indices - .associate { idx -> idx.name to idx.fields.associate { f -> f.name to (f.type ?: "undefined") } } + .associate { idx -> + idx.name to idx.fields.associate { field -> + field.name to (field.nested?.let { nf -> nf.fields.associate { it.name to it.type } } ?: field.type) + } + } .let { Response.ok(it).build() } @DELETE @@ -215,13 +219,15 @@ class BrintaResource( // Then: optional extra payload: fields from config index.fields.forEach { field -> - try { - anno.read(field.path)?.let { payload[field.name] = it } - logger.atTrace().log("payload[{}] -> {}", field.name, payload[field.name]) - } catch (e: PathNotFoundException) { - // Must catch PNF, even though DEFAULT_PATH_LEAF_TO_NULL is set, because intermediate - // nodes can also be null, i.e., they don't exist, which still yields a PNF Exception. - // Ignore this, just means the annotation doesn't have a value for this field + field.path?.let { path -> + try { + anno.read(path)?.let { payload[field.name] = it } + logger.atTrace().log("payload[{}] -> {}", field.name, payload[field.name]) + } catch (e: PathNotFoundException) { + // Must catch PNF, even though DEFAULT_PATH_LEAF_TO_NULL is set, because intermediate + // nodes can also be null, i.e., they don't exist, which still yields a PNF Exception. + // Ignore this, just means the annotation doesn't have a value for this field + } } } diff --git a/src/main/kotlin/nl/knaw/huc/broccoli/resources/projects/ProjectsResource.kt b/src/main/kotlin/nl/knaw/huc/broccoli/resources/projects/ProjectsResource.kt index 8edc4c9..36e3c37 100644 --- a/src/main/kotlin/nl/knaw/huc/broccoli/resources/projects/ProjectsResource.kt +++ b/src/main/kotlin/nl/knaw/huc/broccoli/resources/projects/ProjectsResource.kt @@ -114,13 +114,14 @@ class ProjectsResource( val baseJson = baseResult.readEntityAsJsonString() .also { logger.trace("base json: {}", it) } - val result = mutableMapOf() - val aggs = mutableMapOf() + val result: MutableMap = mutableMapOf() + val aggs: MutableMap = mutableMapOf() jsonParser.parse(baseJson).let { context -> context.read>("$.hits.total") ?.let { result["total"] = it } - extractAggregations(context)?.let { aggs.putAll(it) } + extractAggregations(index, context)?.let { aggs.putAll(it) } + logger.atDebug().addKeyValue("aggs", aggs).log("base") context.read>>("$.hits.hits[*]") ?.map { buildHitResult(index, it) } @@ -136,17 +137,35 @@ class ProjectsResource( val auxJson = auxResult.readEntityAsJsonString() .also { logger.trace("aux json[{}]: {}", auxIndex, it) } jsonParser.parse(auxJson).let { context -> - extractAggregations(context)?.let { aggs.putAll(it) } + extractAggregations(index, context) + ?.forEach { entry -> + @Suppress("UNCHECKED_CAST") + (aggs[entry.key] as MutableMap).putAll(entry.value as Map) + } } } - // if aggregations are requested, order them according to query string - (queryString.aggregations ?: index.fields.map { it.name }).let { orderedAggregationNames -> - result["aggs"] = LinkedHashMap().apply { - orderedAggregationNames.forEach { unparsedName -> - val parsedName = unparsedName.substringBefore(":") - aggs[parsedName]?.let { put(parsedName, it) } + // use LinkedHashMap to fix aggregation order + result["aggs"] = LinkedHashMap().apply { + queryString.aggregations?.keys?.forEach { name -> + val nameAndOrder = "$name@${queryString.aggregations[name]?.get("order")}" + if (!aggs.containsKey(name) && aggs.containsKey(nameAndOrder)) { + aggs[name] = aggs[nameAndOrder] as Any } + (aggs[name] as MutableMap<*, *>?)?.apply { + val desiredAmount: Int = (queryString.aggregations[name]?.get("size") as Int?) ?: size + if (desiredAmount < entries.size) { + val keep = LinkedHashMap() + entries.take(desiredAmount).forEach { + keep[it.key as Any] = it.value as Any + } + aggs[name] = keep + } + } + } + // prefer query string order; default to order from config + (queryString.aggregations?.keys ?: index.fields.map { it.name }).forEach { name -> + aggs[name]?.let { aggregationResult -> put(name, aggregationResult) } } } diff --git a/src/main/kotlin/nl/knaw/huc/broccoli/service/Util.kt b/src/main/kotlin/nl/knaw/huc/broccoli/service/Util.kt index 1d51929..45fec44 100644 --- a/src/main/kotlin/nl/knaw/huc/broccoli/service/Util.kt +++ b/src/main/kotlin/nl/knaw/huc/broccoli/service/Util.kt @@ -1,26 +1,90 @@ package nl.knaw.huc.broccoli.service import com.jayway.jsonpath.ReadContext +import nl.knaw.huc.broccoli.api.Constants.DOC_COUNT +import nl.knaw.huc.broccoli.api.Constants.NO_FILTERS +import nl.knaw.huc.broccoli.config.IndexConfiguration // migrate to ES specific 'util' -fun extractAggregations(context: ReadContext) = context.read>("$.aggregations") - ?.mapNotNull { aggregation -> - @Suppress("UNCHECKED_CAST") - val buckets = (aggregation.value as Map)["buckets"] as List> - - if (buckets.isEmpty()) - null - else { - mapOf(aggregation.key to buckets.associate { (it["key_as_string"] ?: it["key"]) to it["doc_count"] }) +fun extractAggregations(index: IndexConfiguration, context: ReadContext) = + context.read>("$.aggregations") + ?.mapNotNull { aggregation -> + @Suppress("UNCHECKED_CAST") + val aggValuesMap = aggregation.value as Map + if ("buckets" in aggValuesMap) { + @Suppress("UNCHECKED_CAST") + val buckets = aggValuesMap["buckets"] as List> + if (buckets.isEmpty()) + null + else { + mapOf(aggregation.key to buckets.associate { + (it["key_as_string"] ?: it["key"]) to it[DOC_COUNT] + }) + } + } else if ("nested" in aggValuesMap) { + mapOf( + aggregation.key to mutableListOf>().apply { + aggValuesMap + .filter { it.key != DOC_COUNT } + .forEach { (nestedFacetName: String, nestedFacetValues: Any) -> + val nestedAggValuesMap = nestedFacetValues as Map<*, *> + if ("buckets" in nestedAggValuesMap) { + @Suppress("UNCHECKED_CAST") + val nestedBuckets = nestedAggValuesMap["buckets"] as List> + if (nestedBuckets.isNotEmpty()) { + add( + mapOf(nestedFacetName to nestedBuckets.associate { + (it["key_as_string"] ?: it["key"]) to + (it["documents"] as Map<*, *>)[DOC_COUNT] + }) + ) + } + } + } + }.groupByKey() + ) + } else if ("filter" in aggValuesMap) { + val filterBuckets: Map = getValueAtPath(aggValuesMap, "filter.buckets") + ?: return@mapNotNull null // no yield here after all, perhaps throw Exception? + mutableListOf>().apply { + filterBuckets.forEach { (key, vals) -> + @Suppress("UNCHECKED_CAST") + (vals as Map>) + .filterNot { it.key == DOC_COUNT } + .forEach { (nameAndSort, logicalAggValuesMap) -> + val name = nameAndSort.substringBefore('|') + val order = nameAndSort.substringAfter('|') + val prefix = if (key == NO_FILTERS) null else key + findLogicalFacetName(index, name, prefix)?.let { logicalFacetName -> + val buckets = logicalAggValuesMap["buckets"] as List> + if (buckets.isNotEmpty()) { + add( + mapOf("${logicalFacetName}@$order" to buckets.associate { + (it["key_as_string"] + ?: it["key"]) to (it["documents"] as Map<*, *>)[DOC_COUNT] + }) + ) + } + } + } + } + }.groupByKey() + } else null } - } - ?.groupByKey() + ?.groupByKey() + +fun findLogicalFacetName(index: IndexConfiguration, path: String, prefix: String?): String? { + return index.fields.find { field -> + field.logical?.path == path + && prefix?.let { field.name.startsWith(it) } ?: true + }?.name +} inline fun getValueAtPath(anno: Map<*, *>, path: String): V? { val steps = path.split('.').iterator() var cur: Any = anno - while (cur is Map<*, *>) { + while (cur is Map<*, *> && steps.hasNext()) { cur = cur[steps.next()] ?: return null } @@ -34,3 +98,26 @@ inline fun getValueAtPath(anno: Map<*, *>, path: String): V? { return null } + +/* + * Find common prefix in a list of Strings + * e.g. ["roleLabel", "roleName"] -> "role" + * + * edge cases: + * size 1: ["str"] -> "str" + * size 0: [] -> "" + */ +fun List.commonPrefix(): String { + if (isEmpty()) { + return "" + } + + var result = get(0) + + for (i in 1 until size) { + result = result.commonPrefixWith(get(i)) + } + + return result + +} diff --git a/src/test/kotlin/nl/knaw/huc/broccoli/service/mock/MockIIIFStoreTest.kt b/src/test/kotlin/nl/knaw/huc/broccoli/service/mock/MockIIIFStoreTest.kt deleted file mode 100644 index d630936..0000000 --- a/src/test/kotlin/nl/knaw/huc/broccoli/service/mock/MockIIIFStoreTest.kt +++ /dev/null @@ -1,36 +0,0 @@ -package nl.knaw.huc.broccoli.service.mock - -import io.dropwizard.testing.ResourceHelpers -import io.dropwizard.testing.junit5.DropwizardAppExtension -import io.dropwizard.testing.junit5.DropwizardExtensionsSupport -import jakarta.ws.rs.NotFoundException -import nl.knaw.huc.broccoli.BroccoliApplication -import org.assertj.core.api.Assertions.assertThat -import org.junit.jupiter.api.Test -import org.junit.jupiter.api.assertThrows -import org.junit.jupiter.api.extension.ExtendWith - -@ExtendWith(DropwizardExtensionsSupport::class) -internal class MockIIIFStoreTest { - private val sut = MockIIIFStore(iiifUri = "https://images.diginfra.net/api/pim", client = EXT.client()) - - @Test - fun `mock store should return mocked content`() { - assertThat(sut.getCanvasId("_", 285)).startsWith("https://images.diginfra.net/api/pim/iiif") - } - - @Test - fun `mock store should throw NotFound when volume not found`() { - assertThrows { - sut.getCanvasId("_", 999999) - } - } - - companion object { - val EXT = DropwizardAppExtension( - BroccoliApplication::class.java, - ResourceHelpers.resourceFilePath("config.yml") - ) - - } -}