Skip to content

Commit

Permalink
Attribute with repeated data type. (#991)
Browse files Browse the repository at this point in the history
* Attribute isRepeated flag. ValidateDataTypes indexing job.

* CreateEntityMain indexing job.

* WriteEntityLevelDisplayHints indexing job.

* WriteTextSearchField indexing job.

* Select repeated attr field. Result parsing test list query.

* Fetch string-only hints. Result parsing test hint query.

* Group by repeated attribute field. Sql building and results parsing tests count query.

* Filter on repeated attribute field. Sql building test list query.

* Only use FLATTENED alias for repeated attribute field.

* OpenAPI repeated type in ValueDisplay. Bug in returning hint query results.

* Remove variant config files.

* Regenerate docs.

* Check actual data rows in result parsing hint & count query tests.

* Fix overlapping conditional in eldh indexing job.

* Always populate repeatedValue property of ValueDisplay in OpenAPI.

---------

Co-authored-by: Dexter Amundsen <dexamundsen@verily.com>
  • Loading branch information
marikomedlock and dexamundsen authored Sep 13, 2024
1 parent ed07388 commit a4e913d
Show file tree
Hide file tree
Showing 32 changed files with 776 additions and 107 deletions.
7 changes: 7 additions & 0 deletions docs/generated/UNDERLAY_CONFIG.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,13 @@ When set to true, an indexing job will try to compute a display hint for this at

*Default value:* `false`

### SZAttribute.isDataTypeRepeated
**optional** boolean

True if the data type is repeated (e.g. an array of ints).

*Default value:* `false`

### SZAttribute.isSuppressedForExport
**optional** boolean

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ public void run(boolean isDryRun) {
Field.newBuilder(
columnSchema.getColumnName(),
BigQueryBeamUtils.fromDataType(columnSchema.getDataType()))
.setMode(
columnSchema.isDataTypeRepeated()
? Field.Mode.REPEATED
: Field.Mode.NULLABLE)
.build())
.collect(Collectors.toList());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,18 @@ public void run(boolean isDryRun) {
.collect(Collectors.joining(",", "[", "]")),
sourceQueryField.getType());
}

// Check that the schema repeated flags match those of the index table columns.
boolean sourceQueryFieldIsRepeated = sourceQueryField.getMode().equals(Field.Mode.REPEATED);
boolean isRepeatedFlagsMatch = attribute.isDataTypeRepeated() == sourceQueryFieldIsRepeated;
if (!isRepeatedFlagsMatch) {
foundError = true;
LOGGER.info(
"Data type repeated mismatch found for attribute {}: entity declared {}, SQL schema returns {}",
attribute.getName(),
attribute.isDataTypeRepeated(),
sourceQueryField.getMode());
}
}
if (foundError) {
throw new InvalidConfigException("Data type mismatch found for entity: " + entity.getName());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,23 +113,53 @@ public void run(boolean isDryRun) {
attribute.getName(),
minMax.getKey(),
minMax.getValue());
} else if (isEnumHint(attribute)) {
List<Pair<ValueDisplay, Long>> enumCounts = computeEnumHint(attribute, isDryRun);
} else if (isEnumHintForValueDisplay(attribute)) {
List<Pair<ValueDisplay, Long>> enumCounts =
computeEnumHintForValueDisplay(attribute, isDryRun);
enumCounts.forEach(
enumCount -> {
Literal int64Field =
attribute.isValueDisplay()
? enumCount.getKey().getValue()
: Literal.forInt64(null);
Literal stringField =
attribute.isValueDisplay()
? Literal.forString(enumCount.getKey().getDisplay())
: enumCount.getKey().getValue();

List<Literal> rowOfLiterals = new ArrayList<>();
rowOfLiterals.add(Literal.forString(attribute.getName()));
rowOfLiterals.add(Literal.forDouble(null));
rowOfLiterals.add(Literal.forDouble(null));
rowOfLiterals.add(int64Field);
rowOfLiterals.add(stringField);
rowOfLiterals.add(Literal.forInt64(enumCount.getValue()));
insertRows.add(rowOfLiterals);
LOGGER.info(
"Enum value-display or simple-string hint: {}, {}, {}, {}",
attribute.getName(),
int64Field,
stringField,
enumCount.getValue());
});
} else if (isEnumHintForRepeatedStringValue(attribute)) {
List<Pair<Literal, Long>> enumCounts =
computeEnumHintForRepeatedStringValue(attribute, isDryRun);
enumCounts.forEach(
enumCount -> {
List<Literal> rowOfLiterals = new ArrayList<>();
rowOfLiterals.add(Literal.forString(attribute.getName()));
rowOfLiterals.add(Literal.forDouble(null));
rowOfLiterals.add(Literal.forDouble(null));
rowOfLiterals.add(enumCount.getKey().getValue());
rowOfLiterals.add(Literal.forString(enumCount.getKey().getDisplay()));
rowOfLiterals.add(Literal.forInt64(null));
rowOfLiterals.add(enumCount.getKey());
rowOfLiterals.add(Literal.forInt64(enumCount.getValue()));
insertRows.add(rowOfLiterals);
LOGGER.info(
"Enum hint: {}, {}, {}, {}",
"Enum repeated-string hint: {}, {}, {}, {}",
attribute.getName(),
enumCount.getKey().getValue(),
enumCount.getKey().getDisplay(),
null,
enumCount.getKey(),
enumCount.getValue());
});
} else {
Expand Down Expand Up @@ -205,16 +235,26 @@ public void run(boolean isDryRun) {
}
}

private static boolean isEnumHint(Attribute attribute) {
return attribute.isValueDisplay() && DataType.INT64.equals(attribute.getRuntimeDataType());
private static boolean isEnumHintForValueDisplay(Attribute attribute) {
return (attribute.isValueDisplay() && DataType.INT64.equals(attribute.getRuntimeDataType()))
|| (attribute.isSimple()
&& !attribute.isDataTypeRepeated()
&& DataType.STRING.equals(attribute.getRuntimeDataType()));
}

private static boolean isEnumHintForRepeatedStringValue(Attribute attribute) {
  // Qualifies only when the attribute is simple, its data type is flagged as repeated, and that
  // data type is STRING. Non-repeated simple string attributes are picked up by
  // isEnumHintForValueDisplay instead.
  // TODO: Support repeated integer attributes.
  if (!attribute.isSimple() || !attribute.isDataTypeRepeated()) {
    return false;
  }
  return DataType.STRING.equals(attribute.getDataType());
}

private static boolean isRangeHint(Attribute attribute) {
if (attribute.isValueDisplay()) {
return false;
}
return switch (attribute.getRuntimeDataType()) {
case DOUBLE, INT64 -> true;
case DOUBLE, INT64 -> !attribute.isDataTypeRepeated();
default -> false;
};
}
Expand Down Expand Up @@ -275,29 +315,36 @@ private Pair<Literal, Literal> computeRangeHint(Attribute attribute, boolean isD
}
}

private List<Pair<ValueDisplay, Long>> computeEnumHint(Attribute attribute, boolean isDryRun) {
private List<Pair<ValueDisplay, Long>> computeEnumHintForValueDisplay(
Attribute attribute, boolean isDryRun) {
// Build the query.
// SELECT attrVal AS enumVal, attrDisp AS enumDisp, COUNT(*) AS enumCount FROM indextable GROUP
// BY enumVal, enumDisp
// SELECT attrVal AS enumVal[, attrDisp AS enumDisp], COUNT(*) AS enumCount FROM indextable
// GROUP BY enumVal[, enumDisp]
SqlField attrValField = indexAttributesTable.getAttributeValueField(attribute.getName());
SqlField attrDispField = indexAttributesTable.getAttributeDisplayField(attribute.getName());
SqlField attrDispField =
attribute.isValueDisplay()
? indexAttributesTable.getAttributeDisplayField(attribute.getName())
: null;
final String enumValAlias = "enumVal";
final String enumDispAlias = "enumDisp";
final String enumCountAlias = "enumCount";

String selectEnumCountSql =
"SELECT "
+ SqlQueryField.of(attrValField, enumValAlias).renderForSelect()
+ ", "
+ SqlQueryField.of(attrDispField, enumDispAlias).renderForSelect()
+ ", COUNT(*) AS "
"SELECT " + SqlQueryField.of(attrValField, enumValAlias).renderForSelect();
if (attribute.isValueDisplay()) {
selectEnumCountSql += ", " + SqlQueryField.of(attrDispField, enumDispAlias).renderForSelect();
}
selectEnumCountSql +=
", COUNT(*) AS "
+ enumCountAlias
+ " FROM "
+ indexAttributesTable.getTablePointer().render()
+ " GROUP BY "
+ SqlQueryField.of(attrValField, enumValAlias).renderForGroupBy(null, true)
+ ", "
+ SqlQueryField.of(attrDispField, enumDispAlias).renderForGroupBy(null, true);
+ SqlQueryField.of(attrValField, enumValAlias).renderForGroupBy(null, true);
if (attribute.isValueDisplay()) {
selectEnumCountSql +=
", " + SqlQueryField.of(attrDispField, enumDispAlias).renderForGroupBy(null, true);
}
LOGGER.info("SQL enum count: {}", selectEnumCountSql);

// Execute the query.
Expand All @@ -315,16 +362,26 @@ private List<Pair<ValueDisplay, Long>> computeEnumHint(Attribute attribute, bool
// Parse the result rows.
for (FieldValueList rowResult : tableResult.getValues()) {
FieldValue enumValFieldValue = rowResult.get(enumValAlias);
Literal enumVal =
Literal.forInt64(enumValFieldValue.isNull() ? null : enumValFieldValue.getLongValue());
FieldValue enumDispFieldValue = rowResult.get(enumDispAlias);
String enumDisp = enumDispFieldValue.isNull() ? null : enumDispFieldValue.getStringValue();
Literal enumVal;
String enumDisp;
if (attribute.isValueDisplay()) {
enumVal =
Literal.forInt64(
enumValFieldValue.isNull() ? null : enumValFieldValue.getLongValue());
FieldValue enumDispFieldValue = rowResult.get(enumDispAlias);
enumDisp = enumDispFieldValue.isNull() ? null : enumDispFieldValue.getStringValue();
} else {
enumVal =
Literal.forString(
enumValFieldValue.isNull() ? null : enumValFieldValue.getStringValue());
enumDisp = null;
}
FieldValue enumCountFieldValue = rowResult.get(enumCountAlias);
long enumCount = enumCountFieldValue.getLongValue();
enumCounts.add(Pair.of(new ValueDisplay(enumVal, enumDisp), enumCount));

if (enumCounts.size() > MAX_ENUM_VALS_FOR_DISPLAY_HINT) {
// if there are more than the max number of values, then skip the display hint
// If there are more than the max number of values, then skip the display hint.
LOGGER.info(
"Skipping enum values display hint because there are >{} possible values: {}",
MAX_ENUM_VALS_FOR_DISPLAY_HINT,
Expand All @@ -334,17 +391,82 @@ private List<Pair<ValueDisplay, Long>> computeEnumHint(Attribute attribute, bool
}
}

// Check that there is exactly one display per value.
Map<Literal, String> valDisplay = new HashMap<>();
enumCounts.forEach(
enumCount -> {
if (valDisplay.containsKey(enumCount.getKey().getValue())) {
throw new InvalidConfigException(
"Found >1 possible display for the enum value " + enumCount.getKey().getValue());
} else {
valDisplay.put(enumCount.getKey().getValue(), enumCount.getKey().getDisplay());
}
});
if (attribute.isValueDisplay()) {
// Check that there is exactly one display per value.
Map<Literal, String> valDisplay = new HashMap<>();
enumCounts.forEach(
enumCount -> {
if (valDisplay.containsKey(enumCount.getKey().getValue())) {
throw new InvalidConfigException(
"Found >1 possible display for the enum value " + enumCount.getKey().getValue());
} else {
valDisplay.put(enumCount.getKey().getValue(), enumCount.getKey().getDisplay());
}
});
}
return enumCounts;
}

/**
 * Computes the enum display hint for a repeated string attribute by unnesting the attribute's
 * value column in the index table and counting occurrences of each distinct element.
 *
 * @param attribute the attribute whose repeated value column is unnested and counted
 * @param isDryRun when true, the query is only dry-run (and only if the output table already
 *     exists) and a single placeholder pair (empty string, 0) is returned
 * @return one (element value, count) pair per distinct unnested element; an empty list if the
 *     number of distinct elements exceeds {@code MAX_ENUM_VALS_FOR_DISPLAY_HINT}
 */
private List<Pair<Literal, Long>> computeEnumHintForRepeatedStringValue(
    Attribute attribute, boolean isDryRun) {
  // TODO: Consolidate the logic here with the ValueDisplay enum hint method.
  // Build the query.
  // SELECT flattenedAttrVal AS enumVal, COUNT(*) AS enumCount FROM indextable
  // CROSS JOIN UNNEST(indextable.attrVal) AS flattenedAttrVal
  // GROUP BY enumVal
  SqlField attrValField = indexAttributesTable.getAttributeValueField(attribute.getName());
  final String enumValAlias = "enumVal";
  final String enumCountAlias = "enumCount";
  final String flattenedAttrValAlias = "flattenedAttrVal";
  SqlField flattenedAttrValField = SqlField.of(flattenedAttrValAlias);

  // The GROUP BY clause is rendered from the same flattened field that is selected as enumVal,
  // so the grouping key is always the unnested element and never the original repeated column.
  String selectEnumCountSql =
      "SELECT "
          + SqlQueryField.of(flattenedAttrValField, enumValAlias).renderForSelect()
          + ", COUNT(*) AS "
          + enumCountAlias
          + " FROM "
          + indexAttributesTable.getTablePointer().render()
          + " CROSS JOIN UNNEST("
          + SqlQueryField.of(attrValField).renderForSelect()
          + ") AS "
          + flattenedAttrValAlias
          + " GROUP BY "
          + SqlQueryField.of(flattenedAttrValField, enumValAlias).renderForGroupBy(null, true);
  LOGGER.info("SQL enum count: {}", selectEnumCountSql);

  // Execute the query.
  List<Pair<Literal, Long>> enumCounts = new ArrayList<>();
  if (isDryRun) {
    if (getOutputTable().isEmpty()) {
      LOGGER.info("Skipping query dry run because output table does not exist yet.");
    } else {
      googleBigQuery.dryRunQuery(selectEnumCountSql);
    }
    // Placeholder result so that callers still have one row to process during a dry run.
    enumCounts.add(Pair.of(Literal.forString(""), 0L));
  } else {
    TableResult tableResult = googleBigQuery.runQueryLongTimeout(selectEnumCountSql);

    // Parse the result rows.
    for (FieldValueList rowResult : tableResult.getValues()) {
      FieldValue enumValFieldValue = rowResult.get(enumValAlias);
      Literal enumVal =
          Literal.forString(
              enumValFieldValue.isNull() ? null : enumValFieldValue.getStringValue());
      FieldValue enumCountFieldValue = rowResult.get(enumCountAlias);
      long enumCount = enumCountFieldValue.getLongValue();
      enumCounts.add(Pair.of(enumVal, enumCount));

      if (enumCounts.size() > MAX_ENUM_VALS_FOR_DISPLAY_HINT) {
        // If there are more than the max number of values, then skip the display hint.
        LOGGER.info(
            "Skipping enum values display hint because there are >{} possible values: {}",
            MAX_ENUM_VALS_FOR_DISPLAY_HINT,
            attribute.getName());
        return List.of();
      }
    }
  }
  return enumCounts;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -69,13 +69,22 @@ public void run(boolean isDryRun) {
SqlField attributeTextField;
if (attribute.isValueDisplay()) {
attributeTextField = indexTable.getAttributeDisplayField(attribute.getName());
} else if (!attribute.getDataType().equals(DataType.STRING)) {
attributeTextField =
indexTable
.getAttributeValueField(attribute.getName())
.cloneWithFunctionWrapper("CAST(${fieldSql} AS STRING)");
} else {
String functionWrapper = null;
if (!attribute.getDataType().equals(DataType.STRING)) {
functionWrapper = "CAST(${fieldSql} AS STRING)";
}
if (attribute.isDataTypeRepeated()) {
functionWrapper =
"ARRAY_TO_STRING("
+ (functionWrapper == null ? "${fieldSql}" : functionWrapper)
+ ", \" \")";
}

attributeTextField = indexTable.getAttributeValueField(attribute.getName());
if (functionWrapper != null) {
attributeTextField = attributeTextField.cloneWithFunctionWrapper(functionWrapper);
}
}

String idTextSql =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,21 @@ public static ApiAttribute toApiObject(Attribute attribute) {

public static ApiValueDisplay toApiObject(ValueDisplay valueDisplay) {
ApiValueDisplay apiObject = new ApiValueDisplay();
if (valueDisplay != null) {
apiObject.value(toApiObject(valueDisplay.getValue())).display(valueDisplay.getDisplay());
if (valueDisplay == null) {
return apiObject;
} else if (valueDisplay.isRepeatedValue()) {
return apiObject
.isRepeatedValue(true)
.repeatedValue(
valueDisplay.getRepeatedValue().stream().map(ToApiUtils::toApiObject).toList());
} else {
ApiLiteral apiValue = toApiObject(valueDisplay.getValue());
return apiObject
.value(apiValue)
.display(valueDisplay.getDisplay())
.isRepeatedValue(false)
.repeatedValue(List.of(apiValue));
}
return apiObject;
}

public static ApiLiteral toApiObject(Literal literal) {
Expand Down
6 changes: 6 additions & 0 deletions service/src/main/resources/api/service_openapi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1432,6 +1432,12 @@ components:
type: string
description: Optional display string
nullable: true
repeatedValue:
type: array
items:
$ref: "#/components/schemas/Literal"
isRepeatedValue:
type: boolean

Literal:
type: object
Expand Down
1 change: 1 addition & 0 deletions ui/src/tanagra-underlay/underlayConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ export type SZAttribute = {
displayHintRangeMax?: number;
displayHintRangeMin?: number;
isComputeDisplayHint?: boolean;
isDataTypeRepeated?: boolean;
isSuppressedForExport?: boolean;
name: string;
runtimeDataType?: SZDataType;
Expand Down
Loading

0 comments on commit a4e913d

Please sign in to comment.