Skip to content

Commit

Permalink
treatAsLanguageTagged #188
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Jun 28, 2024
1 parent a39d0c8 commit 01f9495
Show file tree
Hide file tree
Showing 36 changed files with 1,458 additions and 39 deletions.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,11 @@ a single data elements (a DataELement in the API). Its properties are:
against
* `indexField` (String): the name which can be used in a search engine connected
to the application (at the time of writing Apache Solr is supported)
* `inactive` (boolean): the data element is inactive, do not run checks on this
* `identifierField` (boolean): the data element is the identifier of the record
* `asLanguageTagged` (boolean): treat the data element as language tagged. It works
for JSON where the content of the data element is encoded as an associative
array (a JSON object) whose keys are the language tags and whose values are arrays of strings.

Optionally you can set the "canonical list" of categories. It provides
two additional functionalities
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@ protected <T extends XmlFieldInstance> Object measureCsvWithGenerics(List<String
private void runMeasurements(OutputCollector collector) {
// Run every configured calculator against the cached record and hand its
// results to the collector. (A leftover debug print of each calculator to
// System.err was removed: it produced one line per calculator per record.)
if (calculators != null) {
  for (Calculator calculator : getCalculators()) {
    List<MetricResult> result = calculator.measure(cache);
    collector.addResult(calculator, result, compressionLevel);
  }
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import de.gwdg.metadataqa.api.counter.FieldCounter;
import de.gwdg.metadataqa.api.interfaces.Calculator;
import de.gwdg.metadataqa.api.interfaces.MetricResult;
import de.gwdg.metadataqa.api.json.DataElement;
import de.gwdg.metadataqa.api.model.EdmFieldInstance;
import de.gwdg.metadataqa.api.model.selector.Selector;
import de.gwdg.metadataqa.api.model.XmlFieldInstance;
Expand Down Expand Up @@ -56,18 +57,34 @@ public List<MetricResult> measure(Selector cache)

if (schema != null) {
String path;
DataElement dataELement;
boolean asLanguageTagged;
for (String fieldName : schema.getExtractableFields().keySet()) {
if (idPath == null || !fieldName.equals(FIELD_NAME)) {
dataELement = schema.getPathByLabel(fieldName);
path = schema.getExtractableFields().get(fieldName);
extractSingleField(cache, resultMap, path, fieldName);
extractSingleField(cache, resultMap, path, fieldName, dataELement);
}
}
}
return List.of(new FieldCounterBasedResult<>(getCalculatorName(), resultMap).withNoCompression());
}

/**
 * Extracts a single field that is addressed by its path only.
 *
 * <p>Delegates to the five-argument overload with a {@code null} data element, so the
 * values are looked up in the cache by {@code path}.</p>
 *
 * @param cache the selector the values are read from
 * @param resultMap the counter that receives the extracted value
 * @param path the path of the field inside the record
 * @param fieldName the name under which the value is registered
 */
private void extractSingleField(Selector cache, FieldCounter<String> resultMap, String path, String fieldName) {
  // No local lookup here: the delegate performs the cache read itself.
  extractSingleField(cache, resultMap, path, fieldName, null);
}

private void extractSingleField(Selector cache,
FieldCounter<String> resultMap,
String path,
String fieldName,
DataElement dataELement) {
List<XmlFieldInstance> values;
if (dataELement != null) {
values = cache.get(dataELement);
} else {
values = cache.get(path);
}
String value = null;
if (values == null || values.isEmpty() || values.get(0) == null) {
value = nullValue;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ public class Field {
private List<Rule> rules;
private String indexField;
private boolean identifierField;
private boolean asLanguageTagged;

public String getName() {
return name;
Expand Down Expand Up @@ -75,4 +76,12 @@ public boolean isIdentifierField() {
/**
 * Sets whether this field is the identifier of the record.
 *
 * @param isRecordId the new value of the {@code identifierField} flag
 */
public void setIdentifierField(final boolean isRecordId) {
  identifierField = isRecordId;
}

/**
 * Tells whether this field is configured to be treated as language tagged.
 *
 * @return the current value of the {@code asLanguageTagged} flag
 */
public boolean isAsLanguageTagged() {
  return this.asLanguageTagged;
}

/**
 * Sets whether this field should be treated as language tagged.
 *
 * @param asLanguageTagged the new value of the flag
 */
public void setAsLanguageTagged(final boolean asLanguageTagged) {
  this.asLanguageTagged = asLanguageTagged;
}
}
10 changes: 10 additions & 0 deletions src/main/java/de/gwdg/metadataqa/api/json/DataElement.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ public class DataElement implements Cloneable, Serializable {
private List<Rule> rules;
private Schema schema;
private String indexField;
private boolean asLanguageTagged = false;

public DataElement(String label, String path, String solrFieldName) {
this.label = label;
Expand Down Expand Up @@ -292,4 +293,13 @@ public DataElement setIndexField(String indexField) {
/**
 * Derives an index field name from the label by replacing every character that matches
 * the regular expression {@code \W} (a non-word character) with an underscore.
 *
 * @return the label with all non-word characters replaced by {@code _}
 */
public String generateIndexField() {
  final String nonWordCharacter = "\\W";
  final String replacement = "_";
  return label.replaceAll(nonWordCharacter, replacement);
}

/**
 * Marks this data element as language tagged.
 *
 * <p>Equivalent to {@code setAsLanguageTagged(true)}.</p>
 *
 * @return this data element, to allow call chaining
 */
public DataElement setAsLanguageTagged() {
  return setAsLanguageTagged(true);
}

/**
 * Sets explicitly whether this data element should be treated as language tagged.
 *
 * <p>Unlike the no-argument variant this also allows switching the flag off again.</p>
 *
 * @param asLanguageTagged the new value of the flag
 * @return this data element, to allow call chaining
 */
public DataElement setAsLanguageTagged(boolean asLanguageTagged) {
  this.asLanguageTagged = asLanguageTagged;
  return this;
}

/**
 * Tells whether this data element is treated as language tagged.
 *
 * @return the current value of the {@code asLanguageTagged} flag
 */
public boolean isAsLanguageTagged() {
  return this.asLanguageTagged;
}
}
70 changes: 63 additions & 7 deletions src/main/java/de/gwdg/metadataqa/api/json/JsonUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -82,6 +83,13 @@ public static List<String> extractList(Object value) {
/**
 * Extracts field instances from a JSON value without language-tag handling.
 *
 * <p>Convenience overload that calls the four-argument variant with
 * {@code asLanguageTagged} set to {@code false}.</p>
 *
 * @param value the JSON value to convert
 * @param recordId the identifier of the current record
 * @param jsonPath the path the value was read from
 * @return the list produced by the four-argument overload
 */
public static List<? extends XmlFieldInstance> extractFieldInstanceList(Object value,
                                                                        String recordId,
                                                                        String jsonPath) {
  final boolean asLanguageTagged = false;
  return extractFieldInstanceList(value, recordId, jsonPath, asLanguageTagged);
}

public static List<? extends XmlFieldInstance> extractFieldInstanceList(Object value,
String recordId,
String jsonPath,
boolean asLanguageTagged) {
List<EdmFieldInstance> extracted = new ArrayList<>();
if (value.getClass() == String.class) {
extracted.add(new EdmFieldInstance((String) value));
Expand Down Expand Up @@ -110,9 +118,13 @@ public static List<? extends XmlFieldInstance> extractFieldInstanceList(Object v
} else if (outerVal.getClass() == BigInteger.class) {
extracted.add(new EdmFieldInstance(((BigInteger) outerVal).toString()));
} else if (outerVal.getClass() == JSONArray.class) {
extracted.addAll(extractInnerArray(outerVal, recordId, jsonPath));
extracted.addAll(extractInnerArray(outerVal, recordId, jsonPath, asLanguageTagged));
} else if (outerVal.getClass() == LinkedHashMap.class) {
extracted.add(hashToFieldInstance(outerVal, recordId, jsonPath));
if (asLanguageTagged) {
extracted.addAll(convertLanguageTaggedMap(outerVal, recordId, jsonPath));
} else {
extracted.add(hashToFieldInstance(outerVal, recordId, jsonPath, asLanguageTagged));
}
} else {
LOGGER.severe(String.format(
"Unhandled outerArray type: %s, %s [record ID: %s, path: %s]",
Expand All @@ -121,7 +133,11 @@ public static List<? extends XmlFieldInstance> extractFieldInstanceList(Object v
}
}
} else if (value.getClass() == LinkedHashMap.class) {
extracted.add(hashToFieldInstance(value, recordId, jsonPath));
if (asLanguageTagged) {
extracted.addAll(convertLanguageTaggedMap(value, recordId, jsonPath));
} else {
extracted.add(hashToFieldInstance(value, recordId, jsonPath, asLanguageTagged));
}
} else if (value.getClass() == Integer.class) {
extracted.add(new EdmFieldInstance(Integer.toString((int) value)));
} else if (value.getClass() == Float.class) {
Expand All @@ -143,17 +159,22 @@ public static List<? extends XmlFieldInstance> extractFieldInstanceList(Object v
return extracted;
}

private static List<EdmFieldInstance> extractInnerArray(Object outerVal, String recordId, String jsonPath) {
private static List<EdmFieldInstance> extractInnerArray(Object outerVal, String recordId, String jsonPath,
boolean asLanguageTagged) {
List<EdmFieldInstance> extracted = new ArrayList<>();
JSONArray array = (JSONArray) outerVal;
for (int j = 0, l2 = array.size(); j < l2; j++) {
Object innerVal = array.get(j);
if (innerVal.getClass() == String.class) {
extracted.add(new EdmFieldInstance((String) innerVal));
} else if (innerVal.getClass() == LinkedHashMap.class) {
extracted.add(hashToFieldInstance(innerVal, recordId, jsonPath));
if (asLanguageTagged) {
extracted.addAll(convertLanguageTaggedMap(innerVal, recordId, jsonPath));
} else {
extracted.add(hashToFieldInstance(innerVal, recordId, jsonPath, asLanguageTagged));
}
} else if (innerVal.getClass() == JSONArray.class) {
extracted.addAll(extractInnerArray(innerVal, recordId, jsonPath));
extracted.addAll(extractInnerArray(innerVal, recordId, jsonPath, asLanguageTagged));
} else {
LOGGER.severe(String.format(
"Unhandled inner array type: %s, [record ID: %s, path: %s]",
Expand All @@ -164,7 +185,34 @@ private static List<EdmFieldInstance> extractInnerArray(Object outerVal, String
return extracted;
}

public static EdmFieldInstance hashToFieldInstance(Object innerVal, String recordId, String jsonPath) {
/**
 * Converts a language tagged JSON map into field instances.
 *
 * <p>Every key of the map is used as a language tag. The value belonging to a key is
 * expected to be a {@code JSONArray}; each string item of that array becomes one
 * {@code EdmFieldInstance} created from the string and the language tag. Entry values
 * that are not arrays and array items that are not strings (including {@code null})
 * are reported through the logger and skipped.</p>
 *
 * @param innerVal the map to convert; it is cast to {@code LinkedHashMap<String, Object>}
 * @param recordId the identifier of the current record, used in log messages only
 * @param jsonPath the path of the data element, used in log messages only
 * @return the created instances in iteration order, empty if nothing usable was found
 */
private static Collection<? extends EdmFieldInstance> convertLanguageTaggedMap(Object innerVal,
                                                                               String recordId,
                                                                               String jsonPath) {
  List<EdmFieldInstance> instances = new ArrayList<>();
  Map<String, Object> map = (LinkedHashMap<String, Object>) innerVal;
  for (Map.Entry<String, Object> entry : map.entrySet()) {
    String languageTag = entry.getKey();
    Object entryValue = entry.getValue();
    if (entryValue instanceof JSONArray) {
      for (Object item : (JSONArray) entryValue) {
        if (item instanceof String) {
          instances.add(new EdmFieldInstance((String) item, languageTag));
        } else {
          // Report the offending item itself, not the array that contains it.
          LOGGER.severe(String.format(
            "Other type of array item: %s %s [language: %s, record ID: %s, path: %s]",
            typeNameOf(item), item, languageTag, recordId, jsonPath));
        }
      }
    } else {
      // entryValue may be null here, so its type name is resolved null-safely.
      LOGGER.severe(String.format(
        "Other type of entry value: %s %s [language: %s, record ID: %s, path: %s]",
        typeNameOf(entryValue), entryValue, languageTag, recordId, jsonPath));
    }
  }
  return instances;
}

/** Returns the canonical class name of the object, or the text "null" for a null reference. */
private static String typeNameOf(Object object) {
  return object == null ? "null" : object.getClass().getCanonicalName();
}

public static EdmFieldInstance hashToFieldInstance(Object innerVal,
String recordId,
String jsonPath,
boolean asLanguageTagged) {
Map<String, Object> map = (LinkedHashMap<String, Object>) innerVal;
var instance = new EdmFieldInstance();
for (Map.Entry<String, Object> entry : map.entrySet()) {
Expand Down Expand Up @@ -197,6 +245,14 @@ public static EdmFieldInstance hashToFieldInstance(Object innerVal, String recor
// instance.setValue(map.get("def"));
} else if (entry.getKey().equals("@lang")) {
instance.setLanguage((String) value);
} else if (asLanguageTagged) {
instance.setLanguage(entry.getKey());
if (entry.getValue() instanceof JSONArray) {
JSONArray values = (JSONArray) entry.getValue();
instance.setValue((String) values.get(0));
} else {
LOGGER.severe("Other type of entry value: " + entry.getValue().getClass().getCanonicalName());
}
} else {
LOGGER.severe(String.format(
"Other type (%s) of map: %s, [record ID: %s, path: %s]",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package de.gwdg.metadataqa.api.model.selector;

import de.gwdg.metadataqa.api.json.DataElement;
import de.gwdg.metadataqa.api.model.XmlFieldInstance;

import java.util.HashMap;
Expand All @@ -20,6 +21,9 @@ public abstract class BaseSelector<T extends XmlFieldInstance> implements Select
/**
 * Returns the field instances found at the given path.
 *
 * <p>The path doubles as the cache address; no JSON fragment and no target class
 * are passed on.</p>
 *
 * @param path the path of the field
 * @return whatever the four-argument {@code get} returns for this path
 */
public List<T> get(String path) {
  final String address = path;
  return get(address, path, null, null);
}
/**
 * Returns the field instances of the given data element.
 *
 * <p>Only the path of the data element is used here: it serves both as the cache
 * address and as the path passed to the four-argument {@code get}.</p>
 *
 * @param dataElement the data element whose path is looked up
 * @return whatever the four-argument {@code get} returns for the element's path
 */
public List<T> get(DataElement dataElement) {
  final String path = dataElement.getPath();
  return get(path, path, null, null);
}

public <E> E get(String path, Class<E> clazz) {
if (!typedCache.containsKey(path)) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package de.gwdg.metadataqa.api.model.selector;

import de.gwdg.metadataqa.api.json.DataElement;
import de.gwdg.metadataqa.api.json.JsonUtils;
import de.gwdg.metadataqa.api.model.XmlFieldInstance;
import de.gwdg.metadataqa.api.util.ExceptionUtils;
Expand Down Expand Up @@ -41,13 +42,50 @@ public JsonSelector(Object jsonDocument) {
this.document = jsonDocument;
}

/**
 * Returns the field instances of a data element, extracting them on first access and
 * serving them from the cache afterwards.
 *
 * <p>Language tagged data elements are cached under their own key. The path based
 * lookups of this class extract values with {@code asLanguageTagged = false} and cache
 * them under the plain path; sharing that entry would make the result of this method
 * depend on which lookup happened to run first.</p>
 *
 * @param dataElement the data element to look up
 * @return the cached instances, or {@code null} if nothing was extracted
 */
public List<T> get(DataElement dataElement) {
  String key = cacheKey(dataElement);
  if (!cache.containsKey(key)) {
    set(dataElement, null, null);
  }
  return cache.get(key);
}

/**
 * Reads the value of a data element and stores the outcome in the caches.
 *
 * <p>Without a target class the value is converted to field instances, honouring the
 * element's {@code asLanguageTagged} flag. With a target class the raw value (or the
 * first item of a {@code JSONArray}, cast to that class) is put into the typed cache
 * under the element's path, and the instance list stays {@code null}.</p>
 *
 * @param dataElement the data element to read
 * @param jsonFragment the fragment handed over to {@code read}; may be {@code null}
 * @param clazz the target class for the typed cache, or {@code null} to extract instances
 */
protected void set(DataElement dataElement, Object jsonFragment, Class clazz) {
  String path = dataElement.getPath();
  List<T> instances = null;
  Object value = read(path, jsonFragment);
  if (value != null) {
    if (clazz == null) {
      instances = (List<T>) JsonUtils.extractFieldInstanceList(
        value, recordId, path, dataElement.isAsLanguageTagged()
      );
    } else if (value instanceof JSONArray) {
      typedCache.put(path, clazz.cast(((JSONArray) value).get(0)));
    } else {
      typedCache.put(path, value);
    }
  }
  cache.put(cacheKey(dataElement), instances);
}

/**
 * Builds the instance cache key of a data element: the plain path for ordinary elements
 * (so they keep sharing entries with path based lookups), and the path plus a marker
 * for language tagged elements.
 */
private static String cacheKey(DataElement dataElement) {
  String path = dataElement.getPath();
  return dataElement.isAsLanguageTagged() ? path + "|asLanguageTagged" : path;
}

/**
 * Returns the field instances found at the given path.
 *
 * <p>The path is used as the cache address as well; no JSON fragment and no target
 * class are passed on.</p>
 *
 * @param path the path of the field
 * @return whatever the four-argument {@code get} returns for this path
 */
public List<T> get(String path) {
  final String address = path;
  return get(address, path, null, null);
}

/**
 * Returns the cached instances registered under {@code address}, populating the cache
 * through {@code set} first when the address is not present yet.
 *
 * @param address the key of the cache entry
 * @param path the path passed on to {@code set}
 * @param jsonFragment the fragment passed on to {@code set}
 * @param clazz the target class passed on to {@code set}
 * @return the cache entry stored under {@code address} (may be {@code null})
 */
public List<T> get(String address, String path, Object jsonFragment, Class clazz) {
  final boolean alreadyCached = cache.containsKey(address);
  if (alreadyCached) {
    return cache.get(address);
  }
  set(address, path, jsonFragment, clazz);
  return cache.get(address);
}

@Override
protected void set(String address, String path, Object jsonFragment, Class clazz) {
List<T> instances = null;
Object value = read(path, jsonFragment);
if (value != null) {
if (clazz == null) {
instances = (List<T>) JsonUtils.extractFieldInstanceList(value, recordId, path);
instances = (List<T>) JsonUtils.extractFieldInstanceList(value, recordId, path, false);
} else {
if (value instanceof JSONArray) {
typedCache.put(address, clazz.cast(((JSONArray) value).get(0)));
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package de.gwdg.metadataqa.api.model.selector;

import de.gwdg.metadataqa.api.json.DataElement;
import de.gwdg.metadataqa.api.model.XmlFieldInstance;

import java.io.Serializable;
Expand All @@ -11,6 +12,7 @@ public interface Selector<T extends XmlFieldInstance> extends Serializable {
Object read(String path, Object jsonFragment);

List<T> get(String path);
List<T> get(DataElement dataElement);
// <E> E get(String path, Class<E> clazz);
List<T> get(String address, String path, Object jsonFragment);
List<T> get(String address, String path, Object jsonFragment, Class clazz);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ public void update(Selector cache, FieldCounter<RuleCheckerOutput> results, Rule

var allPassed = true;
var isNA = false;
List<XmlFieldInstance> instances = cache.get(field.getPath());
List<XmlFieldInstance> instances = cache.get(field);
if (instances != null && !instances.isEmpty()) {
FieldCounter<RuleCheckerOutput> localResults = new FieldCounter<>();
for (RuleChecker checker : checkers) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ public void update(Selector cache, FieldCounter<RuleCheckerOutput> results, Rule

var allPassed = true;
var isNA = false;
List<XmlFieldInstance> instances = cache.get(field.getPath());
List<XmlFieldInstance> instances = cache.get(field);
if (instances != null && !instances.isEmpty()) {
FieldCounter<RuleCheckerOutput> localResults = new FieldCounter<>();
for (RuleChecker checker : checkers) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public void update(Selector cache, FieldCounter<RuleCheckerOutput> results, Rule

var allPassed = false;
var isNA = false;
List<XmlFieldInstance> instances = cache.get(field.getPath());
List<XmlFieldInstance> instances = cache.get(field);
if (instances != null && !instances.isEmpty()) {
FieldCounter<RuleCheckerOutput> localResults = new FieldCounter<>();
for (RuleChecker checker : checkers) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public void update(Selector cache, FieldCounter<RuleCheckerOutput> results, Rule
var isNA = true;
int instanceCount = 0;
int failureCount = 0;
List<XmlFieldInstance> instances = cache.get(field.getPath());
List<XmlFieldInstance> instances = cache.get(field);
if (instances != null && !instances.isEmpty()) {
for (XmlFieldInstance instance : instances) {
if (instance.hasValue()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ public void update(Selector cache, FieldCounter<RuleCheckerOutput> localResults,

var allPassed = true;
var isNA = true;
List<XmlFieldInstance> instances = cache.get(field.getPath());
List<XmlFieldInstance> instances = cache.get(field);
if (instances != null && !instances.isEmpty()) {
for (XmlFieldInstance instance : instances) {
if (instance.hasValue()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ public void update(Selector cache, FieldCounter<RuleCheckerOutput> results, Rule

var allPassed = true;
var isNA = true;
List<XmlFieldInstance> instances = cache.get(field.getPath());
List<XmlFieldInstance> instances = cache.get(field);
if (instances != null && !instances.isEmpty()) {
for (XmlFieldInstance instance : instances) {
if (instance.hasValue()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public void update(Selector cache, FieldCounter<RuleCheckerOutput> results, Rule

var allPassed = false;
var isNA = true;
List<XmlFieldInstance> instances = cache.get(field.getPath());
List<XmlFieldInstance> instances = cache.get(field);
if (instances != null && !instances.isEmpty()) {
for (XmlFieldInstance instance : instances) {
if (instance.hasValue()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public void update(Selector cache, FieldCounter<RuleCheckerOutput> results, Rule

var allPassed = true;
var isNA = true;
List<XmlFieldInstance> instances = cache.get(field.getPath());
List<XmlFieldInstance> instances = cache.get(field);
if (instances != null && !instances.isEmpty()) {
for (XmlFieldInstance instance : instances) {
if (instance.hasValue()) {
Expand Down
Loading

0 comments on commit 01f9495

Please sign in to comment.