Changes to structured_data to respect uniqueItems for arrays.

4dn-dcic · Dec 12, 2023 · 8f4f694
1 parent a447f3f
commit 8f4f694
Show file tree

Hide file tree

Showing 4 changed files with 37 additions and 20 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -10,6 +10,7 @@ Change Log
 =====
 * Minor fix to misc_utils.to_integer to handle float strings.
 * Minor fix to structured_data to accumulate unique resolved_refs across schemas.
+* Changes to structured_data to respect uniqueItems for arrays.
 
 
 8.5.0

diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py
@@ -1469,28 +1469,33 @@ def string_list(s):
     return [p for p in [part.strip() for part in s.split(",")] if p]
 
 
-def split_string(value: str, delimiter: str, escape: Optional[str] = None) -> List[str]:
+def split_string(value: str, delimiter: str, escape: Optional[str] = None, unique: bool = False) -> List[str]:
     """
     Splits the given string into an array of string based on the given delimiter, and an optional escape character.
     """
     if not isinstance(value, str) or not (value := value.strip()):
         return []
-    if not isinstance(escape, str) or not escape:
-        return [item.strip() for item in value.split(delimiter)]
     result = []
+    if not isinstance(escape, str) or not escape:
+        for item in value.split(delimiter):
+            if (item := item.strip()) and (unique is not True or item not in result):
+                result.append(item)
+        return result
     item = r""
     escaped = False
     for c in value:
         if c == delimiter and not escaped:
-            result.append(item.strip())
+            if (item := item.strip()) and (unique is not True or item not in result):
+                result.append(item)
             item = r""
         elif c == escape and not escaped:
             escaped = True
         else:
             item += c
             escaped = False
-    result.append(item.strip())
-    return [item for item in result if item]
+    if (item := item.strip()) and (unique is not True or item not in result):
+        result.append(item)
+    return result
 
 
 def right_trim(list_or_tuple: Union[List[Any], Tuple[Any]],

diff --git a/dcicutils/structured_data.py b/dcicutils/structured_data.py
@@ -237,7 +237,7 @@ def parse_components(column_components: List[str], path: List[Union[str, int]])
             return {array_name: array} if array_name else {column_component: value}
 
         def set_value_internal(data: Union[dict, list], value: Optional[Any], src: Optional[str],
-                               path: List[Union[str, int]], mapv: Optional[Callable]) -> None:
+                               path: List[Union[str, int]], typeinfo: Optional[dict], mapv: Optional[Callable]) -> None:
 
             def set_value_backtrack_object(path_index: int, path_element: str) -> None:
                 nonlocal data, path, original_data
@@ -257,7 +257,7 @@ def set_value_backtrack_object(path_index: int, path_element: str) -> None:
                     set_value_backtrack_object(i, p)
                 data = data[p]
             if (p := path[-1]) == -1 and isinstance(value, str):
-                values = _split_array_string(value)
+                values = _split_array_string(value, unique=typeinfo.get("unique") if typeinfo else False)
                 if mapv:
                     values = [mapv(value, src) for value in values]
                 merge_objects(data, values)
@@ -288,11 +288,13 @@ def ensure_column_consistency(column_name: str) -> None:
         for column_name in column_names or []:
             ensure_column_consistency(column_name)
             rational_column_name = self._schema.rationalize_column_name(column_name) if self._schema else column_name
-            map_value_function = self._schema.get_map_value_function(rational_column_name) if self._schema else None
+            column_typeinfo = self._schema.get_typeinfo(rational_column_name) if self._schema else None
+            map_value_function = column_typeinfo.get("map") if column_typeinfo else None
             if (column_components := _split_dotted_string(rational_column_name)):
                 merge_objects(structured_row_template, parse_components(column_components, path := []), True)
-                self._set_value_functions[column_name] = (lambda data, value, src, path=path, mapv=map_value_function:
-                                                          set_value_internal(data, value, src, path, mapv))
+                self._set_value_functions[column_name] = (
+                    lambda data, value, src, path=path, typeinfo=column_typeinfo, mapv=map_value_function:
+                        set_value_internal(data, value, src, path, typeinfo, mapv))
         return structured_row_template
 
 
@@ -331,10 +333,7 @@ def unresolved_refs(self) -> List[dict]:
     def resolved_refs(self) -> List[str]:
         return list(self._resolved_refs)
 
-    def get_map_value_function(self, column_name: str) -> Optional[Any]:
-        return (self._get_typeinfo(column_name) or {}).get("map")
-
-    def _get_typeinfo(self, column_name: str) -> Optional[dict]:
+    def get_typeinfo(self, column_name: str) -> Optional[dict]:
         if isinstance(info := self._typeinfo.get(column_name), str):
             info = self._typeinfo.get(info)
         if not info and isinstance(info := self._typeinfo.get(self.unadorn_column_name(column_name)), str):
@@ -467,9 +466,15 @@ def _create_typeinfo(self, schema_json: dict, parent_key: Optional[str] = None)
                             raise Exception(f"Array of undefined or multiple types in JSON schema NOT supported: {key}")
                         raise Exception(f"Invalid array type specifier in JSON schema: {key}")
                     key = key + ARRAY_NAME_SUFFIX_CHAR
+                    if unique := (property_value.get("uniqueItems") is True):
+                        pass
                     property_value = array_property_items
                     property_value_type = property_value.get("type")
-                result.update(self._create_typeinfo(array_property_items, parent_key=key))
+                typeinfo = self._create_typeinfo(array_property_items, parent_key=key)
+                if unique:
+                    typeinfo[key]["unique"] = True
+                result.update(typeinfo)
+#               result.update(self._create_typeinfo(array_property_items, parent_key=key))
                 continue
             result[key] = {"type": property_value_type, "map": self._map_function({**property_value, "column": key})}
             if ARRAY_NAME_SUFFIX_CHAR in key:
@@ -615,5 +620,5 @@ def _split_dotted_string(value: str):
     return split_string(value, DOTTED_NAME_DELIMITER_CHAR)
 
 
-def _split_array_string(value: str):
-    return split_string(value, ARRAY_VALUE_DELIMITER_CHAR, ARRAY_VALUE_DELIMITER_ESCAPE_CHAR)
+def _split_array_string(value: str, unique: bool = False):
+    return split_string(value, ARRAY_VALUE_DELIMITER_CHAR, ARRAY_VALUE_DELIMITER_ESCAPE_CHAR, unique=unique)
diff --git a/test/test_misc_utils.py b/test/test_misc_utils.py
@@ -3593,8 +3593,8 @@ def test_json_lines_reader_lists():
 
 
 def test_split_array_string():
-    def split_array_string(value: str) -> List[str]:
-        return split_string(value, "|", "\\")
+    def split_array_string(value: str, unique: bool = False) -> List[str]:
+        return split_string(value, "|", "\\", unique=unique)
     assert split_array_string(r"abc|def|ghi") == ["abc", "def", "ghi"]
     assert split_array_string(r"abc\|def|ghi") == ["abc|def", "ghi"]
     assert split_array_string(r"abc\\|def|ghi") == ["abc\\", "def", "ghi"]
@@ -3609,6 +3609,12 @@ def split_array_string(value: str) -> List[str]:
     assert split_array_string(r"|") == []
     assert split_array_string(r"\|") == ["|"]
     assert split_array_string(r"\\|") == ["\\"]
+    assert split_array_string(r"abc|def|abc|ghi", unique=False) == ["abc", "def", "abc", "ghi"]
+    assert split_array_string(r"abc|def|abc|ghi", unique=True) == ["abc", "def", "ghi"]
+    assert split_array_string(r"abc\\\|def\|ghi|jkl|mno|jkl", unique=False) == ["abc\\|def|ghi", "jkl", "mno", "jkl"]
+    assert split_array_string(r"abc\\\|def\|ghi|jkl|mno|jkl", unique=True) == ["abc\\|def|ghi", "jkl", "mno"]
+    assert split_string(r"abc|def|ghi|def", delimiter="|", unique=False) == ["abc", "def", "ghi", "def"]
+    assert split_string(r"abc|def|ghi|def", delimiter="|", unique=True) == ["abc", "def", "ghi"]
 
 
 def test_merge_objects_1():