Add support for importing MCF files in RSI. (#292)

datacommonsorg · Apr 1, 2024 · 50504a2 · 50504a2
1 parent 5dcf473
commit 50504a2
Show file tree

Hide file tree

Showing 19 changed files with 870 additions and 31 deletions.
diff --git a/simple/kg_util/README.md b/simple/kg_util/README.md
@@ -0,0 +1,22 @@
+# Libraries for working with Data Commons KG
+
+## MCF Parser
+
+`mcf_parser` includes helper functions to parse files in [MCF
+format](https://github.com/datacommonsorg/data/blob/master/docs/mcf_format.md).
+
+### `mcf_parser.mcf_to_triples(mcf_file)`
+
+Parses file containing a Node MCF graph into triples.
+
+Arguments:
+
+-  `mcf_file (file object)` - An MCF File object opened for read.
+
+Returns an Iterable of triples. Each triple has four string values:
+
+  ```
+  [ <subject-id>, <property>, <object-id or object-value>, <'ID' | 'VALUE'>]
+  ```
+
+Also refer to `mcf_parser_test.py` for usage.
diff --git a/simple/kg_util/__init__.py b/simple/kg_util/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/simple/kg_util/mcf_parser.py b/simple/kg_util/mcf_parser.py
@@ -0,0 +1,242 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Library to parse an MCF file into triples."""
+
+# TODO: Support TMCF parsing.
+# TODO: Support properties from non-DC/-schema.org namespace.
+# TODO: Support complex values enclosed in [] (LatLng, Quantity, QuantityRange).
+# TODO: Support provenance and other props in Context block.
+# TODO: Add API that returns a map-like structure.
+
+from csv import reader
+import dataclasses
+import sys
+
+_VALUE = 'VALUE'
+_ID = 'ID'
+_node = 'Node'
+_context = 'Context'
+_dcid = 'dcid'
+_namespace = 'namespace'
+_dcid_prefix = 'dcid:'
+_dcs_prefix = 'dcs:'
+_schema_prefix = 'schema:'
+_local_prefix = 'l:'
+_delim = ':'
+
+#
+# Internal Functions
+#
+
+
+def _strip_quotes(s):
+  if s.startswith('"') and s.endswith('"'):
+    return s[1:-1]
+  return s
+
+
+def _strip_ns(v):
+  return v[v.find(_delim) + 1:]
+
+
+def _is_schema_ref_property(prop):
+  return prop in ('typeOf', 'subClassOf', 'subPropertyOf', 'rangeIncludes',
+                  'domainIncludes', 'specializationOf')
+
+
+def _is_common_ref_property(prop):
+  return (_is_schema_ref_property(prop) or
+          prop in ('location', 'observedNode', 'containedInPlace',
+                   'containedIn', 'populationType', 'measuredProperty',
+                   'measurementDenominator', 'populationGroup',
+                   'constraintProperties', 'measurementMethod', 'comparedNode'))
+
+
+def _is_global_ref(value):
+  return (value.startswith(_dcid_prefix) or value.startswith(_dcs_prefix) or
+          value.startswith(_schema_prefix))
+
+
+def _is_local_ref(value):
+  return value.startswith(_local_prefix)
+
+
+@dataclasses.dataclass
+class ParseContext:
+  """Context used for parser"""
+  # MCF file line number.
+  lno: int = 0
+  # Indicates we're within Context block.
+  in_context: bool = False
+  # Namespace map.
+  ns_map: dict = dataclasses.field(default_factory=dict)
+  # Current Node block name.
+  node: str = ''
+  # Indicates whether the current Node block includes dcid property.
+  has_dcid: bool = False
+
+
+def _as_err(msg, pc):
+  return 'Line ' + str(pc.lno) + ': ' + msg
+
+
+def _parse_value(prop, value, pc):
+  """Parses an MCF value string into a pair of value and type.
+
+    Args:
+        prop: Property name
+        value: Value string
+        pc: ParseContext instance
+
+    Returns:
+        Pair of value and type.
+    """
+  expect_ref = _is_common_ref_property(prop)
+
+  if value.startswith('"'):
+    assert len(value) > 2 and value.endswith('"'),\
+        _as_err('malformed string', pc)
+    value = _strip_quotes(value)
+    if not expect_ref:
+      return (value, _VALUE)
+
+  if _is_global_ref(value):
+    return (_strip_ns(value), _ID)
+
+  if _is_local_ref(value):
+    # For local ref, we retain the prefix.
+    return (value, _ID)
+
+  ns_prefix = value.split(':', 1)[0] + _delim
+  if ns_prefix != value and ns_prefix in pc.ns_map:
+    value = value.replace(ns_prefix, pc.ns_map[ns_prefix], 1)
+    return (value, _ID)
+
+  if expect_ref:
+    # If we're here, user likely forgot to add a "dcid:", "dcs:" or
+    # "schema:" prefix.  We cannot tell apart if they failed to add an local
+    # ref ('l:'), but we err on the side of user being careful about adding
+    # local refs and accept the MCF without failing.
+    return (value, _ID)
+
+  return (value, _VALUE)
+
+
+def _parse_values(prop, values_str, pc):
+  value_pairs = []
+  for values in reader([values_str]):
+    for value in values:
+      value_pairs.append(_parse_value(prop, value.strip(), pc))
+    break
+  return value_pairs
+
+
+def _update_ns_map(values_str, pc):
+  """Updates the namespace map with namespace values.
+
+    Args:
+        values_str: String of values for namespace prop from MCF.
+        pc: ParseContext instance.
+    """
+  for values in reader([values_str]):
+    for value in values:
+      value = _strip_quotes(value.strip())
+      parts = value.split('=', 1)
+      assert len(parts) == 2, _as_err('malformed namespace value', pc)
+      k = parts[0] + _delim
+      assert k not in pc.ns_map,\
+          _as_err('duplicate values for namespace prefix ' + k, pc)
+      pc.ns_map[k] = parts[1].rstrip('/') + '/'
+    break
+
+
+#
+# Public Functions
+#
+
+
+def mcf_to_triples(mcf_file):
+  """ Parses the file containing a Node MCF graph into triples.
+
+  Args:
+    mcf_file: Node MCF file object opened for read.
+
+  Returns:
+    An Iterable of triples. Each triple has four values:
+      [<subject-id>, <property>, <object-id or object-value>, <'ID' | 'VALUE'>]
+
+  On parse failures, throws AssertionError with a reference to offending line
+  number in the MCF file.
+  """
+
+  pc = ParseContext()
+  for line in mcf_file:
+    pc.lno += 1
+
+    line = line.strip()
+    if not line or line.startswith('//') or line.startswith('#'):
+      continue
+
+    parts = line.split(_delim, 1)
+    assert len(parts) == 2,\
+        _as_err('Malformed line without a colon delimiter', pc)
+    assert parts[0], _as_err('Malformed empty property', pc)
+    prop, val_str = parts[0].strip(), parts[1].strip()
+
+    if prop == _context:
+      # New Context block.
+      assert not pc.node, _as_err('found Context block after Node', pc)
+      pc.in_context = True
+
+    elif prop == _node:
+      # New Node block.
+      assert val_str, _as_err('Node with no name', pc)
+      assert ',' not in val_str, _as_err('Malformed Node name with ","', pc)
+      assert not val_str.startswith('"'),\
+          _as_err('Malformed Node name starting with "', pc)
+
+      # Finalize current node.
+      if (pc.node and not pc.has_dcid and _is_global_ref(pc.node)):
+        yield [pc.node, _dcid, _strip_ns(pc.node), _VALUE]
+
+      # Update to new node.
+      pc.node = val_str
+      pc.in_context = False
+      pc.has_dcid = False
+
+    elif pc.in_context:
+      # Processing inside Context block.
+      assert prop == _namespace,\
+          _as_err('Context block only supports "namespace" property', pc)
+      _update_ns_map(val_str, pc)
+
+    else:
+      # Processing inside Node block.
+      assert pc.node, _as_err('Prop-Values before Node or Context block', pc)
+
+      for vp in _parse_values(prop, val_str, pc):
+        yield [pc.node, prop, vp[0], vp[1]]
+
+      if prop == _dcid:
+        pc.has_dcid = True
+
+  # Finalize current node.
+  if (pc.node and not pc.has_dcid and _is_global_ref(pc.node)):
+    yield [pc.node, _dcid, _strip_ns(pc.node), _VALUE]
+
+
+if __name__ == '__main__':
+  with open(sys.argv[1], 'r') as f:
+    for t in mcf_to_triples(f):
+      print(t)