From 83bc873db39a2ae41d43b3a5223e71fbdd713fd2 Mon Sep 17 00:00:00 2001 From: Matthew Evans <7916000+ml-evs@users.noreply.github.com> Date: Tue, 16 Apr 2024 18:18:23 +0100 Subject: [PATCH] Fix test for first entry serialization (#54) * Fix test for first entry serialization * Update and run linter on just optimake files, enable in CI * Improve comparisons to reference data * Fix bug where property files had to use same ID format * Update src/optimake/convert.py --- .github/workflows/ci.yml | 4 +++ .pre-commit-config.yaml | 2 ++ src/optimake/archive/cli.py | 1 + src/optimake/archive/scan_records.py | 3 +-- src/optimake/cli.py | 4 ++- src/optimake/convert.py | 21 +++++++-------- src/optimake/parsers.py | 4 ++- tests/test_convert.py | 38 +++++++++++++++++++--------- tests/test_yaml.py | 1 - 9 files changed, 51 insertions(+), 27 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1f6f3a5..ea9718e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,6 +38,10 @@ jobs: pip install -U setuptools wheel pip install -e .[tests,dev] + - name: Run linters + run: | + pre-commit run --all-files + - name: Run tests run: pytest -vv --cov-report=xml --cov-report=term ./tests diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ad7bb64..3d1a267 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,8 @@ default_language_version: python: python3.10 +exclude: "scripts|src/optimade_launch" + repos: - repo: https://github.com/ambv/black rev: 23.3.0 diff --git a/src/optimake/archive/cli.py b/src/optimake/archive/cli.py index ab1d9f4..c1db4ba 100644 --- a/src/optimake/archive/cli.py +++ b/src/optimake/archive/cli.py @@ -1,4 +1,5 @@ import click + from .scan_records import scan_records diff --git a/src/optimake/archive/scan_records.py b/src/optimake/archive/scan_records.py index ef68b19..cc92908 100644 --- a/src/optimake/archive/scan_records.py +++ b/src/optimake/archive/scan_records.py @@ -6,8 +6,7 @@ DEFAULT_ARCHIVE_URL = "https://archive.materialscloud.org/" - -def process_records(records: list, archive_url: str=DEFAULT_ARCHIVE_URL): +def process_records(records: list, archive_url: str = DEFAULT_ARCHIVE_URL): """ Scan the Materials Cloud Archive entries, read the file info and check if there is a file called "optimade.y(ml|aml)". diff --git a/src/optimake/cli.py b/src/optimake/cli.py index cbfaeb3..6298dc5 100644 --- a/src/optimake/cli.py +++ b/src/optimake/cli.py @@ -1,11 +1,13 @@ import argparse from pathlib import Path + from optimake.convert import convert_archive + def main(): parser = argparse.ArgumentParser( prog="optimake", - description="Use an `optimade.yaml` config to describe archived data and create a OPTIMADE JSONL file for ingestion as an OPTIMADE API." + description="Use an `optimade.yaml` config to describe archived data and create a OPTIMADE JSONL file for ingestion as an OPTIMADE API.", ) parser.add_argument("archive_path", help="The path to the archive to ingest.") parser.add_argument("--jsonl-path", help="The path to write the JSONL file to.") diff --git a/src/optimake/convert.py b/src/optimake/convert.py index 026dfd1..9d4caeb 100644 --- a/src/optimake/convert.py +++ b/src/optimake/convert.py @@ -318,19 +318,20 @@ def _parse_and_assign_properties( # Look for precisely matching IDs, or 'filename' matches for id in optimade_entries: - - property_entry_id = id - if id not in parsed_properties: + # detect any other compatible IDs; either those matching immutable ID or those matching the filename rule + property_entry_id = optimade_entries[id]["attributes"].get("immutable_id", None) + if property_entry_id is None: + # try to find a matching ID based on the filename property_entry_id = id.split("/")[-1].split(".")[0] - if property_entry_id not in parsed_properties: - raise RuntimeError( - f"Found {id!r} or {property_entry_id!r} in entries but not in properties {parsed_properties.keys()=}" - ) + # Loop over all defined properties and assign them to the entry, setting to None if missing + # Also cast types if provided for property in all_property_fields: - # Loop over all defined properties and assign them to the entry, setting to None if missing - # Also cast types if provided - value = parsed_properties[property_entry_id].get(property, None) + # Look up both IDs: the file path-based ID or the ergonomic one + # Different property sources can use different ID schemes internally + value = parsed_properties.get(property_entry_id, {}).get( + property, None + ) or parsed_properties.get(id, {}).get(property, None) if property not in property_def_dict: warnings.warn(f"Missing property definition for {property=}") continue diff --git a/src/optimake/parsers.py b/src/optimake/parsers.py index 4ed18fd..ac428c8 100644 --- a/src/optimake/parsers.py +++ b/src/optimake/parsers.py @@ -61,7 +61,9 @@ def load_csv_file( return df.to_dict(orient="index") -PROPERTY_PARSERS: dict[str, list[Callable[[Path], Any]]] = { +PROPERTY_PARSERS: dict[ + str, list[Callable[[Path, list[PropertyDefinition] | None], Any]] +] = { ".csv": [load_csv_file], } diff --git a/tests/test_convert.py b/tests/test_convert.py index 0da1cb4..d6d5463 100644 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -2,9 +2,9 @@ import shutil from pathlib import Path +import numpy as np import pytest from optimade.models import EntryInfoResource - from optimake.convert import convert_archive EXAMPLE_ARCHIVES = (Path(__file__).parent.parent / "examples").glob("*") @@ -25,7 +25,7 @@ def test_convert_example_archives(archive_path, tmp_path): jsonl_path = convert_archive(tmp_path) assert jsonl_path.exists() - + jsonl_path_custom = convert_archive(tmp_path, jsonl_path=tmp_path / "test.jsonl") assert jsonl_path_custom.exists() @@ -60,16 +60,30 @@ def test_convert_example_archives(archive_path, tmp_path): False ), "No structures found in archive but test first entry was provided" - # @ml-evs: species is the only key that can be written in any order, so here we - # just sort before comparing. This will be fixed in the next optimade-python-tools - if species := next_entry.get("attributes", {}).get("species"): - next_entry["attributes"]["species"] = sorted( - species, key=lambda x: x["name"] - ) - for key in ("id", "type", "relationships"): assert next_entry[key] == first_entry[key] - json.dumps(first_entry["attributes"]) == json.dumps( - next_entry["attributes"] - ) + def check_arrays(reference, test, field): + ref_array = reference["attributes"].pop(field, None) + if ref_array: + np.testing.assert_array_almost_equal( + ref_array, test["attributes"].pop(field) + ) + + # check JSON serialization of attributes compared to reference data, handling species and numerical arrays separately + array_fields = ["cartesian_site_positions", "lattice_vectors"] + for field in array_fields: + check_arrays(first_entry, next_entry, field) + first_entry.pop(field, None) + next_entry.pop(field, None) + + first_entry_species = first_entry["attributes"].pop("species", None) + next_entry_species = next_entry["attributes"].pop("species", None) + if first_entry_species: + assert json.dumps( + sorted(first_entry_species, key=lambda _: _["name"]) + ) == json.dumps(sorted(next_entry_species, key=lambda _: _["name"])) + + assert json.dumps( + first_entry["attributes"], sort_keys=True, indent=2 + ) == json.dumps(next_entry["attributes"], sort_keys=True, indent=2) diff --git a/tests/test_yaml.py b/tests/test_yaml.py index 08c4828..71747ee 100644 --- a/tests/test_yaml.py +++ b/tests/test_yaml.py @@ -1,7 +1,6 @@ from pathlib import Path import pytest - from optimake.config import Config EXAMPLE_YAMLS = (Path(__file__).parent.parent / "examples").glob("*/optimade.yaml")