Skip to content

Commit

Permalink
Fix test for first entry serialization (#54)
Browse files Browse the repository at this point in the history
* Fix test for first entry serialization

* Update and run linter on just optimake files, enable in CI

* Improve comparisons to reference data

* Fix bug where property files had to use the same ID format

* Update src/optimake/convert.py
  • Loading branch information
ml-evs authored Apr 16, 2024
1 parent 66b52e3 commit 83bc873
Show file tree
Hide file tree
Showing 9 changed files with 51 additions and 27 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ jobs:
pip install -U setuptools wheel
pip install -e .[tests,dev]
- name: Run linters
run: |
pre-commit run --all-files
- name: Run tests
run: pytest -vv --cov-report=xml --cov-report=term ./tests

Expand Down
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
default_language_version:
python: python3.10

exclude: "scripts|src/optimade_launch"

repos:
- repo: https://github.com/ambv/black
rev: 23.3.0
Expand Down
1 change: 1 addition & 0 deletions src/optimake/archive/cli.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import click

from .scan_records import scan_records


Expand Down
3 changes: 1 addition & 2 deletions src/optimake/archive/scan_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
DEFAULT_ARCHIVE_URL = "https://archive.materialscloud.org/"



def process_records(records: list, archive_url: str=DEFAULT_ARCHIVE_URL):
def process_records(records: list, archive_url: str = DEFAULT_ARCHIVE_URL):
"""
Scan the Materials Cloud Archive entries, read the file info
and check if there is a file called "optimade.y(ml|aml)".
Expand Down
4 changes: 3 additions & 1 deletion src/optimake/cli.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import argparse
from pathlib import Path

from optimake.convert import convert_archive


def main():
parser = argparse.ArgumentParser(
prog="optimake",
description="Use an `optimade.yaml` config to describe archived data and create a OPTIMADE JSONL file for ingestion as an OPTIMADE API."
description="Use an `optimade.yaml` config to describe archived data and create a OPTIMADE JSONL file for ingestion as an OPTIMADE API.",
)
parser.add_argument("archive_path", help="The path to the archive to ingest.")
parser.add_argument("--jsonl-path", help="The path to write the JSONL file to.")
Expand Down
21 changes: 11 additions & 10 deletions src/optimake/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,19 +318,20 @@ def _parse_and_assign_properties(

# Look for precisely matching IDs, or 'filename' matches
for id in optimade_entries:

property_entry_id = id
if id not in parsed_properties:
# detect any other compatible IDs; either those matching immutable ID or those matching the filename rule
property_entry_id = optimade_entries[id]["attributes"].get("immutable_id", None)
if property_entry_id is None:
# try to find a matching ID based on the filename
property_entry_id = id.split("/")[-1].split(".")[0]
if property_entry_id not in parsed_properties:
raise RuntimeError(
f"Found {id!r} or {property_entry_id!r} in entries but not in properties {parsed_properties.keys()=}"
)

# Loop over all defined properties and assign them to the entry, setting to None if missing
# Also cast types if provided
for property in all_property_fields:
# Loop over all defined properties and assign them to the entry, setting to None if missing
# Also cast types if provided
value = parsed_properties[property_entry_id].get(property, None)
# Look up both IDs: the file path-based ID or the ergonomic one
# Different property sources can use different ID schemes internally
value = parsed_properties.get(property_entry_id, {}).get(
property, None
) or parsed_properties.get(id, {}).get(property, None)
if property not in property_def_dict:
warnings.warn(f"Missing property definition for {property=}")
continue
Expand Down
4 changes: 3 additions & 1 deletion src/optimake/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ def load_csv_file(
return df.to_dict(orient="index")


PROPERTY_PARSERS: dict[str, list[Callable[[Path], Any]]] = {
PROPERTY_PARSERS: dict[
str, list[Callable[[Path, list[PropertyDefinition] | None], Any]]
] = {
".csv": [load_csv_file],
}

Expand Down
38 changes: 26 additions & 12 deletions tests/test_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
import shutil
from pathlib import Path

import numpy as np
import pytest
from optimade.models import EntryInfoResource

from optimake.convert import convert_archive

EXAMPLE_ARCHIVES = (Path(__file__).parent.parent / "examples").glob("*")
Expand All @@ -25,7 +25,7 @@ def test_convert_example_archives(archive_path, tmp_path):

jsonl_path = convert_archive(tmp_path)
assert jsonl_path.exists()

jsonl_path_custom = convert_archive(tmp_path, jsonl_path=tmp_path / "test.jsonl")
assert jsonl_path_custom.exists()

Expand Down Expand Up @@ -60,16 +60,30 @@ def test_convert_example_archives(archive_path, tmp_path):
False
), "No structures found in archive but test first entry was provided"

# @ml-evs: species is the only key that can be written in any order, so here we
# just sort before comparing. This will be fixed in the next optimade-python-tools release.
if species := next_entry.get("attributes", {}).get("species"):
next_entry["attributes"]["species"] = sorted(
species, key=lambda x: x["name"]
)

for key in ("id", "type", "relationships"):
assert next_entry[key] == first_entry[key]

json.dumps(first_entry["attributes"]) == json.dumps(
next_entry["attributes"]
)
def check_arrays(reference, test, field):
    """Compare one numerical array attribute of two entries, then drop it from both.

    The field is removed from the ``"attributes"`` dict of *both* entries so
    that a later exact comparison of the remaining attributes sees the two
    dicts in the same state.

    Args:
        reference: Entry dict holding the expected data under ``"attributes"``.
        test: Entry dict holding the data under test under ``"attributes"``.
        field: Name of the array-valued attribute to compare.

    Raises:
        AssertionError: If the field is present (or null) in only one of the
            entries, or if the arrays differ beyond the default tolerance of
            ``np.testing.assert_array_almost_equal``.
    """
    # A private sentinel distinguishes "key absent" from an explicit null.
    missing = object()
    ref_array = reference["attributes"].pop(field, missing)
    test_array = test["attributes"].pop(field, missing)

    if ref_array is missing or ref_array is None:
        # Nothing numerical to compare: the test entry must agree exactly
        # (absent vs. absent, or null vs. null).
        assert (
            test_array is ref_array
        ), f"Field {field!r} is absent/null in the reference but not in the test entry"
        return

    assert (
        test_array is not missing and test_array is not None
    ), f"Field {field!r} is present in the reference but absent/null in the test entry"
    # Explicit None/sentinel checks (rather than truthiness) mean empty arrays
    # are still compared and popped from both sides.
    np.testing.assert_array_almost_equal(ref_array, test_array)

# check JSON serialization of attributes compared to reference data, handling species and numerical arrays separately
array_fields = ["cartesian_site_positions", "lattice_vectors"]
for field in array_fields:
check_arrays(first_entry, next_entry, field)
first_entry.pop(field, None)
next_entry.pop(field, None)

first_entry_species = first_entry["attributes"].pop("species", None)
next_entry_species = next_entry["attributes"].pop("species", None)
if first_entry_species:
assert json.dumps(
sorted(first_entry_species, key=lambda _: _["name"])
) == json.dumps(sorted(next_entry_species, key=lambda _: _["name"]))

assert json.dumps(
first_entry["attributes"], sort_keys=True, indent=2
) == json.dumps(next_entry["attributes"], sort_keys=True, indent=2)
1 change: 0 additions & 1 deletion tests/test_yaml.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from pathlib import Path

import pytest

from optimake.config import Config

EXAMPLE_YAMLS = (Path(__file__).parent.parent / "examples").glob("*/optimade.yaml")
Expand Down

0 comments on commit 83bc873

Please sign in to comment.