Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for just passing a JSONL file #25

Merged
merged 6 commits into from
Oct 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions src/mc_optimade/examples/bzipped_pymatgen/optimade.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@ config_version: 0.1.0
database_description: >-
This database contains some bzipped pymatgen objects.

provider_prefix: mcloudarchive

entries:
- entry_type: structures
entry_paths:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id": "structures.zip/structures/cifs/cc1a41b1-a841-4818-baf1-a6c1441dc52a.cif", "type": "structures", "links": null, "meta": null, "attributes": {"immutable_id": null, "last_modified": null, "elements": ["B", "Ir", "Mg", "Zn"], "nelements": 4, "elements_ratios": [0.2, 0.5, 0.2, 0.1], "chemical_formula_descriptive": "B4Ir10Mg4Zn2", "chemical_formula_reduced": "B2Ir5Mg2Zn", "chemical_formula_hill": null, "chemical_formula_anonymous": "A5B2C2D", "dimension_types": [1, 1, 1], "nperiodic_dimensions": 3, "lattice_vectors": [[9.4623270052342, 0.0, 0.0], [0.0, 9.4623270052342, 0.0], [0.0, 0.0, 2.9327245575729]], "cartesian_site_positions": [[3.0564738352234, 1.6746896673437002, 0.0], [7.7876373378905, 3.0564738352234, 0.0], [6.4058531699108, 7.7876373378905, 0.0], [1.6746896673437002, 6.4058531699108, 0.0], [4.7311635025671, 4.7311635025671, 0.0], [0.0, 0.0, 0.0], [8.2578546699396, 5.9356358378617, 0.0], [3.5266911673725, 8.2578546699396, 0.0], [1.2044723352946, 3.5266911673725, 0.0], [5.9356358378617, 1.2044723352946, 0.0], [4.0683890403188, 6.7751547390362, 1.4663622787864], [8.7995525428859, 7.418335768765, 1.4663622787864], [2.6871722661979005, 4.0683890403188, 1.4663622787864], [7.418335768765, 0.66277446224826, 1.4663622787864], [5.3939379649153, 2.6871722661979005, 1.4663622787864], [0.66277446224826, 2.0439912363691, 1.4663622787864], [6.7751547390362, 5.3939379649153, 1.4663622787864], [2.0439912363691, 8.7995525428859, 1.4663622787864], [4.7311635025671, 0.0, 1.4663622787864], [0.0, 4.7311635025671, 1.4663622787864]], "nsites": 20, "species": [{"name": "Ir", "chemical_symbols": ["Ir"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "Mg", "chemical_symbols": ["Mg"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "B", "chemical_symbols": ["B"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, 
{"name": "Zn", "chemical_symbols": ["Zn"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}], "species_at_sites": ["Mg", "Mg", "Mg", "Mg", "Zn", "Zn", "B", "B", "B", "B", "Ir", "Ir", "Ir", "Ir", "Ir", "Ir", "Ir", "Ir", "Ir", "Ir"], "assemblies": null, "structure_features": [], "_mcloudarchive_energy": -0.45, "_mcloudarchive_property_b": 0.86, "_mcloudarchive_structure_description": "describing something else"}, "relationships": null}
5 changes: 5 additions & 0 deletions src/mc_optimade/examples/direct_from_jsonl/example.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{"x-optimade": {"api_version": "1.2.0"}}
{"formats": ["json"], "description": "structures", "properties": {"_mcloudarchive_energy": {"description": "The total energy per atom as computed by DFT", "unit": "eV/atom", "sortable": null, "type": "float"}, "_mcloudarchive_property_b": {"description": "Alias for some more complicated property_b", "unit": null, "sortable": null, "type": "float"}, "_mcloudarchive_structure_description": {"description": "Provides a human-readable description for this particular entry_type", "unit": null, "sortable": null, "type": "string"}}, "output_fields_by_format": {"json": ["_mcloudarchive_energy", "_mcloudarchive_property_b", "_mcloudarchive_structure_description"]}}
{"id": "structures.zip/structures/cifs/cc1a41b1-a841-4818-baf1-a6c1441dc52a.cif", "type": "structures", "links": null, "meta": null, "attributes": {"immutable_id": null, "last_modified": null, "elements": ["B", "Ir", "Mg", "Zn"], "nelements": 4, "elements_ratios": [0.2, 0.5, 0.2, 0.1], "chemical_formula_descriptive": "B4Ir10Mg4Zn2", "chemical_formula_reduced": "B2Ir5Mg2Zn", "chemical_formula_hill": null, "chemical_formula_anonymous": "A5B2C2D", "dimension_types": [1, 1, 1], "nperiodic_dimensions": 3, "lattice_vectors": [[9.4623270052342, 0.0, 0.0], [0.0, 9.4623270052342, 0.0], [0.0, 0.0, 2.9327245575729]], "cartesian_site_positions": [[3.0564738352234, 1.6746896673437002, 0.0], [7.7876373378905, 3.0564738352234, 0.0], [6.4058531699108, 7.7876373378905, 0.0], [1.6746896673437002, 6.4058531699108, 0.0], [4.7311635025671, 4.7311635025671, 0.0], [0.0, 0.0, 0.0], [8.2578546699396, 5.9356358378617, 0.0], [3.5266911673725, 8.2578546699396, 0.0], [1.2044723352946, 3.5266911673725, 0.0], [5.9356358378617, 1.2044723352946, 0.0], [4.0683890403188, 6.7751547390362, 1.4663622787864], [8.7995525428859, 7.418335768765, 1.4663622787864], [2.6871722661979005, 4.0683890403188, 1.4663622787864], [7.418335768765, 0.66277446224826, 1.4663622787864], [5.3939379649153, 2.6871722661979005, 1.4663622787864], [0.66277446224826, 2.0439912363691, 1.4663622787864], [6.7751547390362, 5.3939379649153, 1.4663622787864], [2.0439912363691, 8.7995525428859, 1.4663622787864], [4.7311635025671, 0.0, 1.4663622787864], [0.0, 4.7311635025671, 1.4663622787864]], "nsites": 20, "species": [{"name": "Ir", "chemical_symbols": ["Ir"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "Mg", "chemical_symbols": ["Mg"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "B", "chemical_symbols": ["B"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, 
{"name": "Zn", "chemical_symbols": ["Zn"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}], "species_at_sites": ["Mg", "Mg", "Mg", "Mg", "Zn", "Zn", "B", "B", "B", "B", "Ir", "Ir", "Ir", "Ir", "Ir", "Ir", "Ir", "Ir", "Ir", "Ir"], "assemblies": null, "structure_features": [], "_mcloudarchive_energy": -0.45, "_mcloudarchive_property_b": 0.86, "_mcloudarchive_structure_description": "describing something else"}, "relationships": null}
{"id": "structures.zip/structures/cifs/991bec7a-b3a8-49af-ba6d-be5afd685cd4.cif", "type": "structures", "links": null, "meta": null, "attributes": {"immutable_id": null, "last_modified": null, "elements": ["C", "Sr"], "nelements": 2, "elements_ratios": [0.5, 0.5], "chemical_formula_descriptive": "CSr", "chemical_formula_reduced": "CSr", "chemical_formula_hill": null, "chemical_formula_anonymous": "AB", "dimension_types": [1, 1, 1], "nperiodic_dimensions": 3, "lattice_vectors": [[4.006498849786306, 0.0, 0.0], [2.0032494248931525, 3.469729784148075, 0.0], [2.0032494248931525, 1.1565765947160247, 3.271292612341386]], "cartesian_site_positions": [[0.0, 0.0, 0.0], [4.006498849786305, 2.31315318943205, 1.635646306170693]], "nsites": 2, "species": [{"name": "C", "chemical_symbols": ["C"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "Sr", "chemical_symbols": ["Sr"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}], "species_at_sites": ["Sr", "C"], "assemblies": null, "structure_features": [], "_mcloudarchive_energy": -0.55, "_mcloudarchive_property_b": 1.01, "_mcloudarchive_structure_description": null}, "relationships": null}
{"id": "structures.zip/structures/cifs/55c564f6-ac6a-4122-b8d9-0ad9fe61e961.cif", "type": "structures", "links": null, "meta": null, "attributes": {"immutable_id": null, "last_modified": null, "elements": ["Ba", "C", "N", "S"], "nelements": 4, "elements_ratios": [0.14285714285714285, 0.2857142857142857, 0.2857142857142857, 0.2857142857142857], "chemical_formula_descriptive": "C4Ba2N4S4", "chemical_formula_reduced": "BaC2N2S2", "chemical_formula_hill": null, "chemical_formula_anonymous": "A2B2C2D", "dimension_types": [1, 1, 1], "nperiodic_dimensions": 3, "lattice_vectors": [[6.3587627540404945, 0.0, 0.0], [-2.672647488887009, 5.769819681958754, 0.0], [0.25844951934994664, -0.16511343006546234, 8.71190314896161]], "cartesian_site_positions": [[3.4987802863851005, 5.049341739457014, 6.533927361693402], [0.4457844982025419, 0.5553645123824795, 2.177975787264444], [0.37416734784252487, 2.642448780492868, 5.291889331690525], [2.6281157156591126, 1.202488098280357, 7.775965391700042], [3.5703974366609055, 2.962257471400425, 3.420013817271085], [1.316449068841148, 4.402218153614962, 0.9359377571616379], [0.3512600296798777, 4.156339511491648, 5.900755701251229], [4.011419364830723, 1.818004876809729, 7.167099022143105], [3.5933047547393455, 1.4483667404554434, 2.8111474477141454], [-0.06685458032729186, 3.786701375083563, 1.5448041268185058], [0.3773832460379156, 1.5350786992068879, 4.8597303327393915], [1.621957615426957, 0.7399681351855376, 8.208124390751108], [3.567181538468686, 4.069627552684378, 3.8521728163221476], [2.3226071690796486, 4.864738116705728, 0.5037787583104332]], "nsites": 14, "species": [{"name": "C", "chemical_symbols": ["C"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "S", "chemical_symbols": ["S"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "Ba", "chemical_symbols": ["Ba"], "concentration": [1.0], "mass": null, 
"original_name": null, "attached": null, "nattached": null}, {"name": "N", "chemical_symbols": ["N"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}], "species_at_sites": ["Ba", "Ba", "C", "C", "C", "C", "S", "S", "S", "S", "N", "N", "N", "N"], "assemblies": null, "structure_features": [], "_mcloudarchive_energy": -0.54, "_mcloudarchive_property_b": 0.99, "_mcloudarchive_structure_description": "some description"}, "relationships": null}
7 changes: 7 additions & 0 deletions src/mc_optimade/examples/direct_from_jsonl/optimade.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
config_version: 0.1.0

database_description: >-
This database contains some example structures, provided directly as an OPTIMADE JSON Lines file.

entries:
jsonl_path: example.jsonl
2 changes: 0 additions & 2 deletions src/mc_optimade/examples/folder_of_cifs/optimade.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@ config_version: 0.1.0
database_description: >-
This database contains some example CIFs.

provider_prefix: mcloudarchive

entries:
- entry_type: structures
entry_paths:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@ config_version: 0.1.0
database_description: >-
This database contains 3 xyz files, one of which should be excluded from the API.

provider_prefix: mcloudarchive

entries:
- entry_type: structures
entry_paths:
Expand Down
40 changes: 26 additions & 14 deletions src/mc_optimade/mc_optimade/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,14 +78,30 @@ class EntryConfig(BaseModel):

@validator("entry_type")
def check_optimade_entry_type(cls, v):
if v not in ("structures", "references") and not v.startswith("_"):
raise ValueError(
f"OPTIMADE entry type must be either 'structures', 'references', or contain a custom prefix, not {v}"
)
if not isinstance(v, JSONLConfig):
if v not in ("structures", "references") and not v.startswith("_"):
raise ValueError(
f"OPTIMADE entry type must be either 'structures', 'references', or contain a custom prefix, not {v}"
)

return v


class JSONLConfig(BaseModel):
    """A description of a single JSON Lines file that describes
    the target API.

    Attributes:
        file: Optional name of an archive that contains the JSONL data.
        jsonl_path: Path of the JSONL file, either inside that archive or
            directly in the entry when no archive is given.

    """

    # Explicit `None` default: configs may omit the archive entirely and
    # point straight at a JSONL file via `jsonl_path`.
    file: Optional[str] = Field(
        None,
        description="The archive filename containing the JSONL data to be parsed.",
    )

    jsonl_path: str = Field(
        description="The path of the JSON-L file within the archive (or directly in the entry, if `file` is `None`)."
    )


class Config(BaseModel):
"""This class describes the `optimade.yaml` file
that a user can provide for each MCloud entry.
Expand All @@ -101,21 +117,17 @@ class Config(BaseModel):
description="A human-readable description of the overall database to be provided alongside the data in the API."
)

entries: list[EntryConfig] = Field(
entries: list[EntryConfig] | JSONLConfig = Field(
description="A list of entry configurations for each entry type."
)

provider_prefix: str = Field(
"",
description="A provider prefix to use for custom fields served by this API.",
)

@validator("entries")
def check_one_entry_per_type(cls, v):
if len({e.entry_type for e in v}) != len(v):
raise ValueError(
"Each entry type must be listed only once in the config file."
)
if not isinstance(v, JSONLConfig):
if len({e.entry_type for e in v}) != len(v):
raise ValueError(
"Each entry type must be listed only once in the config file."
)
return v

@staticmethod
Expand Down
16 changes: 13 additions & 3 deletions src/mc_optimade/mc_optimade/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

"""

import os
import warnings
from collections import defaultdict
from pathlib import Path
Expand All @@ -13,9 +14,11 @@
from optimade.models import EntryInfoResource, EntryResource
from optimade.server.schemas import ENTRY_INFO_SCHEMAS, retrieve_queryable_properties

from .config import Config, EntryConfig, ParsedFiles, PropertyDefinition
from .config import Config, EntryConfig, JSONLConfig, ParsedFiles, PropertyDefinition
from .parsers import ENTRY_PARSERS, OPTIMADE_CONVERTERS, PROPERTY_PARSERS, TYPE_MAP

PROVIDER_PREFIX = os.environ.get("MC_OPTIMADE_PROVIDER_PREFIX", "mcloudarchive")


def _construct_entry_type_info(
type: str,
Expand Down Expand Up @@ -63,6 +66,13 @@ def convert_archive(archive_path: Path) -> Path:
# load the config from the root of the archive
mc_config = Config.from_file(archive_path / "optimade.yaml")

# if the config specifies just a JSON-L, then extract any archives
# and return the JSONL path
if isinstance(mc_config.entries, JSONLConfig):
if mc_config.entries.file is not None:
inflate_archive(Path(archive_path), Path(mc_config.entries.file))
return Path(archive_path) / mc_config.entries.jsonl_path

# first, decompress any provided data paths
data_paths: set[Path] = set()
for entry in mc_config.entries:
Expand All @@ -80,7 +90,7 @@ def convert_archive(archive_path: Path) -> Path:

for entry in mc_config.entries:
optimade_entries[entry.entry_type].extend(
construct_entries(archive_path, entry, mc_config.provider_prefix).values()
construct_entries(archive_path, entry, PROVIDER_PREFIX).values()
)

property_definitions = defaultdict(list)
Expand All @@ -91,7 +101,7 @@ def convert_archive(archive_path: Path) -> Path:
archive_path,
optimade_entries,
property_definitions,
mc_config.provider_prefix,
PROVIDER_PREFIX,
)

return jsonl_path
Expand Down