diff --git a/src/mc_optimade/examples/bzipped_pymatgen/optimade.yaml b/src/mc_optimade/examples/bzipped_pymatgen/optimade.yaml index fd912f8..56193da 100644 --- a/src/mc_optimade/examples/bzipped_pymatgen/optimade.yaml +++ b/src/mc_optimade/examples/bzipped_pymatgen/optimade.yaml @@ -3,8 +3,6 @@ config_version: 0.1.0 database_description: >- This database contains some bzipped pymatgen objects. -provider_prefix: mcloudarchive - entries: - entry_type: structures entry_paths: diff --git a/src/mc_optimade/examples/direct_from_jsonl/.testing/first_entry.json b/src/mc_optimade/examples/direct_from_jsonl/.testing/first_entry.json new file mode 100644 index 0000000..35a58cd --- /dev/null +++ b/src/mc_optimade/examples/direct_from_jsonl/.testing/first_entry.json @@ -0,0 +1 @@ +{"id": "structures.zip/structures/cifs/cc1a41b1-a841-4818-baf1-a6c1441dc52a.cif", "type": "structures", "links": null, "meta": null, "attributes": {"immutable_id": null, "last_modified": null, "elements": ["B", "Ir", "Mg", "Zn"], "nelements": 4, "elements_ratios": [0.2, 0.5, 0.2, 0.1], "chemical_formula_descriptive": "B4Ir10Mg4Zn2", "chemical_formula_reduced": "B2Ir5Mg2Zn", "chemical_formula_hill": null, "chemical_formula_anonymous": "A5B2C2D", "dimension_types": [1, 1, 1], "nperiodic_dimensions": 3, "lattice_vectors": [[9.4623270052342, 0.0, 0.0], [0.0, 9.4623270052342, 0.0], [0.0, 0.0, 2.9327245575729]], "cartesian_site_positions": [[3.0564738352234, 1.6746896673437002, 0.0], [7.7876373378905, 3.0564738352234, 0.0], [6.4058531699108, 7.7876373378905, 0.0], [1.6746896673437002, 6.4058531699108, 0.0], [4.7311635025671, 4.7311635025671, 0.0], [0.0, 0.0, 0.0], [8.2578546699396, 5.9356358378617, 0.0], [3.5266911673725, 8.2578546699396, 0.0], [1.2044723352946, 3.5266911673725, 0.0], [5.9356358378617, 1.2044723352946, 0.0], [4.0683890403188, 6.7751547390362, 1.4663622787864], [8.7995525428859, 7.418335768765, 1.4663622787864], [2.6871722661979005, 4.0683890403188, 1.4663622787864], [7.418335768765, 0.66277446224826, 1.4663622787864], [5.3939379649153, 2.6871722661979005, 1.4663622787864], [0.66277446224826, 2.0439912363691, 1.4663622787864], [6.7751547390362, 5.3939379649153, 1.4663622787864], [2.0439912363691, 8.7995525428859, 1.4663622787864], [4.7311635025671, 0.0, 1.4663622787864], [0.0, 4.7311635025671, 1.4663622787864]], "nsites": 20, "species": [{"name": "Ir", "chemical_symbols": ["Ir"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "Mg", "chemical_symbols": ["Mg"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "B", "chemical_symbols": ["B"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "Zn", "chemical_symbols": ["Zn"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}], "species_at_sites": ["Mg", "Mg", "Mg", "Mg", "Zn", "Zn", "B", "B", "B", "B", "Ir", "Ir", "Ir", "Ir", "Ir", "Ir", "Ir", "Ir", "Ir", "Ir"], "assemblies": null, "structure_features": [], "_mcloudarchive_energy": -0.45, "_mcloudarchive_property_b": 0.86, "_mcloudarchive_structure_description": "describing something else"}, "relationships": null} diff --git a/src/mc_optimade/examples/direct_from_jsonl/example.jsonl b/src/mc_optimade/examples/direct_from_jsonl/example.jsonl new file mode 100644 index 0000000..95f2f6b --- /dev/null +++ b/src/mc_optimade/examples/direct_from_jsonl/example.jsonl @@ -0,0 +1,5 @@ +{"x-optimade": {"api_version": "1.2.0"}} +{"formats": ["json"], "description": "structures", "properties": {"_mcloudarchive_energy": {"description": "The total energy per atom as computed by DFT", "unit": "eV/atom", "sortable": null, "type": "float"}, "_mcloudarchive_property_b": {"description": "Alias for some more complicated property_b", "unit": null, "sortable": null, "type": "float"}, "_mcloudarchive_structure_description": {"description": "Provides a human-readable description for this particular entry_type", "unit": null, "sortable": null, "type": "string"}}, "output_fields_by_format": {"json": ["_mcloudarchive_energy", "_mcloudarchive_property_b", "_mcloudarchive_structure_description"]}} +{"id": "structures.zip/structures/cifs/cc1a41b1-a841-4818-baf1-a6c1441dc52a.cif", "type": "structures", "links": null, "meta": null, "attributes": {"immutable_id": null, "last_modified": null, "elements": ["B", "Ir", "Mg", "Zn"], "nelements": 4, "elements_ratios": [0.2, 0.5, 0.2, 0.1], "chemical_formula_descriptive": "B4Ir10Mg4Zn2", "chemical_formula_reduced": "B2Ir5Mg2Zn", "chemical_formula_hill": null, "chemical_formula_anonymous": "A5B2C2D", "dimension_types": [1, 1, 1], "nperiodic_dimensions": 3, "lattice_vectors": [[9.4623270052342, 0.0, 0.0], [0.0, 9.4623270052342, 0.0], [0.0, 0.0, 2.9327245575729]], "cartesian_site_positions": [[3.0564738352234, 1.6746896673437002, 0.0], [7.7876373378905, 3.0564738352234, 0.0], [6.4058531699108, 7.7876373378905, 0.0], [1.6746896673437002, 6.4058531699108, 0.0], [4.7311635025671, 4.7311635025671, 0.0], [0.0, 0.0, 0.0], [8.2578546699396, 5.9356358378617, 0.0], [3.5266911673725, 8.2578546699396, 0.0], [1.2044723352946, 3.5266911673725, 0.0], [5.9356358378617, 1.2044723352946, 0.0], [4.0683890403188, 6.7751547390362, 1.4663622787864], [8.7995525428859, 7.418335768765, 1.4663622787864], [2.6871722661979005, 4.0683890403188, 1.4663622787864], [7.418335768765, 0.66277446224826, 1.4663622787864], [5.3939379649153, 2.6871722661979005, 1.4663622787864], [0.66277446224826, 2.0439912363691, 1.4663622787864], [6.7751547390362, 5.3939379649153, 1.4663622787864], [2.0439912363691, 8.7995525428859, 1.4663622787864], [4.7311635025671, 0.0, 1.4663622787864], [0.0, 4.7311635025671, 1.4663622787864]], "nsites": 20, "species": [{"name": "Ir", "chemical_symbols": ["Ir"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "Mg", "chemical_symbols": ["Mg"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "B", "chemical_symbols": ["B"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "Zn", "chemical_symbols": ["Zn"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}], "species_at_sites": ["Mg", "Mg", "Mg", "Mg", "Zn", "Zn", "B", "B", "B", "B", "Ir", "Ir", "Ir", "Ir", "Ir", "Ir", "Ir", "Ir", "Ir", "Ir"], "assemblies": null, "structure_features": [], "_mcloudarchive_energy": -0.45, "_mcloudarchive_property_b": 0.86, "_mcloudarchive_structure_description": "describing something else"}, "relationships": null} +{"id": "structures.zip/structures/cifs/991bec7a-b3a8-49af-ba6d-be5afd685cd4.cif", "type": "structures", "links": null, "meta": null, "attributes": {"immutable_id": null, "last_modified": null, "elements": ["C", "Sr"], "nelements": 2, "elements_ratios": [0.5, 0.5], "chemical_formula_descriptive": "CSr", "chemical_formula_reduced": "CSr", "chemical_formula_hill": null, "chemical_formula_anonymous": "AB", "dimension_types": [1, 1, 1], "nperiodic_dimensions": 3, "lattice_vectors": [[4.006498849786306, 0.0, 0.0], [2.0032494248931525, 3.469729784148075, 0.0], [2.0032494248931525, 1.1565765947160247, 3.271292612341386]], "cartesian_site_positions": [[0.0, 0.0, 0.0], [4.006498849786305, 2.31315318943205, 1.635646306170693]], "nsites": 2, "species": [{"name": "C", "chemical_symbols": ["C"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "Sr", "chemical_symbols": ["Sr"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}], "species_at_sites": ["Sr", "C"], "assemblies": null, "structure_features": [], "_mcloudarchive_energy": -0.55, "_mcloudarchive_property_b": 1.01, "_mcloudarchive_structure_description": NaN}, "relationships": null} +{"id": "structures.zip/structures/cifs/55c564f6-ac6a-4122-b8d9-0ad9fe61e961.cif", "type": "structures", "links": null, "meta": null, "attributes": {"immutable_id": null, "last_modified": null, "elements": ["Ba", "C", "N", "S"], "nelements": 4, "elements_ratios": [0.14285714285714285, 0.2857142857142857, 0.2857142857142857, 0.2857142857142857], "chemical_formula_descriptive": "C4Ba2N4S4", "chemical_formula_reduced": "BaC2N2S2", "chemical_formula_hill": null, "chemical_formula_anonymous": "A2B2C2D", "dimension_types": [1, 1, 1], "nperiodic_dimensions": 3, "lattice_vectors": [[6.3587627540404945, 0.0, 0.0], [-2.672647488887009, 5.769819681958754, 0.0], [0.25844951934994664, -0.16511343006546234, 8.71190314896161]], "cartesian_site_positions": [[3.4987802863851005, 5.049341739457014, 6.533927361693402], [0.4457844982025419, 0.5553645123824795, 2.177975787264444], [0.37416734784252487, 2.642448780492868, 5.291889331690525], [2.6281157156591126, 1.202488098280357, 7.775965391700042], [3.5703974366609055, 2.962257471400425, 3.420013817271085], [1.316449068841148, 4.402218153614962, 0.9359377571616379], [0.3512600296798777, 4.156339511491648, 5.900755701251229], [4.011419364830723, 1.818004876809729, 7.167099022143105], [3.5933047547393455, 1.4483667404554434, 2.8111474477141454], [-0.06685458032729186, 3.786701375083563, 1.5448041268185058], [0.3773832460379156, 1.5350786992068879, 4.8597303327393915], [1.621957615426957, 0.7399681351855376, 8.208124390751108], [3.567181538468686, 4.069627552684378, 3.8521728163221476], [2.3226071690796486, 4.864738116705728, 0.5037787583104332]], "nsites": 14, "species": [{"name": "C", "chemical_symbols": ["C"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "S", "chemical_symbols": ["S"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "Ba", "chemical_symbols": ["Ba"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}, {"name": "N", "chemical_symbols": ["N"], "concentration": [1.0], "mass": null, "original_name": null, "attached": null, "nattached": null}], "species_at_sites": ["Ba", "Ba", "C", "C", "C", "C", "S", "S", "S", "S", "N", "N", "N", "N"], "assemblies": null, "structure_features": [], "_mcloudarchive_energy": -0.54, "_mcloudarchive_property_b": 0.99, "_mcloudarchive_structure_description": "some description"}, "relationships": null} diff --git a/src/mc_optimade/examples/direct_from_jsonl/optimade.yaml b/src/mc_optimade/examples/direct_from_jsonl/optimade.yaml new file mode 100644 index 0000000..88d138d --- /dev/null +++ b/src/mc_optimade/examples/direct_from_jsonl/optimade.yaml @@ -0,0 +1,7 @@ +config_version: 0.1.0 + +database_description: >- + This database contains some example CIFs. + +entries: + jsonl_path: example.jsonl diff --git a/src/mc_optimade/examples/folder_of_cifs/optimade.yaml b/src/mc_optimade/examples/folder_of_cifs/optimade.yaml index 24a4814..1400c6a 100644 --- a/src/mc_optimade/examples/folder_of_cifs/optimade.yaml +++ b/src/mc_optimade/examples/folder_of_cifs/optimade.yaml @@ -3,8 +3,6 @@ config_version: 0.1.0 database_description: >- This database contains some example CIFs. -provider_prefix: mcloudarchive - entries: - entry_type: structures entry_paths: diff --git a/src/mc_optimade/examples/xyz_files_no_compression/optimade.yaml b/src/mc_optimade/examples/xyz_files_no_compression/optimade.yaml index 1ae3897..b98cd26 100644 --- a/src/mc_optimade/examples/xyz_files_no_compression/optimade.yaml +++ b/src/mc_optimade/examples/xyz_files_no_compression/optimade.yaml @@ -3,8 +3,6 @@ config_version: 0.1.0 database_description: >- This database contains 3 xyz files, one of which should be excluded from the API. -provider_prefix: mcloudarchive - entries: - entry_type: structures entry_paths: diff --git a/src/mc_optimade/mc_optimade/config.py b/src/mc_optimade/mc_optimade/config.py index 557f0f9..73bab0b 100644 --- a/src/mc_optimade/mc_optimade/config.py +++ b/src/mc_optimade/mc_optimade/config.py @@ -78,14 +78,30 @@ class EntryConfig(BaseModel): @validator("entry_type") def check_optimade_entry_type(cls, v): - if v not in ("structures", "references") and not v.startswith("_"): - raise ValueError( - f"OPTIMADE entry type must be either 'structures', 'references', or contain a custom prefix, not {v}" - ) + if not isinstance(v, JSONLConfig): + if v not in ("structures", "references") and not v.startswith("_"): + raise ValueError( + f"OPTIMADE entry type must be either 'structures', 'references', or contain a custom prefix, not {v}" + ) return v +class JSONLConfig(BaseModel): + """A description of a single JSON lines file that describes + the target API. + + """ + + file: Optional[str] = Field( + description="The archive filename containing the JSONL data to be parsed." + ) + + jsonl_path: str = Field( + description="The path of the JSON-L file within the archive (or directly in the entry, if `archive_file` is `None`)." + ) + + class Config(BaseModel): """This class describes the `optimade.yaml` file that a user can provide for each MCloud entry. @@ -101,21 +117,17 @@ class Config(BaseModel): description="A human-readable description of the overall database to be provided alongside the data in the API." ) - entries: list[EntryConfig] = Field( + entries: list[EntryConfig] | JSONLConfig = Field( description="A list of entry configurations for each entry type." ) - provider_prefix: str = Field( - "", - description="A provider prefix to use for custom fields served by this API.", - ) - @validator("entries") def check_one_entry_per_type(cls, v): - if len({e.entry_type for e in v}) != len(v): - raise ValueError( - "Each entry type must be listed only once in the config file." - ) + if not isinstance(v, JSONLConfig): + if len({e.entry_type for e in v}) != len(v): + raise ValueError( + "Each entry type must be listed only once in the config file." + ) return v @staticmethod diff --git a/src/mc_optimade/mc_optimade/convert.py b/src/mc_optimade/mc_optimade/convert.py index 0e8e950..59d45b5 100644 --- a/src/mc_optimade/mc_optimade/convert.py +++ b/src/mc_optimade/mc_optimade/convert.py @@ -4,6 +4,7 @@ """ +import os import warnings from collections import defaultdict from pathlib import Path @@ -13,9 +14,11 @@ from optimade.models import EntryInfoResource, EntryResource from optimade.server.schemas import ENTRY_INFO_SCHEMAS, retrieve_queryable_properties -from .config import Config, EntryConfig, ParsedFiles, PropertyDefinition +from .config import Config, EntryConfig, JSONLConfig, ParsedFiles, PropertyDefinition from .parsers import ENTRY_PARSERS, OPTIMADE_CONVERTERS, PROPERTY_PARSERS, TYPE_MAP +PROVIDER_PREFIX = os.environ.get("MC_OPTIMADE_PROVIDER_PREFIX", "mcloudarchive") + def _construct_entry_type_info( type: str, @@ -63,6 +66,13 @@ def convert_archive(archive_path: Path) -> Path: # load the config from the root of the archive mc_config = Config.from_file(archive_path / "optimade.yaml") + # if the config specifies just a JSON-L, then extract any archives + # and return the JSONL path + if isinstance(mc_config.entries, JSONLConfig): + if mc_config.entries.file is not None: + inflate_archive(Path(archive_path), Path(mc_config.entries.file)) + return Path(archive_path) / mc_config.entries.jsonl_path + # first, decompress any provided data paths data_paths: set[Path] = set() for entry in mc_config.entries: @@ -80,7 +90,7 @@ def convert_archive(archive_path: Path) -> Path: for entry in mc_config.entries: optimade_entries[entry.entry_type].extend( - construct_entries(archive_path, entry, mc_config.provider_prefix).values() + construct_entries(archive_path, entry, PROVIDER_PREFIX).values() ) property_definitions = defaultdict(list) @@ -91,7 +101,7 @@ def convert_archive(archive_path: Path) -> Path: archive_path, optimade_entries, property_definitions, - mc_config.provider_prefix, + PROVIDER_PREFIX, ) return jsonl_path