diff --git a/mlptrain/configurations/configuration_set.py b/mlptrain/configurations/configuration_set.py index 16d8c8ac..d38d646c 100644 --- a/mlptrain/configurations/configuration_set.py +++ b/mlptrain/configurations/configuration_set.py @@ -1,5 +1,6 @@ import mlptrain import os +import re import numpy as np from time import time from multiprocessing import Pool @@ -7,6 +8,8 @@ from autode.atoms import elements, Atom from mlptrain.config import Config from mlptrain.log import logger +from mlptrain.forces import Forces +from mlptrain.energy import Energy from mlptrain.configurations.configuration import Configuration from mlptrain.box import Box @@ -50,7 +53,7 @@ def true_energies(self) -> List[Optional[float]]: @property def true_forces(self) -> Optional[np.ndarray]: """ - True force tensor. shape = (N, n_atoms, 3) + List of true config forces. List of np.ndarray with shape: (n_atoms, 3) ----------------------------------------------------------------------- Returns: @@ -265,15 +268,34 @@ def compare( name = self._comparison_name(*args) if os.path.exists(f'{name}.npz'): + logger.info(f'Loading energies and forces from {name}.npz') self.load(f'{name}.npz') else: for arg in args: + # if is an mlp model with a 'predict' function if hasattr(arg, 'predict'): arg.predict(self) + # if is a string reference to a QM calculation method elif isinstance(arg, str): - self.single_point(method=arg) + # if true energies and forces do not already exist for this config set + non_null_true_energies = [ + x for x in self.true_energies if x is not None + ] + if ( + len(non_null_true_energies) == 0 + and self.true_forces.size == 0 + ): + logger.info( + f'Running single point calcs with method {arg}' + ) + self.single_point(method=arg) + else: + logger.info( + f'Not using method {arg}, true energies and forces ' + f'are already defined' + ) else: raise ValueError(f'Cannot compare using {arg}') @@ -318,42 +340,114 @@ def save_xyz( return None def load_xyz( - self, filename: str, charge: int, mult: int, box: Optional[Box] = None + self, + filename: str, + charge: int, + mult: int, + box: Optional[Box] = None, + load_energies: bool = False, + load_forces: bool = False, ) -> None: """ - Load configurations from a .xyz file. Will not load any energies or - forces + Load configurations from a .xyz file with optional box, energies and forces if specified. + Note: this currently assumes that all configurations have the same charge and multiplicity. ----------------------------------------------------------------------- Arguments: - filename: + filename: name of the input .xyz file + + charge: total charge on all configurations in the set + + mult: total spin multiplicity on all configurations in the set - charge: Total charge on the configuration + box: optionally specify a Box or None, if the configurations + are in vacuum (or if 'Lattice' is specified in extended .xyz) - mult: Total spin multiplicity + load_energies: bool - whether to load 'true' configurational energies or not - box: Box or None, if the configurations are in vacuum + load_forces: bool - whether to load 'true' forces from atom lines or not """ - file_lines = open(filename, 'r', errors='ignore').readlines() - atoms = [] def is_xyz_line(_l): return len(_l.split()) in (4, 7) and _l.split()[0] in elements - def append_configuration(_atoms): - self.append(Configuration(_atoms, charge, mult, box)) - return None - - for idx, line in enumerate(file_lines): - if is_xyz_line(line): - atoms.append(Atom(*line.split()[:4])) - - elif len(atoms) > 0: - append_configuration(atoms) - atoms.clear() - - if len(atoms) > 0: - append_configuration(atoms) + with open(filename, 'r', errors='ignore') as xyz_file: + # get first num atoms line + line_id = 1 + line = xyz_file.readline() + while line: + # load everything for a single configuration in this loop + energy, atoms, forces = None, [], [] + num_atoms = int(line) + + # get comments / property line + line_id += 1 + line = xyz_file.readline() + + # get dictionary of properties and values (matching key=value with regex) + pattern = r'(\w+)=("[^"]*"|\S+)' + config_info = re.findall(pattern, line) + config_info_dict = { + key: value.strip('"') for key, value in config_info + } + + # if using extended xyz format, get properties + config_box = box + if len(config_info) > 1: + if ( + box is None + and config_info_dict.get('Lattice') is not None + ): + # set box size from xyz properties line if it is specified + lattice_info = [ + float(x) + for x in re.findall( + '[0-9]+', config_info_dict['Lattice'] + ) + ] + + config_box = Box( + [ + lattice_info[0], + lattice_info[8], + lattice_info[16], + ] + ) + + if load_energies: + assert ( + config_info_dict.get('energy') is not None + ), "Property 'energy' not specified on properties line..." + energy = float(config_info_dict['energy']) + + # get atom lines + for _ in range(num_atoms): + line_id += 1 + line = xyz_file.readline() + assert is_xyz_line( + line + ), f'There was an error in parsing your xyz file on line: {line_id}' + line_split = line.split() + atoms.append(Atom(*line_split[:4])) + + if load_forces: + # add forces to forces dict in configuration + if len(line_split) > 4: + force = tuple([float(x) for x in line_split[4:]]) + assert ( + len(force) == 3 + ), f'Force is not a 3D vector: {force}' + forces.append(force) + + # create configuration, add forces, energy and append it to config set + configuration = Configuration(atoms, charge, mult, config_box) + configuration.forces = Forces(true=np.array(forces)) + configuration.energy = Energy(true=energy) + self.append(configuration) + + # get num atoms line for next config + line_id += 1 + line = xyz_file.readline() return None @@ -434,7 +528,10 @@ def _coordinates(self) -> np.ndarray: (np.ndarray): Coordinates tensor (n, n_atoms, 3), where n is len(self) """ - return np.array([np.asarray(c.coordinates, dtype=float) for c in self]) + return np.array( + [np.asarray(c.coordinates, dtype=float) for c in self], + dtype=object, + ) @property def plumed_coordinates(self) -> Optional[np.ndarray]: @@ -471,9 +568,11 @@ def plumed_coordinates(self) -> Optional[np.ndarray]: for i, coords in enumerate(all_coordinates): if coords is None: - all_coordinates[i] = np.array([np.nan for _ in range(n_cvs)]) + all_coordinates[i] = np.array( + [np.nan for _ in range(n_cvs)], dtype=float + ) - return np.array(all_coordinates) + return np.array(all_coordinates, dtype=object) @property def _atomic_numbers(self) -> np.ndarray: @@ -486,7 +585,8 @@ def _atomic_numbers(self) -> np.ndarray: """ return np.array( - [[atom.atomic_number for atom in c.atoms] for c in self] + [[atom.atomic_number for atom in c.atoms] for c in self], + dtype=object, ) @property @@ -514,7 +614,7 @@ def _multiplicities(self) -> np.ndarray: return np.array([c.mult for c in self]) def _forces(self, kind: str) -> Optional[np.ndarray]: - """True or predicted forces. Returns a 3D tensor""" + """True or predicted forces. Returns a 3D np.ndarray.""" all_forces = [] for config in self: @@ -524,7 +624,7 @@ def _forces(self, kind: str) -> Optional[np.ndarray]: all_forces.append(getattr(config.forces, kind)) - return np.array(all_forces) + return np.array(all_forces, dtype=object) def _save_npz(self, filename: str) -> None: """Save a compressed numpy array of all the data in this set""" @@ -557,14 +657,18 @@ def _load_npz(self, filename: str) -> None: box = Box(size=data['L'][i]) config = Configuration( - atoms=_atoms_from_z_r(data['Z'][i], coords), + atoms=_atoms_from_z_r( + data['Z'][i], np.array(coords, dtype=float) + ), charge=int(data['C'][i]), mult=int(data['M'][i]), box=None if box.has_zero_volume else box, ) if data['R_plumed'].ndim > 0: - config.plumed_coordinates = data['R_plumed'][i] + config.plumed_coordinates = np.array( + data['R_plumed'][i], dtype=float + ) if data['E_true'].ndim > 0: config.energy.true = data['E_true'][i] @@ -579,10 +683,12 @@ def _load_npz(self, filename: str) -> None: config.energy.inherited_bias = data['E_inherited_bias'][i] if data['F_true'].ndim > 0: - config.forces.true = data['F_true'][i] + config.forces.true = np.array(data['F_true'][i], dtype=float) if data['F_predicted'].ndim > 0: - config.forces.predicted = data['F_predicted'][i] + config.forces.predicted = np.array( + data['F_predicted'][i], dtype=float + ) self.append(config) @@ -647,6 +753,23 @@ def _run_parallel_method(self, function, **kwargs): logger.info(f'Calculations done in {(time() - start_time) / 60:.1f} m') return None + def __str__(self): + return ( + f'ConfigurationSet Summary:\n' + f' Coords Dimensions: {self._coordinates.shape if self._coordinates is not None else None}\n' + f' Plumed Coords Dimensions: {self.plumed_coordinates.shape if self.plumed_coordinates is not None else None}\n' + f' Has True Energies: {any(x is not None for x in self.true_energies)}\n' + f' Has Predicted Energies: {any(x is not None for x in self.predicted_energies)}\n' + f' Has Bias Energies: {any(x is not None for x in self.bias_energies)}\n' + f' Has Inherit. Bias Energies: {any(x is not None for x in self.inherited_bias_energies)}\n' + f' True Forces Dim: {self.true_forces.shape}\n' + f' Predicted Forces Dim: {self.predicted_forces.shape}\n' + f' Atomic Numbers Dim: {self._atomic_numbers.shape}\n' + f' Unique Box Sizes: {np.unique(self._box_sizes)}\n' + f' Unique Charges: {np.unique(self._charges)}\n' + f' Unique Multiplicities: {np.unique(self._multiplicities)}' + ) + @staticmethod def _comparison_name(*args): """Name of a comparison between different methods""" diff --git a/mlptrain/configurations/plotting.py b/mlptrain/configurations/plotting.py index 177141f6..dd013b03 100644 --- a/mlptrain/configurations/plotting.py +++ b/mlptrain/configurations/plotting.py @@ -19,7 +19,7 @@ def parity_plot( - config_set: 'mlptrain.ConfigurationSet', name: str = 'paritiy' + config_set: 'mlptrain.ConfigurationSet', name: str = 'parity' ) -> None: """ Plot parity plots of energies, forces and temporal differences (if present) @@ -131,14 +131,32 @@ def _add_force_component_plot(config_set, axis) -> None: plt.get_cmap('Purples'), ] - min_f = min( - [np.min(config_set.true_forces), np.min(config_set.predicted_forces)] + # get the min and max force components in any of (x, y, z) directions for plotting + min_true_f = min( + [np.min(config_forces) for config_forces in config_set.true_forces] ) - max_f = min( - [np.max(config_set.true_forces), np.max(config_set.predicted_forces)] + min_pred_f = min( + [ + np.min(config_forces) + for config_forces in config_set.predicted_forces + ] ) + max_true_f = max( + [np.max(config_forces) for config_forces in config_set.true_forces] + ) + + max_pred_f = max( + [ + np.max(config_forces) + for config_forces in config_set.predicted_forces + ] + ) + + min_f = min([min_true_f, min_pred_f]) + max_f = min([max_true_f, max_pred_f]) + for idx, k in enumerate(['x', 'y', 'z']): xs, ys = [], [] for config in config_set: diff --git a/tests/data/methane.xyz b/tests/data/methane.xyz index 1d6edb9a..4a4a2d50 100644 --- a/tests/data/methane.xyz +++ b/tests/data/methane.xyz @@ -13,7 +13,6 @@ H -1.19178 2.07309 0.94983 H -1.19178 1.76033 -0.76926 H -1.19178 3.40548 -0.18057 5 -5 C -0.83511 2.41296 0.00000 H 0.23489 2.41296 0.00000 diff --git a/tests/test_configuration_set.py b/tests/test_configuration_set.py index a8ac68d1..6550fe08 100644 --- a/tests/test_configuration_set.py +++ b/tests/test_configuration_set.py @@ -1,11 +1,66 @@ import os import numpy as np +import pytest from autode.atoms import Atom from mlptrain.configurations import ConfigurationSet, Configuration from mlptrain.utils import work_in_tmp_dir from mlptrain.box import Box +@pytest.fixture +def config_set_xyz_with_energies_forces(): + configs = ConfigurationSet() + + with open('tmp.xyz', 'w') as xyz_file: + print( + '3', + 'Lattice="20.000000 0.000000 0.000000 0.000000 20.000000 0.000000 0.000000 0.000000 20.000000" ' + 'energy=-11580.70167936 Properties=species:S:1:pos:R:3:forces:R:3', + 'C 0.00000 0.00000 0.00000 -1.00000 -1.00000 -1.00000', + 'O 1.00000 1.00000 1.00000 -2.00000 2.00000 -2.00000', + 'H 2.00000 2.00000 2.00000 3.00000 -3.00000 -3.00000', + '2', + 'Lattice="18.000000 0.000000 0.000000 0.000000 18.000000 0.000000 0.000000 0.000000 18.000000" ' + 'energy=-11581.02323085 Properties=species:S:1:pos:R:3:forces:R:3', + 'C 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000', + 'O 1.00000 1.00000 1.00000 -1.00000 1.00000 1.00000', + sep='\n', + file=xyz_file, + ) + + configs.load_xyz( + 'tmp.xyz', charge=0, mult=1, load_energies=True, load_forces=True + ) + + expected_values = { + 'energies': [-11580.70167936, -11581.02323085], + 'num_atoms': [3, 2], + 'box_sizes': [(20, 20, 20), (18, 18, 18)], + 'coords': np.array( + [ + np.array( + [[0.0, 0.0, 0.0], [1.0, 1.0, 1.0], [2.0, 2.0, 2.0]], + dtype=float, + ), + np.array([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]], dtype=float), + ], + dtype=object, + ), + 'forces': np.array( + [ + np.array( + [[-1.0, -1.0, -1.0], [-2.0, 2.0, -2.0], [3.0, -3.0, -3.0]], + dtype=float, + ), + np.array([[0.0, 0.0, 0.0], [-1.0, 1.0, 1.0]], dtype=float), + ], + dtype=object, + ), + } + + return configs, expected_values + + @work_in_tmp_dir() def test_configurations_save(): configs = ConfigurationSet() @@ -70,6 +125,9 @@ def test_configurations_load_with_energies_forces(): loaded_config = ConfigurationSet('tmp.npz')[0] assert loaded_config.energy.true is not None + assert loaded_config.energy.predicted is not None + assert loaded_config.forces.true is not None + assert loaded_config.forces.predicted is not None for attr in ('energy', 'forces'): for kind in ('predicted', 'true'): @@ -101,3 +159,64 @@ def test_configurations_load_xyz(): for config in configs: assert config.charge == 0 assert config.mult == 2 + + +@work_in_tmp_dir() +def test_configurations_load_xyz_with_energies_forces( + config_set_xyz_with_energies_forces, +): + # get config set from xyz + configs, exp_vals = config_set_xyz_with_energies_forces + + assert len(configs) == 2 + + # check loading properties line for each config + for i in range(2): + config = configs[i] + assert config.box == Box(exp_vals['box_sizes'][i]) + assert config.charge == 0 + assert config.mult == 1 + assert len(config.atoms) == exp_vals['num_atoms'][i] + assert config.energy.true == exp_vals['energies'][i] + assert np.allclose(config.coordinates, exp_vals['coords'][i]) + assert np.allclose(config.forces.true, exp_vals['forces'][i]) + + # same tests but for config set properties + assert all( + config_energy == exp_vals['energies'][i] + for i, config_energy in enumerate(configs.true_energies) + ) + assert all( + np.allclose(config_coords, exp_vals['coords'][i]) + for i, config_coords in enumerate(configs._coordinates) + ) + assert all( + np.allclose(config_forces, exp_vals['forces'][i]) + for i, config_forces in enumerate(configs.true_forces) + ) + + +@work_in_tmp_dir() +def test_configurations_load_xyz_and_save_npz( + config_set_xyz_with_energies_forces, +): + # get config set from xyz + configs, exp_vals = config_set_xyz_with_energies_forces + + # check saving and loading npz + configs.save('tmp.npz') + loaded_configs = ConfigurationSet('tmp.npz') + + # same tests but for config set properties + assert all( + config_energy == exp_vals['energies'][i] + for i, config_energy in enumerate(loaded_configs.true_energies) + ) + assert all( + np.allclose(config_coords, exp_vals['coords'][i]) + for i, config_coords in enumerate(loaded_configs._coordinates) + ) + assert all( + np.allclose(config_forces, exp_vals['forces'][i]) + for i, config_forces in enumerate(loaded_configs.true_forces) + ) diff --git a/tests/test_plotting.py b/tests/test_plotting.py new file mode 100644 index 00000000..217e378c --- /dev/null +++ b/tests/test_plotting.py @@ -0,0 +1,26 @@ +import numpy as np +from autode.atoms import Atom +from mlptrain.configurations import ConfigurationSet, Configuration +from mlptrain.configurations.plotting import parity_plot +from mlptrain.utils import work_in_tmp_dir + + +@work_in_tmp_dir() +def test_parity_plot(): + config_set = ConfigurationSet(allow_duplicates=True) + + for i in range(1000): + config = Configuration(atoms=[Atom('H'), Atom('C'), Atom('C')]) + + # energies + config.energy.true = 0.0 + (1.0 * i) + config.energy.predicted = 1.0 + (1.0 * i) + + # forces + config.forces.true = np.random.uniform(1, 100, 3).reshape(1, 3) + config.forces.predicted = np.random.uniform(1, 100, 3).reshape(1, 3) + + config_set.append(config) + + # simply check there are no issues running this code + parity_plot(config_set)