From e7ee509512637214cd62c447c02268664c86a64e Mon Sep 17 00:00:00 2001 From: Peter Eastman Date: Tue, 30 Aug 2022 16:50:04 -0700 Subject: [PATCH] Downloader skips samples with very large forces (#41) --- downloader/config.yaml | 7 ++++++- downloader/downloader.py | 9 +++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/downloader/config.yaml b/downloader/config.yaml index dd49138..ca13e46 100644 --- a/downloader/config.yaml +++ b/downloader/config.yaml @@ -27,4 +27,9 @@ values: - 'SCF DIPOLE' - 'SCF QUADRUPOLE' - 'WIBERG LOWDIN INDICES' - - 'MAYER INDICES' \ No newline at end of file + - 'MAYER INDICES' + +# This specifies a cutoff on forces. A sample will be skipped if any component of the force on any atom +# is larger than this value (in hartree/bohr). Comment out this line to include all samples regardless of +# force magnitude. +max_force: 1.0 \ No newline at end of file diff --git a/downloader/downloader.py b/downloader/downloader.py index 463abdc..ed87d05 100644 --- a/downloader/downloader.py +++ b/downloader/downloader.py @@ -65,6 +65,10 @@ def compute_reference_energy(smiles): with open('config.yaml') as input: config = yaml.safe_load(input.read()) +if 'max_force' in config: + max_force = float(config['max_force']) +else: + max_force = None client = FractalClient() outputfile = h5py.File('SPICE.hdf5', 'w') for subset in config['subsets']: @@ -98,6 +102,11 @@ def compute_reference_energy(smiles): group.create_dataset('subset', data=[subset], dtype=h5py.string_dtype()) group.create_dataset('smiles', data=[smiles], dtype=h5py.string_dtype()) group.create_dataset("atomic_numbers", data=molecules[0].atomic_numbers, dtype=np.int16) + if max_force is not None: + force = np.array([vars['DFT TOTAL GRADIENT'] for vars in qcvars]) + samples = [i for i in range(len(molecules)) if np.max(np.abs(force[i])) <= max_force] + molecules = [molecules[i] for i in samples] + qcvars = [qcvars[i] for i in samples] ds = group.create_dataset('conformations', data=np.array([m.geometry for m in molecules]), dtype=np.float32) ds.attrs['units'] = 'bohr' ds = group.create_dataset('formation_energy', data=np.array([vars['DFT TOTAL ENERGY']-ref_energy for vars in qcvars]), dtype=np.float32)