From e7ee509512637214cd62c447c02268664c86a64e Mon Sep 17 00:00:00 2001
From: Peter Eastman <peastman@stanford.edu>
Date: Tue, 30 Aug 2022 16:50:04 -0700
Subject: [PATCH] Downloader skips samples with very large forces (#41)

---
 downloader/config.yaml   | 7 ++++++-
 downloader/downloader.py | 9 +++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/downloader/config.yaml b/downloader/config.yaml
index dd49138..ca13e46 100644
--- a/downloader/config.yaml
+++ b/downloader/config.yaml
@@ -27,4 +27,9 @@ values:
   - 'SCF DIPOLE'
   - 'SCF QUADRUPOLE'
   - 'WIBERG LOWDIN INDICES'
-  - 'MAYER INDICES'
\ No newline at end of file
+  - 'MAYER INDICES'
+
+# This specifies a cutoff on forces.  A sample will be skipped if any component of the force on any atom
+# is larger than this value (in hartree/bohr).  Comment out this line to include all samples regardless of
+# force magnitude.
+max_force: 1.0
\ No newline at end of file
diff --git a/downloader/downloader.py b/downloader/downloader.py
index 463abdc..ed87d05 100644
--- a/downloader/downloader.py
+++ b/downloader/downloader.py
@@ -65,6 +65,10 @@ def compute_reference_energy(smiles):
 
 with open('config.yaml') as input:
     config = yaml.safe_load(input.read())
+if 'max_force' in config:
+    max_force = float(config['max_force'])
+else:
+    max_force = None
 client = FractalClient()
 outputfile = h5py.File('SPICE.hdf5', 'w')
 for subset in config['subsets']:
@@ -98,6 +102,11 @@ def compute_reference_energy(smiles):
         group.create_dataset('subset', data=[subset], dtype=h5py.string_dtype())
         group.create_dataset('smiles', data=[smiles], dtype=h5py.string_dtype())
         group.create_dataset("atomic_numbers", data=molecules[0].atomic_numbers, dtype=np.int16)
+        if max_force is not None:
+            force = np.array([vars['DFT TOTAL GRADIENT'] for vars in qcvars])
+            samples = [i for i in range(len(molecules)) if np.max(np.abs(force[i])) <= max_force]
+            molecules = [molecules[i] for i in samples]
+            qcvars = [qcvars[i] for i in samples]
         ds = group.create_dataset('conformations', data=np.array([m.geometry for m in molecules]), dtype=np.float32)
         ds.attrs['units'] = 'bohr'
         ds = group.create_dataset('formation_energy', data=np.array([vars['DFT TOTAL ENERGY']-ref_energy for vars in qcvars]), dtype=np.float32)