Minor clean-up

Jonas-Verhellen · Feb 27, 2021 · fae912e · fae912e
1 parent 0e776ff
commit fae912e
Show file tree

Hide file tree

Showing 5,148 changed files with 19 additions and 330,855 deletions.
diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ Argenomic is an open-source implementation of an illumination algorithm for opti
 
 ## Getting Started
 
-After installing the software and running the tests, a basic usage example of argenomic (i.e. the rediscovery of Thiotixene) can be called upon in the following manner:
+After installing the software and running the tests, a basic usage example of argenomic (i.e. the rediscovery of Celecoxib) can be called upon in the following manner:
 ```
 python3 illuminate.py generations=100
 ```
@@ -61,7 +61,7 @@ Important dependencies of the Argenomic software environment and where to find t
 
 * Jan Jensen for his work in developing and open-sourcing a graph-based genetic algorithm for molecular optimisation, which served as impetus for this project.
 
-* Jean-Baptiste Mouret and Jeff Clune for their breakthrough invention of illumination algorithms, providing a holistic view of high-performing solutions throughout a search space.  
+* Jean-Baptiste Mouret and Jeff Clune for their breakthrough invention of illumination algorithms, providing a holistic view of high-performing solutions throughout a search space.
 
 * Pat Walters for his scripts indicating how to run structural alerts using the RDKit and ChEMBL, and for his many enlightening medicinal chemistry blog posts.
 

diff --git a/app.py b/app.py
diff --git a/argenomic/__pycache__/infrastructure.cpython-37.pyc b/argenomic/__pycache__/infrastructure.cpython-37.pyc
diff --git a/argenomic/infrastructure.py b/argenomic/infrastructure.py
@@ -68,7 +68,7 @@ def sample(self, size: int) -> List[Chem.Mol]:
         molecules, weights = map(list, zip(*pairs))
         return random.choices(molecules, k=size, weights=weights)
 
-    def sample_pairs(self, size: int) -> List[Tuple[Chem.Mol, Chem.Mol]]:
+    def sample_pairs(self, size: int, generation: float) -> List[Tuple[Chem.Mol, Chem.Mol]]:
         """
         Returns a list of pairs of elite molecules of the requested length. 
         The elite molecules are randomly drawn, weighted by their fitness. 
@@ -77,7 +77,7 @@ def sample_pairs(self, size: int) -> List[Tuple[Chem.Mol, Chem.Mol]]:
         molecules, weights = map(list, zip(*pairs))
         sample_molecules = random.choices(molecules, k=size, weights=weights)
         sample_pairs = np.random.choice(list(filter(None, sample_molecules)), size=(size, 2), replace=True)
-        sample_pairs = [tuple(sample_pair) for sample_pair in sample_pairs]
+        sample_pairs = [tuple(sample_pair) for sample_pair in sample_pairs]       
         return sample_pairs
 
     def store_data(self, generation: float) -> None:

diff --git a/configuration/config.yaml b/configuration/config.yaml
@@ -1,5 +1,5 @@
 ---
-data_file: data/smiles/guacamol_initial_rediscovery_thiotixene.smi
+data_file: data/smiles/guacamol_initial_rediscovery_celecoxib.smi
 batch_size: 40
 initial_size: 100
 workers: 1
@@ -25,7 +25,7 @@ descriptor:
   - - 40
     - 130
 fitness:
-  target: O=S(=O)(N(C)C)c2cc1C(\c3c(Sc1cc2)cccc3)=C/CCN4CCN(C)CC4
+  target: O=S(=O)(c3ccc(n1nc(cc1c2ccc(cc2)C)C(F)(F)F)cc3)N
   type: ECFP4
 arbiter:
   rules:

diff --git a/data/README.md b/data/README.md
@@ -1,21 +1,13 @@
-# Sample Package Data
+# Data 
+This directory contains data that were included with the argenomic package. This is also a place where non-code related additional information (such as data files, molecular structures, etc.) can
+be conveniently stored. Please note that the output files are automatically stored in their own folder.
 
-This directory contains sample additional data you may want to include with your package.
-This is a place where non-code related additional information (such as data files, molecular structures,  etc.) can 
-go that you want to ship alongside your code.
+An overview of the subdirectories: 
 
-Please note that it is not recommended to place large files in your git directory. If your project requires files larger
-than a few megabytes in size it is recommended to host these files elsewhere. This is especially true for binary files
-as the `git` structure is unable to correctly take updates to these files and will store a complete copy of every version
-in your `git` history which can quickly add up. As a note most `git` hosting services like GitHub have a 1 GB per repository
-cap.
+* cvt: Contains the cvt centroids as stored in cache. New cvt centroids, as generated by argenomic, will be stored here automatically.
 
-## Including package data
+* figures: A collection of figures generated with data from argenomic.
 
-Modify your package's `setup.py` file and the `setup()` command. Include the 
-[`package_data`](http://setuptools.readthedocs.io/en/latest/setuptools.html#basic-use) keyword and point it at the 
-correct files.
+* smarts: Two smarts-files: alert_collection.csv (containing the smarts needed to remove unwanted molecules) and mutation_collection.tsv (containing the smarts causing the mutations and their probability weights).
 
-## Manifest
-
-* `look_and_say.dat`: first entries of the "Look and Say" integer series, sequence [A005150](https://oeis.org/A005150)
+* smiles: A number of files containing databases of smiles from ZINC, ChEMBL and the Guacamol projects.
diff --git a/illuminate.py b/illuminate.py
@@ -33,8 +33,8 @@ def __init__(self, config) -> None:
 
     def __call__(self) -> None:
         self.initial_population()
-        for generation in range(self.generations):
-            molecules = self.generate_molecules()
+        for generation in range(1, self.generations):
+            molecules = self.generate_molecules(generation)
             molecules = self.process_molecules(molecules)
             self.archive.add_to_archive(molecules)
             self.archive.store_data(generation)
@@ -45,6 +45,7 @@ def initial_population(self) -> None:
         molecules = self.calculate_descriptors(molecules)
         molecules = self.calculate_fitnesses(molecules)
         self.archive.add_to_archive(molecules)
+        self.archive.store_data(0)
         return None
 
     def load_from_database(self) -> List[Molecule]:
@@ -54,10 +55,10 @@ def load_from_database(self) -> List[Molecule]:
         molecules = [Molecule(Chem.CanonSmiles(smiles), pedigree) for smiles in smiles_list]
         return molecules
 
-    def generate_molecules(self) -> List[Molecule]:
+    def generate_molecules(self, generation) -> List[Molecule]:
         molecules = []
         molecule_samples = self.archive.sample(self.batch_size)
-        molecule_sample_pairs = self.archive.sample_pairs(self.batch_size)
+        molecule_sample_pairs = self.archive.sample_pairs(self.batch_size, generation)
         for molecule in molecule_samples:
             molecules.extend(self.mutator(molecule)) 
         for molecule_pair in molecule_sample_pairs:

diff --git a/outputs/2020-11-01/19-54-54/.hydra/config.yaml b/outputs/2020-11-01/19-54-54/.hydra/config.yaml
diff --git a/outputs/2020-11-01/19-54-54/.hydra/hydra.yaml b/outputs/2020-11-01/19-54-54/.hydra/hydra.yaml
diff --git a/outputs/2020-11-01/19-54-54/.hydra/overrides.yaml b/outputs/2020-11-01/19-54-54/.hydra/overrides.yaml
diff --git a/outputs/2020-11-01/19-54-54/illuminate.log b/outputs/2020-11-01/19-54-54/illuminate.log
diff --git a/outputs/2020-11-01/19-55-35/.hydra/config.yaml b/outputs/2020-11-01/19-55-35/.hydra/config.yaml