From 0dea35c4e247265f9efd1472fde60867c81c028f Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Thu, 18 Apr 2024 04:32:38 -0400 Subject: [PATCH 01/27] fix cluster issue gpus-per-task --- mila/sbatch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mila/sbatch.py b/mila/sbatch.py index f2cc71538..ed8fa878d 100644 --- a/mila/sbatch.py +++ b/mila/sbatch.py @@ -41,7 +41,7 @@ conda activate {env} fi {wandb_offline} -srun --gpus-per-task=1 --output={output} {python_command} +srun --output={output} {python_command} """ From 904fea19b457440d405e13321f3af68497dd9fe9 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Thu, 18 Apr 2024 08:35:34 -0400 Subject: [PATCH 02/27] new yaml configs --- configs/exps/deup/datasets/new-mc-faenet.yaml | 28 ++++++++++++ configs/exps/deup/gnn/depfaenet.yaml | 0 configs/exps/deup/gnn/faenet-training.yaml | 43 +++++++++++++++++++ configs/exps/deup/uncertainty/v0.yaml | 1 - configs/exps/deup/uncertainty/v1.yaml | 33 ++++++++++++++ 5 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 configs/exps/deup/datasets/new-mc-faenet.yaml create mode 100644 configs/exps/deup/gnn/depfaenet.yaml create mode 100644 configs/exps/deup/gnn/faenet-training.yaml create mode 100644 configs/exps/deup/uncertainty/v1.yaml diff --git a/configs/exps/deup/datasets/new-mc-faenet.yaml b/configs/exps/deup/datasets/new-mc-faenet.yaml new file mode 100644 index 000000000..95aab7a29 --- /dev/null +++ b/configs/exps/deup/datasets/new-mc-faenet.yaml @@ -0,0 +1,28 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:1 + partition: long + +default: + config: faenet-is2re-all + wandb_project: ocp-deup + wandb_tags: base-model, MC-D, 2935198 + test_ri: True + mode: train + checkpoint: /network/scratch/a/alexandre.duval/ocp/runs/2935198/checkpoints/best_checkpoint.pt + restart_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/2935198 + model: + dropout_lowest_layer: output + first_trainable_layer: dropout + dropout_lin: 0.7 + cp_data_to_tmpdir: true + inference_time_loops: 1 + deup_dataset: + create: after # "before" -> created before training (for deup) "after" -> created after training (for is2re) "" - not created + dataset_strs: ["train", "val_id", "val_ood_cat", "val_ood_ads"] + n_samples: 7 + +runs: + - optim: + max_epochs: 12 diff --git a/configs/exps/deup/gnn/depfaenet.yaml b/configs/exps/deup/gnn/depfaenet.yaml new file mode 100644 index 000000000..e69de29bb diff --git a/configs/exps/deup/gnn/faenet-training.yaml b/configs/exps/deup/gnn/faenet-training.yaml new file mode 100644 index 000000000..5e5575263 --- /dev/null +++ b/configs/exps/deup/gnn/faenet-training.yaml @@ -0,0 +1,43 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:1 + partition: long + time: 18:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + wandb_tags: "top-model" + wandb_project: ocp-deup + optim: + batch_size: 256 + eval_batch_size: 256 + cp_data_to_tmpdir: True + +runs: + - config: faenet-is2re-all + note: "top-runs" + frame_averaging: 2D + fa_method: se3-random + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 384 + num_filters: 480 + num_gaussians: 104 + num_interactions: 5 + second_layer_MLP: False + skip_co: concat + cutoff: 6.0 + optim: + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 12 + eval_every: 0.25 \ No newline at end of file diff --git a/configs/exps/deup/uncertainty/v0.yaml 
b/configs/exps/deup/uncertainty/v0.yaml index 94597ddaf..4cdd6d802 100644 --- a/configs/exps/deup/uncertainty/v0.yaml +++ b/configs/exps/deup/uncertainty/v0.yaml @@ -3,7 +3,6 @@ job: cpus: 4 gres: gpu:rtx8000:1 partition: long - code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-3 default: config: deup_faenet-deup_is2re-all diff --git a/configs/exps/deup/uncertainty/v1.yaml b/configs/exps/deup/uncertainty/v1.yaml new file mode 100644 index 000000000..4f69d7828 --- /dev/null +++ b/configs/exps/deup/uncertainty/v1.yaml @@ -0,0 +1,33 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:1 + partition: long + +default: + config: deup_faenet-deup_is2re-all + + wandb_project: ocp-deup + wandb_tags: base-model, MC-D, 3264530 + test_ri: True + mode: train + model: + dropout_lowest_layer: null + first_trainable_layer: output + dropout_lin: 0.7 + cp_data_to_tmpdir: false + inference_time_loops: 1 + restart_from_dir: /network/scratch/s/schmidtv/ocp/runs/3264530 + checkpoint: /network/scratch/s/schmidtv/ocp/runs/3264530 + dataset: # mandatory if restart_from_dir is set + default_val: deup-val_ood_cat-val_ood_ads + deup-train-val_id: + src: /network/scratch/s/schmidtv/ocp/runs/3264530/deup_dataset + deup-val_ood_cat-val_ood_ads: + src: /network/scratch/s/schmidtv/ocp/runs/3264530/deup_dataset + deup_dataset: + create: False + +runs: + - optim: + max_epochs: 12 From 7c481391ac6cbe5a77ee068518296e053dcf8930 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Thu, 18 Apr 2024 09:30:24 -0400 Subject: [PATCH 03/27] update path trained gnn model --- configs/exps/deup/datasets/new-mc-faenet.yaml | 6 +++--- ocpmodels/datasets/deup_dataset_creator.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/exps/deup/datasets/new-mc-faenet.yaml b/configs/exps/deup/datasets/new-mc-faenet.yaml index 95aab7a29..56ea29868 100644 --- a/configs/exps/deup/datasets/new-mc-faenet.yaml +++ b/configs/exps/deup/datasets/new-mc-faenet.yaml @@ -7,11 +7,11 @@ job: default: config: faenet-is2re-all wandb_project: ocp-deup - wandb_tags: base-model, MC-D, 2935198 + wandb_tags: base-model, MC-D, 4615191 test_ri: True mode: train - checkpoint: /network/scratch/a/alexandre.duval/ocp/runs/2935198/checkpoints/best_checkpoint.pt - restart_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/2935198 + checkpoint: /network/scratch/a/alexandre.duval/scratch/ocp/runs/4615191/checkpoints/best_checkpoint.pt + restart_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4615191/ model: dropout_lowest_layer: output first_trainable_layer: dropout diff --git a/ocpmodels/datasets/deup_dataset_creator.py b/ocpmodels/datasets/deup_dataset_creator.py index 64d67fd16..4bc6a8bc0 100644 --- a/ocpmodels/datasets/deup_dataset_creator.py +++ b/ocpmodels/datasets/deup_dataset_creator.py @@ -431,7 +431,7 @@ def write_lmdb(self, samples, path, total_size=-1, max_samples=-1): from ocpmodels.datasets.lmdb_dataset import DeupDataset from ocpmodels.common.utils import JOB_ID, RUNS_DIR, make_config_from_conf_str - base_trainer_path = "/network/scratch/s/schmidtv/ocp/runs/3298908" + base_trainer_path = "/network/scratch/a/alexandre.duval/ocp/runs/4615191" # what models to load for inference trainers_conf = { From 72ae772108c95dea3dc5b0501bb46bc813261fef Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Thu, 18 Apr 2024 09:30:41 -0400 Subject: [PATCH 04/27] fa_frames => fa_method --- configs/models/deup_faenet.yaml | 2 +- ocpmodels/datasets/deup_dataset_creator.py | 2 +- ocpmodels/datasets/lmdb_dataset.py | 6 +++--- 
scripts/train_density_estimator.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/configs/models/deup_faenet.yaml b/configs/models/deup_faenet.yaml index 2284687e2..efa779c80 100644 --- a/configs/models/deup_faenet.yaml +++ b/configs/models/deup_faenet.yaml @@ -57,7 +57,7 @@ default: energy_coefficient: 1 frame_averaging: False # 2D, 3D, da, False - fa_frames: False # can be {None, full, random, det, e3, e3-random, e3-det} + fa_method: False # can be {None, full, random, det, e3, e3-random, e3-det} # ------------------- # ----- IS2RE ----- diff --git a/ocpmodels/datasets/deup_dataset_creator.py b/ocpmodels/datasets/deup_dataset_creator.py index 4bc6a8bc0..1af5cdf8f 100644 --- a/ocpmodels/datasets/deup_dataset_creator.py +++ b/ocpmodels/datasets/deup_dataset_creator.py @@ -167,7 +167,7 @@ def load_trainers(self, overrides={}): shared_config = {} shared_config["graph_rewiring"] = self.trainers[0].config["graph_rewiring"] - shared_config["fa_frames"] = self.trainers[0].config["fa_frames"] + shared_config["fa_method"] = self.trainers[0].config["fa_method"] shared_config["frame_averaging"] = self.trainers[0].config["frame_averaging"] # Done! diff --git a/ocpmodels/datasets/lmdb_dataset.py b/ocpmodels/datasets/lmdb_dataset.py index e4ea6bd7b..8f8fb2444 100644 --- a/ocpmodels/datasets/lmdb_dataset.py +++ b/ocpmodels/datasets/lmdb_dataset.py @@ -37,7 +37,7 @@ class LmdbDataset(Dataset): config (dict): Dataset configuration transform (callable, optional): Data transform function. (default: :obj:`None`) - fa_frames (str, optional): type of frame averaging method applied, if any. + fa_method (str, optional): type of frame averaging method applied, if any. adsorbates (str, optional): comma-separated list of adsorbates to filter. If None or "all", no filtering is applied. 
(default: None) @@ -49,7 +49,7 @@ def __init__( self, config, transform=None, - fa_frames=None, + fa_method=None, lmdb_glob=None, adsorbates=None, adsorbates_ref_dir=None, @@ -96,7 +96,7 @@ def __init__( self.filter_per_adsorbates() self.transform = transform - self.fa_method = fa_frames + self.fa_method = fa_method def filter_per_adsorbates(self): """Filter the dataset to only include structures with a specific diff --git a/scripts/train_density_estimator.py b/scripts/train_density_estimator.py index a7a45b327..b5f5bc491 100644 --- a/scripts/train_density_estimator.py +++ b/scripts/train_density_estimator.py @@ -303,7 +303,7 @@ def validate(epoch, model, loader): "num_workers": 0, }, "frame_averaging": None, - "fa_frames": None, + "fa_method": None, "silent": False, "graph_rewiring": "remove-tag-0", "de": { From 040b475d479cd3a3ed7f48f1b357456fa4e8cf94 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Thu, 18 Apr 2024 09:54:07 -0400 Subject: [PATCH 05/27] skip_co = concat is not possible --- .../exps/deup/gnn/{depfaenet.yaml => depfaenet-training.yaml} | 0 configs/exps/deup/gnn/faenet-training.yaml | 4 ++-- ocpmodels/datasets/deup_dataset_creator.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) rename configs/exps/deup/gnn/{depfaenet.yaml => depfaenet-training.yaml} (100%) diff --git a/configs/exps/deup/gnn/depfaenet.yaml b/configs/exps/deup/gnn/depfaenet-training.yaml similarity index 100% rename from configs/exps/deup/gnn/depfaenet.yaml rename to configs/exps/deup/gnn/depfaenet-training.yaml diff --git a/configs/exps/deup/gnn/faenet-training.yaml b/configs/exps/deup/gnn/faenet-training.yaml index 5e5575263..8bf38ec5f 100644 --- a/configs/exps/deup/gnn/faenet-training.yaml +++ b/configs/exps/deup/gnn/faenet-training.yaml @@ -18,7 +18,7 @@ default: runs: - config: faenet-is2re-all - note: "top-runs" + note: "top run no concat" frame_averaging: 2D fa_method: se3-random model: @@ -34,7 +34,7 @@ runs: num_gaussians: 104 num_interactions: 5 second_layer_MLP: False - skip_co: concat + skip_co: False cutoff: 6.0 optim: lr_initial: 0.002 diff --git a/ocpmodels/datasets/deup_dataset_creator.py b/ocpmodels/datasets/deup_dataset_creator.py index 1af5cdf8f..b57522422 100644 --- a/ocpmodels/datasets/deup_dataset_creator.py +++ b/ocpmodels/datasets/deup_dataset_creator.py @@ -306,6 +306,7 @@ def create_deup_dataset( stats = {d: {} for d in dataset_strs} + # Loop on train, val_id, val_ood_cat, val_ood_ads for dataset_name in dataset_strs: deup_samples = [] deup_ds_size = 0 From 83659c643d744a1f156b6c6c931930a58615a967 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Fri, 19 Apr 2024 05:55:16 -0400 Subject: [PATCH 06/27] Merge only relevant changed from disconnected_gnn branch, to run depfaenet --- configs/exps/catalyst/gflownet.yaml | 143 ++++++++++ configs/exps/catalyst/reproduce-configs.yaml | 75 +++++ configs/models/depfaenet.yaml | 271 +++++++++++++++++++ configs/models/painn.yaml | 3 + mila/sbatch.py | 11 +- ocpmodels/common/flags.py | 18 +- ocpmodels/datasets/data_transforms.py | 30 ++ ocpmodels/models/__init__.py | 1 + ocpmodels/models/base_model.py | 26 +- ocpmodels/models/depfaenet.py | 97 +++++++ ocpmodels/preprocessing/graph_rewiring.py | 5 + ocpmodels/trainers/base_trainer.py | 32 ++- ocpmodels/trainers/single_trainer.py | 8 +- scripts/debug_faenet.py | 222 +++++++++++++++ 14 files changed, 929 insertions(+), 13 deletions(-) create mode 100644 configs/exps/catalyst/gflownet.yaml create mode 100644 configs/exps/catalyst/reproduce-configs.yaml create mode 100644 
configs/models/depfaenet.yaml create mode 100644 ocpmodels/models/depfaenet.py create mode 100644 scripts/debug_faenet.py diff --git a/configs/exps/catalyst/gflownet.yaml b/configs/exps/catalyst/gflownet.yaml new file mode 100644 index 000000000..2432f4733 --- /dev/null +++ b/configs/exps/catalyst/gflownet.yaml @@ -0,0 +1,143 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 15:00:00 + +default: + # wandb_name: alvaro-carbonero-math + wandb_project: ocp-alvaro + wandb_tags: "gflownet-model" + test_ri: True + mode: train + # graph_rewiring: remove-tag-0 + graph_rewiring: "" + frame_averaging: 2D + fa_method: se3-random + cp_data_to_tmpdir: True + is_disconnected: true + model: + edge_embed_type: all_rij + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 0 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 288 + num_gaussians: 68 + num_interactions: 5 + second_layer_MLP: False + skip_co: concat + cutoff: 4.0 + optim: + batch_size: 256 + eval_batch_size: 256 + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 9 + eval_every: 0.4 + +runs: + + # - config: faenet-is2re-all + # note: baseline faenet + + # - config: depfaenet-is2re-all + # note: depfaenet baseline + + # - config: depfaenet-is2re-all + # note: depfaenet per-adsorbate + # adsorbates: {'*O', '*OH', '*OH2', '*H'} + + # - config: depfaenet-is2re-all + # note: depfaenet per-adsorbate long string + # adsorbates: '*O, *OH, *OH2, *H' + + # - config: depfaenet-is2re-all + # note: depfaenet per-adsorbate string of a list + # adsorbates: "*O, *OH, *OH2, *H" + + # - config: depfaenet-is2re-all + # note: Trained on selected adsorbate more epochs + # adsorbates: "*O, *OH, *OH2, *H" + # optim: + # max_epochs: 10 + + # - config: depfaenet-is2re-all + # note: depfaenet full data + + # - config: depfaenet-is2re-all + # note: To be used for continue from dir + + # - config: depfaenet-is2re-all + # note: Fine-tune on per-ads-dataset 4 epoch + # continue_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4023244 + # adsorbates: "*O, *OH, *OH2, *H" + # optim: + # max_epochs: 4 + # lr_initial: 0.00015 + + # - config: depfaenet-is2re-all + # note: Fine-tune on per-ads-dataset 10 epoch + # continue_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4023244 + # adsorbates: "*O, *OH, *OH2, *H" + # optim: + # max_epochs: 10 + # lr_initial: 0.00015 + + - config: depfaenet-is2re-all + note: Fine-tune on per-ads-dataset 10 epoch + continue_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4023244 + adsorbates: "*O, *OH, *OH2, *H" + optim: + max_epochs: 20 + lr_initial: 0.0001 + + - config: depfaenet-is2re-all + note: Fine-tune on per-ads-dataset 20 epoch + continue_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4023244 + adsorbates: "*O, *OH, *OH2, *H" + optim: + max_epochs: 20 + lr_initial: 0.00015 + + - config: depfaenet-is2re-all + note: Fine-tune on per-ads-dataset 15 epoch + continue_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4023244 + adsorbates: "*O, *OH, *OH2, *H" + optim: + max_epochs: 15 + lr_initial: 0.0002 + + - config: depfaenet-is2re-all + note: Fine-tune on per-ads-dataset 10 epoch + continue_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4023244 + adsorbates: "*O, *OH, *OH2, *H" + optim: + max_epochs: 10 + lr_initial: 0.0001 + + - config: depfaenet-is2re-all + note: Fine-tune on per-ads-dataset starting from fine-tuned model + continue_from_dir: 
/network/scratch/a/alexandre.duval/ocp/runs/4071859 + adsorbates: "*O, *OH, *OH2, *H" + optim: + max_epochs: 10 + lr_initial: 0.0001 + + - config: depfaenet-is2re-all + note: Trained on selected adsorbate + adsorbates: "*O, *OH, *OH2, *H" + optim: + max_epochs: 25 + lr_initial: 0.0001 + + - config: depfaenet-is2re-all + note: Trained on selected adsorbate + adsorbates: "*O, *OH, *OH2, *H" + optim: + max_epochs: 25 diff --git a/configs/exps/catalyst/reproduce-configs.yaml b/configs/exps/catalyst/reproduce-configs.yaml new file mode 100644 index 000000000..c4c834585 --- /dev/null +++ b/configs/exps/catalyst/reproduce-configs.yaml @@ -0,0 +1,75 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 15:00:00 + +default: + # wandb_name: alvaro-carbonero-math + wandb_project: ocp-alvaro + wandb_tags: "reproduce-best-config" + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + note: "repoduce-top-run" + frame_averaging: 2D + fa_method: se3-random + cp_data_to_tmpdir: True + is_disconnected: true + model: + edge_embed_type: all_rij + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 288 + num_gaussians: 68 + num_interactions: 5 + second_layer_MLP: False + skip_co: concat + cutoff: 4.0 + optim: + batch_size: 256 + eval_batch_size: 256 + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 9 + eval_every: 0.4 + +runs: + + - config: faenet-is2re-all + note: baseline faenet + + - config: indfaenet-is2re-all + note: baseline with top configs + + - config: indfaenet-is2re-all + note: baseline with runs' configs + model: + tag_hidden_channels: 32 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 528 + num_filters: 672 + num_gaussians: 148 + num_interactions: 5 + second_layer_MLP: False + skip_co: concat + + - config: depfaenet-is2re-all + note: baseline with top configs + + - config: indfaenet-is2re-all + note: so that ads get old dimensions + model: + hidden_channels: 704 + num_gaussians: 200 + num_filters: 896 \ No newline at end of file diff --git a/configs/models/depfaenet.yaml b/configs/models/depfaenet.yaml new file mode 100644 index 000000000..852ebc3bf --- /dev/null +++ b/configs/models/depfaenet.yaml @@ -0,0 +1,271 @@ +default: + model: + name: depfaenet + act: swish + hidden_channels: 128 + num_filters: 100 + num_interactions: 3 + num_gaussians: 100 + cutoff: 6.0 + use_pbc: True + regress_forces: False + # drlab attributes: + tag_hidden_channels: 0 # 32 + pg_hidden_channels: 0 # 32 -> period & group embedding hidden channels + phys_embeds: False # True + phys_hidden_channels: 0 + energy_head: False # can be {False, weighted-av-initial-embeds, weighted-av-final-embeds, pooling, graclus, random} + # faenet new features + skip_co: False # output skip connections {False, "add", "concat"} + second_layer_MLP: False # in EmbeddingBlock + complex_mp: False + edge_embed_type: rij # {'rij','all_rij','sh', 'all'}) + mp_type: base # {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env'} + graph_norm: False # bool + att_heads: 1 # int + force_decoder_type: "mlp" # can be {"" or "simple"} | only used if regress_forces is True + force_decoder_model_config: + simple: + hidden_channels: 128 + norm: batch1d # batch1d, layer or null + mlp: + hidden_channels: 256 + norm: batch1d # batch1d, layer or null + res: + 
hidden_channels: 128 + norm: batch1d # batch1d, layer or null + res_updown: + hidden_channels: 128 + norm: batch1d # batch1d, layer or null + optim: + batch_size: 64 + eval_batch_size: 64 + num_workers: 4 + lr_gamma: 0.1 + lr_initial: 0.001 + warmup_factor: 0.2 + max_epochs: 20 + energy_grad_coefficient: 10 + force_coefficient: 30 + energy_coefficient: 1 + + frame_averaging: False # 2D, 3D, da, False + fa_frames: False # can be {None, full, random, det, e3, e3-random, e3-det} + +# ------------------- +# ----- IS2RE ----- +# ------------------- + +is2re: + # *** Important note *** + # The total number of gpus used for this run was 1. + # If the global batch size (num_gpus * batch_size) is modified + # the lr_milestones and warmup_steps need to be adjusted accordingly. + 10k: + optim: + lr_initial: 0.005 + lr_milestones: # epochs at which lr_initial <- lr_initial * lr_gamma + - 1562 + - 2343 + - 3125 + warmup_steps: 468 + max_epochs: 20 + + 100k: + model: + hidden_channels: 256 + optim: + lr_initial: 0.005 + lr_milestones: # epochs at which lr_initial <- lr_initial * lr_gamma + - 1562 + - 2343 + - 3125 + warmup_steps: 468 + max_epochs: 20 + + all: + model: + hidden_channels: 384 + num_interactions: 4 + optim: + batch_size: 256 + eval_batch_size: 256 + lr_initial: 0.001 + lr_gamma: 0.1 + lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma + - 18000 + - 27000 + - 37000 + warmup_steps: 6000 + max_epochs: 20 + +# ------------------ +# ----- S2EF ----- +# ------------------ + +# For 2 GPUs + +s2ef: + default: + model: + num_interactions: 4 + hidden_channels: 750 + num_gaussians: 200 + num_filters: 256 + regress_forces: "direct" + force_coefficient: 30 + energy_grad_coefficient: 10 + optim: + batch_size: 96 + eval_batch_size: 96 + warmup_factor: 0.2 + lr_gamma: 0.1 + lr_initial: 0.0001 + max_epochs: 15 + warmup_steps: 30000 + lr_milestones: + - 55000 + - 75000 + - 10000 + + 200k: {} + + # 1 gpus + 2M: + model: + num_interactions: 5 + hidden_channels: 1024 + num_gaussians: 200 + num_filters: 256 + optim: + batch_size: 192 + eval_batch_size: 192 + + 20M: {} + + all: {} + +qm9: + default: + model: + act: swish + att_heads: 1 + complex_mp: true + cutoff: 6.0 + edge_embed_type: all_rij + energy_head: '' + graph_norm: true + graph_rewiring: null + hidden_channels: 400 + max_num_neighbors: 30 + mp_type: updownscale_base + num_filters: 480 + num_gaussians: 100 + num_interactions: 5 + otf_graph: false + pg_hidden_channels: 32 + phys_embeds: false + phys_hidden_channels: 0 + regress_forces: '' + second_layer_MLP: true + skip_co: true + tag_hidden_channels: 0 + use_pbc: false + + optim: + batch_size: 64 + es_min_abs_change: 1.0e-06 + es_patience: 20 + es_warmup_epochs: 600 + eval_batch_size: 64 + factor: 0.9 + lr_initial: 0.0003 + loss_energy: mse + lr_gamma: 0.1 + lr_initial: 0.001 + max_epochs: 1500 + min_lr: 1.0e-06 + mode: min + optimizer: AdamW + patience: 15 + scheduler: ReduceLROnPlateau + threshold: 0.0001 + threshold_mode: abs + verbose: true + warmup_factor: 0.2 + warmup_steps: 3000 + + 10k: {} + all: {} + +qm7x: + default: + model: # SOTA settings + act: swish + att_heads: 1 + complex_mp: true + cutoff: 5.0 + edge_embed_type: all_rij + energy_head: false + force_decoder_model_config: + mlp: + hidden_channels: 256 + norm: batch1d + res: + hidden_channels: 128 + norm: batch1d + res_updown: + hidden_channels: 128 + norm: layer + simple: + hidden_channels: 128 + norm: batch1d + force_decoder_type: res_updown + graph_norm: false + hidden_channels: 500 + max_num_neighbors: 40 + mp_type: 
updownscale_base + num_filters: 400 + num_gaussians: 50 + num_interactions: 5 + otf_graph: false + pg_hidden_channels: 32 + phys_embeds: true + phys_hidden_channels: 0 + regress_forces: direct_with_gradient_target + second_layer_MLP: true + skip_co: false + tag_hidden_channels: 0 + use_pbc: false + + optim: + batch_size: 100 + energy_grad_coefficient: 5 + eval_batch_size: 100 + eval_every: 0.34 + factor: 0.75 + force_coefficient: 75 + loss_energy: mae + loss_force: mse + lr_gamma: 0.1 + lr_initial: 0.000193 + max_steps: 4000000 + min_lr: 1.0e-06 + mode: min + optimizer: AdamW + scheduler: ReduceLROnPlateau + threshold: 0.001 + threshold_mode: abs + verbose: true + warmup_factor: 0.2 + warmup_steps: 3000 + + all: {} + 1k: {} + +qm9: + default: + model: + use_pbc: False + all: {} + 10k: {} diff --git a/configs/models/painn.yaml b/configs/models/painn.yaml index 2c0abac11..c138652a8 100644 --- a/configs/models/painn.yaml +++ b/configs/models/painn.yaml @@ -2,6 +2,9 @@ default: model: name: painn use_pbc: True + optim: + num_workers: 4 + eval_batch_size: 64 # ------------------- # ----- IS2RE ----- # ------------------- diff --git a/mila/sbatch.py b/mila/sbatch.py index ed8fa878d..b6417adf1 100644 --- a/mila/sbatch.py +++ b/mila/sbatch.py @@ -1,12 +1,13 @@ -from minydra import resolved_args, MinyDict -from pathlib import Path -from datetime import datetime import os +import re import subprocess -from shutil import copyfile import sys -import re +from datetime import datetime +from pathlib import Path +from shutil import copyfile + import yaml +from minydra import MinyDict, resolved_args IS_DRAC = ( "narval.calcul.quebec" in os.environ.get("HOSTNAME", "") diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index a6fbf20d0..761e61dac 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -87,12 +87,14 @@ def add_core_args(self): "--checkpoint", type=str, help="Model checkpoint to load" ) self.parser.add_argument( - "--continue_from_dir", type=str, help="Run to continue, loading its config" + "--continue_from_dir", + type=str, + help="Continue an existing run, loading its config and overwriting desired arguments", ) self.parser.add_argument( "--restart_from_dir", type=str, - help="Run to restart, loading its config and overwriting " + help="Restart training from an existing run, loading its config and overwriting args" + " from the command-line", ) self.parser.add_argument( @@ -293,6 +295,18 @@ def add_core_args(self): help="Number of validation loops to run in order to collect inference" + " timing stats", ) + self.parser.add_argument( + "--is_disconnected", + type=bool, + default=False, + help="Eliminates edges between catalyst and adsorbate.", + ) + self.parser.add_argument( + "--lowest_energy_only", + type=bool, + default=False, + help="Makes trainer use the lowest energy data point for every (catalyst, adsorbate, cell) tuple. 
ONLY USE WITH ALL DATASET", + ) flags = Flags() diff --git a/ocpmodels/datasets/data_transforms.py b/ocpmodels/datasets/data_transforms.py index 6c26d2a9a..17a63dfa5 100644 --- a/ocpmodels/datasets/data_transforms.py +++ b/ocpmodels/datasets/data_transforms.py @@ -127,6 +127,35 @@ def __call__(self, data): return self.rewiring_func(data) +class Disconnected(Transform): + def __init__(self, is_disconnected=False) -> None: + self.inactive = not is_disconnected + + def edge_classifier(self, edge_index, tags): + edges_with_tags = tags[ + edge_index.type(torch.long) + ] # Tensor with shape=edge_index.shape where every entry is a tag + filt1 = edges_with_tags[0] == edges_with_tags[1] + filt2 = (edges_with_tags[0] != 2) * (edges_with_tags[1] != 2) + + # Edge is removed if tags are different (R1), and at least one end has tag 2 (R2). We want ~(R1*R2) = ~R1+~R2. + # filt1 = ~R1. Let L1 be that head has tag 2, and L2 is that tail has tag 2. Then R2 = L1+L2, so ~R2 = ~L1*~L2 = filt2. + + return filt1 + filt2 + + def __call__(self, data): + if self.inactive: + return data + + values = self.edge_classifier(data.edge_index, data.tags) + + data.edge_index = data.edge_index[:, values] + data.cell_offsets = data.cell_offsets[values, :] + data.distances = data.distances[values] + + return data + + class Compose: # https://pytorch.org/vision/stable/_modules/torchvision/transforms/transforms.html#Compose def __init__(self, transforms): @@ -167,5 +196,6 @@ def get_transforms(trainer_config): AddAttributes(), GraphRewiring(trainer_config.get("graph_rewiring")), FrameAveraging(trainer_config["frame_averaging"], trainer_config["fa_method"]), + Disconnected(trainer_config["is_disconnected"]), ] return Compose(transforms) diff --git a/ocpmodels/models/__init__.py b/ocpmodels/models/__init__.py index a722f7817..c15c217b0 100644 --- a/ocpmodels/models/__init__.py +++ b/ocpmodels/models/__init__.py @@ -7,6 +7,7 @@ from .cgcnn import CGCNN # noqa: F401 from .dimenet import DimeNet # noqa: F401 from .faenet import FAENet # noqa: F401 +from .depfaenet import depFAENet # noqa: F401 from .gemnet.gemnet import GemNetT # noqa: F401 from .dimenet_plus_plus import DimeNetPlusPlus # noqa: F401 from .forcenet import ForceNet # noqa: F401 diff --git a/ocpmodels/models/base_model.py b/ocpmodels/models/base_model.py index 4a5c84a20..e2df0e737 100644 --- a/ocpmodels/models/base_model.py +++ b/ocpmodels/models/base_model.py @@ -4,10 +4,12 @@ This source code is licensed under the MIT license found in the LICENSE file in the root directory of this source tree. """ + import logging import torch import torch.nn as nn +from torch_geometric.data import HeteroData from torch_geometric.nn import radius_graph from ocpmodels.common.utils import ( @@ -74,7 +76,14 @@ def forward(self, data, mode="train", regress_forces=None, q=None): # energy gradient w.r.t. 
positions will be computed if mode == "train" or self.regress_forces == "from_energy": - data.pos.requires_grad_(True) + if type(data) is list: + data[0].pos.requires_grad_(True) + data[1].pos.requires_grad_(True) + elif type(data[0]) is HeteroData: + data["adsorbate"].pos.requires_grad_(True) + data["catalyst"].pos.requires_grad_(True) + else: + data.pos.requires_grad_(True) # predict energy preds = self.energy_forward(data, q=q) @@ -85,7 +94,20 @@ def forward(self, data, mode="train", regress_forces=None, q=None): forces = self.forces_forward(preds) if mode == "train" or self.regress_forces == "from_energy": - grad_forces = self.forces_as_energy_grad(data.pos, preds["energy"]) + if ( + "gemnet" in self.__class__.__name__.lower() + and self.regress_forces == "from_energy" + ): + # gemnet forces are already computed + grad_forces = forces + else: + # compute forces from energy gradient (fall back to adsorbate positions for hetero inputs) + try: + grad_forces = self.forces_as_energy_grad( + data.pos, preds["energy"] + ) + except Exception: + grad_forces = self.forces_as_energy_grad( + data["adsorbate"].pos, preds["energy"] + ) if self.regress_forces == "from_energy": # predicted forces are the energy gradient diff --git a/ocpmodels/models/depfaenet.py b/ocpmodels/models/depfaenet.py new file mode 100644 index 000000000..25f6a0968 --- /dev/null +++ b/ocpmodels/models/depfaenet.py @@ -0,0 +1,97 @@ +import torch +from torch.nn import Linear +from torch import nn +from torch_scatter import scatter + +from ocpmodels.models.faenet import FAENet +from ocpmodels.models.faenet import OutputBlock as conOutputBlock +from ocpmodels.common.registry import registry +from ocpmodels.common.utils import conditional_grad +from ocpmodels.models.utils.activations import swish + +from torch_geometric.data import Batch + + +class discOutputBlock(conOutputBlock): + def __init__(self, energy_head, hidden_channels, act, disconnected_mlp=False): + super(discOutputBlock, self).__init__(energy_head, hidden_channels, act) + + # We modify the last output linear function to make the output a vector + self.lin2 = Linear(hidden_channels // 2, hidden_channels // 2) + + self.disconnected_mlp = disconnected_mlp + if self.disconnected_mlp: + self.ads_lin = Linear(hidden_channels // 2, hidden_channels // 2) + self.cat_lin = Linear(hidden_channels // 2, hidden_channels // 2) + + # Combines the hidden representation of each to a scalar. + self.combination = nn.Sequential( + Linear(hidden_channels // 2 * 2, hidden_channels // 2), + swish, + Linear(hidden_channels // 2, 1), + ) + + def tags_saver(self, tags): + self.current_tags = tags + + def forward(self, h, edge_index, edge_weight, batch, alpha): + if ( + self.energy_head == "weighted-av-final-embeds" + ): # Right now, this is the only available option. + alpha = self.w_lin(h) + + elif self.energy_head == "graclus": + h, batch = self.graclus(h, edge_index, edge_weight, batch) + + elif self.energy_head in {"pooling", "random"}: + h, batch, pooling_loss = self.hierarchical_pooling( + h, edge_index, edge_weight, batch + ) + + # MLP + h = self.lin1(h) + h = self.lin2(self.act(h)) + + if self.energy_head in { + "weighted-av-initial-embeds", + "weighted-av-final-embeds", + }: + h = h * alpha + + # We pool separately and then we concatenate. 
+ ads = self.current_tags == 2 + cat = ~ads + + ads_out = scatter(h, batch * ads, dim=0, reduce="add") + cat_out = scatter(h, batch * cat, dim=0, reduce="add") + + if self.disconnected_mlp: + ads_out = self.ads_lin(ads_out) + cat_out = self.cat_lin(cat_out) + + system = torch.cat([ads_out, cat_out], dim=1) + + # Finally, we predict a number. + energy = self.combination(system) + + return energy + + +@registry.register_model("depfaenet") +class depFAENet(FAENet): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + # We replace the old output block by the new output block + self.disconnected_mlp = kwargs.get("disconnected_mlp", False) + self.output_block = discOutputBlock( + self.energy_head, kwargs["hidden_channels"], self.act, self.disconnected_mlp + ) + + @conditional_grad(torch.enable_grad()) + def energy_forward(self, data): + # We need to save the tags so this step is necessary. + self.output_block.tags_saver(data.tags) + pred = super().energy_forward(data) + + return pred diff --git a/ocpmodels/preprocessing/graph_rewiring.py b/ocpmodels/preprocessing/graph_rewiring.py index 2f3b103a6..b9115e907 100644 --- a/ocpmodels/preprocessing/graph_rewiring.py +++ b/ocpmodels/preprocessing/graph_rewiring.py @@ -36,6 +36,11 @@ def remove_tag0_nodes(data): data.tags = data.tags[non_sub] if hasattr(data, "pos_relaxed"): data.pos_relaxed = data.pos_relaxed[non_sub, :] + if hasattr(data, "query"): + data.h = data.h[non_sub, :] + data.query = data.query[non_sub, :] + data.key = data.key[non_sub, :] + data.value = data.value[non_sub, :] # per-edge tensors data.edge_index = data.edge_index[:, neither_is_sub] diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index ea1537737..e871027ef 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -8,6 +8,7 @@ import errno import logging import os +import pickle import random import time from abc import ABC, abstractmethod @@ -24,7 +25,7 @@ from rich.console import Console from rich.table import Table from torch.nn.parallel.distributed import DistributedDataParallel -from torch.utils.data import DataLoader +from torch.utils.data import DataLoader, Subset from torch_geometric.data import Batch from tqdm import tqdm @@ -57,6 +58,7 @@ class BaseTrainer(ABC): def __init__(self, load=True, **kwargs): run_dir = kwargs["run_dir"] + model_name = kwargs["model"].pop( "name", kwargs.get("model_name", "Unknown - base_trainer issue") ) @@ -173,9 +175,21 @@ def __init__(self, load=True, **kwargs): ) (run_dir / f"config-{JOB_ID}.yaml").write_text(yaml.dump(self.config)) - if load: - self.load() + # Here's the models whose edges are removed as a transform + transform_models = [ + "depfaenet", + ] + if self.config["is_disconnected"]: + print("\n\nHeads up: cat-ads edges being removed!") + if self.config["model_name"] in transform_models: + if not self.config["is_disconnected"]: + print( + f"\n\nWhen using {self.config['model_name']},", + "the flag 'is_disconnected' should be used! 
The flag has been turned on.\n", + ) + self.config["is_disconnected"] = True + self.load() self.evaluator = Evaluator( task=self.task_name, model_regresses_forces=self.config["model"].get("regress_forces", ""), @@ -244,6 +258,7 @@ def get_dataloader(self, dataset, sampler): pin_memory=True, batch_sampler=sampler, ) + return loader def load_datasets(self): @@ -281,6 +296,16 @@ def load_datasets(self): silent=self.silent, ) + if self.config["lowest_energy_only"]: + with open( + "/network/scratch/a/alvaro.carbonero/lowest_energy.pkl", "rb" + ) as fp: + good_indices = pickle.load(fp) + good_indices = list(good_indices) + + self.real_dataset = self.datasets["train"] + self.datasets["train"] = Subset(self.datasets["train"], good_indices) + shuffle = False if "train" in split: shuffle = True @@ -402,6 +427,7 @@ def load_model(self): "task_name": self.task_name, }, **self.config["model"], + "model_name": self.config["model_name"], } self.model = registry.get_model_class(self.config["model_name"])( diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index c8850fe1a..25f82ec9a 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -227,6 +227,8 @@ def train( # Calculate start_epoch from step instead of loading the epoch number # to prevent inconsistencies due to different batch size in checkpoint. + if self.config["continue_from_dir"] is not None and self.config["adsorbates"] not in {None, "all"}: + self.step = 0 start_epoch = self.step // n_train max_epochs = self.config["optim"]["max_epochs"] timer = Times() @@ -498,7 +500,11 @@ def end_of_training( # Close datasets if debug_batches < 0: for ds in self.datasets.values(): - ds.close_db() + try: + ds.close_db() + except: + assert self.config["lowest_energy_only"] == True + self.real_dataset.close_db() def model_forward(self, batch_list, mode="train", q=None): """Perform a forward pass of the model when frame averaging is applied. diff --git a/scripts/debug_faenet.py b/scripts/debug_faenet.py new file mode 100644 index 000000000..56d79c3d6 --- /dev/null +++ b/scripts/debug_faenet.py @@ -0,0 +1,222 @@ +""" +Copyright (c) Facebook, Inc. and its affiliates. + +This source code is licensed under the MIT license found in the +LICENSE file in the root directory of this source tree. +""" + +import logging +import os +import time +import traceback +import sys +import torch +from yaml import dump + +from ocpmodels.common import dist_utils +from ocpmodels.common.flags import flags +from ocpmodels.common.registry import registry +from ocpmodels.common.utils import ( + JOB_ID, + auto_note, + build_config, + merge_dicts, + move_lmdb_data_to_slurm_tmpdir, + resolve, + setup_imports, + setup_logging, + update_from_sbatch_py_vars, + set_min_hidden_channels, +) +from ocpmodels.common.orion_utils import ( + continue_orion_exp, + load_orion_exp, + sample_orion_hparams, +) +from ocpmodels.trainers import BaseTrainer + +# os.environ["CUDA_LAUNCH_BLOCKING"] = "1" +torch.multiprocessing.set_sharing_strategy("file_system") + + +def print_warnings(): + warnings = [ + "`max_num_neighbors` is set to 40. 
This should be tuned per model.", + "`tag_specific_weights` is not handled for " + + "`regress_forces: direct_with_gradient_target` in compute_loss()", + ] + print("\n" + "-" * 80 + "\n") + print("šŸ›‘ OCP-DR-Lab Warnings (nota benes):") + for warning in warnings: + print(f" ā€¢ {warning}") + print("Remove warnings when they are fixed in the code/configs.") + print("\n" + "-" * 80 + "\n") + + +def wrap_up(args, start_time, error=None, signal=None, trainer=None): + total_time = time.time() - start_time + logging.info(f"Total time taken: {total_time}") + if trainer and trainer.logger is not None: + trainer.logger.log({"Total time": total_time}) + + if args.distributed: + print( + "\nWaiting for all processes to finish with dist_utils.cleanup()...", + end="", + ) + dist_utils.cleanup() + print("Done!") + + if "interactive" not in os.popen(f"squeue -hj {JOB_ID}").read(): + print("\nSelf-canceling SLURM job in 32s", JOB_ID) + os.popen(f"sleep 32 && scancel {JOB_ID}") + + if trainer and trainer.logger: + trainer.logger.finish(error or signal) + + +if __name__ == "__main__": + error = signal = orion_exp = orion_trial = trainer = None + orion_race_condition = False + hparams = {} + + setup_logging() + + parser = flags.get_parser() + args, override_args = parser.parse_known_args() + args = update_from_sbatch_py_vars(args) + if args.logdir: + args.logdir = resolve(args.logdir) + + # -- Build config + + args.wandb_name = "alvaro-carbonero-math" + args.wandb_project = "ocp-alvaro" + args.test_ri = True + args.mode = "train" + args.graph_rewiring = "remove-tag-0" + args.cp_data_to_tmpdir = True + args.config = "indfaenet-is2re-10k" + args.frame_averaging = "2D" + args.fa_frames = "se3-random" + + trainer_config = build_config(args, override_args) + + if dist_utils.is_master(): + trainer_config = move_lmdb_data_to_slurm_tmpdir(trainer_config) + dist_utils.synchronize() + + trainer_config["dataset"] = dist_utils.broadcast_from_master( + trainer_config["dataset"] + ) + + trainer_config["model"]["edge_embed_type"] = "all_rij" + trainer_config["model"]["mp_type"] = "updownscale" + trainer_config["model"]["phys_embeds"] = True + trainer_config["model"]["tag_hidden_channels"] = 32 + trainer_config["model"]["pg_hidden_channels"] = 64 + trainer_config["model"]["energy_head"] = "weighted-av-final-embeds" + trainer_config["model"]["complex_mp"] = False + trainer_config["model"]["graph_norm"] = True + trainer_config["model"]["hidden_channels"] = 352 + trainer_config["model"]["num_filters"] = 448 + trainer_config["model"]["num_gaussians"] = 99 + trainer_config["model"]["num_interactions"] = 6 + trainer_config["model"]["second_layer_MLP"] = True + trainer_config["model"]["skip_co"] = "concat" + # trainer_config["model"]["transformer_out"] = False + trainer_config["model"]["afaenet_gat_mode"] = "v1" + # trainer_config["model"]["disconnected_mlp"] = True + + # trainer_config["optim"]["batch_sizes"] = 256 + # trainer_config["optim"]["eval_batch_sizes"] = 256 + trainer_config["optim"]["lr_initial"] = 0.0019 + trainer_config["optim"]["scheduler"] = "LinearWarmupCosineAnnealingLR" + trainer_config["optim"]["max_epochs"] = 20 + trainer_config["optim"]["eval_every"] = 0.4 + + # -- Initial setup + + setup_imports() + print("\nšŸš© All things imported.\n") + start_time = time.time() + + try: + # -- Orion + + if args.orion_exp_config_path and dist_utils.is_master(): + orion_exp = load_orion_exp(args) + hparams, orion_trial = sample_orion_hparams(orion_exp, trainer_config) + + if hparams.get("orion_race_condition"): + 
logging.warning("\n\n ā›”ļø Orion race condition. Stopping here.\n\n") + wrap_up(args, start_time, error, signal) + sys.exit() + + hparams = dist_utils.broadcast_from_master(hparams) + if hparams: + print("\nšŸ’Ž Received hyper-parameters from Orion:") + print(dump(hparams), end="\n") + trainer_config = merge_dicts(trainer_config, hparams) + + # -- Setup trainer + trainer_config = continue_orion_exp(trainer_config) + trainer_config = auto_note(trainer_config) + trainer_config = set_min_hidden_channels(trainer_config) + + try: + cls = registry.get_trainer_class(trainer_config["trainer"]) + trainer: BaseTrainer = cls(**trainer_config) + except Exception as e: + traceback.print_exc() + logging.warning(f"\nšŸ’€ Error in trainer initialization: {e}\n") + signal = "trainer_init_error" + + if signal is None: + task = registry.get_task_class(trainer_config["mode"])(trainer_config) + task.setup(trainer) + print_warnings() + + # -- Start Training + + signal = task.run() + + # -- End of training + + # handle job preemption / time limit + if signal == "SIGTERM": + print("\nJob was preempted. Wrapping up...\n") + if trainer: + trainer.close_datasets() + + dist_utils.synchronize() + + objective = dist_utils.broadcast_from_master( + trainer.objective if trainer else None + ) + + if orion_exp is not None: + if objective is None: + if signal == "loss_is_nan": + objective = 1e12 + print("Received NaN objective from worker. Setting to 1e12.") + if signal == "trainer_init_error": + objective = 1e12 + print( + "Received trainer_init_error from worker.", + "Setting objective to 1e12.", + ) + if objective is not None: + orion_exp.observe( + orion_trial, + [{"type": "objective", "name": "energy_mae", "value": objective}], + ) + else: + print("Received None objective from worker. 
Skipping observation.") + + except Exception: + error = True + print(traceback.format_exc()) + + finally: + wrap_up(args, start_time, error, signal, trainer=trainer) From 442ca59bcd788167e7c0533dde0a8b7c0e9a1770 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Fri, 19 Apr 2024 06:54:22 -0400 Subject: [PATCH 07/27] remove edge_embed_type --- configs/exps/catalyst/gflownet.yaml | 76 ++++++++++++------------ configs/exps/is2re/top-configs.yaml | 2 - configs/exps/orion/faenet-is2re-all.yaml | 2 - configs/exps/orion/faenet-qm9.yaml | 2 - configs/models/depfaenet.yaml | 3 - configs/models/deup_faenet.yaml | 3 - scripts/debug_faenet.py | 1 - scripts/test_all.py | 6 +- 8 files changed, 40 insertions(+), 55 deletions(-) diff --git a/configs/exps/catalyst/gflownet.yaml b/configs/exps/catalyst/gflownet.yaml index 2432f4733..4499b6e2a 100644 --- a/configs/exps/catalyst/gflownet.yaml +++ b/configs/exps/catalyst/gflownet.yaml @@ -6,9 +6,8 @@ job: time: 15:00:00 default: - # wandb_name: alvaro-carbonero-math - wandb_project: ocp-alvaro - wandb_tags: "gflownet-model" + wandb_project: ocp-deup # ocp-alvaro + wandb_tags: gflownet-model, depfaenet test_ri: True mode: train # graph_rewiring: remove-tag-0 @@ -18,7 +17,6 @@ default: cp_data_to_tmpdir: True is_disconnected: true model: - edge_embed_type: all_rij mp_type: updownscale_base phys_embeds: True tag_hidden_channels: 0 @@ -89,55 +87,55 @@ runs: # max_epochs: 10 # lr_initial: 0.00015 - - config: depfaenet-is2re-all - note: Fine-tune on per-ads-dataset 10 epoch - continue_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4023244 - adsorbates: "*O, *OH, *OH2, *H" - optim: - max_epochs: 20 - lr_initial: 0.0001 + # - config: depfaenet-is2re-all + # note: Fine-tune on per-ads-dataset 10 epoch + # continue_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4023244 + # adsorbates: "*O, *OH, *OH2, *H" + # optim: + # max_epochs: 20 + # lr_initial: 0.0001 - - config: depfaenet-is2re-all - note: Fine-tune on per-ads-dataset 20 epoch - continue_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4023244 - adsorbates: "*O, *OH, *OH2, *H" - optim: - max_epochs: 20 - lr_initial: 0.00015 + # - config: depfaenet-is2re-all + # note: Fine-tune on per-ads-dataset 20 epoch + # continue_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4023244 + # adsorbates: "*O, *OH, *OH2, *H" + # optim: + # max_epochs: 20 + # lr_initial: 0.00015 - config: depfaenet-is2re-all - note: Fine-tune on per-ads-dataset 15 epoch + note: Depfaenet per-ads-dataset continue_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4023244 adsorbates: "*O, *OH, *OH2, *H" optim: - max_epochs: 15 + max_epochs: 12 lr_initial: 0.0002 - config: depfaenet-is2re-all - note: Fine-tune on per-ads-dataset 10 epoch + note: Depfaenet per-ads-dataset continue_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4023244 adsorbates: "*O, *OH, *OH2, *H" optim: max_epochs: 10 lr_initial: 0.0001 - - config: depfaenet-is2re-all - note: Fine-tune on per-ads-dataset starting from fine-tuned model - continue_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4071859 - adsorbates: "*O, *OH, *OH2, *H" - optim: - max_epochs: 10 - lr_initial: 0.0001 + # - config: depfaenet-is2re-all + # note: Fine-tune on per-ads-dataset starting from fine-tuned model + # continue_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4071859 + # adsorbates: "*O, *OH, *OH2, *H" + # optim: + # max_epochs: 10 + # lr_initial: 0.0001 - - config: depfaenet-is2re-all - note: Trained on selected adsorbate - adsorbates: "*O, *OH, *OH2, 
*H" - optim: - max_epochs: 25 - lr_initial: 0.0001 + # - config: depfaenet-is2re-all + # note: Trained on selected adsorbate + # adsorbates: "*O, *OH, *OH2, *H" + # optim: + # max_epochs: 25 + # lr_initial: 0.0001 - - config: depfaenet-is2re-all - note: Trained on selected adsorbate - adsorbates: "*O, *OH, *OH2, *H" - optim: - max_epochs: 25 + # - config: depfaenet-is2re-all + # note: Trained on selected adsorbate + # adsorbates: "*O, *OH, *OH2, *H" + # optim: + # max_epochs: 25 diff --git a/configs/exps/is2re/top-configs.yaml b/configs/exps/is2re/top-configs.yaml index cf4e79fe4..6fa882648 100644 --- a/configs/exps/is2re/top-configs.yaml +++ b/configs/exps/is2re/top-configs.yaml @@ -9,8 +9,6 @@ default: test_ri: True mode: train graph_rewiring: remove-tag-0 - model: - edge_embed_type: all_rij wandb_tags: "best-config" optim: batch_size: 256 diff --git a/configs/exps/orion/faenet-is2re-all.yaml b/configs/exps/orion/faenet-is2re-all.yaml index b3a1ccbca..baecd59d9 100644 --- a/configs/exps/orion/faenet-is2re-all.yaml +++ b/configs/exps/orion/faenet-is2re-all.yaml @@ -14,8 +14,6 @@ default: wandb_tags: is2re-all, orion cp_data_to_tmpdir: true graph_rewiring: remove-tag-0 - model: - edge_embed_type: all_rij frame_averaging: 2D fa_method: random optim: diff --git a/configs/exps/orion/faenet-qm9.yaml b/configs/exps/orion/faenet-qm9.yaml index 2d26414fd..722ed4472 100644 --- a/configs/exps/orion/faenet-qm9.yaml +++ b/configs/exps/orion/faenet-qm9.yaml @@ -39,8 +39,6 @@ default: targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, batch_size frame_averaging: 3D fa_method: random - model: - edge_embed_type: all_rij orion: # Remember to change the experiment name if you change anything in the search space diff --git a/configs/models/depfaenet.yaml b/configs/models/depfaenet.yaml index 852ebc3bf..da19d6e35 100644 --- a/configs/models/depfaenet.yaml +++ b/configs/models/depfaenet.yaml @@ -19,7 +19,6 @@ default: skip_co: False # output skip connections {False, "add", "concat"} second_layer_MLP: False # in EmbeddingBlock complex_mp: False - edge_embed_type: rij # {'rij','all_rij','sh', 'all'}) mp_type: base # {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env'} graph_norm: False # bool att_heads: 1 # int @@ -152,7 +151,6 @@ qm9: att_heads: 1 complex_mp: true cutoff: 6.0 - edge_embed_type: all_rij energy_head: '' graph_norm: true graph_rewiring: null @@ -205,7 +203,6 @@ qm7x: att_heads: 1 complex_mp: true cutoff: 5.0 - edge_embed_type: all_rij energy_head: false force_decoder_model_config: mlp: diff --git a/configs/models/deup_faenet.yaml b/configs/models/deup_faenet.yaml index efa779c80..bdc723bb5 100644 --- a/configs/models/deup_faenet.yaml +++ b/configs/models/deup_faenet.yaml @@ -25,7 +25,6 @@ default: skip_co: False # output skip connections {False, "add", "concat"} second_layer_MLP: False # in EmbeddingBlock complex_mp: False - edge_embed_type: rij # {'rij','all_rij','sh', 'all'}) mp_type: base # {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env'} graph_norm: False # bool att_heads: 1 # int @@ -153,7 +152,6 @@ qm9: att_heads: 1 complex_mp: true cutoff: 6.0 - edge_embed_type: all_rij energy_head: '' graph_norm: true graph_rewiring: null @@ -205,7 +203,6 @@ qm7x: att_heads: 1 complex_mp: true cutoff: 5.0 - edge_embed_type: all_rij energy_head: false force_decoder_model_config: mlp: diff --git a/scripts/debug_faenet.py b/scripts/debug_faenet.py index 56d79c3d6..6e55aef82 100644 --- a/scripts/debug_faenet.py +++ b/scripts/debug_faenet.py 
@@ -110,7 +110,6 @@ def wrap_up(args, start_time, error=None, signal=None, trainer=None): trainer_config["dataset"] ) - trainer_config["model"]["edge_embed_type"] = "all_rij" trainer_config["model"]["mp_type"] = "updownscale" trainer_config["model"]["phys_embeds"] = True trainer_config["model"]["tag_hidden_channels"] = 32 diff --git a/scripts/test_all.py b/scripts/test_all.py index 783f6f302..39d69b4a2 100644 --- a/scripts/test_all.py +++ b/scripts/test_all.py @@ -180,9 +180,9 @@ def isin(key, args): "--config=sfarinet-qm7x-1k --regress_forces=direct", "--config=sfarinet-qm7x-1k --regress_forces=direct_with_gradient_target", "--config=sfarinet-qm7x-1k --regress_forces=from_energy", - "--config=faenet-is2re-10k --model.edge_embed_type=rij --model.mp_type=base", - "--config=faenet-is2re-10k --model.edge_embed_type=all --model.mp_type=simple", - "--config=faenet-is2re-10k --model.edge_embed_type=sh --model.mp_type=updownscale", + "--config=faenet-is2re-10k --model.mp_type=base", + "--config=faenet-is2re-10k --model.mp_type=simple", + "--config=faenet-is2re-10k --model.mp_type=updownscale", # "--config=faenet-is2re-10k --model.edge_embed_type=all_rij --model.mp_type=local_env", # "--config=faenet-is2re-10k --model.mp_type=att", # "--config=faenet-is2re-10k --model.mp_type=base_with_att", From 0d70e8e9488b32de955d014f7ad89118030f1c83 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Fri, 19 Apr 2024 07:23:36 -0400 Subject: [PATCH 08/27] create deup-depfaenet, add dropout_lin, modif class names --- configs/exps/catalyst/gflownet.yaml | 2 +- ocpmodels/models/__init__.py | 2 +- ocpmodels/models/depfaenet.py | 30 ++++---- ocpmodels/models/deup_depfaenet.py | 105 ++++++++++++++++++++++++++++ 4 files changed, 122 insertions(+), 17 deletions(-) create mode 100644 ocpmodels/models/deup_depfaenet.py diff --git a/configs/exps/catalyst/gflownet.yaml b/configs/exps/catalyst/gflownet.yaml index 4499b6e2a..8dc46c189 100644 --- a/configs/exps/catalyst/gflownet.yaml +++ b/configs/exps/catalyst/gflownet.yaml @@ -1,7 +1,7 @@ job: mem: 32GB cpus: 4 - gres: gpu:rtx8000:1 + gres: gpu:1 partition: long time: 15:00:00 diff --git a/ocpmodels/models/__init__.py b/ocpmodels/models/__init__.py index c15c217b0..9241e161f 100644 --- a/ocpmodels/models/__init__.py +++ b/ocpmodels/models/__init__.py @@ -7,7 +7,7 @@ from .cgcnn import CGCNN # noqa: F401 from .dimenet import DimeNet # noqa: F401 from .faenet import FAENet # noqa: F401 -from .depfaenet import depFAENet # noqa: F401 +from .depfaenet import DepFAENet # noqa: F401 from .gemnet.gemnet import GemNetT # noqa: F401 from .dimenet_plus_plus import DimeNetPlusPlus # noqa: F401 from .forcenet import ForceNet # noqa: F401 diff --git a/ocpmodels/models/depfaenet.py b/ocpmodels/models/depfaenet.py index 25f6a0968..97d197916 100644 --- a/ocpmodels/models/depfaenet.py +++ b/ocpmodels/models/depfaenet.py @@ -2,6 +2,7 @@ from torch.nn import Linear from torch import nn from torch_scatter import scatter +import torch.nn.functional as F from ocpmodels.models.faenet import FAENet from ocpmodels.models.faenet import OutputBlock as conOutputBlock @@ -12,9 +13,9 @@ from torch_geometric.data import Batch -class discOutputBlock(conOutputBlock): - def __init__(self, energy_head, hidden_channels, act, disconnected_mlp=False): - super(discOutputBlock, self).__init__(energy_head, hidden_channels, act) +class DiscOutputBlock(conOutputBlock): + def __init__(self, energy_head, hidden_channels, act, dropout_lin, disconnected_mlp=False): + super(DiscOutputBlock, self).__init__(energy_head, 
hidden_channels, act, dropout_lin) # We modify the last output linear function to make the output a vector self.lin2 = Linear(hidden_channels // 2, hidden_channels // 2) @@ -40,17 +41,16 @@ def forward(self, h, edge_index, edge_weight, batch, alpha): ): # Right now, this is the only available option. alpha = self.w_lin(h) - elif self.energy_head == "graclus": - h, batch = self.graclus(h, edge_index, edge_weight, batch) - - elif self.energy_head in {"pooling", "random"}: - h, batch, pooling_loss = self.hierarchical_pooling( - h, edge_index, edge_weight, batch - ) - # MLP + h = F.dropout( + h, p=self.dropout_lin, training=self.training or self.deup_inference + ) h = self.lin1(h) - h = self.lin2(self.act(h)) + h = self.act(h) + h = F.dropout( + h, p=self.dropout_lin, training=self.training or self.deup_inference + ) + h = self.lin2(h) if self.energy_head in { "weighted-av-initial-embeds", @@ -78,14 +78,14 @@ def forward(self, h, edge_index, edge_weight, batch, alpha): @registry.register_model("depfaenet") -class depFAENet(FAENet): +class DepFAENet(FAENet): def __init__(self, **kwargs): super().__init__(**kwargs) # We replace the old output block by the new output block self.disconnected_mlp = kwargs.get("disconnected_mlp", False) - self.output_block = discOutputBlock( - self.energy_head, kwargs["hidden_channels"], self.act, self.disconnected_mlp + self.output_block = DiscOutputBlock( + self.energy_head, kwargs["hidden_channels"], self.act, self.disconnected_mlp, self.dropout_lin, ) @conditional_grad(torch.enable_grad()) diff --git a/ocpmodels/models/deup_depfaenet.py b/ocpmodels/models/deup_depfaenet.py new file mode 100644 index 000000000..8457acf45 --- /dev/null +++ b/ocpmodels/models/deup_depfaenet.py @@ -0,0 +1,105 @@ +import torch +from torch import nn +from torch.nn import Linear +from torch_scatter import scatter +from ocpmodels.common.registry import registry +from ocpmodels.models.depfaenet import DepFAENet, DiscOutputBlock + + +class DeupDepOutputBlock(DiscOutputBlock): + def __init__( + self, energy_head, hidden_channels, act, dropout_lin, deup_features={} + ): + super().__init__(energy_head, hidden_channels, act, dropout_lin) + + self.deup_features = deup_features + self.deup_data_keys = [f"deup_{k}" for k in deup_features] + self.deup_extra_dim = 0 + self._set_q_dim = False + + if "s" in deup_features: + self.deup_extra_dim += 1 + if "energy_pred_std" in deup_features: + self.deup_extra_dim += 1 + if "q" in deup_features: + self._set_q_dim = True + + if self.deup_extra_dim > 0: + self.deup_lin = Linear( + self.lin1.out_features + self.deup_extra_dim, self.lin1.out_features + ) + + def forward(self, h, edge_index, edge_weight, batch, alpha, data=None): + if self._set_q_dim: + assert data is not None + assert "deup_q" in data.to_dict().keys() + self.deup_extra_dim += data.deup_q.shape[-1] + self.deup_lin = Linear( + self.lin1.out_features + self.deup_extra_dim, self.lin1.out_features + ) + print("\nLazy loading deup extra dim from q. 
New dim:", self.deup_extra_dim) + print("āš ļø OutputBlock will be reinitialized.\n") + self.reset_parameters() + self._set_q_dim = False + + if self.energy_head == "weighted-av-final-embeds": + alpha = self.w_lin(h) + + # OutputBlock to get final atom rep + # No dropout in deup-(dep)faenet + h = self.lin1(h) + h = self.act(h) + if self.deup_extra_dim <= 0: + h = self.lin2(h) + + if self.energy_head in { + "weighted-av-initial-embeds", + "weighted-av-final-embeds", + }: + h = h * alpha + + # Global pooling -- get final graph rep + out = scatter( + h, + batch, + dim=0, + reduce="mean" if self.deup_extra_dim > 0 else "add", + ) + + # Concat graph representation with deup features (s, kde(q), std) + # and apply MLPs + if self.deup_extra_dim > 0: + assert data is not None + data_keys = set(data.to_dict().keys()) + assert all(dk in data_keys for dk in self.deup_data_keys), ( + f"Some deup data keys ({self.deup_data_keys}) are missing" + + f" from the data dict ({data_keys})" + ) + out = torch.cat( + [out] + + [data[f"deup_{k}"][:, None].float() for k in self.deup_features], + dim=-1, + ) + out = self.deup_lin(out) + out = self.act(out) + out = self.lin2(out) + + return out + +@registry.register_model("deup_depfaenet") +class DeupFAENet(DepFAENet): + def __init__(self, *args, **kwargs): + kwargs["dropout_edge"] = 0 + super().__init__(*args, **kwargs) + self.output_block = DeupDepOutputBlock( + self.energy_head, + kwargs["hidden_channels"], + self.act, + self.dropout_lin, + kwargs.get("deup_features", {}), + ) + assert ( + self.energy_head != "weighted-av-initial-embeds" + ), "Unsupported head weighted-av-initial-embeds" + assert self.skip_co != "concat", "Unsupported skip connection concat" + assert self.skip_co != "add", "Unsupported skip connection add" \ No newline at end of file From fd9d1d1524a6d661ca2a1f882f3c315f3125640d Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Fri, 19 Apr 2024 07:31:42 -0400 Subject: [PATCH 09/27] add q --- ocpmodels/models/depfaenet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/models/depfaenet.py b/ocpmodels/models/depfaenet.py index 97d197916..af3da682d 100644 --- a/ocpmodels/models/depfaenet.py +++ b/ocpmodels/models/depfaenet.py @@ -89,7 +89,7 @@ def __init__(self, **kwargs): ) @conditional_grad(torch.enable_grad()) - def energy_forward(self, data): + def energy_forward(self, data, q=None): # We need to save the tags so this step is necessary. self.output_block.tags_saver(data.tags) pred = super().energy_forward(data) From 2ab5c335b22903e958efb97f825a6f009d65ef28 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Fri, 19 Apr 2024 08:00:59 -0400 Subject: [PATCH 10/27] fix forward of output block depfaenet --- ocpmodels/models/depfaenet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocpmodels/models/depfaenet.py b/ocpmodels/models/depfaenet.py index af3da682d..87f76e08c 100644 --- a/ocpmodels/models/depfaenet.py +++ b/ocpmodels/models/depfaenet.py @@ -35,7 +35,7 @@ def __init__(self, energy_head, hidden_channels, act, dropout_lin, disconnected_ def tags_saver(self, tags): self.current_tags = tags - def forward(self, h, edge_index, edge_weight, batch, alpha): + def forward(self, h, edge_index, edge_weight, batch, alpha, data): if ( self.energy_head == "weighted-av-final-embeds" ): # Right now, this is the only available option. 
@@ -85,7 +85,7 @@ def __init__(self, **kwargs): # We replace the old output block by the new output block self.disconnected_mlp = kwargs.get("disconnected_mlp", False) self.output_block = DiscOutputBlock( - self.energy_head, kwargs["hidden_channels"], self.act, self.disconnected_mlp, self.dropout_lin, + self.energy_head, kwargs["hidden_channels"], self.act, self.dropout_lin, self.disconnected_mlp, ) @conditional_grad(torch.enable_grad()) From 9f18bfd2244e06a7a4e1c9eb3927223c2ed3df6e Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Tue, 23 Apr 2024 07:41:59 -0400 Subject: [PATCH 11/27] new model checkpoints to create deup-dataset --- ...c-faenet.yaml => data-with-depfaenet.yaml} | 6 ++-- configs/exps/deup/datasets/mc-faenet.yaml | 28 +++++++++++++++++++ ...c-faenet.yaml => old-train-mc-faenet.yaml} | 0 3 files changed, 31 insertions(+), 3 deletions(-) rename configs/exps/deup/datasets/{new-mc-faenet.yaml => data-with-depfaenet.yaml} (80%) create mode 100644 configs/exps/deup/datasets/mc-faenet.yaml rename configs/exps/deup/datasets/{train-mc-faenet.yaml => old-train-mc-faenet.yaml} (100%) diff --git a/configs/exps/deup/datasets/new-mc-faenet.yaml b/configs/exps/deup/datasets/data-with-depfaenet.yaml similarity index 80% rename from configs/exps/deup/datasets/new-mc-faenet.yaml rename to configs/exps/deup/datasets/data-with-depfaenet.yaml index 56ea29868..8c7d4a00e 100644 --- a/configs/exps/deup/datasets/new-mc-faenet.yaml +++ b/configs/exps/deup/datasets/data-with-depfaenet.yaml @@ -7,11 +7,11 @@ job: default: config: faenet-is2re-all wandb_project: ocp-deup - wandb_tags: base-model, MC-D, 4615191 + wandb_tags: depfaenet, MC-D,4621042 test_ri: True mode: train - checkpoint: /network/scratch/a/alexandre.duval/scratch/ocp/runs/4615191/checkpoints/best_checkpoint.pt - restart_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4615191/ + checkpoint: /network/scratch/a/alexandre.duval/ocp/runs/4621042/checkpoints/best_checkpoint.pt + restart_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4621042/ model: dropout_lowest_layer: output first_trainable_layer: dropout diff --git a/configs/exps/deup/datasets/mc-faenet.yaml b/configs/exps/deup/datasets/mc-faenet.yaml new file mode 100644 index 000000000..8069e3573 --- /dev/null +++ b/configs/exps/deup/datasets/mc-faenet.yaml @@ -0,0 +1,28 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:1 + partition: long + +default: + config: faenet-is2re-all + wandb_project: ocp-deup + wandb_tags: base-model, MC-D, 4616500 + test_ri: True + mode: train + checkpoint: /network/scratch/a/alexandre.duval/ocp/runs/4616500/checkpoints/best_checkpoint.pt + restart_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4616500/ + model: + dropout_lowest_layer: output + first_trainable_layer: dropout + dropout_lin: 0.7 + cp_data_to_tmpdir: true + inference_time_loops: 1 + deup_dataset: + create: after # "before" -> created before training (for deup) "after" -> created after training (for is2re) "" - not created + dataset_strs: ["train", "val_id", "val_ood_cat", "val_ood_ads"] + n_samples: 7 + +runs: + - optim: + max_epochs: 12 diff --git a/configs/exps/deup/datasets/train-mc-faenet.yaml b/configs/exps/deup/datasets/old-train-mc-faenet.yaml similarity index 100% rename from configs/exps/deup/datasets/train-mc-faenet.yaml rename to configs/exps/deup/datasets/old-train-mc-faenet.yaml From e0fb6f7738c746e205921ba9916166ea4f18a519 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Tue, 23 Apr 2024 08:34:27 -0400 Subject: [PATCH 12/27] argparse deup_dataset + comments --- 
ocpmodels/datasets/deup_dataset_creator.py | 29 ++++++++++++++++++---- scripts/deup_dataset.sh | 11 ++++++++ 2 files changed, 35 insertions(+), 5 deletions(-) create mode 100644 scripts/deup_dataset.sh diff --git a/ocpmodels/datasets/deup_dataset_creator.py b/ocpmodels/datasets/deup_dataset_creator.py index b57522422..f3a4e3adc 100644 --- a/ocpmodels/datasets/deup_dataset_creator.py +++ b/ocpmodels/datasets/deup_dataset_creator.py @@ -228,6 +228,7 @@ def _structure(preds): if self.mc_dropout: if n_samples <= 0: raise ValueError("n_samples must be > 0 for MC-Dropout ensembles.") + # Speed up computation by re-using latent representation q for all models preds += [ self.trainers[0].model_forward(batch_list, mode="deup", q=q) for _ in range(n_samples - len(preds)) @@ -320,12 +321,14 @@ def create_deup_dataset( preds = self.forward( batch_list, n_samples=n_samples, shared_encoder=True ) - + # Compute mean and standard deviation of GNN predictions pred_mean = preds["energies"].mean(dim=1) # Batch pred_std = preds["energies"].std(dim=1) # Batch + # Compute residual between mean predicted energy and ground truth loss = self.trainers[0].loss_fn["energy"]( pred_mean, batch.y_relaxed.to(pred_mean.device) ) + # Store deup samples deup_samples += [ { "energy_target": batch.y_relaxed.clone(), @@ -431,13 +434,29 @@ def write_lmdb(self, samples, path, total_size=-1, max_samples=-1): from ocpmodels.datasets.deup_dataset_creator import DeupDatasetCreator from ocpmodels.datasets.lmdb_dataset import DeupDataset from ocpmodels.common.utils import JOB_ID, RUNS_DIR, make_config_from_conf_str + import argparse + + def parse_args(): + parser = argparse.ArgumentParser(description="Deup Dataset Creator") + parser.add_argument( + "--checkpoints", + nargs="+", + default="/network/scratch/a/alexandre.duval/ocp/runs/4616500/", + help="Paths to the checkpoints", + ) + parser.add_argument( + "--dropout", + type=float, + default=0.2, + help="Dropout value", + ) + return parser.parse_args() - base_trainer_path = "/network/scratch/a/alexandre.duval/ocp/runs/4615191" + args = parse_args() - # what models to load for inference trainers_conf = { - "checkpoints": [base_trainer_path], - "dropout": 0.7, + "checkpoints": args.checkpoints, + "dropout": args.dropout, } # setting first_trainable_layer to output means that the latent space # q will be defined as input to the output layer, even though the model diff --git a/scripts/deup_dataset.sh b/scripts/deup_dataset.sh new file mode 100644 index 000000000..d42384a05 --- /dev/null +++ b/scripts/deup_dataset.sh @@ -0,0 +1,11 @@ +#!/bin/bash +#SBATCH --job-name=deup-dataset +#SBATCH --ntasks=1 +#SBATCH --mem=32GB +#SBATCH --gres=gpu:1 +#SBATCH --output="/network/scratch/a/alexandre.duval/ocp/runs/output-%j.txt" # replace: location where you want to store the output of the job + +module load anaconda/3 # replace: load anaconda module +conda activate ocp # replace: conda env name +cd /home/mila/a/alexandre.duval/ocp/ocp # replace: location of the code +python -m ocpmodels.datasets.deup_dataset_creator \ No newline at end of file From 5b9c76f1973df00ae333333e71eadfc9f3af2053 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Wed, 24 Apr 2024 05:18:45 -0400 Subject: [PATCH 13/27] fix chkpt_path + edge case error + new configs --- .../deup/datasets/data-with-depfaenet.yaml | 4 +- configs/exps/deup/gnn/depfaenet-training.yaml | 57 ++++++++++++++ configs/exps/deup/gnn/faenet-training.yaml | 7 +- configs/exps/deup/gnn/pretrain-depfaenet.yaml | 78 +++++++++++++++++++ ocpmodels/tasks/task.py | 
8 +- ocpmodels/trainers/single_trainer.py | 34 +++++--- 6 files changed, 168 insertions(+), 20 deletions(-) create mode 100644 configs/exps/deup/gnn/pretrain-depfaenet.yaml diff --git a/configs/exps/deup/datasets/data-with-depfaenet.yaml b/configs/exps/deup/datasets/data-with-depfaenet.yaml index 8c7d4a00e..e329beff8 100644 --- a/configs/exps/deup/datasets/data-with-depfaenet.yaml +++ b/configs/exps/deup/datasets/data-with-depfaenet.yaml @@ -5,7 +5,7 @@ job: partition: long default: - config: faenet-is2re-all + config: depfaenet-is2re-all wandb_project: ocp-deup wandb_tags: depfaenet, MC-D,4621042 test_ri: True @@ -15,7 +15,7 @@ default: model: dropout_lowest_layer: output first_trainable_layer: dropout - dropout_lin: 0.7 + dropout_lin: 0.3 cp_data_to_tmpdir: true inference_time_loops: 1 deup_dataset: diff --git a/configs/exps/deup/gnn/depfaenet-training.yaml b/configs/exps/deup/gnn/depfaenet-training.yaml index e69de29bb..d81ac5d38 100644 --- a/configs/exps/deup/gnn/depfaenet-training.yaml +++ b/configs/exps/deup/gnn/depfaenet-training.yaml @@ -0,0 +1,57 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:1 + partition: long + time: 15:00:00 + +default: + wandb_project: ocp-deup + wandb_tags: depfaenet, no-concat, with-tag0, dropout + test_ri: True + mode: train + graph_rewiring: "" + frame_averaging: 2D + fa_method: se3-random + cp_data_to_tmpdir: True + is_disconnected: true + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 0 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 288 + num_gaussians: 68 + num_interactions: 5 + second_layer_MLP: False + skip_co: False + cutoff: 4.0 + dropout_lin: 0.3 + optim: + batch_size: 256 + eval_batch_size: 256 + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + eval_every: 0.4 + +runs: + + - config: depfaenet-is2re-all + note: Depfaenet per-ads-dataset + continue_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4023244 + adsorbates: "*O, *OH, *OH2, *H" + optim: + max_epochs: 10 + lr_initial: 0.0002 + + - config: depfaenet-is2re-all + note: Depfaenet per-ads-dataset + continue_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4023244 + adsorbates: "*O, *OH, *OH2, *H" + optim: + max_epochs: 12 + lr_initial: 0.0001 diff --git a/configs/exps/deup/gnn/faenet-training.yaml b/configs/exps/deup/gnn/faenet-training.yaml index 8bf38ec5f..0d6aa34d5 100644 --- a/configs/exps/deup/gnn/faenet-training.yaml +++ b/configs/exps/deup/gnn/faenet-training.yaml @@ -8,8 +8,7 @@ job: default: test_ri: True mode: train - graph_rewiring: remove-tag-0 - wandb_tags: "top-model" + wandb_tags: faenet, no-concat, with-tag0, dropout wandb_project: ocp-deup optim: batch_size: 256 @@ -36,8 +35,10 @@ runs: second_layer_MLP: False skip_co: False cutoff: 6.0 + dropout_lin: 0.3 + dropout_lowest_layer: output optim: lr_initial: 0.002 scheduler: LinearWarmupCosineAnnealingLR - max_epochs: 12 + max_epochs: 14 eval_every: 0.25 \ No newline at end of file diff --git a/configs/exps/deup/gnn/pretrain-depfaenet.yaml b/configs/exps/deup/gnn/pretrain-depfaenet.yaml new file mode 100644 index 000000000..83029997d --- /dev/null +++ b/configs/exps/deup/gnn/pretrain-depfaenet.yaml @@ -0,0 +1,78 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:1 + partition: long + time: 15:00:00 + +default: + wandb_project: ocp-deup + wandb_tags: gflownet-model, depfaenet + test_ri: True + mode: train + graph_rewiring: "" + frame_averaging: 2D + fa_method: se3-random + cp_data_to_tmpdir: True + 
is_disconnected: true + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 0 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 288 + num_gaussians: 68 + num_interactions: 5 + second_layer_MLP: False + skip_co: False + cutoff: 4.0 + dropout_lin: 0.3 + optim: + batch_size: 256 + eval_batch_size: 256 + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + eval_every: 0.4 + +runs: + + - config: depfaenet-is2re-all + note: Depfaenet pre-train + dropout + optim: + max_epochs: 12 + lr_initial: 0.0002 + + - config: depfaenet-is2re-all + note: Depfaenet pre-train + dropout + optim: + max_epochs: 10 + lr_initial: 0.0001 + + - config: depfaenet-is2re-all + note: depfaenet with top configs + dropout + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 288 + num_gaussians: 68 + num_interactions: 5 + second_layer_MLP: False + skip_co: False + cutoff: 4.0 + optim: + batch_size: 256 + eval_batch_size: 256 + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 9 + eval_every: 0.4 \ No newline at end of file diff --git a/ocpmodels/tasks/task.py b/ocpmodels/tasks/task.py index 8a9e3d8be..c3c938eec 100644 --- a/ocpmodels/tasks/task.py +++ b/ocpmodels/tasks/task.py @@ -27,10 +27,10 @@ def setup(self, trainer): self.trainer.load_checkpoint(self.config["checkpoint"]) print() - # save checkpoint path to runner state for slurm resubmissions - self.chkpt_path = os.path.join( - self.trainer.config["checkpoint_dir"], "checkpoint.pt" - ) + # save checkpoint path to runner state for slurm resubmissions + self.chkpt_path = os.path.join( + self.trainer.config["checkpoint_dir"], "checkpoint.pt" + ) def run(self): raise NotImplementedError diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 25f82ec9a..b9e2f921c 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -227,7 +227,11 @@ def train( # Calculate start_epoch from step instead of loading the epoch number # to prevent inconsistencies due to different batch size in checkpoint. 
- if self.config["continue_from_dir"] is not None and self.config["adsorbates"] not in {None, "all"}: + if ( + "continue_from_dir" in self.config + and self.config["continue_from_dir"] is not None + and self.config["adsorbates"] not in {None, "all"} + ): self.step = 0 start_epoch = self.step // n_train max_epochs = self.config["optim"]["max_epochs"] @@ -589,11 +593,15 @@ def compute_loss(self, preds, batch_list): # Energy loss energy_target = torch.cat( [ - batch.y_relaxed.to(self.device) - if self.task_name == "is2re" - else batch.deup_loss.to(self.device) - if self.task_name == "deup_is2re" - else batch.y.to(self.device) + ( + batch.y_relaxed.to(self.device) + if self.task_name == "is2re" + else ( + batch.deup_loss.to(self.device) + if self.task_name == "deup_is2re" + else batch.y.to(self.device) + ) + ) for batch in batch_list ], dim=0, @@ -706,11 +714,15 @@ def compute_metrics( target = { "energy": torch.cat( [ - batch.y_relaxed.to(self.device) - if self.task_name == "is2re" - else batch.deup_loss.to(self.device) - if self.task_name == "deup_is2re" - else batch.y.to(self.device) + ( + batch.y_relaxed.to(self.device) + if self.task_name == "is2re" + else ( + batch.deup_loss.to(self.device) + if self.task_name == "deup_is2re" + else batch.y.to(self.device) + ) + ) for batch in batch_list ], dim=0, From 8599de7e1b56b8a0a590fd6c9212396b358d3fe1 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Wed, 24 Apr 2024 07:57:56 -0400 Subject: [PATCH 14/27] adapt configs for v0 deup-faenet training on deup-dataset --- .../exps/deup/uncertainty/faenet_test.yaml | 31 ++ configs/exps/deup/uncertainty/v1.yaml | 13 +- configs/models/deup_depfaenet.yaml | 273 ++++++++++++++++++ configs/models/deup_faenet.yaml | 53 ++-- configs/models/tasks/deup_is2re.yaml | 2 +- 5 files changed, 340 insertions(+), 32 deletions(-) create mode 100644 configs/exps/deup/uncertainty/faenet_test.yaml create mode 100644 configs/models/deup_depfaenet.yaml diff --git a/configs/exps/deup/uncertainty/faenet_test.yaml b/configs/exps/deup/uncertainty/faenet_test.yaml new file mode 100644 index 000000000..19742c31c --- /dev/null +++ b/configs/exps/deup/uncertainty/faenet_test.yaml @@ -0,0 +1,31 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:1 + partition: long + +default: + config: deup_faenet-deup_is2re-all + wandb_project: ocp-deup + wandb_tags: faenet, MC-D, 4616500-model, 4642835-dataset + test_ri: True + mode: train + model: + dropout_lowest_layer: null + first_trainable_layer: output + dropout_lin: 0.3 + cp_data_to_tmpdir: false + inference_time_loops: 1 + # restart_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4621042/ + # checkpoint: /network/scratch/a/alexandre.duval/ocp/runs/4621042/ + dataset: # mandatory if restart_from_dir is set + default_val: deup-val_ood_cat-val_ood_ads + deup-train-val_id: + src: /network/scratch/a/alexandre.duval/ocp/runs/4642835/deup_dataset + deup-val_ood_cat-val_ood_ads: + src: /network/scratch/a/alexandre.duval/ocp/runs/4642835/deup_dataset + deup_dataset: + create: False + +runs: + - note: deup-faenet d=0.2 (not trained with d) \ No newline at end of file diff --git a/configs/exps/deup/uncertainty/v1.yaml b/configs/exps/deup/uncertainty/v1.yaml index 4f69d7828..1f6a064c2 100644 --- a/configs/exps/deup/uncertainty/v1.yaml +++ b/configs/exps/deup/uncertainty/v1.yaml @@ -6,25 +6,24 @@ job: default: config: deup_faenet-deup_is2re-all - wandb_project: ocp-deup - wandb_tags: base-model, MC-D, 3264530 + wandb_tags: base-model, MC-D, 4616500-model, 4642835-dataset test_ri: True mode: train model: 
dropout_lowest_layer: null first_trainable_layer: output - dropout_lin: 0.7 + dropout_lin: 0.3 cp_data_to_tmpdir: false inference_time_loops: 1 - restart_from_dir: /network/scratch/s/schmidtv/ocp/runs/3264530 - checkpoint: /network/scratch/s/schmidtv/ocp/runs/3264530 + restart_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4621042/ + # checkpoint: /network/scratch/a/alexandre.duval/ocp/runs/4621042/ dataset: # mandatory if restart_from_dir is set default_val: deup-val_ood_cat-val_ood_ads deup-train-val_id: - src: /network/scratch/s/schmidtv/ocp/runs/3264530/deup_dataset + src: /network/scratch/a/alexandre.duval/ocp/runs/4642835/deup_dataset deup-val_ood_cat-val_ood_ads: - src: /network/scratch/s/schmidtv/ocp/runs/3264530/deup_dataset + src: /network/scratch/a/alexandre.duval/ocp/runs/4642835/deup_dataset deup_dataset: create: False diff --git a/configs/models/deup_depfaenet.yaml b/configs/models/deup_depfaenet.yaml new file mode 100644 index 000000000..24ab2587c --- /dev/null +++ b/configs/models/deup_depfaenet.yaml @@ -0,0 +1,273 @@ +default: + model: + name: deup_depfaenet + act: swish + dropout_lin: 0.0 + dropout_edge: 0.0 + dropout_lowest_layer: output # lowest layer where `dropout_lin` is applied. Can be `inter-{i}` or `output`. Defaults to `output`. + first_trainable_layer: dropout # lowest layer to NOT freeze. All previous layers will be frozen. Can be ``, `embed`, `inter-{i}`, `output`, or `dropout`. + # if it is `` then no layer is frozen. If it is `dropout` then it will be set to the layer before `dropout_lowest_layer`. + # Defaults to ``. + hidden_channels: 384 + num_filters: 480 + num_interactions: 5 + num_gaussians: 104 + cutoff: 6.0 + use_pbc: True + regress_forces: False + tag_hidden_channels: 64 # only for OC20 + pg_hidden_channels: 64 # period & group embedding hidden channels + phys_embeds: True # physics-aware embeddings for atoms + phys_hidden_channels: 0 + energy_head: weighted-av-final-embeds # Energy head: {False, weighted-av-initial-embeds, weighted-av-final-embeds} + skip_co: False # Skip connections {False, "add", "concat"} + second_layer_MLP: False # in EmbeddingBlock + complex_mp: True # 2-layer MLP in Interaction blocks + mp_type: base # Message Passing type {'base', 'simple', 'updownscale', 'updownscale_base'} + graph_norm: True # graph normalization layer + force_decoder_type: "mlp" # force head (`"simple"`, `"mlp"`, `"res"`, `"res_updown"`) + force_decoder_model_config: + simple: + hidden_channels: 128 + norm: batch1d # batch1d, layer or null + mlp: + hidden_channels: 256 + norm: batch1d # batch1d, layer or null + res: + hidden_channels: 128 + norm: batch1d # batch1d, layer or null + res_updown: + hidden_channels: 128 + norm: batch1d # batch1d, layer or null + deup_features: [s, energy_pred_std] + optim: + batch_size: 256 + eval_batch_size: 256 + max_epochs: 12 + scheduler: LinearWarmupCosineAnnealingLR + optimizer: AdamW + num_workers: 4 + warmup_steps: 6000 + warmup_factor: 0.2 + lr_initial: 0.002 + lr_gamma: 0.1 + energy_grad_coefficient: 10 + force_coefficient: 30 + energy_coefficient: 1 + lr_milestones: + - 18000 + - 27000 + - 37000 + epoch_fine_tune: 4 + + frame_averaging: "" # 2D, 3D, da, False + fa_method: "" # can be {None, full, random, det, e3, e3-random, e3-det} + +# ------------------- +# ----- IS2RE ----- +# ------------------- + +deup_is2re: # was: is2re + 10k: + optim: + lr_initial: 0.005 + lr_milestones: # epochs at which lr_initial <- lr_initial * lr_gamma + - 1562 + - 2343 + - 3125 + warmup_steps: 468 + max_epochs: 20 + + 100k: + 
model: + hidden_channels: 256 + optim: + lr_initial: 0.005 + lr_milestones: # epochs at which lr_initial <- lr_initial * lr_gamma + - 1562 + - 2343 + - 3125 + warmup_steps: 468 + max_epochs: 20 + + all: + model: + hidden_channels: 384 + num_interactions: 4 + optim: + batch_size: 256 + eval_batch_size: 256 + lr_initial: 0.001 + lr_gamma: 0.1 + lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma + - 18000 + - 27000 + - 37000 + warmup_steps: 6000 + max_epochs: 20 + +# ------------------ +# ----- S2EF ----- +# ------------------ + +# For 2 GPUs + +s2ef: + default: + model: + num_interactions: 4 + hidden_channels: 750 + num_gaussians: 200 + num_filters: 256 + regress_forces: "direct" + optim: + batch_size: 96 + eval_batch_size: 96 + warmup_factor: 0.2 + lr_gamma: 0.1 + lr_initial: 0.0001 + max_epochs: 15 + warmup_steps: 30000 + lr_milestones: + - 55000 + - 75000 + - 10000 + + 200k: {} + + # 1 gpus + 2M: + model: + num_interactions: 5 + hidden_channels: 1024 + num_gaussians: 200 + num_filters: 256 + optim: + batch_size: 192 + eval_batch_size: 192 + + 20M: {} + + all: {} + +qm9: + default: + model: + act: swish + att_heads: 1 + complex_mp: true + cutoff: 6.0 + energy_head: '' + graph_norm: true + graph_rewiring: null + hidden_channels: 400 + max_num_neighbors: 30 + mp_type: updownscale_base + num_filters: 480 + num_gaussians: 100 + num_interactions: 5 + otf_graph: false + pg_hidden_channels: 32 + phys_embeds: false + phys_hidden_channels: 0 + regress_forces: '' + second_layer_MLP: true + skip_co: true + tag_hidden_channels: 0 + use_pbc: false + + optim: + batch_size: 64 + es_min_abs_change: 1.0e-06 + es_patience: 20 + es_warmup_epochs: 600 + eval_batch_size: 64 + factor: 0.9 + loss_energy: mse + lr_gamma: 0.1 + lr_initial: 0.0003 + max_epochs: 1500 + min_lr: 1.0e-06 + mode: min + optimizer: AdamW + patience: 15 + scheduler: ReduceLROnPlateau + threshold: 0.0001 + threshold_mode: abs + verbose: true + warmup_factor: 0.2 + warmup_steps: 3000 + + 10k: {} + all: {} + +qm7x: + default: + model: # SOTA settings + act: swish + att_heads: 1 + complex_mp: true + cutoff: 5.0 + energy_head: false + force_decoder_model_config: + mlp: + hidden_channels: 256 + norm: batch1d + res: + hidden_channels: 128 + norm: batch1d + res_updown: + hidden_channels: 128 + norm: layer + simple: + hidden_channels: 128 + norm: batch1d + force_decoder_type: res_updown + graph_norm: false + hidden_channels: 500 + max_num_neighbors: 40 + mp_type: updownscale_base + num_filters: 400 + num_gaussians: 50 + num_interactions: 5 + otf_graph: false + pg_hidden_channels: 32 + phys_embeds: true + phys_hidden_channels: 0 + regress_forces: direct_with_gradient_target + second_layer_MLP: true + skip_co: false + tag_hidden_channels: 0 + use_pbc: false + + optim: + batch_size: 100 + energy_grad_coefficient: 5 + eval_batch_size: 100 + eval_every: 0.34 + factor: 0.75 + force_coefficient: 75 + loss_energy: mae + loss_force: mse + lr_gamma: 0.1 + lr_initial: 0.000193 + max_steps: 4000000 + min_lr: 1.0e-06 + mode: min + optimizer: AdamW + scheduler: ReduceLROnPlateau + threshold: 0.001 + threshold_mode: abs + verbose: true + warmup_factor: 0.2 + warmup_steps: 3000 + + all: {} + 1k: {} + +qm9: + default: + model: + use_pbc: False + all: {} + 10k: {} diff --git a/configs/models/deup_faenet.yaml b/configs/models/deup_faenet.yaml index bdc723bb5..f6e52681f 100644 --- a/configs/models/deup_faenet.yaml +++ b/configs/models/deup_faenet.yaml @@ -8,27 +8,24 @@ default: first_trainable_layer: dropout # lowest layer to NOT freeze. 
All previous layers will be frozen. Can be ``, `embed`, `inter-{i}`, `output`, or `dropout`. # if it is `` then no layer is frozen. If it is `dropout` then it will be set to the layer before `dropout_lowest_layer`. # Defaults to ``. - hidden_channels: 128 - num_filters: 100 - num_interactions: 3 - num_gaussians: 100 + hidden_channels: 384 + num_filters: 480 + num_interactions: 5 + num_gaussians: 104 cutoff: 6.0 use_pbc: True regress_forces: False - # drlab attributes: - tag_hidden_channels: 0 # 32 - pg_hidden_channels: 0 # 32 -> period & group embedding hidden channels - phys_embeds: False # True + tag_hidden_channels: 64 # only for OC20 + pg_hidden_channels: 64 # period & group embedding hidden channels + phys_embeds: True # physics-aware embeddings for atoms phys_hidden_channels: 0 - energy_head: False # can be {False, weighted-av-initial-embeds, weighted-av-final-embeds} - # faenet new features - skip_co: False # output skip connections {False, "add", "concat"} + energy_head: weighted-av-final-embeds # Energy head: {False, weighted-av-initial-embeds, weighted-av-final-embeds} + skip_co: False # Skip connections {False, "add", "concat"} second_layer_MLP: False # in EmbeddingBlock - complex_mp: False - mp_type: base # {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env'} - graph_norm: False # bool - att_heads: 1 # int - force_decoder_type: "mlp" # can be {"" or "simple"} | only used if regress_forces is True + complex_mp: True # 2-layer MLP in Interaction blocks + mp_type: base # Message Passing type {'base', 'simple', 'updownscale', 'updownscale_base'} + graph_norm: True # graph normalization layer + force_decoder_type: "mlp" # force head (`"simple"`, `"mlp"`, `"res"`, `"res_updown"`) force_decoder_model_config: simple: hidden_channels: 128 @@ -44,19 +41,27 @@ default: norm: batch1d # batch1d, layer or null deup_features: [s, energy_pred_std] optim: - batch_size: 64 - eval_batch_size: 64 + batch_size: 256 + eval_batch_size: 256 + max_epochs: 12 + scheduler: LinearWarmupCosineAnnealingLR + optimizer: AdamW num_workers: 4 - lr_gamma: 0.1 - lr_initial: 0.001 + warmup_steps: 6000 warmup_factor: 0.2 - max_epochs: 20 - energy_grad_coefficient: 5 + lr_initial: 0.002 + lr_gamma: 0.1 + energy_grad_coefficient: 10 force_coefficient: 30 energy_coefficient: 1 + lr_milestones: + - 18000 + - 27000 + - 37000 + epoch_fine_tune: 4 - frame_averaging: False # 2D, 3D, da, False - fa_method: False # can be {None, full, random, det, e3, e3-random, e3-det} + frame_averaging: "" # 2D, 3D, da, False + fa_method: "" # can be {None, full, random, det, e3, e3-random, e3-det} # ------------------- # ----- IS2RE ----- diff --git a/configs/models/tasks/deup_is2re.yaml b/configs/models/tasks/deup_is2re.yaml index 65aab2e31..fa85d99ab 100644 --- a/configs/models/tasks/deup_is2re.yaml +++ b/configs/models/tasks/deup_is2re.yaml @@ -41,7 +41,7 @@ default: n_samples: 7 ensemble_checkpoints: /network/scratch/a/alexandre.duval/ocp/runs/2935198 - ensemble_dropout: 0.7 + ensemble_dropout: 0.3 10k: From 58b992727c63e6156698c6dabdfb5c3d250f3d4a Mon Sep 17 00:00:00 2001 From: Christina Date: Thu, 25 Apr 2024 07:53:11 -0400 Subject: [PATCH 15/27] fix module load --- ocpmodels/common/utils.py | 18 +++++++++++------- ocpmodels/models/__init__.py | 6 +++++- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index af7cddc22..99185a39a 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -755,7 +755,7 @@ def 
add_edge_distance_to_graph( # Copied from https://github.com/facebookresearch/mmf/blob/master/mmf/utils/env.py#L89. -def setup_imports(): +def setup_imports(skip_modules=[]): from ocpmodels.common.registry import registry try: @@ -803,10 +803,14 @@ def setup_imports(): splits = f.split(os.sep) file_name = splits[-1] module_name = file_name[: file_name.find(".py")] - importlib.import_module("ocpmodels.%s.%s" % (key[1:], module_name)) + if module_name not in skip_modules: + importlib.import_module("ocpmodels.%s.%s" % (key[1:], module_name)) # manual model imports - importlib.import_module("ocpmodels.models.gemnet_oc.gemnet_oc") + try: + importlib.import_module("ocpmodels.models.gemnet_oc.gemnet_oc") + except: + print("unable to load gemnet_oc") experimental_folder = os.path.join(root_folder, "../experimental/") if os.path.exists(experimental_folder): @@ -1797,7 +1801,7 @@ def make_script_trainer(str_args=[], overrides={}, silent=False, mode="train"): return trainer -def make_config_from_dir(path, mode, overrides={}, silent=None): +def make_config_from_dir(path, mode, overrides={}, silent=None, setup_imports=[]): """ Make a config from a directory. This is useful when restarting or continuing from a previous run. @@ -1834,11 +1838,11 @@ def make_config_from_dir(path, mode, overrides={}, silent=None): config = build_config(default_args, silent=silent) config = merge_dicts(config, overrides) - setup_imports() + setup_imports(setup_imports=setup_imports) return config -def make_trainer_from_dir(path, mode, overrides={}, silent=None): +def make_trainer_from_dir(path, mode, overrides={}, silent=None, skip_imports=[]): """ Make a trainer from a directory. @@ -1854,7 +1858,7 @@ def make_trainer_from_dir(path, mode, overrides={}, silent=None): Returns: Trainer: The loaded trainer. """ - config = make_config_from_dir(path, mode, overrides, silent) + config = make_config_from_dir(path, mode, overrides, silent, skip_imports) return registry.get_trainer_class(config["trainer"])(**config) diff --git a/ocpmodels/models/__init__.py b/ocpmodels/models/__init__.py index 9241e161f..8a56eaea4 100644 --- a/ocpmodels/models/__init__.py +++ b/ocpmodels/models/__init__.py @@ -8,7 +8,11 @@ from .dimenet import DimeNet # noqa: F401 from .faenet import FAENet # noqa: F401 from .depfaenet import DepFAENet # noqa: F401 -from .gemnet.gemnet import GemNetT # noqa: F401 + +try: + from .gemnet.gemnet import GemNetT # noqa: F401 +except: + print("unable to load gemnet") from .dimenet_plus_plus import DimeNetPlusPlus # noqa: F401 from .forcenet import ForceNet # noqa: F401 from .schnet import SchNet # noqa: F401 From 5a5524c59f467d406ab5fffc2dc391c685b18441 Mon Sep 17 00:00:00 2001 From: Christina Date: Thu, 25 Apr 2024 08:14:50 -0400 Subject: [PATCH 16/27] return hidden state in wrapper --- ocpmodels/common/gfn.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ocpmodels/common/gfn.py b/ocpmodels/common/gfn.py index 95257359f..0a1f45521 100644 --- a/ocpmodels/common/gfn.py +++ b/ocpmodels/common/gfn.py @@ -107,6 +107,7 @@ def forward( self, batch: Union[Batch, Data, List[Data], List[Batch]], preprocess: bool = True, + retrieve_hidden: bool = False, ): """Perform a forward pass of the model when frame averaging is applied. @@ -162,6 +163,8 @@ def forward( if preds["energy"].shape[-1] == 1: preds["energy"] = preds["energy"].view(-1) + if retrieve_hidden: + return preds return preds["energy"] # denormalize? 
def freeze(self): From 6594960177e4f3b14853ac62ffe146ef6092d5af Mon Sep 17 00:00:00 2001 From: vict0rsch Date: Thu, 25 Apr 2024 09:49:10 -0400 Subject: [PATCH 17/27] `scatter` `q` in `energy_forward` --- ocpmodels/models/faenet.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/ocpmodels/models/faenet.py b/ocpmodels/models/faenet.py index dc7dd1efd..7d5c6044d 100644 --- a/ocpmodels/models/faenet.py +++ b/ocpmodels/models/faenet.py @@ -7,17 +7,17 @@ import torch.nn.functional as F from torch import nn from torch.nn import Embedding, Linear -from torch_geometric.utils import dropout_edge from torch_geometric.nn import MessagePassing, radius_graph from torch_geometric.nn.norm import GraphNorm +from torch_geometric.utils import dropout_edge from torch_scatter import scatter from ocpmodels.common.registry import registry +from ocpmodels.common.utils import conditional_grad, get_pbc_distances from ocpmodels.models.base_model import BaseModel from ocpmodels.models.force_decoder import ForceDecoder from ocpmodels.models.utils.activations import swish from ocpmodels.modules.phys_embeddings import PhysEmbedding -from ocpmodels.common.utils import get_pbc_distances, conditional_grad class GaussianSmearing(nn.Module): @@ -751,6 +751,9 @@ def energy_forward(self, data, q=None): q = h.clone().detach() else: + # WARNING + # q which is NOT the hidden state h if it was stored as a scattered + # version of h. This works for GPs, NOT for MC-dropout h = q alpha = None @@ -763,6 +766,9 @@ def energy_forward(self, data, q=None): elif self.skip_co == "add": energy = sum(energy_skip_co) + if q and len(q) > len(energy): + q = scatter(q, batch, dim=0, reduce="mean") # N_graphs x hidden_channels + preds = { "energy": energy, "hidden_state": h, From 5ce2f3f7c074aa063238d153817afca1418be35f Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Thu, 25 Apr 2024 14:03:17 -0400 Subject: [PATCH 18/27] fix configs for depfaenet/faenet fine-tuning --- ...-training.yaml => depfaenet-finetune.yaml} | 46 +++++++++++-------- configs/exps/deup/gnn/faenet-finetune.yaml | 46 +++++++++++++++++++ 2 files changed, 74 insertions(+), 18 deletions(-) rename configs/exps/deup/gnn/{depfaenet-training.yaml => depfaenet-finetune.yaml} (78%) create mode 100644 configs/exps/deup/gnn/faenet-finetune.yaml diff --git a/configs/exps/deup/gnn/depfaenet-training.yaml b/configs/exps/deup/gnn/depfaenet-finetune.yaml similarity index 78% rename from configs/exps/deup/gnn/depfaenet-training.yaml rename to configs/exps/deup/gnn/depfaenet-finetune.yaml index d81ac5d38..1da2e29bc 100644 --- a/configs/exps/deup/gnn/depfaenet-training.yaml +++ b/configs/exps/deup/gnn/depfaenet-finetune.yaml @@ -13,6 +13,33 @@ default: graph_rewiring: "" frame_averaging: 2D fa_method: se3-random + is_disconnected: true + +runs: + + - config: depfaenet-is2re-all + note: Depfaenet per-ads-dataset + continue_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4647488 #4647466 #4023244 + adsorbates: "*O, *OH, *OH2, *H" + optim: + max_epochs: 10 + lr_initial: 0.0002 + + - config: depfaenet-is2re-all + note: Depfaenet per-ads-dataset + continue_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4647488 #4647466 # 4023244 + adsorbates: "*O, *OH, *OH2, *H" + optim: + max_epochs: 12 + lr_initial: 0.0001 + +- config: depfaenet-is2re-all + note: Depfaenet per-ads-dataset + continue_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4647466 # 4023244 + adsorbates: "*O, *OH, *OH2, *H" + graph_rewiring: "" + frame_averaging: 2D + fa_method: se3-random 
cp_data_to_tmpdir: True is_disconnected: true model: @@ -37,21 +64,4 @@ default: lr_initial: 0.002 scheduler: LinearWarmupCosineAnnealingLR eval_every: 0.4 - -runs: - - - config: depfaenet-is2re-all - note: Depfaenet per-ads-dataset - continue_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4023244 - adsorbates: "*O, *OH, *OH2, *H" - optim: - max_epochs: 10 - lr_initial: 0.0002 - - - config: depfaenet-is2re-all - note: Depfaenet per-ads-dataset - continue_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4023244 - adsorbates: "*O, *OH, *OH2, *H" - optim: - max_epochs: 12 - lr_initial: 0.0001 + max_epochs: 12 \ No newline at end of file diff --git a/configs/exps/deup/gnn/faenet-finetune.yaml b/configs/exps/deup/gnn/faenet-finetune.yaml new file mode 100644 index 000000000..fae81269a --- /dev/null +++ b/configs/exps/deup/gnn/faenet-finetune.yaml @@ -0,0 +1,46 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:1 + partition: long + time: 18:00:00 + +default: + test_ri: True + mode: train + wandb_tags: faenet, no-concat, with-tag0, dropout, fine-tuned + wandb_project: ocp-deup + optim: + batch_size: 256 + eval_batch_size: 256 + cp_data_to_tmpdir: True + +runs: + - config: faenet-is2re-all + note: "fine-tuned faenet" + continue_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4647489 + adsorbates: "*O, *OH, *OH2, *H" + frame_averaging: 2D + fa_method: se3-random + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 384 + num_filters: 480 + num_gaussians: 104 + num_interactions: 5 + second_layer_MLP: False + skip_co: False + cutoff: 6.0 + dropout_lin: 0.3 + dropout_lowest_layer: output + optim: + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 14 + eval_every: 0.25 \ No newline at end of file From 606fcd07ae72aa3b80d48abce3fc62ed13593dbc Mon Sep 17 00:00:00 2001 From: Christina Date: Fri, 26 Apr 2024 02:33:56 -0400 Subject: [PATCH 19/27] quickfixes --- ocpmodels/common/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 99185a39a..b7a39d391 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -755,7 +755,7 @@ def add_edge_distance_to_graph( # Copied from https://github.com/facebookresearch/mmf/blob/master/mmf/utils/env.py#L89. -def setup_imports(skip_modules=[]): +def setup_imports(skip_imports=[]): from ocpmodels.common.registry import registry try: @@ -803,7 +803,7 @@ def setup_imports(skip_modules=[]): splits = f.split(os.sep) file_name = splits[-1] module_name = file_name[: file_name.find(".py")] - if module_name not in skip_modules: + if module_name not in skip_imports: importlib.import_module("ocpmodels.%s.%s" % (key[1:], module_name)) # manual model imports @@ -1801,7 +1801,7 @@ def make_script_trainer(str_args=[], overrides={}, silent=False, mode="train"): return trainer -def make_config_from_dir(path, mode, overrides={}, silent=None, setup_imports=[]): +def make_config_from_dir(path, mode, overrides={}, silent=None, skip_imports=[]): """ Make a config from a directory. This is useful when restarting or continuing from a previous run. 
@@ -1838,7 +1838,7 @@ def make_config_from_dir(path, mode, overrides={}, silent=None, setup_imports=[] config = build_config(default_args, silent=silent) config = merge_dicts(config, overrides) - setup_imports(setup_imports=setup_imports) + setup_imports(skip_imports=skip_imports) return config From fcf265055d8f9c7f6ce146670eed92655af94f8c Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Fri, 26 Apr 2024 03:30:53 -0400 Subject: [PATCH 20/27] update configs deup-depfaenet --- configs/exps/deup/gnn/faenet-finetune.yaml | 5 +- configs/exps/deup/gnn/faenet-training.yaml | 5 +- .../exps/deup/uncertainty/deup_depfaenet.yaml | 61 +++++++++++++++++++ ocpmodels/datasets/deup_dataset_creator.py | 4 +- scripts/gnn_dev.py | 9 ++- 5 files changed, 73 insertions(+), 11 deletions(-) create mode 100644 configs/exps/deup/uncertainty/deup_depfaenet.yaml diff --git a/configs/exps/deup/gnn/faenet-finetune.yaml b/configs/exps/deup/gnn/faenet-finetune.yaml index fae81269a..de5180834 100644 --- a/configs/exps/deup/gnn/faenet-finetune.yaml +++ b/configs/exps/deup/gnn/faenet-finetune.yaml @@ -10,9 +10,10 @@ default: mode: train wandb_tags: faenet, no-concat, with-tag0, dropout, fine-tuned wandb_project: ocp-deup + graph_rewiring: "" optim: - batch_size: 256 - eval_batch_size: 256 + batch_size: 232 + eval_batch_size: 232 cp_data_to_tmpdir: True runs: diff --git a/configs/exps/deup/gnn/faenet-training.yaml b/configs/exps/deup/gnn/faenet-training.yaml index 0d6aa34d5..c3775e66e 100644 --- a/configs/exps/deup/gnn/faenet-training.yaml +++ b/configs/exps/deup/gnn/faenet-training.yaml @@ -11,13 +11,14 @@ default: wandb_tags: faenet, no-concat, with-tag0, dropout wandb_project: ocp-deup optim: - batch_size: 256 - eval_batch_size: 256 + batch_size: 200 + eval_batch_size: 200 cp_data_to_tmpdir: True runs: - config: faenet-is2re-all note: "top run no concat" + graph_rewiring: "" frame_averaging: 2D fa_method: se3-random model: diff --git a/configs/exps/deup/uncertainty/deup_depfaenet.yaml b/configs/exps/deup/uncertainty/deup_depfaenet.yaml new file mode 100644 index 000000000..7b3ccd8a1 --- /dev/null +++ b/configs/exps/deup/uncertainty/deup_depfaenet.yaml @@ -0,0 +1,61 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:1 + partition: long + +default: + config: deup_faenet-deup_is2re-all + wandb_project: ocp-deup + wandb_tags: deup-depfaenet, 4648581-model, 4657270-dataset + test_ri: True + mode: train + model: + dropout_lowest_layer: output + first_trainable_layer: output + dropout_lin: 0.3 + cp_data_to_tmpdir: false + inference_time_loops: 1 + restart_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4648581/ + checkpoint: /network/scratch/a/alexandre.duval/ocp/runs/4648581/ + dataset: # mandatory if restart_from_dir is set + default_val: deup-val_ood_cat-val_ood_ads + deup-train-val_id: + src: /network/scratch/a/alexandre.duval/ocp/runs/4657270/deup_dataset + deup-val_ood_cat-val_ood_ads: + src: /network/scratch/a/alexandre.duval/ocp/runs/4657270/deup_dataset + deup_dataset: + create: False + +runs: + - note: deup-depfaenet (with dropout) + graph_rewiring: "" + frame_averaging: 2D + fa_method: se3-random + cp_data_to_tmpdir: True + is_disconnected: true + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 0 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 288 + num_gaussians: 68 + num_interactions: 5 + second_layer_MLP: False + skip_co: False + cutoff: 4.0 + dropout_lin: 0.3 + optim: + batch_size: 256 + 
eval_batch_size: 256 + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + eval_every: 0.4 + max_epochs: 12 + + - note: deup-depfaenet (without specifying configs) \ No newline at end of file diff --git a/ocpmodels/datasets/deup_dataset_creator.py b/ocpmodels/datasets/deup_dataset_creator.py index f3a4e3adc..fca3fea8b 100644 --- a/ocpmodels/datasets/deup_dataset_creator.py +++ b/ocpmodels/datasets/deup_dataset_creator.py @@ -441,13 +441,13 @@ def parse_args(): parser.add_argument( "--checkpoints", nargs="+", - default="/network/scratch/a/alexandre.duval/ocp/runs/4616500/", + default="/network/scratch/a/alexandre.duval/ocp/runs/4648581/", help="Paths to the checkpoints", ) parser.add_argument( "--dropout", type=float, - default=0.2, + default=0.3, help="Dropout value", ) return parser.parse_args() diff --git a/scripts/gnn_dev.py b/scripts/gnn_dev.py index bc2205553..bc3924fbe 100644 --- a/scripts/gnn_dev.py +++ b/scripts/gnn_dev.py @@ -16,7 +16,7 @@ if __name__ == "__main__": config = {} # Customize args - config["graph_rewiring"] = "remove-tag-0" + config["graph_rewiring"] = "" config["frame_averaging"] = "2D" config["fa_method"] = "random" # "random" config["test_ri"] = False @@ -29,10 +29,9 @@ str_args = sys.argv[1:] if all("config" not in arg for arg in str_args): str_args.append("--is_debug") - # str_args.append("--config=faenet-is2re-all") - str_args.append("--config=faenet-is2re-10k") - str_args.append("--adsorbates={'*O', '*OH', '*OH2', '*H'}") - # str_args.append("--is_disconnected=True") + str_args.append("--config=deup_depfaenet-deup_is2re-10k") + # str_args.append("--adsorbates={'*O', '*OH', '*OH2', '*H'}") + str_args.append("--is_disconnected=True") # str_args.append("--silent=0") warnings.warn( "No model / mode is given; chosen as default" + f"Using: {str_args[-1]}" From 4d73707b4399c29970ee68c6ed870d74660837e2 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Fri, 26 Apr 2024 03:48:24 -0400 Subject: [PATCH 21/27] test use deup-dataset in an active learning framework --- scripts/active_learning.py | 97 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 scripts/active_learning.py diff --git a/scripts/active_learning.py b/scripts/active_learning.py new file mode 100644 index 000000000..9d8d33bb6 --- /dev/null +++ b/scripts/active_learning.py @@ -0,0 +1,97 @@ +""" +Copyright (c) Facebook, Inc. and its affiliates. + +This source code is licensed under the MIT license found in the +LICENSE file in the root directory of this source tree. 
+""" +import sys +import warnings +from pathlib import Path + +sys.path.append(str(Path(__file__).resolve().parent.parent)) + +from ocpmodels.common.utils import make_script_trainer, make_trainer_from_dir +from ocpmodels.common.gfn import FAENetWrapper +from ocpmodels.trainers import SingleTrainer +from ocpmodels.datasets.lmdb_dataset import DeupDataset +from ocpmodels.datasets.data_transforms import get_transforms + +if __name__ == "__main__": + + deup_dataset_chkpt = "/network/scratch/a/alexandre.duval/ocp/runs/4657270/deup_dataset" + model_chkpt = "/network/scratch/a/alexandre.duval/ocp/runs/4648581/checkpoints/best_checkpoint.pt" + + data_config = { + "default_val": "deup-val_ood_cat-val_ood_ads", + "deup-train-val_id": { + "src": deup_dataset_chkpt + }, + "deup-val_ood_cat-val_ood_ads": { + "src": deup_dataset_chkpt + }, + "train": { + "src": "/network/scratch/s/schmidtv/ocp/datasets/ocp/is2re/all/train/", + "normalize_labels": True, + }, + "val_id": { + "src": "/network/scratch/s/schmidtv/ocp/datasets/ocp/is2re/all/val_id/" + }, + "val_ood_cat": { + "src": "/network/scratch/s/schmidtv/ocp/datasets/ocp/is2re/all/val_ood_cat/" + }, + "val_ood_ads": { + "src": "/network/scratch/s/schmidtv/ocp/datasets/ocp/is2re/all/val_ood_ads/" + }, + "val_ood_both": { + "src": "/network/scratch/s/schmidtv/ocp/datasets/ocp/is2re/all/val_ood_both/" + }, + } + + trainer = make_trainer_from_dir( + model_chkpt, + mode="continue", + overrides={ + "is_debug": True, + "silent": True, + "cp_data_to_tmpdir": False, + "config": "depfaenet-deup_is2re-all", + "deup_dataset.create": False, + "dataset": data_config, + }, + silent=True, + ) + + wrapper = FAENetWrapper( + faenet=trainer.model, + transform=get_transforms(trainer.config), + frame_averaging=trainer.config.get("frame_averaging", ""), + trainer_config=trainer.config, + ) + + wrapper.freeze() + loaders = trainer.loaders + + data_gen = iter(loaders["deup-train-val_id"]) + batch = next(data_gen) + preds = wrapper(batch) + + # trainer.config["dataset"].update({ + # "deup-train-val_id": { + # "src": "/network/scratch/s/schmidtv/ocp/runs/3301084/deup_dataset" + # }, + # "deup-val_ood_cat-val_ood_ads": { + # "src": "/network/scratch/s/schmidtv/ocp/runs/3301084/deup_dataset" + # }, + # "default_val": "deup-val_ood_cat-val_ood_ads" + # }) + + # deup_dataset_path = "/network/scratch/a/alexandre.duval/ocp/runs/4642835/deup_dataset" + # deup_dataset = DeupDataset( + # { + # **trainer.config["dataset"], + # }, + # "deup-train-val_id", + # transform=get_transforms(trainer.config), + # ) + + # deup_sample = deup_dataset[0] \ No newline at end of file From 175567efaebe65587de13c74b1b8b01415bb509f Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Fri, 17 May 2024 10:58:25 -0400 Subject: [PATCH 22/27] deupdepfaenet configs --- configs/exps/deup/uncertainty/deup_depfaenet.yaml | 6 +++--- configs/models/deup_depfaenet.yaml | 2 +- configs/models/deup_faenet.yaml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/configs/exps/deup/uncertainty/deup_depfaenet.yaml b/configs/exps/deup/uncertainty/deup_depfaenet.yaml index 7b3ccd8a1..07f4f142f 100644 --- a/configs/exps/deup/uncertainty/deup_depfaenet.yaml +++ b/configs/exps/deup/uncertainty/deup_depfaenet.yaml @@ -5,7 +5,7 @@ job: partition: long default: - config: deup_faenet-deup_is2re-all + config: deup_depfaenet-deup_is2re-all wandb_project: ocp-deup wandb_tags: deup-depfaenet, 4648581-model, 4657270-dataset test_ri: True @@ -16,8 +16,8 @@ default: dropout_lin: 0.3 cp_data_to_tmpdir: false 
inference_time_loops: 1 - restart_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4648581/ - checkpoint: /network/scratch/a/alexandre.duval/ocp/runs/4648581/ + # restart_from_dir: /network/scratch/a/alexandre.duval/ocp/runs/4648581/ + # checkpoint: /network/scratch/a/alexandre.duval/ocp/runs/4648581/ dataset: # mandatory if restart_from_dir is set default_val: deup-val_ood_cat-val_ood_ads deup-train-val_id: diff --git a/configs/models/deup_depfaenet.yaml b/configs/models/deup_depfaenet.yaml index 24ab2587c..be23501f7 100644 --- a/configs/models/deup_depfaenet.yaml +++ b/configs/models/deup_depfaenet.yaml @@ -39,7 +39,7 @@ default: res_updown: hidden_channels: 128 norm: batch1d # batch1d, layer or null - deup_features: [s, energy_pred_std] + deup_features: [s, energy_pred_std] # add q for density optim: batch_size: 256 eval_batch_size: 256 diff --git a/configs/models/deup_faenet.yaml b/configs/models/deup_faenet.yaml index f6e52681f..c11f6e450 100644 --- a/configs/models/deup_faenet.yaml +++ b/configs/models/deup_faenet.yaml @@ -39,7 +39,7 @@ default: res_updown: hidden_channels: 128 norm: batch1d # batch1d, layer or null - deup_features: [s, energy_pred_std] + deup_features: [s, energy_pred_std] # add q for density if it exists optim: batch_size: 256 eval_batch_size: 256 From 03f30388994907671668c6b77396f23bf1b84e3a Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Fri, 17 May 2024 11:01:41 -0400 Subject: [PATCH 23/27] fix issues with q + enforce graph-level deup-dataset --- ocpmodels/datasets/deup_dataset_creator.py | 2 ++ ocpmodels/models/depfaenet.py | 2 +- ocpmodels/models/deup_depfaenet.py | 20 +++++++++++--------- ocpmodels/models/deup_faenet.py | 15 ++++++++------- ocpmodels/models/faenet.py | 6 ++++-- 5 files changed, 26 insertions(+), 19 deletions(-) diff --git a/ocpmodels/datasets/deup_dataset_creator.py b/ocpmodels/datasets/deup_dataset_creator.py index fca3fea8b..8b6e193c5 100644 --- a/ocpmodels/datasets/deup_dataset_creator.py +++ b/ocpmodels/datasets/deup_dataset_creator.py @@ -329,6 +329,7 @@ def create_deup_dataset( pred_mean, batch.y_relaxed.to(pred_mean.device) ) # Store deup samples + assert len(preds["q"]) == len(batch) deup_samples += [ { "energy_target": batch.y_relaxed.clone(), @@ -481,6 +482,7 @@ def parse_args(): # base_config = make_config_from_conf_str("faenet-is2re-all") # base_datasets_config = base_config["dataset"] + # Load deup dataset deup_dataset = DeupDataset( { **base_datasets_config, diff --git a/ocpmodels/models/depfaenet.py b/ocpmodels/models/depfaenet.py index 87f76e08c..4e83dbc0e 100644 --- a/ocpmodels/models/depfaenet.py +++ b/ocpmodels/models/depfaenet.py @@ -92,6 +92,6 @@ def __init__(self, **kwargs): def energy_forward(self, data, q=None): # We need to save the tags so this step is necessary. 
self.output_block.tags_saver(data.tags) - pred = super().energy_forward(data) + pred = super().energy_forward(data, q) return pred diff --git a/ocpmodels/models/deup_depfaenet.py b/ocpmodels/models/deup_depfaenet.py index 8457acf45..619ff6a68 100644 --- a/ocpmodels/models/deup_depfaenet.py +++ b/ocpmodels/models/deup_depfaenet.py @@ -30,6 +30,7 @@ def __init__( ) def forward(self, h, edge_index, edge_weight, batch, alpha, data=None): + # If sample density is used as feature, we need to add the extra dimension if self._set_q_dim: assert data is not None assert "deup_q" in data.to_dict().keys() @@ -58,13 +59,14 @@ def forward(self, h, edge_index, edge_weight, batch, alpha, data=None): }: h = h * alpha - # Global pooling -- get final graph rep - out = scatter( - h, - batch, - dim=0, - reduce="mean" if self.deup_extra_dim > 0 else "add", - ) + # Pool into a graph rep if necessary + if len(h) > len(batch): + h = scatter( + h, + batch, + dim=0, + reduce="mean" if self.deup_extra_dim > 0 else "add", + ) # Concat graph representation with deup features (s, kde(q), std) # and apply MLPs @@ -76,7 +78,7 @@ def forward(self, h, edge_index, edge_weight, batch, alpha, data=None): + f" from the data dict ({data_keys})" ) out = torch.cat( - [out] + [h] + [data[f"deup_{k}"][:, None].float() for k in self.deup_features], dim=-1, ) @@ -87,7 +89,7 @@ def forward(self, h, edge_index, edge_weight, batch, alpha, data=None): return out @registry.register_model("deup_depfaenet") -class DeupFAENet(DepFAENet): +class DeupDepFAENet(DepFAENet): def __init__(self, *args, **kwargs): kwargs["dropout_edge"] = 0 super().__init__(*args, **kwargs) diff --git a/ocpmodels/models/deup_faenet.py b/ocpmodels/models/deup_faenet.py index 88a55964c..726e0e350 100644 --- a/ocpmodels/models/deup_faenet.py +++ b/ocpmodels/models/deup_faenet.py @@ -58,12 +58,13 @@ def forward(self, h, edge_index, edge_weight, batch, alpha, data=None): h = h * alpha # Global pooling -- get final graph rep - out = scatter( - h, - batch, - dim=0, - reduce="mean" if self.deup_extra_dim > 0 else "add", - ) + if len(h) > len(batch): + h = scatter( + h, + batch, + dim=0, + reduce="mean" if self.deup_extra_dim > 0 else "add", + ) # Concat graph representation with deup features (s, kde(q), std) # and apply MLPs @@ -75,7 +76,7 @@ def forward(self, h, edge_index, edge_weight, batch, alpha, data=None): + f" from the data dict ({data_keys})" ) out = torch.cat( - [out] + [h] + [data[f"deup_{k}"][:, None].float() for k in self.deup_features], dim=-1, ) diff --git a/ocpmodels/models/faenet.py b/ocpmodels/models/faenet.py index 7d5c6044d..78b9980cd 100644 --- a/ocpmodels/models/faenet.py +++ b/ocpmodels/models/faenet.py @@ -711,7 +711,7 @@ def energy_forward(self, data, q=None): edge_attr = edge_attr[edge_mask] rel_pos = rel_pos[edge_mask] - if q is None: + if not hasattr(data, "deup_q"): # Embedding block h, e = self.embed_block(z, rel_pos, edge_attr, data.tags) @@ -754,6 +754,7 @@ def energy_forward(self, data, q=None): # WARNING # q which is NOT the hidden state h if it was stored as a scattered # version of h. This works for GPs, NOT for MC-dropout + q = data.deup_q # No need to clone # TODO: check that it's not a problem (move to deup models) h = q alpha = None @@ -766,7 +767,8 @@ def energy_forward(self, data, q=None): elif self.skip_co == "add": energy = sum(energy_skip_co) - if q and len(q) > len(energy): + # Store graph-level representation. 
# TODO: maybe want node-level rep + if q is not None and len(q) > len(energy): # N_atoms x hidden_channels q = scatter(q, batch, dim=0, reduce="mean") # N_graphs x hidden_channels preds = { From ae7b17559354e3cc5e3af29bee4b363a32c7c987 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Mon, 20 May 2024 06:21:08 -0400 Subject: [PATCH 24/27] random instead of randon in yaml --- configs/models/faenet.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/models/faenet.yaml b/configs/models/faenet.yaml index 3e66bba5f..97d113632 100644 --- a/configs/models/faenet.yaml +++ b/configs/models/faenet.yaml @@ -1,6 +1,6 @@ default: frame_averaging: "" # {"2D", "3D", "DA", ""} - fa_method: "" # {"", all, randon, det, se3-all, se3-randon, se3-det} + fa_method: "" # {"", all, random, det, se3-all, se3-random, se3-det} model: name: faenet act: swish @@ -69,7 +69,7 @@ is2re: default: graph_rewiring: remove-tag-0 frame_averaging: "2D" # {"2D", "3D", "DA", ""} - fa_method: "se3-random" # {"", all, randon, det, se3-all, se3-randon, se3-det} + fa_method: "se3-random" # {"", all, random, det, se3-all, se3-random, se3-det} # *** Important note *** # The total number of gpus used for this run was 1. # If the global batch size (num_gpus * batch_size) is modified From 7c2714cc51c35a09875c577c647371d8e9c7634c Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Mon, 20 May 2024 06:23:13 -0400 Subject: [PATCH 25/27] random, not stochastic --- ocpmodels/datasets/data_transforms.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocpmodels/datasets/data_transforms.py b/ocpmodels/datasets/data_transforms.py index 17a63dfa5..db01cbb1e 100644 --- a/ocpmodels/datasets/data_transforms.py +++ b/ocpmodels/datasets/data_transforms.py @@ -41,11 +41,11 @@ class FrameAveraging(Transform): Can be 2D FA, 3D FA, Data Augmentation or no FA, respectively denoted by (`"2D"`, `"3D"`, `"DA"`, `""`) fa_method (str): the actual frame averaging technique used. - "stochastic" refers to sampling one frame at random (at each epoch), "det" + "random" refers to sampling one frame at random (at each epoch), "det" to chosing deterministically one frame, and "all" to using all frames. The prefix "se3-" refers to the SE(3) equivariant version of the method. "" - means that no frame averaging is used. (`""`, `"stochastic"`, `"all"`, - `"det"`, `"se3-stochastic"`, `"se3-all"`, `"se3-det"`) + means that no frame averaging is used. 
(`""`, `"random"`, `"all"`, + `"det"`, `"se3-random"`, `"se3-all"`, `"se3-det"`) Returns: (data.Data): updated data object with new positions (+ unit cell) attributes From b006540fe4027a8a82854fbd3b1901411908c234 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Tue, 21 May 2024 09:59:08 -0400 Subject: [PATCH 26/27] signnet analysis (workshop submission) --- scripts/signnet.py | 116 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 scripts/signnet.py diff --git a/scripts/signnet.py b/scripts/signnet.py new file mode 100644 index 000000000..9d1dfb843 --- /dev/null +++ b/scripts/signnet.py @@ -0,0 +1,116 @@ +import sys +from pathlib import Path +import torch + +sys.path.append(str(Path(__file__).resolve().parent.parent)) + +from ocpmodels.common.utils import make_script_trainer +from ocpmodels.trainers import SingleTrainer +from torch_geometric.data import Batch + +if __name__ == "__main__": + config = {} + + # Customize args + config["graph_rewiring"] = "remove-tag-0" + config["frame_averaging"] = "3D" + config["fa_method"] = "all" + config["test_ri"] = False + # config["optim"] = {"batch_size": 1} + + str_args = sys.argv[1:] + if all("config" not in arg for arg in str_args): + str_args.append("--is_debug") + str_args.append("--config=faenet-is2re-10k") + + # Create trainer + trainer: SingleTrainer = make_script_trainer(str_args=str_args, overrides=config) + + for batch in trainer.loaders["train"]: + break + b = batch[0] + rotated_b = b.clone() + rotated_b = trainer.rotate_graph(rotated_b, rotation="z") + rotation_matrix = rotated_b["rot"] + rotated_b = rotated_b["batch_list"][0] + + # Check: X' = X R (or X = X' R^T) + assert torch.allclose(rotated_b[0].pos @ rotation_matrix.T, b[0].pos, atol=1e-04) + assert torch.allclose(b[0].pos @ rotation_matrix, rotated_b[0].pos, atol=1e-04) + # Check: X U_i = X' U_i (compare X_fa and X'fa, abs values to deal with different frames) + assert torch.allclose( + torch.abs(b[0].pos @ b[0].fa_rot[0].squeeze(0)), + torch.abs(rotated_b[0].pos @ rotated_b[0].fa_rot[0].squeeze(0)), + atol=10e-03, + ) + # Check: U_i' = R U_i + + # SignNet model + class SignNet(torch.nn.Module): + def __init__(self, in_channels=3, hidden_channels=12, out_channels=3): + super(SignNet, self).__init__() + self.mlp = torch.nn.Sequential( + torch.nn.Linear(in_channels, hidden_channels), + torch.nn.ReLU(), + torch.nn.Linear(hidden_channels, out_channels), + ) + torch.nn.init.xavier_uniform_(self.mlp[0].weight) + torch.nn.init.xavier_uniform_(self.mlp[2].weight) + self.mlp2 = torch.nn.Linear(3 * out_channels, 3 * out_channels) + + torch.nn.init.xavier_uniform_(self.mlp2.weight) + + def forward(self, x, second_mlp=False): + if second_mlp: + res = self.mlp(x) + self.mlp(-x) + res = res.view(-1) # flatten res + res = self.mlp2(res) + return res.view((3, -1)).T # reshape as eigenvector column matrix + return (self.mlp(x) + self.mlp(-x)).T + + signnet = SignNet() + second_mlp = True + + for i in range(len(b.sid)): + g = Batch.get_example(b, i) + rotated_g = Batch.get_example(rotated_b, i) + + # Test: X_fa = R X_fa' + torch.allclose(rotation_matrix @ rotated_g.fa_rot[0], g.fa_rot[0], atol=5e-01) + + # SignNet on eigenvector matrix U for g and rotated_g + # Need SignNet(U_i) = U*, for every frame U_i + # Eigenvectors are the columns of fa_rot. 
Need rows for SignNet MLPs + eigen = signnet(g.fa_rot[0].squeeze(0).T, second_mlp) + eigen_bis = signnet(g.fa_rot[1].squeeze(0).T, second_mlp) + assert torch.allclose(eigen, eigen_bis, atol=1e-04) + + # Compare with rotated graph + rot_eigen = signnet(rotated_g.fa_rot[0].squeeze(0).T, second_mlp) + # Check U*' = R U* + if torch.allclose(rot_eigen, eigen, atol=1e-4): + print("U* is invariant to rotations") + elif torch.allclose(rot_eigen, rotation_matrix @ eigen, atol=1e-4): + print("U* is equivariant to rotations") + else: + print("U* is neither invariant nor equivariant") + # Double-Check: X U* = X' U*' + new_pos = g.pos @ eigen + new_rotated_pos = rotated_g.pos @ rot_eigen + if not torch.allclose(new_pos, new_rotated_pos, atol=1e-4): + print("No equivariance: X U* != X' U*'") + + # Different eigenvalues matrix => want different U* + m = g.fa_rot[0].squeeze(0).T + torch.randn(3, 3) + e = signnet(m, second_mlp) + if torch.allclose(e, eigen, atol=1e-4): + print("Issue: distinct graph has same signnet eigenvectors") + + # Same but on real eigenvec matrix + next_g = Batch.get_example(b, i+1) + e = signnet(next_g.fa_rot[0].squeeze(0).T, second_mlp) + if torch.allclose(e, eigen, atol=1e-4): + print("Issue: distinct graph has same signnet eigenvectors") + + # Try with more complex network + # Repalce False by True in signnet above From cc503353961b78ea83294c9d72444702bec169b4 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Thu, 31 Oct 2024 14:42:47 -0400 Subject: [PATCH 27/27] denormalise predictions --- ocpmodels/common/gfn.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/ocpmodels/common/gfn.py b/ocpmodels/common/gfn.py index 0a1f45521..0e4ba7389 100644 --- a/ocpmodels/common/gfn.py +++ b/ocpmodels/common/gfn.py @@ -20,6 +20,7 @@ def __init__( transform: Callable = None, frame_averaging: str = None, trainer_config: dict = None, + normalizers: dict = None, ): """ `FAENetWrapper` is a wrapper class for the FAENet model. It is used to perform @@ -31,6 +32,7 @@ def __init__( frame_averaging (str, optional): The frame averaging method to use. trainer_config (dict, optional): The trainer config used to create the model. Defaults to None. + normalizers (dict, optional): The normalizers used to create the model. """ super().__init__() @@ -39,6 +41,7 @@ def __init__( self.frame_averaging = frame_averaging self.trainer_config = trainer_config self._is_frozen = None + self.normalizers = normalizers @property def frozen(self): @@ -165,7 +168,15 @@ def forward( if retrieve_hidden: return preds - return preds["energy"] # denormalize? 
+ breakpoint() + + # Denormalize predictions + preds["energy"] = self.normalizers["target"].denorm( + preds["energy"], + ) + # preds["energy"] = preds["energy"].to(torch.float16) + + return preds["energy"] def freeze(self): """Freeze the model parameters.""" @@ -274,6 +285,7 @@ def prepare_for_gfn(ckpt_paths: dict, release: str) -> tuple: transform=get_transforms(trainer.config), frame_averaging=trainer.config.get("frame_averaging", ""), trainer_config=trainer.config, + normalizers=trainer.normalizers, ) wrapper.freeze() loaders = trainer.loaders @@ -288,10 +300,10 @@ def prepare_for_gfn(ckpt_paths: dict, release: str) -> tuple: from ocpmodels.common.gfn import prepare_for_gfn ckpt_paths = {"mila": "/path/to/releases_dir"} - release = "v2.3_graph_phys" + release = "0.0.1" # or ckpt_paths = { - "mila": "/network/scratch/s/schmidtv/ocp/runs/3789733/checkpoints/best_checkpoint.pt" + "mila": "/network/scratch/a/alexandre.duval/ocp/catalyst-ckpts/0.0.1/best_checkpoint.pt" } release = None wrapper, loaders = prepare_for_gfn(ckpt_paths, release)
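
The denormalisation added in the last patch inverts the mean/standard-deviation normalisation applied to the target energies during training (assuming the usual z-score style normaliser used by OCP-style trainers), so the wrapper returns predictions on the original energy scale. Below is a minimal, self-contained sketch of that behaviour; TargetNormalizer and the mean/std values are hypothetical stand-ins for the project's own normalizer object, and only the denorm call mirrors what gfn.py now does with self.normalizers["target"].

    import torch


    class TargetNormalizer:
        """Minimal z-score normalizer (hypothetical stand-in for the project's normalizer)."""

        def __init__(self, mean: float, std: float):
            self.mean = torch.tensor(mean)
            self.std = torch.tensor(std)

        def norm(self, x: torch.Tensor) -> torch.Tensor:
            # Map raw energies to the normalized scale used during training.
            return (x - self.mean) / self.std

        def denorm(self, x: torch.Tensor) -> torch.Tensor:
            # Invert the normalization: model outputs -> original energy scale.
            return x * self.std + self.mean


    if __name__ == "__main__":
        # Assumed statistics; the real mean/std come from the training-set targets.
        normalizers = {"target": TargetNormalizer(mean=-1.35, std=2.28)}

        model_out = torch.randn(8)  # normalized energy predictions for a batch of 8 graphs
        energies = normalizers["target"].denorm(model_out)

        # Round-trip check: norm(denorm(x)) recovers the model output.
        assert torch.allclose(normalizers["target"].norm(energies), model_out, atol=1e-6)

Performing the denormalisation inside FAENetWrapper.forward means downstream GFlowNet code always receives energies on the original scale, without needing access to the training-time statistics.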