diff --git a/benchmarks/matbench_v0.1_DensGNN/hyper.py b/benchmarks/matbench_v0.1_DensGNN/hyper.py new file mode 100644 index 00000000..e197dca6 --- /dev/null +++ b/benchmarks/matbench_v0.1_DensGNN/hyper.py @@ -0,0 +1,934 @@ +hyper_1 = { + "model": { + "class_name": "make_model_asu", + "module_name": "kgcnn.literature.DenseGNN", + "config": { + "name": "DenseGNN", + "inputs": { + + "offset": {"shape": (None, 3), "name": "offset", "dtype": "float32", "ragged": True}, + # "voronoi_ridge_area": {"shape": (None, ), "name": "voronoi_ridge_area", "dtype": "float32", "ragged": True}, + "atomic_number": {"shape": (None,), "name": "atomic_number", "dtype": "int32", "ragged": True}, + # "AGNIFinger": {"shape": (None,61), "name": "AGNIFinger", "dtype": "float32", "ragged": True}, + "AGNIFinger": {"shape": (None,24), "name": "AGNIFinger", "dtype": "float32", "ragged": True}, + "edge_indices": {"shape": (None, 2), "name": "edge_indices", "dtype": "int64", "ragged": True}, + "charge": {'shape': [1], 'name': "charge", 'dtype': 'float32', 'ragged': False}, + + }, + + "input_block_cfg" : {'node_size': 128, + 'edge_size': 128, + 'edge_embedding_args': {'bins_distance': 32, + 'max_distance': 8.0, + 'distance_log_base': 1.0, + 'bins_voronoi_area': None, + 'max_voronoi_area': None}}, + + + "output_block_cfg" : {'edge_mlp': None, + 'node_mlp': None, + 'global_mlp': {'units': [1], + 'activation': ['linear']}, + 'aggregate_edges_local': 'sum', + 'aggregate_edges_global': 'mean', + 'aggregate_nodes': 'mean', + 'return_updated_edges': False, + 'return_updated_nodes': True, + 'return_updated_globals': True, + 'edge_attention_mlp_local': {'units': [32, 1], + 'activation': ['swish', 'swish']}, + 'edge_attention_mlp_global': {'units': [32, 1], + 'activation': ['swish', 'swish']}, + 'node_attention_mlp': {'units': [32, 1], 'activation': ['swish', 'swish']}, + 'edge_gate': None, + 'node_gate': None, + 'global_gate': None, + 'residual_node_update': False, + 'residual_edge_update': False, + 
'residual_global_update': False, + 'update_edges_input': [True, True, True, True], + 'update_nodes_input': [True, True, True], + 'update_global_input': [True, True, True], + 'multiplicity_readout': True}, + + "input_embedding": {"node": {"input_dim": 96, "output_dim": 64}, + "graph": {"input_dim": 100, "output_dim": 64} + }, + "depth": 5, + "n_units":128, + "gin_mlp": {"units": [128], "use_bias": True, "activation": ["swish"], + }, + "graph_mlp": {"units": [128], "use_bias": True, "activation": ["swish"], + }, + + "gin_args": {"pooling_method":"sum", "g_pooling_method":"max", + "edge_mlp_args": {"units": [128]*3, "use_bias": True, "activation": ["swish"]*3}, + "concat_args": {"axis": -1}, + "node_mlp_args": {"units": [128], "use_bias": True, "activation": ["swish"]}, + "graph_mlp_args": {"units": [128], "use_bias": True, "activation": ["swish"]}, + }, + } + }, + "training": { + "fit": {"batch_size": 64, "epochs": 300, "validation_freq": 20, "verbose": 2, "callbacks": []}, + + "compile": { + + # "optimizer": {"class_name": "Adam", + # "config": {"lr": { + # "class_name": "ExponentialDecay", + # "config": {"initial_learning_rate": 0.001, + # "decay_steps": 5800, + # "decay_rate": 0.5, "staircase": False}, + # } + # } + # }, + + "optimizer": { + "class_name": "Adam", + "config": { + "learning_rate": { + "class_name": "kgcnn>KerasPolynomialDecaySchedule", + "config": { + "dataset_size": 509, "batch_size": 64, "epochs": 800, + "lr_start": 0.0001, "lr_stop": 1.0e-05 + } + } + } + }, + + + "loss": "mean_absolute_error" + }, + + "cross_validation": {"class_name": "KFold", + "config": {"n_splits": 5, "random_state": 42, "shuffle": True}}, + "scaler": {"class_name": "StandardScaler", "config": {"with_std": True, "with_mean": True, "copy": True}} + }, + + "data": { + "dataset": { + "config": {}, + "methods": [ + + {"set_representation": { + "pre_processor": { + + "class_name": "KNNUnitCell", + "module_name": "kgcnn.crystal.preprocessor", + "config": {"k": 24} + + # 
"class_name": "VoronoiUnitCell", + # "module_name": "kgcnn.crystal.preprocessor", + # "config": {"min_ridge_area": 0.1} + + }, + "reset_graphs": False}}, + + ] + }, + "data_unit": "" + }, + + + "info": { + "postfix": "", + "postfix_file": "", + "kgcnn_version": "3.0.2" + } +} + +hyper_2 = { + "model": { + "class_name": "make_model_asu", + "module_name": "kgcnn.literature.DenseGNN", + "config": { + "name": "DenseGNN", + "inputs": { + "offset": {"shape": (None, 3), "name": "offset", "dtype": "float32", "ragged": True}, + # "voronoi_ridge_area": {"shape": (None, ), "name": "voronoi_ridge_area", "dtype": "float32", "ragged": True}, + "atomic_number": {"shape": (None,), "name": "atomic_number", "dtype": "int32", "ragged": True}, + # "AGNIFinger": {"shape": (None,61), "name": "AGNIFinger", "dtype": "float32", "ragged": True}, + "AGNIFinger": {"shape": (None,24), "name": "AGNIFinger", "dtype": "float32", "ragged": True}, + "edge_indices": {"shape": (None, 2), "name": "edge_indices", "dtype": "int64", "ragged": True}, + "charge": {'shape': [1], 'name': "charge", 'dtype': 'float32', 'ragged': False}, + }, + + "input_block_cfg" : {'node_size': 128, + 'edge_size': 128, + 'edge_embedding_args': {'bins_distance': 32, + 'max_distance': 8.0, + 'distance_log_base': 1.0, + 'bins_voronoi_area': None, + 'max_voronoi_area': None}}, + + + "output_block_cfg" : {'edge_mlp': None, + 'node_mlp': None, + 'global_mlp': {'units': [1], + 'activation': ['linear']}, + 'aggregate_edges_local': 'sum', + 'aggregate_edges_global': 'mean', + 'aggregate_nodes': 'mean', + 'return_updated_edges': False, + 'return_updated_nodes': True, + 'return_updated_globals': True, + 'edge_attention_mlp_local': {'units': [32, 1], + 'activation': ['swish', 'swish']}, + 'edge_attention_mlp_global': {'units': [32, 1], + 'activation': ['swish', 'swish']}, + 'node_attention_mlp': {'units': [32, 1], 'activation': ['swish', 'swish']}, + 'edge_gate': None, + 'node_gate': None, + 'global_gate': None, + 'residual_node_update': 
False, + 'residual_edge_update': False, + 'residual_global_update': False, + 'update_edges_input': [True, True, True, True], + 'update_nodes_input': [True, True, True], + 'update_global_input': [True, True, True], + 'multiplicity_readout': True}, + + "input_embedding": {"node": {"input_dim": 96, "output_dim": 64}, + "graph": {"input_dim": 100, "output_dim": 64} + }, + "depth": 5, + "n_units":128, + "gin_mlp": {"units": [128], "use_bias": True, "activation": ["swish"], + }, + "graph_mlp": {"units": [128], "use_bias": True, "activation": ["swish"], + }, + + "gin_args": {"pooling_method":"max", "g_pooling_method":"max", + "edge_mlp_args": {"units": [128]*3, "use_bias": True, "activation": ["swish"]*3}, + "concat_args": {"axis": -1}, + "node_mlp_args": {"units": [128], "use_bias": True, "activation": ["swish"]}, + "graph_mlp_args": {"units": [128], "use_bias": True, "activation": ["swish"]}, + }, + } + }, + + "training": { + "fit": {"batch_size": 64, "epochs": 300, "validation_freq": 20, "verbose": 2, "callbacks": []}, + "compile": { + "optimizer": {"class_name": "Adam", + "config": {"lr": { + "class_name": "ExponentialDecay", + "config": {"initial_learning_rate": 0.001, + "decay_steps": 5800, + "decay_rate": 0.5, "staircase": False}, + } + } + }, + + # "optimizer": { + # "class_name": "Adam", + # "config": { + # "learning_rate": { + # "class_name": "kgcnn>KerasPolynomialDecaySchedule", + # "config": { + # "dataset_size": 1013, "batch_size": 64, "epochs": 800, + # "lr_start": 0.0001, "lr_stop": 1.0e-05 + # } + # } + # } + # }, + "loss": "mean_absolute_error" + }, + + "cross_validation": {"class_name": "KFold", + "config": {"n_splits": 5, "random_state": 42, "shuffle": True}}, + "scaler": {"class_name": "StandardScaler", "config": {"with_std": True, "with_mean": True, "copy": True}} + }, + + "data": { + "dataset": { + "config": {}, + "methods": [ + + {"set_representation": { + "pre_processor": { + + "class_name": "KNNUnitCell", + "module_name": 
"kgcnn.crystal.preprocessor", + "config": {"k": 12} + + # "class_name": "VoronoiUnitCell", + # "module_name": "kgcnn.crystal.preprocessor", + # "config": {"min_ridge_area": 0.1} + + }, + "reset_graphs": False}}, + + ] + }, + "data_unit": "" + }, + + + "info": { + "postfix": "", + "postfix_file": "", + "kgcnn_version": "3.0.2" + } +} + + +hyper_3 = { + "model": { + "class_name": "make_model_asu", + "module_name": "kgcnn.literature.DenseGNN", + "config": { + "name": "DenseGNN", + "inputs": { + "offset": {"shape": (None, 3), "name": "offset", "dtype": "float32", "ragged": True}, + "voronoi_ridge_area": {"shape": (None, ), "name": "voronoi_ridge_area", "dtype": "float32", "ragged": True}, + "atomic_number": {"shape": (None,), "name": "atomic_number", "dtype": "int32", "ragged": True}, + # "AGNIFinger": {"shape": (None,61), "name": "AGNIFinger", "dtype": "float32", "ragged": True}, + "AGNIFinger": {"shape": (None,24), "name": "AGNIFinger", "dtype": "float32", "ragged": True}, + "edge_indices": {"shape": (None, 2), "name": "edge_indices", "dtype": "int64", "ragged": True}, + "charge": {'shape': [1], 'name': "charge", 'dtype': 'float32', 'ragged': False}, + + }, + + "input_block_cfg" : {'node_size': 128, + 'edge_size': 128, + 'edge_embedding_args': {'bins_distance': 32, + 'max_distance': 8.0, + 'distance_log_base': 1.0, + 'bins_voronoi_area': 32, + 'max_voronoi_area': 32}}, + + "output_block_cfg" : {'edge_mlp': None, + 'node_mlp': None, + 'global_mlp': {'units': [1], + 'activation': ['linear']}, + 'aggregate_edges_local': 'sum', + 'aggregate_edges_global': 'mean', + 'aggregate_nodes': 'mean', + 'return_updated_edges': False, + 'return_updated_nodes': True, + 'return_updated_globals': True, + 'edge_attention_mlp_local': {'units': [32, 1], + 'activation': ['swish', 'swish']}, + 'edge_attention_mlp_global': {'units': [32, 1], + 'activation': ['swish', 'swish']}, + 'node_attention_mlp': {'units': [32, 1], 'activation': ['swish', 'swish']}, + 'edge_gate': None, + 'node_gate': 
None, + 'global_gate': None, + 'residual_node_update': False, + 'residual_edge_update': False, + 'residual_global_update': False, + 'update_edges_input': [True, True, True, True], + 'update_nodes_input': [True, True, True], + 'update_global_input': [True, True, True], + 'multiplicity_readout': True}, + "input_embedding": {"node": {"input_dim": 96, "output_dim": 64}, + "graph": {"input_dim": 100, "output_dim": 64} + }, + "depth": 5, + "n_units":128, + "gin_mlp": {"units": [128], "use_bias": True, "activation": ["swish"], + }, + "graph_mlp": {"units": [128], "use_bias": True, "activation": ["swish"], + }, + + "gin_args": {"pooling_method":"sum", "g_pooling_method":"max", + "edge_mlp_args": {"units": [128]*3, "use_bias": True, "activation": ["swish"]*3}, + "concat_args": {"axis": -1}, + "node_mlp_args": {"units": [128], "use_bias": True, "activation": ["swish"]}, + "graph_mlp_args": {"units": [128], "use_bias": True, "activation": ["swish"]}, + }, + } + }, + "training": { + "fit": {"batch_size": 128, "epochs": 300, "validation_freq": 20, "verbose": 2, "callbacks": []}, + + "compile": { + + "optimizer": {"class_name": "Adam", + "config": {"lr": { + "class_name": "ExponentialDecay", + "config": {"initial_learning_rate": 0.001, + "decay_steps": 5800, + "decay_rate": 0.5, "staircase": False}, + } + } + }, + "loss": "mean_absolute_error" + }, + + "cross_validation": {"class_name": "KFold", + "config": {"n_splits": 5, "random_state": 42, "shuffle": True}}, + "scaler": {"class_name": "StandardScaler", "config": {"with_std": True, "with_mean": True, "copy": True}} + }, + "data": { + "dataset": { + "config": {}, + "methods": [ + + {"set_representation": { + "pre_processor": { + # "class_name": "KNNUnitCell", + # "module_name": "kgcnn.crystal.preprocessor", + # "config": {"k": 12} + "class_name": "VoronoiUnitCell", + "module_name": "kgcnn.crystal.preprocessor", + "config": {"min_ridge_area": 0.1} + }, + "reset_graphs": False}}, + ] + }, + "data_unit": "" + }, + "info": { + 
"postfix": "", + "postfix_file": "", + "kgcnn_version": "3.0.2" + } +} + +hyper_4 = { + "model": { + "class_name": "make_model_asu", + "module_name": "kgcnn.literature.DenseGNN", + "config": { + "name": "DenseGNN", + "inputs": { + "offset": {"shape": (None, 3), "name": "offset", "dtype": "float32", "ragged": True}, + "voronoi_ridge_area": {"shape": (None, ), "name": "voronoi_ridge_area", "dtype": "float32", "ragged": True}, + "atomic_number": {"shape": (None,), "name": "atomic_number", "dtype": "int32", "ragged": True}, + # "AGNIFinger": {"shape": (None,61), "name": "AGNIFinger", "dtype": "float32", "ragged": True}, + "AGNIFinger": {"shape": (None,24), "name": "AGNIFinger", "dtype": "float32", "ragged": True}, + "edge_indices": {"shape": (None, 2), "name": "edge_indices", "dtype": "int64", "ragged": True}, + "charge": {'shape': [1], 'name': "charge", 'dtype': 'float32', 'ragged': False}, + }, + "input_block_cfg" : {'node_size': 128, + 'edge_size': 128, + 'edge_embedding_args': {'bins_distance': 32, + 'max_distance': 8.0, + 'distance_log_base': 1.0, + 'bins_voronoi_area': 32, + 'max_voronoi_area': 32}}, + + "output_block_cfg" : {'edge_mlp': None, + 'node_mlp': None, + 'global_mlp': {'units': [1], + 'activation': ['linear']}, + 'aggregate_edges_local': 'sum', + 'aggregate_edges_global': 'mean', + 'aggregate_nodes': 'mean', + 'return_updated_edges': False, + 'return_updated_nodes': True, + 'return_updated_globals': True, + 'edge_attention_mlp_local': {'units': [32, 1], + 'activation': ['swish', 'swish']}, + 'edge_attention_mlp_global': {'units': [32, 1], + 'activation': ['swish', 'swish']}, + 'node_attention_mlp': {'units': [32, 1], 'activation': ['swish', 'swish']}, + 'edge_gate': None, + 'node_gate': None, + 'global_gate': None, + 'residual_node_update': False, + 'residual_edge_update': False, + 'residual_global_update': False, + 'update_edges_input': [True, True, True, True], + 'update_nodes_input': [True, True, True], + 'update_global_input': [True, True, True], + 
'multiplicity_readout': True}, + + "input_embedding": {"node": {"input_dim": 96, "output_dim": 64}, + "graph": {"input_dim": 100, "output_dim": 64} + }, + "depth": 5, + "n_units":128, + "gin_mlp": {"units": [128], "use_bias": True, "activation": ["swish"], + }, + "graph_mlp": {"units": [128], "use_bias": True, "activation": ["swish"], + }, + + "gin_args": {"pooling_method":"sum", "g_pooling_method":"max", + "edge_mlp_args": {"units": [128]*3, "use_bias": True, "activation": ["swish"]*3}, + "concat_args": {"axis": -1}, + "node_mlp_args": {"units": [128], "use_bias": True, "activation": ["swish"]}, + "graph_mlp_args": {"units": [128], "use_bias": True, "activation": ["swish"]}, + }, + } + }, + + "training": { + "fit": {"batch_size": 128, "epochs": 300, "validation_freq": 20, "verbose": 2, "callbacks": []}, + + "compile": { + + "optimizer": {"class_name": "Adam", + "config": {"lr": { + "class_name": "ExponentialDecay", + "config": {"initial_learning_rate": 0.001, + "decay_steps": 5800, + "decay_rate": 0.5, "staircase": False}, + } + } + }, + # "optimizer": { + # "class_name": "Adam", + # "config": { + # "learning_rate": { + # "class_name": "kgcnn>KerasPolynomialDecaySchedule", + # "config": { + # "dataset_size": 15142, "batch_size": 64, "epochs": 800, + # "lr_start": 0.0005, "lr_stop": 1.0e-05 + # } + # } + # } + # }, + "loss": "mean_absolute_error" + }, + + "cross_validation": {"class_name": "KFold", + "config": {"n_splits": 5, "random_state": 42, "shuffle": True}}, + "scaler": {"class_name": "StandardScaler", "config": {"with_std": True, "with_mean": True, "copy": True}} + }, + + "data": { + "dataset": { + + "config": {}, + "methods": [ + + {"set_representation": { + "pre_processor": { + + # "class_name": "KNNUnitCell", + # "module_name": "kgcnn.crystal.preprocessor", + # "config": {"k": 12} + + "class_name": "VoronoiUnitCell", + "module_name": "kgcnn.crystal.preprocessor", + "config": {"min_ridge_area": 0.1} + + }, + "reset_graphs": False}}, + + ] + }, + 
"data_unit": "" + }, + + + "info": { + "postfix": "", + "postfix_file": "", + "kgcnn_version": "3.0.2" + } +} + + +hyper_5 = { + "model": { + "class_name": "make_model_asu", + "module_name": "kgcnn.literature.DenseGNN", + "config": { + "name": "DenseGNN", + "inputs": { + + "offset": {"shape": (None, 3), "name": "offset", "dtype": "float32", "ragged": True}, + "voronoi_ridge_area": {"shape": (None, ), "name": "voronoi_ridge_area", "dtype": "float32", "ragged": True}, + "atomic_number": {"shape": (None,), "name": "atomic_number", "dtype": "int32", "ragged": True}, + # "AGNIFinger": {"shape": (None,61), "name": "AGNIFinger", "dtype": "float32", "ragged": True}, + "AGNIFinger": {"shape": (None,24), "name": "AGNIFinger", "dtype": "float32", "ragged": True}, + "edge_indices": {"shape": (None, 2), "name": "edge_indices", "dtype": "int64", "ragged": True}, + "charge": {'shape': [1], 'name': "charge", 'dtype': 'float32', 'ragged': False}, + + }, + "input_block_cfg" : {'node_size': 128, + 'edge_size': 128, + 'edge_embedding_args': {'bins_distance': 32, + 'max_distance': 8.0, + 'distance_log_base': 1.0, + 'bins_voronoi_area': 32, + 'max_voronoi_area': 32}}, + + + "output_block_cfg" : {'edge_mlp': None, + 'node_mlp': None, + 'global_mlp': {'units': [1], + 'activation': ['linear']}, + 'aggregate_edges_local': 'sum', + 'aggregate_edges_global': 'mean', + 'aggregate_nodes': 'mean', + 'return_updated_edges': False, + 'return_updated_nodes': True, + 'return_updated_globals': True, + 'edge_attention_mlp_local': {'units': [32, 1], + 'activation': ['swish', 'swish']}, + 'edge_attention_mlp_global': {'units': [32, 1], + 'activation': ['swish', 'swish']}, + 'node_attention_mlp': {'units': [32, 1], 'activation': ['swish', 'swish']}, + 'edge_gate': None, + 'node_gate': None, + 'global_gate': None, + 'residual_node_update': False, + 'residual_edge_update': False, + 'residual_global_update': False, + 'update_edges_input': [True, True, True, True], + 'update_nodes_input': [True, True, True], 
+ 'update_global_input': [True, True, True], + 'multiplicity_readout': True}, + + "input_embedding": {"node": {"input_dim": 96, "output_dim": 64}, + "graph": {"input_dim": 100, "output_dim": 64} + }, + "depth": 5, + "n_units":128, + "gin_mlp": {"units": [128], "use_bias": True, "activation": ["swish"], + }, + "graph_mlp": {"units": [128], "use_bias": True, "activation": ["swish"], + }, + + "gin_args": {"pooling_method":"sum", "g_pooling_method":"max", + "edge_mlp_args": {"units": [128]*3, "use_bias": True, "activation": ["swish"]*3}, + "concat_args": {"axis": -1}, + "node_mlp_args": {"units": [128], "use_bias": True, "activation": ["swish"]}, + "graph_mlp_args": {"units": [128], "use_bias": True, "activation": ["swish"]}, + }, + } + }, + "training": { + "fit": {"batch_size": 128, "epochs": 300, "validation_freq": 20, "verbose": 2, "callbacks": []}, + "compile": { + "optimizer": {"class_name": "Adam", + "config": {"lr": { + "class_name": "ExponentialDecay", + "config": {"initial_learning_rate": 0.001, + "decay_steps": 5800, + "decay_rate": 0.5, "staircase": False}, + } + } + }, + # "optimizer": { + # "class_name": "Adam", + # "config": { + # "learning_rate": { + # "class_name": "kgcnn>KerasPolynomialDecaySchedule", + # "config": { + # "dataset_size": 8789, "batch_size": 64, "epochs": 800, + # "lr_start": 0.0001, "lr_stop": 1.0e-05 + # } + # } + # } + # }, + + "loss": "mean_absolute_error" + }, + + "cross_validation": {"class_name": "KFold", + "config": {"n_splits": 5, "random_state": 42, "shuffle": True}}, + "scaler": {"class_name": "StandardScaler", "config": {"with_std": True, "with_mean": True, "copy": True}} + }, + + "data": { + "dataset": { + "config": {}, + "methods": [ + {"set_representation": { + "pre_processor": { + + # "class_name": "KNNUnitCell", + # "module_name": "kgcnn.crystal.preprocessor", + # "config": {"k": 12} + + "class_name": "VoronoiUnitCell", + "module_name": "kgcnn.crystal.preprocessor", + "config": {"min_ridge_area": 0.1} + + }, + 
"reset_graphs": False}}, + + ] + }, + "data_unit": "" + }, + + + "info": { + "postfix": "", + "postfix_file": "", + "kgcnn_version": "3.0.2" + } +} + + +hyper_6 = { + "model": { + "class_name": "make_model_asu", + "module_name": "kgcnn.literature.DenseGNN", + "config": { + "name": "DenseGNN", + "inputs": { + "offset": {"shape": (None, 3), "name": "offset", "dtype": "float32", "ragged": True}, + "voronoi_ridge_area": {"shape": (None, ), "name": "voronoi_ridge_area", "dtype": "float32", "ragged": True}, + "atomic_number": {"shape": (None,), "name": "atomic_number", "dtype": "int32", "ragged": True}, + "AGNIFinger": {"shape": (None,61), "name": "AGNIFinger", "dtype": "float32", "ragged": True}, + # "AGNIFinger": {"shape": (None,24), "name": "AGNIFinger", "dtype": "float32", "ragged": True}, + "edge_indices": {"shape": (None, 2), "name": "edge_indices", "dtype": "int64", "ragged": True}, + "charge": {'shape': [1], 'name': "charge", 'dtype': 'float32', 'ragged': False}, + }, + + "input_block_cfg" : {'node_size': 128, + 'edge_size': 128, + 'edge_embedding_args': {'bins_distance': 32, + 'max_distance': 8.0, + 'distance_log_base': 1.0, + 'bins_voronoi_area': 32, + 'max_voronoi_area': 32}}, + + "output_block_cfg" : {'edge_mlp': None, + 'node_mlp': None, + 'global_mlp': {'units': [1], + 'activation': ['linear']}, + 'aggregate_edges_local': 'sum', + 'aggregate_edges_global': 'mean', + 'aggregate_nodes': 'mean', + 'return_updated_edges': False, + 'return_updated_nodes': True, + 'return_updated_globals': True, + 'edge_attention_mlp_local': {'units': [32, 1], + 'activation': ['swish', 'swish']}, + 'edge_attention_mlp_global': {'units': [32, 1], + 'activation': ['swish', 'swish']}, + 'node_attention_mlp': {'units': [32, 1], 'activation': ['swish', 'swish']}, + 'edge_gate': None, + 'node_gate': None, + 'global_gate': None, + 'residual_node_update': False, + 'residual_edge_update': False, + 'residual_global_update': False, + 'update_edges_input': [True, True, True, True], + 
'update_nodes_input': [True, True, True], + 'update_global_input': [True, True, True], + 'multiplicity_readout': True}, + + "input_embedding": {"node": {"input_dim": 96, "output_dim": 64}, + "graph": {"input_dim": 100, "output_dim": 64} + }, + "depth": 5, + "n_units":128, + "gin_mlp": {"units": [128], "use_bias": True, "activation": ["swish"], + }, + "graph_mlp": {"units": [128], "use_bias": True, "activation": ["swish"], + }, + + "gin_args": {"pooling_method":"sum", "g_pooling_method":"mean", + "edge_mlp_args": {"units": [128]*3, "use_bias": True, "activation": ["swish"]*3}, + "concat_args": {"axis": -1}, + "node_mlp_args": {"units": [128], "use_bias": True, "activation": ["swish"]}, + "graph_mlp_args": {"units": [128], "use_bias": True, "activation": ["swish"]}, + }, + } + }, + + "training": { + "fit": {"batch_size": 128, "epochs": 300, "validation_freq": 20, "verbose": 2, "callbacks": []}, + + "compile": { + + "optimizer": {"class_name": "Adam", + "config": {"lr": { + "class_name": "ExponentialDecay", + "config": {"initial_learning_rate": 0.001, + "decay_steps": 5800, + "decay_rate": 0.5, "staircase": False}, + } + } + }, + + # "optimizer": { + # "class_name": "Adam", + # "config": { + # "learning_rate": { + # "class_name": "kgcnn>KerasPolynomialDecaySchedule", + # "config": { + # "dataset_size": 8789, "batch_size": 64, "epochs": 800, + # "lr_start": 0.0001, "lr_stop": 1.0e-05 + # } + # } + # } + # }, + + "loss": "mean_absolute_error" + }, + + "cross_validation": {"class_name": "KFold", + "config": {"n_splits": 5, "random_state": 42, "shuffle": True}}, + "scaler": {"class_name": "StandardScaler", "config": {"with_std": True, "with_mean": True, "copy": True}} + }, + + "data": { + "dataset": { + "config": {}, + "methods": [ + {"set_representation": { + "pre_processor": { + + # "class_name": "KNNUnitCell", + # "module_name": "kgcnn.crystal.preprocessor", + # "config": {"k": 12} + + "class_name": "VoronoiUnitCell", + "module_name": "kgcnn.crystal.preprocessor", + 
"config": {"min_ridge_area": 0.1} + + }, + "reset_graphs": False}}, + + ] + }, + "data_unit": "" + }, + + + "info": { + "postfix": "", + "postfix_file": "", + "kgcnn_version": "3.0.2" + } +} + + +hyper_7 = { + "model": { + "class_name": "make_model_asu", + "module_name": "kgcnn.literature.DenseGNN", + "config": { + "name": "DenseGNN", + "inputs": { + + "offset": {"shape": (None, 3), "name": "offset", "dtype": "float32", "ragged": True}, + # "voronoi_ridge_area": {"shape": (None, ), "name": "voronoi_ridge_area", "dtype": "float32", "ragged": True}, + "atomic_number": {"shape": (None,), "name": "atomic_number", "dtype": "int32", "ragged": True}, + # "AGNIFinger": {"shape": (None,61), "name": "AGNIFinger", "dtype": "float32", "ragged": True}, + "AGNIFinger": {"shape": (None,24), "name": "AGNIFinger", "dtype": "float32", "ragged": True}, + "edge_indices": {"shape": (None, 2), "name": "edge_indices", "dtype": "int64", "ragged": True}, + "charge": {'shape': [1], 'name': "charge", 'dtype': 'float32', 'ragged': False}, + + }, + + "input_block_cfg" : {'node_size': 128, + 'edge_size': 128, + 'edge_embedding_args': {'bins_distance': 32, + 'max_distance': 8.0, + 'distance_log_base': 1.0, + 'bins_voronoi_area': None, + 'max_voronoi_area': None}}, + + + "output_block_cfg" : {'edge_mlp': None, + 'node_mlp': None, + 'global_mlp': {'units': [1], + 'activation': ['linear']}, + 'aggregate_edges_local': 'sum', + 'aggregate_edges_global': 'mean', + 'aggregate_nodes': 'mean', + 'return_updated_edges': False, + 'return_updated_nodes': True, + 'return_updated_globals': True, + 'edge_attention_mlp_local': {'units': [32, 1], + 'activation': ['swish', 'swish']}, + 'edge_attention_mlp_global': {'units': [32, 1], + 'activation': ['swish', 'swish']}, + 'node_attention_mlp': {'units': [32, 1], 'activation': ['swish', 'swish']}, + 'edge_gate': None, + 'node_gate': None, + 'global_gate': None, + 'residual_node_update': False, + 'residual_edge_update': False, + 'residual_global_update': False, + 
'update_edges_input': [True, True, True, True], + 'update_nodes_input': [True, True, True], + 'update_global_input': [True, True, True], + 'multiplicity_readout': True}, + + "input_embedding": {"node": {"input_dim": 96, "output_dim": 64}, + "graph": {"input_dim": 100, "output_dim": 64} + }, + "depth": 5, + "n_units":128, + "gin_mlp": {"units": [128], "use_bias": True, "activation": ["swish"], + }, + "graph_mlp": {"units": [128], "use_bias": True, "activation": ["swish"], + }, + + "gin_args": {"pooling_method":"sum", "g_pooling_method":"max", + "edge_mlp_args": {"units": [128]*3, "use_bias": True, "activation": ["swish"]*3}, + "concat_args": {"axis": -1}, + "node_mlp_args": {"units": [128], "use_bias": True, "activation": ["swish"]}, + "graph_mlp_args": {"units": [128], "use_bias": True, "activation": ["swish"]}, + }, + } + }, + + "training": { + "fit": {"batch_size": 64, "epochs": 300, "validation_freq": 20, "verbose": 2, "callbacks": []}, + "compile": { + + # "optimizer": {"class_name": "Adam", + # "config": {"lr": { + # "class_name": "ExponentialDecay", + # "config": {"initial_learning_rate": 0.001, + # "decay_steps": 5800, + # "decay_rate": 0.5, "staircase": False}, + # } + # } + # }, + + "optimizer": { + "class_name": "Adam", + "config": { + "learning_rate": { + "class_name": "kgcnn>KerasPolynomialDecaySchedule", + "config": { + "dataset_size": 3811, "batch_size": 64, "epochs": 800, + "lr_start": 0.0001, "lr_stop": 1.0e-05 + } + } + } + }, + + "loss": "mean_absolute_error" + }, + + "cross_validation": {"class_name": "KFold", + "config": {"n_splits": 5, "random_state": 42, "shuffle": True}}, + "scaler": {"class_name": "StandardScaler", "config": {"with_std": True, "with_mean": True, "copy": True}} + }, + + "data": { + "dataset": { + "config": {}, + "methods": [ + {"set_representation": { + "pre_processor": { + "class_name": "KNNUnitCell", + "module_name": "kgcnn.crystal.preprocessor", + "config": {"k": 12} + + # "class_name": "VoronoiUnitCell", + # 
"module_name": "kgcnn.crystal.preprocessor", + # "config": {"min_ridge_area": 0.1} + + }, + "reset_graphs": False}}, + ] + }, + "data_unit": "" + }, + + + "info": { + "postfix": "", + "postfix_file": "", + "kgcnn_version": "3.0.2" + } +} diff --git a/benchmarks/matbench_v0.1_DensGNN/info.json b/benchmarks/matbench_v0.1_DensGNN/info.json new file mode 100644 index 00000000..18343829 --- /dev/null +++ b/benchmarks/matbench_v0.1_DensGNN/info.json @@ -0,0 +1,10 @@ +{ + "authors": "Hongwei Du, Hong Wang (original code by Hongwei Du)", + "algorithm": "DenseGNN (kgcnn v3.0.2)", + "algorithm_long": "DenseGNN: universal and scalable deeper graph neural networks for high-performance property prediction in crystals and molecules. Adapted implementation of `kgcnn`. Original code from https://github.com/dhw059/DenseGNN. Initially, features for nodes, edges, and global data are extracted. In each layer of the GNN, the GraphMLP layers transform the node and edge features, while the DenseGNN layer aggregates information from neighbors and global features. The outputs are concatenated and used to update the node features. Finally, the output block processes the updated edge and node features along with global features and edge indices to produce the final output. We used a larger input embedding vector [64] of atom species and added the charge as input graph attributes. The training configuration specifies a batch size of 64, 300 epochs, and validation every 20 epochs. It uses the Adam optimizer with an exponential decay learning rate starting at 0.001, decaying every 5800 steps by a rate of 0.5. The loss function is mean absolute error. 
K-fold cross-validation with 5 splits is applied, and data is standardized using StandardScaler. Training was carried out on an Nvidia RTX 4090 GPU with 24 GB of memory. Hyperparameters were not optimized.", + "bibtex_refs": [ + "@article{UnderReview, author = {Hongwei Du and Hong Wang}, title = {DenseGNN: universal and scalable deeper graph neural networks for high-performance property prediction in crystals and molecules}, journal = {npj Computational Materials}, volume = {}, number = {}, pages = {}, year = {2024}, doi = {Under Review}, URL = {}, eprint = {}}" + ], + "notes": "", + "requirements": "See GitHub page https://github.com/dhw059/DenseGNN." +} diff --git a/benchmarks/matbench_v0.1_DensGNN/results.json.gz b/benchmarks/matbench_v0.1_DensGNN/results.json.gz new file mode 100644 index 00000000..65f4fd14 Binary files /dev/null and b/benchmarks/matbench_v0.1_DensGNN/results.json.gz differ diff --git a/benchmarks/matbench_v0.1_DensGNN/train.ipynb b/benchmarks/matbench_v0.1_DensGNN/train.ipynb new file mode 100644 index 00000000..af9cb50f --- /dev/null +++ b/benchmarks/matbench_v0.1_DensGNN/train.ipynb @@ -0,0 +1,180 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os.path\n", + "import argparse\n", + "import pandas as pd\n", + "import tensorflow as tf\n", + "from matbench.bench import MatbenchBenchmark\n", + "from kgcnn.data.crystal import CrystalDataset\n", + "from kgcnn.literature.DenseGNN import make_model_asu\n", + "\n", + "from sklearn.preprocessing import StandardScaler\n", + "from kgcnn.training.schedule import LinearWarmupExponentialDecay\n", + "from kgcnn.training.scheduler import LinearLearningRateScheduler\n", + "import kgcnn.training.callbacks\n", + "from kgcnn.utils.devices import set_devices_gpu\n", + "import numpy as np\n", + "from copy import deepcopy\n", + "from hyper import *\n", + "\n", + "parser = argparse.ArgumentParser(description='Train DenseGNN.')\n", + 
"parser.add_argument(\"--gpu\", required=False, help=\"GPU index used for training.\",\n", + " default=None, nargs=\"+\", type=int)\n", + "args = vars(parser.parse_args())\n", + "print(\"Input of argparse:\", args)\n", + "gpu_to_use = args[\"gpu\"]\n", + "set_devices_gpu(gpu_to_use)\n", + "\n", + "subsets_compatible = [\"matbench_jdft2d\", \"matbench_phonons\", \"matbench_mp_gap\", \n", + " \"matbench_perovskites\",\n", + " \"matbench_log_kvrh\", \"matbench_log_gvrh\", \"matbench_dielectric\"]\n", + "mb = MatbenchBenchmark(subset=subsets_compatible, autoload=False)\n", + "\n", + "callbacks = {\n", + " \"graph_labels\": lambda st, ds: np.expand_dims(ds, axis=-1),\n", + " \"node_coordinates\": lambda st, ds: np.array(st.cart_coords, dtype=\"float\"),\n", + " \"node_frac_coordinates\": lambda st, ds: np.array(st.frac_coords, dtype=\"float\"),\n", + " \"graph_lattice\": lambda st, ds: np.ascontiguousarray(np.array(st.lattice.matrix), dtype=\"float\"),\n", + " \"abc\": lambda st, ds: np.array(st.lattice.abc),\n", + " \"charge\": lambda st, ds: np.array([st.charge], dtype=\"float\"),\n", + " \"volume\": lambda st, ds: np.array([st.lattice.volume], dtype=\"float\"),\n", + " \"node_number\": lambda st, ds: np.array(st.atomic_numbers, dtype=\"int\"),\n", + "}\n", + "\n", + "hyper_all = {\n", + " \"matbench_jdft2d\": hyper_1,\n", + " \"matbench_phonons\": hyper_2,\n", + " \"matbench_mp_gap\": hyper_3,\n", + " \"matbench_perovskites\": hyper_4,\n", + " \"matbench_log_kvrh\": hyper_5,\n", + " \"matbench_log_gvrh\": hyper_6,\n", + " \"matbench_dielectric\": hyper_7,\n", + "}\n", + "\n", + "restart_training = True\n", + "remove_invalid_graphs_on_predict = True\n", + "\n", + "for idx_task, task in enumerate(mb.tasks):\n", + " task.load()\n", + " for i, fold in enumerate(task.folds):\n", + " hyper = deepcopy(hyper_all[task.dataset_name])\n", + "\n", + " # Define loss for either classification or regression\n", + " loss = {\n", + " \"class_name\": \"BinaryCrossentropy\", 
\"config\": {\"from_logits\": True}\n", + " } if task.metadata[\"task_type\"] == \"classification\" else \"mean_absolute_error\"\n", + " hyper[\"training\"][\"compile\"][\"loss\"] = loss\n", + "\n", + " if restart_training and os.path.exists(\n", + " \"%s_predictions_%s_fold_%s.npy\" % (task.dataset_name, hyper[\"model\"][\"config\"][\"name\"], i)):\n", + " predictions = np.load(\n", + " \"%s_predictions_%s_fold_%s.npy\" % (task.dataset_name, hyper[\"model\"][\"config\"][\"name\"], i)\n", + " )\n", + " task.record(fold, predictions)\n", + " continue\n", + "\n", + " train_inputs, train_outputs = task.get_train_and_val_data(fold)\n", + " data_train = CrystalDataset()\n", + "\n", + " data_train._map_callbacks(train_inputs, pd.Series(train_outputs.values), callbacks)\n", + " print(\"Making graph... (this may take a while)\")\n", + " data_train.set_methods(hyper[\"data\"][\"dataset\"][\"methods\"])\n", + " data_train.clean(hyper[\"model\"][\"config\"][\"inputs\"])\n", + "\n", + " y_train = np.array(data_train.get(\"graph_labels\"))\n", + " x_train = data_train.tensor(hyper[\"model\"][\"config\"][\"inputs\"])\n", + "\n", + " if task.metadata[\"task_type\"] == \"classification\":\n", + " scaler = None\n", + " else:\n", + " scaler = StandardScaler(**hyper[\"training\"][\"scaler\"][\"config\"])\n", + " y_train = scaler.fit_transform(y_train)\n", + " print(y_train.shape)\n", + "\n", + " # train and validate your model\n", + " model = make_model_asu(**hyper[\"model\"][\"config\"])\n", + " model.compile(\n", + " loss=tf.keras.losses.get(hyper[\"training\"][\"compile\"][\"loss\"]),\n", + " optimizer=tf.keras.optimizers.get(hyper[\"training\"][\"compile\"][\"optimizer\"])\n", + " )\n", + " hist = model.fit(\n", + " x_train, y_train,\n", + " batch_size=hyper[\"training\"][\"fit\"][\"batch_size\"],\n", + " epochs=hyper[\"training\"][\"fit\"][\"epochs\"],\n", + " verbose=hyper[\"training\"][\"fit\"][\"verbose\"],\n", + " callbacks=[tf.keras.utils.deserialize_keras_object(x) for x 
in hyper[\"training\"][\"fit\"][\"callbacks\"]]\n", + " )\n", + "\n", + " # Get testing data\n", + " test_inputs = task.get_test_data(fold, include_target=False)\n", + " data_test = CrystalDataset()\n", + " data_test._map_callbacks(test_inputs, pd.Series(np.zeros(len(test_inputs))), callbacks)\n", + " print(\"Making graph... (this may take a while)\")\n", + " data_test.set_methods(hyper[\"data\"][\"dataset\"][\"methods\"])\n", + "\n", + " if remove_invalid_graphs_on_predict:\n", + " removed = data_test.clean(hyper[\"model\"][\"config\"][\"inputs\"])\n", + " np.save(\n", + " \"%s_predictions_invalid_%s_fold_%s.npy\" % (task.dataset_name, hyper[\"model\"][\"config\"][\"name\"], i),\n", + " removed\n", + " )\n", + " else:\n", + " removed = None\n", + "\n", + " # Predict on the testing data\n", + " x_test = data_test.tensor(hyper[\"model\"][\"config\"][\"inputs\"])\n", + " predictions_model = model.predict(x_test)\n", + "\n", + " if remove_invalid_graphs_on_predict:\n", + " indices_test = [j for j in range(len(test_inputs))]\n", + " for j in removed:\n", + " indices_test.pop(j)\n", + " predictions = np.expand_dims(np.zeros(len(test_inputs), dtype=\"float\"), axis=-1)\n", + " predictions[np.array(indices_test)] = predictions_model\n", + " else:\n", + " predictions = predictions_model\n", + "\n", + " if task.metadata[\"task_type\"] == \"classification\":\n", + " def np_sigmoid(x):\n", + " return np.exp(-np.logaddexp(0, -x))\n", + " predictions = np_sigmoid(predictions)\n", + " else:\n", + " predictions = scaler.inverse_transform(predictions)\n", + "\n", + " if predictions.shape[-1] == 1:\n", + " predictions = np.squeeze(predictions, axis=-1)\n", + "\n", + " np.save(\n", + " \"%s_predictions_%s_fold_%s.npy\" % (task.dataset_name, hyper[\"model\"][\"config\"][\"name\"], i),\n", + " predictions\n", + " )\n", + "\n", + " # Record data!\n", + " task.record(fold, predictions)\n", + "\n", + "# Save your results\n", + "mb.to_file(\"results_densegnn.json.gz\")\n", + "\n", + "for 
key, values in mb.scores.items():\n", + " factor = 1000.0 if key in [\"matbench_jdft2d\"] else 1.0\n", + " if key not in [\"matbench_mp_is_metal\"]:\n", + " print(key, factor*values[\"mae\"][\"mean\"], factor*values[\"mae\"][\"std\"])\n", + " else:\n", + " print(key, values[\"rocauc\"][\"mean\"], values[\"rocauc\"][\"std\"])\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/benchmarks/matbench_v0.1_LGDCNN_v1.0_composition/info.json b/benchmarks/matbench_v0.1_LGDCNN_v1.0_composition/info.json new file mode 100644 index 00000000..e22c45f1 --- /dev/null +++ b/benchmarks/matbench_v0.1_LGDCNN_v1.0_composition/info.json @@ -0,0 +1,8 @@ +{ + "authors": "Hongwei Du, Hong Wang (original code by Hongwei Du)", + "algorithm": "LGDCNN", + "algorithm_long": "Rational Design of Deep Learning Networks Based on Fusion Strategy for Improved Materials Property Predictions. See github page for more information: https://github.com/dhw059/DeepModelFusion.", + "bibtex_refs": "@article{UnderReview, author = {Hongwei Du, Hong Wang}, title = {Rational Design of Deep Learning Networks Based on Fusion Strategy for Improved Materials Property Predictions}, journal = {Journal of Chemical Theory and Computation}, volume = {}, number = {}, pages = {}, year = {2024}, doi = {Under Review}, URL = {}, eprint = {}}", + "requirements": "See GitHub page for LGDCNN.", + "notes": "" +} \ No newline at end of file diff --git a/benchmarks/matbench_v0.1_LGDCNN_v1.0_composition/matbench_notebook.ipynb b/benchmarks/matbench_v0.1_LGDCNN_v1.0_composition/matbench_notebook.ipynb new file mode 100644 index 00000000..7ebcc8bd --- /dev/null +++ b/benchmarks/matbench_v0.1_LGDCNN_v1.0_composition/matbench_notebook.ipynb @@ -0,0 +1,257 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from matbench.bench import MatbenchBenchmark\n", + "import os\n", + "import 
numpy as np\n", + "import pandas as pd\n", + "import torch\n", + "from sklearn.metrics import roc_auc_score \n", + "from lgdcnn.fusion_lstm_dcnn import LGDCNN\n", + "from lgdcnn.train import Model\n", + "from lgdcnn.utils.get_compute_device import get_compute_device\n", + "\n", + "compute_device = get_compute_device(prefer_last=False)\n", + "RNG_SEED = 42\n", + "torch.manual_seed(RNG_SEED)\n", + "np.random.seed(RNG_SEED)\n", + "model_name = \"L-G-DCNN-matbench\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# %%\n", + "def get_model(data_dir,model_name, mat_prop, i, classification=False, batch_size=None,\n", + " transfer=None, verbose=True):\n", + " # Get the TorchedLGDCNN architecture loaded\n", + " model = Model(LGDCNN(compute_device=compute_device).to(compute_device),\n", + " model_name=f'{mat_prop}{i}', verbose=verbose)\n", + "\n", + " # Train network starting at pretrained weights\n", + " if transfer is not None:\n", + " model.load_network(f'{transfer}.pth')\n", + " model.model_name = f'{mat_prop}'\n", + "\n", + " # Apply BCEWithLogitsLoss to model output if binary classification is True\n", + " if classification:\n", + " model.classification = True\n", + "\n", + " # Get the datafiles you will learn from\n", + " train_data = f'{data_dir}/{mat_prop}/train.csv'\n", + " val_data = f'{data_dir}/{mat_prop}/val.csv'\n", + "\n", + " # Load the train and validation data before fitting the network\n", + " data_size = pd.read_csv(train_data).shape[0]\n", + " batch_size = 2**round(np.log2(data_size)-4)\n", + " if batch_size < 2**7:\n", + " batch_size = 2**7\n", + " if batch_size > 2**12:\n", + " batch_size = 2**12\n", + " \n", + " model.load_data(train_data, batch_size=batch_size//2, train=True)\n", + " print(f'training with batchsize {model.batch_size} '\n", + " f'(2**{np.log2(model.batch_size):0.3f})')\n", + " model.load_data(val_data, batch_size=batch_size//2)\n", + "\n", + " # Set the number of 
epochs, decide if you want a loss curve to be plotted\n", + " model.fit(epochs=300, losscurve=False)\n", + "\n", + " # Save the network (saved as f\"{model_name}.pth\")\n", + " model.save_network(model_name)\n", + " return model\n", + "\n", + "def load_model(data_dir, model_name, mat_prop, i, classification, file_name, verbose=True):\n", + " # Load up a saved network.\n", + " model = Model(LGDCNN(compute_device=compute_device).to(compute_device),\n", + " model_name=f'{mat_prop}{i}', verbose=verbose)\n", + " model.load_network(model_name, f'{mat_prop}{i}.pth')\n", + "\n", + " # Check if classification task\n", + " if classification:\n", + " model.classification = True\n", + " # Load the data you want to predict with\n", + " data = f'{data_dir}/{mat_prop}/{file_name}'\n", + " # data is reloaded to model.data_loader\n", + " model.load_data(data, batch_size=2**9)\n", + " return model\n", + "\n", + "def get_results(model):\n", + " output = model.predict(model.data_loader) # predict the data saved here\n", + " return model, output\n", + "\n", + "def to_csv(output, save_name):\n", + " # parse output and save to csv\n", + " act, pred, formulae, uncertainty = output\n", + " df = pd.DataFrame([formulae, act, pred, uncertainty]).T\n", + " # df.columns = ['composition', 'target', 'pred-0', 'uncertainty']\n", + " df.columns = ['formula', 'actual', 'predicted', 'uncertainty']\n", + " save_path = 'matbench_predictions/'\n", + " os.makedirs(save_path, exist_ok=True)\n", + " df.to_csv(f'{save_path}/{save_name}', index_label='Index')\n", + " \n", + "\n", + "def save_results(data_dir, model_name,mat_prop, fold, classification, file_name, ):\n", + " model = load_model(data_dir, model_name,mat_prop, fold, classification, file_name = 'test.csv' )\n", + " model, output = get_results(model)\n", + " \n", + " # Get appropriate metrics for saving to csv\n", + " if model.classification:\n", + " auc = roc_auc_score(output[0], output[1])\n", + " print(f'\\n{mat_prop} ROC AUC: {auc:0.3f}')\n", + 
" else:\n", + " mae = np.abs(output[0] - output[1]).mean()\n", + " print(f'\\n{mat_prop} mae: {mae:0.3g}')\n", + "\n", + " # save predictions to a csv\n", + " fname = f'{mat_prop}_{file_name.replace(\".csv\", \"\")}_output{fold}.csv'\n", + " to_csv(output, fname)\n", + " return model, output" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "#condense_formula takes a material and returns the chemical formula in the correct format for LGDCNN\n", + "def condense_formula(mat):\n", + " if isinstance(mat, str):\n", + " return mat\n", + " else:\n", + " return mat.formula.replace(' ', '')\n", + "\n", + "#change_input runs condense_formula on all the input data used for training\n", + "def change_input(train_inputs):\n", + " inputs = []\n", + " for input in train_inputs:\n", + " inputs.append(condense_formula(input))\n", + " return inputs\n", + "\n", + "#make_df creates a data frame containing the train inputs and outputs for LGDCNN\n", + "def make_df(train_inputs, train_outputs):\n", + " input_df = pd.DataFrame({'formula': train_inputs, 'target': train_outputs})\n", + " return input_df\n", + "\n", + "#make_df_test creates a data frame containing the test inputs for LGDCNN\n", + "def make_df_test(test_inputs, test_outputs):\n", + " test_df = pd.DataFrame({'formula' : test_inputs, 'target': test_outputs})\n", + " # test_df['target'] = np.nan\n", + " return test_df\n", + "\n", + "#split_train_val splits the training data into two sets: training and validation\n", + "def split_train_val(df):\n", + " df = df.sample(frac = 1.0, random_state = 7)\n", + " val_df = df.sample(frac = 0.1, random_state = 7)\n", + " train_df = df.drop(val_df.index) \n", + " print(train_df.shape, val_df.shape) \n", + " return train_df, val_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subset = [\"matbench_jdft2d\", \"matbench_steels\", \n", + " 
\"matbench_perovskites\", \"matbench_expt_gap\",\n", + " \"matbench_phonons\", \"matbench_dielectric\", \n", + " \"matbench_log_gvrh\", \"matbench_log_kvrh\",\n", + " \"matbench_mp_gap\", \"matbench_mp_e_form\"]\n", + "\n", + "mb = MatbenchBenchmark(autoload=False, subset=subset)\n", + "data_dir = 'data/matbench_temp'\n", + "os.makedirs(data_dir, exist_ok= True)\n", + "\n", + "results_dict = {}\n", + "\n", + "for task in mb.tasks:\n", + " task.load()\n", + " mat_prop = task.dataset_name\n", + " os.makedirs(f'{data_dir}/{mat_prop}', exist_ok= True)\n", + " for fold in task.folds:\n", + " train_inputs, train_outputs = task.get_train_and_val_data(fold)\n", + " test_inputs , test_outputs = task.get_test_data(fold, include_target=True)\n", + "\n", + " #Preparing the inputs data for LGDCNN\n", + " inputs = change_input(train_inputs)\n", + " df = make_df(inputs, train_outputs)\n", + "\n", + " #Creating the training and validation sets\n", + " train_df, val_df = split_train_val(df)\n", + " train_df.to_csv(f'{data_dir}/{mat_prop}/train.csv')\n", + " val_df.to_csv(f'{data_dir}/{mat_prop}/val.csv')\n", + "\n", + " #Getting and preparing the testing data\n", + " test_inputs_formula = change_input(test_inputs)\n", + " test_df = make_df_test(test_inputs_formula, test_outputs)\n", + " test_df.to_csv(f'{data_dir}/{mat_prop}/test.csv')\n", + "\n", + " #Training LGDCNN\n", + " model = get_model(data_dir, model_name, mat_prop, fold, classification = False, verbose = True, )\n", + " \n", + " model_test, output = save_results(data_dir, model_name,mat_prop, fold, classification = False,\n", + " file_name='test.csv',)\n", + " \n", + " # Recording our data!\n", + " predictions = output[1]\n", + " task.record(fold, predictions)\n", + "\n", + "# Saving our results\n", + "mb.to_file(\"LGDCNN_\"+mat_prop +\".json\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pytorch", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.17" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "7dd5d76405b906035e1d1a24c7f24088f68ab8fc773386bbbd9b8e7c7c6d48a3" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/benchmarks/matbench_v0.1_LGDCNN_v1.0_composition/results.json.gz b/benchmarks/matbench_v0.1_LGDCNN_v1.0_composition/results.json.gz new file mode 100644 index 00000000..f3278fef Binary files /dev/null and b/benchmarks/matbench_v0.1_LGDCNN_v1.0_composition/results.json.gz differ