diff --git a/info.json b/info.json
new file mode 100644
index 00000000..a0842e34
--- /dev/null
+++ b/info.json
@@ -0,0 +1,55 @@
+{
+"author": "Xinping Song",
+"algorithm": "RF-SCM/Magpie v1.0",
+"algorithm_long": "",
+"bibtex_refs": ['@article{Dunn2020,\n'
+ ' doi = {10.1038/s41524-020-00406-3},\n'
+ ' url = {https://doi.org/10.1038/s41524-020-00406-3},\n'
+ ' year = {2020},\n'
+ ' month = sep,\n'
+ ' publisher = {Springer Science and Business Media {LLC}},\n'
+ ' volume = {6},\n'
+ ' number = {1},\n'
+ ' author = {Alexander Dunn and Qi Wang and Alex Ganose and Daniel Dopp and '
+ 'Anubhav Jain},\n'
+ ' title = {Benchmarking materials property prediction methods: the Matbench '
+ 'test set and Automatminer reference algorithm},\n'
+ ' journal = {npj Computational Materials}\n'
+ '}',
+ '@article{Breiman2001,\n'
+ ' doi = {10.1023/a:1010933404324},\n'
+ ' url = {https://doi.org/10.1023/a:1010933404324},\n'
+ ' year = {2001},\n'
+ ' publisher = {Springer Science and Business Media {LLC}},\n'
+ ' volume = {45},\n'
+ ' number = {1},\n'
+ ' pages = {5--32},\n'
+ ' author = {Leo Breiman},\n'
+ ' journal = {Machine Learning}\n'
+ '}',
+ '@article{Ward2016,\n'
+ ' doi = {10.1038/npjcompumats.2016.28},\n'
+ ' url = {https://doi.org/10.1038/npjcompumats.2016.28},\n'
+ ' year = {2016},\n'
+ ' month = aug,\n'
+ ' publisher = {Springer Science and Business Media {LLC}},\n'
+ ' volume = {2},\n'
+ ' number = {1},\n'
+ ' author = {Logan Ward and Ankit Agrawal and Alok Choudhary and Christopher '
+ 'Wolverton},\n'
+ ' title = {A general-purpose machine learning framework for predicting '
+ 'properties of inorganic materials},\n'
+ ' journal = {npj Computational Materials}\n'
+ '}',
+ '@article {QUA:QUA24917,author = {Faber, Felix and Lindmaa, Alexander and von '
+ 'Lilienfeld, O. 
Anatole and Armiento, Rickard},title = {Crystal structure '
+ 'representations for machine learning models of formation energies},journal = '
+ '{International Journal of Quantum Chemistry},volume = {115},number = '
+ '{16},issn = {1097-461X},url = {http://dx.doi.org/10.1002/qua.24917},doi = '
+ '{10.1002/qua.24917},pages = {1094--1101},keywords = {machine learning, '
+ 'formation energies, representations, crystal structure, periodic '
+ 'systems},year = {2015},}'],
+"notes": "",
+"requirements": {"python": ["scikit-learn==0.23.2", "numpy==1.22.4", "matbench==0.6.0"]}
+
+}
\ No newline at end of file
diff --git a/my_python_file.py.py b/my_python_file.py.py
new file mode 100644
index 00000000..15dfeedd
--- /dev/null
+++ b/my_python_file.py.py
@@ -0,0 +1,66 @@
+"""
+Code for training and recording the matbench_v0.1 random forest benchmark.
+
+The ML pipeline is placed within the Automatminer pipeline code infrastructure for convenience.
+
+All training and inference was done on a single 128-core HPC node.
+
+Reduce the number of jobs n_jobs for less memory usage on consumer machines. 
+""" + +if __name__ == '__main__': + from automatminer import MatPipe + from automatminer.automl.adaptors import SinglePipelineAdaptor, TPOTAdaptor + from automatminer.featurization import AutoFeaturizer + from automatminer.preprocessing import DataCleaner, FeatureReducer + from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor + + from matbench.bench import MatbenchBenchmark + from multiprocessing import set_start_method + + set_start_method("spawn", force=True) + + # The learner is a single 500-estimator Random Forest model + learner = SinglePipelineAdaptor( + regressor=RandomForestRegressor(n_estimators=500), + classifier=RandomForestClassifier(n_estimators=500), + ) + pipe_config = { + "learner": learner, + "reducer": FeatureReducer(reducers=[]), + "cleaner": DataCleaner(feature_na_method="mean", max_na_frac=0.01, na_method_fit="drop", na_method_transform="mean"), + "autofeaturizer": AutoFeaturizer(n_jobs=8, preset="debug"), + } + + pipe = MatPipe(**pipe_config) + + mb = MatbenchBenchmark(autoload=False) + + i = 0 + + #for task in mb.tasks: + task = mb.matbench_jdft2d + print(task) + task.load() + for fold in task.folds: + + df_train = task.get_train_and_val_data(fold, as_type="df") + + # Fit the RF with matpipe + pipe.fit(df_train, task.metadata.target) + + df_test = task.get_test_data(fold, include_target=False, as_type="df") + predictions = pipe.predict(df_test)[f"{task.metadata.target} predicted"] + + # A single configuration is used + params = {'note': 'single config; see benchmark user metadata'} + + task.record(fold, predictions, params=params) + + mb.to_file("results_" + str(i) + ".json.gz") + i += 1 + + # Save your results + mb.to_file("results.json.gz") + + diff --git a/results.json.gz b/results.json.gz new file mode 100644 index 00000000..10c38bc7 Binary files /dev/null and b/results.json.gz differ