Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

new_benchmark #353

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 172 additions & 0 deletions info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
{
"author": "Xinping Song",
"algorithm": "RF-SCM/Magpie v1.0",
"algorithm_long": null,
"bibtex_refs": [
  "@article{Dunn2020,\n doi = {10.1038/s41524-020-00406-3},\n url = {https://doi.org/10.1038/s41524-020-00406-3},\n year = {2020},\n month = sep,\n publisher = {Springer Science and Business Media {LLC}},\n volume = {6},\n number = {1},\n author = {Alexander Dunn and Qi Wang and Alex Ganose and Daniel Dopp and Anubhav Jain},\n title = {Benchmarking materials property prediction methods: the Matbench test set and Automatminer reference algorithm},\n journal = {npj Computational Materials}\n}",
  "@article{Breiman2001,\n doi = {10.1023/a:1010933404324},\n url = {https://doi.org/10.1023/a:1010933404324},\n year = {2001},\n publisher = {Springer Science and Business Media {LLC}},\n volume = {45},\n number = {1},\n pages = {5--32},\n author = {Leo Breiman},\n journal = {Machine Learning}\n}",
  "@article{Ward2016,\n doi = {10.1038/npjcompumats.2016.28},\n url = {https://doi.org/10.1038/npjcompumats.2016.28},\n year = {2016},\n month = aug,\n publisher = {Springer Science and Business Media {LLC}},\n volume = {2},\n number = {1},\n author = {Logan Ward and Ankit Agrawal and Alok Choudhary and Christopher Wolverton},\n title = {A general-purpose machine learning framework for predicting properties of inorganic materials},\n journal = {npj Computational Materials}\n}",
  "@article {QUA:QUA24917,author = {Faber, Felix and Lindmaa, Alexander and von Lilienfeld, O. Anatole and Armiento, Rickard},title = {Crystal structure representations for machine learning models of formation energies},journal = {International Journal of Quantum Chemistry},volume = {115},number = {16},issn = {1097-461X},url = {http://dx.doi.org/10.1002/qua.24917},doi = {10.1002/qua.24917},pages = {1094--1101},keywords = {machine learning, formation energies, representations, crystal structure, periodic systems},year = {2015},}"
],
"notes": null,
"python_version": ">=3.8,<3.9",
"requirements": {"python": [
"asttokens==2.0.5",
"attrs==21.4.0",
"automatminer @ git+https://github.com/noidvan/automatminer.git@ceb81f8537dd66fcaa3fef43396b2e102366cfd2",
"backcall==0.2.0",
"beniget==0.4.1",
"black==21.12b0",
"certifi==2021.10.8",
"cffi==1.15.0",
"charset-normalizer==2.0.10",
"click==8.0.3",
"cloudpickle==2.0.0",
"cycler==0.11.0",
"Cython==0.29.26",
"daal==2021.5.1",
"dask==2022.1.0",
"deap==1.4.1",
"decorator==5.1.1",
"distlib==0.3.4",
"dnspython==2.6.1",
"emmet-core==0.68.0",
"executing==0.8.2",
"filelock==3.4.2",
"fonttools==4.28.5",
"fsspec==2022.1.0",
"future==1.0.0",
"gast==0.5.3",
"hypothesis==6.35.1",
"idna==3.3",
"importlib_resources==6.4.0",
"iniconfig==1.1.1",
"intel-openmp==2022.0.1",
"ipp==2021.5.1",
"ipython==8.0.0",
"jedi==0.18.1",
"joblib==1.1.0",
"kiwisolver==1.3.2",
"latexcodec==3.0.0",
"line-profiler==3.4.0",
"llvmlite==0.38.0",
"locket==0.2.1",
"matbench @ git+https://github.com/noidvan/matbench@abdb4679b2ee92d0659527d10e7d75e14a87d7a4",
"matminer==0.8.0",
"matplotlib==3.5.1",
"matplotlib-inline==0.1.3",
"memory-profiler==0.60.0",
"mkl==2022.0.1",
"monty==2024.5.24",
"mp-api==0.36.1",
"mpmath==1.3.0",
"msgpack==1.0.8",
"mypy-extensions==0.4.3",
"networkx==3.2.1",
"numba==0.55.0",
"numexpr==2.8.1",
"numpy==1.22.4",
"packaging==21.3",
"palettable==3.3.3",
"pandas==1.3.5",
"parso==0.8.3",
"partd==1.2.0",
"pathspec==0.9.0",
"pexpect==4.8.0",
"pickleshare==0.7.5",
"Pillow==9.0.0",
"platformdirs==2.4.1",
"plotly==5.22.0",
"pluggy==1.0.0",
"ply==3.11",
"prompt-toolkit==3.0.24",
"psutil==5.9.0",
"ptyprocess==0.7.0",
"pure-eval==0.2.1",
"py==1.11.0",
"pybind11==2.9.0",
"pybtex==0.24.0",
"pycparser==2.21",
"pydantic==1.10.16",
"Pygments==2.11.2",
"pymatgen==2023.8.10",
"pymongo==4.7.3",
"pyparsing==3.0.6",
"pytest==6.2.5",
"python-dateutil==2.8.2",
"pythran==0.11.0",
"pytz==2021.3",
"PyYAML==5.3.1",
"requests==2.27.1",
"ruamel.yaml==0.17.4",
"ruamel.yaml.clib==0.2.8",
"scikit-learn==0.23.2",
"scipy==1.7.3",
"six==1.16.0",
"skrebate==0.62",
"sortedcontainers==2.4.0",
"spglib==2.4.0",
"stack-data==0.1.4",
"stopit==1.1.2",
"sympy==1.12.1",
"tabulate==0.9.0",
"tbb==2021.4.0",
"tenacity==8.3.0",
"threadpoolctl==3.0.0",
"toml==0.10.2",
"tomli==1.2.3",
"toolz==0.11.2",
"TPOT==0.11.7",
"tqdm==4.66.4",
"traitlets==5.1.1",
"typing_extensions==4.12.2",
"uncertainties==3.2.1",
"update-checker==0.18.0",
"urllib3==1.26.8",
"virtualenv==20.13.0",
"wcwidth==0.2.5",
"xgboost==1.5.2",
"zipp==3.19.2"]
}

}
66 changes: 66 additions & 0 deletions my_python_file.py.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"""
Code for training and recording the matbench_v0.1 random forest benchmark.

The ML pipeline is placed within the Automatminer pipeline code infrastructure for convenience.

All training and inference was done on a single 128-core HPC node.

Reduce the number of jobs n_jobs for less memory usage on consumer machines.
"""

if __name__ == '__main__':
    from automatminer import MatPipe
    from automatminer.automl.adaptors import SinglePipelineAdaptor
    from automatminer.featurization import AutoFeaturizer
    from automatminer.preprocessing import DataCleaner, FeatureReducer
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

    from matbench.bench import MatbenchBenchmark
    from multiprocessing import set_start_method

    # Force "spawn" so child worker processes start cleanly (avoids
    # fork-related issues when the featurizers parallelize).
    set_start_method("spawn", force=True)

    # The learner is a single 500-estimator Random Forest model; Automatminer
    # picks the regressor or classifier automatically per task type.
    learner = SinglePipelineAdaptor(
        regressor=RandomForestRegressor(n_estimators=500),
        classifier=RandomForestClassifier(n_estimators=500),
    )
    pipe_config = {
        "learner": learner,
        # No feature reduction: all generated features go to the model.
        "reducer": FeatureReducer(reducers=[]),
        "cleaner": DataCleaner(
            feature_na_method="mean",
            max_na_frac=0.01,
            na_method_fit="drop",
            na_method_transform="mean",
        ),
        "autofeaturizer": AutoFeaturizer(n_jobs=8, preset="debug"),
    }

    pipe = MatPipe(**pipe_config)

    mb = MatbenchBenchmark(autoload=False)

    # NOTE(review): only the jdft2d task is run here; loop over mb.tasks
    # instead to run the full benchmark suite.
    task = mb.matbench_jdft2d
    print(task)
    task.load()
    for i, fold in enumerate(task.folds):

        df_train = task.get_train_and_val_data(fold, as_type="df")

        # Fit the RF with matpipe
        pipe.fit(df_train, task.metadata.target)

        df_test = task.get_test_data(fold, include_target=False, as_type="df")
        predictions = pipe.predict(df_test)[f"{task.metadata.target} predicted"]

        # A single configuration is used
        params = {'note': 'single config; see benchmark user metadata'}

        task.record(fold, predictions, params=params)

        # Checkpoint after every fold so a crash loses at most one fold.
        mb.to_file(f"results_{i}.json.gz")

    # Save your results
    mb.to_file("results.json.gz")


Binary file added results.json.gz
Binary file not shown.
Loading