From 938bbe2a9bb0625c45d261397f962b861bf2ecd4 Mon Sep 17 00:00:00 2001 From: Janosh Riebesell Date: Sat, 4 Feb 2023 14:38:20 -0800 Subject: [PATCH] update pre-commit hooks and apply new black format --- .pre-commit-config.yaml | 14 +++++++------- dft/fetch_cod_structs.py | 1 - notebooks/multitask/cross_val.py | 2 -- notebooks/multitask/ensemble.py | 2 -- readme.md | 13 ++++++++----- thermo/bnn/torch_dropout.py | 2 -- thermo/rf.py | 2 +- thermo/utils/__init__.py | 1 - 8 files changed, 16 insertions(+), 21 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d08196b..d5d1ac6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,23 +7,23 @@ default_install_hook_types: [pre-commit, commit-msg] repos: - repo: https://github.com/PyCQA/isort - rev: 5.10.1 + rev: 5.12.0 hooks: - id: isort - repo: https://github.com/psf/black - rev: 22.8.0 + rev: 23.1.0 hooks: - id: black-jupyter - repo: https://github.com/PyCQA/flake8 - rev: 5.0.4 + rev: 6.0.0 hooks: - id: flake8 additional_dependencies: [flake8-bugbear] - repo: https://github.com/asottile/pyupgrade - rev: v2.38.2 + rev: v3.3.1 hooks: - id: pyupgrade args: [--py39-plus] @@ -34,7 +34,7 @@ repos: - id: format-ipy-cells - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.3.0 + rev: v4.4.0 hooks: - id: check-case-conflict - id: check-symlinks @@ -45,13 +45,13 @@ repos: - id: trailing-whitespace - repo: https://github.com/codespell-project/codespell - rev: v2.2.1 + rev: v2.2.2 hooks: - id: codespell stages: [commit, commit-msg] exclude_types: [json, csv] - repo: https://github.com/PyCQA/autoflake - rev: v1.6.1 + rev: v2.0.1 hooks: - id: autoflake diff --git a/dft/fetch_cod_structs.py b/dft/fetch_cod_structs.py index ac6f47f..4911c63 100644 --- a/dft/fetch_cod_structs.py +++ b/dft/fetch_cod_structs.py @@ -81,7 +81,6 @@ # %% for struct_path in structure_paths: - path = dirname(struct_path) if isfile(f"{path}/INCAR"): continue diff --git 
a/notebooks/multitask/cross_val.py b/notebooks/multitask/cross_val.py index 2828279..f999bf2 100644 --- a/notebooks/multitask/cross_val.py +++ b/notebooks/multitask/cross_val.py @@ -96,9 +96,7 @@ def forward(self, x): metrics = {key: [] for key in metrics} for epoch in range(model.epoch, total_epochs): - for samples, truth in DataLoader(train_set, batch_size=32, shuffle=True): - optim.zero_grad() preds = model(samples) diff --git a/notebooks/multitask/ensemble.py b/notebooks/multitask/ensemble.py index 4c03523..edd8dac 100644 --- a/notebooks/multitask/ensemble.py +++ b/notebooks/multitask/ensemble.py @@ -81,9 +81,7 @@ def forward(self, x): metrics = {key: [] for key in metrics} for epoch in range(model.epoch, total_epochs): - for samples, targets in train_loader: - optim.zero_grad() preds = model(samples) diff --git a/readme.md b/readme.md index b5261a3..3ea45a6 100644 --- a/readme.md +++ b/readme.md @@ -1,14 +1,17 @@ -# Data-Driven Risk-Conscious Thermoelectric Materials Discovery +

Data-Driven Risk-Conscious
Thermoelectric Materials Discovery

+ +

-[![License](https://img.shields.io/github/license/janosh/thermo?label=License)](/license) -[![GitHub Repo Size](https://img.shields.io/github/repo-size/janosh/thermo?label=Repo+Size)](https://github.com/janosh/thermo/graphs/contributors) [![pre-commit.ci status](https://results.pre-commit.ci/badge/github/janosh/thermo/main.svg)](https://results.pre-commit.ci/latest/github/janosh/thermo/main) +[![This project supports Python 3.8+](https://img.shields.io/badge/Python-3.8+-blue.svg?logo=python&logoColor=white)](https://python.org/downloads) +[![GitHub Repo Size](https://img.shields.io/github/repo-size/janosh/thermo?label=Repo+Size)](https://github.com/janosh/thermo/graphs/contributors) +

## Project description -The aim is to discover high figure of merit ($zT > 1$) and sustainable (lead-free and rare earth-free) bulk thermoelectrics using machine learning-guided experimentation. The key advance is going beyond 'big data' which in this domain is unattainable for the foreseeable future since both first principles calculations and experimental synthesis and characterization of bulk thermoelectrics are costly and low throughput. Instead, we move towards so-called 'optimal data' by developing novel algorithms that optimize thermoelectric performance ($zT$) with minimal number of expensive calculations and experiments. +The aim is to discover high-figure–of-merit ($zT > 1$) and sustainable (lead-free and rare earth-free) bulk thermoelectrics using machine learning-guided experimentation. The key advance is going beyond 'big data' which in this domain is unattainable for the foreseeable future since both first-principles calculations and experimental synthesis and characterization of bulk thermoelectrics are costly and low throughput. Instead, we move towards so-called 'optimal data' by developing novel algorithms that optimize thermoelectric performance ($zT$) with minimal number of expensive calculations and experiments. -To date there has been no statistically robust approach to simultaneously incorporate experimental and model error into machine learning models in a search space with high opportunity cost and high latency (i.e. large time between prediction and validation). +To date, there has been no statistically robust approach to simultaneously incorporate experimental and model error into machine learning models in a search space with high opportunity cost and high latency (i.e. large time between prediction and validation). 
Consequently, searches have been unable to effectively guide experimentalists in the selection of exploring or exploiting new materials when the validation step is inherently low throughput and resource-intensive, as is the case for synthesizing new bulk functional materials like thermoelectrics. This project aims to implement a holistic pipeline to discover novel thermoelectrics: ML models predict the $zT$ of a large database of structures as well as their own uncertainty for each prediction. Candidate structures are then selected, based on maximizing $zT$ subject to a tolerable level of uncertainty, to proceed to the next stage where expensive experimental synthesis and characterization of high-$zT$ candidates are guided by Bayesian optimization and active machine learning. diff --git a/thermo/bnn/torch_dropout.py b/thermo/bnn/torch_dropout.py index b2d7333..dcb0282 100644 --- a/thermo/bnn/torch_dropout.py +++ b/thermo/bnn/torch_dropout.py @@ -45,7 +45,6 @@ def denorm_X(self, tensor, is_std=False): class GaultoisData(Normalized): def __init__(self, test_size=0.1, train=True, target_cols=None): - features, targets = load_gaultois(target_cols=target_cols) targets, features = dropna(targets, features) @@ -154,7 +153,6 @@ def fit( print(cols) for epoch in range(self.epochs, epochs): - targets, outputs = [], [] for samples, target in loader: diff --git a/thermo/rf.py b/thermo/rf.py index 6e1772e..f70be77 100644 --- a/thermo/rf.py +++ b/thermo/rf.py @@ -29,7 +29,7 @@ def get_params(self, _deep: bool = True) -> dict: which when trying to inspect instances of this class would throw a RuntimeError complaining that "scikit-learn estimators should always specify their parameters in the signature of their __init__ (no varargs). - Constructor (self, *args, **kwargs) doesn't follow this convention.". + Constructor (self, *args, **kwargs) doesn't follow this convention.". 
sklearn enforces this to be able to read and set the parameter names in meta algorithms like pipeline and grid search which we don't need. """ diff --git a/thermo/utils/__init__.py b/thermo/utils/__init__.py index 9766d20..9ca3695 100644 --- a/thermo/utils/__init__.py +++ b/thermo/utils/__init__.py @@ -69,7 +69,6 @@ def cross_val_predict(splitter, features, targets, predict_fn): for train_idx, test_idx in tqdm( splitter.split(features), desc=f"{splitter.n_splits}-fold CV" ): - X_train, X_test = features.iloc[train_idx], features.iloc[test_idx] y_train, y_test = targets.iloc[train_idx], targets.iloc[test_idx]