-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
153 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
import logging | ||
import os | ||
from dataclasses import dataclass, field | ||
from typing import TYPE_CHECKING, Dict, List, Optional | ||
|
||
from olmo_core.distributed.utils import get_rank | ||
from olmo_core.exceptions import OLMoEnvironmentError | ||
|
||
from .callback import Callback | ||
|
||
if TYPE_CHECKING: | ||
from comet_ml import Experiment | ||
|
||
log = logging.getLogger(__name__) | ||
|
||
COMET_API_KEY_ENV_VAR = "COMET_API_KEY" | ||
|
||
|
||
@dataclass
class CometCallback(Callback):
    """
    Logs metrics to Comet.ml from rank 0.

    .. important::
        Requires the ``comet_ml`` package and the environment variable ``COMET_API_KEY``.

    .. note::
        This callback logs metrics from every single step to Comet.ml, regardless of the value
        of :data:`Trainer.metrics_collect_interval <olmo_core.train.Trainer.metrics_collect_interval>`.
    """

    enabled: bool = True
    """
    Set to false to disable this callback.
    """

    name: Optional[str] = None
    """
    The name to give the Comet.ml experiment.
    """

    project: Optional[str] = None
    """
    The Comet.ml project to use.
    """

    workspace: Optional[str] = None
    """
    The name of the Comet.ml workspace to use.
    """

    tags: Optional[List[str]] = None
    """
    Tags to assign the experiment.
    """

    cancel_tags: Optional[List[str]] = field(
        default_factory=lambda: ["cancel", "canceled", "cancelled"]
    )
    """
    If you add any of these tags to an experiment on Comet.ml, the run will cancel itself.
    Matching is case-insensitive.
    Defaults to ``["cancel", "canceled", "cancelled"]``.
    """

    cancel_check_interval: Optional[int] = None
    """
    Check for cancel tags every this many steps. Defaults to
    :data:`olmo_core.train.Trainer.cancel_check_interval`.
    """

    failure_tag: str = "failed"
    """
    The tag to assign to failed experiments.
    """

    # Left unannotated, so ``dataclass`` does not turn it into an ``__init__`` field.
    # It stays ``None`` until ``pre_train()`` creates the experiment on rank 0.
    _exp = None
    _finalized: bool = False

    @property
    def exp(self) -> "Experiment":
        """The Comet.ml experiment; only set once ``pre_train()`` has created it."""
        return self._exp  # type: ignore

    @exp.setter
    def exp(self, exp: "Experiment"):
        self._exp = exp

    @property
    def finalized(self) -> bool:
        """Whether the experiment has already been ended by :meth:`finalize`."""
        return self._finalized

    def finalize(self):
        """
        End the Comet.ml experiment. Calling this more than once is a no-op, and so is
        calling it before an experiment has been created.
        """
        if self.finalized or self._exp is None:
            return
        self._exp.end()
        self._finalized = True

    def pre_train(self):
        """
        Create the Comet.ml experiment on rank 0 and apply the configured name and tags.

        :raises OLMoEnvironmentError: If the ``COMET_API_KEY`` environment variable is not set.
        """
        if not self.enabled or get_rank() != 0:
            return

        # Imported lazily so that ``comet_ml`` is only required when the callback is used.
        import comet_ml as comet

        api_key = os.environ.get(COMET_API_KEY_ENV_VAR)
        if api_key is None:
            raise OLMoEnvironmentError(f"missing env var '{COMET_API_KEY_ENV_VAR}'")

        self.exp = comet.Experiment(
            api_key=api_key,
            project_name=self.project,
            workspace=self.workspace,
        )

        if self.name is not None:
            self.exp.set_name(self.name)

        if self.tags:
            self.exp.add_tags(self.tags)

    def log_metrics(self, step: int, metrics: Dict[str, float]):
        """Forward the metrics for ``step`` to the Comet.ml experiment (rank 0 only)."""
        if self.enabled and get_rank() == 0:
            self.exp.log_metrics(metrics, step=step)

    def post_step(self):
        """
        Every ``cancel_check_interval`` steps, schedule a cancel-tag check on the trainer's
        thread pool from rank 0.
        """
        if not self.enabled or get_rank() != 0:
            return

        # Fall back to the trainer-wide interval when no override is configured.
        interval = self.cancel_check_interval or self.trainer.cancel_check_interval
        if self.step % interval == 0:
            self.trainer.thread_pool.submit(self.check_if_canceled)

    def post_train(self):
        """Finalize the experiment after training has completed (rank 0 only)."""
        if self.enabled and get_rank() == 0:
            log.info("Finalizing successful Comet.ml experiment...")
            self.finalize()

    def on_error(self, exc: BaseException):
        """
        Tag the experiment with ``failure_tag`` and finalize it (rank 0 only).

        Does nothing when no experiment exists yet, e.g. when the error being handled was
        raised by ``pre_train()`` itself. Touching the missing experiment there would raise
        a second exception and hide the original one.
        """
        del exc
        if not self.enabled or get_rank() != 0:
            return
        if self._exp is None:
            return

        log.warning("Finalizing failed Comet.ml experiment...")
        self._exp.add_tag(self.failure_tag)
        self.finalize()

    def check_if_canceled(self):
        """
        Cancel the run if the experiment carries any of the ``cancel_tags``.

        Failures while fetching the tags are logged and otherwise ignored, so a transient
        Comet.ml problem does not interrupt training.
        """
        if not self.enabled or self.finalized or not self.cancel_tags:
            return
        if self._exp is None:
            # Nothing to poll until ``pre_train()`` has created the experiment.
            return

        # NOTE(review): confirm that ``Experiment.get_tags()`` reflects tags added through
        # the Comet.ml web UI and not only the tags set locally by this process.
        try:
            tags = self._exp.get_tags()
        except Exception as exc:
            log.warning(f"Failed to pull tags for Comet.ml experiment:\n{exc}")
            return

        # Lower-case both sides so that the comparison really is case-insensitive.
        wanted = {tag.lower() for tag in self.cancel_tags}
        if any(tag.lower() in wanted for tag in tags or ()):
            self.trainer.cancel_run("canceled from Comet.ml tag")