From 513e341d41abf6e5e040c6e0e043f4fbd0fc5cd3 Mon Sep 17 00:00:00 2001 From: Fabrice Normandin Date: Thu, 31 Aug 2023 12:00:23 -0400 Subject: [PATCH] Add step to check disk-quota in mila code Signed-off-by: Fabrice Normandin --- milatools/cli/commands.py | 137 +++++++++++++++++++++++++++++++++----- 1 file changed, 120 insertions(+), 17 deletions(-) diff --git a/milatools/cli/commands.py b/milatools/cli/commands.py index fb59e22b..0f42bc58 100644 --- a/milatools/cli/commands.py +++ b/milatools/cli/commands.py @@ -8,6 +8,7 @@ import traceback import webbrowser from contextlib import ExitStack +from logging import getLogger as get_logger from pathlib import Path from urllib.parse import urlencode @@ -32,6 +33,8 @@ yn, ) +logger = get_logger(__name__) + def main(): """Entry point for milatools.""" @@ -57,9 +60,7 @@ def main(): "title": f"[v{mversion}] Issue running the command `mila " f"{sys.argv[1]}`", } - github_issue_url = ( - f"https://github.com/mila-iqia/milatools/issues/new?{urlencode(options)}" - ) + github_issue_url = f"https://github.com/mila-iqia/milatools/issues/new?{urlencode(options)}" print( T.bold_yellow( f"An error occured during the execution of the command " @@ -212,7 +213,9 @@ def init(): print(T.bold_cyan("=" * 60)) print( - T.bold_cyan("Congrats! You are now ready to start working on the cluster!") + T.bold_cyan( + "Congrats! You are now ready to start working on the cluster!" + ) ) print(T.bold_cyan("=" * 60)) print(T.bold("To connect to a login node:")) @@ -290,6 +293,8 @@ def code(): remote = Remote("mila") here = Local() + check_disk_quota(remote) + cnode = _find_allocation(remote, job_name="mila-code") if persist: cnode = cnode.persist() @@ -297,11 +302,6 @@ def code(): node_name = data["node_name"] - if not path.startswith("/"): - # Get $HOME because we have to give the full path to code - home = remote.home() - path = "/".join([home, path]) - try: while True: here.run( @@ -368,7 +368,9 @@ def kill(): remote = Remote("mila") if all: - for identifier in remote.get_lines("ls .milatools/control", hide=True): + for identifier in remote.get_lines( + "ls .milatools/control", hide=True + ): info = _get_server_info(remote, identifier, hide=True) if "jobid" in info: remote.run(f"scancel {info['jobid']}") @@ -395,7 +397,9 @@ def list(): remote.run("mkdir -p ~/.milatools/control", hide=True) - for identifier in remote.get_lines("ls .milatools/control", hide=True): + for identifier in remote.get_lines( + "ls .milatools/control", hide=True + ): info = _get_server_info(remote, identifier, hide=True) jobid = info.get("jobid", None) status = remote.get_output( @@ -406,13 +410,18 @@ def list(): necessary_keys = {"node_name", "to_forward"} if any(k not in info for k in necessary_keys): qn.print( - f"{identifier} ({program}, MISSING INFO)", style="bold red" + f"{identifier} ({program}, MISSING INFO)", + style="bold red", ) to_purge.append((identifier, jobid)) else: - qn.print(f"{identifier} ({program})", style="bold yellow") + qn.print( + f"{identifier} ({program})", style="bold yellow" + ) else: - qn.print(f"{identifier} ({program}, DEAD)", style="bold red") + qn.print( + f"{identifier} ({program}, DEAD)", style="bold red" + ) to_purge.append((identifier, None)) for k, v in info.items(): print(f" {k:20} : {v}") @@ -431,7 +440,9 @@ def lab(): path: Option = default(None) if path and path.endswith(".ipynb"): - exit("Only directories can be given to the mila serve lab command") + exit( + "Only directories can be given to the mila serve lab command" + ) _standard_server( path, @@ -453,7 +464,9 @@ def notebook(): path: Option = default(None) if path and path.endswith(".ipynb"): - exit("Only directories can be given to the mila serve notebook command") + exit( + "Only directories can be given to the mila serve notebook command" + ) _standard_server( path, @@ -648,7 +661,9 @@ def _standard_server( if cf is not None: remote.simple_run(f"echo program = {program} >> {cf}") - remote.simple_run(f"echo node_name = {results['node_name']} >> {cf}") + remote.simple_run( + f"echo node_name = {results['node_name']} >> {cf}" + ) remote.simple_run(f"echo host = {host} >> {cf}") remote.simple_run(f"echo to_forward = {to_forward} >> {cf}") if token_pattern: @@ -684,6 +699,94 @@ def _standard_server( proc.kill() +def _get_disk_quota_usage( + remote: Remote, print_command_output: bool = True +) -> tuple[tuple[float, float], tuple[int, int]]: + """Checks the disk quota on the $HOME filesystem on the mila cluster. + + Returns whether the quota is exceeded, in terms of storage space or number of files. + + Here is what the output of `disk-quota` looks like on the Mila cluster: + ```console + + Quota information for storage pool Default (ID: 1): + + user/group || size || chunk files + name | id || used | hard || used | hard + --------------|------||------------|------------||---------|--------- + normandf|1471600598|| 97.20 GiB| 100.00 GiB|| 806898| 1000000 + ``` + """ + disk_quota_output = remote.get_output( + "disk-quota", hide=not print_command_output + ) + last_line_parts = disk_quota_output.splitlines()[-1] + ( + _username, + _id, + _, + used_gb, + max_gb, + _, + used_files, + max_files, + ) = last_line_parts.split("|") + used_gb = float(used_gb.removesuffix("GiB").strip()) + max_gb = float(max_gb.removesuffix("GiB").strip()) + used_files = int(used_files.strip()) + max_files = int(max_files.strip()) + return (used_gb, max_gb), (used_files, max_files) + + +def check_disk_quota(remote: Remote) -> None: + cluster = "mila" # todo: if we run this on CC, then we should use `diskusage_report` + # todo: Check the disk-quota of other filesystems if needed. + filesystem = "$HOME" + logger.debug("Checking disk quota on $HOME...") + (used_gb, max_gb), (used_files, max_files) = _get_disk_quota_usage(remote) + logger.debug( + f"Disk usage: {used_gb} / {max_gb} GiB and {used_files} / {max_files} files" + ) + size_ratio = used_gb / max_gb + files_ratio = used_files / max_files + reason = ( + f"{used_gb} / {max_gb} GiB" + if size_ratio > files_ratio + else f"{used_files} / {max_files} files" + ) + + freeing_up_space_instructions = ( + "For example, temporary files (logs, checkpoints, etc.) can be moved to $SCRATCH, " + "while files that need to be stored for longer periods can be moved to $ARCHIVE " + "or to a shared project folder under /network/projects.\n" + "Visit https://docs.mila.quebec/Information.html#storage to learn more about how to " + "best make use of the different filesystems available on the cluster." + "" + ) + + if used_gb >= max_gb or used_files >= max_files: + raise MilatoolsUserError( + T.red( + f"ERROR: Your disk quota on the {filesystem} filesystem is exceeded! ({reason}).\n" + f"To fix this, login to the cluster with `ssh {cluster}` and free up some space, " + f"either by deleting files, or by moving them to a suitable filesystem.\n" + + freeing_up_space_instructions + ) + ) + if max(size_ratio, files_ratio) > 0.9: + warning_message = ( + f"WARNING: You are getting pretty close to your disk quota on the $HOME " + f"filesystem: ({reason})\n" + "Please consider freeing up some space in your $HOME folder, either by " + "deleting files, or by moving them to a more suitable filesystem.\n" + + freeing_up_space_instructions + ) + # TODO: Perhaps we could use the logger or the warnings package instead of just printing? + # logger.warning(UserWarning(warning_message)) + # warnings.warn(UserWarning(T.yellow(warning_message))) + print(UserWarning(T.yellow(warning_message))) + + @tooled def _find_allocation(remote, job_name="mila-tools"): # Node to connect to