Skip to content

Commit

Permalink
Add step to check disk-quota in mila code
Browse files Browse the repository at this point in the history
Signed-off-by: Fabrice Normandin <normandf@mila.quebec>
  • Loading branch information
lebrice committed Aug 31, 2023
1 parent 330d3a6 commit 513e341
Showing 1 changed file with 120 additions and 17 deletions.
137 changes: 120 additions & 17 deletions milatools/cli/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import traceback
import webbrowser
from contextlib import ExitStack
from logging import getLogger as get_logger
from pathlib import Path
from urllib.parse import urlencode

Expand All @@ -32,6 +33,8 @@
yn,
)

logger = get_logger(__name__)


def main():
"""Entry point for milatools."""
Expand All @@ -57,9 +60,7 @@ def main():
"title": f"[v{mversion}] Issue running the command `mila "
f"{sys.argv[1]}`",
}
github_issue_url = (
f"https://github.com/mila-iqia/milatools/issues/new?{urlencode(options)}"
)
github_issue_url = f"https://github.com/mila-iqia/milatools/issues/new?{urlencode(options)}"
print(
T.bold_yellow(
f"An error occured during the execution of the command "
Expand Down Expand Up @@ -212,7 +213,9 @@ def init():

print(T.bold_cyan("=" * 60))
print(
T.bold_cyan("Congrats! You are now ready to start working on the cluster!")
T.bold_cyan(
"Congrats! You are now ready to start working on the cluster!"
)
)
print(T.bold_cyan("=" * 60))
print(T.bold("To connect to a login node:"))
Expand Down Expand Up @@ -290,18 +293,15 @@ def code():
remote = Remote("mila")
here = Local()

check_disk_quota(remote)

cnode = _find_allocation(remote, job_name="mila-code")
if persist:
cnode = cnode.persist()
data, proc = cnode.ensure_allocation()

node_name = data["node_name"]

if not path.startswith("/"):
# Get $HOME because we have to give the full path to code
home = remote.home()
path = "/".join([home, path])

try:
while True:
here.run(
Expand Down Expand Up @@ -368,7 +368,9 @@ def kill():
remote = Remote("mila")

if all:
for identifier in remote.get_lines("ls .milatools/control", hide=True):
for identifier in remote.get_lines(
"ls .milatools/control", hide=True
):
info = _get_server_info(remote, identifier, hide=True)
if "jobid" in info:
remote.run(f"scancel {info['jobid']}")
Expand All @@ -395,7 +397,9 @@ def list():

remote.run("mkdir -p ~/.milatools/control", hide=True)

for identifier in remote.get_lines("ls .milatools/control", hide=True):
for identifier in remote.get_lines(
"ls .milatools/control", hide=True
):
info = _get_server_info(remote, identifier, hide=True)
jobid = info.get("jobid", None)
status = remote.get_output(
Expand All @@ -406,13 +410,18 @@ def list():
necessary_keys = {"node_name", "to_forward"}
if any(k not in info for k in necessary_keys):
qn.print(
f"{identifier} ({program}, MISSING INFO)", style="bold red"
f"{identifier} ({program}, MISSING INFO)",
style="bold red",
)
to_purge.append((identifier, jobid))
else:
qn.print(f"{identifier} ({program})", style="bold yellow")
qn.print(
f"{identifier} ({program})", style="bold yellow"
)
else:
qn.print(f"{identifier} ({program}, DEAD)", style="bold red")
qn.print(
f"{identifier} ({program}, DEAD)", style="bold red"
)
to_purge.append((identifier, None))
for k, v in info.items():
print(f" {k:20} : {v}")
Expand All @@ -431,7 +440,9 @@ def lab():
path: Option = default(None)

if path and path.endswith(".ipynb"):
exit("Only directories can be given to the mila serve lab command")
exit(
"Only directories can be given to the mila serve lab command"
)

_standard_server(
path,
Expand All @@ -453,7 +464,9 @@ def notebook():
path: Option = default(None)

if path and path.endswith(".ipynb"):
exit("Only directories can be given to the mila serve notebook command")
exit(
"Only directories can be given to the mila serve notebook command"
)

_standard_server(
path,
Expand Down Expand Up @@ -648,7 +661,9 @@ def _standard_server(

if cf is not None:
remote.simple_run(f"echo program = {program} >> {cf}")
remote.simple_run(f"echo node_name = {results['node_name']} >> {cf}")
remote.simple_run(
f"echo node_name = {results['node_name']} >> {cf}"
)
remote.simple_run(f"echo host = {host} >> {cf}")
remote.simple_run(f"echo to_forward = {to_forward} >> {cf}")
if token_pattern:
Expand Down Expand Up @@ -684,6 +699,94 @@ def _standard_server(
proc.kill()


# Conversion factors from a size unit suffix to GiB.
# NOTE(review): only "GiB" appears in the sample output below; the other units
# are assumed to be what `disk-quota` prints for smaller/larger usages - confirm.
# Longer suffixes are listed first so that "GiB" is matched before the bare "B".
_GIB_PER_UNIT = {
    "PiB": 1024.0**2,
    "TiB": 1024.0,
    "GiB": 1.0,
    "MiB": 1.0 / 1024,
    "KiB": 1.0 / 1024**2,
    "Bytes": 1.0 / 1024**3,
    "Byte": 1.0 / 1024**3,
    "B": 1.0 / 1024**3,
}


def _parse_size_in_gib(size: str) -> float:
    """Converts a size such as "97.20 GiB" or "512.00 MiB" into a number of GiB.

    A bare number (without any unit suffix) is interpreted as GiB.
    Raises a `ValueError` if the numeric part cannot be parsed.
    """
    size = size.strip()
    for unit, gib_per_unit in _GIB_PER_UNIT.items():
        if size.endswith(unit):
            # Slicing (rather than `str.removesuffix`) also works on Python < 3.9.
            return float(size[: -len(unit)].strip()) * gib_per_unit
    return float(size)


def _get_disk_quota_usage(
    remote: Remote, print_command_output: bool = True
) -> tuple[tuple[float, float], tuple[int, int]]:
    """Queries the disk quota usage by running `disk-quota` on the remote.

    Returns `((used_gb, max_gb), (used_files, max_files))`: the used and maximum
    storage space (in GiB), followed by the used and maximum number of files.
    The values are read from the last non-blank line of the command output.

    `print_command_output` controls whether the output of the command is shown
    (it is passed, negated, as the `hide` argument of `remote.get_output`).

    Raises a `ValueError` if the output is empty or cannot be parsed.

    Here is what the output of `disk-quota` looks like on the Mila cluster:
    ```console
    Quota information for storage pool Default (ID: 1):
          user/group     ||           size          ||    chunk files
         name     |  id  ||    used    |    hard    ||  used   |  hard
    --------------|------||------------|------------||---------|---------
          normandf|1471600598||   97.20 GiB|  100.00 GiB||   806898|  1000000
    ```
    """
    disk_quota_output = remote.get_output(
        "disk-quota", hide=not print_command_output
    )
    # Skip blank lines so that trailing whitespace doesn't hide the table row.
    lines = [line for line in disk_quota_output.splitlines() if line.strip()]
    if not lines:
        raise ValueError("The `disk-quota` command did not produce any output.")

    last_line = lines[-1]
    # The `||` separators produce an empty column on each side of the sizes.
    columns = last_line.split("|")
    if len(columns) != 8:
        raise ValueError(
            f"Unable to parse the output of `disk-quota`: {last_line!r}"
        )
    _username, _id, _, used_gb, max_gb, _, used_files, max_files = columns
    return (
        (_parse_size_in_gib(used_gb), _parse_size_in_gib(max_gb)),
        # `int` ignores the whitespace that pads each column.
        (int(used_files), int(max_files)),
    )


def check_disk_quota(remote: Remote) -> None:
    """Checks the disk quota usage on the remote and reports when it is (nearly) full.

    The usage is obtained from `_get_disk_quota_usage(remote)`.

    - Raises a `MilatoolsUserError` when the used storage space or the number of
      files has reached (or gone past) its maximum.
    - Otherwise, prints a warning when the larger of the two usage ratios is
      above 90%.

    In both cases the message quotes whichever of the two usages (size or number
    of files) has the larger ratio.
    """
    # todo: if we run this on CC, then we should use `diskusage_report`
    cluster = "mila"
    # todo: Check the disk-quota of other filesystems if needed.
    filesystem = "$HOME"

    logger.debug("Checking disk quota on $HOME...")
    size_usage, files_usage = _get_disk_quota_usage(remote)
    used_gb, max_gb = size_usage
    used_files, max_files = files_usage
    logger.debug(
        f"Disk usage: {used_gb} / {max_gb} GiB and {used_files} / {max_files} files"
    )

    size_ratio = used_gb / max_gb
    files_ratio = used_files / max_files
    # Quote the resource that is closest to (or furthest past) its limit.
    if size_ratio > files_ratio:
        reason = f"{used_gb} / {max_gb} GiB"
    else:
        reason = f"{used_files} / {max_files} files"

    # Advice shared by the error and the warning messages.
    freeing_up_space_instructions = (
        "For example, temporary files (logs, checkpoints, etc.) can be moved to $SCRATCH, "
        "while files that need to be stored for longer periods can be moved to $ARCHIVE "
        "or to a shared project folder under /network/projects.\n"
        "Visit https://docs.mila.quebec/Information.html#storage to learn more about how to "
        "best make use of the different filesystems available on the cluster."
    )

    quota_exceeded = used_gb >= max_gb or used_files >= max_files
    if quota_exceeded:
        error_message = (
            f"ERROR: Your disk quota on the {filesystem} filesystem is exceeded! ({reason}).\n"
            f"To fix this, login to the cluster with `ssh {cluster}` and free up some space, "
            f"either by deleting files, or by moving them to a suitable filesystem.\n"
            + freeing_up_space_instructions
        )
        raise MilatoolsUserError(T.red(error_message))

    highest_ratio = max(size_ratio, files_ratio)
    if highest_ratio > 0.9:
        warning_message = (
            f"WARNING: You are getting pretty close to your disk quota on the $HOME "
            f"filesystem: ({reason})\n"
            "Please consider freeing up some space in your $HOME folder, either by "
            "deleting files, or by moving them to a more suitable filesystem.\n"
            + freeing_up_space_instructions
        )
        # TODO: Perhaps we could use the logger or the warnings package instead
        # of just printing?
        warning = UserWarning(T.yellow(warning_message))
        print(warning)


@tooled
def _find_allocation(remote, job_name="mila-tools"):
# Node to connect to
Expand Down

0 comments on commit 513e341

Please sign in to comment.