Skip to content

Commit

Permalink
Merge pull request #50 from dfinity/deadlinenotification
Browse files Browse the repository at this point in the history
Notify every hour if a rollout task is stalled while waiting for alerts to resolve
  • Loading branch information
DFINITYManu authored Oct 1, 2024
2 parents 2bdd9c0 + 6dfa474 commit 40aa9d9
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 34 deletions.
15 changes: 6 additions & 9 deletions plugins/operators/ic_os_rollout.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,8 +209,7 @@ def __init__(
self.source_task_id = source_task_id
dr_dre_slack_id = DR_DRE_SLACK_ID
text = (
(
"""Proposal <{{
"""Proposal <{{
task_instance.xcom_pull(
task_ids='%(source_task_id)s',
map_indexes=task_instance.map_index,
Expand All @@ -221,10 +220,8 @@ def __init__(
map_indexes=task_instance.map_index,
).proposal_id
}}> is now up for voting. <!subteam^%(dr_dre_slack_id)s>"""
""" please vote for the proposal using your HSM."""
)
% locals()
)
""" please vote for the proposal using your HSM."""
) % locals()
slack.SlackAPIPostOperator.__init__(
self,
channel=SLACK_CHANNEL,
Expand Down Expand Up @@ -257,10 +254,10 @@ def __init__(
) -> None:
dr_dre_slack_id = DR_DRE_SLACK_ID
text = (
"""Subnet `%(subnet_id)s` has not finished upgrading in over an hour."""
""" <!subteam^%(dr_dre_slack_id)s>"""
f"""Subnet `{subnet_id}` has not finished upgrading in over an hour."""
f""" <!subteam^{dr_dre_slack_id}>"""
""" please investigate *as soon as possible*."""
) % locals()
)
slack.SlackAPIPostOperator.__init__(
self,
channel=SLACK_CHANNEL,
Expand Down
44 changes: 19 additions & 25 deletions plugins/sensors/ic_os_rollout.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,43 +345,37 @@ def send_notification_if_necessary(subnet_id: str) -> None:
# a message to Slack notifying the DRE operator that a subnet
# has not exited the alerts condition in over an hour.
now = time.time()
first_alert_check_timestamp = context["task_instance"].xcom_pull(
key="first_alert_check_timestamp",
key = "alert_check_timestamp"
alert_check_timestamp = context["task_instance"].xcom_pull(
key=key,
map_indexes=context["task_instance"].map_index,
)
if not first_alert_check_timestamp:
if not alert_check_timestamp:
# Value is not yet xcommed. Xcom it now.
deadline = now + SUBNET_UPDATE_STALL_TIMEOUT_SECONDS
self.log.info(
"Notification routine not yet run; storing timestamp %s",
now,
)
# Value is not yet xcommed.
context["task_instance"].xcom_push(
key="first_alert_check_timestamp",
value=now,
"Notification deadline not initialized, storing %s", deadline
)
context["task_instance"].xcom_push(key=key, value=deadline)
else:
self.log.info(
"Notification routine already ran at %r",
first_alert_check_timestamp,
)
first_alert_check_timestamp = float(first_alert_check_timestamp)
if (
first_alert_check_timestamp
> now + SUBNET_UPDATE_STALL_TIMEOUT_SECONDS
):
deadline = float(alert_check_timestamp)
if now > deadline:
# Value is xcommed and is old enough.
deadline = now + SUBNET_UPDATE_STALL_TIMEOUT_SECONDS
self.log.info(
"Routine ran over %s seconds ago, notifying",
now - first_alert_check_timestamp,
"Notification deadline has been hit, notifying"
" and resetting deadline to %s",
deadline,
)
# Value is xcommed and is old enough.
# Send message here.
NotifyAboutStalledSubnet(
task_id="notify_about_stalled_subnet",
subnet_id=subnet_id,
).execute(context=context)
# send message here, then
# Remember new deadline.
context["task_instance"].xcom_push(
key="first_alert_check_timestamp",
value=now + 3600,
key=key,
value=deadline,
)

subnet_id, git_revision = subnet_id_and_git_revision_from_args(
Expand Down

0 comments on commit 40aa9d9

Please sign in to comment.