Skip to content

Commit

Permalink
env for setting node register timeout
Browse files Browse the repository at this point in the history
Signed-off-by: JasonChen <892670992@qq.com>
  • Loading branch information
NKcqx committed Dec 12, 2023
1 parent 3ee23aa commit f18ada4
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 1 deletion.
1 change: 1 addition & 0 deletions doc/occlum/Occlum.custom.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"PATH=/bin",
"RAY_BACKEND_LOG_LEVEL=debug",
"RAY_agent_register_timeout_ms=300000",
"RAY_node_register_timeout_seconds=120",
"RAY_worker_register_timeout_seconds=300",
"RAY_raylet_client_connect_timeout_milliseconds=2500",
"RAY_raylet_client_num_connect_attempts=100",
Expand Down
7 changes: 6 additions & 1 deletion python/ray/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,17 +305,22 @@ def __init__(

if not connect_only:
self.start_ray_processes()
# Timeout waiting for node to be registered in GCS
node_register_timeout = os.environ.get(
"RAY_node_register_timeout_seconds", 30)
# we should update the address info after the node has been started
try:
ray._private.services.wait_for_node(
self.redis_address,
self.gcs_address,
self._plasma_store_socket_name,
self.redis_password,
node_register_timeout
)
except TimeoutError:
raise Exception(
"The current node has not been updated within 30 "
"The current node has not been updated within"
f"{node_register_timeout} "
"seconds, this could happen because of some of "
"the Ray processes failed to startup."
)
Expand Down

0 comments on commit f18ada4

Please sign in to comment.