Skip to content

Commit

Permalink
Paramaterize GPU vs TPU
Browse files Browse the repository at this point in the history
  • Loading branch information
michelle-yooh committed Dec 13, 2023
1 parent 348e43b commit 1c0b49b
Showing 1 changed file with 61 additions and 26 deletions.
87 changes: 61 additions & 26 deletions xpk.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,7 @@
schedulerName: {args.scheduler}
restartPolicy: Never
nodeSelector:
cloud.google.com/gke-accelerator: {system.gke_accelerator}
cloud.google.com/gke-accelerator: {system.gce_machine_type}
# cloud.google.com/gke-tpu-topology: {system.topology}
{node_selector}
priorityClassName: {args.priority}
hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet
Expand All @@ -113,7 +111,7 @@
echo XPK Start: $(date) ; {command} ; EXIT_CODE=$? ; echo XPK End: $(date); echo EXIT_CODE=$EXIT_CODE ; sleep 5; exit $EXIT_CODE
resources:
limits:
google.com/tpu: {system.chips_per_vm}
{resource_type}: {system.chips_per_vm}
"""

workload_delete_yaml = """apiVersion: jobset.x-k8s.io/v1alpha2
Expand Down Expand Up @@ -154,11 +152,11 @@
withinClusterQueue: LowerPriority
namespaceSelector: {{}} # match all.
resourceGroups:
- coveredResources: ["google.com/tpu"]
- coveredResources: ["{resource_type}"]
flavors:
- name: {cluster_hardware_name}
resources:
- name: "google.com/tpu"
- name: "{resource_type}"
nominalQuota: {total_chips} # Number of slices * number of chips in each slice
---
apiVersion: kueue.x-k8s.io/v1beta1
Expand Down Expand Up @@ -257,11 +255,14 @@ class SystemCharacteristics:
IF YOU MODIFY THE BELOW UserFacingNameToSystemCharacteristics MAP YOU SHOULD ALSO ADD CORRESPONDING
MODIFICATIONS TO UserFacingNameToSystemCharacteristics IN MaxText/accelerator_to_spec_map.py !!!!! """
# vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
UserFacingNameToSystemCharacteristics = {
GpuUserFacingNameToSystemCharacteristics = {
# H100-80gb
'h100-80gb-8': SystemCharacteristics(
'N/A', 1, 'nvidia-h100-80gb', 'a3-highgpu-8g', 8
),
}

TpuUserFacingNameToSystemCharacteristics = {
# v5p
'v5p-8': SystemCharacteristics(
'2x2x1', 1, 'tpu-v5p-slice', 'ct5p-hightpu-4t', 4
Expand Down Expand Up @@ -1188,7 +1189,7 @@ def run_gke_node_pool_create_command(args, system_characteristics) -> int:
"""

xpk_print(
f'Creating {args.num_slices} node pool or pools of {args.tpu_type}\n'
f'Creating {args.num_slices} node pool or pools of {args.device_type}\n'
f'Underlyingly, we assume that means: {system_characteristics}'
)

Expand Down Expand Up @@ -1379,12 +1380,13 @@ def enable_kueue_crds(args, system) -> int:
Returns:
0 if successful and 1 otherwise.
"""
cluster_hardware_name = f'{args.num_slices}x{args.tpu_type}'
cluster_hardware_name = f'{args.num_slices}x{args.device_type}'
total_chips = args.num_slices * system.vms_per_slice * system.chips_per_vm
yml_string = cluster_set_crd_yaml.format(
system=system,
cluster_hardware_name=cluster_hardware_name,
total_chips=total_chips,
resource_type=get_resource_type(args)
)
tmp = write_temporary_file(yml_string)
command = f'kubectl apply -f {str(tmp.file.name)}'
Expand Down Expand Up @@ -1464,7 +1466,7 @@ def cluster_create(args) -> int:
Returns:
0 if successful and 1 otherwise.
"""
system_characteristics = UserFacingNameToSystemCharacteristics[args.tpu_type]
system_characteristics = get_system_characteristics(args.device_type)

xpk_print(f'Starting cluster create for cluster {args.cluster}:', flush=True)
add_zone_and_project(args)
Expand Down Expand Up @@ -1845,6 +1847,37 @@ def setup_docker_image(args) -> tuple[int, str]:
return 0, docker_image


def create_node_selector(args, system) -> str:
if args.device_type in TpuUserFacingNameToSystemCharacteristics:
return """cloud.google.com/gke-tpu-accelerator: {gke_accelerator}
cloud.google.com/gke-tpu-topology: {topology}
""".format(gke_accelerator=system.gke_accelerator, topology=system.topology)
elif args.device_type in GpuUserFacingNameToSystemCharacteristics:
return """cloud.google.com/gke-accelerator: {gke_accelerator}
cloud.google.com/gce-machine-type: {gce_machine_type}
""".format(gke_accelerator=system.gke_accelerator,
gce_machine_type=system.gce_machine_type)
else:
raise ValueError("Unknown device type")


def get_resource_type(args) -> str:
if args.device_type in TpuUserFacingNameToSystemCharacteristics:
return "google.com/tpu"
elif args.device_type in GpuUserFacingNameToSystemCharacteristics:
return "nvidia.com/gpu"
else:
raise ValueError("Unknown device type")


def get_system_characteristics(name):
if name in TpuUserFacingNameToSystemCharacteristics:
return TpuUserFacingNameToSystemCharacteristics[name]
elif name in GpuUserFacingNameToSystemCharacteristics:
return GpuUserFacingNameToSystemCharacteristics[name]
else:
raise ValueError("Unknown device type")

def workload_create(args) -> int:
"""Run jobset apply command for a file.
Expand All @@ -1870,7 +1903,7 @@ def workload_create(args) -> int:
xpk_exit(1)

xpk_print('Starting workload create', flush=True)
system = UserFacingNameToSystemCharacteristics[args.tpu_type]
system = get_system_characteristics(args.device_type)

setup_docker_image_code, docker_image = setup_docker_image(args)
if setup_docker_image_code != 0:
Expand All @@ -1885,7 +1918,9 @@ def workload_create(args) -> int:
yml_string = workload_create_yaml.format(args=args,
system=system,
docker_image=docker_image,
command=command)
command=command,
node_selector=create_node_selector(args, system),
resource_type=get_resource_type(args))
tmp = write_temporary_file(yml_string)
command = f'kubectl apply -f {str(tmp.file.name)}'

Expand Down Expand Up @@ -2148,13 +2183,13 @@ def directory_path_type(value):
),
required=True,
)
# cluster_create_required_arguments.add_argument(
# '--tpu-type',
# type=str,
# default='v5litepod-16',
# help='The type of the TPU. v5litepod and v4 are the only supported types.',
# required=True,
# )
cluster_create_required_arguments.add_argument(
'--device-type',
type=str,
default='v5litepod-16',
help='The type of the TPU. v5litepod and v4 are the only supported types.',
required=True,
)

# Capacity Arguments
cluster_create_capacity_arguments.add_argument(
Expand Down Expand Up @@ -2428,13 +2463,13 @@ def directory_path_type(value):
),
required=True,
)
# workload_create_parser_required_arguments.add_argument(
# '--tpu-type',
# type=str,
# default=None,
# help='The tpu type to use, v5litepod-16, etc.',
# required=True,
# )
workload_create_parser_required_arguments.add_argument(
'--device-type',
type=str,
default=None,
help='The tpu type to use, v5litepod-16, etc.',
required=True,
)
workload_create_parser_required_arguments.add_argument(
'--cluster',
type=str,
Expand Down

0 comments on commit 1c0b49b

Please sign in to comment.