From 1c0b49b6e8325fc4907550cb9bb3c51aca8ea969 Mon Sep 17 00:00:00 2001 From: michelle-yooh Date: Wed, 13 Dec 2023 02:27:50 +0000 Subject: [PATCH] Paramaterize GPU vs TPU --- xpk.py | 87 ++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 61 insertions(+), 26 deletions(-) diff --git a/xpk.py b/xpk.py index 693f7d29..f5b908dc 100644 --- a/xpk.py +++ b/xpk.py @@ -91,9 +91,7 @@ schedulerName: {args.scheduler} restartPolicy: Never nodeSelector: - cloud.google.com/gke-accelerator: {system.gke_accelerator} - cloud.google.com/gke-accelerator: {system.gce_machine_type} - # cloud.google.com/gke-tpu-topology: {system.topology} + {node_selector} priorityClassName: {args.priority} hostNetwork: true dnsPolicy: ClusterFirstWithHostNet @@ -113,7 +111,7 @@ echo XPK Start: $(date) ; {command} ; EXIT_CODE=$? ; echo XPK End: $(date); echo EXIT_CODE=$EXIT_CODE ; sleep 5; exit $EXIT_CODE resources: limits: - google.com/tpu: {system.chips_per_vm} + {resource_type}: {system.chips_per_vm} """ workload_delete_yaml = """apiVersion: jobset.x-k8s.io/v1alpha2 @@ -154,11 +152,11 @@ withinClusterQueue: LowerPriority namespaceSelector: {{}} # match all. resourceGroups: - - coveredResources: ["google.com/tpu"] + - coveredResources: ["{resource_type}"] flavors: - name: {cluster_hardware_name} resources: - - name: "google.com/tpu" + - name: "{resource_type}" nominalQuota: {total_chips} # Number of slices * number of chips in each slice --- apiVersion: kueue.x-k8s.io/v1beta1 @@ -257,11 +255,14 @@ class SystemCharacteristics: IF YOU MODIFY THE BELOW UserFacingNameToSystemCharacteristics MAP YOU SHOULD ALSO ADD CORRESPONDING MODIFICATIONS TO UserFacingNameToSystemCharacteristics IN MaxText/accelerator_to_spec_map.py !!!!! """ # vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv -UserFacingNameToSystemCharacteristics = { +GpuUserFacingNameToSystemCharacteristics = { # H100-80gb 'h100-80gb-8': SystemCharacteristics( 'N/A', 1, 'nvidia-h100-80gb', 'a3-highgpu-8g', 8 ), +} + +TpuUserFacingNameToSystemCharacteristics = { # v5p 'v5p-8': SystemCharacteristics( '2x2x1', 1, 'tpu-v5p-slice', 'ct5p-hightpu-4t', 4 @@ -1188,7 +1189,7 @@ def run_gke_node_pool_create_command(args, system_characteristics) -> int: """ xpk_print( - f'Creating {args.num_slices} node pool or pools of {args.tpu_type}\n' + f'Creating {args.num_slices} node pool or pools of {args.device_type}\n' f'Underlyingly, we assume that means: {system_characteristics}' ) @@ -1379,12 +1380,13 @@ def enable_kueue_crds(args, system) -> int: Returns: 0 if successful and 1 otherwise. """ - cluster_hardware_name = f'{args.num_slices}x{args.tpu_type}' + cluster_hardware_name = f'{args.num_slices}x{args.device_type}' total_chips = args.num_slices * system.vms_per_slice * system.chips_per_vm yml_string = cluster_set_crd_yaml.format( system=system, cluster_hardware_name=cluster_hardware_name, total_chips=total_chips, + resource_type=get_resource_type(args) ) tmp = write_temporary_file(yml_string) command = f'kubectl apply -f {str(tmp.file.name)}' @@ -1464,7 +1466,7 @@ def cluster_create(args) -> int: Returns: 0 if successful and 1 otherwise. """ - system_characteristics = UserFacingNameToSystemCharacteristics[args.tpu_type] + system_characteristics = get_system_characteristics(args.device_type) xpk_print(f'Starting cluster create for cluster {args.cluster}:', flush=True) add_zone_and_project(args) @@ -1845,6 +1847,37 @@ def setup_docker_image(args) -> tuple[int, str]: return 0, docker_image +def create_node_selector(args, system) -> str: + if args.device_type in TpuUserFacingNameToSystemCharacteristics: + return """cloud.google.com/gke-tpu-accelerator: {gke_accelerator} + cloud.google.com/gke-tpu-topology: {topology} + """.format(gke_accelerator=system.gke_accelerator, topology=system.topology) + elif args.device_type in GpuUserFacingNameToSystemCharacteristics: + return """cloud.google.com/gke-accelerator: {gke_accelerator} + cloud.google.com/gce-machine-type: {gce_machine_type} + """.format(gke_accelerator=system.gke_accelerator, + gce_machine_type=system.gce_machine_type) + else: + raise ValueError("Unknown device type") + + +def get_resource_type(args) -> str: + if args.device_type in TpuUserFacingNameToSystemCharacteristics: + return "google.com/tpu" + elif args.device_type in GpuUserFacingNameToSystemCharacteristics: + return "nvidia.com/gpu" + else: + raise ValueError("Unknown device type") + + +def get_system_characteristics(name): + if name in TpuUserFacingNameToSystemCharacteristics: + return TpuUserFacingNameToSystemCharacteristics[name] + elif name in GpuUserFacingNameToSystemCharacteristics: + return GpuUserFacingNameToSystemCharacteristics[name] + else: + raise ValueError("Unknown device type") + def workload_create(args) -> int: """Run jobset apply command for a file. @@ -1870,7 +1903,7 @@ def workload_create(args) -> int: xpk_exit(1) xpk_print('Starting workload create', flush=True) - system = UserFacingNameToSystemCharacteristics[args.tpu_type] + system = get_system_characteristics(args.device_type) setup_docker_image_code, docker_image = setup_docker_image(args) if setup_docker_image_code != 0: @@ -1885,7 +1918,9 @@ def workload_create(args) -> int: yml_string = workload_create_yaml.format(args=args, system=system, docker_image=docker_image, - command=command) + command=command, + node_selector=create_node_selector(args, system), + resource_type=get_resource_type(args)) tmp = write_temporary_file(yml_string) command = f'kubectl apply -f {str(tmp.file.name)}' @@ -2148,13 +2183,13 @@ def directory_path_type(value): ), required=True, ) -# cluster_create_required_arguments.add_argument( -# '--tpu-type', -# type=str, -# default='v5litepod-16', -# help='The type of the TPU. v5litepod and v4 are the only supported types.', -# required=True, -# ) +cluster_create_required_arguments.add_argument( + '--device-type', + type=str, + default='v5litepod-16', + help='The type of the TPU. v5litepod and v4 are the only supported types.', + required=True, +) # Capacity Arguments cluster_create_capacity_arguments.add_argument( @@ -2428,13 +2463,13 @@ def directory_path_type(value): ), required=True, ) -# workload_create_parser_required_arguments.add_argument( -# '--tpu-type', -# type=str, -# default=None, -# help='The tpu type to use, v5litepod-16, etc.', -# required=True, -# ) +workload_create_parser_required_arguments.add_argument( + '--device-type', + type=str, + default=None, + help='The tpu type to use, v5litepod-16, etc.', + required=True, +) workload_create_parser_required_arguments.add_argument( '--cluster', type=str,