Skip to content

Commit

Permalink
Adding CPU support in XPK
Browse files Browse the repository at this point in the history
  • Loading branch information
RoshaniN committed Jan 19, 2024
1 parent 8897ac8 commit f4d1ace
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 8 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,7 @@ Please select a CPU type that exists in all zones in the region.
# Find CPU Types supported in zones.
gcloud compute machine-types list --zones=$ZONE_LIST
# Adjust default cpu machine type.
python3 xpk.py cluster create --cluster-cpu-machine-type=CPU_TYPE ...
python3 xpk.py cluster create --default-pool-cpu-machine-type=CPU_TYPE ...
```
## Permission Issues: `requires one of ["permission_name"] permission(s)`.
Expand Down
63 changes: 56 additions & 7 deletions xpk.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,15 @@
spec:
schedulerName: {args.scheduler}
restartPolicy: Never
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: cloud.google.com/gke-nodepool
operator: NotIn
values:
- default-pool
nodeSelector:
{accelerator_label}
{machine_label}
Expand Down Expand Up @@ -247,7 +256,8 @@

AcceleratorType = {
'TPU': 1,
'GPU': 2
'GPU': 2,
'CPU': 3
}

@dataclass
Expand All @@ -264,6 +274,10 @@ class AcceleratorCharacteristics:
# GPU
AcceleratorType['GPU']: AcceleratorCharacteristics(
'nvidia.com/gpu', 'cloud.google.com/gke-accelerator', 'cloud.google.com/gce-machine-type'
),
# CPU
AcceleratorType['CPU']: AcceleratorCharacteristics(
'cpu', '', 'cloud.google.com/gke-nodepool'
)
}

Expand Down Expand Up @@ -643,6 +657,25 @@ class SystemCharacteristics:
'v4-4096': SystemCharacteristics(
'8x16x16', 512,'tpu-v4-podslice', 'ct4p-hightpu-4t', 4, AcceleratorType['TPU'], 'v4-4096'
),

# CPU system characteristics
# n2-standard-32-$VMs
'n2-standard-32-1': SystemCharacteristics(
'N/A', 1,'N/A', 'n2-standard-32', 1, AcceleratorType['CPU'], 'n2-standard-32-1'
),
'n2-standard-32-2': SystemCharacteristics(
'N/A', 2,'N/A', 'n2-standard-32', 1, AcceleratorType['CPU'], 'n2-standard-32-2'
),
'n2-standard-32-4': SystemCharacteristics(
'N/A', 4,'N/A', 'n2-standard-32', 1, AcceleratorType['CPU'], 'n2-standard-32-4'
),
'n2-standard-32-8': SystemCharacteristics(
'N/A', 8,'N/A', 'n2-standard-32', 1, AcceleratorType['CPU'], 'n2-standard-32-8'
),

'n2-standard-32-256': SystemCharacteristics(
'N/A', 256,'N/A', 'n2-standard-32', 1, AcceleratorType['CPU'], 'n2-standard-32-256'
),
}
""" If you modify UserFacingNameToSystemCharacteristics you should also modify the corresponding
Map in MaxText/accelerator_to_spec_map.py """
Expand Down Expand Up @@ -1036,6 +1069,10 @@ def run_gke_cluster_create_command(args) -> int:
Returns:
0 if successful and 1 otherwise.
"""
# cluster_cpu_machine_type is being deprecated!
if args.cluster_cpu_machine_type is not None:
xpk_print('Note that cluster-cpu-machine-type is deprecated, please use default-pool-cpu-machine-type instead!')
xpk_exit(0)

# Create the regional cluster with `num-nodes` CPU nodes in the same zone as
# TPUs. This has been tested with clusters of 300 VMs. Larger clusters will
Expand All @@ -1048,7 +1085,7 @@ def run_gke_cluster_create_command(args) -> int:
f' --node-locations={args.zone}'
f' --project={args.project} --region={zone_to_region(args.zone)}'
f' --cluster-version={args.gke_version} --location-policy=BALANCED'
f' --machine-type={args.cluster_cpu_machine_type}'
f' --machine-type={args.default_pool_cpu_machine_type}'
' --scopes=storage-full,gke-default'
f' {args.custom_cluster_arguments}'
)
Expand Down Expand Up @@ -1316,10 +1353,10 @@ def run_gke_node_pool_create_command(args, system) -> int:
f' {capacity_args}'
' --scopes=storage-full,gke-default'
' --enable-gvnic --max-pods-per-node 15'
f' {args.custom_tpu_nodepool_arguments}'
)
if system.accelerator_type == AcceleratorType['TPU']:
command += (f' --tpu-topology={system.topology}')
command += (f' {args.custom_tpu_nodepool_arguments}')
elif system.accelerator_type == AcceleratorType['GPU']:
command += f' --accelerator type={system.gke_accelerator},count={str(system.chips_per_vm)}'
task = f'NodepoolCreate-{node_pool_name}'
Expand Down Expand Up @@ -2182,6 +2219,8 @@ def create_accelerator_label(accelerator_type, system) -> str:
Returns:
The accelerator label.
"""
if accelerator_type == AcceleratorType['CPU']:
return ""
return f"{AcceleratorTypeToAcceleratorCharacteristics[accelerator_type].accelerator_label}: {system.gke_accelerator}"

def create_machine_label(accelerator_type, system) -> str:
Expand Down Expand Up @@ -2573,7 +2612,7 @@ def directory_path_type(value):
'--device-type',
type=str,
default=None,
help='The device type to use (can be tpu or gpu), v5litepod-16, h100-80gb-8, etc.'
help='The device type to use (can be tpu or gpu or cpu), v5litepod-16, h100-80gb-8, n2-standard-32-4 etc.'
)


Expand Down Expand Up @@ -2629,14 +2668,24 @@ def directory_path_type(value):
required=True,
)
cluster_create_optional_arguments.add_argument(
'--cluster-cpu-machine-type',
'--default-pool-cpu-machine-type',
type=str,
default='e2-standard-16',
help=(
'Set the machine tpu within the default cpu node pool. For'
'Set the machine type within the default cpu node pool. For'
' regional clusters, all zones must support the machine type.'
)
)
# Deprecated alias of --default-pool-cpu-machine-type.
# The default must be None rather than '': the cluster-create command treats
# any value that `is not None` as "the user passed this flag", prints the
# deprecation note and exits. An empty-string default would therefore trip
# that guard on every invocation, even when the flag was never supplied.
cluster_create_optional_arguments.add_argument(
    '--cluster-cpu-machine-type',
    type=str,
    default=None,
    help=(
        'Deprecated! Please use --default-pool-cpu-machine-type instead,'
        ' to denote the machine type of the default cpu node pool. Set'
        ' the machine type of other cpu nodepools using --device-type.'
    )
)
cluster_create_optional_arguments.add_argument(
'--custom-cluster-arguments',
type=str,
Expand Down Expand Up @@ -2884,7 +2933,7 @@ def directory_path_type(value):
'--device-type',
type=str,
default=None,
help='The device type to use (can be tpu or gpu), v5litepod-16, h100-80gb-8, etc.'
help='The device type to use (can be tpu or gpu or cpu), v5litepod-16, h100-80gb-8, n2-standard-32-4 etc.'
)

### Workload Optional Arguments
Expand Down

0 comments on commit f4d1ace

Please sign in to comment.