Skip to content

Commit

Permalink
Adding CPU support in XPK
Browse files Browse the repository at this point in the history
  • Loading branch information
RoshaniN committed Jan 26, 2024
1 parent 8897ac8 commit 410a1f4
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 10 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,7 @@ Please select a CPU type that exists in all zones in the region.
# Find CPU Types supported in zones.
gcloud compute machine-types list --zones=$ZONE_LIST
# Adjust default cpu machine type.
python3 xpk.py cluster create --cluster-cpu-machine-type=CPU_TYPE ...
python3 xpk.py cluster create --default-pool-cpu-machine-type=CPU_TYPE ...
```
## Permission Issues: `requires one of ["permission_name"] permission(s)`.
Expand Down
103 changes: 94 additions & 9 deletions xpk.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,15 @@
spec:
schedulerName: {args.scheduler}
restartPolicy: Never
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: cloud.google.com/gke-nodepool
operator: NotIn
values:
- default-pool
nodeSelector:
{accelerator_label}
{machine_label}
Expand All @@ -100,6 +109,27 @@
terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
containers:
{container}
env:
- name: REPLICATED_JOB_NAME
valueFrom:
fieldRef:
fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
- name: JOBSET_NAME
valueFrom:
fieldRef:
fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
- name: JAX_COORDINATOR_ADDRESS
value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)"
- name: JAX_PROCESS_ID
valueFrom:
fieldRef:
fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
- name: JOB_INDEX
valueFrom:
fieldRef:
fieldPath: metadata.annotations['jobset.sigs.k8s.io/job-index']
- name: JAX_PROCESS_COUNT
value: "{process_count}"
volumeMounts:
- mountPath: /dev/shm
name: dshm-2
Expand Down Expand Up @@ -247,7 +277,8 @@

AcceleratorType = {
'TPU': 1,
'GPU': 2
'GPU': 2,
'CPU': 3
}

@dataclass
Expand All @@ -264,6 +295,10 @@ class AcceleratorCharacteristics:
# GPU
AcceleratorType['GPU']: AcceleratorCharacteristics(
'nvidia.com/gpu', 'cloud.google.com/gke-accelerator', 'cloud.google.com/gce-machine-type'
),
# CPU
AcceleratorType['CPU']: AcceleratorCharacteristics(
'cpu', '', 'cloud.google.com/gke-nodepool'
)
}

Expand Down Expand Up @@ -643,6 +678,28 @@ class SystemCharacteristics:
'v4-4096': SystemCharacteristics(
'8x16x16', 512,'tpu-v4-podslice', 'ct4p-hightpu-4t', 4, AcceleratorType['TPU'], 'v4-4096'
),

# CPU system characteristics
# n2-standard-32-$VMs
'n2-standard-32-1': SystemCharacteristics(
'N/A', 1,'N/A', 'n2-standard-32', 1, AcceleratorType['CPU'], 'n2-standard-32-1'
),
'n2-standard-32-2': SystemCharacteristics(
'N/A', 2,'N/A', 'n2-standard-32', 1, AcceleratorType['CPU'], 'n2-standard-32-2'
),
'n2-standard-32-4': SystemCharacteristics(
'N/A', 4,'N/A', 'n2-standard-32', 1, AcceleratorType['CPU'], 'n2-standard-32-4'
),
'n2-standard-32-8': SystemCharacteristics(
'N/A', 8,'N/A', 'n2-standard-32', 1, AcceleratorType['CPU'], 'n2-standard-32-8'
),

'n2-standard-32-256': SystemCharacteristics(
'N/A', 256,'N/A', 'n2-standard-32', 1, AcceleratorType['CPU'], 'n2-standard-32-256'
),
'n2-standard-32-1048': SystemCharacteristics(
'N/A', 1048,'N/A', 'n2-standard-32', 1, AcceleratorType['CPU'], 'n2-standard-32-1048'
),
}
""" If you modify UserFacingNameToSystemCharacteristics you should also modify the corresponding
Map in MaxText/accelerator_to_spec_map.py """
Expand Down Expand Up @@ -1036,6 +1093,12 @@ def run_gke_cluster_create_command(args) -> int:
Returns:
0 if successful and 1 otherwise.
"""
# cluster_cpu_machine_type is being deprecated!
if args.cluster_cpu_machine_type != '':
xpk_print('Note that cluster-cpu-machine-type is deprecated,',
'please use default-pool-cpu-machine-type instead!',
args.cluster_cpu_machine_type)
xpk_exit(0)

# Create the regional cluster with `num-nodes` CPU nodes in the same zone as
# TPUs. This has been tested with clusters of 300 VMs. Larger clusters will
Expand All @@ -1048,7 +1111,7 @@ def run_gke_cluster_create_command(args) -> int:
f' --node-locations={args.zone}'
f' --project={args.project} --region={zone_to_region(args.zone)}'
f' --cluster-version={args.gke_version} --location-policy=BALANCED'
f' --machine-type={args.cluster_cpu_machine_type}'
f' --machine-type={args.default_pool_cpu_machine_type}'
' --scopes=storage-full,gke-default'
f' {args.custom_cluster_arguments}'
)
Expand Down Expand Up @@ -1307,7 +1370,7 @@ def run_gke_node_pool_create_command(args, system) -> int:
command = (
'gcloud beta container node-pools create'
f' {node_pool_name} --node-version={args.gke_version}'
f' --placement-type=COMPACT --cluster={args.cluster}'
f' --cluster={args.cluster}'
f' --project={args.project} --node-locations={args.zone}'
f' --region={zone_to_region(args.zone)}'
f' --num-nodes={system.vms_per_slice}'
Expand All @@ -1316,11 +1379,13 @@ def run_gke_node_pool_create_command(args, system) -> int:
f' {capacity_args}'
' --scopes=storage-full,gke-default'
' --enable-gvnic --max-pods-per-node 15'
f' {args.custom_tpu_nodepool_arguments}'
)
if system.accelerator_type == AcceleratorType['TPU']:
command += (f' --placement-type=COMPACT ')
command += (f' --tpu-topology={system.topology}')
command += (f' {args.custom_tpu_nodepool_arguments}')
elif system.accelerator_type == AcceleratorType['GPU']:
command += (f' --placement-type=COMPACT ')
command += f' --accelerator type={system.gke_accelerator},count={str(system.chips_per_vm)}'
task = f'NodepoolCreate-{node_pool_name}'
commands.append(command)
Expand Down Expand Up @@ -1486,6 +1551,7 @@ def enable_kueue_crds(args, system) -> int:
total_chips=total_chips,
accelerator_label=create_accelerator_label(system.accelerator_type, system),
machine_label=create_machine_label(system.accelerator_type, system),
process_count=calculate_process_count(args.num_slices,system.vms_per_slice),
resource_type=AcceleratorTypeToAcceleratorCharacteristics[system.accelerator_type].resource_type
)
tmp = write_temporary_file(yml_string)
Expand Down Expand Up @@ -1612,7 +1678,7 @@ def cluster_create(args) -> int:
if create_cluster_configmap_code != 0:
xpk_exit(create_cluster_configmap_code)

xpk_print('GKE commands done! TPUs are created.')
xpk_print('GKE commands done! Resources are created.')
xpk_print(
'See your GKE Cluster here:'
# pylint: disable=line-too-long
Expand Down Expand Up @@ -2051,6 +2117,7 @@ def get_main_container(args, system, docker_image, command, resource_type) -> st
ports:
- containerPort: 8471
- containerPort: 8080
- containerPort: 1234
securityContext:
privileged: true
command:
Expand Down Expand Up @@ -2182,6 +2249,8 @@ def create_accelerator_label(accelerator_type, system) -> str:
Returns:
The accelerator label.
"""
if accelerator_type == AcceleratorType['CPU']:
return ""
return f"{AcceleratorTypeToAcceleratorCharacteristics[accelerator_type].accelerator_label}: {system.gke_accelerator}"

def create_machine_label(accelerator_type, system) -> str:
Expand All @@ -2198,6 +2267,11 @@ def create_machine_label(accelerator_type, system) -> str:
return f"{AcceleratorTypeToAcceleratorCharacteristics[accelerator_type].machine_label}: {system.topology}"
return ""

def calculate_process_count(num_slices, vms_per_slice) -> str:
  """Return the total process count for a workload as a decimal string.

  The count is one process per VM across all slices, i.e.
  `num_slices * vms_per_slice`. The result is returned as a string because
  callers substitute it into a YAML template via the `process_count` field.

  Args:
    num_slices: number of slices; an int or anything `int()` accepts
      (for example a numeric string coming from the command line).
    vms_per_slice: number of VMs in each slice; same accepted types.

  Returns:
    The product of the two values, formatted as a base-10 string.

  Raises:
    ValueError: if either argument is a string that is not a valid integer.
    TypeError: if either argument is of a type `int()` cannot convert
      (for example `None`).
  """
  # Coerce both inputs so that string-typed CLI values multiply numerically
  # instead of triggering string repetition or a TypeError.
  total_processes = int(num_slices) * int(vms_per_slice)
  return str(total_processes)

def get_system_characteristics(args) -> tuple[SystemCharacteristics|None, int]:
"""Get system characteristics based on user provided arguments.
Expand Down Expand Up @@ -2277,6 +2351,7 @@ def workload_create(args) -> int:
container=container,
accelerator_label=create_accelerator_label(system.accelerator_type, system),
machine_label=create_machine_label(system.accelerator_type, system),
process_count=calculate_process_count(args.num_slices,system.vms_per_slice),
resource_type=resource_type)
tmp = write_temporary_file(yml_string)
command = f'kubectl apply -f {str(tmp.file.name)}'
Expand Down Expand Up @@ -2573,7 +2648,7 @@ def directory_path_type(value):
'--device-type',
type=str,
default=None,
help='The device type to use (can be tpu or gpu), v5litepod-16, h100-80gb-8, etc.'
help='The device type to use (can be tpu or gpu or cpu), v5litepod-16, h100-80gb-8, n2-standard-32-4 etc.'
)


Expand Down Expand Up @@ -2629,14 +2704,24 @@ def directory_path_type(value):
required=True,
)
cluster_create_optional_arguments.add_argument(
'--cluster-cpu-machine-type',
'--default-pool-cpu-machine-type',
type=str,
default='e2-standard-16',
help=(
'Set the machine tpu within the default cpu node pool. For'
'Set the machine type within the default cpu node pool. For'
' regional clusters, all zones must support the machine type.'
)
)
cluster_create_optional_arguments.add_argument(
'--cluster-cpu-machine-type',
type=str,
default='',
help=(
'Deprecated! Please use --default-pool-cpu-machine-type instead,'
' to denote the machine type of the default cpu node pool. Set'
' the machine type of other cpu nodepools using --device-type.'
)
)
cluster_create_optional_arguments.add_argument(
'--custom-cluster-arguments',
type=str,
Expand Down Expand Up @@ -2884,7 +2969,7 @@ def directory_path_type(value):
'--device-type',
type=str,
default=None,
help='The device type to use (can be tpu or gpu), v5litepod-16, h100-80gb-8, etc.'
help='The device type to use (can be tpu or gpu or cpu), v5litepod-16, h100-80gb-8, n2-standard-32-4 etc.'
)

### Workload Optional Arguments
Expand Down

0 comments on commit 410a1f4

Please sign in to comment.