diff --git a/README.md b/README.md index b5743351..2163900f 100644 --- a/README.md +++ b/README.md @@ -403,7 +403,7 @@ Please select a CPU type that exists in all zones in the region. # Find CPU Types supported in zones. gcloud compute machine-types list --zones=$ZONE_LIST # Adjust default cpu machine type. -python3 xpk.py cluster create --cluster-cpu-machine-type=CPU_TYPE ... +python3 xpk.py cluster create --default-pool-cpu-machine-type=CPU_TYPE ... ``` ## Permission Issues: `requires one of ["permission_name"] permission(s)`. diff --git a/xpk.py b/xpk.py index 59e8a9fa..f16b65d5 100644 --- a/xpk.py +++ b/xpk.py @@ -91,6 +91,15 @@ spec: schedulerName: {args.scheduler} restartPolicy: Never + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: 'cloud.google.com/gke-nodepool' + operator: NotIn + values: + - default-pool nodeSelector: {accelerator_label} {machine_label} @@ -246,7 +255,8 @@ AcceleratorType = { 'TPU': 1, - 'GPU': 2 + 'GPU': 2, + 'CPU': 3 } @dataclass @@ -263,6 +273,10 @@ class AcceleratorCharacteristics: # GPU AcceleratorType['GPU']: AcceleratorCharacteristics( 'nvidia.com/gpu', 'cloud.google.com/gke-accelerator', 'cloud.google.com/gce-machine-type' + ), + # CPU + AcceleratorType['CPU']: AcceleratorCharacteristics( + 'cpu', '', 'cloud.google.com/gke-nodepool' ) } @@ -642,6 +656,25 @@ class SystemCharacteristics: 'v4-4096': SystemCharacteristics( '8x16x16', 512,'tpu-v4-podslice', 'ct4p-hightpu-4t', 4, AcceleratorType['TPU'], 'v4-4096' ), + + # CPU system characteristics + # n2-standard-32-$VMs + 'n2-standard-32-1': SystemCharacteristics( + 'N/A', 1,'N/A', 'n2-standard-32', 1, AcceleratorType['CPU'], 'n2-standard-32-1' + ), + 'n2-standard-32-2': SystemCharacteristics( + 'N/A', 2,'N/A', 'n2-standard-32', 1, AcceleratorType['CPU'], 'n2-standard-32-2' + ), + 'n2-standard-32-4': SystemCharacteristics( + 'N/A', 4,'N/A', 'n2-standard-32', 1, AcceleratorType['CPU'], 'n2-standard-32-4' + ), +
'n2-standard-32-8': SystemCharacteristics( + 'N/A', 8,'N/A', 'n2-standard-32', 1, AcceleratorType['CPU'], 'n2-standard-32-8' + ), + + 'n2-standard-32-256': SystemCharacteristics( + 'N/A', 256,'N/A', 'n2-standard-32', 1, AcceleratorType['CPU'], 'n2-standard-32-256' + ), } """ If you modify UserFacingNameToSystemCharacteristics you should also modify the corresponding Map in MaxText/accelerator_to_spec_map.py """ @@ -1047,7 +1080,7 @@ def run_gke_cluster_create_command(args) -> int: f' --node-locations={args.zone}' f' --project={args.project} --region={zone_to_region(args.zone)}' f' --cluster-version={args.gke_version} --location-policy=BALANCED' - f' --machine-type={args.cluster_cpu_machine_type}' + f' --machine-type={args.default_pool_cpu_machine_type}' ' --scopes=storage-full,gke-default' f' {args.custom_cluster_arguments}' ) @@ -1315,10 +1348,10 @@ def run_gke_node_pool_create_command(args, system) -> int: f' {capacity_args}' ' --scopes=storage-full,gke-default' ' --enable-gvnic --max-pods-per-node 15' - f' {args.custom_tpu_nodepool_arguments}' ) if system.accelerator_type == AcceleratorType['TPU']: command += (f' --tpu-topology={system.topology}') + command += (f' {args.custom_tpu_nodepool_arguments}') elif system.accelerator_type == AcceleratorType['GPU']: command += f' --accelerator type={system.gke_accelerator},count={str(system.chips_per_vm)}' task = f'NodepoolCreate-{node_pool_name}' @@ -2181,6 +2214,8 @@ def create_accelerator_label(accelerator_type, system) -> str: Returns: The accelerator label. """ + if accelerator_type == AcceleratorType['CPU']: + return "" return f"{AcceleratorTypeToAcceleratorCharacteristics[accelerator_type].accelerator_label}: {system.gke_accelerator}" def create_machine_label(accelerator_type, system) -> str: @@ -2572,7 +2607,7 @@ def directory_path_type(value): '--device-type', type=str, default=None, - help='The device type to use (can be tpu or gpu), v5litepod-16, h100-80gb-8, etc.' 
+ help='The device type to use (can be tpu or gpu or cpu), v5litepod-16, h100-80gb-8, n2-standard-32-4 etc.' ) @@ -2628,14 +2663,24 @@ def directory_path_type(value): required=True, ) cluster_create_optional_arguments.add_argument( - '--cluster-cpu-machine-type', + '--default-pool-cpu-machine-type', type=str, default='e2-standard-16', help=( - 'Set the machine tpu within the default cpu node pool. For' + 'Set the machine type within the default cpu node pool. For' ' regional clusters, all zones must support the machine type.' ) ) +cluster_create_optional_arguments.add_argument( + '--cluster-cpu-machine-type', + type=str, + default='e2-standard-16', + help=( + 'Deprecated! Please use --default-pool-cpu-machine-type instead,' + ' to denote the machine type of the default cpu node pool. Set' + ' the machine type of other cpu nodepools using --device-type.' + ) +) cluster_create_optional_arguments.add_argument( '--custom-cluster-arguments', type=str, @@ -2883,7 +2928,7 @@ def directory_path_type(value): '--device-type', type=str, default=None, - help='The device type to use (can be tpu or gpu), v5litepod-16, h100-80gb-8, etc.' + help='The device type to use (can be tpu or gpu or cpu), v5litepod-16, h100-80gb-8, n2-standard-32-4 etc.' ) ### Workload Optional Arguments