diff --git a/docs/2-Deployment/2-UPI_Install.mdx b/docs/2-Deployment/2-UPI_Install.mdx
index 409c9f67..bf9b49d0 100644
--- a/docs/2-Deployment/2-UPI_Install.mdx
+++ b/docs/2-Deployment/2-UPI_Install.mdx
@@ -604,4 +604,97 @@ Add the following under 'spec:'
   namespace: openshift-authentication
   servingCertKeyPairSecret:
     name: custom-cert
-```
\ No newline at end of file
+```
+
+## Increase Primary Disk Size on Worker Nodes
+
+1) Run the following bash one-liner to increase the primary disk on all worker nodes to 500GB:
+
+```
+aws ec2 describe-instances --query 'Reservations[*].Instances[*].[InstanceId,Tags[?Key==`Name`].Value|[0],BlockDeviceMappings[0].Ebs.VolumeId]' --output text | grep worker | awk '{print $3}' | while read volume_id; do aws ec2 modify-volume --volume-id $volume_id --size 500; done
+```
+
+2) Log into the node with the following command:
+
+```
+oc debug node/<node_name>
+```
+
+3) Once in the node, run the following:
+
+```
+chroot /host
+```
+
+then:
+
+```
+sudo lsblk
+```
+
+The output should look like this:
+
+```
+# sudo lsblk
+NAME        MAJ:MIN RM   SIZE RO TYPE MOUNTPOINTS
+nvme1n1     259:0    0     1T  0 disk
+nvme0n1     259:1    0   500G  0 disk
+|-nvme0n1p1 259:2    0     1M  0 part
+|-nvme0n1p2 259:3    0   127M  0 part
+|-nvme0n1p3 259:4    0   384M  0 part /boot
+`-nvme0n1p4 259:5    0 239.5G  0 part /var/lib/kubelet/pods/555d6f90-41fd-49d2-8aad-fa7293a924e4/volume-subpaths/app-config-override/wd-discovery-cnm-api/2
+                                      /var/lib/kubelet/pods/57d28f73-d355-4092-8e52-6b6aeec28bd5/volume-subpaths/clouseau-config/search/4
+                                      /var/lib/kubelet/pods/fcb4b5ec-ea1a-42c7-908c-a027cf885ca1/volume-subpaths/db2wh-cm/zen-database-core/1
+                                      /var/lib/kubelet/pods/fcb4b5ec-ea1a-42c7-908c-a027cf885ca1/volume-subpaths/db2oltp-cm/zen-database-core/0
+                                      /var/lib/kubelet/pods/5e35fe3f-7e4d-4729-b3dd-b9553ffd73f6/volume-subpaths/nginx-conf/monitoring-plugin/1
+                                      /var
+                                      /sysroot/ostree/deploy/rhcos/var
+                                      /sysroot
+                                      /usr
+                                      /etc
+                                      /
+```
+
+4) Find the partition on the disk that you wish to increase; in my case it was 'nvme0n1p4'.
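+
+If you want to confirm which partition backs the root filesystem before growing it, `findmnt` can print the backing device. This is a minimal, optional sanity check (on RHCOS the device may be reported with an ostree subpath suffix):
+
+```
+# Print the source device for the / mount, e.g. /dev/nvme0n1p4
+sudo findmnt -n -o SOURCE /
+```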
+
+Now we extend the partition by targeting the disk (example: /dev/nvme0n1) and the partition number (example: 4):
+
+```
+sudo growpart /dev/nvme0n1 4
+```
+
+5) Check the disk sizes again:
+
+```
+sudo lsblk
+```
+
+This is what my output looks like now:
+
+```
+NAME        MAJ:MIN RM   SIZE RO TYPE MOUNTPOINTS
+nvme1n1     259:0    0     1T  0 disk
+nvme0n1     259:1    0   500G  0 disk
+|-nvme0n1p1 259:2    0     1M  0 part
+|-nvme0n1p2 259:3    0   127M  0 part
+|-nvme0n1p3 259:4    0   384M  0 part /boot
+`-nvme0n1p4 259:5    0 499.5G  0 part /var/lib/kubelet/pods/555d6f90-41fd-49d2-8aad-fa7293a924e4/volume-subpaths/app-config-override/wd-discovery-cnm-api/2
+                                      /var/lib/kubelet/pods/57d28f73-d355-4092-8e52-6b6aeec28bd5/volume-subpaths/clouseau-config/search/4
+                                      /var/lib/kubelet/pods/fcb4b5ec-ea1a-42c7-908c-a027cf885ca1/volume-subpaths/db2wh-cm/zen-database-core/1
+                                      /var/lib/kubelet/pods/fcb4b5ec-ea1a-42c7-908c-a027cf885ca1/volume-subpaths/db2oltp-cm/zen-database-core/0
+                                      /var/lib/kubelet/pods/5e35fe3f-7e4d-4729-b3dd-b9553ffd73f6/volume-subpaths/nginx-conf/monitoring-plugin/1
+                                      /var
+                                      /sysroot/ostree/deploy/rhcos/var
+                                      /sysroot
+                                      /usr
+                                      /etc
+                                      /
+```
+
+6) The last step is to grow the XFS filesystem to fill the enlarged partition:
+
+```
+sudo xfs_growfs -d /
+```
+
diff --git a/docs/3-Installation/1-CP4D.mdx b/docs/3-Installation/1-CP4D.mdx
index b637959d..fade0329 100644
--- a/docs/3-Installation/1-CP4D.mdx
+++ b/docs/3-Installation/1-CP4D.mdx
@@ -552,6 +552,56 @@ cpd-cli service-instance list \
 ```
 
+
+### Generate a cpd-cli Profile
+
+Log in to the CP4D web UI with the info retrieved from `get-cpd-instance-details`, then go to your Profile and settings page in the Cloud Pak for Data client to generate an API key.
+
+In the upper right-hand corner, click `API key` -> `Generate new key`.
+
+Copy the generated key.
+
+Collect the web client URL and export it with the following command:
+
+```tsx
+export CPD_PROFILE_URL=$(oc get route cpd --namespace=${PROJECT_CPD_INST_OPERANDS} | tail -1 | awk '{print "https://"$2}')
+```
+
+We'll set our `profile-name` to the cluster name.
+
+Set the following vars:
+
+```tsx
+export API_KEY=
+export CPD_ADMIN_USER=cpadmin
+export LOCAL_USER=
+export CPD_PROFILE_NAME=wxai
+```
+
+Create a local user configuration to store your username and API key by using the `config users set` command:
+
+```tsx
+cpd-cli config users set ${LOCAL_USER} \
+--username ${CPD_ADMIN_USER} \
+--apikey ${API_KEY}
+```
+
+Create a profile to store the Cloud Pak for Data URL and associate the profile with your local user configuration by using the `config profiles set` command:
+
+```tsx
+cpd-cli config profiles set ${CPD_PROFILE_NAME} \
+--user ${LOCAL_USER} \
+--url ${CPD_PROFILE_URL}
+```
+
+You can now run cpd-cli commands with this profile, as shown in the following example:
+
+```tsx
+cpd-cli service-instance list \
+--profile=${CPD_PROFILE_NAME}
+```
+
+
 ## Installing our Cartridges
 
 Source the env file
@@ -601,6 +651,7 @@
 The default Production size in this case is more than suited for our purposes.
 :::
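+
+Before applying the olm, it's worth a quick check that the env file actually populated the variables the next commands depend on. A minimal sanity check, using only variables that appear earlier in this guide:
+
+```tsx
+# Both should print non-empty values
+echo "VERSION=${VERSION}"
+echo "PROJECT_CPD_INST_OPERANDS=${PROJECT_CPD_INST_OPERANDS}"
+```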
 #### Apply the olm
+
 ```tsx
 cpd-cli manage apply-olm \
 --release=${VERSION} \
diff --git a/static/scripts/Install-OCP-UPI/add_node.sh b/static/scripts/Install-OCP-UPI/add_node.sh
index d65cdfcd..7525d4b6 100644
--- a/static/scripts/Install-OCP-UPI/add_node.sh
+++ b/static/scripts/Install-OCP-UPI/add_node.sh
@@ -60,8 +60,8 @@
 do
     worker_ignition_url="https://api-int.${cluster_name}.${base_domain}:22623/config/worker"
    certificate_authorities=$(jq '.ignition.security.tls.certificateAuthorities[].source' ${ocp_data_dir}/worker.ign | sed -e 's/\"//g')
-    echo "**** Running: " ${bin_dir}/create_worker_param.sh "${cloudformation_dir}" "${infra_id}" "${aws_rhcos_ami_id}" "${gpu_subnet}" "${gpu_securitygroup}" "${worker_ignition_url}" "${certificate_authorities}" "${worker_instance_type}"
-    ${bin_dir}/create_worker_param.sh "${cloudformation_dir}" "${infra_id}" "${aws_rhcos_ami_id}" "${gpu_subnet}" "${gpu_securitygroup}" "${worker_ignition_url}" "${certificate_authorities}" "${worker_instance_type}"
+    echo "**** Running: " ${bin_dir}/create_worker_param.sh "${cloudformation_dir}" "${infra_id}" "${aws_rhcos_ami_id}" "${gpu_subnet}" "${gpu_securitygroup}" "${worker_ignition_url}" "${certificate_authorities}" "${gpu_instance_type}"
+    ${bin_dir}/create_worker_param.sh "${cloudformation_dir}" "${infra_id}" "${aws_rhcos_ami_id}" "${gpu_subnet}" "${gpu_securitygroup}" "${worker_ignition_url}" "${certificate_authorities}" "${gpu_instance_type}"
     if [[ $? -ne 0 ]]
     then
         echo "ERROR: create_worker_param.sh did not complete successfully"
diff --git a/static/scripts/Install-OCP-UPI/cloudformation/worker-template.yaml b/static/scripts/Install-OCP-UPI/cloudformation/worker-template.yaml
index b24f090b..8770a472 100644
--- a/static/scripts/Install-OCP-UPI/cloudformation/worker-template.yaml
+++ b/static/scripts/Install-OCP-UPI/cloudformation/worker-template.yaml
@@ -30,6 +30,7 @@ Parameters:
     Default: m5.large
     Type: String
     AllowedValues:
+    - "p4d.24xlarge"
     - "m4.large"
     - "m4.xlarge"
     - "m4.2xlarge"
@@ -144,7 +145,7 @@ Resources:
       BlockDeviceMappings:
       - DeviceName: /dev/xvda
         Ebs:
-          VolumeSize: "300"
+          VolumeSize: "500"
           VolumeType: "gp3"
           Encrypted: 'true'
       InstanceType: !Ref WorkerInstanceType
diff --git a/static/scripts/Install-OCP-UPI/config.sh b/static/scripts/Install-OCP-UPI/config.sh
index cdcb6282..05eceb50 100644
--- a/static/scripts/Install-OCP-UPI/config.sh
+++ b/static/scripts/Install-OCP-UPI/config.sh
@@ -42,4 +42,12 @@ master_1_subnet=$aws_private_subnets
 master_2_subnet=$aws_private_subnets
 worker_subnet_list=$aws_private_subnets #Don't Change
 worker_count=6
-worker_instance_type="m6i.8xlarge"
\ No newline at end of file
+worker_instance_type="m6i.8xlarge"
+
+###
+# GPU Node
+###
+gpu_count=1
+gpu_subnet="" #US-EAST-2
+gpu_instance_type="p4d.24xlarge"
+gpu_securitygroup=""
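+
+###
+# gpu_subnet and gpu_securitygroup must be filled in for your VPC before
+# adding the GPU node. One way to look up candidate values -- a sketch that
+# assumes your cluster's AWS resources carry the usual UPI Name tags built
+# from the infra ID (adjust the tag patterns to your environment):
+#
+#   aws ec2 describe-subnets \
+#     --filters "Name=tag:Name,Values=${infra_id}-private-*" \
+#     --query 'Subnets[].[SubnetId,AvailabilityZone]' --output text
+#
+#   aws ec2 describe-security-groups \
+#     --filters "Name=tag:Name,Values=${infra_id}-worker-sg" \
+#     --query 'SecurityGroups[].GroupId' --output text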