You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
/home/mediaadm# kubectl logs nvidia-vgpu-manager-daemonset-5t77m -n gpu-operator
+ DRIVER_VERSION=535.54.06
+ DRIVER_ARCH=x86_64
+ DRIVER_RESET_RETRIES=10
++ uname -r
+ KERNEL_VERSION=5.4.0-153-generic
+ RUN_DIR=/run/nvidia
+ export DEBIAN_FRONTEND=noninteractive
+ DEBIAN_FRONTEND=noninteractive
+ '[' 1 -eq 0 ']'
+ command=init
+ shift
+ case "${command}" in
++ getopt -l accept-license -o a --
+ options=' --'
+ '[' 0 -ne 0 ']'
+ eval set -- ' --'
++ set -- --
+ ACCEPT_LICENSE=
++ uname -r
+ KERNEL_VERSION=5.4.0-153-generic
+ PRIVATE_KEY=
+ PACKAGE_TAG=
+ for opt in ${options}
+ case "$opt" in
+ shift
+ break
+ '[' 0 -ne 0 ']'
+ init
+ trap 'echo '\''Caught signal'\''; exit 1' HUP INT QUIT PIPE TERM
+ trap _shutdown EXIT
+ _unload_driver
+ rmmod_args=()
+ local rmmod_args
+ local nvidia_deps=0
+ local nvidia_refs=0
+ local nvidia_vgpu_vfio_refs=0
+ echo 'Stopping NVIDIA vGPU Manager...'
+ '[' -f /var/run/nvidia-vgpu-mgr/nvidia-vgpu-mgr.pid ']'
+ echo 'Unloading NVIDIA driver kernel modules...'
+ '[' -f /sys/module/nvidia_vgpu_vfio/refcnt ']'
+ '[' -f /sys/module/nvidia/refcnt ']'
+ '[' 0 -gt 0 ']'
+ return 0
+ _unmount_rootfs
+ echo 'Unmounting NVIDIA driver rootfs...'
Stopping NVIDIA vGPU Manager...
Unloading NVIDIA driver kernel modules...
Unmounting NVIDIA driver rootfs...
+ findmnt -r -o TARGET
+ grep /run/nvidia/driver
+ _update_package_cache
+ '[' '' '!=' builtin ']'
+ echo 'Updating the package cache...'
+ apt-get -qq update
Updating the package cache...
+ _resolve_kernel_version
++ apt-cache show linux-headers-5.4.0-153-generic
++ sed -nE 's/^Version:\s+(([0-9]+\.){2}[0-9]+)[-.]([0-9]+).*/\1-\3/p'
++ head -1
+ local version=5.4.0-153
++ echo 5.4.0-153-generic
++ sed 's/[^a-z]*//'
++ grep -Ev '^generic|virtual'
+ local flavor=
+ echo 'Resolving Linux kernel version...'
+ '[' -z 5.4.0-153 ']'
+ KERNEL_VERSION=5.4.0-153-generic
+ echo 'Proceeding with Linux kernel version 5.4.0-153-generic'
+ return 0
+ _install_prerequisites
Resolving Linux kernel version...
Proceeding with Linux kernel version 5.4.0-153-generic
++ mktemp -d
/tmp/tmp.SD4RANswGW /driver
+ local tmp_dir=/tmp/tmp.SD4RANswGW
+ trap 'popd; rm -rf /tmp/tmp.SD4RANswGW' RETURN EXIT
+ pushd /tmp/tmp.SD4RANswGW
+ rm -rf /lib/modules/5.4.0-153-generic
+ mkdir -p /lib/modules/5.4.0-153-generic/proc
+ echo 'Installing Linux kernel headers...'
+ apt-get -qq install --no-install-recommends linux-headers-5.4.0-153-generic
Installing Linux kernel headers...
+ echo 'Installing Linux kernel module files...'
+ apt-get -qq download linux-image-5.4.0-153-generic
Installing Linux kernel module files...
+ dpkg -x linux-image-5.4.0-153-generic_5.4.0-153.170_amd64.deb .
+ mv lib/modules/5.4.0-153-generic/modules.builtin lib/modules/5.4.0-153-generic/modules.builtin.modinfo lib/modules/5.4.0-153-generic/modules.order /lib/modules/5.4.0-153-generic
+ mv lib/modules/5.4.0-153-generic/kernel /lib/modules/5.4.0-153-generic
+ depmod 5.4.0-153-generic
+ echo 'Generating Linux kernel version string...'
Generating Linux kernel version string...
+ file boot/vmlinuz-5.4.0-153-generic
+ awk 'BEGIN { RS="," } $1=="version" { print $2 }' -
+ '[' -z 5.4.0-153-generic ']'
+ mv version /lib/modules/5.4.0-153-generic/proc
++ popd
++ rm -rf /tmp/tmp.SD4RANswGW
/driver
Creating '/dev/char' directory
+ _create_dev_char_directory
+ '[' '!' -d /dev/char ']'
+ echo 'Creating '\''/dev/char'\'' directory'
+ mkdir -p /dev/char
+ _install_driver
++ mktemp -d
+ local tmp_dir=/tmp/tmp.NHH0IVw9KN
+ sh NVIDIA-Linux-x86_64-535.54.06-vgpu-kvm.run --ui=none --no-questions --tmpdir /tmp/tmp.NHH0IVw9KN --no-systemd
Verifying archive integrity... OK
Uncompressing NVIDIA Accelerated Graphics Driver for Linux-x86_64 535.54.06.......................................................................................................................................................................................................................................................
Welcome to the NVIDIA Software Installer for Unix/Linux
Detected 160 CPUs online; setting concurrency level to 32.
Installing NVIDIA driver version 535.54.06.
Performing CC sanity check with CC="/usr/bin/cc".
Performing CC check.
Kernel source path: '/lib/modules/5.4.0-153-generic/build'
Kernel output path: '/lib/modules/5.4.0-153-generic/build'
Performing Compiler check.
Performing Dom0 check.
Performing Xen check.
Performing PREEMPT_RT check.
Performing vgpu_kvm check.
Cleaning kernel module build directory.
Building kernel modules
: [##############################] 100%
Kernel module compilation complete.
Unable to determine if Secure Boot is enabled: No such file or directory
Kernel messages:
[ 3642.071317] libceph: osd39 up
[ 3642.071458] libceph: osd15 up
[ 3642.071691] libceph: osd23 up
[ 3642.071693] libceph: osd43 up
[ 3646.457736] IPVS: rr: TCP 10.233.15.42:3300 - no destination available
[ 3652.310917] libceph: osd12 up
[ 8477.022034] device 620e8ef2b77d_h entered promiscuous mode
[ 8477.195217] eth0: renamed from 620e8ef2b77d_c
[ 8477.222976] IPv6: ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready
[ 8477.223138] IPv6: ADDRCONF(NETDEV_CHANGE): 620e8ef2b77d_h: link becomes ready
[ 8501.520850] device 4306c9246b7e_h entered promiscuous mode
[ 8501.670950] eth0: renamed from 4306c9246b7e_c
[ 8501.710406] IPv6: ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready
[ 8501.710522] IPv6: ADDRCONF(NETDEV_CHANGE): 4306c9246b7e_h: link becomes ready
[ 8506.705552] device 08e0daa14e14_h entered promiscuous mode
[ 8506.927346] eth0: renamed from 08e0daa14e14_c
[ 8506.963956] IPv6: ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready
[ 8506.964170] IPv6: ADDRCONF(NETDEV_CHANGE): 08e0daa14e14_h: link becomes ready
[ 8542.876131] nvidia: loading out-of-tree module taints kernel.
[ 8542.876143] nvidia: module license 'NVIDIA' taints kernel.
[ 8542.876144] Disabling lock debugging due to kernel taint
[ 8542.903020] nvidia: module verification failed: signature and/or required key missing - tainting kernel
[ 8542.918925] nvidia-nvlink: Nvlink Core is being initialized, major device number 234
[ 8543.305768] NVRM: loading NVIDIA UNIX x86_64 Kernel Module 535.54.06 Wed Jun 14 21:19:12 UTC 2023
[ 8543.361064] nvidia-nvlink: Unregistered Nvlink Core, major device number 234
Searching for conflicting files:
Searching: [##############################] 100%
Installing 'NVIDIA Accelerated Graphics Driver for Linux-x86_64' (535.54.06):
Installing: [##############################] 100%
Driver file installation is complete.
Running distribution scripts
Executing /usr/lib/nvidia/post-install: [##############################] 100%
Running post-install sanity check:
Checking: [##############################] 100%
Post-install sanity check passed.
Installation of the NVIDIA Accelerated Graphics Driver for Linux-x86_64 (version: 535.54.06) is now complete.
+ _load_driver
+ '[' '!' -f /sys/module/nvidia_vgpu_vfio/refcnt ']'
+ /usr/bin/nvidia-vgpud
+ '[' '!' -f /sys/module/nvidia/refcnt ']'
+ return 0
+ _mount_rootfs
+ echo 'Mounting NVIDIA driver rootfs...'
+ /usr/bin/nvidia-vgpu-mgr
+ mount -o remount,rw /sys
Mounting NVIDIA driver rootfs...
+ mount --make-runbindable /sys
+ mount --make-private /sys
+ mkdir -p /run/nvidia/driver
+ mount --rbind / /run/nvidia/driver
+ _enable_vfs
+ local retry
+ (( retry = 0 ))
+ (( retry <= 10 ))
+ /usr/lib/nvidia/sriov-manage -e ALL
Enabling VFs on 0000:1b:00.0
Cannot obtain unbindLock for 0000:1b:00.0
+ '[' 0 == 10 ']'
+ (( retry++ ))
+ (( retry <= 10 ))
+ /usr/lib/nvidia/sriov-manage -e ALL
Enabling VFs on 0000:1b:00.0
Cannot obtain unbindLock for 0000:1b:00.0
+ '[' 1 == 10 ']'
+ (( retry++ ))
+ (( retry <= 10 ))
+ /usr/lib/nvidia/sriov-manage -e ALL
Enabling VFs on 0000:1b:00.0
Enabling VFs on 0000:1c:00.0
Enabling VFs on 0000:1d:00.0
Enabling VFs on 0000:1e:00.0
Enabling VFs on 0000:ce:00.0
Enabling VFs on 0000:cf:00.0
Enabling VFs on 0000:d0:00.0
Enabling VFs on 0000:d1:00.0
+ return 0
+ pgrep nvidia-vgpu-mgr
+ nvidia-vgpud
+ set +x
Done, now waiting for signal
Logs of the vGPU Device Manager pod
kubectl logs nvidia-vgpu-device-manager-hg4k7 -n gpu-operator -f vgpu-manager-validation
waiting for NVIDIA vGPU Manager to be setup
waiting for NVIDIA vGPU Manager to be setup
waiting for NVIDIA vGPU Manager to be setup
waiting for NVIDIA vGPU Manager to be setup
waiting for NVIDIA vGPU Manager to be setup
waiting for NVIDIA vGPU Manager to be setup
waiting for NVIDIA vGPU Manager to be setup
waiting for NVIDIA vGPU Manager to be setup
waiting for NVIDIA vGPU Manager to be setup
waiting for NVIDIA vGPU Manager to be setup
waiting for NVIDIA vGPU Manager to be setup
waiting for NVIDIA vGPU Manager to be setup
waiting for NVIDIA vGPU Manager to be setup
waiting for NVIDIA vGPU Manager to be setup
waiting for NVIDIA vGPU Manager to be setup
waiting for NVIDIA vGPU Manager to be setup
waiting for NVIDIA vGPU Manager to be setup
waiting for NVIDIA vGPU Manager to be setup
waiting for NVIDIA vGPU Manager to be setup
The text was updated successfully, but these errors were encountered:
@aravindgpd what is the status of the nvidia-sandbox-validator pod? The vgpu-device-manager will block until a status file is created by nvidia-sandbox-validator indicating that the vGPU Manager is finished installing.
vGPU Manager pod logs
Logs of the vGPU Device Manager pod
The text was updated successfully, but these errors were encountered: