feat: package GPU libraries inside backend containers for unified base image (#7891)

* Initial plan

* Add GPU library packaging for isolated backend environments

- Create scripts/build/package-gpu-libs.sh for packaging CUDA, ROCm, SYCL, and Vulkan libraries
- Update llama-cpp, whisper, stablediffusion-ggml package.sh to include GPU libraries
- Update Dockerfile.python to package GPU libraries into Python backends
- Update libbackend.sh to set LD_LIBRARY_PATH for GPU library loading
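A minimal sketch of how a backend packaging step picks this up (mirroring the package.sh changes in the diff below; the package path is illustrative):

GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
if [ -f "$GPU_LIB_SCRIPT" ]; then
  # BUILD_TYPE (e.g. cublas, hipblas, vulkan) selects which libraries get copied
  source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
  package_gpu_libs
fi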

Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>

* Address code review feedback: fix variable consistency and quoting

Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>

* Fix code review issues: improve glob handling and remove redundant variable

Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>

* Simplify main Dockerfile and workflow to use unified base image

- Remove GPU-specific driver installation from Dockerfile (CUDA, ROCm, Vulkan, Intel)
- Simplify image.yml workflow to build single unified base image for linux/amd64 and linux/arm64
- GPU libraries are now packaged in individual backend containers

Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Copilot 2026-01-07 15:48:51 +01:00 committed by GitHub
parent 7abc0242bb
commit fd53978a7b
8 changed files with 331 additions and 262 deletions


@@ -13,42 +13,7 @@ concurrency:
cancel-in-progress: true
jobs:
hipblas-jobs:
uses: ./.github/workflows/image_build.yml
with:
tag-latest: ${{ matrix.tag-latest }}
tag-suffix: ${{ matrix.tag-suffix }}
build-type: ${{ matrix.build-type }}
cuda-major-version: ${{ matrix.cuda-major-version }}
cuda-minor-version: ${{ matrix.cuda-minor-version }}
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
base-image: ${{ matrix.base-image }}
grpc-base-image: ${{ matrix.grpc-base-image }}
aio: ${{ matrix.aio }}
makeflags: ${{ matrix.makeflags }}
ubuntu-version: ${{ matrix.ubuntu-version }}
ubuntu-codename: ${{ matrix.ubuntu-codename }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
strategy:
matrix:
include:
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-hipblas'
base-image: "rocm/dev-ubuntu-24.04:6.4.4"
grpc-base-image: "ubuntu:24.04"
runs-on: 'ubuntu-latest'
makeflags: "--jobs=3 --output-sync=target"
aio: "-aio-gpu-hipblas"
ubuntu-version: '2404'
ubuntu-codename: 'noble'
# Unified base image build - GPU drivers are now packaged in individual backends
core-image-build:
uses: ./.github/workflows/image_build.yml
with:
@@ -72,9 +37,10 @@ jobs:
quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
strategy:
#max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
matrix:
include:
# Unified base image for all platforms
# GPU-specific backends will be pulled at runtime and contain their own GPU libraries
- build-type: ''
platforms: 'linux/amd64,linux/arm64'
tag-latest: 'auto'
@@ -86,101 +52,3 @@ jobs:
skip-drivers: 'false'
ubuntu-version: '2404'
ubuntu-codename: 'noble'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "9"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
makeflags: "--jobs=4 --output-sync=target"
aio: "-aio-gpu-nvidia-cuda-12"
ubuntu-version: '2404'
ubuntu-codename: 'noble'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-13'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
makeflags: "--jobs=4 --output-sync=target"
aio: "-aio-gpu-nvidia-cuda-13"
ubuntu-version: '2404'
ubuntu-codename: 'noble'
- build-type: 'vulkan'
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-vulkan'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
makeflags: "--jobs=4 --output-sync=target"
aio: "-aio-gpu-vulkan"
ubuntu-version: '2404'
ubuntu-codename: 'noble'
- build-type: 'intel'
platforms: 'linux/amd64'
tag-latest: 'auto'
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
grpc-base-image: "ubuntu:24.04"
tag-suffix: '-gpu-intel'
runs-on: 'ubuntu-latest'
makeflags: "--jobs=3 --output-sync=target"
aio: "-aio-gpu-intel"
ubuntu-version: '2404'
ubuntu-codename: 'noble'
gh-runner:
uses: ./.github/workflows/image_build.yml
with:
tag-latest: ${{ matrix.tag-latest }}
tag-suffix: ${{ matrix.tag-suffix }}
build-type: ${{ matrix.build-type }}
cuda-major-version: ${{ matrix.cuda-major-version }}
cuda-minor-version: ${{ matrix.cuda-minor-version }}
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
aio: ${{ matrix.aio }}
base-image: ${{ matrix.base-image }}
grpc-base-image: ${{ matrix.grpc-base-image }}
makeflags: ${{ matrix.makeflags }}
skip-drivers: ${{ matrix.skip-drivers }}
ubuntu-version: ${{ matrix.ubuntu-version }}
ubuntu-codename: ${{ matrix.ubuntu-codename }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
strategy:
matrix:
include:
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "9"
platforms: 'linux/arm64'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-arm64'
base-image: "ubuntu:24.04"
runs-on: 'ubuntu-24.04-arm'
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'true'
ubuntu-version: "2404"
ubuntu-codename: 'noble'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/arm64'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-arm64-cuda-13'
base-image: "ubuntu:24.04"
runs-on: 'ubuntu-24.04-arm'
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false'
ubuntu-version: '2404'
ubuntu-codename: 'noble'


@@ -14,7 +14,9 @@ RUN apt-get update && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# The requirements-drivers target is for BUILD_TYPE specific items. If you need to install something specific to CUDA, or specific to ROCM, it goes here.
# GPU drivers are no longer installed in the main image.
# Each backend now packages its own GPU libraries (CUDA, ROCm, SYCL, Vulkan)
# This allows for a unified base image that works with any backend.
FROM requirements AS requirements-drivers
ARG BUILD_TYPE
@@ -29,132 +31,6 @@ ARG UBUNTU_VERSION=2404
RUN mkdir -p /run/localai
RUN echo "default" > /run/localai/capability
# Vulkan requirements
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils wget gpg-agent && \
apt-get install -y libglm-dev cmake libxcb-dri3-0 libxcb-present0 libpciaccess0 \
libpng-dev libxcb-keysyms1-dev libxcb-dri3-dev libx11-dev g++ gcc \
libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils mesa-vulkan-drivers && \
wget "https://sdk.lunarg.com/sdk/download/1.4.328.1/linux/vulkansdk-linux-x86_64-1.4.328.1.tar.xz" && \
tar -xf vulkansdk-linux-x86_64-1.4.328.1.tar.xz && \
rm vulkansdk-linux-x86_64-1.4.328.1.tar.xz && \
mkdir -p /opt/vulkan-sdk && \
mv 1.4.328.1 /opt/vulkan-sdk/ && \
cd /opt/vulkan-sdk/1.4.328.1 && \
./vulkansdk --no-deps --maxjobs \
vulkan-loader \
vulkan-validationlayers \
vulkan-extensionlayer \
vulkan-tools \
shaderc && \
cp -rfv /opt/vulkan-sdk/1.4.328.1/x86_64/bin/* /usr/bin/ && \
cp -rfv /opt/vulkan-sdk/1.4.328.1/x86_64/lib/* /usr/lib/x86_64-linux-gnu/ && \
cp -rfv /opt/vulkan-sdk/1.4.328.1/x86_64/include/* /usr/include/ && \
cp -rfv /opt/vulkan-sdk/1.4.328.1/x86_64/share/* /usr/share/ && \
rm -rf /opt/vulkan-sdk && \
ldconfig && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
echo "vulkan" > /run/localai/capability
fi
EOT
# CuBLAS requirements
RUN <<EOT bash
if ( [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "l4t" ] ) && [ "${SKIP_DRIVERS}" = "false" ]; then
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils
if [ "amd64" = "$TARGETARCH" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
fi
if [ "arm64" = "$TARGETARCH" ]; then
if [ "${CUDA_MAJOR_VERSION}" = "13" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
else
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/arm64/cuda-keyring_1.1-1_all.deb
fi
fi
dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \
apt-get update && \
apt-get install -y --no-install-recommends \
cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
if [ "arm64" = "$TARGETARCH" ]; then
apt-get install -y --no-install-recommends \
libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
fi
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
echo "nvidia-cuda-${CUDA_MAJOR_VERSION}" > /run/localai/capability
fi
EOT
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "arm64" ]; then
echo "nvidia-l4t-cuda-${CUDA_MAJOR_VERSION}" > /run/localai/capability
fi
EOT
# https://github.com/NVIDIA/Isaac-GR00T/issues/343
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "arm64" ]; then
wget https://developer.download.nvidia.com/compute/cudss/0.6.0/local_installers/cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0_0.6.0-1_arm64.deb && \
dpkg -i cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0_0.6.0-1_arm64.deb && \
cp /var/cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0/cudss-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && apt-get -y install cudss cudss-cuda-${CUDA_MAJOR_VERSION} && \
wget https://developer.download.nvidia.com/compute/nvpl/25.5/local_installers/nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5_1.0-1_arm64.deb && \
dpkg -i nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5_1.0-1_arm64.deb && \
cp /var/nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5/nvpl-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && apt-get install -y nvpl
fi
EOT
# If we are building with clblas support, we need the libraries for the builds
RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
libclblast-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* \
; fi
RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
hipblas-dev \
rocblas-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
echo "amd" > /run/localai/capability && \
# I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
# to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
ldconfig \
; fi
RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
ln -s /opt/rocm-**/lib/llvm/lib/libomp.so /usr/lib/libomp.so \
; fi
RUN expr "${BUILD_TYPE}" = intel && echo "intel" > /run/localai/capability || echo "not intel"
# Cuda
ENV PATH=/usr/local/cuda/bin:${PATH}
# HipBLAS requirements
ENV PATH=/opt/rocm/bin:${PATH}
###################################
###################################


@@ -174,9 +174,15 @@ EOT
COPY backend/python/${BACKEND} /${BACKEND}
COPY backend/backend.proto /${BACKEND}/backend.proto
COPY backend/python/common/ /${BACKEND}/common
COPY scripts/build/package-gpu-libs.sh /package-gpu-libs.sh
RUN cd /${BACKEND} && PORTABLE_PYTHON=true make
# Package GPU libraries into the backend's lib directory
RUN mkdir -p /${BACKEND}/lib && \
TARGET_LIB_DIR="/${BACKEND}/lib" BUILD_TYPE="${BUILD_TYPE}" CUDA_MAJOR_VERSION="${CUDA_MAJOR_VERSION}" \
bash /package-gpu-libs.sh "/${BACKEND}/lib"
FROM scratch
ARG BACKEND=rerankers
COPY --from=builder /${BACKEND}/ /


@@ -6,6 +6,7 @@
set -e
CURDIR=$(dirname "$(realpath $0)")
REPO_ROOT="${CURDIR}/../../.."
# Create lib directory
mkdir -p $CURDIR/package/lib
@@ -37,6 +38,15 @@ else
exit 1
fi
# Package GPU libraries based on BUILD_TYPE
# The GPU library packaging script will detect BUILD_TYPE and copy appropriate GPU libraries
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
if [ -f "$GPU_LIB_SCRIPT" ]; then
echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
package_gpu_libs
fi
echo "Packaging completed successfully"
ls -liah $CURDIR/package/
ls -liah $CURDIR/package/lib/


@@ -6,6 +6,7 @@
set -e
CURDIR=$(dirname "$(realpath $0)")
REPO_ROOT="${CURDIR}/../../.."
# Create lib directory
mkdir -p $CURDIR/package/lib
@@ -50,6 +51,15 @@ else
exit 1
fi
# Package GPU libraries based on BUILD_TYPE
# The GPU library packaging script will detect BUILD_TYPE and copy appropriate GPU libraries
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
if [ -f "$GPU_LIB_SCRIPT" ]; then
echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
package_gpu_libs
fi
echo "Packaging completed successfully"
ls -liah $CURDIR/package/
ls -liah $CURDIR/package/lib/


@@ -6,6 +6,7 @@
set -e
CURDIR=$(dirname "$(realpath $0)")
REPO_ROOT="${CURDIR}/../../.."
# Create lib directory
mkdir -p $CURDIR/package/lib
@@ -50,6 +51,15 @@ else
exit 1
fi
# Package GPU libraries based on BUILD_TYPE
# The GPU library packaging script will detect BUILD_TYPE and copy appropriate GPU libraries
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
if [ -f "$GPU_LIB_SCRIPT" ]; then
echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
package_gpu_libs
fi
echo "Packaging completed successfully"
ls -liah $CURDIR/package/
ls -liah $CURDIR/package/lib/


@@ -465,6 +465,14 @@ function startBackend() {
if [ "x${PORTABLE_PYTHON}" == "xtrue" ] || [ -x "$(_portable_python)" ]; then
_makeVenvPortable --update-pyvenv-cfg
fi
# Set up GPU library paths if a lib directory exists
# This allows backends to include their own GPU libraries (CUDA, ROCm, etc.)
if [ -d "${EDIR}/lib" ]; then
export LD_LIBRARY_PATH="${EDIR}/lib:${LD_LIBRARY_PATH:-}"
echo "Added ${EDIR}/lib to LD_LIBRARY_PATH for GPU libraries"
fi
if [ ! -z "${BACKEND_FILE:-}" ]; then
exec "${EDIR}/venv/bin/python" "${BACKEND_FILE}" "$@"
elif [ -e "${MY_DIR}/server.py" ]; then

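Taken together with the lib packaging above, a started backend resolves GPU libraries from its own directory. A hedged illustration (the backend directory and library name are hypothetical, not taken from this change):

# Layout of a packaged Python backend after the build stage (illustrative):
#   /diffusers/run.sh    entrypoint that sources libbackend.sh
#   /diffusers/venv/     portable Python environment
#   /diffusers/lib/      GPU libraries copied by package-gpu-libs.sh
# startBackend() then prepends lib/ to the loader search path before launching:
EDIR=/diffusers                                    # hypothetical backend directory
export LD_LIBRARY_PATH="${EDIR}/lib:${LD_LIBRARY_PATH:-}"
"${EDIR}/venv/bin/python" -c 'import ctypes; ctypes.CDLL("libcudart.so.12")'   # resolved from ${EDIR}/lib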
scripts/build/package-gpu-libs.sh Executable file

@@ -0,0 +1,281 @@
#!/bin/bash
# Script to package GPU libraries based on BUILD_TYPE
# This script copies GPU-specific runtime libraries to a target lib directory
# so backends can run in isolation with their own GPU libraries.
#
# Usage: source package-gpu-libs.sh TARGET_LIB_DIR
# package_gpu_libs
#
# Environment variables:
# BUILD_TYPE - The GPU build type (cublas, l4t, hipblas, sycl_f16, sycl_f32, intel, vulkan)
# CUDA_MAJOR_VERSION - CUDA major version (for cublas/l4t builds)
#
# This enables backends to be fully self-contained and run on a unified base image
# without requiring GPU drivers to be pre-installed in the host image.
set -e
TARGET_LIB_DIR="${1:-./lib}"
# Create target directory if it doesn't exist
mkdir -p "$TARGET_LIB_DIR"
# Helper function to copy library and follow symlinks
copy_lib() {
local src="$1"
if [ -e "$src" ]; then
cp -arfLv "$src" "$TARGET_LIB_DIR/" 2>/dev/null || true
fi
}
# Helper function to copy all matching libraries from a glob pattern
copy_libs_glob() {
local pattern="$1"
# Use nullglob option to handle non-matching patterns gracefully
local old_nullglob=$(shopt -p nullglob)
shopt -s nullglob
local matched=($pattern)
eval "$old_nullglob"
for lib in "${matched[@]}"; do
if [ -e "$lib" ]; then
copy_lib "$lib"
fi
done
}
# Package NVIDIA CUDA libraries
package_cuda_libs() {
echo "Packaging CUDA libraries for BUILD_TYPE=${BUILD_TYPE}..."
local cuda_lib_paths=(
"/usr/local/cuda/lib64"
"/usr/local/cuda-${CUDA_MAJOR_VERSION:-}/lib64"
"/usr/lib/x86_64-linux-gnu"
"/usr/lib/aarch64-linux-gnu"
)
# Core CUDA runtime libraries
local cuda_libs=(
"libcudart.so*"
"libcublas.so*"
"libcublasLt.so*"
"libcufft.so*"
"libcurand.so*"
"libcusparse.so*"
"libcusolver.so*"
"libnvrtc.so*"
"libnvrtc-builtins.so*"
"libcudnn.so*"
"libcudnn_ops.so*"
"libcudnn_cnn.so*"
"libnvJitLink.so*"
"libnvinfer.so*"
"libnvonnxparser.so*"
)
for lib_path in "${cuda_lib_paths[@]}"; do
if [ -d "$lib_path" ]; then
for lib_pattern in "${cuda_libs[@]}"; do
copy_libs_glob "${lib_path}/${lib_pattern}"
done
fi
done
# Copy CUDA target directory for runtime compilation support
if [ -d "/usr/local/cuda/targets" ]; then
mkdir -p "$TARGET_LIB_DIR/../cuda"
cp -arfL /usr/local/cuda/targets "$TARGET_LIB_DIR/../cuda/" 2>/dev/null || true
fi
echo "CUDA libraries packaged successfully"
}
# Package AMD ROCm/HIPBlas libraries
package_rocm_libs() {
echo "Packaging ROCm/HIPBlas libraries for BUILD_TYPE=${BUILD_TYPE}..."
local rocm_lib_paths=(
"/opt/rocm/lib"
"/opt/rocm/lib64"
"/opt/rocm/hip/lib"
)
# Find the actual ROCm versioned directory
for rocm_dir in /opt/rocm-*; do
if [ -d "$rocm_dir/lib" ]; then
rocm_lib_paths+=("$rocm_dir/lib")
fi
done
# Core ROCm/HIP runtime libraries
local rocm_libs=(
"libamdhip64.so*"
"libhipblas.so*"
"librocblas.so*"
"librocrand.so*"
"librocsparse.so*"
"librocsolver.so*"
"librocfft.so*"
"libMIOpen.so*"
"libroctx64.so*"
"libhsa-runtime64.so*"
"libamd_comgr.so*"
"libhip_hcc.so*"
"libhiprtc.so*"
)
for lib_path in "${rocm_lib_paths[@]}"; do
if [ -d "$lib_path" ]; then
for lib_pattern in "${rocm_libs[@]}"; do
copy_libs_glob "${lib_path}/${lib_pattern}"
done
fi
done
# Copy rocblas library data (tuning files, etc.)
local old_nullglob=$(shopt -p nullglob)
shopt -s nullglob
local rocm_dirs=(/opt/rocm /opt/rocm-*)
eval "$old_nullglob"
for rocm_base in "${rocm_dirs[@]}"; do
if [ -d "$rocm_base/lib/rocblas" ]; then
mkdir -p "$TARGET_LIB_DIR/rocblas"
cp -arfL "$rocm_base/lib/rocblas/"* "$TARGET_LIB_DIR/rocblas/" 2>/dev/null || true
fi
done
# Copy libomp from LLVM (required for ROCm)
shopt -s nullglob
local omp_libs=(/opt/rocm*/lib/llvm/lib/libomp.so*)
eval "$old_nullglob"
for omp_path in "${omp_libs[@]}"; do
if [ -e "$omp_path" ]; then
copy_lib "$omp_path"
fi
done
echo "ROCm libraries packaged successfully"
}
# Package Intel oneAPI/SYCL libraries
package_intel_libs() {
echo "Packaging Intel oneAPI/SYCL libraries for BUILD_TYPE=${BUILD_TYPE}..."
local intel_lib_paths=(
"/opt/intel/oneapi/compiler/latest/lib"
"/opt/intel/oneapi/mkl/latest/lib/intel64"
"/opt/intel/oneapi/tbb/latest/lib/intel64/gcc4.8"
)
# Core Intel oneAPI runtime libraries
local intel_libs=(
"libsycl.so*"
"libOpenCL.so*"
"libmkl_core.so*"
"libmkl_intel_lp64.so*"
"libmkl_intel_thread.so*"
"libmkl_sequential.so*"
"libmkl_sycl.so*"
"libiomp5.so*"
"libsvml.so*"
"libirng.so*"
"libimf.so*"
"libintlc.so*"
"libtbb.so*"
"libtbbmalloc.so*"
"libpi_level_zero.so*"
"libpi_opencl.so*"
"libze_loader.so*"
)
for lib_path in "${intel_lib_paths[@]}"; do
if [ -d "$lib_path" ]; then
for lib_pattern in "${intel_libs[@]}"; do
copy_libs_glob "${lib_path}/${lib_pattern}"
done
fi
done
echo "Intel oneAPI libraries packaged successfully"
}
# Package Vulkan libraries
package_vulkan_libs() {
echo "Packaging Vulkan libraries for BUILD_TYPE=${BUILD_TYPE}..."
local vulkan_lib_paths=(
"/usr/lib/x86_64-linux-gnu"
"/usr/lib/aarch64-linux-gnu"
"/usr/local/lib"
)
# Core Vulkan runtime libraries
local vulkan_libs=(
"libvulkan.so*"
"libshaderc_shared.so*"
"libSPIRV.so*"
"libSPIRV-Tools.so*"
"libglslang.so*"
)
for lib_path in "${vulkan_lib_paths[@]}"; do
if [ -d "$lib_path" ]; then
for lib_pattern in "${vulkan_libs[@]}"; do
copy_libs_glob "${lib_path}/${lib_pattern}"
done
fi
done
# Copy Vulkan ICD files
if [ -d "/usr/share/vulkan/icd.d" ]; then
mkdir -p "$TARGET_LIB_DIR/../vulkan/icd.d"
cp -arfL /usr/share/vulkan/icd.d/* "$TARGET_LIB_DIR/../vulkan/icd.d/" 2>/dev/null || true
fi
echo "Vulkan libraries packaged successfully"
}
# Main function to package GPU libraries based on BUILD_TYPE
package_gpu_libs() {
local build_type="${BUILD_TYPE:-}"
echo "Packaging GPU libraries for BUILD_TYPE=${build_type}..."
case "$build_type" in
cublas|l4t)
package_cuda_libs
;;
hipblas)
package_rocm_libs
;;
sycl_f16|sycl_f32|intel)
package_intel_libs
;;
vulkan)
package_vulkan_libs
;;
""|cpu)
echo "No GPU libraries to package for BUILD_TYPE=${build_type}"
;;
*)
echo "Unknown BUILD_TYPE: ${build_type}, skipping GPU library packaging"
;;
esac
echo "GPU library packaging complete. Contents of ${TARGET_LIB_DIR}:"
ls -la "$TARGET_LIB_DIR/" 2>/dev/null || echo " (empty or not created)"
}
# Export the function so it can be sourced and called
export -f package_gpu_libs
export -f copy_lib
export -f copy_libs_glob
export -f package_cuda_libs
export -f package_rocm_libs
export -f package_intel_libs
export -f package_vulkan_libs
# If script is run directly (not sourced), execute the packaging
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
package_gpu_libs
fi
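For reference, the script can either be executed directly (the guard above calls package_gpu_libs at the end) or sourced so the caller decides when packaging happens, as the backend package.sh scripts do. A minimal usage sketch, with an illustrative target path:

# Direct invocation: the trailing guard runs package_gpu_libs immediately.
BUILD_TYPE=cublas CUDA_MAJOR_VERSION=12 \
  bash scripts/build/package-gpu-libs.sh /tmp/backend-package/lib

# Sourced invocation: load the helpers, then call the entry point explicitly.
source scripts/build/package-gpu-libs.sh /tmp/backend-package/lib
package_gpu_libs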