Deploy SR-IOV Network with RDMA

Step 1: Create NicClusterPolicy

apiVersion: mellanox.com/v1alpha1
kind: NicClusterPolicy
metadata:
  name: nic-cluster-policy
spec:
  nvIpam:
    image: nvidia-k8s-ipam
    repository: nvcr.io/nvidia/mellanox
    version: network-operator-v25.10.0
    enableWebhook: false
  secondaryNetwork:
    cniPlugins:
      image: plugins
      repository: nvcr.io/nvidia/mellanox
      version: network-operator-v25.10.0
    multus:
      image: multus-cni
      repository: nvcr.io/nvidia/mellanox
      version: network-operator-v25.10.0

Apply the manifest:

kubectl apply -f nicclusterpolicy.yaml
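
Optionally, before moving on, confirm that the operator has rolled out the secondary-network components. This is an extra check (not part of the original flow) and assumes the Network Operator runs in the nvidia-network-operator namespace:

# The policy status should eventually report a ready state
kubectl get nicclusterpolicy nic-cluster-policy -o yaml

# The nv-ipam, Multus, and CNI plugin pods should be Running on each node
kubectl -n nvidia-network-operator get pods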

Step 2: Create IPPool for nv-ipam

apiVersion: nv-ipam.nvidia.com/v1alpha1
kind: IPPool
metadata:
  name: sriov-pool
  namespace: nvidia-network-operator
spec:
  subnet: 192.168.2.0/24
  perNodeBlockSize: 50
  gateway: 192.168.2.1

Apply the manifest:

kubectl apply -f ippool.yaml
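
As a sanity check, you can inspect the pool; once nodes register, per-node block allocations should appear in its status. The resource name below (ippools.nv-ipam.nvidia.com) is the nv-ipam CRD that ships with the Network Operator:

kubectl -n nvidia-network-operator get ippools.nv-ipam.nvidia.com sriov-pool -o yaml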

Step 3: Configure SR-IOV

apiVersion: sriovnetwork.openshift.io/v1
kind: SriovNetworkNodePolicy
metadata:
  name: ethernet-sriov
  namespace: nvidia-network-operator
spec:
  deviceType: netdevice
  mtu: 1500
  nodeSelector:
    feature.node.kubernetes.io/pci-15b3.present: "true"
  nicSelector:
    vendor: "15b3"
  isRdma: true
  numVfs: 8
  priority: 90
  resourceName: sriov_resource

Apply the manifest:

kubectl apply -f sriovnetworknodepolicy.yaml
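
Configuring VFs can take several minutes and may briefly reconfigure the selected NICs. One way to watch progress (assuming the nvidia.com resource prefix used by the test pods below) is to check the per-node state objects and the node's allocatable resources:

# Sync status per node is reported by the SR-IOV operator
kubectl -n nvidia-network-operator get sriovnetworknodestates.sriovnetwork.openshift.io

# Replace <node-name> with one of your worker nodes; the VFs should appear as nvidia.com/sriov_resource
kubectl describe node <node-name> | grep -i sriov_resource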

Step 4: Create SR-IOV Network

apiVersion: sriovnetwork.openshift.io/v1
kind: SriovNetwork
metadata:
  name: sriov-rdma-network
  namespace: nvidia-network-operator
spec:
  ipam: |
    {
      "type": "nv-ipam",
      "poolName": "sriov-pool"
    }
  networkNamespace: default
  resourceName: sriov_resource

Apply the manifest:

kubectl apply -f sriovnetwork.yaml
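
Creating the SriovNetwork should produce a NetworkAttachmentDefinition with the same name in the target namespace (default here). A quick check:

kubectl -n default get network-attachment-definitions.k8s.cni.cncf.io sriov-rdma-network -o yaml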

Step 5: Deploy test workloads

---
apiVersion: v1
kind: Pod
metadata:
  name: sriov-rdma-server
  namespace: default
  labels:
    app: sriov-rdma
    role: server
  annotations:
    k8s.v1.cni.cncf.io/networks: sriov-rdma-network
spec:
  tolerations:
  - key: "node-role.kubernetes.io/control-plane"
    operator: "Exists"
    effect: "NoSchedule"
  - key: "node-role.kubernetes.io/master"
    operator: "Exists"
    effect: "NoSchedule"
  restartPolicy: Never
  containers:
  - name: rdma-test
    image: nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host
    command: ["/bin/bash", "-c", "sleep infinity"]
    securityContext:
      capabilities:
        add: ["IPC_LOCK"]
      privileged: true
    resources:
      requests:
        nvidia.com/sriov_resource: "1"
      limits:
        nvidia.com/sriov_resource: "1"
---
apiVersion: v1
kind: Pod
metadata:
  name: sriov-rdma-client
  namespace: default
  labels:
    app: sriov-rdma
    role: client
  annotations:
    k8s.v1.cni.cncf.io/networks: sriov-rdma-network
spec:
  affinity:
    podAntiAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
      - labelSelector:
          matchExpressions:
          - key: role
            operator: In
            values:
            - server
        topologyKey: kubernetes.io/hostname
  restartPolicy: Never
  containers:
  - name: rdma-test
    image: nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host
    command: ["/bin/bash", "-c", "sleep infinity"]
    securityContext:
      capabilities:
        add: ["IPC_LOCK"]
      privileged: true
    resources:
      requests:
        nvidia.com/sriov_resource: "1"
      limits:
        nvidia.com/sriov_resource: "1"

Apply the manifest:

kubectl apply -f pod.yaml
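
The pods can only schedule on nodes that expose nvidia.com/sriov_resource, so they may stay Pending while the VFs are still being configured. As a convenience (not required by the original steps), you can wait for both pods to become Ready:

kubectl -n default wait --for=condition=Ready pod/sriov-rdma-server pod/sriov-rdma-client --timeout=300s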

Step 6: Verify the deployment

Check that the pods are running on different nodes:

kubectl get pods -n default -o wide

Verify RDMA devices are available in the pods:

kubectl -n default exec sriov-rdma-server -- ibv_devices
kubectl -n default exec sriov-rdma-client -- ibv_devices

Capture the server IP and RDMA device names in environment variables:

export SERVER_IP=$(kubectl get pod sriov-rdma-server -n default -o jsonpath='{.metadata.annotations.k8s\.v1\.cni\.cncf\.io/network-status}' | jq -r '.[] | select(.name=="default/sriov-rdma-network") | .ips[0]')
export SERVER_RDMA_DEV=$(kubectl -n default exec sriov-rdma-server -- ibv_devices | awk 'NR==3 {print $1}')
export CLIENT_RDMA_DEV=$(kubectl -n default exec sriov-rdma-client -- ibv_devices | awk 'NR==3 {print $1}')

echo "Server IP: $SERVER_IP"
echo "Server RDMA Device: $SERVER_RDMA_DEV"
echo "Client RDMA Device: $CLIENT_RDMA_DEV"

Step 7: Test RDMA connectivity

Start the RDMA bandwidth test server:

kubectl -n default exec -it sriov-rdma-server -- bash -lc "ib_write_bw -d $SERVER_RDMA_DEV -R -a --report_gbits"

In a separate terminal, run the RDMA bandwidth test client:

kubectl -n default exec -it sriov-rdma-client -- bash -lc "ib_write_bw -d $CLIENT_RDMA_DEV -R -a --report_gbits $SERVER_IP"

Note

The commands above use the first RDMA device reported by ibv_devices in each pod. If you need a different device, set the environment variables manually or substitute the device names directly in the commands.
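
Optionally, you can also measure latency with the perftest tools shipped in the same DOCA image; ib_write_lat accepts the same device and rdma_cm flags as the bandwidth test (this is an extra check, not required by the deployment):

# Server side
kubectl -n default exec -it sriov-rdma-server -- bash -lc "ib_write_lat -d $SERVER_RDMA_DEV -R"

# Client side, in a separate terminal
kubectl -n default exec -it sriov-rdma-client -- bash -lc "ib_write_lat -d $CLIENT_RDMA_DEV -R $SERVER_IP"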

Complete Configuration

apiVersion: mellanox.com/v1alpha1
kind: NicClusterPolicy
metadata:
  name: nic-cluster-policy
spec:
  nvIpam:
    image: nvidia-k8s-ipam
    repository: nvcr.io/nvidia/mellanox
    version: network-operator-v25.10.0
    enableWebhook: false
  secondaryNetwork:
    cniPlugins:
      image: plugins
      repository: nvcr.io/nvidia/mellanox
      version: network-operator-v25.10.0
    multus:
      image: multus-cni
      repository: nvcr.io/nvidia/mellanox
      version: network-operator-v25.10.0

---
apiVersion: nv-ipam.nvidia.com/v1alpha1
kind: IPPool
metadata:
  name: sriov-pool
  namespace: nvidia-network-operator
spec:
  subnet: 192.168.2.0/24
  perNodeBlockSize: 50
  gateway: 192.168.2.1
---
apiVersion: sriovnetwork.openshift.io/v1
kind: SriovNetworkNodePolicy
metadata:
  name: ethernet-sriov
  namespace: nvidia-network-operator
spec:
  deviceType: netdevice
  mtu: 1500
  nodeSelector:
    feature.node.kubernetes.io/pci-15b3.present: "true"
  nicSelector:
    vendor: "15b3"
  isRdma: true
  numVfs: 8
  priority: 90
  resourceName: sriov_resource
---
apiVersion: sriovnetwork.openshift.io/v1
kind: SriovNetwork
metadata:
  name: sriov-rdma-network
  namespace: nvidia-network-operator
spec:
  ipam: |
    {
      "type": "nv-ipam",
      "poolName": "sriov-pool"
    }
  networkNamespace: default
  resourceName: sriov_resource
---
apiVersion: v1
kind: Pod
metadata:
  name: sriov-rdma-server
  namespace: default
  labels:
    app: sriov-rdma
    role: server
  annotations:
    k8s.v1.cni.cncf.io/networks: sriov-rdma-network
spec:
  tolerations:
  - key: "node-role.kubernetes.io/control-plane"
    operator: "Exists"
    effect: "NoSchedule"
  - key: "node-role.kubernetes.io/master"
    operator: "Exists"
    effect: "NoSchedule"
  restartPolicy: Never
  containers:
  - name: rdma-test
    image: nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host
    command: ["/bin/bash", "-c", "sleep infinity"]
    securityContext:
      capabilities:
        add: ["IPC_LOCK"]
      privileged: true
    resources:
      requests:
        nvidia.com/sriov_resource: "1"
      limits:
        nvidia.com/sriov_resource: "1"
---
apiVersion: v1
kind: Pod
metadata:
  name: sriov-rdma-client
  namespace: default
  labels:
    app: sriov-rdma
    role: client
  annotations:
    k8s.v1.cni.cncf.io/networks: sriov-rdma-network
spec:
  affinity:
    podAntiAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
      - labelSelector:
          matchExpressions:
          - key: role
            operator: In
            values:
            - server
        topologyKey: kubernetes.io/hostname
  restartPolicy: Never
  containers:
  - name: rdma-test
    image: nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host
    command: ["/bin/bash", "-c", "sleep infinity"]
    securityContext:
      capabilities:
        add: ["IPC_LOCK"]
      privileged: true
    resources:
      requests:
        nvidia.com/sriov_resource: "1"
      limits:
        nvidia.com/sriov_resource: "1"
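
If you prefer to apply everything in one step, the combined manifest above can be saved to a single file and applied at once; the file name is only an example, and this assumes the CRDs referenced above are already installed by the Network Operator:

kubectl apply -f sriov-rdma-complete.yaml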