[GEDI] [PATCH-for-9.1] rdma: Remove RDMA subsystem and pvrdma device
Paolo Bonzini
pbonzini at redhat.com
Thu Mar 28 07:51:34 UTC 2024
On Wed, Mar 27, 2024 at 11:56, Philippe Mathieu-Daudé <philmd at linaro.org>
wrote:
> The whole RDMA subsystem was deprecated in commit e9a54265f5
> ("hw/rdma: Deprecate the pvrdma device and the rdma subsystem")
> released in v8.2. Time to remove it.
>
> Keep the RAM_SAVE_FLAG_HOOK definition since it might appear
> in old migration streams.
>
> Remove the dependencies on libibumad and libibverbs.
>
> Remove the generated vmw_pvrdma/ directory from linux-headers.
>
> Remove RDMA handling from migration.
>
> Remove RDMA handling in GlusterFS block driver.
>
I don't think these two were deprecated? They are unrelated to pvrdma.
Paolo
> Remove rdmacm-mux tool from contrib/.
>
> Remove PVRDMA device.
>
> Cc: Peter Xu <peterx at redhat.com>
> Cc: Li Zhijian <lizhijian at fujitsu.com>
> Cc: Yuval Shaia <yuval.shaia.ml at gmail.com>
> Cc: Marcel Apfelbaum <marcel.apfelbaum at gmail.com>
> Signed-off-by: Philippe Mathieu-Daudé <philmd at linaro.org>
> ---
> MAINTAINERS | 17 -
> docs/about/deprecated.rst | 9 -
> docs/about/removed-features.rst | 4 +
> docs/devel/migration/main.rst | 6 -
> docs/pvrdma.txt | 345 --
> docs/rdma.txt | 420 --
> docs/system/device-url-syntax.rst.inc | 4 +-
> docs/system/loongarch/virt.rst | 2 +-
> docs/system/qemu-block-drivers.rst.inc | 1 -
> meson.build | 59 -
> qapi/machine.json | 17 -
> qapi/migration.json | 31 +-
> qapi/qapi-schema.json | 1 -
> qapi/rdma.json | 38 -
> contrib/rdmacm-mux/rdmacm-mux.h | 61 -
> hw/rdma/rdma_backend.h | 129 -
> hw/rdma/rdma_backend_defs.h | 76 -
> hw/rdma/rdma_rm.h | 97 -
> hw/rdma/rdma_rm_defs.h | 146 -
> hw/rdma/rdma_utils.h | 63 -
> hw/rdma/trace.h | 1 -
> hw/rdma/vmw/pvrdma.h | 144 -
> hw/rdma/vmw/pvrdma_dev_ring.h | 46 -
> hw/rdma/vmw/pvrdma_qp_ops.h | 28 -
> hw/rdma/vmw/trace.h | 1 -
> include/hw/rdma/rdma.h | 37 -
> include/monitor/hmp.h | 1 -
> .../infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h | 685 ---
> .../infiniband/hw/vmw_pvrdma/pvrdma_verbs.h | 348 --
> .../standard-headers/rdma/vmw_pvrdma-abi.h | 310 --
> migration/migration-stats.h | 6 +-
> migration/migration.h | 9 -
> migration/options.h | 2 -
> migration/rdma.h | 69 -
> block/gluster.c | 39 -
> contrib/rdmacm-mux/main.c | 831 ----
> hw/core/machine-qmp-cmds.c | 32 -
> hw/rdma/rdma.c | 30 -
> hw/rdma/rdma_backend.c | 1401 ------
> hw/rdma/rdma_rm.c | 812 ----
> hw/rdma/rdma_utils.c | 126 -
> hw/rdma/vmw/pvrdma_cmd.c | 815 ----
> hw/rdma/vmw/pvrdma_dev_ring.c | 141 -
> hw/rdma/vmw/pvrdma_main.c | 735 ---
> hw/rdma/vmw/pvrdma_qp_ops.c | 298 --
> migration/migration-stats.c | 5 +-
> migration/migration.c | 31 -
> migration/options.c | 16 -
> migration/qemu-file.c | 1 -
> migration/ram.c | 86 +-
> migration/rdma.c | 4184 -----------------
> migration/savevm.c | 2 +-
> monitor/qmp-cmds.c | 1 -
> Kconfig.host | 3 -
> contrib/rdmacm-mux/meson.build | 7 -
> hmp-commands-info.hx | 13 -
> hw/Kconfig | 1 -
> hw/meson.build | 1 -
> hw/rdma/Kconfig | 3 -
> hw/rdma/meson.build | 12 -
> hw/rdma/trace-events | 31 -
> hw/rdma/vmw/trace-events | 17 -
> meson_options.txt | 4 -
> migration/meson.build | 1 -
> migration/trace-events | 68 +-
> qapi/meson.build | 1 -
> qemu-options.hx | 6 -
> .../ci/org.centos/stream/8/x86_64/configure | 1 -
> scripts/ci/setup/build-environment.yml | 2 -
> scripts/coverity-scan/run-coverity-scan | 2 +-
> scripts/meson-buildoptions.sh | 6 -
> scripts/update-linux-headers.sh | 27 -
> tests/lcitool/projects/qemu.yml | 2 -
> tests/migration/guestperf/engine.py | 4 +-
> 74 files changed, 20 insertions(+), 12991 deletions(-)
> delete mode 100644 docs/pvrdma.txt
> delete mode 100644 docs/rdma.txt
> delete mode 100644 qapi/rdma.json
> delete mode 100644 contrib/rdmacm-mux/rdmacm-mux.h
> delete mode 100644 hw/rdma/rdma_backend.h
> delete mode 100644 hw/rdma/rdma_backend_defs.h
> delete mode 100644 hw/rdma/rdma_rm.h
> delete mode 100644 hw/rdma/rdma_rm_defs.h
> delete mode 100644 hw/rdma/rdma_utils.h
> delete mode 100644 hw/rdma/trace.h
> delete mode 100644 hw/rdma/vmw/pvrdma.h
> delete mode 100644 hw/rdma/vmw/pvrdma_dev_ring.h
> delete mode 100644 hw/rdma/vmw/pvrdma_qp_ops.h
> delete mode 100644 hw/rdma/vmw/trace.h
> delete mode 100644 include/hw/rdma/rdma.h
> delete mode 100644 include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h
> delete mode 100644 include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
> delete mode 100644 include/standard-headers/rdma/vmw_pvrdma-abi.h
> delete mode 100644 migration/rdma.h
> delete mode 100644 contrib/rdmacm-mux/main.c
> delete mode 100644 hw/rdma/rdma.c
> delete mode 100644 hw/rdma/rdma_backend.c
> delete mode 100644 hw/rdma/rdma_rm.c
> delete mode 100644 hw/rdma/rdma_utils.c
> delete mode 100644 hw/rdma/vmw/pvrdma_cmd.c
> delete mode 100644 hw/rdma/vmw/pvrdma_dev_ring.c
> delete mode 100644 hw/rdma/vmw/pvrdma_main.c
> delete mode 100644 hw/rdma/vmw/pvrdma_qp_ops.c
> delete mode 100644 migration/rdma.c
> delete mode 100644 contrib/rdmacm-mux/meson.build
> delete mode 100644 hw/rdma/Kconfig
> delete mode 100644 hw/rdma/meson.build
> delete mode 100644 hw/rdma/trace-events
> delete mode 100644 hw/rdma/vmw/trace-events
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index a07af6b9d4..05226cea0a 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -3426,13 +3426,6 @@ F: docs/devel/migration.rst
> F: qapi/migration.json
> F: tests/migration/
> F: util/userfaultfd.c
> -X: migration/rdma*
> -
> -RDMA Migration
> -R: Li Zhijian <lizhijian at fujitsu.com>
> -R: Peter Xu <peterx at redhat.com>
> -S: Odd Fixes
> -F: migration/rdma*
>
> Migration dirty limit and dirty page rate
> M: Hyman Huang <yong.huang at smartx.com>
> @@ -4060,16 +4053,6 @@ F: block/replication.c
> F: tests/unit/test-replication.c
> F: docs/block-replication.txt
>
> -PVRDMA
> -M: Yuval Shaia <yuval.shaia.ml at gmail.com>
> -M: Marcel Apfelbaum <marcel.apfelbaum at gmail.com>
> -S: Odd Fixes
> -F: hw/rdma/*
> -F: hw/rdma/vmw/*
> -F: docs/pvrdma.txt
> -F: contrib/rdmacm-mux/*
> -F: qapi/rdma.json
> -
> Semihosting
> M: Alex Bennée <alex.bennee at linaro.org>
> S: Maintained
> diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
> index 7b548519b5..29eae69e50 100644
> --- a/docs/about/deprecated.rst
> +++ b/docs/about/deprecated.rst
> @@ -376,15 +376,6 @@ recommending to switch to their stable counterparts:
> - "Zve64f" should be replaced with "zve64f"
> - "Zve64d" should be replaced with "zve64d"
>
> -``-device pvrdma`` and the rdma subsystem (since 8.2)
> -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> -
> -The pvrdma device and the whole rdma subsystem are in a bad shape and
> -without active maintenance. The QEMU project intends to remove this
> -device and subsystem from the code base in a future release without
> -replacement unless somebody steps up and improves the situation.
> -
> -
> Block device options
> ''''''''''''''''''''
>
> diff --git a/docs/about/removed-features.rst
> b/docs/about/removed-features.rst
> index f9cf874f7b..4d5bdc43b4 100644
> --- a/docs/about/removed-features.rst
> +++ b/docs/about/removed-features.rst
> @@ -909,6 +909,10 @@ contains native support for this feature and thus use of the option
> ROM approach was obsolete. The native SeaBIOS support can be activated
> by using ``-machine graphics=off``.
>
> +``pvrdma`` and the RDMA subsystem (removed in 9.1)
> +''''''''''''''''''''''''''''''''''''''''''''''''''
> +
> +The 'pvrdma' device and the whole RDMA subsystem have been removed.
>
> Related binaries
> ----------------
> diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst
> index 54385a23e5..70278ce1e3 100644
> --- a/docs/devel/migration/main.rst
> +++ b/docs/devel/migration/main.rst
> @@ -47,12 +47,6 @@ over any transport.
> QEMU interference. Note that QEMU does not flush cached file
> data/metadata at the end of migration.
>
> -In addition, support is included for migration using RDMA, which
> -transports the page data using ``RDMA``, where the hardware takes care of
> -transporting the pages, and the load on the CPU is much lower. While the
> -internals of RDMA migration are a bit different, this isn't really visible
> -outside the RAM migration code.
> -
> All these migration protocols use the same infrastructure to
> save/restore state devices. This infrastructure is shared with the
> savevm/loadvm functionality.
> diff --git a/docs/pvrdma.txt b/docs/pvrdma.txt
> deleted file mode 100644
> index 5c122fe818..0000000000
> --- a/docs/pvrdma.txt
> +++ /dev/null
> @@ -1,345 +0,0 @@
> -Paravirtualized RDMA Device (PVRDMA)
> -====================================
> -
> -
> -1. Description
> -===============
> -PVRDMA is the QEMU implementation of VMware's paravirtualized RDMA device.
> -It works with its Linux Kernel driver AS IS, no need for any special guest
> -modifications.
> -
> -While it complies with the VMware device, it can also communicate with bare
> -metal RDMA-enabled machines as peers.
> -
> -It does not require an RDMA HCA in the host, it can work with Soft-RoCE (rxe).
> -
> -It does not require the whole guest RAM to be pinned allowing memory
> -over-commit and, even if not implemented yet, migration support will be
> -possible with some HW assistance.
> -
> -A project presentation accompany this document:
> -- https://blog.linuxplumbersconf.org/2017/ocw/system/presentations/4730/original/lpc-2017-pvrdma-marcel-apfelbaum-yuval-shaia.pdf
> -
> -
> -
> -2. Setup
> -========
> -
> -
> -2.1 Guest setup
> -===============
> -Fedora 27+ kernels work out of the box, older distributions
> -require updating the kernel to 4.14 to include the pvrdma driver.
> -
> -However the libpvrdma library needed by User Level Software is still
> -not available as part of the distributions, so the rdma-core library
> -needs to be compiled and optionally installed.
> -
> -Please follow the instructions at:
> - https://github.com/linux-rdma/rdma-core.git
> -
> -
> -2.2 Host Setup
> -==============
> -The pvrdma backend is an ibdevice interface that can be exposed
> -either by a Soft-RoCE(rxe) device on machines with no RDMA device,
> -or an HCA SRIOV function(VF/PF).
> -Note that ibdevice interfaces can't be shared between pvrdma devices,
> -each one requiring a separate instance (rxe or SRIOV VF).
> -
> -
> -2.2.1 Soft-RoCE backend(rxe)
> -===========================
> -A stable version of rxe is required, Fedora 27+ or a Linux
> -Kernel 4.14+ is preferred.
> -
> -The rdma_rxe module is part of the Linux Kernel but not loaded by default.
> -Install the User Level library (librxe) following the instructions from:
> -https://github.com/SoftRoCE/rxe-dev/wiki/rxe-dev:-Home
> -
> -Associate an ETH interface with rxe by running:
> - rxe_cfg add eth0
> -An rxe0 ibdevice interface will be created and can be used as pvrdma backend.
> -
> -
> -2.2.2 RDMA device Virtual Function backend
> -==========================================
> -Nothing special is required, the pvrdma device can work not only with
> -Ethernet Links, but also Infinibands Links.
> -All is needed is an ibdevice with an active port, for Mellanox cards
> -will be something like mlx5_6 which can be the backend.
> -
> -
> -2.2.3 QEMU setup
> -================
> -Configure QEMU with --enable-rdma flag, installing
> -the required RDMA libraries.
> -
> -
> -
> -3. Usage
> -========
> -
> -
> -3.1 VM Memory settings
> -======================
> -Currently the device is working only with memory backed RAM
> -and it must be mark as "shared":
> - -m 1G \
> - -object memory-backend-ram,id=mb1,size=1G,share \
> - -numa node,memdev=mb1 \
> -
> -
> -3.2 MAD Multiplexer
> -===================
> -MAD Multiplexer is a service that exposes MAD-like interface for VMs in
> -order to overcome the limitation where only single entity can register with
> -MAD layer to send and receive RDMA-CM MAD packets.
> -
> -To build rdmacm-mux run
> -# make rdmacm-mux
> -
> -Before running the rdmacm-mux make sure that both ib_cm and rdma_cm kernel
> -modules aren't loaded, otherwise the rdmacm-mux service will fail to start.
> -
> -The application accepts 3 command line arguments and exposes a UNIX socket
> -to pass control and data to it.
> --d rdma-device-name Name of RDMA device to register with
> --s unix-socket-path Path to unix socket to listen (default /var/run/rdmacm-mux)
> --p rdma-device-port Port number of RDMA device to register with (default 1)
> -The final UNIX socket file name is a concatenation of the 3 arguments so
> -for example for device mlx5_0 on port 2 this /var/run/rdmacm-mux-mlx5_0-2
> -will be created.
> -
> -pvrdma requires this service.
> -
> -Please refer to contrib/rdmacm-mux for more details.
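
For context, the socket-name rule described in the deleted text above is a
plain string concatenation. A minimal C sketch (mux_sock_path() is a
hypothetical helper, not a function from the rdmacm-mux sources):

#include <stdio.h>

/* Build "<base>-<ibdev>-<port>", e.g. /var/run/rdmacm-mux-mlx5_0-2. */
static void mux_sock_path(char *buf, size_t len,
                          const char *base, const char *ibdev, int port)
{
    snprintf(buf, len, "%s-%s-%d", base, ibdev, port);
}

int main(void)
{
    char path[128];

    mux_sock_path(path, sizeof(path), "/var/run/rdmacm-mux", "mlx5_0", 2);
    puts(path); /* prints /var/run/rdmacm-mux-mlx5_0-2 */
    return 0;
}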
> -
> -
> -3.3 Service exposed by libvirt daemon
> -=====================================
> -The control over the RDMA device's GID table is done by updating the
> -device's Ethernet function addresses.
> -Usually the first GID entry is determined by the MAC address, the second by
> -the first IPv6 address and the third by the IPv4 address. Other entries can
> -be added by adding more IP addresses. The opposite is the same, i.e.
> -whenever an address is removed, the corresponding GID entry is removed.
> -The process is done by the network and RDMA stacks. Whenever an address is
> -added the ib_core driver is notified and calls the device driver add_gid
> -function which in turn update the device.
> -To support this in pvrdma device the device hooks into the create_bind and
> -destroy_bind HW commands triggered by pvrdma driver in guest.
> -
> -Whenever changed is made to the pvrdma port's GID table a special QMP
> -messages is sent to be processed by libvirt to update the address of the
> -backend Ethernet device.
> -
> -pvrdma requires that libvirt service will be up.
> -
> -
> -3.4 PCI devices settings
> -========================
> -RoCE device exposes two functions - an Ethernet and RDMA.
> -To support it, pvrdma device is composed of two PCI functions, an Ethernet
> -device of type vmxnet3 on PCI slot 0 and a PVRDMA device on PCI slot 1. The
> -Ethernet function can be used for other Ethernet purposes such as IP.
> -
> -
> -3.5 Device parameters
> -=====================
> -- netdev: Specifies the Ethernet device function name on the host for
> - example enp175s0f0. For Soft-RoCE device (rxe) this would be the Ethernet
> - device used to create it.
> -- ibdev: The IB device name on host for example rxe0, mlx5_0 etc.
> -- mad-chardev: The name of the MAD multiplexer char device.
> -- ibport: In case of multi-port device (such as Mellanox's HCA) this
> - specify the port to use. If not set 1 will be used.
> -- dev-caps-max-mr-size: The maximum size of MR.
> -- dev-caps-max-qp: Maximum number of QPs.
> -- dev-caps-max-cq: Maximum number of CQs.
> -- dev-caps-max-mr: Maximum number of MRs.
> -- dev-caps-max-pd: Maximum number of PDs.
> -- dev-caps-max-ah: Maximum number of AHs.
> -
> -Notes:
> -- The first 3 parameters are mandatory settings, the rest have their
> - defaults.
> -- The last 8 parameters (the ones that prefixed by dev-caps) defines the top
> - limits but the final values is adjusted by the backend device limitations.
> -- netdev can be extracted from ibdev's sysfs
> - (/sys/class/infiniband/<ibdev>/device/net/)
> -
> -
> -3.6 Example
> -===========
> -Define bridge device with vmxnet3 network backend:
> -<interface type='bridge'>
> - <mac address='56:b4:44:e9:62:dc'/>
> - <source bridge='bridge1'/>
> - <model type='vmxnet3'/>
> - <address type='pci' domain='0x0000' bus='0x00' slot='0x10' function='0x0' multifunction='on'/>
> -</interface>
> -
> -Define pvrdma device:
> -<qemu:commandline>
> - <qemu:arg value='-object'/>
> - <qemu:arg value='memory-backend-ram,id=mb1,size=1G,share'/>
> - <qemu:arg value='-numa'/>
> - <qemu:arg value='node,memdev=mb1'/>
> - <qemu:arg value='-chardev'/>
> - <qemu:arg value='socket,path=/var/run/rdmacm-mux-rxe0-1,id=mads'/>
> - <qemu:arg value='-device'/>
> - <qemu:arg value='pvrdma,addr=10.1,ibdev=rxe0,netdev=bridge0,mad-chardev=mads'/>
> -</qemu:commandline>
> -
> -
> -
> -4. Implementation details
> -=========================
> -
> -
> -4.1 Overview
> -============
> -The device acts like a proxy between the Guest Driver and the host
> -ibdevice interface.
> -On configuration path:
> - - For every hardware resource request (PD/QP/CQ/...) the pvrdma will request
> - a resource from the backend interface, maintaining a 1-1 mapping
> - between the guest and host.
> -On data path:
> - - Every post_send/receive received from the guest will be converted into
> - a post_send/receive for the backend. The buffers data will not be touched
> - or copied resulting in near bare-metal performance for large enough buffers.
> - - Completions from the backend interface will result in completions for
> - the pvrdma device.
> -
> -
> -4.2 PCI BARs
> -============
> -PCI Bars:
> - BAR 0 - MSI-X
> - MSI-X vectors:
> - (0) Command - used when execution of a command is completed.
> - (1) Async - not in use.
> - (2) Completion - used when a completion event is placed in
> - device's CQ ring.
> - BAR 1 - Registers
> - --------------------------------------------------------
> - | VERSION | DSR | CTL | REQ | ERR | ICR | IMR | MAC |
> - --------------------------------------------------------
> - DSR - Address of driver/device shared memory used
> - for the command channel, used for passing:
> - - General info such as driver version
> - - Address of 'command' and 'response'
> - - Address of async ring
> - - Address of device's CQ ring
> - - Device capabilities
> - CTL - Device control operations (activate, reset etc)
> - IMG - Set interrupt mask
> - REQ - Command execution register
> - ERR - Operation status
> -
> - BAR 2 - UAR
> - ---------------------------------------------------------
> - | QP_NUM | SEND/RECV Flag || CQ_NUM | ARM/POLL Flag |
> - ---------------------------------------------------------
> - - Offset 0 used for QP operations (send and recv)
> - - Offset 4 used for CQ operations (arm and poll)
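
Read as C, the register map described above would look roughly like this;
the offsets assume consecutive 32-bit registers in the order shown (an
illustration, not the layout from the pvrdma headers being removed):

/* BAR 1 registers, assuming 4-byte slots in the order listed above. */
enum {
    REG_VERSION = 0x00,
    REG_DSR     = 0x04, /* driver/device shared memory address */
    REG_CTL     = 0x08, /* device control: activate, reset, ... */
    REG_REQ     = 0x0c, /* command execution doorbell */
    REG_ERR     = 0x10, /* operation status */
    REG_ICR     = 0x14, /* interrupt cause */
    REG_IMR     = 0x18, /* interrupt mask */
    REG_MAC     = 0x1c,
};

/* BAR 2 (UAR) doorbell offsets, as described above. */
enum {
    UAR_QP_OFFSET = 0, /* send/recv: qp_num | SEND/RECV flag */
    UAR_CQ_OFFSET = 4, /* arm/poll:  cq_num | ARM/POLL flag */
};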
> -
> -
> -4.3 Major flows
> -===============
> -
> -4.3.1 Create CQ
> -===============
> - - Guest driver
> - - Allocates pages for CQ ring
> - - Creates page directory (pdir) to hold CQ ring's pages
> - - Initializes CQ ring
> - - Initializes 'Create CQ' command object (cqe, pdir etc)
> - - Copies the command to 'command' address
> - - Writes 0 into REQ register
> - - Device
> - - Reads the request object from the 'command' address
> - - Allocates CQ object and initialize CQ ring based on pdir
> - - Creates the backend CQ
> - - Writes operation status to ERR register
> - - Posts command-interrupt to guest
> - - Guest driver
> - - Reads the HW response code from ERR register
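
The request/response handshake in this flow (and in the Create QP flow
below) can be modelled in a few lines of C; mmio_write32()/mmio_read32()
and the register offsets are made-up stand-ins for a real BAR 1 mapping:

#include <stdint.h>
#include <string.h>

enum { REG_REQ = 0x0c, REG_ERR = 0x10 }; /* hypothetical offsets */

static uint32_t bar1[32]; /* fake register file for the sketch */
static void mmio_write32(unsigned off, uint32_t v) { bar1[off / 4] = v; }
static uint32_t mmio_read32(unsigned off) { return bar1[off / 4]; }

/* Copy the command object to the 'command' address, ring the REQ
 * doorbell, then (after the command interrupt) read back ERR. */
static uint32_t exec_command(void *cmd_slot, const void *cmd, size_t len)
{
    memcpy(cmd_slot, cmd, len);
    mmio_write32(REG_REQ, 0);
    /* ... wait for MSI-X vector 0 (command interrupt) ... */
    return mmio_read32(REG_ERR);
}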
> -
> -4.3.2 Create QP
> -===============
> - - Guest driver
> - - Allocates pages for send and receive rings
> - - Creates page directory(pdir) to hold the ring's pages
> - - Initializes 'Create QP' command object (max_send_wr,
> - send_cq_handle, recv_cq_handle, pdir etc)
> - - Copies the object to 'command' address
> - - Write 0 into REQ register
> - - Device
> - - Reads the request object from 'command' address
> - - Allocates the QP object and initialize
> - - Send and recv rings based on pdir
> - - Send and recv ring state
> - - Creates the backend QP
> - - Writes the operation status to ERR register
> - - Posts command-interrupt to guest
> - - Guest driver
> - - Reads the HW response code from ERR register
> -
> -4.3.3 Post receive
> -==================
> - - Guest driver
> - - Initializes a wqe and place it on recv ring
> - - Write to qpn|qp_recv_bit (31) to QP offset in UAR
> - - Device
> - - Extracts qpn from UAR
> - - Walks through the ring and does the following for each wqe
> - - Prepares the backend CQE context to be used when
> - receiving completion from backend (wr_id, op_code, emu_cq_num)
> - - For each sge prepares backend sge
> - - Calls backend's post_recv
> -
> -4.3.4 Process backend events
> -============================
> - - Done by a dedicated thread used to process backend events;
> - at initialization is attached to the device and creates
> - the communication channel.
> - - Thread main loop:
> - - Polls for completions
> - - Extracts QEMU _cq_num, wr_id and op_code from context
> - - Writes CQE to CQ ring
> - - Writes CQ number to device CQ
> - - Sends completion-interrupt to guest
> - - Deallocates context
> - - Acks the event to backend
> -
> -
> -
> -5. Limitations
> -==============
> -- The device obviously is limited by the Guest Linux Driver features implementation
> - of the VMware device API.
> -- Memory registration mechanism requires mremap for every page in the buffer in order
> - to map it to a contiguous virtual address range. Since this is not the data path
> - it should not matter much. If the default max mr size is increased, be aware that
> - memory registration can take up to 0.5 seconds for 1GB of memory.
> -- The device requires target page size to be the same as the host page size,
> - otherwise it will fail to init.
> -- QEMU cannot map guest RAM from a file descriptor if a pvrdma device is attached,
> - so it can't work with huge pages. The limitation will be addressed in the future,
> - however QEMU allocates Guest RAM with MADV_HUGEPAGE so if there are enough huge
> - pages available, QEMU will use them. QEMU will fail to init if the requirements
> - are not met.
> -
> -
> -
> -6. Performance
> -==============
> -By design the pvrdma device exits on each post-send/receive, so for small buffers
> -the performance is affected; however for medium buffers it will became close to
> -bare metal and from 1MB buffers and up it reaches bare metal performance.
> -(tested with 2 VMs, the pvrdma devices connected to 2 VFs of the same device)
> -
> -All the above assumes no memory registration is done on data path.
> diff --git a/docs/rdma.txt b/docs/rdma.txt
> deleted file mode 100644
> index bd8dd799a9..0000000000
> --- a/docs/rdma.txt
> +++ /dev/null
> @@ -1,420 +0,0 @@
> -(RDMA: Remote Direct Memory Access)
> -RDMA Live Migration Specification, Version # 1
> -==============================================
> -Wiki: https://wiki.qemu.org/Features/RDMALiveMigration
> -Github: git at github.com:hinesmr/qemu.git, 'rdma' branch
> -
> -Copyright (C) 2013 Michael R. Hines <mrhines at us.ibm.com>
> -
> -An *exhaustive* paper (2010) shows additional performance details
> -linked on the QEMU wiki above.
> -
> -Contents:
> -=========
> -* Introduction
> -* Before running
> -* Running
> -* Performance
> -* RDMA Migration Protocol Description
> -* Versioning and Capabilities
> -* QEMUFileRDMA Interface
> -* Migration of VM's ram
> -* Error handling
> -* TODO
> -
> -Introduction:
> -=============
> -
> -RDMA helps make your migration more deterministic under heavy load because
> -of the significantly lower latency and higher throughput over TCP/IP. This is
> -because the RDMA I/O architecture reduces the number of interrupts and
> -data copies by bypassing the host networking stack. In particular, a TCP-based
> -migration, under certain types of memory-bound workloads, may take a more
> -unpredictable amount of time to complete the migration if the amount of
> -memory tracked during each live migration iteration round cannot keep pace
> -with the rate of dirty memory produced by the workload.
> -
> -RDMA currently comes in two flavors: both Ethernet based (RoCE, or RDMA
> -over Converged Ethernet) as well as Infiniband-based. This implementation of
> -migration using RDMA is capable of using both technologies because of
> -the use of the OpenFabrics OFED software stack that abstracts out the
> -programming model irrespective of the underlying hardware.
> -
> -Refer to openfabrics.org or your respective RDMA hardware vendor for
> -an understanding on how to verify that you have the OFED software stack
> -installed in your environment. You should be able to successfully link
> -against the "librdmacm" and "libibverbs" libraries and development headers
> -for a working build of QEMU to run successfully using RDMA Migration.
> -
> -BEFORE RUNNING:
> -===============
> -
> -Use of RDMA during migration requires pinning and registering memory
> -with the hardware. This means that memory must be physically resident
> -before the hardware can transmit that memory to another machine.
> -If this is not acceptable for your application or product, then the use
> -of RDMA migration may in fact be harmful to co-located VMs or other
> -software on the machine if there is not sufficient memory available to
> -relocate the entire footprint of the virtual machine. If so, then the
> -use of RDMA is discouraged and it is recommended to use standard TCP migration.
> -
> -Experimental: Next, decide if you want dynamic page registration.
> -For example, if you have an 8GB RAM virtual machine, but only 1GB
> -is in active use, then enabling this feature will cause all 8GB to
> -be pinned and resident in memory. This feature mostly affects the
> -bulk-phase round of the migration and can be enabled for extremely
> -high-performance RDMA hardware using the following command:
> -
> -QEMU Monitor Command:
> -$ migrate_set_capability rdma-pin-all on # disabled by default
> -
> -Performing this action will cause all 8GB to be pinned, so if that's
> -not what you want, then please ignore this step altogether.
> -
> -On the other hand, this will also significantly speed up the bulk round
> -of the migration, which can greatly reduce the "total" time of your migration.
> -Example performance of this using an idle VM in the previous example
> -can be found in the "Performance" section.
> -
> -Note: for very large virtual machines (hundreds of GBs), pinning all
> -*all* of the memory of your virtual machine in the kernel is very expensive
> -may extend the initial bulk iteration time by many seconds,
> -and thus extending the total migration time. However, this will not
> -affect the determinism or predictability of your migration you will
> -still gain from the benefits of advanced pinning with RDMA.
> -
> -RUNNING:
> -========
> -
> -First, set the migration speed to match your hardware's capabilities:
> -
> -QEMU Monitor Command:
> -$ migrate_set_parameter max-bandwidth 40g # or whatever is the MAX of your RDMA device
> -
> -Next, on the destination machine, add the following to the QEMU command line:
> -
> -qemu ..... -incoming rdma:host:port
> -
> -Finally, perform the actual migration on the source machine:
> -
> -QEMU Monitor Command:
> -$ migrate -d rdma:host:port
> -
> -PERFORMANCE
> -===========
> -
> -Here is a brief summary of total migration time and downtime using RDMA:
> -Using a 40gbps infiniband link performing a worst-case stress test,
> -using an 8GB RAM virtual machine:
> -
> -Using the following command:
> -$ apt-get install stress
> -$ stress --vm-bytes 7500M --vm 1 --vm-keep
> -
> -1. Migration throughput: 26 gigabits/second.
> -2. Downtime (stop time) varies between 15 and 100 milliseconds.
> -
> -EFFECTS of memory registration on bulk phase round:
> -
> -For example, in the same 8GB RAM example with all 8GB of memory in
> -active use and the VM itself is completely idle using the same 40 gbps
> -infiniband link:
> -
> -1. rdma-pin-all disabled total time: approximately 7.5 seconds @ 9.5 Gbps
> -2. rdma-pin-all enabled total time: approximately 4 seconds @ 26 Gbps
> -
> -These numbers would of course scale up to whatever size virtual machine
> -you have to migrate using RDMA.
> -
> -Enabling this feature does *not* have any measurable affect on
> -migration *downtime*. This is because, without this feature, all of the
> -memory will have already been registered already in advance during
> -the bulk round and does not need to be re-registered during the successive
> -iteration rounds.
> -
> -RDMA Protocol Description:
> -==========================
> -
> -Migration with RDMA is separated into two parts:
> -
> -1. The transmission of the pages using RDMA
> -2. Everything else (a control channel is introduced)
> -
> -"Everything else" is transmitted using a formal
> -protocol now, consisting of infiniband SEND messages.
> -
> -An infiniband SEND message is the standard ibverbs
> -message used by applications of infiniband hardware.
> -The only difference between a SEND message and an RDMA
> -message is that SEND messages cause notifications
> -to be posted to the completion queue (CQ) on the
> -infiniband receiver side, whereas RDMA messages (used
> -for VM's ram) do not (to behave like an actual DMA).
> -
> -Messages in infiniband require two things:
> -
> -1. registration of the memory that will be transmitted
> -2. (SEND only) work requests to be posted on both
> - sides of the network before the actual transmission
> - can occur.
> -
> -RDMA messages are much easier to deal with. Once the memory
> -on the receiver side is registered and pinned, we're
> -basically done. All that is required is for the sender
> -side to start dumping bytes onto the link.
> -
> -(Memory is not released from pinning until the migration
> -completes, given that RDMA migrations are very fast.)
> -
> -SEND messages require more coordination because the
> -receiver must have reserved space (using a receive
> -work request) on the receive queue (RQ) before QEMUFileRDMA
> -can start using them to carry all the bytes as
> -a control transport for migration of device state.
> -
> -To begin the migration, the initial connection setup is
> -as follows (migration-rdma.c):
> -
> -1. Receiver and Sender are started (command line or libvirt):
> -2. Both sides post two RQ work requests
> -3. Receiver does listen()
> -4. Sender does connect()
> -5. Receiver accept()
> -6. Check versioning and capabilities (described later)
> -
> -At this point, we define a control channel on top of SEND messages
> -which is described by a formal protocol. Each SEND message has a
> -header portion and a data portion (but together are transmitted
> -as a single SEND message).
> -
> -Header:
> - * Length (of the data portion, uint32, network byte order)
> - * Type (what command to perform, uint32, network byte order)
> - * Repeat (Number of commands in data portion, same type only)
> -
> -The 'Repeat' field is here to support future multiple page registrations
> -in a single message without any need to change the protocol itself
> -so that the protocol is compatible against multiple versions of QEMU.
> -Version #1 requires that all server implementations of the protocol must
> -check this field and register all requests found in the array of commands located
> -in the data portion and return an equal number of results in the response.
> -The maximum number of repeats is hard-coded to 4096. This is a conservative
> -limit based on the maximum size of a SEND message along with empirical
> -observations on the maximum future benefit of simultaneous page registrations.
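
The header layout above maps onto a 12-byte wire structure; a sketch with
illustrative names (the real definitions lived in migration/rdma.c):

#include <stdint.h>
#include <arpa/inet.h> /* htonl() */

/* Control-channel header: three uint32 fields, network byte order. */
typedef struct {
    uint32_t len;    /* length of the data portion */
    uint32_t type;   /* which command to perform */
    uint32_t repeat; /* number of same-type commands in the data portion */
} RDMAControlHeaderSketch;

static void header_to_wire(RDMAControlHeaderSketch *h)
{
    h->len = htonl(h->len);
    h->type = htonl(h->type);
    h->repeat = htonl(h->repeat);
}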
> -
> -The 'type' field has 12 different command values:
> - 1. Unused
> - 2. Error (sent to the source during bad things)
> - 3. Ready (control-channel is available)
> - 4. QEMU File (for sending non-live device state)
> - 5. RAM Blocks request (used right after connection setup)
> - 6. RAM Blocks result (used right after connection setup)
> - 7. Compress page (zap zero page and skip registration)
> - 8. Register request (dynamic chunk registration)
> - 9. Register result ('rkey' to be used by sender)
> - 10. Register finished (registration for current iteration finished)
> - 11. Unregister request (unpin previously registered memory)
> - 12. Unregister finished (confirmation that unpin completed)
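
As an enum, again with illustrative identifiers; the list above is
1-based, and the actual wire values are an assumption here:

typedef enum {
    RDMA_CTRL_UNUSED = 1,
    RDMA_CTRL_ERROR,                /* sent to the source during bad things */
    RDMA_CTRL_READY,                /* control-channel is available */
    RDMA_CTRL_QEMU_FILE,            /* non-live device state */
    RDMA_CTRL_RAM_BLOCKS_REQUEST,   /* right after connection setup */
    RDMA_CTRL_RAM_BLOCKS_RESULT,
    RDMA_CTRL_COMPRESS,             /* zap zero page, skip registration */
    RDMA_CTRL_REGISTER_REQUEST,     /* dynamic chunk registration */
    RDMA_CTRL_REGISTER_RESULT,      /* carries the 'rkey' for the sender */
    RDMA_CTRL_REGISTER_FINISHED,
    RDMA_CTRL_UNREGISTER_REQUEST,   /* unpin previously registered memory */
    RDMA_CTRL_UNREGISTER_FINISHED,
} RDMAControlTypeSketch;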
> -
> -A single control message, as hinted above, can contain within the data
> -portion an array of many commands of the same type. If there is more than
> -one command, then the 'repeat' field will be greater than 1.
> -
> -After connection setup, message 5 & 6 are used to exchange ram block
> -information and optionally pin all the memory if requested by the user.
> -
> -After ram block exchange is completed, we have two protocol-level
> -functions, responsible for communicating control-channel commands
> -using the above list of values:
> -
> -Logically:
> -
> -qemu_rdma_exchange_recv(header, expected command type)
> -
> -1. We transmit a READY command to let the sender know that
> - we are *ready* to receive some data bytes on the control channel.
> -2. Before attempting to receive the expected command, we post another
> - RQ work request to replace the one we just used up.
> -3. Block on a CQ event channel and wait for the SEND to arrive.
> -4. When the send arrives, librdmacm will unblock us.
> -5. Verify that the command-type and version received matches the one we expected.
> -
> -qemu_rdma_exchange_send(header, data, optional response header & data):
> -
> -1. Block on the CQ event channel waiting for a READY command
> - from the receiver to tell us that the receiver
> - is *ready* for us to transmit some new bytes.
> -2. Optionally: if we are expecting a response from the command
> - (that we have not yet transmitted), let's post an RQ
> - work request to receive that data a few moments later.
> -3. When the READY arrives, librdmacm will
> - unblock us and we immediately post a RQ work request
> - to replace the one we just used up.
> -4. Now, we can actually post the work request to SEND
> - the requested command type of the header we were asked for.
> -5. Optionally, if we are expecting a response (as before),
> - we block again and wait for that response using the additional
> - work request we previously posted. (This is used to carry
> - 'Register result' commands #6 back to the sender which
> - hold the rkey need to perform RDMA. Note that the virtual address
> - corresponding to this rkey was already exchanged at the beginning
> - of the connection (described below).
> -
> -All of the remaining command types (not including 'ready')
> -described above all use the aforementioned two functions to do the hard work:
> -
> -1. After connection setup, RAMBlock information is exchanged using
> - this protocol before the actual migration begins. This information includes
> - a description of each RAMBlock on the server side as well as the virtual addresses
> - and lengths of each RAMBlock. This is used by the client to determine the
> - start and stop locations of chunks and how to register them dynamically
> - before performing the RDMA operations.
> -2. During runtime, once a 'chunk' becomes full of pages ready to
> - be sent with RDMA, the registration commands are used to ask the
> - other side to register the memory for this chunk and respond
> - with the result (rkey) of the registration.
> -3. Also, the QEMUFile interfaces also call these functions (described below)
> - when transmitting non-live state, such as devices or to send
> - its own protocol information during the migration process.
> -4. Finally, zero pages are only checked if a page has not yet been registered
> - using chunk registration (or not checked at all and unconditionally
> - written if chunk registration is disabled. This is accomplished using
> - the "Compress" command listed above. If the page *has* been registered
> - then we check the entire chunk for zero. Only if the entire chunk is
> - zero, then we send a compress command to zap the page on the other side.
> -
> -Versioning and Capabilities
> -===========================
> -Current version of the protocol is version #1.
> -
> -The same version applies to both for protocol traffic and capabilities
> -negotiation. (i.e. There is only one version number that is referred to
> -by all communication).
> -
> -librdmacm provides the user with a 'private data' area to be exchanged
> -at connection-setup time before any infiniband traffic is generated.
> -
> -Header:
> - * Version (protocol version validated before send/recv occurs),
> - uint32, network byte order
> - * Flags (bitwise OR of each capability),
> - uint32, network byte order
> -
> -There is no data portion of this header right now, so there is
> -no length field. The maximum size of the 'private data' section
> -is only 192 bytes per the Infiniband specification, so it's not
> -very useful for data anyway. This structure needs to remain small.
> -
> -This private data area is a convenient place to check for protocol
> -versioning because the user does not need to register memory to
> -transmit a few bytes of version information.
> -
> -This is also a convenient place to negotiate capabilities
> -(like dynamic page registration).
> -
> -If the version is invalid, we throw an error.
> -
> -If the version is new, we only negotiate the capabilities that the
> -requested version is able to perform and ignore the rest.
> -
> -Currently there is only one capability in Version #1: dynamic page registration
> -
> -Finally: Negotiation happens with the Flags field: If the primary-VM
> -sets a flag, but the destination does not support this capability, it
> -will return a zero-bit for that flag and the primary-VM will understand
> -that as not being an available capability and will thus disable that
> -capability on the primary-VM side.
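
Concretely, the private-data header and the flag negotiation described
above fit in a few lines; the struct and the capability bit value are
illustrative, not the definitions from migration/rdma.c:

#include <stdint.h>

#define RDMA_CAP_PIN_ALL (1u << 0) /* assumed bit for rdma-pin-all */

/* Exchanged in librdmacm private data; both fields network byte order. */
typedef struct {
    uint32_t version; /* validated before any send/recv occurs */
    uint32_t flags;   /* bitwise OR of requested capabilities */
} RDMACapHeaderSketch;

/* Destination side: keep only the bits both ends support, so an
 * unsupported capability reads back as a zero bit on the source. */
static uint32_t negotiate_caps(uint32_t requested, uint32_t supported)
{
    return requested & supported;
}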
> -
> -QEMUFileRDMA Interface:
> -=======================
> -
> -QEMUFileRDMA introduces a couple of new functions:
> -
> -1. qemu_rdma_get_buffer() (QEMUFileOps rdma_read_ops)
> -2. qemu_rdma_put_buffer() (QEMUFileOps rdma_write_ops)
> -
> -These two functions are very short and simply use the protocol
> -describe above to deliver bytes without changing the upper-level
> -users of QEMUFile that depend on a bytestream abstraction.
> -
> -Finally, how do we handoff the actual bytes to get_buffer()?
> -
> -Again, because we're trying to "fake" a bytestream abstraction
> -using an analogy not unlike individual UDP frames, we have
> -to hold on to the bytes received from control-channel's SEND
> -messages in memory.
> -
> -Each time we receive a complete "QEMU File" control-channel
> -message, the bytes from SEND are copied into a small local holding area.
> -
> -Then, we return the number of bytes requested by get_buffer()
> -and leave the remaining bytes in the holding area until get_buffer()
> -comes around for another pass.
> -
> -If the buffer is empty, then we follow the same steps
> -listed above and issue another "QEMU File" protocol command,
> -asking for a new SEND message to re-fill the buffer.
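
A minimal sketch of that holding-area logic, assuming a fixed-size buffer
and leaving the re-fill (the extra "QEMU File" command) stubbed out:

#include <stddef.h>
#include <string.h>

typedef struct {
    unsigned char data[4096]; /* bytes copied out of the last SEND */
    size_t len;               /* bytes currently held */
    size_t pos;               /* bytes already consumed */
} HoldingAreaSketch;

/* Hand out at most 'want' bytes; a short return tells the caller to
 * issue another "QEMU File" command and re-fill the buffer. */
static size_t sketch_get_buffer(HoldingAreaSketch *h,
                                unsigned char *out, size_t want)
{
    size_t avail = h->len - h->pos;
    size_t n = want < avail ? want : avail;

    memcpy(out, h->data + h->pos, n);
    h->pos += n;
    return n;
}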
> -
> -Migration of VM's ram:
> -====================
> -
> -At the beginning of the migration, (migration-rdma.c),
> -the sender and the receiver populate the list of RAMBlocks
> -to be registered with each other into a structure.
> -Then, using the aforementioned protocol, they exchange a
> -description of these blocks with each other, to be used later
> -during the iteration of main memory. This description includes
> -a list of all the RAMBlocks, their offsets and lengths, virtual
> -addresses and possibly includes pre-registered RDMA keys in case dynamic
> -page registration was disabled on the server-side, otherwise not.
> -
> -Main memory is not migrated with the aforementioned protocol,
> -but is instead migrated with normal RDMA Write operations.
> -
> -Pages are migrated in "chunks" (hard-coded to 1 Megabyte right now).
> -Chunk size is not dynamic, but it could be in a future implementation.
> -There's nothing to indicate that this is useful right now.
> -
> -When a chunk is full (or a flush() occurs), the memory backed by
> -the chunk is registered with librdmacm is pinned in memory on
> -both sides using the aforementioned protocol.
> -After pinning, an RDMA Write is generated and transmitted
> -for the entire chunk.
> -
> -Chunks are also transmitted in batches: This means that we
> -do not request that the hardware signal the completion queue
> -for the completion of *every* chunk. The current batch size
> -is about 64 chunks (corresponding to 64 MB of memory).
> -Only the last chunk in a batch must be signaled.
> -This helps keep everything as asynchronous as possible
> -and helps keep the hardware busy performing RDMA operations.
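
With libibverbs, that batching rule reduces to setting IBV_SEND_SIGNALED
only on the last work request of a batch; a sketch, not the removed
migration/rdma.c code:

#include <infiniband/verbs.h>

/* Fill an RDMA Write WR for one chunk; only the final chunk in a
 * batch asks the hardware for a completion. */
static void fill_chunk_wr(struct ibv_send_wr *wr, struct ibv_sge *sge,
                          uint64_t remote_addr, uint32_t rkey, int is_last)
{
    wr->opcode = IBV_WR_RDMA_WRITE;
    wr->sg_list = sge;
    wr->num_sge = 1;
    wr->wr.rdma.remote_addr = remote_addr;
    wr->wr.rdma.rkey = rkey;
    wr->send_flags = is_last ? IBV_SEND_SIGNALED : 0;
}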
> -
> -Error-handling:
> -===============
> -
> -Infiniband has what is called a "Reliable, Connected"
> -link (one of 4 choices). This is the mode in which
> -we use for RDMA migration.
> -
> -If a *single* message fails,
> -the decision is to abort the migration entirely and
> -cleanup all the RDMA descriptors and unregister all
> -the memory.
> -
> -After cleanup, the Virtual Machine is returned to normal
> -operation the same way that would happen if the TCP
> -socket is broken during a non-RDMA based migration.
> -
> -TODO:
> -=====
> -1. Currently, 'ulimit -l' mlock() limits as well as cgroups swap limits
> - are not compatible with infiniband memory pinning and will result in
> - an aborted migration (but with the source VM left unaffected).
> -2. Use of the recent /proc/<pid>/pagemap would likely speed up
> - the use of KSM and ballooning while using RDMA.
> -3. Also, some form of balloon-device usage tracking would also
> - help alleviate some issues.
> -4. Use LRU to provide more fine-grained direction of UNREGISTER
> - requests for unpinning memory in an overcommitted environment.
> -5. Expose UNREGISTER support to the user by way of workload-specific
> - hints about application behavior.
> diff --git a/docs/system/device-url-syntax.rst.inc b/docs/system/device-url-syntax.rst.inc
> index 7dbc525fa8..43b5c2596b 100644
> --- a/docs/system/device-url-syntax.rst.inc
> +++ b/docs/system/device-url-syntax.rst.inc
> @@ -87,8 +87,8 @@ These are specified using a special URL syntax.
>
> ``GlusterFS``
> GlusterFS is a user space distributed file system. QEMU supports the
> - use of GlusterFS volumes for hosting VM disk images using TCP, Unix
> - Domain Sockets and RDMA transport protocols.
> + use of GlusterFS volumes for hosting VM disk images using TCP and Unix
> + Domain Sockets transport protocols.
>
> Syntax for specifying a VM disk image on GlusterFS volume is
>
> diff --git a/docs/system/loongarch/virt.rst b/docs/system/loongarch/virt.rst
> index c37268b404..0a8e0766e4 100644
> --- a/docs/system/loongarch/virt.rst
> +++ b/docs/system/loongarch/virt.rst
> @@ -39,7 +39,7 @@ can be accessed by following steps.
>
> .. code-block:: bash
>
> - ./configure --disable-rdma --disable-pvrdma --prefix=/usr \
> + ./configure --prefix=/usr \
> --target-list="loongarch64-softmmu" \
> --disable-libiscsi --disable-libnfs --disable-libpmem \
> --disable-glusterfs --enable-libusb --enable-usb-redir \
> diff --git a/docs/system/qemu-block-drivers.rst.inc b/docs/system/qemu-block-drivers.rst.inc
> index 105cb9679c..384e95ba76 100644
> --- a/docs/system/qemu-block-drivers.rst.inc
> +++ b/docs/system/qemu-block-drivers.rst.inc
> @@ -737,7 +737,6 @@ Examples
> |qemu_system| -drive file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img
> |qemu_system| -drive file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img
> |qemu_system| -drive file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket
> - |qemu_system| -drive file=gluster+rdma://1.2.3.4:24007/testvol/a.img
> |qemu_system| -drive file=gluster://1.2.3.4/testvol/a.img,file.debug=9,file.logfile=/var/log/qemu-gluster.log
> |qemu_system| 'json:{"driver":"qcow2",
> "file":{"driver":"gluster",
> diff --git a/meson.build b/meson.build
> index c9c3217ba4..bd65abad13 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -1854,21 +1854,6 @@ if numa.found() and not cc.links('''
> endif
> endif
>
> -rdma = not_found
> -if not get_option('rdma').auto() or have_system
> - libumad = cc.find_library('ibumad', required: get_option('rdma'))
> - rdma_libs = [cc.find_library('rdmacm', has_headers: ['rdma/rdma_cma.h'],
> - required: get_option('rdma')),
> - cc.find_library('ibverbs', required: get_option('rdma')),
> - libumad]
> - rdma = declare_dependency(dependencies: rdma_libs)
> - foreach lib: rdma_libs
> - if not lib.found()
> - rdma = not_found
> - endif
> - endforeach
> -endif
> -
> cacard = not_found
> if not get_option('smartcard').auto() or have_system
> cacard = dependency('libcacard', required: get_option('smartcard'),
> @@ -2246,7 +2231,6 @@ endif
> config_host_data.set('CONFIG_OPENGL', opengl.found())
> config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
> config_host_data.set('CONFIG_RBD', rbd.found())
> -config_host_data.set('CONFIG_RDMA', rdma.found())
> config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
> config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
> config_host_data.set('CONFIG_SDL', sdl.found())
> @@ -2399,12 +2383,6 @@ if rbd.found()
> dependencies: rbd,
> prefix: '#include <rbd/librbd.h>'))
> endif
> -if rdma.found()
> - config_host_data.set('HAVE_IBV_ADVISE_MR',
> - cc.has_function('ibv_advise_mr',
> - dependencies: rdma,
> - prefix: '#include <infiniband/verbs.h>'))
> -endif
>
> have_asan_fiber = false
> if get_option('sanitizers') and \
> @@ -2829,37 +2807,6 @@ config_host_data.set('CONFIG_ARM_AES_BUILTIN', cc.compiles('''
> void foo(uint8x16_t *p) { *p = vaesmcq_u8(*p); }
> '''))
>
> -have_pvrdma = get_option('pvrdma') \
> - .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \
> - .require(cc.compiles(gnu_source_prefix + '''
> - #include <sys/mman.h>
> - int main(void)
> - {
> - char buf = 0;
> - void *addr = &buf;
> - addr = mremap(addr, 0, 1, MREMAP_MAYMOVE | MREMAP_FIXED);
> -
> - return 0;
> - }'''), error_message: 'PVRDMA requires mremap').allowed()
> -
> -if have_pvrdma
> - config_host_data.set('LEGACY_RDMA_REG_MR', not cc.links('''
> - #include <infiniband/verbs.h>
> - int main(void)
> - {
> - struct ibv_mr *mr;
> - struct ibv_pd *pd = NULL;
> - size_t length = 10;
> - uint64_t iova = 0;
> - int access = 0;
> - void *addr = NULL;
> -
> - mr = ibv_reg_mr_iova(pd, addr, length, iova, access);
> - ibv_dereg_mr(mr);
> - return 0;
> - }'''))
> -endif
> -
> if get_option('membarrier').disabled()
> have_membarrier = false
> elif host_os == 'windows'
> @@ -2993,7 +2940,6 @@ host_kconfig = \
> (have_vhost_kernel ? ['CONFIG_VHOST_KERNEL=y'] : []) + \
> (have_virtfs ? ['CONFIG_VIRTFS=y'] : []) + \
> (host_os == 'linux' ? ['CONFIG_LINUX=y'] : []) + \
> - (have_pvrdma ? ['CONFIG_PVRDMA=y'] : []) + \
> (multiprocess_allowed ? ['CONFIG_MULTIPROCESS_ALLOWED=y'] : []) + \
> (vfio_user_server_allowed ? ['CONFIG_VFIO_USER_SERVER_ALLOWED=y'] : []) + \
> (hv_balloon ? ['CONFIG_HV_BALLOON_POSSIBLE=y'] : [])
> @@ -3357,8 +3303,6 @@ if have_system
> 'hw/pci',
> 'hw/pci-host',
> 'hw/ppc',
> - 'hw/rdma',
> - 'hw/rdma/vmw',
> 'hw/rtc',
> 'hw/s390x',
> 'hw/scsi',
> @@ -4028,7 +3972,6 @@ if have_tools
> }]
> endforeach
>
> - subdir('contrib/rdmacm-mux')
> subdir('contrib/elf2dmp')
>
> executable('qemu-edid', files('qemu-edid.c', 'hw/display/edid-generate.c'),
> @@ -4433,8 +4376,6 @@ summary_info += {'Multipath support': mpathpersist}
> summary_info += {'Linux AIO support': libaio}
> summary_info += {'Linux io_uring support': linux_io_uring}
> summary_info += {'ATTR/XATTR support': libattr}
> -summary_info += {'RDMA support': rdma}
> -summary_info += {'PVRDMA support': have_pvrdma}
> summary_info += {'fdt support': fdt_opt == 'disabled' ? false : fdt_opt}
> summary_info += {'libcap-ng support': libcap_ng}
> summary_info += {'bpf support': libbpf}
> diff --git a/qapi/machine.json b/qapi/machine.json
> index e8b60641f2..e9f0f0c49a 100644
> --- a/qapi/machine.json
> +++ b/qapi/machine.json
> @@ -1737,23 +1737,6 @@
> 'returns': 'HumanReadableText',
> 'features': [ 'unstable' ] }
>
> -##
> -# @x-query-rdma:
> -#
> -# Query RDMA state
> -#
> -# Features:
> -#
> -# @unstable: This command is meant for debugging.
> -#
> -# Returns: RDMA state
> -#
> -# Since: 6.2
> -##
> -{ 'command': 'x-query-rdma',
> - 'returns': 'HumanReadableText',
> - 'features': [ 'unstable' ] }
> -
> ##
> # @x-query-roms:
> #
> diff --git a/qapi/migration.json b/qapi/migration.json
> index 8c65b90328..9a56d403be 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -221,8 +221,8 @@
> #
> # @setup-time: amount of setup time in milliseconds *before* the
> # iterations begin but *after* the QMP command is issued. This is
> -# designed to provide an accounting of any activities (such as
> -# RDMA pinning) which may be expensive, but do not actually occur
> +# designed to provide an accounting of any activities which may be
> +# expensive, but do not actually occur
> # during the iterative migration rounds themselves. (since 1.6)
> #
> # @cpu-throttle-percentage: percentage of time guest cpus are being
> @@ -430,10 +430,6 @@
> # for certain work loads, by sending compressed difference of the
> # pages
> #
> -# @rdma-pin-all: Controls whether or not the entire VM memory
> -# footprint is mlock()'d on demand or all at once. Refer to
> -# docs/rdma.txt for usage. Disabled by default. (since 2.0)
> -#
> # @zero-blocks: During storage migration encode blocks of zeroes
> # efficiently. This essentially saves 1MB of zeroes per block on
> # the wire. Enabling requires source and target VM to support
> @@ -547,7 +543,7 @@
> # Since: 1.2
> ##
> { 'enum': 'MigrationCapability',
> - 'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks',
> + 'data': ['xbzrle', 'auto-converge', 'zero-blocks',
> { 'name': 'compress', 'features': [ 'deprecated' ] },
> 'events', 'postcopy-ram',
> { 'name': 'x-colo', 'features': [ 'unstable' ] },
> @@ -606,7 +602,6 @@
> # -> { "execute": "query-migrate-capabilities" }
> # <- { "return": [
> # {"state": false, "capability": "xbzrle"},
> -# {"state": false, "capability": "rdma-pin-all"},
> # {"state": false, "capability": "auto-converge"},
> # {"state": false, "capability": "zero-blocks"},
> # {"state": false, "capability": "compress"},
> @@ -1654,14 +1649,12 @@
> #
> # @exec: Direct the migration stream to another process.
> #
> -# @rdma: Migrate via RDMA.
> -#
> # @file: Direct the migration stream to a file.
> #
> # Since: 8.2
> ##
> { 'enum': 'MigrationAddressType',
> - 'data': [ 'socket', 'exec', 'rdma', 'file' ] }
> + 'data': [ 'socket', 'exec', 'file' ] }
>
> ##
> # @FileMigrationArgs:
> @@ -1701,7 +1694,6 @@
> 'data': {
> 'socket': 'SocketAddress',
> 'exec': 'MigrationExecCommand',
> - 'rdma': 'InetSocketAddress',
> 'file': 'FileMigrationArgs' } }
>
> ##
> @@ -1804,14 +1796,6 @@
> # -> { "execute": "migrate",
> # "arguments": {
> # "channels": [ { "channel-type": "main",
> -# "addr": { "transport": "rdma",
> -# "host": "10.12.34.9",
> -# "port": "1050" } } ] } }
> -# <- { "return": {} }
> -#
> -# -> { "execute": "migrate",
> -# "arguments": {
> -# "channels": [ { "channel-type": "main",
> # "addr": { "transport": "file",
> # "filename": "/tmp/migfile",
> # "offset": "0x1000" } } ] } }
> @@ -1879,13 +1863,6 @@
> # "/some/sock" ] } } ] }
> }
> # <- { "return": {} }
> #
> -# -> { "execute": "migrate-incoming",
> -# "arguments": {
> -# "channels": [ { "channel-type": "main",
> -# "addr": { "transport": "rdma",
> -# "host": "10.12.34.9",
> -# "port": "1050" } } ] } }
> -# <- { "return": {} }
> ##
> { 'command': 'migrate-incoming',
> 'data': {'*uri': 'str',
> diff --git a/qapi/qapi-schema.json b/qapi/qapi-schema.json
> index 8304d45625..5e33da7228 100644
> --- a/qapi/qapi-schema.json
> +++ b/qapi/qapi-schema.json
> @@ -54,7 +54,6 @@
> { 'include': 'dump.json' }
> { 'include': 'net.json' }
> { 'include': 'ebpf.json' }
> -{ 'include': 'rdma.json' }
> { 'include': 'rocker.json' }
> { 'include': 'tpm.json' }
> { 'include': 'ui.json' }
> diff --git a/qapi/rdma.json b/qapi/rdma.json
> deleted file mode 100644
> index 195c001850..0000000000
> --- a/qapi/rdma.json
> +++ /dev/null
> @@ -1,38 +0,0 @@
> -# -*- Mode: Python -*-
> -# vim: filetype=python
> -#
> -
> -##
> -# = RDMA device
> -##
> -
> -##
> -# @RDMA_GID_STATUS_CHANGED:
> -#
> -# Emitted when guest driver adds/deletes GID to/from device
> -#
> -# @netdev: RoCE Network Device name
> -#
> -# @gid-status: Add or delete indication
> -#
> -# @subnet-prefix: Subnet Prefix
> -#
> -# @interface-id: Interface ID
> -#
> -# Since: 4.0
> -#
> -# Example:
> -#
> -# <- {"timestamp": {"seconds": 1541579657, "microseconds": 986760},
> -# "event": "RDMA_GID_STATUS_CHANGED",
> -# "data":
> -# {"netdev": "bridge0",
> -# "interface-id": 15880512517475447892,
> -# "gid-status": true,
> -# "subnet-prefix": 33022}}
> -##
> -{ 'event': 'RDMA_GID_STATUS_CHANGED',
> - 'data': { 'netdev' : 'str',
> - 'gid-status' : 'bool',
> - 'subnet-prefix' : 'uint64',
> - 'interface-id' : 'uint64' } }
> diff --git a/contrib/rdmacm-mux/rdmacm-mux.h b/contrib/rdmacm-mux/rdmacm-mux.h
> deleted file mode 100644
> index 07a4722913..0000000000
> --- a/contrib/rdmacm-mux/rdmacm-mux.h
> +++ /dev/null
> @@ -1,61 +0,0 @@
> -/*
> - * QEMU paravirtual RDMA - rdmacm-mux declarations
> - *
> - * Copyright (C) 2018 Oracle
> - * Copyright (C) 2018 Red Hat Inc
> - *
> - * Authors:
> - * Yuval Shaia <yuval.shaia at oracle.com>
> - * Marcel Apfelbaum <marcel at redhat.com>
> - *
> - * This work is licensed under the terms of the GNU GPL, version 2 or later.
> - * See the COPYING file in the top-level directory.
> - *
> - */
> -
> -#ifndef RDMACM_MUX_H
> -#define RDMACM_MUX_H
> -
> -#include "linux/if.h"
> -#include <infiniband/verbs.h>
> -#include <infiniband/umad.h>
> -#include <rdma/rdma_user_cm.h>
> -
> -typedef enum RdmaCmMuxMsgType {
> - RDMACM_MUX_MSG_TYPE_REQ = 0,
> - RDMACM_MUX_MSG_TYPE_RESP = 1,
> -} RdmaCmMuxMsgType;
> -
> -typedef enum RdmaCmMuxOpCode {
> - RDMACM_MUX_OP_CODE_REG = 0,
> - RDMACM_MUX_OP_CODE_UNREG = 1,
> - RDMACM_MUX_OP_CODE_MAD = 2,
> -} RdmaCmMuxOpCode;
> -
> -typedef enum RdmaCmMuxErrCode {
> - RDMACM_MUX_ERR_CODE_OK = 0,
> - RDMACM_MUX_ERR_CODE_EINVAL = 1,
> - RDMACM_MUX_ERR_CODE_EEXIST = 2,
> - RDMACM_MUX_ERR_CODE_EACCES = 3,
> - RDMACM_MUX_ERR_CODE_ENOTFOUND = 4,
> -} RdmaCmMuxErrCode;
> -
> -typedef struct RdmaCmMuxHdr {
> - RdmaCmMuxMsgType msg_type;
> - RdmaCmMuxOpCode op_code;
> - union ibv_gid sgid;
> - RdmaCmMuxErrCode err_code;
> -} RdmaCmUHdr;
> -
> -typedef struct RdmaCmUMad {
> - struct ib_user_mad hdr;
> - char mad[RDMA_MAX_PRIVATE_DATA];
> -} RdmaCmUMad;
> -
> -typedef struct RdmaCmMuxMsg {
> - RdmaCmUHdr hdr;
> - int umad_len;
> - RdmaCmUMad umad;
> -} RdmaCmMuxMsg;
> -
> -#endif
> diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h
> deleted file mode 100644
> index 225af481e0..0000000000
> --- a/hw/rdma/rdma_backend.h
> +++ /dev/null
> @@ -1,129 +0,0 @@
> -/*
> - * RDMA device: Definitions of Backend Device functions
> - *
> - * Copyright (C) 2018 Oracle
> - * Copyright (C) 2018 Red Hat Inc
> - *
> - * Authors:
> - * Yuval Shaia <yuval.shaia at oracle.com>
> - * Marcel Apfelbaum <marcel at redhat.com>
> - *
> - * This work is licensed under the terms of the GNU GPL, version 2 or later.
> - * See the COPYING file in the top-level directory.
> - *
> - */
> -
> -#ifndef RDMA_BACKEND_H
> -#define RDMA_BACKEND_H
> -
> -#include "qapi/error.h"
> -#include "chardev/char-fe.h"
> -
> -#include "rdma_rm_defs.h"
> -#include "rdma_backend_defs.h"
> -
> -/* Vendor Errors */
> -#define VENDOR_ERR_FAIL_BACKEND 0x201
> -#define VENDOR_ERR_TOO_MANY_SGES 0x202
> -#define VENDOR_ERR_NOMEM 0x203
> -#define VENDOR_ERR_QP0 0x204
> -#define VENDOR_ERR_INV_NUM_SGE 0x205
> -#define VENDOR_ERR_MAD_SEND 0x206
> -#define VENDOR_ERR_INVLKEY 0x207
> -#define VENDOR_ERR_MR_SMALL 0x208
> -#define VENDOR_ERR_INV_MAD_BUFF 0x209
> -#define VENDOR_ERR_INV_GID_IDX 0x210
> -
> -/* Add definition for QP0 and QP1 as there is no userspace enums for them */
> -enum ibv_special_qp_type {
> - IBV_QPT_SMI = 0,
> - IBV_QPT_GSI = 1,
> -};
> -
> -static inline uint32_t rdma_backend_qpn(const RdmaBackendQP *qp)
> -{
> - return qp->ibqp ? qp->ibqp->qp_num : 1;
> -}
> -
> -static inline uint32_t rdma_backend_mr_lkey(const RdmaBackendMR *mr)
> -{
> - return mr->ibmr ? mr->ibmr->lkey : 0;
> -}
> -
> -static inline uint32_t rdma_backend_mr_rkey(const RdmaBackendMR *mr)
> -{
> - return mr->ibmr ? mr->ibmr->rkey : 0;
> -}
> -
> -int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
> - RdmaDeviceResources *rdma_dev_res,
> - const char *backend_device_name, uint8_t port_num,
> - struct ibv_device_attr *dev_attr,
> - CharBackend *mad_chr_be);
> -void rdma_backend_fini(RdmaBackendDev *backend_dev);
> -int rdma_backend_add_gid(RdmaBackendDev *backend_dev, const char *ifname,
> - union ibv_gid *gid);
> -int rdma_backend_del_gid(RdmaBackendDev *backend_dev, const char *ifname,
> - union ibv_gid *gid);
> -int rdma_backend_get_gid_index(RdmaBackendDev *backend_dev,
> - union ibv_gid *gid);
> -void rdma_backend_start(RdmaBackendDev *backend_dev);
> -void rdma_backend_stop(RdmaBackendDev *backend_dev);
> -void rdma_backend_register_comp_handler(void (*handler)(void *ctx,
> - struct ibv_wc *wc));
> -void rdma_backend_unregister_comp_handler(void);
> -
> -int rdma_backend_query_port(RdmaBackendDev *backend_dev,
> - struct ibv_port_attr *port_attr);
> -int rdma_backend_create_pd(RdmaBackendDev *backend_dev, RdmaBackendPD *pd);
> -void rdma_backend_destroy_pd(RdmaBackendPD *pd);
> -
> -int rdma_backend_create_mr(RdmaBackendMR *mr, RdmaBackendPD *pd, void *addr,
> - size_t length, uint64_t guest_start, int access);
> -void rdma_backend_destroy_mr(RdmaBackendMR *mr);
> -
> -int rdma_backend_create_cq(RdmaBackendDev *backend_dev, RdmaBackendCQ *cq,
> - int cqe);
> -void rdma_backend_destroy_cq(RdmaBackendCQ *cq);
> -void rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ *cq);
> -
> -int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type,
> - RdmaBackendPD *pd, RdmaBackendCQ *scq,
> - RdmaBackendCQ *rcq, RdmaBackendSRQ *srq,
> - uint32_t max_send_wr, uint32_t max_recv_wr,
> - uint32_t max_send_sge, uint32_t max_recv_sge);
> -int rdma_backend_qp_state_init(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
> - uint8_t qp_type, uint32_t qkey);
> -int rdma_backend_qp_state_rtr(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
> - uint8_t qp_type, uint8_t sgid_idx,
> - union ibv_gid *dgid, uint32_t dqpn,
> - uint32_t rq_psn, uint32_t qkey, bool use_qkey);
> -int rdma_backend_qp_state_rts(RdmaBackendQP *qp, uint8_t qp_type,
> - uint32_t sq_psn, uint32_t qkey, bool use_qkey);
> -int rdma_backend_query_qp(RdmaBackendQP *qp, struct ibv_qp_attr *attr,
> - int attr_mask, struct ibv_qp_init_attr *init_attr);
> -void rdma_backend_destroy_qp(RdmaBackendQP *qp, RdmaDeviceResources *dev_res);
> -
> -void rdma_backend_post_send(RdmaBackendDev *backend_dev,
> - RdmaBackendQP *qp, uint8_t qp_type,
> - struct ibv_sge *sge, uint32_t num_sge,
> - uint8_t sgid_idx, union ibv_gid *sgid,
> - union ibv_gid *dgid, uint32_t dqpn, uint32_t dqkey,
> - void *ctx);
> -void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
> - RdmaBackendQP *qp, uint8_t qp_type,
> - struct ibv_sge *sge, uint32_t num_sge, void *ctx);
> -
> -int rdma_backend_create_srq(RdmaBackendSRQ *srq, RdmaBackendPD *pd,
> - uint32_t max_wr, uint32_t max_sge,
> - uint32_t srq_limit);
> -int rdma_backend_query_srq(RdmaBackendSRQ *srq, struct ibv_srq_attr *srq_attr);
> -int rdma_backend_modify_srq(RdmaBackendSRQ *srq, struct ibv_srq_attr *srq_attr,
> - int srq_attr_mask);
> -void rdma_backend_destroy_srq(RdmaBackendSRQ *srq,
> - RdmaDeviceResources *dev_res);
> -void rdma_backend_post_srq_recv(RdmaBackendDev *backend_dev,
> - RdmaBackendSRQ *srq, struct ibv_sge *sge,
> - uint32_t num_sge, void *ctx);
> -
> -#endif
> diff --git a/hw/rdma/rdma_backend_defs.h b/hw/rdma/rdma_backend_defs.h
> deleted file mode 100644
> index 4e6c0ad695..0000000000
> --- a/hw/rdma/rdma_backend_defs.h
> +++ /dev/null
> @@ -1,76 +0,0 @@
> -/*
> - * RDMA device: Definitions of Backend Device structures
> - *
> - * Copyright (C) 2018 Oracle
> - * Copyright (C) 2018 Red Hat Inc
> - *
> - * Authors:
> - * Yuval Shaia <yuval.shaia at oracle.com>
> - * Marcel Apfelbaum <marcel at redhat.com>
> - *
> - * This work is licensed under the terms of the GNU GPL, version 2 or later.
> - * See the COPYING file in the top-level directory.
> - *
> - */
> -
> -#ifndef RDMA_BACKEND_DEFS_H
> -#define RDMA_BACKEND_DEFS_H
> -
> -#include "qemu/thread.h"
> -#include "chardev/char-fe.h"
> -#include <infiniband/verbs.h>
> -#include "contrib/rdmacm-mux/rdmacm-mux.h"
> -#include "rdma_utils.h"
> -
> -typedef struct RdmaDeviceResources RdmaDeviceResources;
> -
> -typedef struct RdmaBackendThread {
> - QemuThread thread;
> - bool run; /* Set by thread manager to let thread know it should exit */
> - bool is_running; /* Set by the thread to report its status */
> -} RdmaBackendThread;
> -
> -typedef struct RdmaCmMux {
> - CharBackend *chr_be;
> - int can_receive;
> -} RdmaCmMux;
> -
> -typedef struct RdmaBackendDev {
> - RdmaBackendThread comp_thread;
> - PCIDevice *dev;
> - RdmaDeviceResources *rdma_dev_res;
> - struct ibv_device *ib_dev;
> - struct ibv_context *context;
> - struct ibv_comp_channel *channel;
> - uint8_t port_num;
> - RdmaProtectedGQueue recv_mads_list;
> - RdmaCmMux rdmacm_mux;
> -} RdmaBackendDev;
> -
> -typedef struct RdmaBackendPD {
> - struct ibv_pd *ibpd;
> -} RdmaBackendPD;
> -
> -typedef struct RdmaBackendMR {
> - struct ibv_pd *ibpd;
> - struct ibv_mr *ibmr;
> -} RdmaBackendMR;
> -
> -typedef struct RdmaBackendCQ {
> - RdmaBackendDev *backend_dev;
> - struct ibv_cq *ibcq;
> -} RdmaBackendCQ;
> -
> -typedef struct RdmaBackendQP {
> - struct ibv_pd *ibpd;
> - struct ibv_qp *ibqp;
> - uint8_t sgid_idx;
> - RdmaProtectedGSList cqe_ctx_list;
> -} RdmaBackendQP;
> -
> -typedef struct RdmaBackendSRQ {
> - struct ibv_srq *ibsrq;
> - RdmaProtectedGSList cqe_ctx_list;
> -} RdmaBackendSRQ;
> -
> -#endif
> diff --git a/hw/rdma/rdma_rm.h b/hw/rdma/rdma_rm.h
> deleted file mode 100644
> index d69a917795..0000000000
> --- a/hw/rdma/rdma_rm.h
> +++ /dev/null
> @@ -1,97 +0,0 @@
> -/*
> - * RDMA device: Definitions of Resource Manager functions
> - *
> - * Copyright (C) 2018 Oracle
> - * Copyright (C) 2018 Red Hat Inc
> - *
> - * Authors:
> - * Yuval Shaia <yuval.shaia at oracle.com>
> - * Marcel Apfelbaum <marcel at redhat.com>
> - *
> - * This work is licensed under the terms of the GNU GPL, version 2 or later.
> - * See the COPYING file in the top-level directory.
> - *
> - */
> -
> -#ifndef RDMA_RM_H
> -#define RDMA_RM_H
> -
> -#include "qapi/error.h"
> -#include "rdma_backend_defs.h"
> -#include "rdma_rm_defs.h"
> -
> -int rdma_rm_init(RdmaDeviceResources *dev_res,
> - struct ibv_device_attr *dev_attr);
> -void rdma_rm_fini(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
> - const char *ifname);
> -
> -int rdma_rm_alloc_pd(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
> - uint32_t *pd_handle, uint32_t ctx_handle);
> -RdmaRmPD *rdma_rm_get_pd(RdmaDeviceResources *dev_res, uint32_t pd_handle);
> -void rdma_rm_dealloc_pd(RdmaDeviceResources *dev_res, uint32_t pd_handle);
> -
> -int rdma_rm_alloc_mr(RdmaDeviceResources *dev_res, uint32_t pd_handle,
> - uint64_t guest_start, uint64_t guest_length,
> - void *host_virt, int access_flags, uint32_t *mr_handle,
> - uint32_t *lkey, uint32_t *rkey);
> -RdmaRmMR *rdma_rm_get_mr(RdmaDeviceResources *dev_res, uint32_t mr_handle);
> -void rdma_rm_dealloc_mr(RdmaDeviceResources *dev_res, uint32_t mr_handle);
> -
> -int rdma_rm_alloc_uc(RdmaDeviceResources *dev_res, uint32_t pfn,
> - uint32_t *uc_handle);
> -RdmaRmUC *rdma_rm_get_uc(RdmaDeviceResources *dev_res, uint32_t uc_handle);
> -void rdma_rm_dealloc_uc(RdmaDeviceResources *dev_res, uint32_t uc_handle);
> -
> -int rdma_rm_alloc_cq(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
> - uint32_t cqe, uint32_t *cq_handle, void *opaque);
> -RdmaRmCQ *rdma_rm_get_cq(RdmaDeviceResources *dev_res, uint32_t cq_handle);
> -void rdma_rm_req_notify_cq(RdmaDeviceResources *dev_res, uint32_t cq_handle,
> - bool notify);
> -void rdma_rm_dealloc_cq(RdmaDeviceResources *dev_res, uint32_t cq_handle);
> -
> -int rdma_rm_alloc_qp(RdmaDeviceResources *dev_res, uint32_t pd_handle,
> - uint8_t qp_type, uint32_t max_send_wr,
> - uint32_t max_send_sge, uint32_t send_cq_handle,
> - uint32_t max_recv_wr, uint32_t max_recv_sge,
> - uint32_t recv_cq_handle, void *opaque, uint32_t *qpn,
> - uint8_t is_srq, uint32_t srq_handle);
> -RdmaRmQP *rdma_rm_get_qp(RdmaDeviceResources *dev_res, uint32_t qpn);
> -int rdma_rm_modify_qp(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
> - uint32_t qp_handle, uint32_t attr_mask, uint8_t sgid_idx,
> - union ibv_gid *dgid, uint32_t dqpn,
> - enum ibv_qp_state qp_state, uint32_t qkey,
> - uint32_t rq_psn, uint32_t sq_psn);
> -int rdma_rm_query_qp(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
> - uint32_t qp_handle, struct ibv_qp_attr *attr,
> - int attr_mask, struct ibv_qp_init_attr *init_attr);
> -void rdma_rm_dealloc_qp(RdmaDeviceResources *dev_res, uint32_t qp_handle);
> -
> -RdmaRmSRQ *rdma_rm_get_srq(RdmaDeviceResources *dev_res, uint32_t srq_handle);
> -int rdma_rm_alloc_srq(RdmaDeviceResources *dev_res, uint32_t pd_handle,
> - uint32_t max_wr, uint32_t max_sge, uint32_t srq_limit,
> - uint32_t *srq_handle, void *opaque);
> -int rdma_rm_query_srq(RdmaDeviceResources *dev_res, uint32_t srq_handle,
> - struct ibv_srq_attr *srq_attr);
> -int rdma_rm_modify_srq(RdmaDeviceResources *dev_res, uint32_t srq_handle,
> - struct ibv_srq_attr *srq_attr, int srq_attr_mask);
> -void rdma_rm_dealloc_srq(RdmaDeviceResources *dev_res, uint32_t srq_handle);
> -
> -int rdma_rm_alloc_cqe_ctx(RdmaDeviceResources *dev_res, uint32_t *cqe_ctx_id,
> - void *ctx);
> -void *rdma_rm_get_cqe_ctx(RdmaDeviceResources *dev_res, uint32_t cqe_ctx_id);
> -void rdma_rm_dealloc_cqe_ctx(RdmaDeviceResources *dev_res, uint32_t cqe_ctx_id);
> -
> -int rdma_rm_add_gid(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
> - const char *ifname, union ibv_gid *gid, int gid_idx);
> -int rdma_rm_del_gid(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
> - const char *ifname, int gid_idx);
> -int rdma_rm_get_backend_gid_index(RdmaDeviceResources *dev_res,
> - RdmaBackendDev *backend_dev, int sgid_idx);
> -static inline union ibv_gid *rdma_rm_get_gid(RdmaDeviceResources *dev_res,
> - int sgid_idx)
> -{
> - return &dev_res->port.gid_tbl[sgid_idx].gid;
> -}
> -void rdma_format_device_counters(RdmaDeviceResources *dev_res, GString *buf);
> -
> -#endif
> diff --git a/hw/rdma/rdma_rm_defs.h b/hw/rdma/rdma_rm_defs.h
> deleted file mode 100644
> index 534f2f74d3..0000000000
> --- a/hw/rdma/rdma_rm_defs.h
> +++ /dev/null
> @@ -1,146 +0,0 @@
> -/*
> - * RDMA device: Definitions of Resource Manager structures
> - *
> - * Copyright (C) 2018 Oracle
> - * Copyright (C) 2018 Red Hat Inc
> - *
> - * Authors:
> - * Yuval Shaia <yuval.shaia at oracle.com>
> - * Marcel Apfelbaum <marcel at redhat.com>
> - *
> - * This work is licensed under the terms of the GNU GPL, version 2 or later.
> - * See the COPYING file in the top-level directory.
> - *
> - */
> -
> -#ifndef RDMA_RM_DEFS_H
> -#define RDMA_RM_DEFS_H
> -
> -#include "rdma_backend_defs.h"
> -
> -#define MAX_PORTS 1 /* Do not change - we support only one port */
> -#define MAX_PORT_GIDS 255
> -#define MAX_GIDS MAX_PORT_GIDS
> -#define MAX_PORT_PKEYS 1
> -#define MAX_PKEYS MAX_PORT_PKEYS
> -#define MAX_UCS 512
> -#define MAX_MR_SIZE (1UL << 27)
> -#define MAX_QP 1024
> -#define MAX_SGE 4
> -#define MAX_CQ 2048
> -#define MAX_MR 1024
> -#define MAX_PD 1024
> -#define MAX_QP_RD_ATOM 16
> -#define MAX_QP_INIT_RD_ATOM 16
> -#define MAX_AH 64
> -#define MAX_SRQ 512
> -
> -#define MAX_RM_TBL_NAME 16
> -#define MAX_CONSEQ_EMPTY_POLL_CQ 4096 /* considered as error above this */
> -
> -typedef struct RdmaRmResTbl {
> - char name[MAX_RM_TBL_NAME];
> - QemuMutex lock;
> - unsigned long *bitmap;
> - size_t tbl_sz;
> - size_t res_sz;
> - void *tbl;
> - uint32_t used; /* number of used entries in the table */
> -} RdmaRmResTbl;
> -
> -typedef struct RdmaRmPD {
> - RdmaBackendPD backend_pd;
> - uint32_t ctx_handle;
> -} RdmaRmPD;
> -
> -typedef enum CQNotificationType {
> - CNT_CLEAR,
> - CNT_ARM,
> - CNT_SET,
> -} CQNotificationType;
> -
> -typedef struct RdmaRmCQ {
> - RdmaBackendCQ backend_cq;
> - void *opaque;
> - CQNotificationType notify;
> -} RdmaRmCQ;
> -
> -/* MR (DMA region) */
> -typedef struct RdmaRmMR {
> - RdmaBackendMR backend_mr;
> - void *virt;
> - uint64_t start;
> - size_t length;
> - uint32_t pd_handle;
> - uint32_t lkey;
> - uint32_t rkey;
> -} RdmaRmMR;
> -
> -typedef struct RdmaRmUC {
> - uint64_t uc_handle;
> -} RdmaRmUC;
> -
> -typedef struct RdmaRmQP {
> - RdmaBackendQP backend_qp;
> - void *opaque;
> - uint32_t qp_type;
> - uint32_t qpn;
> - uint32_t send_cq_handle;
> - uint32_t recv_cq_handle;
> - enum ibv_qp_state qp_state;
> - uint8_t is_srq;
> -} RdmaRmQP;
> -
> -typedef struct RdmaRmSRQ {
> - RdmaBackendSRQ backend_srq;
> - uint32_t recv_cq_handle;
> - void *opaque;
> -} RdmaRmSRQ;
> -
> -typedef struct RdmaRmGid {
> - union ibv_gid gid;
> - int backend_gid_index;
> -} RdmaRmGid;
> -
> -typedef struct RdmaRmPort {
> - RdmaRmGid gid_tbl[MAX_PORT_GIDS];
> - enum ibv_port_state state;
> -} RdmaRmPort;
> -
> -typedef struct RdmaRmStats {
> - uint64_t tx;
> - uint64_t tx_len;
> - uint64_t tx_err;
> - uint64_t rx_bufs;
> - uint64_t rx_bufs_len;
> - uint64_t rx_bufs_err;
> - uint64_t rx_srq;
> - uint64_t completions;
> - uint64_t mad_tx;
> - uint64_t mad_tx_err;
> - uint64_t mad_rx;
> - uint64_t mad_rx_err;
> - uint64_t mad_rx_bufs;
> - uint64_t mad_rx_bufs_err;
> - uint64_t poll_cq_from_bk;
> - uint64_t poll_cq_from_guest;
> - uint64_t poll_cq_from_guest_empty;
> - uint64_t poll_cq_ppoll_to;
> - uint32_t missing_cqe;
> -} RdmaRmStats;
> -
> -struct RdmaDeviceResources {
> - RdmaRmPort port;
> - RdmaRmResTbl pd_tbl;
> - RdmaRmResTbl mr_tbl;
> - RdmaRmResTbl uc_tbl;
> - RdmaRmResTbl qp_tbl;
> - RdmaRmResTbl cq_tbl;
> - RdmaRmResTbl cqe_ctx_tbl;
> - RdmaRmResTbl srq_tbl;
> - GHashTable *qp_hash; /* Keeps mapping between real and emulated */
> - QemuMutex lock;
> - RdmaRmStats stats;
> -};
> -
> -#endif
> diff --git a/hw/rdma/rdma_utils.h b/hw/rdma/rdma_utils.h
> deleted file mode 100644
> index 54e4f56edd..0000000000
> --- a/hw/rdma/rdma_utils.h
> +++ /dev/null
> @@ -1,63 +0,0 @@
> -/*
> - * RDMA device: Debug utilities
> - *
> - * Copyright (C) 2018 Oracle
> - * Copyright (C) 2018 Red Hat Inc
> - *
> - *
> - * Authors:
> - * Yuval Shaia <yuval.shaia at oracle.com>
> - * Marcel Apfelbaum <marcel at redhat.com>
> - *
> - * This work is licensed under the terms of the GNU GPL, version 2 or later.
> - * See the COPYING file in the top-level directory.
> - *
> - */
> -
> -#ifndef RDMA_UTILS_H
> -#define RDMA_UTILS_H
> -
> -#include "qemu/error-report.h"
> -#include "sysemu/dma.h"
> -
> -#define rdma_error_report(fmt, ...) \
> - error_report("%s: " fmt, "rdma", ## __VA_ARGS__)
> -#define rdma_warn_report(fmt, ...) \
> - warn_report("%s: " fmt, "rdma", ## __VA_ARGS__)
> -#define rdma_info_report(fmt, ...) \
> - info_report("%s: " fmt, "rdma", ## __VA_ARGS__)
> -
> -typedef struct RdmaProtectedGQueue {
> - QemuMutex lock;
> - GQueue *list;
> -} RdmaProtectedGQueue;
> -
> -typedef struct RdmaProtectedGSList {
> - QemuMutex lock;
> - GSList *list;
> -} RdmaProtectedGSList;
> -
> -void *rdma_pci_dma_map(PCIDevice *dev, dma_addr_t addr, dma_addr_t len);
> -void rdma_pci_dma_unmap(PCIDevice *dev, void *buffer, dma_addr_t len);
> -void rdma_protected_gqueue_init(RdmaProtectedGQueue *list);
> -void rdma_protected_gqueue_destroy(RdmaProtectedGQueue *list);
> -void rdma_protected_gqueue_append_int64(RdmaProtectedGQueue *list,
> - int64_t value);
> -int64_t rdma_protected_gqueue_pop_int64(RdmaProtectedGQueue *list);
> -void rdma_protected_gslist_init(RdmaProtectedGSList *list);
> -void rdma_protected_gslist_destroy(RdmaProtectedGSList *list);
> -void rdma_protected_gslist_append_int32(RdmaProtectedGSList *list,
> - int32_t value);
> -void rdma_protected_gslist_remove_int32(RdmaProtectedGSList *list,
> - int32_t value);
> -
> -static inline void addrconf_addr_eui48(uint8_t *eui, const char *addr)
> -{
> - memcpy(eui, addr, 3);
> - eui[3] = 0xFF;
> - eui[4] = 0xFE;
> - memcpy(eui + 5, addr + 3, 3);
> - eui[0] ^= 2;
> -}
> -
> -#endif
> diff --git a/hw/rdma/trace.h b/hw/rdma/trace.h
> deleted file mode 100644
> index b3fa8ebc51..0000000000
> --- a/hw/rdma/trace.h
> +++ /dev/null
> @@ -1 +0,0 @@
> -#include "trace/trace-hw_rdma.h"
> diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h
> deleted file mode 100644
> index 4cbc10c980..0000000000
> --- a/hw/rdma/vmw/pvrdma.h
> +++ /dev/null
> @@ -1,144 +0,0 @@
> -/*
> - * QEMU VMWARE paravirtual RDMA device definitions
> - *
> - * Copyright (C) 2018 Oracle
> - * Copyright (C) 2018 Red Hat Inc
> - *
> - * Authors:
> - * Yuval Shaia <yuval.shaia at oracle.com>
> - * Marcel Apfelbaum <marcel at redhat.com>
> - *
> - * This work is licensed under the terms of the GNU GPL, version 2 or later.
> - * See the COPYING file in the top-level directory.
> - *
> - */
> -
> -#ifndef PVRDMA_PVRDMA_H
> -#define PVRDMA_PVRDMA_H
> -
> -#include "qemu/units.h"
> -#include "qemu/notify.h"
> -#include "hw/pci/msix.h"
> -#include "hw/pci/pci_device.h"
> -#include "chardev/char-fe.h"
> -#include "hw/net/vmxnet3_defs.h"
> -
> -#include "../rdma_backend_defs.h"
> -#include "../rdma_rm_defs.h"
> -
> -#include "standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h"
> -#include "pvrdma_dev_ring.h"
> -#include "qom/object.h"
> -
> -/* BARs */
> -#define RDMA_MSIX_BAR_IDX 0
> -#define RDMA_REG_BAR_IDX 1
> -#define RDMA_UAR_BAR_IDX 2
> -#define RDMA_BAR0_MSIX_SIZE (16 * KiB)
> -#define RDMA_BAR1_REGS_SIZE 64
> -#define RDMA_BAR2_UAR_SIZE (0x1000 * MAX_UCS) /* each uc gets page */
> -
> -/* MSIX */
> -#define RDMA_MAX_INTRS 3
> -#define RDMA_MSIX_TABLE 0x0000
> -#define RDMA_MSIX_PBA 0x2000
> -
> -/* Interrupts Vectors */
> -#define INTR_VEC_CMD_RING 0
> -#define INTR_VEC_CMD_ASYNC_EVENTS 1
> -#define INTR_VEC_CMD_COMPLETION_Q 2
> -
> -/* HW attributes */
> -#define PVRDMA_HW_NAME "pvrdma"
> -#define PVRDMA_HW_VERSION 17
> -#define PVRDMA_FW_VERSION 14
> -
> -/* Some defaults */
> -#define PVRDMA_PKEY 0xFFFF
> -
> -typedef struct DSRInfo {
> - dma_addr_t dma;
> - struct pvrdma_device_shared_region *dsr;
> -
> - union pvrdma_cmd_req *req;
> - union pvrdma_cmd_resp *rsp;
> -
> - PvrdmaRingState *async_ring_state;
> - PvrdmaRing async;
> -
> - PvrdmaRingState *cq_ring_state;
> - PvrdmaRing cq;
> -} DSRInfo;
> -
> -typedef struct PVRDMADevStats {
> - uint64_t commands;
> - uint64_t regs_reads;
> - uint64_t regs_writes;
> - uint64_t uar_writes;
> - uint64_t interrupts;
> -} PVRDMADevStats;
> -
> -struct PVRDMADev {
> - PCIDevice parent_obj;
> - MemoryRegion msix;
> - MemoryRegion regs;
> - uint32_t regs_data[RDMA_BAR1_REGS_SIZE];
> - MemoryRegion uar;
> - uint32_t uar_data[RDMA_BAR2_UAR_SIZE];
> - DSRInfo dsr_info;
> - int interrupt_mask;
> - struct ibv_device_attr dev_attr;
> - uint64_t node_guid;
> - char *backend_eth_device_name;
> - char *backend_device_name;
> - uint8_t backend_port_num;
> - RdmaBackendDev backend_dev;
> - RdmaDeviceResources rdma_dev_res;
> - CharBackend mad_chr;
> - VMXNET3State *func0;
> - Notifier shutdown_notifier;
> - PVRDMADevStats stats;
> -};
> -typedef struct PVRDMADev PVRDMADev;
> -DECLARE_INSTANCE_CHECKER(PVRDMADev, PVRDMA_DEV,
> - PVRDMA_HW_NAME)
> -
> -static inline int get_reg_val(PVRDMADev *dev, hwaddr addr, uint32_t *val)
> -{
> - int idx = addr >> 2;
> -
> - if (idx >= RDMA_BAR1_REGS_SIZE) {
> - return -EINVAL;
> - }
> -
> - *val = dev->regs_data[idx];
> -
> - return 0;
> -}
> -
> -static inline int set_reg_val(PVRDMADev *dev, hwaddr addr, uint32_t val)
> -{
> - int idx = addr >> 2;
> -
> - if (idx >= RDMA_BAR1_REGS_SIZE) {
> - return -EINVAL;
> - }
> -
> - dev->regs_data[idx] = val;
> -
> - return 0;
> -}
> -
> -static inline void post_interrupt(PVRDMADev *dev, unsigned vector)
> -{
> - PCIDevice *pci_dev = PCI_DEVICE(dev);
> -
> - if (likely(!dev->interrupt_mask)) {
> - dev->stats.interrupts++;
> - msix_notify(pci_dev, vector);
> - }
> -}
> -
> -int pvrdma_exec_cmd(PVRDMADev *dev);
> -
> -#endif
> diff --git a/hw/rdma/vmw/pvrdma_dev_ring.h b/hw/rdma/vmw/pvrdma_dev_ring.h
> deleted file mode 100644
> index d231588ce0..0000000000
> --- a/hw/rdma/vmw/pvrdma_dev_ring.h
> +++ /dev/null
> @@ -1,46 +0,0 @@
> -/*
> - * QEMU VMWARE paravirtual RDMA ring utilities
> - *
> - * Copyright (C) 2018 Oracle
> - * Copyright (C) 2018 Red Hat Inc
> - *
> - * Authors:
> - * Yuval Shaia <yuval.shaia at oracle.com>
> - * Marcel Apfelbaum <marcel at redhat.com>
> - *
> - * This work is licensed under the terms of the GNU GPL, version 2 or later.
> - * See the COPYING file in the top-level directory.
> - *
> - */
> -
> -#ifndef PVRDMA_DEV_RING_H
> -#define PVRDMA_DEV_RING_H
> -
> -
> -#define MAX_RING_NAME_SZ 32
> -
> -typedef struct PvrdmaRingState {
> - int prod_tail; /* producer tail */
> - int cons_head; /* consumer head */
> -} PvrdmaRingState;
> -
> -typedef struct PvrdmaRing {
> - char name[MAX_RING_NAME_SZ];
> - PCIDevice *dev;
> - uint32_t max_elems;
> - size_t elem_sz;
> - PvrdmaRingState *ring_state; /* used only for unmap */
> - int npages;
> - void **pages;
> -} PvrdmaRing;
> -
> -int pvrdma_ring_init(PvrdmaRing *ring, const char *name, PCIDevice *dev,
> - PvrdmaRingState *ring_state, uint32_t max_elems,
> - size_t elem_sz, dma_addr_t *tbl, uint32_t npages);
> -void *pvrdma_ring_next_elem_read(PvrdmaRing *ring);
> -void pvrdma_ring_read_inc(PvrdmaRing *ring);
> -void *pvrdma_ring_next_elem_write(PvrdmaRing *ring);
> -void pvrdma_ring_write_inc(PvrdmaRing *ring);
> -void pvrdma_ring_free(PvrdmaRing *ring);
> -
> -#endif
> diff --git a/hw/rdma/vmw/pvrdma_qp_ops.h b/hw/rdma/vmw/pvrdma_qp_ops.h
> deleted file mode 100644
> index bf2b15c5ce..0000000000
> --- a/hw/rdma/vmw/pvrdma_qp_ops.h
> +++ /dev/null
> @@ -1,28 +0,0 @@
> -/*
> - * QEMU VMWARE paravirtual RDMA QP Operations
> - *
> - * Copyright (C) 2018 Oracle
> - * Copyright (C) 2018 Red Hat Inc
> - *
> - * Authors:
> - * Yuval Shaia <yuval.shaia at oracle.com>
> - * Marcel Apfelbaum <marcel at redhat.com>
> - *
> - * This work is licensed under the terms of the GNU GPL, version 2 or later.
> - * See the COPYING file in the top-level directory.
> - *
> - */
> -
> -#ifndef PVRDMA_QP_OPS_H
> -#define PVRDMA_QP_OPS_H
> -
> -#include "pvrdma.h"
> -
> -int pvrdma_qp_ops_init(void);
> -void pvrdma_qp_ops_fini(void);
> -void pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle);
> -void pvrdma_qp_recv(PVRDMADev *dev, uint32_t qp_handle);
> -void pvrdma_srq_recv(PVRDMADev *dev, uint32_t srq_handle);
> -void pvrdma_cq_poll(RdmaDeviceResources *dev_res, uint32_t cq_handle);
> -
> -#endif
> diff --git a/hw/rdma/vmw/trace.h b/hw/rdma/vmw/trace.h
> deleted file mode 100644
> index 3ebc9fb7ad..0000000000
> --- a/hw/rdma/vmw/trace.h
> +++ /dev/null
> @@ -1 +0,0 @@
> -#include "trace/trace-hw_rdma_vmw.h"
> diff --git a/include/hw/rdma/rdma.h b/include/hw/rdma/rdma.h
> deleted file mode 100644
> index 80b2e531c4..0000000000
> --- a/include/hw/rdma/rdma.h
> +++ /dev/null
> @@ -1,37 +0,0 @@
> -/*
> - * RDMA device interface
> - *
> - * Copyright (C) 2019 Oracle
> - * Copyright (C) 2019 Red Hat Inc
> - *
> - * Authors:
> - * Yuval Shaia <yuval.shaia at oracle.com>
> - *
> - * This work is licensed under the terms of the GNU GPL, version 2 or later.
> - * See the COPYING file in the top-level directory.
> - *
> - */
> -
> -#ifndef RDMA_H
> -#define RDMA_H
> -
> -#include "qom/object.h"
> -
> -#define INTERFACE_RDMA_PROVIDER "rdma"
> -
> -typedef struct RdmaProviderClass RdmaProviderClass;
> -DECLARE_CLASS_CHECKERS(RdmaProviderClass, RDMA_PROVIDER,
> - INTERFACE_RDMA_PROVIDER)
> -#define RDMA_PROVIDER(obj) \
> - INTERFACE_CHECK(RdmaProvider, (obj), \
> - INTERFACE_RDMA_PROVIDER)
> -
> -typedef struct RdmaProvider RdmaProvider;
> -
> -struct RdmaProviderClass {
> - InterfaceClass parent;
> -
> - void (*format_statistics)(RdmaProvider *obj, GString *buf);
> -};
> -
> -#endif
> diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h
> index 13f9a2dedb..f4cf8f6717 100644
> --- a/include/monitor/hmp.h
> +++ b/include/monitor/hmp.h
> @@ -37,7 +37,6 @@ void hmp_info_spice(Monitor *mon, const QDict *qdict);
> void hmp_info_balloon(Monitor *mon, const QDict *qdict);
> void hmp_info_irq(Monitor *mon, const QDict *qdict);
> void hmp_info_pic(Monitor *mon, const QDict *qdict);
> -void hmp_info_rdma(Monitor *mon, const QDict *qdict);
> void hmp_info_pci(Monitor *mon, const QDict *qdict);
> void hmp_info_tpm(Monitor *mon, const QDict *qdict);
> void hmp_info_iothreads(Monitor *mon, const QDict *qdict);
> diff --git a/include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h b/include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h
> deleted file mode 100644
> index a5a1c8234e..0000000000
> --- a/include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h
> +++ /dev/null
> @@ -1,685 +0,0 @@
> -/*
> - * Copyright (c) 2012-2016 VMware, Inc. All rights reserved.
> - *
> - * This program is free software; you can redistribute it and/or
> - * modify it under the terms of EITHER the GNU General Public License
> - * version 2 as published by the Free Software Foundation or the BSD
> - * 2-Clause License. This program is distributed in the hope that it
> - * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED
> - * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
> - * See the GNU General Public License version 2 for more details at
> - * <http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html>