[v7,12/12] RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter

Message ID 1666034441-15424-13-git-send-email-longli@linuxonhyperv.com
State New
Headers
Series Introduce Microsoft Azure Network Adapter (MANA) RDMA driver |

Commit Message

longli@linuxonhyperv.com Oct. 17, 2022, 7:20 p.m. UTC
  From: Long Li <longli@microsoft.com>

Add a RDMA VF driver for Microsoft Azure Network Adapter (MANA).

Co-developed-by: Ajay Sharma <sharmaajay@microsoft.com>
Signed-off-by: Ajay Sharma <sharmaajay@microsoft.com>
Reviewed-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: Long Li <longli@microsoft.com>
---
Change log:
v2:
Changed coding sytles/formats
Checked undersize for udata length
Changed all logging to use ibdev_xxx()
Avoided page array copy when doing MR
Sorted driver ops
Fixed warnings reported by kernel test robot <lkp@intel.com>

v3:
More coding sytle/format changes

v4:
Process error on hardware vport configuration

v5:
Change licenses to GPL-2.0-only
Fix error handling in mana_ib_gd_create_dma_region()

v6:
rebased to rdma-next
removed redundant initialization to return value in mana_ib_probe()
added missing tabs at the end of mana_ib_gd_create_dma_region()

v7:
move mana_gd_destroy_doorbell_page() and mana_gd_allocate_doorbell_page() from GDMA to this driver
use ib_umem_find_best_pgsz() for finding page size for registering dma regions with hardware
fix a bug that may double free mana_ind_table in mana_ib_create_qp_rss()
add Ajay Sharma <sharmaajay@microsoft.com> to maintainer list
add details to description in drivers/infiniband/hw/mana/Kconfig
change multiple lines comments to use RDMA style from NETDEV style
change mana_ib_dev_ops to static
use module_auxiliary_driver() in place of module_init and module_exit
move all user-triggerable error messages to debug messages
check for ind_tbl_size overflow in mana_ib_create_qp_rss()

 MAINTAINERS                             |   9 +
 drivers/infiniband/Kconfig              |   1 +
 drivers/infiniband/hw/Makefile          |   1 +
 drivers/infiniband/hw/mana/Kconfig      |  10 +
 drivers/infiniband/hw/mana/Makefile     |   4 +
 drivers/infiniband/hw/mana/cq.c         |  79 ++++
 drivers/infiniband/hw/mana/device.c     | 117 ++++++
 drivers/infiniband/hw/mana/main.c       | 508 ++++++++++++++++++++++++
 drivers/infiniband/hw/mana/mana_ib.h    | 156 ++++++++
 drivers/infiniband/hw/mana/mr.c         | 200 ++++++++++
 drivers/infiniband/hw/mana/qp.c         | 505 +++++++++++++++++++++++
 drivers/infiniband/hw/mana/wq.c         | 115 ++++++
 include/net/mana/mana.h                 |   3 +
 include/uapi/rdma/ib_user_ioctl_verbs.h |   1 +
 include/uapi/rdma/mana-abi.h            |  66 +++
 15 files changed, 1775 insertions(+)
 create mode 100644 drivers/infiniband/hw/mana/Kconfig
 create mode 100644 drivers/infiniband/hw/mana/Makefile
 create mode 100644 drivers/infiniband/hw/mana/cq.c
 create mode 100644 drivers/infiniband/hw/mana/device.c
 create mode 100644 drivers/infiniband/hw/mana/main.c
 create mode 100644 drivers/infiniband/hw/mana/mana_ib.h
 create mode 100644 drivers/infiniband/hw/mana/mr.c
 create mode 100644 drivers/infiniband/hw/mana/qp.c
 create mode 100644 drivers/infiniband/hw/mana/wq.c
 create mode 100644 include/uapi/rdma/mana-abi.h
  

Comments

Paolo Abeni Oct. 18, 2022, 10:11 a.m. UTC | #1
On Mon, 2022-10-17 at 12:20 -0700, longli@linuxonhyperv.com wrote:
> diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
> new file mode 100644
> index 000000000000..57e5f9dca454
> --- /dev/null
> +++ b/drivers/infiniband/hw/mana/main.c

[...]

> +static int mana_gd_destroy_doorbell_page(struct gdma_context *gc,
> +					 int doorbell_page)
> +{
> +	struct gdma_destroy_resource_range_req req = {};
> +	struct gdma_resp_hdr resp = {};
> +	int err;
> +
> +	mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_RESOURCE_RANGE,
> +			     sizeof(req), sizeof(resp));
> +
> +	req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE;
> +	req.num_resources = 1;
> +	req.allocated_resources = doorbell_page;
> +
> +	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
> +	if (err || resp.status) {
> +		dev_err(gc->dev,
> +			"Failed to destroy doorbell page: ret %d, 0x%x\n",
> +			err, resp.status);
> +		return err ? err : -EPROTO;

Minor nit: the preferred style is:
		return err ?: -EPROTO;

a few other occurences below.

> +	}
> +
> +	return 0;
> +}

[...]

> +int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem,
> +				 mana_handle_t *gdma_region)
> +{
> +	struct gdma_dma_region_add_pages_req *add_req = NULL;
> +	struct gdma_create_dma_region_resp create_resp = {};
> +	struct gdma_create_dma_region_req *create_req;
> +	size_t num_pages_cur, num_pages_to_handle;
> +	unsigned int create_req_msg_size;
> +	struct hw_channel_context *hwc;
> +	struct ib_block_iter biter;
> +	size_t max_pgs_create_cmd;
> +	struct gdma_context *gc;
> +	size_t num_pages_total;
> +	struct gdma_dev *mdev;
> +	unsigned long page_sz;
> +	void *request_buf;
> +	unsigned int i;
> +	int err;
> +
> +	mdev = dev->gdma_dev;
> +	gc = mdev->gdma_context;
> +	hwc = gc->hwc.driver_data;
> +
> +	/* Hardware requires dma region to align to chosen page size */
> +	page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, 0);
> +	if (!page_sz) {
> +		ibdev_dbg(&dev->ib_dev, "failed to find page size.\n");
> +		return -ENOMEM;
> +	}
> +	num_pages_total = ib_umem_num_dma_blocks(umem, page_sz);
> +
> +	max_pgs_create_cmd =
> +		(hwc->max_req_msg_size - sizeof(*create_req)) / sizeof(u64);
> +	num_pages_to_handle =
> +		min_t(size_t, num_pages_total, max_pgs_create_cmd);
> +	create_req_msg_size =
> +		struct_size(create_req, page_addr_list, num_pages_to_handle);
> +
> +	request_buf = kzalloc(hwc->max_req_msg_size, GFP_KERNEL);
> +	if (!request_buf)
> +		return -ENOMEM;
> +
> +	create_req = request_buf;
> +	mana_gd_init_req_hdr(&create_req->hdr, GDMA_CREATE_DMA_REGION,
> +			     create_req_msg_size, sizeof(create_resp));
> +
> +	create_req->length = umem->length;
> +	create_req->offset_in_page = umem->address & (page_sz - 1);
> +	create_req->gdma_page_type = order_base_2(page_sz) - PAGE_SHIFT;
> +	create_req->page_count = num_pages_total;
> +	create_req->page_addr_list_len = num_pages_to_handle;
> +
> +	ibdev_dbg(&dev->ib_dev, "size_dma_region %lu num_pages_total %lu\n",
> +		  umem->length, num_pages_total);
> +
> +	ibdev_dbg(&dev->ib_dev, "page_sz %lu offset_in_page %u\n",
> +		  page_sz, create_req->offset_in_page);
> +
> +	ibdev_dbg(&dev->ib_dev, "num_pages_to_handle %lu, gdma_page_type %u",
> +		  num_pages_to_handle, create_req->gdma_page_type);
> +
> +	__rdma_umem_block_iter_start(&biter, umem, page_sz);
> +
> +	for (i = 0; i < num_pages_to_handle; ++i) {
> +		dma_addr_t cur_addr;
> +
> +		__rdma_block_iter_next(&biter);
> +		cur_addr = rdma_block_iter_dma_address(&biter);
> +
> +		create_req->page_addr_list[i] = cur_addr;
> +	}
> +
> +	err = mana_gd_send_request(gc, create_req_msg_size, create_req,
> +				   sizeof(create_resp), &create_resp);
> +	if (err || create_resp.hdr.status) {
> +		ibdev_dbg(&dev->ib_dev,
> +			  "Failed to create DMA region: %d, 0x%x\n", err,
> +			  create_resp.hdr.status);
> +		if (!err)
> +			err = -EPROTO;
> +
> +		kfree(request_buf);
> +		return err;

Minor nit: you can avoid a little code doplication replacing the above
2 lines with:
		goto out;

and ...

> +	}
> +
> +	*gdma_region = create_resp.dma_region_handle;
> +	ibdev_dbg(&dev->ib_dev, "Created DMA region with handle 0x%llx\n",
> +		  *gdma_region);
> +
> +	num_pages_cur = num_pages_to_handle;
> +
> +	if (num_pages_cur < num_pages_total) {
> +		unsigned int add_req_msg_size;
> +		size_t max_pgs_add_cmd =
> +			(hwc->max_req_msg_size - sizeof(*add_req)) /
> +			sizeof(u64);
> +
> +		num_pages_to_handle =
> +			min_t(size_t, num_pages_total - num_pages_cur,
> +			      max_pgs_add_cmd);
> +
> +		/* Calculate the max num of pages that will be handled */
> +		add_req_msg_size = struct_size(add_req, page_addr_list,
> +					       num_pages_to_handle);
> +		add_req = request_buf;
> +
> +		while (num_pages_cur < num_pages_total) {
> +			struct gdma_general_resp add_resp = {};
> +			u32 expected_status = 0;
> +
> +			if (num_pages_cur + num_pages_to_handle <
> +			    num_pages_total) {
> +				/* Status indicating more pages are needed */
> +				expected_status = GDMA_STATUS_MORE_ENTRIES;
> +			}
> +
> +			memset(add_req, 0, add_req_msg_size);
> +
> +			mana_gd_init_req_hdr(&add_req->hdr,
> +					     GDMA_DMA_REGION_ADD_PAGES,
> +					     add_req_msg_size,
> +					     sizeof(add_resp));
> +			add_req->dma_region_handle = *gdma_region;
> +			add_req->page_addr_list_len = num_pages_to_handle;
> +
> +			for (i = 0; i < num_pages_to_handle; ++i) {
> +				dma_addr_t cur_addr =
> +					rdma_block_iter_dma_address(&biter);
> +				add_req->page_addr_list[i] = cur_addr;
> +				__rdma_block_iter_next(&biter);
> +
> +				ibdev_dbg(&dev->ib_dev,
> +					  "page_addr_list %lu addr 0x%llx\n",
> +					  num_pages_cur + i, cur_addr);
> +			}
> +
> +			err = mana_gd_send_request(gc, add_req_msg_size,
> +						   add_req, sizeof(add_resp),
> +						   &add_resp);
> +			if (err || add_resp.hdr.status != expected_status) {
> +				ibdev_dbg(&dev->ib_dev,
> +					  "Failed put DMA pages %u: %d,0x%x\n",
> +					  i, err, add_resp.hdr.status);
> +				err = -EPROTO;
> +				break;
> +			}
> +
> +			num_pages_cur += num_pages_to_handle;
> +			num_pages_to_handle =
> +				min_t(size_t, num_pages_total - num_pages_cur,
> +				      max_pgs_add_cmd);
> +			add_req_msg_size = sizeof(*add_req) +
> +					   num_pages_to_handle * sizeof(u64);
> +		}
> +	}
> +
> +	kfree(request_buf);
> +
> +	if (err)
> +		mana_ib_gd_destroy_dma_region(dev, create_resp.dma_region_handle);

... here:

	if (err)
		mana_ib_gd_destroy_dma_region(dev, create_resp.dma_region_handle);

out:
	kfree(request_buf);

> +
> +	return err;
> +}

[...]

> diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c
> new file mode 100644
> index 000000000000..fec7d4a06ace
> --- /dev/null
> +++ b/drivers/infiniband/hw/mana/qp.c
> @@ -0,0 +1,505 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (c) 2022, Microsoft Corporation. All rights reserved.
> + */
> +
> +#include "mana_ib.h"
> +
> +static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev,
> +				      struct net_device *ndev,
> +				      mana_handle_t default_rxobj,
> +				      mana_handle_t ind_table[],
> +				      u32 log_ind_tbl_size, u32 rx_hash_key_len,
> +				      u8 *rx_hash_key)
> +{
> +	struct mana_port_context *mpc = netdev_priv(ndev);
> +	struct mana_cfg_rx_steer_req *req = NULL;
> +	struct mana_cfg_rx_steer_resp resp = {};
> +	mana_handle_t *req_indir_tab;
> +	struct gdma_context *gc;
> +	struct gdma_dev *mdev;
> +	u32 req_buf_size;
> +	int i, err;
> +
> +	mdev = dev->gdma_dev;
> +	gc = mdev->gdma_context;
> +
> +	req_buf_size =
> +		sizeof(*req) + sizeof(mana_handle_t) * MANA_INDIRECT_TABLE_SIZE;
> +	req = kzalloc(req_buf_size, GFP_KERNEL);
> +	if (!req)
> +		return -ENOMEM;
> +
> +	mana_gd_init_req_hdr(&req->hdr, MANA_CONFIG_VPORT_RX, req_buf_size,
> +			     sizeof(resp));
> +
> +	req->vport = mpc->port_handle;
> +	req->rx_enable = 1;
> +	req->update_default_rxobj = 1;
> +	req->default_rxobj = default_rxobj;
> +	req->hdr.dev_id = mdev->dev_id;
> +
> +	/* If there are more than 1 entries in indirection table, enable RSS */
> +	if (log_ind_tbl_size)
> +		req->rss_enable = true;
> +
> +	req->num_indir_entries = MANA_INDIRECT_TABLE_SIZE;
> +	req->indir_tab_offset = sizeof(*req);
> +	req->update_indir_tab = true;
> +
> +	req_indir_tab = (mana_handle_t *)(req + 1);
> +	/* The ind table passed to the hardware must have
> +	 * MANA_INDIRECT_TABLE_SIZE entries. Adjust the verb
> +	 * ind_table to MANA_INDIRECT_TABLE_SIZE if required
> +	 */
> +	ibdev_dbg(&dev->ib_dev, "ind table size %u\n", 1 << log_ind_tbl_size);
> +	for (i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++) {
> +		req_indir_tab[i] = ind_table[i % (1 << log_ind_tbl_size)];
> +		ibdev_dbg(&dev->ib_dev, "index %u handle 0x%llx\n", i,
> +			  req_indir_tab[i]);
> +	}
> +
> +	req->update_hashkey = true;
> +	if (rx_hash_key_len)
> +		memcpy(req->hashkey, rx_hash_key, rx_hash_key_len);
> +	else
> +		netdev_rss_key_fill(req->hashkey, MANA_HASH_KEY_SIZE);
> +
> +	ibdev_dbg(&dev->ib_dev, "vport handle %llu default_rxobj 0x%llx\n",
> +		  req->vport, default_rxobj);
> +
> +	err = mana_gd_send_request(gc, req_buf_size, req, sizeof(resp), &resp);
> +	if (err) {
> +		netdev_err(ndev, "Failed to configure vPort RX: %d\n", err);
> +		goto out;
> +	}
> +
> +	if (resp.hdr.status) {
> +		netdev_err(ndev, "vPort RX configuration failed: 0x%x\n",
> +			   resp.hdr.status);
> +		err = -EPROTO;

This is confusing: if this error condition is reached, both error and
succesful configuration will be logged. I guess an additional:

		goto out;

is needed.

> +	}
> +
> +	netdev_info(ndev, "Configured steering vPort %llu log_entries %u\n",
> +		    mpc->port_handle, log_ind_tbl_size);
> +
> +out:
> +	kfree(req);
> +	return err;
> +}
> +
> +static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
> +				 struct ib_qp_init_attr *attr,
> +				 struct ib_udata *udata)
> +{
> +	struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp);
> +	struct mana_ib_dev *mdev =
> +		container_of(pd->device, struct mana_ib_dev, ib_dev);
> +	struct ib_rwq_ind_table *ind_tbl = attr->rwq_ind_tbl;
> +	struct mana_ib_create_qp_rss_resp resp = {};
> +	struct mana_ib_create_qp_rss ucmd = {};
> +	struct gdma_dev *gd = mdev->gdma_dev;
> +	mana_handle_t *mana_ind_table;
> +	struct mana_port_context *mpc;
> +	struct mana_context *mc;
> +	struct net_device *ndev;
> +	struct mana_ib_cq *cq;
> +	struct mana_ib_wq *wq;
> +	unsigned int ind_tbl_size;
> +	struct ib_cq *ibcq;
> +	struct ib_wq *ibwq;
> +	u32 port;
> +	int ret;
> +	int i;

This causes a build warning with clang:

../drivers/infiniband/hw/mana/qp.c:172:6: warning: variable 'i' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized]
        if (!mana_ind_table) {
            ^~~~~~~~~~~~~~~
../drivers/infiniband/hw/mana/qp.c:241:9: note: uninitialized use occurs here
        while (i-- > 0) {
               ^
../drivers/infiniband/hw/mana/qp.c:172:2: note: remove the 'if' if its condition is always false
        if (!mana_ind_table) {
        ^~~~~~~~~~~~~~~~~~~~~~
../drivers/infiniband/hw/mana/qp.c:113:7: note: initialize the variable 'i' to silence this warning
        int i;



> +
> +	mc = gd->driver_data;
> +
> +	if (!udata || udata->inlen < sizeof(ucmd))
> +		return -EINVAL;
> +
> +	ret = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen));
> +	if (ret) {
> +		ibdev_dbg(&mdev->ib_dev,
> +			  "Failed copy from udata for create rss-qp, err %d\n",
> +			  ret);
> +		return -EFAULT;
> +	}
> +
> +	if (attr->cap.max_recv_wr > MAX_SEND_BUFFERS_PER_QUEUE) {
> +		ibdev_dbg(&mdev->ib_dev,
> +			  "Requested max_recv_wr %d exceeding limit\n",
> +			  attr->cap.max_recv_wr);
> +		return -EINVAL;
> +	}
> +
> +	if (attr->cap.max_recv_sge > MAX_RX_WQE_SGL_ENTRIES) {
> +		ibdev_dbg(&mdev->ib_dev,
> +			  "Requested max_recv_sge %d exceeding limit\n",
> +			  attr->cap.max_recv_sge);
> +		return -EINVAL;
> +	}
> +
> +	ind_tbl_size = 1 << ind_tbl->log_ind_tbl_size;
> +	if (ind_tbl_size > MANA_INDIRECT_TABLE_SIZE) {
> +		ibdev_dbg(&mdev->ib_dev,
> +			  "Indirect table size %d exceeding limit\n",
> +			  ind_tbl_size);
> +		return -EINVAL;
> +	}
> +
> +	if (ucmd.rx_hash_function != MANA_IB_RX_HASH_FUNC_TOEPLITZ) {
> +		ibdev_dbg(&mdev->ib_dev,
> +			  "RX Hash function is not supported, %d\n",
> +			  ucmd.rx_hash_function);
> +		return -EINVAL;
> +	}
> +
> +	/* IB ports start with 1, MANA start with 0 */
> +	port = ucmd.port;
> +	if (port < 1 || port > mc->num_ports) {
> +		ibdev_dbg(&mdev->ib_dev, "Invalid port %u in creating qp\n",
> +			  port);
> +		return -EINVAL;
> +	}
> +	ndev = mc->ports[port - 1];
> +	mpc = netdev_priv(ndev);
> +
> +	ibdev_dbg(&mdev->ib_dev, "rx_hash_function %d port %d\n",
> +		  ucmd.rx_hash_function, port);
> +
> +	mana_ind_table = kcalloc(ind_tbl_size, sizeof(mana_handle_t),
> +				 GFP_KERNEL);
> +	if (!mana_ind_table) {
> +		ret = -ENOMEM;
> +		goto fail;
> +	}
> +
> +	qp->port = port;
> +
> +	for (i = 0; i < ind_tbl_size; i++) {
> +		struct mana_obj_spec wq_spec = {};
> +		struct mana_obj_spec cq_spec = {};
> +
> +		ibwq = ind_tbl->ind_tbl[i];
> +		wq = container_of(ibwq, struct mana_ib_wq, ibwq);
> +
> +		ibcq = ibwq->cq;
> +		cq = container_of(ibcq, struct mana_ib_cq, ibcq);
> +
> +		wq_spec.gdma_region = wq->gdma_region;
> +		wq_spec.queue_size = wq->wq_buf_size;
> +
> +		cq_spec.gdma_region = cq->gdma_region;
> +		cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE;
> +		cq_spec.modr_ctx_id = 0;
> +		cq_spec.attached_eq = GDMA_CQ_NO_EQ;
> +
> +		ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ,
> +					 &wq_spec, &cq_spec, &wq->rx_object);
> +		if (ret)
> +			goto fail;
> +
> +		/* The GDMA regions are now owned by the WQ object */
> +		wq->gdma_region = GDMA_INVALID_DMA_REGION;
> +		cq->gdma_region = GDMA_INVALID_DMA_REGION;
> +
> +		wq->id = wq_spec.queue_index;
> +		cq->id = cq_spec.queue_index;
> +
> +		ibdev_dbg(&mdev->ib_dev,
> +			  "ret %d rx_object 0x%llx wq id %llu cq id %llu\n",
> +			  ret, wq->rx_object, wq->id, cq->id);
> +
> +		resp.entries[i].cqid = cq->id;
> +		resp.entries[i].wqid = wq->id;
> +
> +		mana_ind_table[i] = wq->rx_object;
> +	}
> +	resp.num_entries = i;
> +
> +	ret = mana_ib_cfg_vport_steering(mdev, ndev, wq->rx_object,
> +					 mana_ind_table,
> +					 ind_tbl->log_ind_tbl_size,
> +					 ucmd.rx_hash_key_len,
> +					 ucmd.rx_hash_key);
> +	if (ret)
> +		goto fail;
> +
> +	ret = ib_copy_to_udata(udata, &resp, sizeof(resp));
> +	if (ret) {
> +		ibdev_dbg(&mdev->ib_dev,
> +			  "Failed to copy to udata create rss-qp, %d\n",
> +			  ret);
> +		goto fail;
> +	}
> +
> +	kfree(mana_ind_table);
> +
> +	return 0;
> +
> +fail:
> +	while (i-- > 0) {
> +		ibwq = ind_tbl->ind_tbl[i];
> +		wq = container_of(ibwq, struct mana_ib_wq, ibwq);
> +		mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object);
> +	}
> +
> +	kfree(mana_ind_table);
> +
> +	return ret;
> +}


Cheers,

Paolo
  
Yunsheng Lin Oct. 19, 2022, 1:17 a.m. UTC | #2
On 2022/10/18 3:20, longli@linuxonhyperv.com wrote:
> From: Long Li <longli@microsoft.com>
> 
> Add a RDMA VF driver for Microsoft Azure Network Adapter (MANA).
> 
> Co-developed-by: Ajay Sharma <sharmaajay@microsoft.com>
> Signed-off-by: Ajay Sharma <sharmaajay@microsoft.com>
> Reviewed-by: Dexuan Cui <decui@microsoft.com>
> Signed-off-by: Long Li <longli@microsoft.com>
> ---
> Change log:
> v2:
> Changed coding sytles/formats
> Checked undersize for udata length
> Changed all logging to use ibdev_xxx()
> Avoided page array copy when doing MR
> Sorted driver ops
> Fixed warnings reported by kernel test robot <lkp@intel.com>
> 
> v3:
> More coding sytle/format changes
> 
> v4:
> Process error on hardware vport configuration
> 
> v5:
> Change licenses to GPL-2.0-only
> Fix error handling in mana_ib_gd_create_dma_region()
> 
> v6:
> rebased to rdma-next
> removed redundant initialization to return value in mana_ib_probe()
> added missing tabs at the end of mana_ib_gd_create_dma_region()
> 
> v7:
> move mana_gd_destroy_doorbell_page() and mana_gd_allocate_doorbell_page() from GDMA to this driver
> use ib_umem_find_best_pgsz() for finding page size for registering dma regions with hardware
> fix a bug that may double free mana_ind_table in mana_ib_create_qp_rss()
> add Ajay Sharma <sharmaajay@microsoft.com> to maintainer list
> add details to description in drivers/infiniband/hw/mana/Kconfig
> change multiple lines comments to use RDMA style from NETDEV style
> change mana_ib_dev_ops to static
> use module_auxiliary_driver() in place of module_init and module_exit
> move all user-triggerable error messages to debug messages
> check for ind_tbl_size overflow in mana_ib_create_qp_rss()
> 
>  MAINTAINERS                             |   9 +
>  drivers/infiniband/Kconfig              |   1 +
>  drivers/infiniband/hw/Makefile          |   1 +
>  drivers/infiniband/hw/mana/Kconfig      |  10 +
>  drivers/infiniband/hw/mana/Makefile     |   4 +
>  drivers/infiniband/hw/mana/cq.c         |  79 ++++
>  drivers/infiniband/hw/mana/device.c     | 117 ++++++
>  drivers/infiniband/hw/mana/main.c       | 508 ++++++++++++++++++++++++
>  drivers/infiniband/hw/mana/mana_ib.h    | 156 ++++++++
>  drivers/infiniband/hw/mana/mr.c         | 200 ++++++++++
>  drivers/infiniband/hw/mana/qp.c         | 505 +++++++++++++++++++++++
>  drivers/infiniband/hw/mana/wq.c         | 115 ++++++
>  include/net/mana/mana.h                 |   3 +
>  include/uapi/rdma/ib_user_ioctl_verbs.h |   1 +
>  include/uapi/rdma/mana-abi.h            |  66 +++
>  15 files changed, 1775 insertions(+)
>  create mode 100644 drivers/infiniband/hw/mana/Kconfig
>  create mode 100644 drivers/infiniband/hw/mana/Makefile
>  create mode 100644 drivers/infiniband/hw/mana/cq.c
>  create mode 100644 drivers/infiniband/hw/mana/device.c
>  create mode 100644 drivers/infiniband/hw/mana/main.c
>  create mode 100644 drivers/infiniband/hw/mana/mana_ib.h
>  create mode 100644 drivers/infiniband/hw/mana/mr.c
>  create mode 100644 drivers/infiniband/hw/mana/qp.c
>  create mode 100644 drivers/infiniband/hw/mana/wq.c
>  create mode 100644 include/uapi/rdma/mana-abi.h
> 

[...]

> +
> +int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct mana_ib_pd *pd,
> +		      u32 doorbell_id)
> +{
> +	struct gdma_dev *mdev = dev->gdma_dev;
> +	struct mana_port_context *mpc;
> +	struct mana_context *mc;
> +	struct net_device *ndev;
> +	int err;
> +
> +	mc = mdev->driver_data;
> +	ndev = mc->ports[port];
> +	mpc = netdev_priv(ndev);
> +
> +	mutex_lock(&pd->vport_mutex);
> +
> +	pd->vport_use_count++;
> +	if (pd->vport_use_count > 1) {
> +		ibdev_dbg(&dev->ib_dev,
> +			  "Skip as this PD is already configured vport\n");
> +		mutex_unlock(&pd->vport_mutex);
> +		return 0;
> +	}
> +	mutex_unlock(&pd->vport_mutex);
> +
> +	err = mana_cfg_vport(mpc, pd->pdn, doorbell_id);
> +	if (err) {
> +		mutex_lock(&pd->vport_mutex);
> +		pd->vport_use_count--;
> +		mutex_unlock(&pd->vport_mutex);

It seems there might be a race between the "pd->vport_use_count > 1"
checking above and the error handling here, it may cause other user using a
unconfigured vport if other user is checking the "pd->vport_use_count > 1)"
while mana_cfg_vport() fails before doing "pd->vport_use_count--".

> +
> +		ibdev_dbg(&dev->ib_dev, "Failed to configure vPort %d\n", err);
> +		return err;
> +	}
> +
> +	pd->tx_shortform_allowed = mpc->tx_shortform_allowed;
> +	pd->tx_vp_offset = mpc->tx_vp_offset;
> +
> +	ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n",
> +		  mpc->port_handle, pd->pdn, doorbell_id);
> +
> +	return 0;
> +}
> +

[...]

> +
> +static int mana_gd_allocate_doorbell_page(struct gdma_context *gc,
> +					  int *doorbell_page)
> +{
> +	struct gdma_allocate_resource_range_req req = {};
> +	struct gdma_allocate_resource_range_resp resp = {};
> +	int err;
> +
> +	mana_gd_init_req_hdr(&req.hdr, GDMA_ALLOCATE_RESOURCE_RANGE,
> +			     sizeof(req), sizeof(resp));
> +
> +	req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE;
> +	req.num_resources = 1;
> +	req.alignment = 1;
> +
> +	/* Have GDMA start searching from 0 */
> +	req.allocated_resources = 0;
> +
> +	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
> +	if (err || resp.hdr.status) {
> +		dev_err(gc->dev,
> +			"Failed to allocate doorbell page: ret %d, 0x%x\n",
> +			err, resp.hdr.status);
> +		return err ? err : -EPROTO;
> +	}
> +
> +	*doorbell_page = resp.allocated_resources;
> +
> +	return 0;
> +}
> +
> +int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext,
> +			   struct ib_udata *udata)
> +{
> +	struct mana_ib_ucontext *ucontext =
> +		container_of(ibcontext, struct mana_ib_ucontext, ibucontext);
> +	struct ib_device *ibdev = ibcontext->device;
> +	struct mana_ib_dev *mdev;
> +	struct gdma_context *gc;
> +	struct gdma_dev *dev;
> +	int doorbell_page;
> +	int ret;
> +
> +	mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> +	dev = mdev->gdma_dev;
> +	gc = dev->gdma_context;
> +
> +	/* Allocate a doorbell page index */
> +	ret = mana_gd_allocate_doorbell_page(gc, &doorbell_page);
> +	if (ret) {
> +		ibdev_dbg(ibdev, "Failed to allocate doorbell page %d\n", ret);
> +		return -ENOMEM;

It does not make much sense to do "err ? err : -EPROTO" in
mana_gd_allocate_doorbell_page() if -ENOMEM is returned unconditionally
here.

> +	}
> +
> +	ibdev_dbg(ibdev, "Doorbell page allocated %d\n", doorbell_page);
> +
> +	ucontext->doorbell = doorbell_page;
> +
> +	return 0;
> +}
> +

[...]

> diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h
> new file mode 100644
> index 000000000000..2225a6d6f8e1
> --- /dev/null
> +++ b/drivers/infiniband/hw/mana/mana_ib.h
> @@ -0,0 +1,156 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) 2022 Microsoft Corporation. All rights reserved.
> + */
> +
> +#ifndef _MANA_IB_H_
> +#define _MANA_IB_H_
> +
> +#include <rdma/ib_verbs.h>
> +#include <rdma/ib_mad.h>
> +#include <rdma/ib_umem.h>
> +#include <rdma/mana-abi.h>
> +#include <rdma/uverbs_ioctl.h>
> +
> +#include <net/mana/mana.h>
> +
> +#define PAGE_SZ_BM                                                             \
> +	(SZ_4K | SZ_8K | SZ_16K | SZ_32K | SZ_64K | SZ_128K | SZ_256K |        \
> +	 SZ_512K | SZ_1M | SZ_2M)
> +
> +/* MANA doesn't have any limit for MR size */
> +#define MANA_IB_MAX_MR_SIZE ((u64)(~(0ULL)))

Use U64_MAX?

> +
> +struct mana_ib_dev {
> +	struct ib_device ib_dev;
> +	struct gdma_dev *gdma_dev;
> +};
> +

[...]

> diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c
> new file mode 100644
> index 000000000000..09124dd1792d
> --- /dev/null
> +++ b/drivers/infiniband/hw/mana/mr.c
> @@ -0,0 +1,200 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (c) 2022, Microsoft Corporation. All rights reserved.
> + */
> +
> +#include "mana_ib.h"
> +
> +#define VALID_MR_FLAGS                                                         \
> +	(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ)
> +
> +static enum gdma_mr_access_flags
> +mana_ib_verbs_to_gdma_access_flags(int access_flags)
> +{
> +	enum gdma_mr_access_flags flags = GDMA_ACCESS_FLAG_LOCAL_READ;
> +
> +	if (access_flags & IB_ACCESS_LOCAL_WRITE)
> +		flags |= GDMA_ACCESS_FLAG_LOCAL_WRITE;
> +
> +	if (access_flags & IB_ACCESS_REMOTE_WRITE)
> +		flags |= GDMA_ACCESS_FLAG_REMOTE_WRITE;
> +
> +	if (access_flags & IB_ACCESS_REMOTE_READ)
> +		flags |= GDMA_ACCESS_FLAG_REMOTE_READ;
> +
> +	return flags;
> +}
> +
> +static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr,
> +				struct gdma_create_mr_params *mr_params)
> +{
> +	struct gdma_create_mr_response resp = {};
> +	struct gdma_create_mr_request req = {};
> +	struct gdma_dev *mdev = dev->gdma_dev;
> +	struct gdma_context *gc;
> +	int err;
> +
> +	gc = mdev->gdma_context;
> +
> +	mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_MR, sizeof(req),
> +			     sizeof(resp));
> +	req.pd_handle = mr_params->pd_handle;
> +	req.mr_type = mr_params->mr_type;
> +
> +	switch (mr_params->mr_type) {
> +	case GDMA_MR_TYPE_GVA:
> +		req.gva.dma_region_handle = mr_params->gva.dma_region_handle;
> +		req.gva.virtual_address = mr_params->gva.virtual_address;
> +		req.gva.access_flags = mr_params->gva.access_flags;
> +		break;
> +
> +	default:
> +		ibdev_dbg(&dev->ib_dev,
> +			  "invalid param (GDMA_MR_TYPE) passed, type %d\n",
> +			  req.mr_type);
> +		err = -EINVAL;
> +		goto error;
> +	}
> +
> +	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
> +
> +	if (err || resp.hdr.status) {
> +		ibdev_dbg(&dev->ib_dev, "Failed to create mr %d, %u", err,
> +			  resp.hdr.status);
> +		if (!err)
> +			err = -EPROTO;
> +
> +		goto error;
> +	}
> +
> +	mr->ibmr.lkey = resp.lkey;
> +	mr->ibmr.rkey = resp.rkey;
> +	mr->mr_handle = resp.mr_handle;
> +
> +	return 0;
> +error:
> +	return err;

There is no error handling here, maybe just return error directly instead of
a goto.

> +}
> +
> +static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev, gdma_obj_handle_t mr_handle)
> +{
> +	struct gdma_destroy_mr_response resp = {};
> +	struct gdma_destroy_mr_request req = {};
> +	struct gdma_dev *mdev = dev->gdma_dev;
> +	struct gdma_context *gc;
> +	int err;
> +
> +	gc = mdev->gdma_context;
> +
> +	mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_MR, sizeof(req),
> +			     sizeof(resp));
> +
> +	req.mr_handle = mr_handle;
> +
> +	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
> +	if (err || resp.hdr.status) {
> +		dev_err(gc->dev, "Failed to destroy MR: %d, 0x%x\n", err,
> +			resp.hdr.status);
> +		if (!err)
> +			err = -EPROTO;
> +		return err;
> +	}
> +
> +	return 0;
> +}
> +

[...]

> +
> +static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
> +				 struct ib_qp_init_attr *attr,
> +				 struct ib_udata *udata)
> +{
> +	struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp);
> +	struct mana_ib_dev *mdev =
> +		container_of(pd->device, struct mana_ib_dev, ib_dev);
> +	struct ib_rwq_ind_table *ind_tbl = attr->rwq_ind_tbl;
> +	struct mana_ib_create_qp_rss_resp resp = {};
> +	struct mana_ib_create_qp_rss ucmd = {};
> +	struct gdma_dev *gd = mdev->gdma_dev;
> +	mana_handle_t *mana_ind_table;
> +	struct mana_port_context *mpc;
> +	struct mana_context *mc;
> +	struct net_device *ndev;
> +	struct mana_ib_cq *cq;
> +	struct mana_ib_wq *wq;
> +	unsigned int ind_tbl_size;
> +	struct ib_cq *ibcq;
> +	struct ib_wq *ibwq;
> +	u32 port;
> +	int ret;
> +	int i;
> +
> +	mc = gd->driver_data;
> +
> +	if (!udata || udata->inlen < sizeof(ucmd))
> +		return -EINVAL;
> +
> +	ret = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen));
> +	if (ret) {
> +		ibdev_dbg(&mdev->ib_dev,
> +			  "Failed copy from udata for create rss-qp, err %d\n",
> +			  ret);
> +		return -EFAULT;

Why not just return 'ret' directly?
  
Long Li Oct. 19, 2022, 6:35 p.m. UTC | #3
> Subject: Re: [Patch v7 12/12] RDMA/mana_ib: Add a driver for Microsoft
> Azure Network Adapter
> 
> On Mon, 2022-10-17 at 12:20 -0700, longli@linuxonhyperv.com wrote:
> > diff --git a/drivers/infiniband/hw/mana/main.c
> > b/drivers/infiniband/hw/mana/main.c
> > new file mode 100644
> > index 000000000000..57e5f9dca454
> > --- /dev/null
> > +++ b/drivers/infiniband/hw/mana/main.c
> 
> [...]
> 
> > +static int mana_gd_destroy_doorbell_page(struct gdma_context *gc,
> > +					 int doorbell_page)
> > +{
> > +	struct gdma_destroy_resource_range_req req = {};
> > +	struct gdma_resp_hdr resp = {};
> > +	int err;
> > +
> > +	mana_gd_init_req_hdr(&req.hdr,
> GDMA_DESTROY_RESOURCE_RANGE,
> > +			     sizeof(req), sizeof(resp));
> > +
> > +	req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE;
> > +	req.num_resources = 1;
> > +	req.allocated_resources = doorbell_page;
> > +
> > +	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp),
> &resp);
> > +	if (err || resp.status) {
> > +		dev_err(gc->dev,
> > +			"Failed to destroy doorbell page: ret %d, 0x%x\n",
> > +			err, resp.status);
> > +		return err ? err : -EPROTO;
> 
> Minor nit: the preferred style is:
> 		return err ?: -EPROTO;
> 
> a few other occurences below.

Will change this.

> 
> > +	}
> > +
> > +	return 0;
> > +}
> 
> [...]
> 
> > +int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct
> ib_umem *umem,
> > +				 mana_handle_t *gdma_region)
> > +{
> > +	struct gdma_dma_region_add_pages_req *add_req = NULL;
> > +	struct gdma_create_dma_region_resp create_resp = {};
> > +	struct gdma_create_dma_region_req *create_req;
> > +	size_t num_pages_cur, num_pages_to_handle;
> > +	unsigned int create_req_msg_size;
> > +	struct hw_channel_context *hwc;
> > +	struct ib_block_iter biter;
> > +	size_t max_pgs_create_cmd;
> > +	struct gdma_context *gc;
> > +	size_t num_pages_total;
> > +	struct gdma_dev *mdev;
> > +	unsigned long page_sz;
> > +	void *request_buf;
> > +	unsigned int i;
> > +	int err;
> > +
> > +	mdev = dev->gdma_dev;
> > +	gc = mdev->gdma_context;
> > +	hwc = gc->hwc.driver_data;
> > +
> > +	/* Hardware requires dma region to align to chosen page size */
> > +	page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, 0);
> > +	if (!page_sz) {
> > +		ibdev_dbg(&dev->ib_dev, "failed to find page size.\n");
> > +		return -ENOMEM;
> > +	}
> > +	num_pages_total = ib_umem_num_dma_blocks(umem, page_sz);
> > +
> > +	max_pgs_create_cmd =
> > +		(hwc->max_req_msg_size - sizeof(*create_req)) /
> sizeof(u64);
> > +	num_pages_to_handle =
> > +		min_t(size_t, num_pages_total, max_pgs_create_cmd);
> > +	create_req_msg_size =
> > +		struct_size(create_req, page_addr_list,
> num_pages_to_handle);
> > +
> > +	request_buf = kzalloc(hwc->max_req_msg_size, GFP_KERNEL);
> > +	if (!request_buf)
> > +		return -ENOMEM;
> > +
> > +	create_req = request_buf;
> > +	mana_gd_init_req_hdr(&create_req->hdr,
> GDMA_CREATE_DMA_REGION,
> > +			     create_req_msg_size, sizeof(create_resp));
> > +
> > +	create_req->length = umem->length;
> > +	create_req->offset_in_page = umem->address & (page_sz - 1);
> > +	create_req->gdma_page_type = order_base_2(page_sz) -
> PAGE_SHIFT;
> > +	create_req->page_count = num_pages_total;
> > +	create_req->page_addr_list_len = num_pages_to_handle;
> > +
> > +	ibdev_dbg(&dev->ib_dev, "size_dma_region %lu
> num_pages_total %lu\n",
> > +		  umem->length, num_pages_total);
> > +
> > +	ibdev_dbg(&dev->ib_dev, "page_sz %lu offset_in_page %u\n",
> > +		  page_sz, create_req->offset_in_page);
> > +
> > +	ibdev_dbg(&dev->ib_dev, "num_pages_to_handle %lu,
> gdma_page_type %u",
> > +		  num_pages_to_handle, create_req->gdma_page_type);
> > +
> > +	__rdma_umem_block_iter_start(&biter, umem, page_sz);
> > +
> > +	for (i = 0; i < num_pages_to_handle; ++i) {
> > +		dma_addr_t cur_addr;
> > +
> > +		__rdma_block_iter_next(&biter);
> > +		cur_addr = rdma_block_iter_dma_address(&biter);
> > +
> > +		create_req->page_addr_list[i] = cur_addr;
> > +	}
> > +
> > +	err = mana_gd_send_request(gc, create_req_msg_size, create_req,
> > +				   sizeof(create_resp), &create_resp);
> > +	if (err || create_resp.hdr.status) {
> > +		ibdev_dbg(&dev->ib_dev,
> > +			  "Failed to create DMA region: %d, 0x%x\n", err,
> > +			  create_resp.hdr.status);
> > +		if (!err)
> > +			err = -EPROTO;
> > +
> > +		kfree(request_buf);
> > +		return err;
> 
> Minor nit: you can avoid a little code doplication replacing the above
> 2 lines with:
> 		goto out;

Good suggestion, will make the change.

> 
> and ...
> 
> > +	}
> > +
> > +	*gdma_region = create_resp.dma_region_handle;
> > +	ibdev_dbg(&dev->ib_dev, "Created DMA region with handle
> 0x%llx\n",
> > +		  *gdma_region);
> > +
> > +	num_pages_cur = num_pages_to_handle;
> > +
> > +	if (num_pages_cur < num_pages_total) {
> > +		unsigned int add_req_msg_size;
> > +		size_t max_pgs_add_cmd =
> > +			(hwc->max_req_msg_size - sizeof(*add_req)) /
> > +			sizeof(u64);
> > +
> > +		num_pages_to_handle =
> > +			min_t(size_t, num_pages_total - num_pages_cur,
> > +			      max_pgs_add_cmd);
> > +
> > +		/* Calculate the max num of pages that will be handled */
> > +		add_req_msg_size = struct_size(add_req, page_addr_list,
> > +					       num_pages_to_handle);
> > +		add_req = request_buf;
> > +
> > +		while (num_pages_cur < num_pages_total) {
> > +			struct gdma_general_resp add_resp = {};
> > +			u32 expected_status = 0;
> > +
> > +			if (num_pages_cur + num_pages_to_handle <
> > +			    num_pages_total) {
> > +				/* Status indicating more pages are needed
> */
> > +				expected_status =
> GDMA_STATUS_MORE_ENTRIES;
> > +			}
> > +
> > +			memset(add_req, 0, add_req_msg_size);
> > +
> > +			mana_gd_init_req_hdr(&add_req->hdr,
> > +
> GDMA_DMA_REGION_ADD_PAGES,
> > +					     add_req_msg_size,
> > +					     sizeof(add_resp));
> > +			add_req->dma_region_handle = *gdma_region;
> > +			add_req->page_addr_list_len =
> num_pages_to_handle;
> > +
> > +			for (i = 0; i < num_pages_to_handle; ++i) {
> > +				dma_addr_t cur_addr =
> > +
> 	rdma_block_iter_dma_address(&biter);
> > +				add_req->page_addr_list[i] = cur_addr;
> > +				__rdma_block_iter_next(&biter);
> > +
> > +				ibdev_dbg(&dev->ib_dev,
> > +					  "page_addr_list %lu addr 0x%llx\n",
> > +					  num_pages_cur + i, cur_addr);
> > +			}
> > +
> > +			err = mana_gd_send_request(gc, add_req_msg_size,
> > +						   add_req, sizeof(add_resp),
> > +						   &add_resp);
> > +			if (err || add_resp.hdr.status != expected_status) {
> > +				ibdev_dbg(&dev->ib_dev,
> > +					  "Failed put DMA
> pages %u: %d,0x%x\n",
> > +					  i, err, add_resp.hdr.status);
> > +				err = -EPROTO;
> > +				break;
> > +			}
> > +
> > +			num_pages_cur += num_pages_to_handle;
> > +			num_pages_to_handle =
> > +				min_t(size_t, num_pages_total -
> num_pages_cur,
> > +				      max_pgs_add_cmd);
> > +			add_req_msg_size = sizeof(*add_req) +
> > +					   num_pages_to_handle *
> sizeof(u64);
> > +		}
> > +	}
> > +
> > +	kfree(request_buf);
> > +
> > +	if (err)
> > +		mana_ib_gd_destroy_dma_region(dev,
> create_resp.dma_region_handle);
> 
> ... here:
> 
> 	if (err)
> 		mana_ib_gd_destroy_dma_region(dev,
> create_resp.dma_region_handle);
> 
> out:
> 	kfree(request_buf);
> 
> > +
> > +	return err;
> > +}
> 
> [...]
> 
> > diff --git a/drivers/infiniband/hw/mana/qp.c
> > b/drivers/infiniband/hw/mana/qp.c new file mode 100644 index
> > 000000000000..fec7d4a06ace
> > --- /dev/null
> > +++ b/drivers/infiniband/hw/mana/qp.c
> > @@ -0,0 +1,505 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * Copyright (c) 2022, Microsoft Corporation. All rights reserved.
> > + */
> > +
> > +#include "mana_ib.h"
> > +
> > +static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev,
> > +				      struct net_device *ndev,
> > +				      mana_handle_t default_rxobj,
> > +				      mana_handle_t ind_table[],
> > +				      u32 log_ind_tbl_size, u32
> rx_hash_key_len,
> > +				      u8 *rx_hash_key)
> > +{
> > +	struct mana_port_context *mpc = netdev_priv(ndev);
> > +	struct mana_cfg_rx_steer_req *req = NULL;
> > +	struct mana_cfg_rx_steer_resp resp = {};
> > +	mana_handle_t *req_indir_tab;
> > +	struct gdma_context *gc;
> > +	struct gdma_dev *mdev;
> > +	u32 req_buf_size;
> > +	int i, err;
> > +
> > +	mdev = dev->gdma_dev;
> > +	gc = mdev->gdma_context;
> > +
> > +	req_buf_size =
> > +		sizeof(*req) + sizeof(mana_handle_t) *
> MANA_INDIRECT_TABLE_SIZE;
> > +	req = kzalloc(req_buf_size, GFP_KERNEL);
> > +	if (!req)
> > +		return -ENOMEM;
> > +
> > +	mana_gd_init_req_hdr(&req->hdr, MANA_CONFIG_VPORT_RX,
> req_buf_size,
> > +			     sizeof(resp));
> > +
> > +	req->vport = mpc->port_handle;
> > +	req->rx_enable = 1;
> > +	req->update_default_rxobj = 1;
> > +	req->default_rxobj = default_rxobj;
> > +	req->hdr.dev_id = mdev->dev_id;
> > +
> > +	/* If there are more than 1 entries in indirection table, enable RSS */
> > +	if (log_ind_tbl_size)
> > +		req->rss_enable = true;
> > +
> > +	req->num_indir_entries = MANA_INDIRECT_TABLE_SIZE;
> > +	req->indir_tab_offset = sizeof(*req);
> > +	req->update_indir_tab = true;
> > +
> > +	req_indir_tab = (mana_handle_t *)(req + 1);
> > +	/* The ind table passed to the hardware must have
> > +	 * MANA_INDIRECT_TABLE_SIZE entries. Adjust the verb
> > +	 * ind_table to MANA_INDIRECT_TABLE_SIZE if required
> > +	 */
> > +	ibdev_dbg(&dev->ib_dev, "ind table size %u\n", 1 <<
> log_ind_tbl_size);
> > +	for (i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++) {
> > +		req_indir_tab[i] = ind_table[i % (1 << log_ind_tbl_size)];
> > +		ibdev_dbg(&dev->ib_dev, "index %u handle 0x%llx\n", i,
> > +			  req_indir_tab[i]);
> > +	}
> > +
> > +	req->update_hashkey = true;
> > +	if (rx_hash_key_len)
> > +		memcpy(req->hashkey, rx_hash_key, rx_hash_key_len);
> > +	else
> > +		netdev_rss_key_fill(req->hashkey, MANA_HASH_KEY_SIZE);
> > +
> > +	ibdev_dbg(&dev->ib_dev, "vport handle %llu default_rxobj
> 0x%llx\n",
> > +		  req->vport, default_rxobj);
> > +
> > +	err = mana_gd_send_request(gc, req_buf_size, req, sizeof(resp),
> &resp);
> > +	if (err) {
> > +		netdev_err(ndev, "Failed to configure vPort RX: %d\n", err);
> > +		goto out;
> > +	}
> > +
> > +	if (resp.hdr.status) {
> > +		netdev_err(ndev, "vPort RX configuration failed: 0x%x\n",
> > +			   resp.hdr.status);
> > +		err = -EPROTO;
> 
> This is confusing: if this error condition is reached, both error and succesful
> configuration will be logged. I guess an additional:
> 
> 		goto out;
> 
> is needed.

Yes, it's confusing. Will change this.

> 
> > +	}
> > +
> > +	netdev_info(ndev, "Configured steering vPort %llu
> log_entries %u\n",
> > +		    mpc->port_handle, log_ind_tbl_size);
> > +
> > +out:
> > +	kfree(req);
> > +	return err;
> > +}
> > +
> > +static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
> > +				 struct ib_qp_init_attr *attr,
> > +				 struct ib_udata *udata)
> > +{
> > +	struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp,
> ibqp);
> > +	struct mana_ib_dev *mdev =
> > +		container_of(pd->device, struct mana_ib_dev, ib_dev);
> > +	struct ib_rwq_ind_table *ind_tbl = attr->rwq_ind_tbl;
> > +	struct mana_ib_create_qp_rss_resp resp = {};
> > +	struct mana_ib_create_qp_rss ucmd = {};
> > +	struct gdma_dev *gd = mdev->gdma_dev;
> > +	mana_handle_t *mana_ind_table;
> > +	struct mana_port_context *mpc;
> > +	struct mana_context *mc;
> > +	struct net_device *ndev;
> > +	struct mana_ib_cq *cq;
> > +	struct mana_ib_wq *wq;
> > +	unsigned int ind_tbl_size;
> > +	struct ib_cq *ibcq;
> > +	struct ib_wq *ibwq;
> > +	u32 port;
> > +	int ret;
> > +	int i;
> 
> This causes a build warning with clang:
> 
> ../drivers/infiniband/hw/mana/qp.c:172:6: warning: variable 'i' is used
> uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized]
>         if (!mana_ind_table) {
>             ^~~~~~~~~~~~~~~
> ../drivers/infiniband/hw/mana/qp.c:241:9: note: uninitialized use occurs
> here
>         while (i-- > 0) {
>                ^
> ../drivers/infiniband/hw/mana/qp.c:172:2: note: remove the 'if' if its
> condition is always false
>         if (!mana_ind_table) {
>         ^~~~~~~~~~~~~~~~~~~~~~
> ../drivers/infiniband/hw/mana/qp.c:113:7: note: initialize the variable 'i' to
> silence this warning
>         int i;

Thank you. Will fix it.

> 
> 
> 
> > +
> > +	mc = gd->driver_data;
> > +
> > +	if (!udata || udata->inlen < sizeof(ucmd))
> > +		return -EINVAL;
> > +
> > +	ret = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata-
> >inlen));
> > +	if (ret) {
> > +		ibdev_dbg(&mdev->ib_dev,
> > +			  "Failed copy from udata for create rss-qp, err %d\n",
> > +			  ret);
> > +		return -EFAULT;
> > +	}
> > +
> > +	if (attr->cap.max_recv_wr > MAX_SEND_BUFFERS_PER_QUEUE) {
> > +		ibdev_dbg(&mdev->ib_dev,
> > +			  "Requested max_recv_wr %d exceeding limit\n",
> > +			  attr->cap.max_recv_wr);
> > +		return -EINVAL;
> > +	}
> > +
> > +	if (attr->cap.max_recv_sge > MAX_RX_WQE_SGL_ENTRIES) {
> > +		ibdev_dbg(&mdev->ib_dev,
> > +			  "Requested max_recv_sge %d exceeding limit\n",
> > +			  attr->cap.max_recv_sge);
> > +		return -EINVAL;
> > +	}
> > +
> > +	ind_tbl_size = 1 << ind_tbl->log_ind_tbl_size;
> > +	if (ind_tbl_size > MANA_INDIRECT_TABLE_SIZE) {
> > +		ibdev_dbg(&mdev->ib_dev,
> > +			  "Indirect table size %d exceeding limit\n",
> > +			  ind_tbl_size);
> > +		return -EINVAL;
> > +	}
> > +
> > +	if (ucmd.rx_hash_function != MANA_IB_RX_HASH_FUNC_TOEPLITZ)
> {
> > +		ibdev_dbg(&mdev->ib_dev,
> > +			  "RX Hash function is not supported, %d\n",
> > +			  ucmd.rx_hash_function);
> > +		return -EINVAL;
> > +	}
> > +
> > +	/* IB ports start with 1, MANA start with 0 */
> > +	port = ucmd.port;
> > +	if (port < 1 || port > mc->num_ports) {
> > +		ibdev_dbg(&mdev->ib_dev, "Invalid port %u in creating
> qp\n",
> > +			  port);
> > +		return -EINVAL;
> > +	}
> > +	ndev = mc->ports[port - 1];
> > +	mpc = netdev_priv(ndev);
> > +
> > +	ibdev_dbg(&mdev->ib_dev, "rx_hash_function %d port %d\n",
> > +		  ucmd.rx_hash_function, port);
> > +
> > +	mana_ind_table = kcalloc(ind_tbl_size, sizeof(mana_handle_t),
> > +				 GFP_KERNEL);
> > +	if (!mana_ind_table) {
> > +		ret = -ENOMEM;
> > +		goto fail;
> > +	}
> > +
> > +	qp->port = port;
> > +
> > +	for (i = 0; i < ind_tbl_size; i++) {
> > +		struct mana_obj_spec wq_spec = {};
> > +		struct mana_obj_spec cq_spec = {};
> > +
> > +		ibwq = ind_tbl->ind_tbl[i];
> > +		wq = container_of(ibwq, struct mana_ib_wq, ibwq);
> > +
> > +		ibcq = ibwq->cq;
> > +		cq = container_of(ibcq, struct mana_ib_cq, ibcq);
> > +
> > +		wq_spec.gdma_region = wq->gdma_region;
> > +		wq_spec.queue_size = wq->wq_buf_size;
> > +
> > +		cq_spec.gdma_region = cq->gdma_region;
> > +		cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE;
> > +		cq_spec.modr_ctx_id = 0;
> > +		cq_spec.attached_eq = GDMA_CQ_NO_EQ;
> > +
> > +		ret = mana_create_wq_obj(mpc, mpc->port_handle,
> GDMA_RQ,
> > +					 &wq_spec, &cq_spec, &wq-
> >rx_object);
> > +		if (ret)
> > +			goto fail;
> > +
> > +		/* The GDMA regions are now owned by the WQ object */
> > +		wq->gdma_region = GDMA_INVALID_DMA_REGION;
> > +		cq->gdma_region = GDMA_INVALID_DMA_REGION;
> > +
> > +		wq->id = wq_spec.queue_index;
> > +		cq->id = cq_spec.queue_index;
> > +
> > +		ibdev_dbg(&mdev->ib_dev,
> > +			  "ret %d rx_object 0x%llx wq id %llu cq id %llu\n",
> > +			  ret, wq->rx_object, wq->id, cq->id);
> > +
> > +		resp.entries[i].cqid = cq->id;
> > +		resp.entries[i].wqid = wq->id;
> > +
> > +		mana_ind_table[i] = wq->rx_object;
> > +	}
> > +	resp.num_entries = i;
> > +
> > +	ret = mana_ib_cfg_vport_steering(mdev, ndev, wq->rx_object,
> > +					 mana_ind_table,
> > +					 ind_tbl->log_ind_tbl_size,
> > +					 ucmd.rx_hash_key_len,
> > +					 ucmd.rx_hash_key);
> > +	if (ret)
> > +		goto fail;
> > +
> > +	ret = ib_copy_to_udata(udata, &resp, sizeof(resp));
> > +	if (ret) {
> > +		ibdev_dbg(&mdev->ib_dev,
> > +			  "Failed to copy to udata create rss-qp, %d\n",
> > +			  ret);
> > +		goto fail;
> > +	}
> > +
> > +	kfree(mana_ind_table);
> > +
> > +	return 0;
> > +
> > +fail:
> > +	while (i-- > 0) {
> > +		ibwq = ind_tbl->ind_tbl[i];
> > +		wq = container_of(ibwq, struct mana_ib_wq, ibwq);
> > +		mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object);
> > +	}
> > +
> > +	kfree(mana_ind_table);
> > +
> > +	return ret;
> > +}
> 
> 
> Cheers,
> 
> Paolo
  
Long Li Oct. 19, 2022, 7:36 p.m. UTC | #4
> Subject: Re: [Patch v7 12/12] RDMA/mana_ib: Add a driver for Microsoft
> Azure Network Adapter
> 
> On 2022/10/18 3:20, longli@linuxonhyperv.com wrote:
> > From: Long Li <longli@microsoft.com>
> >
> > Add a RDMA VF driver for Microsoft Azure Network Adapter (MANA).
> >
> > Co-developed-by: Ajay Sharma <sharmaajay@microsoft.com>
> > Signed-off-by: Ajay Sharma <sharmaajay@microsoft.com>
> > Reviewed-by: Dexuan Cui <decui@microsoft.com>
> > Signed-off-by: Long Li <longli@microsoft.com>
> > ---
> > Change log:
> > v2:
> > Changed coding sytles/formats
> > Checked undersize for udata length
> > Changed all logging to use ibdev_xxx() Avoided page array copy when
> > doing MR Sorted driver ops Fixed warnings reported by kernel test
> > robot <lkp@intel.com>
> >
> > v3:
> > More coding sytle/format changes
> >
> > v4:
> > Process error on hardware vport configuration
> >
> > v5:
> > Change licenses to GPL-2.0-only
> > Fix error handling in mana_ib_gd_create_dma_region()
> >
> > v6:
> > rebased to rdma-next
> > removed redundant initialization to return value in mana_ib_probe()
> > added missing tabs at the end of mana_ib_gd_create_dma_region()
> >
> > v7:
> > move mana_gd_destroy_doorbell_page() and
> > mana_gd_allocate_doorbell_page() from GDMA to this driver use
> > ib_umem_find_best_pgsz() for finding page size for registering dma
> > regions with hardware fix a bug that may double free mana_ind_table in
> > mana_ib_create_qp_rss() add Ajay Sharma <sharmaajay@microsoft.com>
> to
> > maintainer list add details to description in
> > drivers/infiniband/hw/mana/Kconfig
> > change multiple lines comments to use RDMA style from NETDEV style
> > change mana_ib_dev_ops to static use module_auxiliary_driver() in
> > place of module_init and module_exit move all user-triggerable error
> > messages to debug messages check for ind_tbl_size overflow in
> > mana_ib_create_qp_rss()
> >
> >  MAINTAINERS                             |   9 +
> >  drivers/infiniband/Kconfig              |   1 +
> >  drivers/infiniband/hw/Makefile          |   1 +
> >  drivers/infiniband/hw/mana/Kconfig      |  10 +
> >  drivers/infiniband/hw/mana/Makefile     |   4 +
> >  drivers/infiniband/hw/mana/cq.c         |  79 ++++
> >  drivers/infiniband/hw/mana/device.c     | 117 ++++++
> >  drivers/infiniband/hw/mana/main.c       | 508
> ++++++++++++++++++++++++
> >  drivers/infiniband/hw/mana/mana_ib.h    | 156 ++++++++
> >  drivers/infiniband/hw/mana/mr.c         | 200 ++++++++++
> >  drivers/infiniband/hw/mana/qp.c         | 505 +++++++++++++++++++++++
> >  drivers/infiniband/hw/mana/wq.c         | 115 ++++++
> >  include/net/mana/mana.h                 |   3 +
> >  include/uapi/rdma/ib_user_ioctl_verbs.h |   1 +
> >  include/uapi/rdma/mana-abi.h            |  66 +++
> >  15 files changed, 1775 insertions(+)
> >  create mode 100644 drivers/infiniband/hw/mana/Kconfig
> >  create mode 100644 drivers/infiniband/hw/mana/Makefile
> >  create mode 100644 drivers/infiniband/hw/mana/cq.c  create mode
> > 100644 drivers/infiniband/hw/mana/device.c
> >  create mode 100644 drivers/infiniband/hw/mana/main.c  create mode
> > 100644 drivers/infiniband/hw/mana/mana_ib.h
> >  create mode 100644 drivers/infiniband/hw/mana/mr.c  create mode
> > 100644 drivers/infiniband/hw/mana/qp.c  create mode 100644
> > drivers/infiniband/hw/mana/wq.c  create mode 100644
> > include/uapi/rdma/mana-abi.h
> >
> 
> [...]
> 
> > +
> > +int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct
> mana_ib_pd *pd,
> > +		      u32 doorbell_id)
> > +{
> > +	struct gdma_dev *mdev = dev->gdma_dev;
> > +	struct mana_port_context *mpc;
> > +	struct mana_context *mc;
> > +	struct net_device *ndev;
> > +	int err;
> > +
> > +	mc = mdev->driver_data;
> > +	ndev = mc->ports[port];
> > +	mpc = netdev_priv(ndev);
> > +
> > +	mutex_lock(&pd->vport_mutex);
> > +
> > +	pd->vport_use_count++;
> > +	if (pd->vport_use_count > 1) {
> > +		ibdev_dbg(&dev->ib_dev,
> > +			  "Skip as this PD is already configured vport\n");
> > +		mutex_unlock(&pd->vport_mutex);
> > +		return 0;
> > +	}
> > +	mutex_unlock(&pd->vport_mutex);
> > +
> > +	err = mana_cfg_vport(mpc, pd->pdn, doorbell_id);
> > +	if (err) {
> > +		mutex_lock(&pd->vport_mutex);
> > +		pd->vport_use_count--;
> > +		mutex_unlock(&pd->vport_mutex);
> 
> It seems there might be a race between the "pd->vport_use_count > 1"
> checking above and the error handling here, it may cause other user using a
> unconfigured vport if other user is checking the "pd->vport_use_count > 1)"
> while mana_cfg_vport() fails before doing "pd->vport_use_count--".

Thank you. You are correct about the possible race. I'm fixing it.

> 
> > +
> > +		ibdev_dbg(&dev->ib_dev, "Failed to configure vPort %d\n",
> err);
> > +		return err;
> > +	}
> > +
> > +	pd->tx_shortform_allowed = mpc->tx_shortform_allowed;
> > +	pd->tx_vp_offset = mpc->tx_vp_offset;
> > +
> > +	ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x
> doorbell_id %x\n",
> > +		  mpc->port_handle, pd->pdn, doorbell_id);
> > +
> > +	return 0;
> > +}
> > +
> 
> [...]
> 
> > +
> > +static int mana_gd_allocate_doorbell_page(struct gdma_context *gc,
> > +					  int *doorbell_page)
> > +{
> > +	struct gdma_allocate_resource_range_req req = {};
> > +	struct gdma_allocate_resource_range_resp resp = {};
> > +	int err;
> > +
> > +	mana_gd_init_req_hdr(&req.hdr,
> GDMA_ALLOCATE_RESOURCE_RANGE,
> > +			     sizeof(req), sizeof(resp));
> > +
> > +	req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE;
> > +	req.num_resources = 1;
> > +	req.alignment = 1;
> > +
> > +	/* Have GDMA start searching from 0 */
> > +	req.allocated_resources = 0;
> > +
> > +	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp),
> &resp);
> > +	if (err || resp.hdr.status) {
> > +		dev_err(gc->dev,
> > +			"Failed to allocate doorbell page: ret %d, 0x%x\n",
> > +			err, resp.hdr.status);
> > +		return err ? err : -EPROTO;
> > +	}
> > +
> > +	*doorbell_page = resp.allocated_resources;
> > +
> > +	return 0;
> > +}
> > +
> > +int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext,
> > +			   struct ib_udata *udata)
> > +{
> > +	struct mana_ib_ucontext *ucontext =
> > +		container_of(ibcontext, struct mana_ib_ucontext,
> ibucontext);
> > +	struct ib_device *ibdev = ibcontext->device;
> > +	struct mana_ib_dev *mdev;
> > +	struct gdma_context *gc;
> > +	struct gdma_dev *dev;
> > +	int doorbell_page;
> > +	int ret;
> > +
> > +	mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> > +	dev = mdev->gdma_dev;
> > +	gc = dev->gdma_context;
> > +
> > +	/* Allocate a doorbell page index */
> > +	ret = mana_gd_allocate_doorbell_page(gc, &doorbell_page);
> > +	if (ret) {
> > +		ibdev_dbg(ibdev, "Failed to allocate doorbell page %d\n",
> ret);
> > +		return -ENOMEM;
> 
> It does not make much sense to do "err ? err : -EPROTO" in
> mana_gd_allocate_doorbell_page() if -ENOMEM is returned unconditionally
> here.

I will change it to return "ret", to be consistent with other usage on mana_gd_xxx functions.

> 
> > +	}
> > +
> > +	ibdev_dbg(ibdev, "Doorbell page allocated %d\n", doorbell_page);
> > +
> > +	ucontext->doorbell = doorbell_page;
> > +
> > +	return 0;
> > +}
> > +
> 
> [...]
> 
> > diff --git a/drivers/infiniband/hw/mana/mana_ib.h
> > b/drivers/infiniband/hw/mana/mana_ib.h
> > new file mode 100644
> > index 000000000000..2225a6d6f8e1
> > --- /dev/null
> > +++ b/drivers/infiniband/hw/mana/mana_ib.h
> > @@ -0,0 +1,156 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +/*
> > + * Copyright (c) 2022 Microsoft Corporation. All rights reserved.
> > + */
> > +
> > +#ifndef _MANA_IB_H_
> > +#define _MANA_IB_H_
> > +
> > +#include <rdma/ib_verbs.h>
> > +#include <rdma/ib_mad.h>
> > +#include <rdma/ib_umem.h>
> > +#include <rdma/mana-abi.h>
> > +#include <rdma/uverbs_ioctl.h>
> > +
> > +#include <net/mana/mana.h>
> > +
> > +#define PAGE_SZ_BM                                                             \
> > +	(SZ_4K | SZ_8K | SZ_16K | SZ_32K | SZ_64K | SZ_128K | SZ_256K |
> \
> > +	 SZ_512K | SZ_1M | SZ_2M)
> > +
> > +/* MANA doesn't have any limit for MR size */ #define
> > +MANA_IB_MAX_MR_SIZE ((u64)(~(0ULL)))
> 
> Use U64_MAX?

Good idea. Will change it.

> 
> > +
> > +struct mana_ib_dev {
> > +	struct ib_device ib_dev;
> > +	struct gdma_dev *gdma_dev;
> > +};
> > +
> 
> [...]
> 
> > diff --git a/drivers/infiniband/hw/mana/mr.c
> > b/drivers/infiniband/hw/mana/mr.c new file mode 100644 index
> > 000000000000..09124dd1792d
> > --- /dev/null
> > +++ b/drivers/infiniband/hw/mana/mr.c
> > @@ -0,0 +1,200 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * Copyright (c) 2022, Microsoft Corporation. All rights reserved.
> > + */
> > +
> > +#include "mana_ib.h"
> > +
> > +#define VALID_MR_FLAGS                                                         \
> > +	(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
> > +IB_ACCESS_REMOTE_READ)
> > +
> > +static enum gdma_mr_access_flags
> > +mana_ib_verbs_to_gdma_access_flags(int access_flags) {
> > +	enum gdma_mr_access_flags flags =
> GDMA_ACCESS_FLAG_LOCAL_READ;
> > +
> > +	if (access_flags & IB_ACCESS_LOCAL_WRITE)
> > +		flags |= GDMA_ACCESS_FLAG_LOCAL_WRITE;
> > +
> > +	if (access_flags & IB_ACCESS_REMOTE_WRITE)
> > +		flags |= GDMA_ACCESS_FLAG_REMOTE_WRITE;
> > +
> > +	if (access_flags & IB_ACCESS_REMOTE_READ)
> > +		flags |= GDMA_ACCESS_FLAG_REMOTE_READ;
> > +
> > +	return flags;
> > +}
> > +
> > +static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct
> mana_ib_mr *mr,
> > +				struct gdma_create_mr_params *mr_params)
> {
> > +	struct gdma_create_mr_response resp = {};
> > +	struct gdma_create_mr_request req = {};
> > +	struct gdma_dev *mdev = dev->gdma_dev;
> > +	struct gdma_context *gc;
> > +	int err;
> > +
> > +	gc = mdev->gdma_context;
> > +
> > +	mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_MR, sizeof(req),
> > +			     sizeof(resp));
> > +	req.pd_handle = mr_params->pd_handle;
> > +	req.mr_type = mr_params->mr_type;
> > +
> > +	switch (mr_params->mr_type) {
> > +	case GDMA_MR_TYPE_GVA:
> > +		req.gva.dma_region_handle = mr_params-
> >gva.dma_region_handle;
> > +		req.gva.virtual_address = mr_params->gva.virtual_address;
> > +		req.gva.access_flags = mr_params->gva.access_flags;
> > +		break;
> > +
> > +	default:
> > +		ibdev_dbg(&dev->ib_dev,
> > +			  "invalid param (GDMA_MR_TYPE) passed,
> type %d\n",
> > +			  req.mr_type);
> > +		err = -EINVAL;
> > +		goto error;
> > +	}
> > +
> > +	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp),
> > +&resp);
> > +
> > +	if (err || resp.hdr.status) {
> > +		ibdev_dbg(&dev->ib_dev, "Failed to create mr %d, %u", err,
> > +			  resp.hdr.status);
> > +		if (!err)
> > +			err = -EPROTO;
> > +
> > +		goto error;
> > +	}
> > +
> > +	mr->ibmr.lkey = resp.lkey;
> > +	mr->ibmr.rkey = resp.rkey;
> > +	mr->mr_handle = resp.mr_handle;
> > +
> > +	return 0;
> > +error:
> > +	return err;
> 
> There is no error handling here, maybe just return error directly instead of a
> goto.

Yes, will remove the goto.

> 
> > +}
> > +
> > +static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev,
> > +gdma_obj_handle_t mr_handle) {
> > +	struct gdma_destroy_mr_response resp = {};
> > +	struct gdma_destroy_mr_request req = {};
> > +	struct gdma_dev *mdev = dev->gdma_dev;
> > +	struct gdma_context *gc;
> > +	int err;
> > +
> > +	gc = mdev->gdma_context;
> > +
> > +	mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_MR, sizeof(req),
> > +			     sizeof(resp));
> > +
> > +	req.mr_handle = mr_handle;
> > +
> > +	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp),
> &resp);
> > +	if (err || resp.hdr.status) {
> > +		dev_err(gc->dev, "Failed to destroy MR: %d, 0x%x\n", err,
> > +			resp.hdr.status);
> > +		if (!err)
> > +			err = -EPROTO;
> > +		return err;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> 
> [...]
> 
> > +
> > +static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
> > +				 struct ib_qp_init_attr *attr,
> > +				 struct ib_udata *udata)
> > +{
> > +	struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp,
> ibqp);
> > +	struct mana_ib_dev *mdev =
> > +		container_of(pd->device, struct mana_ib_dev, ib_dev);
> > +	struct ib_rwq_ind_table *ind_tbl = attr->rwq_ind_tbl;
> > +	struct mana_ib_create_qp_rss_resp resp = {};
> > +	struct mana_ib_create_qp_rss ucmd = {};
> > +	struct gdma_dev *gd = mdev->gdma_dev;
> > +	mana_handle_t *mana_ind_table;
> > +	struct mana_port_context *mpc;
> > +	struct mana_context *mc;
> > +	struct net_device *ndev;
> > +	struct mana_ib_cq *cq;
> > +	struct mana_ib_wq *wq;
> > +	unsigned int ind_tbl_size;
> > +	struct ib_cq *ibcq;
> > +	struct ib_wq *ibwq;
> > +	u32 port;
> > +	int ret;
> > +	int i;
> > +
> > +	mc = gd->driver_data;
> > +
> > +	if (!udata || udata->inlen < sizeof(ucmd))
> > +		return -EINVAL;
> > +
> > +	ret = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata-
> >inlen));
> > +	if (ret) {
> > +		ibdev_dbg(&mdev->ib_dev,
> > +			  "Failed copy from udata for create rss-qp, err %d\n",
> > +			  ret);
> > +		return -EFAULT;
> 
> Why not just return 'ret' directly?

Yes, it's better to just return "ret". Will fix this.

Thank you.
  

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 8b9a50756c7e..81ee58f44956 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13506,6 +13506,15 @@  F:	drivers/scsi/smartpqi/smartpqi*.[ch]
 F:	include/linux/cciss*.h
 F:	include/uapi/linux/cciss*.h
 
+MICROSOFT MANA RDMA DRIVER
+M:	Long Li <longli@microsoft.com>
+M:	Ajay Sharma <sharmaajay@microsoft.com>
+L:	linux-rdma@vger.kernel.org
+S:	Supported
+F:	drivers/infiniband/hw/mana/
+F:	include/net/mana
+F:	include/uapi/rdma/mana-abi.h
+
 MICROSOFT SURFACE AGGREGATOR TABLET-MODE SWITCH
 M:	Maximilian Luz <luzmaximilian@gmail.com>
 L:	platform-driver-x86@vger.kernel.org
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index aa36ac618e72..ccc874478f0b 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -85,6 +85,7 @@  source "drivers/infiniband/hw/erdma/Kconfig"
 source "drivers/infiniband/hw/hfi1/Kconfig"
 source "drivers/infiniband/hw/hns/Kconfig"
 source "drivers/infiniband/hw/irdma/Kconfig"
+source "drivers/infiniband/hw/mana/Kconfig"
 source "drivers/infiniband/hw/mlx4/Kconfig"
 source "drivers/infiniband/hw/mlx5/Kconfig"
 source "drivers/infiniband/hw/mthca/Kconfig"
diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile
index 6b3a88046125..1211f4317a9f 100644
--- a/drivers/infiniband/hw/Makefile
+++ b/drivers/infiniband/hw/Makefile
@@ -4,6 +4,7 @@  obj-$(CONFIG_INFINIBAND_QIB)		+= qib/
 obj-$(CONFIG_INFINIBAND_CXGB4)		+= cxgb4/
 obj-$(CONFIG_INFINIBAND_EFA)		+= efa/
 obj-$(CONFIG_INFINIBAND_IRDMA)		+= irdma/
+obj-$(CONFIG_MANA_INFINIBAND)		+= mana/
 obj-$(CONFIG_MLX4_INFINIBAND)		+= mlx4/
 obj-$(CONFIG_MLX5_INFINIBAND)		+= mlx5/
 obj-$(CONFIG_INFINIBAND_OCRDMA)		+= ocrdma/
diff --git a/drivers/infiniband/hw/mana/Kconfig b/drivers/infiniband/hw/mana/Kconfig
new file mode 100644
index 000000000000..546640657bac
--- /dev/null
+++ b/drivers/infiniband/hw/mana/Kconfig
@@ -0,0 +1,10 @@ 
+# SPDX-License-Identifier: GPL-2.0-only
+config MANA_INFINIBAND
+	tristate "Microsoft Azure Network Adapter support"
+	depends on NETDEVICES && ETHERNET && PCI && MICROSOFT_MANA
+	help
+	  This driver provides low-level RDMA support for Microsoft Azure
+	  Network Adapter (MANA). MANA supports RDMA features that can be used
+	  for workloads (e.g. DPDK, MPI etc) that uses RDMA verbs to directly
+	  access hardware from user-mode processes in Microsoft Azure cloud
+	  environment.
diff --git a/drivers/infiniband/hw/mana/Makefile b/drivers/infiniband/hw/mana/Makefile
new file mode 100644
index 000000000000..88655fe5e398
--- /dev/null
+++ b/drivers/infiniband/hw/mana/Makefile
@@ -0,0 +1,4 @@ 
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_MANA_INFINIBAND) += mana_ib.o
+
+mana_ib-y := device.o main.o wq.o qp.o cq.o mr.o
diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c
new file mode 100644
index 000000000000..71251fff36c3
--- /dev/null
+++ b/drivers/infiniband/hw/mana/cq.c
@@ -0,0 +1,79 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2022, Microsoft Corporation. All rights reserved.
+ */
+
+#include "mana_ib.h"
+
+int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		      struct ib_udata *udata)
+{
+	struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq);
+	struct ib_device *ibdev = ibcq->device;
+	struct mana_ib_create_cq ucmd = {};
+	struct mana_ib_dev *mdev;
+	int err;
+
+	mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
+
+	if (udata->inlen < sizeof(ucmd))
+		return -EINVAL;
+
+	err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen));
+	if (err) {
+		ibdev_dbg(ibdev,
+			  "Failed to copy from udata for create cq, %d\n", err);
+		return -EFAULT;
+	}
+
+	if (attr->cqe > MAX_SEND_BUFFERS_PER_QUEUE) {
+		ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe);
+		return -EINVAL;
+	}
+
+	cq->cqe = attr->cqe;
+	cq->umem = ib_umem_get(ibdev, ucmd.buf_addr, cq->cqe * COMP_ENTRY_SIZE,
+			       IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(cq->umem)) {
+		err = PTR_ERR(cq->umem);
+		ibdev_dbg(ibdev, "Failed to get umem for create cq, err %d\n",
+			  err);
+		return err;
+	}
+
+	err = mana_ib_gd_create_dma_region(mdev, cq->umem, &cq->gdma_region);
+	if (err) {
+		ibdev_dbg(ibdev,
+			  "Failed to create dma region for create cq, %d\n",
+			  err);
+		goto err_release_umem;
+	}
+
+	ibdev_dbg(ibdev,
+		  "mana_ib_gd_create_dma_region ret %d gdma_region 0x%llx\n",
+		  err, cq->gdma_region);
+
+	/*
+	 * The CQ ID is not known at this time. The ID is generated at create_qp
+	 */
+
+	return 0;
+
+err_release_umem:
+	ib_umem_release(cq->umem);
+	return err;
+}
+
+int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+{
+	struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq);
+	struct ib_device *ibdev = ibcq->device;
+	struct mana_ib_dev *mdev;
+
+	mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
+
+	mana_ib_gd_destroy_dma_region(mdev, cq->gdma_region);
+	ib_umem_release(cq->umem);
+
+	return 0;
+}
diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c
new file mode 100644
index 000000000000..d4541b8707e4
--- /dev/null
+++ b/drivers/infiniband/hw/mana/device.c
@@ -0,0 +1,117 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2022, Microsoft Corporation. All rights reserved.
+ */
+
+#include "mana_ib.h"
+#include <net/mana/mana_auxiliary.h>
+
+MODULE_DESCRIPTION("Microsoft Azure Network Adapter IB driver");
+MODULE_LICENSE("GPL");
+MODULE_IMPORT_NS(NET_MANA);
+
+static const struct ib_device_ops mana_ib_dev_ops = {
+	.owner = THIS_MODULE,
+	.driver_id = RDMA_DRIVER_MANA,
+	.uverbs_abi_ver = MANA_IB_UVERBS_ABI_VERSION,
+
+	.alloc_pd = mana_ib_alloc_pd,
+	.alloc_ucontext = mana_ib_alloc_ucontext,
+	.create_cq = mana_ib_create_cq,
+	.create_qp = mana_ib_create_qp,
+	.create_rwq_ind_table = mana_ib_create_rwq_ind_table,
+	.create_wq = mana_ib_create_wq,
+	.dealloc_pd = mana_ib_dealloc_pd,
+	.dealloc_ucontext = mana_ib_dealloc_ucontext,
+	.dereg_mr = mana_ib_dereg_mr,
+	.destroy_cq = mana_ib_destroy_cq,
+	.destroy_qp = mana_ib_destroy_qp,
+	.destroy_rwq_ind_table = mana_ib_destroy_rwq_ind_table,
+	.destroy_wq = mana_ib_destroy_wq,
+	.disassociate_ucontext = mana_ib_disassociate_ucontext,
+	.get_port_immutable = mana_ib_get_port_immutable,
+	.mmap = mana_ib_mmap,
+	.modify_qp = mana_ib_modify_qp,
+	.modify_wq = mana_ib_modify_wq,
+	.query_device = mana_ib_query_device,
+	.query_gid = mana_ib_query_gid,
+	.query_port = mana_ib_query_port,
+	.reg_user_mr = mana_ib_reg_user_mr,
+
+	INIT_RDMA_OBJ_SIZE(ib_cq, mana_ib_cq, ibcq),
+	INIT_RDMA_OBJ_SIZE(ib_pd, mana_ib_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_qp, mana_ib_qp, ibqp),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, mana_ib_ucontext, ibucontext),
+	INIT_RDMA_OBJ_SIZE(ib_rwq_ind_table, mana_ib_rwq_ind_table,
+			   ib_ind_table),
+};
+
+static int mana_ib_probe(struct auxiliary_device *adev,
+			 const struct auxiliary_device_id *id)
+{
+	struct mana_adev *madev = container_of(adev, struct mana_adev, adev);
+	struct gdma_dev *mdev = madev->mdev;
+	struct mana_context *mc;
+	struct mana_ib_dev *dev;
+	int ret;
+
+	mc = mdev->driver_data;
+
+	dev = ib_alloc_device(mana_ib_dev, ib_dev);
+	if (!dev)
+		return -ENOMEM;
+
+	ib_set_device_ops(&dev->ib_dev, &mana_ib_dev_ops);
+
+	dev->ib_dev.phys_port_cnt = mc->num_ports;
+
+	ibdev_dbg(&dev->ib_dev, "mdev=%p id=%d num_ports=%d\n", mdev,
+		  mdev->dev_id.as_uint32, dev->ib_dev.phys_port_cnt);
+
+	dev->gdma_dev = mdev;
+	dev->ib_dev.node_type = RDMA_NODE_IB_CA;
+
+	/*
+	 * num_comp_vectors needs to set to the max MSIX index
+	 * when interrupts and event queues are implemented
+	 */
+	dev->ib_dev.num_comp_vectors = 1;
+	dev->ib_dev.dev.parent = mdev->gdma_context->dev;
+
+	ret = ib_register_device(&dev->ib_dev, "mana_%d",
+				 mdev->gdma_context->dev);
+	if (ret) {
+		ib_dealloc_device(&dev->ib_dev);
+		return ret;
+	}
+
+	dev_set_drvdata(&adev->dev, dev);
+
+	return 0;
+}
+
+static void mana_ib_remove(struct auxiliary_device *adev)
+{
+	struct mana_ib_dev *dev = dev_get_drvdata(&adev->dev);
+
+	ib_unregister_device(&dev->ib_dev);
+	ib_dealloc_device(&dev->ib_dev);
+}
+
+static const struct auxiliary_device_id mana_id_table[] = {
+	{
+		.name = "mana.rdma",
+	},
+	{},
+};
+
+MODULE_DEVICE_TABLE(auxiliary, mana_id_table);
+
+static struct auxiliary_driver mana_driver = {
+	.name = "rdma",
+	.probe = mana_ib_probe,
+	.remove = mana_ib_remove,
+	.id_table = mana_id_table,
+};
+
+module_auxiliary_driver(mana_driver);
diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
new file mode 100644
index 000000000000..57e5f9dca454
--- /dev/null
+++ b/drivers/infiniband/hw/mana/main.c
@@ -0,0 +1,508 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2022, Microsoft Corporation. All rights reserved.
+ */
+
+#include "mana_ib.h"
+
+void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd,
+			 u32 port)
+{
+	struct gdma_dev *gd = dev->gdma_dev;
+	struct mana_port_context *mpc;
+	struct net_device *ndev;
+	struct mana_context *mc;
+
+	mc = gd->driver_data;
+	ndev = mc->ports[port];
+	mpc = netdev_priv(ndev);
+
+	mutex_lock(&pd->vport_mutex);
+
+	pd->vport_use_count--;
+	WARN_ON(pd->vport_use_count < 0);
+
+	if (!pd->vport_use_count)
+		mana_uncfg_vport(mpc);
+
+	mutex_unlock(&pd->vport_mutex);
+}
+
+int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct mana_ib_pd *pd,
+		      u32 doorbell_id)
+{
+	struct gdma_dev *mdev = dev->gdma_dev;
+	struct mana_port_context *mpc;
+	struct mana_context *mc;
+	struct net_device *ndev;
+	int err;
+
+	mc = mdev->driver_data;
+	ndev = mc->ports[port];
+	mpc = netdev_priv(ndev);
+
+	mutex_lock(&pd->vport_mutex);
+
+	pd->vport_use_count++;
+	if (pd->vport_use_count > 1) {
+		ibdev_dbg(&dev->ib_dev,
+			  "Skip as this PD is already configured vport\n");
+		mutex_unlock(&pd->vport_mutex);
+		return 0;
+	}
+	mutex_unlock(&pd->vport_mutex);
+
+	err = mana_cfg_vport(mpc, pd->pdn, doorbell_id);
+	if (err) {
+		mutex_lock(&pd->vport_mutex);
+		pd->vport_use_count--;
+		mutex_unlock(&pd->vport_mutex);
+
+		ibdev_dbg(&dev->ib_dev, "Failed to configure vPort %d\n", err);
+		return err;
+	}
+
+	pd->tx_shortform_allowed = mpc->tx_shortform_allowed;
+	pd->tx_vp_offset = mpc->tx_vp_offset;
+
+	ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n",
+		  mpc->port_handle, pd->pdn, doorbell_id);
+
+	return 0;
+}
+
+int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
+{
+	struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd);
+	struct ib_device *ibdev = ibpd->device;
+	struct gdma_create_pd_resp resp = {};
+	struct gdma_create_pd_req req = {};
+	enum gdma_pd_flags flags = 0;
+	struct mana_ib_dev *dev;
+	struct gdma_dev *mdev;
+	int err;
+
+	dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
+	mdev = dev->gdma_dev;
+
+	mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_PD, sizeof(req),
+			     sizeof(resp));
+
+	req.flags = flags;
+	err = mana_gd_send_request(mdev->gdma_context, sizeof(req), &req,
+				   sizeof(resp), &resp);
+
+	if (err || resp.hdr.status) {
+		ibdev_dbg(&dev->ib_dev,
+			  "Failed to get pd_id err %d status %u\n", err,
+			  resp.hdr.status);
+		if (!err)
+			err = -EPROTO;
+
+		return err;
+	}
+
+	pd->pd_handle = resp.pd_handle;
+	pd->pdn = resp.pd_id;
+	ibdev_dbg(&dev->ib_dev, "pd_handle 0x%llx pd_id %d\n",
+		  pd->pd_handle, pd->pdn);
+
+	mutex_init(&pd->vport_mutex);
+	pd->vport_use_count = 0;
+	return 0;
+}
+
+int mana_ib_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
+{
+	struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd);
+	struct ib_device *ibdev = ibpd->device;
+	struct gdma_destory_pd_resp resp = {};
+	struct gdma_destroy_pd_req req = {};
+	struct mana_ib_dev *dev;
+	struct gdma_dev *mdev;
+	int err;
+
+	dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
+	mdev = dev->gdma_dev;
+
+	mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_PD, sizeof(req),
+			     sizeof(resp));
+
+	req.pd_handle = pd->pd_handle;
+	err = mana_gd_send_request(mdev->gdma_context, sizeof(req), &req,
+				   sizeof(resp), &resp);
+
+	if (err || resp.hdr.status) {
+		ibdev_dbg(&dev->ib_dev,
+			  "Failed to destroy pd_handle 0x%llx err %d status %u",
+			  pd->pd_handle, err, resp.hdr.status);
+		if (!err)
+			err = -EPROTO;
+	}
+
+	return err;
+}
+
+static int mana_gd_destroy_doorbell_page(struct gdma_context *gc,
+					 int doorbell_page)
+{
+	struct gdma_destroy_resource_range_req req = {};
+	struct gdma_resp_hdr resp = {};
+	int err;
+
+	mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_RESOURCE_RANGE,
+			     sizeof(req), sizeof(resp));
+
+	req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE;
+	req.num_resources = 1;
+	req.allocated_resources = doorbell_page;
+
+	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
+	if (err || resp.status) {
+		dev_err(gc->dev,
+			"Failed to destroy doorbell page: ret %d, 0x%x\n",
+			err, resp.status);
+		return err ? err : -EPROTO;
+	}
+
+	return 0;
+}
+
+static int mana_gd_allocate_doorbell_page(struct gdma_context *gc,
+					  int *doorbell_page)
+{
+	struct gdma_allocate_resource_range_req req = {};
+	struct gdma_allocate_resource_range_resp resp = {};
+	int err;
+
+	mana_gd_init_req_hdr(&req.hdr, GDMA_ALLOCATE_RESOURCE_RANGE,
+			     sizeof(req), sizeof(resp));
+
+	req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE;
+	req.num_resources = 1;
+	req.alignment = 1;
+
+	/* Have GDMA start searching from 0 */
+	req.allocated_resources = 0;
+
+	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
+	if (err || resp.hdr.status) {
+		dev_err(gc->dev,
+			"Failed to allocate doorbell page: ret %d, 0x%x\n",
+			err, resp.hdr.status);
+		return err ? err : -EPROTO;
+	}
+
+	*doorbell_page = resp.allocated_resources;
+
+	return 0;
+}
+
+int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext,
+			   struct ib_udata *udata)
+{
+	struct mana_ib_ucontext *ucontext =
+		container_of(ibcontext, struct mana_ib_ucontext, ibucontext);
+	struct ib_device *ibdev = ibcontext->device;
+	struct mana_ib_dev *mdev;
+	struct gdma_context *gc;
+	struct gdma_dev *dev;
+	int doorbell_page;
+	int ret;
+
+	mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
+	dev = mdev->gdma_dev;
+	gc = dev->gdma_context;
+
+	/* Allocate a doorbell page index */
+	ret = mana_gd_allocate_doorbell_page(gc, &doorbell_page);
+	if (ret) {
+		ibdev_dbg(ibdev, "Failed to allocate doorbell page %d\n", ret);
+		return -ENOMEM;
+	}
+
+	ibdev_dbg(ibdev, "Doorbell page allocated %d\n", doorbell_page);
+
+	ucontext->doorbell = doorbell_page;
+
+	return 0;
+}
+
+void mana_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+{
+	struct mana_ib_ucontext *mana_ucontext =
+		container_of(ibcontext, struct mana_ib_ucontext, ibucontext);
+	struct ib_device *ibdev = ibcontext->device;
+	struct mana_ib_dev *mdev;
+	struct gdma_context *gc;
+	int ret;
+
+	mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
+	gc = mdev->gdma_dev->gdma_context;
+
+	ret = mana_gd_destroy_doorbell_page(gc, mana_ucontext->doorbell);
+	if (ret)
+		ibdev_dbg(ibdev, "Failed to destroy doorbell page %d\n", ret);
+}
+
+int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem,
+				 mana_handle_t *gdma_region)
+{
+	struct gdma_dma_region_add_pages_req *add_req = NULL;
+	struct gdma_create_dma_region_resp create_resp = {};
+	struct gdma_create_dma_region_req *create_req;
+	size_t num_pages_cur, num_pages_to_handle;
+	unsigned int create_req_msg_size;
+	struct hw_channel_context *hwc;
+	struct ib_block_iter biter;
+	size_t max_pgs_create_cmd;
+	struct gdma_context *gc;
+	size_t num_pages_total;
+	struct gdma_dev *mdev;
+	unsigned long page_sz;
+	void *request_buf;
+	unsigned int i;
+	int err;
+
+	mdev = dev->gdma_dev;
+	gc = mdev->gdma_context;
+	hwc = gc->hwc.driver_data;
+
+	/* Hardware requires dma region to align to chosen page size */
+	page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, 0);
+	if (!page_sz) {
+		ibdev_dbg(&dev->ib_dev, "failed to find page size.\n");
+		return -ENOMEM;
+	}
+	num_pages_total = ib_umem_num_dma_blocks(umem, page_sz);
+
+	max_pgs_create_cmd =
+		(hwc->max_req_msg_size - sizeof(*create_req)) / sizeof(u64);
+	num_pages_to_handle =
+		min_t(size_t, num_pages_total, max_pgs_create_cmd);
+	create_req_msg_size =
+		struct_size(create_req, page_addr_list, num_pages_to_handle);
+
+	request_buf = kzalloc(hwc->max_req_msg_size, GFP_KERNEL);
+	if (!request_buf)
+		return -ENOMEM;
+
+	create_req = request_buf;
+	mana_gd_init_req_hdr(&create_req->hdr, GDMA_CREATE_DMA_REGION,
+			     create_req_msg_size, sizeof(create_resp));
+
+	create_req->length = umem->length;
+	create_req->offset_in_page = umem->address & (page_sz - 1);
+	create_req->gdma_page_type = order_base_2(page_sz) - PAGE_SHIFT;
+	create_req->page_count = num_pages_total;
+	create_req->page_addr_list_len = num_pages_to_handle;
+
+	ibdev_dbg(&dev->ib_dev, "size_dma_region %lu num_pages_total %lu\n",
+		  umem->length, num_pages_total);
+
+	ibdev_dbg(&dev->ib_dev, "page_sz %lu offset_in_page %u\n",
+		  page_sz, create_req->offset_in_page);
+
+	ibdev_dbg(&dev->ib_dev, "num_pages_to_handle %lu, gdma_page_type %u",
+		  num_pages_to_handle, create_req->gdma_page_type);
+
+	__rdma_umem_block_iter_start(&biter, umem, page_sz);
+
+	for (i = 0; i < num_pages_to_handle; ++i) {
+		dma_addr_t cur_addr;
+
+		__rdma_block_iter_next(&biter);
+		cur_addr = rdma_block_iter_dma_address(&biter);
+
+		create_req->page_addr_list[i] = cur_addr;
+	}
+
+	err = mana_gd_send_request(gc, create_req_msg_size, create_req,
+				   sizeof(create_resp), &create_resp);
+	if (err || create_resp.hdr.status) {
+		ibdev_dbg(&dev->ib_dev,
+			  "Failed to create DMA region: %d, 0x%x\n", err,
+			  create_resp.hdr.status);
+		if (!err)
+			err = -EPROTO;
+
+		kfree(request_buf);
+		return err;
+	}
+
+	*gdma_region = create_resp.dma_region_handle;
+	ibdev_dbg(&dev->ib_dev, "Created DMA region with handle 0x%llx\n",
+		  *gdma_region);
+
+	num_pages_cur = num_pages_to_handle;
+
+	if (num_pages_cur < num_pages_total) {
+		unsigned int add_req_msg_size;
+		size_t max_pgs_add_cmd =
+			(hwc->max_req_msg_size - sizeof(*add_req)) /
+			sizeof(u64);
+
+		num_pages_to_handle =
+			min_t(size_t, num_pages_total - num_pages_cur,
+			      max_pgs_add_cmd);
+
+		/* Calculate the max num of pages that will be handled */
+		add_req_msg_size = struct_size(add_req, page_addr_list,
+					       num_pages_to_handle);
+		add_req = request_buf;
+
+		while (num_pages_cur < num_pages_total) {
+			struct gdma_general_resp add_resp = {};
+			u32 expected_status = 0;
+
+			if (num_pages_cur + num_pages_to_handle <
+			    num_pages_total) {
+				/* Status indicating more pages are needed */
+				expected_status = GDMA_STATUS_MORE_ENTRIES;
+			}
+
+			memset(add_req, 0, add_req_msg_size);
+
+			mana_gd_init_req_hdr(&add_req->hdr,
+					     GDMA_DMA_REGION_ADD_PAGES,
+					     add_req_msg_size,
+					     sizeof(add_resp));
+			add_req->dma_region_handle = *gdma_region;
+			add_req->page_addr_list_len = num_pages_to_handle;
+
+			for (i = 0; i < num_pages_to_handle; ++i) {
+				dma_addr_t cur_addr =
+					rdma_block_iter_dma_address(&biter);
+				add_req->page_addr_list[i] = cur_addr;
+				__rdma_block_iter_next(&biter);
+
+				ibdev_dbg(&dev->ib_dev,
+					  "page_addr_list %lu addr 0x%llx\n",
+					  num_pages_cur + i, cur_addr);
+			}
+
+			err = mana_gd_send_request(gc, add_req_msg_size,
+						   add_req, sizeof(add_resp),
+						   &add_resp);
+			if (err || add_resp.hdr.status != expected_status) {
+				ibdev_dbg(&dev->ib_dev,
+					  "Failed put DMA pages %u: %d,0x%x\n",
+					  i, err, add_resp.hdr.status);
+				err = -EPROTO;
+				break;
+			}
+
+			num_pages_cur += num_pages_to_handle;
+			num_pages_to_handle =
+				min_t(size_t, num_pages_total - num_pages_cur,
+				      max_pgs_add_cmd);
+			add_req_msg_size = sizeof(*add_req) +
+					   num_pages_to_handle * sizeof(u64);
+		}
+	}
+
+	kfree(request_buf);
+
+	if (err)
+		mana_ib_gd_destroy_dma_region(dev, create_resp.dma_region_handle);
+
+	return err;
+}
+
+int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *dev, u64 gdma_region)
+{
+	struct gdma_dev *mdev = dev->gdma_dev;
+	struct gdma_context *gc;
+
+	gc = mdev->gdma_context;
+	ibdev_dbg(&dev->ib_dev, "destroy dma region 0x%llx\n", gdma_region);
+
+	return mana_gd_destroy_dma_region(gc, gdma_region);
+}
+
+int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
+{
+	struct mana_ib_ucontext *mana_ucontext =
+		container_of(ibcontext, struct mana_ib_ucontext, ibucontext);
+	struct ib_device *ibdev = ibcontext->device;
+	struct mana_ib_dev *mdev;
+	struct gdma_context *gc;
+	phys_addr_t pfn;
+	pgprot_t prot;
+	int ret;
+
+	mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
+	gc = mdev->gdma_dev->gdma_context;
+
+	if (vma->vm_pgoff != 0) {
+		ibdev_dbg(ibdev, "Unexpected vm_pgoff %lu\n", vma->vm_pgoff);
+		return -EINVAL;
+	}
+
+	/* Map to the page indexed by ucontext->doorbell */
+	pfn = (gc->phys_db_page_base +
+	       gc->db_page_size * mana_ucontext->doorbell) >>
+	      PAGE_SHIFT;
+	prot = pgprot_writecombine(vma->vm_page_prot);
+
+	ret = rdma_user_mmap_io(ibcontext, vma, pfn, gc->db_page_size, prot,
+				NULL);
+	if (ret)
+		ibdev_dbg(ibdev, "can't rdma_user_mmap_io ret %d\n", ret);
+	else
+		ibdev_dbg(ibdev, "mapped I/O pfn 0x%llx page_size %u, ret %d\n",
+			  pfn, gc->db_page_size, ret);
+
+	return ret;
+}
+
+int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num,
+			       struct ib_port_immutable *immutable)
+{
+	/*
+	 * This version only support RAW_PACKET
+	 * other values need to be filled for other types
+	 */
+	immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET;
+
+	return 0;
+}
+
+int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
+			 struct ib_udata *uhw)
+{
+	props->max_qp = MANA_MAX_NUM_QUEUES;
+	props->max_qp_wr = MAX_SEND_BUFFERS_PER_QUEUE;
+
+	/*
+	 * max_cqe could be potentially much bigger.
+	 * As this version of driver only support RAW QP, set it to the same
+	 * value as max_qp_wr
+	 */
+	props->max_cqe = MAX_SEND_BUFFERS_PER_QUEUE;
+
+	props->max_mr_size = MANA_IB_MAX_MR_SIZE;
+	props->max_mr = INT_MAX;
+	props->max_send_sge = MAX_TX_WQE_SGL_ENTRIES;
+	props->max_recv_sge = MAX_RX_WQE_SGL_ENTRIES;
+
+	return 0;
+}
+
+int mana_ib_query_port(struct ib_device *ibdev, u32 port,
+		       struct ib_port_attr *props)
+{
+	/* This version doesn't return port properties */
+	return 0;
+}
+
+int mana_ib_query_gid(struct ib_device *ibdev, u32 port, int index,
+		      union ib_gid *gid)
+{
+	/* This version doesn't return GID properties */
+	return 0;
+}
+
+void mana_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
+{
+}
diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h
new file mode 100644
index 000000000000..2225a6d6f8e1
--- /dev/null
+++ b/drivers/infiniband/hw/mana/mana_ib.h
@@ -0,0 +1,156 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2022 Microsoft Corporation. All rights reserved.
+ */
+
+#ifndef _MANA_IB_H_
+#define _MANA_IB_H_
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_umem.h>
+#include <rdma/mana-abi.h>
+#include <rdma/uverbs_ioctl.h>
+
+#include <net/mana/mana.h>
+
+#define PAGE_SZ_BM                                                             \
+	(SZ_4K | SZ_8K | SZ_16K | SZ_32K | SZ_64K | SZ_128K | SZ_256K |        \
+	 SZ_512K | SZ_1M | SZ_2M)
+
+/* MANA doesn't have any limit for MR size */
+#define MANA_IB_MAX_MR_SIZE ((u64)(~(0ULL)))
+
+struct mana_ib_dev {
+	struct ib_device ib_dev;
+	struct gdma_dev *gdma_dev;
+};
+
+struct mana_ib_wq {
+	struct ib_wq ibwq;
+	struct ib_umem *umem;
+	int wqe;
+	u32 wq_buf_size;
+	u64 gdma_region;
+	u64 id;
+	mana_handle_t rx_object;
+};
+
+struct mana_ib_pd {
+	struct ib_pd ibpd;
+	u32 pdn;
+	mana_handle_t pd_handle;
+
+	/* Mutex for sharing access to vport_use_count */
+	struct mutex vport_mutex;
+	int vport_use_count;
+
+	bool tx_shortform_allowed;
+	u32 tx_vp_offset;
+};
+
+struct mana_ib_mr {
+	struct ib_mr ibmr;
+	struct ib_umem *umem;
+	mana_handle_t mr_handle;
+};
+
+struct mana_ib_cq {
+	struct ib_cq ibcq;
+	struct ib_umem *umem;
+	int cqe;
+	u64 gdma_region;
+	u64 id;
+};
+
+struct mana_ib_qp {
+	struct ib_qp ibqp;
+
+	/* Work queue info */
+	struct ib_umem *sq_umem;
+	int sqe;
+	u64 sq_gdma_region;
+	u64 sq_id;
+	mana_handle_t tx_object;
+
+	/* The port on the IB device, starting with 1 */
+	u32 port;
+};
+
+struct mana_ib_ucontext {
+	struct ib_ucontext ibucontext;
+	u32 doorbell;
+};
+
+struct mana_ib_rwq_ind_table {
+	struct ib_rwq_ind_table ib_ind_table;
+};
+
+int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem,
+				 mana_handle_t *gdma_region);
+
+int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *dev,
+				  mana_handle_t gdma_region);
+
+struct ib_wq *mana_ib_create_wq(struct ib_pd *pd,
+				struct ib_wq_init_attr *init_attr,
+				struct ib_udata *udata);
+
+int mana_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
+		      u32 wq_attr_mask, struct ib_udata *udata);
+
+int mana_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata);
+
+int mana_ib_create_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_table,
+				 struct ib_rwq_ind_table_init_attr *init_attr,
+				 struct ib_udata *udata);
+
+int mana_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl);
+
+struct ib_mr *mana_ib_get_dma_mr(struct ib_pd *ibpd, int access_flags);
+
+struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+				  u64 iova, int access_flags,
+				  struct ib_udata *udata);
+
+int mana_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
+
+int mana_ib_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *qp_init_attr,
+		      struct ib_udata *udata);
+
+int mana_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		      int attr_mask, struct ib_udata *udata);
+
+int mana_ib_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
+
+int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port_id,
+		      struct mana_ib_pd *pd, u32 doorbell_id);
+void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd,
+			 u32 port);
+
+int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		      struct ib_udata *udata);
+
+int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
+
+int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
+int mana_ib_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
+
+int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext,
+			   struct ib_udata *udata);
+void mana_ib_dealloc_ucontext(struct ib_ucontext *ibcontext);
+
+int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma);
+
+int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num,
+			       struct ib_port_immutable *immutable);
+int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
+			 struct ib_udata *uhw);
+int mana_ib_query_port(struct ib_device *ibdev, u32 port,
+		       struct ib_port_attr *props);
+int mana_ib_query_gid(struct ib_device *ibdev, u32 port, int index,
+		      union ib_gid *gid);
+
+void mana_ib_disassociate_ucontext(struct ib_ucontext *ibcontext);
+
+#endif
diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c
new file mode 100644
index 000000000000..09124dd1792d
--- /dev/null
+++ b/drivers/infiniband/hw/mana/mr.c
@@ -0,0 +1,200 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2022, Microsoft Corporation. All rights reserved.
+ */
+
+#include "mana_ib.h"
+
+#define VALID_MR_FLAGS                                                         \
+	(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ)
+
+static enum gdma_mr_access_flags
+mana_ib_verbs_to_gdma_access_flags(int access_flags)
+{
+	enum gdma_mr_access_flags flags = GDMA_ACCESS_FLAG_LOCAL_READ;
+
+	if (access_flags & IB_ACCESS_LOCAL_WRITE)
+		flags |= GDMA_ACCESS_FLAG_LOCAL_WRITE;
+
+	if (access_flags & IB_ACCESS_REMOTE_WRITE)
+		flags |= GDMA_ACCESS_FLAG_REMOTE_WRITE;
+
+	if (access_flags & IB_ACCESS_REMOTE_READ)
+		flags |= GDMA_ACCESS_FLAG_REMOTE_READ;
+
+	return flags;
+}
+
+static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr,
+				struct gdma_create_mr_params *mr_params)
+{
+	struct gdma_create_mr_response resp = {};
+	struct gdma_create_mr_request req = {};
+	struct gdma_dev *mdev = dev->gdma_dev;
+	struct gdma_context *gc;
+	int err;
+
+	gc = mdev->gdma_context;
+
+	mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_MR, sizeof(req),
+			     sizeof(resp));
+	req.pd_handle = mr_params->pd_handle;
+	req.mr_type = mr_params->mr_type;
+
+	switch (mr_params->mr_type) {
+	case GDMA_MR_TYPE_GVA:
+		req.gva.dma_region_handle = mr_params->gva.dma_region_handle;
+		req.gva.virtual_address = mr_params->gva.virtual_address;
+		req.gva.access_flags = mr_params->gva.access_flags;
+		break;
+
+	default:
+		ibdev_dbg(&dev->ib_dev,
+			  "invalid param (GDMA_MR_TYPE) passed, type %d\n",
+			  req.mr_type);
+		err = -EINVAL;
+		goto error;
+	}
+
+	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
+
+	if (err || resp.hdr.status) {
+		ibdev_dbg(&dev->ib_dev, "Failed to create mr %d, %u", err,
+			  resp.hdr.status);
+		if (!err)
+			err = -EPROTO;
+
+		goto error;
+	}
+
+	mr->ibmr.lkey = resp.lkey;
+	mr->ibmr.rkey = resp.rkey;
+	mr->mr_handle = resp.mr_handle;
+
+	return 0;
+error:
+	return err;
+}
+
+static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev, gdma_obj_handle_t mr_handle)
+{
+	struct gdma_destroy_mr_response resp = {};
+	struct gdma_destroy_mr_request req = {};
+	struct gdma_dev *mdev = dev->gdma_dev;
+	struct gdma_context *gc;
+	int err;
+
+	gc = mdev->gdma_context;
+
+	mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_MR, sizeof(req),
+			     sizeof(resp));
+
+	req.mr_handle = mr_handle;
+
+	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
+	if (err || resp.hdr.status) {
+		dev_err(gc->dev, "Failed to destroy MR: %d, 0x%x\n", err,
+			resp.hdr.status);
+		if (!err)
+			err = -EPROTO;
+		return err;
+	}
+
+	return 0;
+}
+
+struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
+				  u64 iova, int access_flags,
+				  struct ib_udata *udata)
+{
+	struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd);
+	struct gdma_create_mr_params mr_params = {};
+	struct ib_device *ibdev = ibpd->device;
+	gdma_obj_handle_t dma_region_handle;
+	struct mana_ib_dev *dev;
+	struct mana_ib_mr *mr;
+	int err;
+
+	dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
+
+	ibdev_dbg(ibdev,
+		  "start 0x%llx, iova 0x%llx length 0x%llx access_flags 0x%x",
+		  start, iova, length, access_flags);
+
+	if (access_flags & ~VALID_MR_FLAGS)
+		return ERR_PTR(-EINVAL);
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	mr->umem = ib_umem_get(ibdev, start, length, access_flags);
+	if (IS_ERR(mr->umem)) {
+		err = PTR_ERR(mr->umem);
+		ibdev_dbg(ibdev,
+			  "Failed to get umem for register user-mr, %d\n", err);
+		goto err_free;
+	}
+
+	err = mana_ib_gd_create_dma_region(dev, mr->umem, &dma_region_handle);
+	if (err) {
+		ibdev_dbg(ibdev, "Failed create dma region for user-mr, %d\n",
+			  err);
+		goto err_umem;
+	}
+
+	ibdev_dbg(ibdev,
+		  "mana_ib_gd_create_dma_region ret %d gdma_region %llx\n", err,
+		  dma_region_handle);
+
+	mr_params.pd_handle = pd->pd_handle;
+	mr_params.mr_type = GDMA_MR_TYPE_GVA;
+	mr_params.gva.dma_region_handle = dma_region_handle;
+	mr_params.gva.virtual_address = iova;
+	mr_params.gva.access_flags =
+		mana_ib_verbs_to_gdma_access_flags(access_flags);
+
+	err = mana_ib_gd_create_mr(dev, mr, &mr_params);
+	if (err)
+		goto err_dma_region;
+
+	/*
+	 * There is no need to keep track of dma_region_handle after MR is
+	 * successfully created. The dma_region_handle is tracked in the PF
+	 * as part of the lifecycle of this MR.
+	 */
+
+	return &mr->ibmr;
+
+err_dma_region:
+	mana_gd_destroy_dma_region(dev->gdma_dev->gdma_context,
+				   dma_region_handle);
+
+err_umem:
+	ib_umem_release(mr->umem);
+
+err_free:
+	kfree(mr);
+	return ERR_PTR(err);
+}
+
+int mana_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+{
+	struct mana_ib_mr *mr = container_of(ibmr, struct mana_ib_mr, ibmr);
+	struct ib_device *ibdev = ibmr->device;
+	struct mana_ib_dev *dev;
+	int err;
+
+	dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
+
+	err = mana_ib_gd_destroy_mr(dev, mr->mr_handle);
+	if (err)
+		return err;
+
+	if (mr->umem)
+		ib_umem_release(mr->umem);
+
+	kfree(mr);
+
+	return 0;
+}
diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c
new file mode 100644
index 000000000000..fec7d4a06ace
--- /dev/null
+++ b/drivers/infiniband/hw/mana/qp.c
@@ -0,0 +1,505 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2022, Microsoft Corporation. All rights reserved.
+ */
+
+#include "mana_ib.h"
+
+static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev,
+				      struct net_device *ndev,
+				      mana_handle_t default_rxobj,
+				      mana_handle_t ind_table[],
+				      u32 log_ind_tbl_size, u32 rx_hash_key_len,
+				      u8 *rx_hash_key)
+{
+	struct mana_port_context *mpc = netdev_priv(ndev);
+	struct mana_cfg_rx_steer_req *req = NULL;
+	struct mana_cfg_rx_steer_resp resp = {};
+	mana_handle_t *req_indir_tab;
+	struct gdma_context *gc;
+	struct gdma_dev *mdev;
+	u32 req_buf_size;
+	int i, err;
+
+	mdev = dev->gdma_dev;
+	gc = mdev->gdma_context;
+
+	req_buf_size =
+		sizeof(*req) + sizeof(mana_handle_t) * MANA_INDIRECT_TABLE_SIZE;
+	req = kzalloc(req_buf_size, GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	mana_gd_init_req_hdr(&req->hdr, MANA_CONFIG_VPORT_RX, req_buf_size,
+			     sizeof(resp));
+
+	req->vport = mpc->port_handle;
+	req->rx_enable = 1;
+	req->update_default_rxobj = 1;
+	req->default_rxobj = default_rxobj;
+	req->hdr.dev_id = mdev->dev_id;
+
+	/* If there are more than 1 entries in indirection table, enable RSS */
+	if (log_ind_tbl_size)
+		req->rss_enable = true;
+
+	req->num_indir_entries = MANA_INDIRECT_TABLE_SIZE;
+	req->indir_tab_offset = sizeof(*req);
+	req->update_indir_tab = true;
+
+	req_indir_tab = (mana_handle_t *)(req + 1);
+	/* The ind table passed to the hardware must have
+	 * MANA_INDIRECT_TABLE_SIZE entries. Adjust the verb
+	 * ind_table to MANA_INDIRECT_TABLE_SIZE if required
+	 */
+	ibdev_dbg(&dev->ib_dev, "ind table size %u\n", 1 << log_ind_tbl_size);
+	for (i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++) {
+		req_indir_tab[i] = ind_table[i % (1 << log_ind_tbl_size)];
+		ibdev_dbg(&dev->ib_dev, "index %u handle 0x%llx\n", i,
+			  req_indir_tab[i]);
+	}
+
+	req->update_hashkey = true;
+	if (rx_hash_key_len)
+		memcpy(req->hashkey, rx_hash_key, rx_hash_key_len);
+	else
+		netdev_rss_key_fill(req->hashkey, MANA_HASH_KEY_SIZE);
+
+	ibdev_dbg(&dev->ib_dev, "vport handle %llu default_rxobj 0x%llx\n",
+		  req->vport, default_rxobj);
+
+	err = mana_gd_send_request(gc, req_buf_size, req, sizeof(resp), &resp);
+	if (err) {
+		netdev_err(ndev, "Failed to configure vPort RX: %d\n", err);
+		goto out;
+	}
+
+	if (resp.hdr.status) {
+		netdev_err(ndev, "vPort RX configuration failed: 0x%x\n",
+			   resp.hdr.status);
+		err = -EPROTO;
+	}
+
+	netdev_info(ndev, "Configured steering vPort %llu log_entries %u\n",
+		    mpc->port_handle, log_ind_tbl_size);
+
+out:
+	kfree(req);
+	return err;
+}
+
+static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
+				 struct ib_qp_init_attr *attr,
+				 struct ib_udata *udata)
+{
+	struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp);
+	struct mana_ib_dev *mdev =
+		container_of(pd->device, struct mana_ib_dev, ib_dev);
+	struct ib_rwq_ind_table *ind_tbl = attr->rwq_ind_tbl;
+	struct mana_ib_create_qp_rss_resp resp = {};
+	struct mana_ib_create_qp_rss ucmd = {};
+	struct gdma_dev *gd = mdev->gdma_dev;
+	mana_handle_t *mana_ind_table;
+	struct mana_port_context *mpc;
+	struct mana_context *mc;
+	struct net_device *ndev;
+	struct mana_ib_cq *cq;
+	struct mana_ib_wq *wq;
+	unsigned int ind_tbl_size;
+	struct ib_cq *ibcq;
+	struct ib_wq *ibwq;
+	u32 port;
+	int ret;
+	int i;
+
+	mc = gd->driver_data;
+
+	if (!udata || udata->inlen < sizeof(ucmd))
+		return -EINVAL;
+
+	ret = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen));
+	if (ret) {
+		ibdev_dbg(&mdev->ib_dev,
+			  "Failed copy from udata for create rss-qp, err %d\n",
+			  ret);
+		return -EFAULT;
+	}
+
+	if (attr->cap.max_recv_wr > MAX_SEND_BUFFERS_PER_QUEUE) {
+		ibdev_dbg(&mdev->ib_dev,
+			  "Requested max_recv_wr %d exceeding limit\n",
+			  attr->cap.max_recv_wr);
+		return -EINVAL;
+	}
+
+	if (attr->cap.max_recv_sge > MAX_RX_WQE_SGL_ENTRIES) {
+		ibdev_dbg(&mdev->ib_dev,
+			  "Requested max_recv_sge %d exceeding limit\n",
+			  attr->cap.max_recv_sge);
+		return -EINVAL;
+	}
+
+	ind_tbl_size = 1 << ind_tbl->log_ind_tbl_size;
+	if (ind_tbl_size > MANA_INDIRECT_TABLE_SIZE) {
+		ibdev_dbg(&mdev->ib_dev,
+			  "Indirect table size %d exceeding limit\n",
+			  ind_tbl_size);
+		return -EINVAL;
+	}
+
+	if (ucmd.rx_hash_function != MANA_IB_RX_HASH_FUNC_TOEPLITZ) {
+		ibdev_dbg(&mdev->ib_dev,
+			  "RX Hash function is not supported, %d\n",
+			  ucmd.rx_hash_function);
+		return -EINVAL;
+	}
+
+	/* IB ports start with 1, MANA start with 0 */
+	port = ucmd.port;
+	if (port < 1 || port > mc->num_ports) {
+		ibdev_dbg(&mdev->ib_dev, "Invalid port %u in creating qp\n",
+			  port);
+		return -EINVAL;
+	}
+	ndev = mc->ports[port - 1];
+	mpc = netdev_priv(ndev);
+
+	ibdev_dbg(&mdev->ib_dev, "rx_hash_function %d port %d\n",
+		  ucmd.rx_hash_function, port);
+
+	mana_ind_table = kcalloc(ind_tbl_size, sizeof(mana_handle_t),
+				 GFP_KERNEL);
+	if (!mana_ind_table) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	qp->port = port;
+
+	for (i = 0; i < ind_tbl_size; i++) {
+		struct mana_obj_spec wq_spec = {};
+		struct mana_obj_spec cq_spec = {};
+
+		ibwq = ind_tbl->ind_tbl[i];
+		wq = container_of(ibwq, struct mana_ib_wq, ibwq);
+
+		ibcq = ibwq->cq;
+		cq = container_of(ibcq, struct mana_ib_cq, ibcq);
+
+		wq_spec.gdma_region = wq->gdma_region;
+		wq_spec.queue_size = wq->wq_buf_size;
+
+		cq_spec.gdma_region = cq->gdma_region;
+		cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE;
+		cq_spec.modr_ctx_id = 0;
+		cq_spec.attached_eq = GDMA_CQ_NO_EQ;
+
+		ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ,
+					 &wq_spec, &cq_spec, &wq->rx_object);
+		if (ret)
+			goto fail;
+
+		/* The GDMA regions are now owned by the WQ object */
+		wq->gdma_region = GDMA_INVALID_DMA_REGION;
+		cq->gdma_region = GDMA_INVALID_DMA_REGION;
+
+		wq->id = wq_spec.queue_index;
+		cq->id = cq_spec.queue_index;
+
+		ibdev_dbg(&mdev->ib_dev,
+			  "ret %d rx_object 0x%llx wq id %llu cq id %llu\n",
+			  ret, wq->rx_object, wq->id, cq->id);
+
+		resp.entries[i].cqid = cq->id;
+		resp.entries[i].wqid = wq->id;
+
+		mana_ind_table[i] = wq->rx_object;
+	}
+	resp.num_entries = i;
+
+	ret = mana_ib_cfg_vport_steering(mdev, ndev, wq->rx_object,
+					 mana_ind_table,
+					 ind_tbl->log_ind_tbl_size,
+					 ucmd.rx_hash_key_len,
+					 ucmd.rx_hash_key);
+	if (ret)
+		goto fail;
+
+	ret = ib_copy_to_udata(udata, &resp, sizeof(resp));
+	if (ret) {
+		ibdev_dbg(&mdev->ib_dev,
+			  "Failed to copy to udata create rss-qp, %d\n",
+			  ret);
+		goto fail;
+	}
+
+	kfree(mana_ind_table);
+
+	return 0;
+
+fail:
+	while (i-- > 0) {
+		ibwq = ind_tbl->ind_tbl[i];
+		wq = container_of(ibwq, struct mana_ib_wq, ibwq);
+		mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object);
+	}
+
+	kfree(mana_ind_table);
+
+	return ret;
+}
+
+static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd,
+				 struct ib_qp_init_attr *attr,
+				 struct ib_udata *udata)
+{
+	struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd);
+	struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp);
+	struct mana_ib_dev *mdev =
+		container_of(ibpd->device, struct mana_ib_dev, ib_dev);
+	struct mana_ib_cq *send_cq =
+		container_of(attr->send_cq, struct mana_ib_cq, ibcq);
+	struct mana_ib_ucontext *mana_ucontext =
+		rdma_udata_to_drv_context(udata, struct mana_ib_ucontext,
+					  ibucontext);
+	struct mana_ib_create_qp_resp resp = {};
+	struct gdma_dev *gd = mdev->gdma_dev;
+	struct mana_ib_create_qp ucmd = {};
+	struct mana_obj_spec wq_spec = {};
+	struct mana_obj_spec cq_spec = {};
+	struct mana_port_context *mpc;
+	struct mana_context *mc;
+	struct net_device *ndev;
+	struct ib_umem *umem;
+	int err;
+	u32 port;
+
+	mc = gd->driver_data;
+
+	if (!mana_ucontext || udata->inlen < sizeof(ucmd))
+		return -EINVAL;
+
+	err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen));
+	if (err) {
+		ibdev_dbg(&mdev->ib_dev,
+			  "Failed to copy from udata create qp-raw, %d\n", err);
+		return -EFAULT;
+	}
+
+	/* IB ports start with 1, MANA Ethernet ports start with 0 */
+	port = ucmd.port;
+	if (ucmd.port > mc->num_ports)
+		return -EINVAL;
+
+	if (attr->cap.max_send_wr > MAX_SEND_BUFFERS_PER_QUEUE) {
+		ibdev_dbg(&mdev->ib_dev,
+			  "Requested max_send_wr %d exceeding limit\n",
+			  attr->cap.max_send_wr);
+		return -EINVAL;
+	}
+
+	if (attr->cap.max_send_sge > MAX_TX_WQE_SGL_ENTRIES) {
+		ibdev_dbg(&mdev->ib_dev,
+			  "Requested max_send_sge %d exceeding limit\n",
+			  attr->cap.max_send_sge);
+		return -EINVAL;
+	}
+
+	ndev = mc->ports[port - 1];
+	mpc = netdev_priv(ndev);
+	ibdev_dbg(&mdev->ib_dev, "port %u ndev %p mpc %p\n", port, ndev, mpc);
+
+	err = mana_ib_cfg_vport(mdev, port - 1, pd, mana_ucontext->doorbell);
+	if (err)
+		return -ENODEV;
+
+	qp->port = port;
+
+	ibdev_dbg(&mdev->ib_dev, "ucmd sq_buf_addr 0x%llx port %u\n",
+		  ucmd.sq_buf_addr, ucmd.port);
+
+	umem = ib_umem_get(ibpd->device, ucmd.sq_buf_addr, ucmd.sq_buf_size,
+			   IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(umem)) {
+		err = PTR_ERR(umem);
+		ibdev_dbg(&mdev->ib_dev,
+			  "Failed to get umem for create qp-raw, err %d\n",
+			  err);
+		goto err_free_vport;
+	}
+	qp->sq_umem = umem;
+
+	err = mana_ib_gd_create_dma_region(mdev, qp->sq_umem,
+					   &qp->sq_gdma_region);
+	if (err) {
+		ibdev_dbg(&mdev->ib_dev,
+			  "Failed to create dma region for create qp-raw, %d\n",
+			  err);
+		goto err_release_umem;
+	}
+
+	ibdev_dbg(&mdev->ib_dev,
+		  "mana_ib_gd_create_dma_region ret %d gdma_region 0x%llx\n",
+		  err, qp->sq_gdma_region);
+
+	/* Create a WQ on the same port handle used by the Ethernet */
+	wq_spec.gdma_region = qp->sq_gdma_region;
+	wq_spec.queue_size = ucmd.sq_buf_size;
+
+	cq_spec.gdma_region = send_cq->gdma_region;
+	cq_spec.queue_size = send_cq->cqe * COMP_ENTRY_SIZE;
+	cq_spec.modr_ctx_id = 0;
+	cq_spec.attached_eq = GDMA_CQ_NO_EQ;
+
+	err = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_SQ, &wq_spec,
+				 &cq_spec, &qp->tx_object);
+	if (err) {
+		ibdev_dbg(&mdev->ib_dev,
+			  "Failed to create wq for create raw-qp, err %d\n",
+			  err);
+		goto err_destroy_dma_region;
+	}
+
+	/* The GDMA regions are now owned by the WQ object */
+	qp->sq_gdma_region = GDMA_INVALID_DMA_REGION;
+	send_cq->gdma_region = GDMA_INVALID_DMA_REGION;
+
+	qp->sq_id = wq_spec.queue_index;
+	send_cq->id = cq_spec.queue_index;
+
+	ibdev_dbg(&mdev->ib_dev,
+		  "ret %d qp->tx_object 0x%llx sq id %llu cq id %llu\n", err,
+		  qp->tx_object, qp->sq_id, send_cq->id);
+
+	resp.sqid = qp->sq_id;
+	resp.cqid = send_cq->id;
+	resp.tx_vp_offset = pd->tx_vp_offset;
+
+	err = ib_copy_to_udata(udata, &resp, sizeof(resp));
+	if (err) {
+		ibdev_dbg(&mdev->ib_dev,
+			  "Failed copy udata for create qp-raw, %d\n",
+			  err);
+		goto err_destroy_wq_obj;
+	}
+
+	return 0;
+
+err_destroy_wq_obj:
+	mana_destroy_wq_obj(mpc, GDMA_SQ, qp->tx_object);
+
+err_destroy_dma_region:
+	mana_ib_gd_destroy_dma_region(mdev, qp->sq_gdma_region);
+
+err_release_umem:
+	ib_umem_release(umem);
+
+err_free_vport:
+	mana_ib_uncfg_vport(mdev, pd, port - 1);
+
+	return err;
+}
+
+int mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr,
+		      struct ib_udata *udata)
+{
+	switch (attr->qp_type) {
+	case IB_QPT_RAW_PACKET:
+		/* When rwq_ind_tbl is used, it's for creating WQs for RSS */
+		if (attr->rwq_ind_tbl)
+			return mana_ib_create_qp_rss(ibqp, ibqp->pd, attr,
+						     udata);
+
+		return mana_ib_create_qp_raw(ibqp, ibqp->pd, attr, udata);
+	default:
+		/* Creating QP other than IB_QPT_RAW_PACKET is not supported */
+		ibdev_dbg(ibqp->device, "Creating QP type %u not supported\n",
+			  attr->qp_type);
+	}
+
+	return -EINVAL;
+}
+
+int mana_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		      int attr_mask, struct ib_udata *udata)
+{
+	/* modify_qp is not supported by this version of the driver */
+	return -EOPNOTSUPP;
+}
+
+static int mana_ib_destroy_qp_rss(struct mana_ib_qp *qp,
+				  struct ib_rwq_ind_table *ind_tbl,
+				  struct ib_udata *udata)
+{
+	struct mana_ib_dev *mdev =
+		container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev);
+	struct gdma_dev *gd = mdev->gdma_dev;
+	struct mana_port_context *mpc;
+	struct mana_context *mc;
+	struct net_device *ndev;
+	struct mana_ib_wq *wq;
+	struct ib_wq *ibwq;
+	int i;
+
+	mc = gd->driver_data;
+	ndev = mc->ports[qp->port - 1];
+	mpc = netdev_priv(ndev);
+
+	for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) {
+		ibwq = ind_tbl->ind_tbl[i];
+		wq = container_of(ibwq, struct mana_ib_wq, ibwq);
+		ibdev_dbg(&mdev->ib_dev, "destroying wq->rx_object %llu\n",
+			  wq->rx_object);
+		mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object);
+	}
+
+	return 0;
+}
+
+static int mana_ib_destroy_qp_raw(struct mana_ib_qp *qp, struct ib_udata *udata)
+{
+	struct mana_ib_dev *mdev =
+		container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev);
+	struct gdma_dev *gd = mdev->gdma_dev;
+	struct ib_pd *ibpd = qp->ibqp.pd;
+	struct mana_port_context *mpc;
+	struct mana_context *mc;
+	struct net_device *ndev;
+	struct mana_ib_pd *pd;
+
+	mc = gd->driver_data;
+	ndev = mc->ports[qp->port - 1];
+	mpc = netdev_priv(ndev);
+	pd = container_of(ibpd, struct mana_ib_pd, ibpd);
+
+	mana_destroy_wq_obj(mpc, GDMA_SQ, qp->tx_object);
+
+	if (qp->sq_umem) {
+		mana_ib_gd_destroy_dma_region(mdev, qp->sq_gdma_region);
+		ib_umem_release(qp->sq_umem);
+	}
+
+	mana_ib_uncfg_vport(mdev, pd, qp->port - 1);
+
+	return 0;
+}
+
+int mana_ib_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
+{
+	struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp);
+
+	switch (ibqp->qp_type) {
+	case IB_QPT_RAW_PACKET:
+		if (ibqp->rwq_ind_tbl)
+			return mana_ib_destroy_qp_rss(qp, ibqp->rwq_ind_tbl,
+						      udata);
+
+		return mana_ib_destroy_qp_raw(qp, udata);
+
+	default:
+		ibdev_dbg(ibqp->device, "Unexpected QP type %u\n",
+			  ibqp->qp_type);
+	}
+
+	return -ENOENT;
+}
diff --git a/drivers/infiniband/hw/mana/wq.c b/drivers/infiniband/hw/mana/wq.c
new file mode 100644
index 000000000000..5472e9472f94
--- /dev/null
+++ b/drivers/infiniband/hw/mana/wq.c
@@ -0,0 +1,115 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2022, Microsoft Corporation. All rights reserved.
+ */
+
+#include "mana_ib.h"
+
+struct ib_wq *mana_ib_create_wq(struct ib_pd *pd,
+				struct ib_wq_init_attr *init_attr,
+				struct ib_udata *udata)
+{
+	struct mana_ib_dev *mdev =
+		container_of(pd->device, struct mana_ib_dev, ib_dev);
+	struct mana_ib_create_wq ucmd = {};
+	struct mana_ib_wq *wq;
+	struct ib_umem *umem;
+	int err;
+
+	if (udata->inlen < sizeof(ucmd))
+		return ERR_PTR(-EINVAL);
+
+	err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen));
+	if (err) {
+		ibdev_dbg(&mdev->ib_dev,
+			  "Failed to copy from udata for create wq, %d\n", err);
+		return ERR_PTR(-EFAULT);
+	}
+
+	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
+	if (!wq)
+		return ERR_PTR(-ENOMEM);
+
+	ibdev_dbg(&mdev->ib_dev, "ucmd wq_buf_addr 0x%llx\n", ucmd.wq_buf_addr);
+
+	umem = ib_umem_get(pd->device, ucmd.wq_buf_addr, ucmd.wq_buf_size,
+			   IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(umem)) {
+		err = PTR_ERR(umem);
+		ibdev_dbg(&mdev->ib_dev,
+			  "Failed to get umem for create wq, err %d\n", err);
+		goto err_free_wq;
+	}
+
+	wq->umem = umem;
+	wq->wqe = init_attr->max_wr;
+	wq->wq_buf_size = ucmd.wq_buf_size;
+	wq->rx_object = INVALID_MANA_HANDLE;
+
+	err = mana_ib_gd_create_dma_region(mdev, wq->umem, &wq->gdma_region);
+	if (err) {
+		ibdev_dbg(&mdev->ib_dev,
+			  "Failed to create dma region for create wq, %d\n",
+			  err);
+		goto err_release_umem;
+	}
+
+	ibdev_dbg(&mdev->ib_dev,
+		  "mana_ib_gd_create_dma_region ret %d gdma_region 0x%llx\n",
+		  err, wq->gdma_region);
+
+	/* WQ ID is returned at wq_create time, doesn't know the value yet */
+
+	return &wq->ibwq;
+
+err_release_umem:
+	ib_umem_release(umem);
+
+err_free_wq:
+	kfree(wq);
+
+	return ERR_PTR(err);
+}
+
+int mana_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
+		      u32 wq_attr_mask, struct ib_udata *udata)
+{
+	/* modify_wq is not supported by this version of the driver */
+	return -EOPNOTSUPP;
+}
+
+int mana_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata)
+{
+	struct mana_ib_wq *wq = container_of(ibwq, struct mana_ib_wq, ibwq);
+	struct ib_device *ib_dev = ibwq->device;
+	struct mana_ib_dev *mdev;
+
+	mdev = container_of(ib_dev, struct mana_ib_dev, ib_dev);
+
+	mana_ib_gd_destroy_dma_region(mdev, wq->gdma_region);
+	ib_umem_release(wq->umem);
+
+	kfree(wq);
+
+	return 0;
+}
+
+int mana_ib_create_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_table,
+				 struct ib_rwq_ind_table_init_attr *init_attr,
+				 struct ib_udata *udata)
+{
+	/*
+	 * There is no additional data in ind_table to be maintained by this
+	 * driver, do nothing
+	 */
+	return 0;
+}
+
+int mana_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl)
+{
+	/*
+	 * There is no additional data in ind_table to be maintained by this
+	 * driver, do nothing
+	 */
+	return 0;
+}
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 713a8f8cca9a..20212ffeefb9 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -412,6 +412,9 @@  int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf);
 
 extern const struct ethtool_ops mana_ethtool_ops;
 
+/* A CQ can be created not associated with any EQ */
+#define GDMA_CQ_NO_EQ  0xffff
+
 struct mana_obj_spec {
 	u32 queue_index;
 	u64 gdma_region;
diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h
index 7dd56210226f..e0c25537fd2e 100644
--- a/include/uapi/rdma/ib_user_ioctl_verbs.h
+++ b/include/uapi/rdma/ib_user_ioctl_verbs.h
@@ -251,6 +251,7 @@  enum rdma_driver_id {
 	RDMA_DRIVER_EFA,
 	RDMA_DRIVER_SIW,
 	RDMA_DRIVER_ERDMA,
+	RDMA_DRIVER_MANA,
 };
 
 enum ib_uverbs_gid_type {
diff --git a/include/uapi/rdma/mana-abi.h b/include/uapi/rdma/mana-abi.h
new file mode 100644
index 000000000000..5fcb31b37fb9
--- /dev/null
+++ b/include/uapi/rdma/mana-abi.h
@@ -0,0 +1,66 @@ 
+/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) */
+/*
+ * Copyright (c) 2022, Microsoft Corporation. All rights reserved.
+ */
+
+#ifndef MANA_ABI_USER_H
+#define MANA_ABI_USER_H
+
+#include <linux/types.h>
+#include <rdma/ib_user_ioctl_verbs.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+
+#define MANA_IB_UVERBS_ABI_VERSION 1
+
+struct mana_ib_create_cq {
+	__aligned_u64 buf_addr;
+};
+
+struct mana_ib_create_qp {
+	__aligned_u64 sq_buf_addr;
+	__u32 sq_buf_size;
+	__u32 port;
+};
+
+struct mana_ib_create_qp_resp {
+	__u32 sqid;
+	__u32 cqid;
+	__u32 tx_vp_offset;
+	__u32 reserved;
+};
+
+struct mana_ib_create_wq {
+	__aligned_u64 wq_buf_addr;
+	__u32 wq_buf_size;
+	__u32 reserved;
+};
+
+/* RX Hash function flags */
+enum mana_ib_rx_hash_function_flags {
+	MANA_IB_RX_HASH_FUNC_TOEPLITZ = 1 << 0,
+};
+
+struct mana_ib_create_qp_rss {
+	__aligned_u64 rx_hash_fields_mask;
+	__u8 rx_hash_function;
+	__u8 reserved[7];
+	__u32 rx_hash_key_len;
+	__u8 rx_hash_key[40];
+	__u32 port;
+};
+
+struct rss_resp_entry {
+	__u32 cqid;
+	__u32 wqid;
+};
+
+struct mana_ib_create_qp_rss_resp {
+	__aligned_u64 num_entries;
+	struct rss_resp_entry entries[64];
+};
+
+#endif