[for-next,v5,6/7] RDMA/rxe: Add support for Send/Recv/Write/Read with ODP

Message ID 25d903e0136ea1e65c612d8f6b8c18c1f010add7.1684397037.git.matsuda-daisuke@fujitsu.com
State New
Series On-Demand Paging on SoftRoCE

Commit Message

Daisuke Matsuda (Fujitsu) May 18, 2023, 8:21 a.m. UTC
rxe_mr_copy() is widely used to copy data to/from a user MR. The requester
uses it to load the payloads of request packets; the responder uses it to
process Send, Write, and Read operations; the completer uses it to copy data
from response packets of Read and Atomic operations to a user MR.

Allow these operations to be used with ODP by adding a subordinate function
rxe_odp_mr_copy(). It consists of the following steps:
 1. Check the driver page table (umem_odp->dma_list) to see if the pages
    being accessed are present with the appropriate permission.
 2. If necessary, trigger a page fault to map the pages.
 3. Update the MR xarray using PFNs in umem_odp->pfn_list.
 4. Execute the data copy to/from the pages.

umem_mutex is used to ensure that dma_list (an array of addresses of an MR)
is not changed while it is being checked and that mapped pages are not
invalidated before the data copy completes.

Signed-off-by: Daisuke Matsuda <matsuda-daisuke@fujitsu.com>
---
 drivers/infiniband/sw/rxe/rxe.c     |  10 +++
 drivers/infiniband/sw/rxe/rxe_loc.h |   8 ++
 drivers/infiniband/sw/rxe/rxe_mr.c  |   2 +-
 drivers/infiniband/sw/rxe/rxe_odp.c | 109 ++++++++++++++++++++++++++++
 4 files changed, 128 insertions(+), 1 deletion(-)
  

Comments

Bob Pearson May 19, 2023, 5:10 p.m. UTC | #1
On 5/18/23 03:21, Daisuke Matsuda wrote:
> rxe_mr_copy() is used widely to copy data to/from a user MR. requester uses
> it to load payloads of requesting packets; responder uses it to process
> Send, Write, and Read operaetions; completer uses it to copy data from
> response packets of Read and Atomic operations to a user MR.
> 
> Allow these operations to be used with ODP by adding a subordinate function
> rxe_odp_mr_copy(). It is comprised of the following steps:
>  1. Check the driver page table(umem_odp->dma_list) to see if pages being
>     accessed are present with appropriate permission.
>  2. If necessary, trigger page fault to map the pages.
>  3. Update the MR xarray using PFNs in umem_odp->pfn_list.
>  4. Execute data copy to/from the pages.
> 
> umem_mutex is used to ensure that dma_list (an array of addresses of an MR)
> is not changed while it is being checked and that mapped pages are not
> invalidated before data copy completes.
> 
> Signed-off-by: Daisuke Matsuda <matsuda-daisuke@fujitsu.com>
> ---
>  drivers/infiniband/sw/rxe/rxe.c     |  10 +++
>  drivers/infiniband/sw/rxe/rxe_loc.h |   8 ++
>  drivers/infiniband/sw/rxe/rxe_mr.c  |   2 +-
>  drivers/infiniband/sw/rxe/rxe_odp.c | 109 ++++++++++++++++++++++++++++
>  4 files changed, 128 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
> index f2284d27229b..207a022156f0 100644
> --- a/drivers/infiniband/sw/rxe/rxe.c
> +++ b/drivers/infiniband/sw/rxe/rxe.c
> @@ -79,6 +79,16 @@ static void rxe_init_device_param(struct rxe_dev *rxe)
>  
>  		/* IB_ODP_SUPPORT_IMPLICIT is not supported right now. */
>  		rxe->attr.odp_caps.general_caps |= IB_ODP_SUPPORT;
> +
> +		rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;
> +		rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_RECV;
> +		rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
> +
> +		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;
> +		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;
> +		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;
> +		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;
> +		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
>  	}
>  }
>  
> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
> index 93247d123642..4b95c8c46bdc 100644
> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
> @@ -206,6 +206,8 @@ static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp)
>  #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
>  int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
>  			 u64 iova, int access_flags, struct rxe_mr *mr);
> +int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
> +		    enum rxe_mr_copy_dir dir);
>  #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>  static inline int
>  rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
> @@ -213,6 +215,12 @@ rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
>  {
>  	return -EOPNOTSUPP;
>  }
> +static inline int
> +rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
> +		int length, enum rxe_mr_copy_dir dir)
> +{
> +	return -EOPNOTSUPP;
> +}
>  
>  #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>  
> diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
> index cd368cd096c8..0e3cda59d702 100644
> --- a/drivers/infiniband/sw/rxe/rxe_mr.c
> +++ b/drivers/infiniband/sw/rxe/rxe_mr.c
> @@ -319,7 +319,7 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
>  	}
>  
>  	if (mr->odp_enabled)
> -		return -EOPNOTSUPP;
> +		return rxe_odp_mr_copy(mr, iova, addr, length, dir);
>  	else
>  		return rxe_mr_copy_xarray(mr, iova, addr, length, dir);
>  }
> diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
> index e5497d09c399..cbe5d0c3fcc4 100644
> --- a/drivers/infiniband/sw/rxe/rxe_odp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_odp.c
> @@ -174,3 +174,112 @@ int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
>  
>  	return err;
>  }
> +
> +static inline bool rxe_is_pagefault_neccesary(struct ib_umem_odp *umem_odp,
> +					      u64 iova, int length, u32 perm)
> +{
> +	int idx;
> +	u64 addr;
> +	bool need_fault = false;
> +
> +	addr = iova & (~(BIT(umem_odp->page_shift) - 1));
> +
> +	/* Skim through all pages that are to be accessed. */
> +	while (addr < iova + length) {
> +		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
> +
> +		if (!(umem_odp->dma_list[idx] & perm)) {
> +			need_fault = true;
> +			break;
> +		}
> +
> +		addr += BIT(umem_odp->page_shift);
> +	}
> +	return need_fault;
> +}
> +
> +/* umem mutex must be locked before entering this function. */
> +static int rxe_odp_map_range(struct rxe_mr *mr, u64 iova, int length, u32 flags)
> +{
> +	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
> +	const int max_tries = 3;
> +	int cnt = 0;
> +
> +	int err;
> +	u64 perm;
> +	bool need_fault;
> +
> +	if (unlikely(length < 1)) {
> +		mutex_unlock(&umem_odp->umem_mutex);
> +		return -EINVAL;
> +	}
> +
> +	perm = ODP_READ_ALLOWED_BIT;
> +	if (!(flags & RXE_PAGEFAULT_RDONLY))
> +		perm |= ODP_WRITE_ALLOWED_BIT;
> +
> +	/*
> +	 * A successful return from rxe_odp_do_pagefault() does not guarantee
> +	 * that all pages in the range became present. Recheck the DMA address
> +	 * array, allowing max 3 tries for pagefault.
> +	 */
> +	while ((need_fault = rxe_is_pagefault_neccesary(umem_odp,
> +							iova, length, perm))) {
> +		if (cnt >= max_tries)
> +			break;
> +
> +		mutex_unlock(&umem_odp->umem_mutex);
> +
> +		/* umem_mutex is locked on success. */
> +		err = rxe_odp_do_pagefault(mr, iova, length, flags);
> +		if (err < 0)
> +			return err;
> +
> +		cnt++;
> +	}
> +
> +	if (need_fault)
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
> +		    enum rxe_mr_copy_dir dir)
> +{
> +	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
> +	u32 flags = 0;
> +	int err;
> +
> +	if (unlikely(!mr->odp_enabled))
> +		return -EOPNOTSUPP;
> +
> +	switch (dir) {
> +	case RXE_TO_MR_OBJ:
> +		break;
> +
> +	case RXE_FROM_MR_OBJ:
> +		flags = RXE_PAGEFAULT_RDONLY;
> +		break;
> +
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	/* If pagefault is not required, umem mutex will be held until data
> +	 * copy to the MR completes. Otherwise, it is released and locked
> +	 * again in rxe_odp_map_range() to let invalidation handler do its
> +	 * work meanwhile.
> +	 */
> +	mutex_lock(&umem_odp->umem_mutex);
> +
> +	err = rxe_odp_map_range(mr, iova, length, flags);
> +	if (err)
> +		return err;
> +
> +	err =  rxe_mr_copy_xarray(mr, iova, addr, length, dir);
> +
> +	mutex_unlock(&umem_odp->umem_mutex);
> +
> +	return err;
> +}

Reviewed-by: Bob Pearson <rpearsonhpe@gmail.com>
  
Jason Gunthorpe June 12, 2023, 4:22 p.m. UTC | #3
On Thu, May 18, 2023 at 05:21:51PM +0900, Daisuke Matsuda wrote:

> +/* umem mutex must be locked before entering this function. */
> +static int rxe_odp_map_range(struct rxe_mr *mr, u64 iova, int length, u32 flags)
> +{
> +	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
> +	const int max_tries = 3;
> +	int cnt = 0;
> +
> +	int err;
> +	u64 perm;
> +	bool need_fault;
> +
> +	if (unlikely(length < 1)) {
> +		mutex_unlock(&umem_odp->umem_mutex);
> +		return -EINVAL;
> +	}
> +
> +	perm = ODP_READ_ALLOWED_BIT;
> +	if (!(flags & RXE_PAGEFAULT_RDONLY))
> +		perm |= ODP_WRITE_ALLOWED_BIT;
> +
> +	/*
> +	 * A successful return from rxe_odp_do_pagefault() does not guarantee
> +	 * that all pages in the range became present. Recheck the DMA address
> +	 * array, allowing max 3 tries for pagefault.
> +	 */
> +	while ((need_fault = rxe_is_pagefault_neccesary(umem_odp,
> +							iova, length, perm))) {
> +		if (cnt >= max_tries)
> +			break;

I don't think this makes sense..

You need to make this work more like mlx5 does, you take the spinlock
on the xarray, you search it for your index and whatever is there
tells what to do. Hold the spinlock while doing the copy. This is
enough locking for the fast path.

If there is no index present, or it is not writable and you need
write, then you unlock the spinlock and prefetch the missing entry and
try again, this time also holding the mutex so there isn't a race.

You shouldn't probe into parts of the umem to discover information
already stored in the xarray then do the same lookup into the xarray.

IIRC this also needs to keep track in the xarray on a per page basis
if the page is writable.

Jason
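
For illustration, a minimal sketch of the fast path Jason describes, assuming
the per-page entries live in the mr->page_list xarray; rxe_odp_copy_fast_path()
and rxe_odp_pages_present() are hypothetical names, not existing rxe functions:

/* Sketch only: take the xarray spinlock, check that every page in the range
 * is present (and writable when the copy goes into the MR), and do the copy
 * without dropping the lock. On a miss, return -EAGAIN so the caller can
 * fall back to a slow path that faults the pages in under umem_mutex and
 * then retries.
 */
static int rxe_odp_copy_fast_path(struct rxe_mr *mr, u64 iova, void *addr,
                                  int length, enum rxe_mr_copy_dir dir)
{
        int err;

        xa_lock(&mr->page_list);

        if (!rxe_odp_pages_present(mr, iova, length, dir)) {
                xa_unlock(&mr->page_list);
                return -EAGAIN;         /* slow path must fault pages in */
        }

        err = rxe_mr_copy_xarray(mr, iova, addr, length, dir);

        xa_unlock(&mr->page_list);
        return err;
}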
  
Daisuke Matsuda (Fujitsu) July 19, 2023, 6:01 a.m. UTC | #4
On Tue, June 13, 2023 1:23 AM Jason Gunthorpe wrote:
> 
> On Thu, May 18, 2023 at 05:21:51PM +0900, Daisuke Matsuda wrote:
> 
> > +/* umem mutex must be locked before entering this function. */
> > +static int rxe_odp_map_range(struct rxe_mr *mr, u64 iova, int length, u32 flags)
> > +{
> > +	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
> > +	const int max_tries = 3;
> > +	int cnt = 0;
> > +
> > +	int err;
> > +	u64 perm;
> > +	bool need_fault;
> > +
> > +	if (unlikely(length < 1)) {
> > +		mutex_unlock(&umem_odp->umem_mutex);
> > +		return -EINVAL;
> > +	}
> > +
> > +	perm = ODP_READ_ALLOWED_BIT;
> > +	if (!(flags & RXE_PAGEFAULT_RDONLY))
> > +		perm |= ODP_WRITE_ALLOWED_BIT;
> > +
> > +	/*
> > +	 * A successful return from rxe_odp_do_pagefault() does not guarantee
> > +	 * that all pages in the range became present. Recheck the DMA address
> > +	 * array, allowing max 3 tries for pagefault.
> > +	 */
> > +	while ((need_fault = rxe_is_pagefault_neccesary(umem_odp,
> > +							iova, length, perm))) {
> > +		if (cnt >= max_tries)
> > +			break;
> 
> I don't think this makes sense..
> 
> You need to make this work more like mlx5 does, you take the spinlock
> on the xarray, you search it for your index and whatever is there
> tells what to do. Hold the spinlock while doing the copy. This is
> enough locking for the fast path.
> 
> If there is no index present, or it is not writable and you need
> write, then you unlock the spinlock and prefetch the missing entry and
> try again, this time also holding the mutex so there isn't a race.
> 
> You shouldn't probe into parts of the umem to discover information
> already stored in the xarray then do the same lookup into the xarray.
> 
> IIRC this also needs to keep track in the xarray on a per page basis
> if the page is writable.

Thank you for the detailed explanation.
I agree the current implementation is inefficient.

I think I can fix the patches according to your comment.
I'll submit a new series once it is complete.

Thanks,
Daisuke

> 
> Jason
  
Daisuke Matsuda (Fujitsu) Sept. 8, 2023, 6:35 a.m. UTC | #5
On Tue, June 13, 2023 1:23 AM Jason Gunthorpe wrote:
> On Thu, May 18, 2023 at 05:21:51PM +0900, Daisuke Matsuda wrote:
> 
> > +/* umem mutex must be locked before entering this function. */
> > +static int rxe_odp_map_range(struct rxe_mr *mr, u64 iova, int length, u32 flags)
> > +{
> > +	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
> > +	const int max_tries = 3;
> > +	int cnt = 0;
> > +
> > +	int err;
> > +	u64 perm;
> > +	bool need_fault;
> > +
> > +	if (unlikely(length < 1)) {
> > +		mutex_unlock(&umem_odp->umem_mutex);
> > +		return -EINVAL;
> > +	}
> > +
> > +	perm = ODP_READ_ALLOWED_BIT;
> > +	if (!(flags & RXE_PAGEFAULT_RDONLY))
> > +		perm |= ODP_WRITE_ALLOWED_BIT;
> > +
> > +	/*
> > +	 * A successful return from rxe_odp_do_pagefault() does not guarantee
> > +	 * that all pages in the range became present. Recheck the DMA address
> > +	 * array, allowing max 3 tries for pagefault.
> > +	 */
> > +	while ((need_fault = rxe_is_pagefault_neccesary(umem_odp,
> > +							iova, length, perm))) {
> > +		if (cnt >= max_tries)
> > +			break;
> 
> I don't think this makes sense..

I have posted the new implementation, but it is somewhat different from
your suggestion, for the reasons below.

> 
> You need to make this work more like mlx5 does, you take the spinlock
> on the xarray, you search it for your index and whatever is there
> tells what to do. Hold the spinlock while doing the copy. This is
> enough locking for the fast path.

This lock has to be the umem mutex. The actual page-out is executed by the
kernel after rxe_ib_invalidate_range() is done, so a spinlock taken while
the invalidation is running may not block that flow. The umem mutex is what
serializes these accesses.

> 
> If there is no index present, or it is not writable and you need
> write, then you unlock the spinlock and prefetch the missing entry and
> try again, this time also holding the mutex so there isn't a race.
> 
> You shouldn't probe into parts of the umem to discover information
> already stored in the xarray then do the same lookup into the xarray.
> 
> IIRC this also needs to keep track in the xarray on a per page basis
> if the page is writable.

An xarray entry can hold a pointer or a value from 0 to LONG_MAX. That is
not enough to store both a page address and its permission.

If we try to do everything with the xarray, we would need to allocate a new
struct for each page, holding a pointer to the page and a value for the r/w
permission. That is inefficient in terms of both memory usage and
implementation.

I think the xarray can be used to check the presence of pages just like we
have been doing in the non-ODP case. On the other hand, the permission
should be fetched from umem_odp->pfn_list, which is updated every time a
page fault is handled.

Thanks,
Daisuke

> 
> Jason
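
As an aside, a minimal sketch of the pfn_list-based check described above,
assuming the HMM_PFN_VALID/HMM_PFN_WRITE flag bits that hmm_range_fault()
stores in umem_odp->pfn_list; rxe_odp_page_ok() is an illustrative name, not
an existing rxe function:

/* Sketch only: report whether the page backing 'addr' is present in the MR,
 * and writable when write access is required. pfn_list is refreshed by the
 * page-fault path, so this must be called with umem_mutex held.
 */
static bool rxe_odp_page_ok(struct ib_umem_odp *umem_odp, u64 addr,
                            bool need_write)
{
        unsigned long idx =
                (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
        unsigned long pfn = umem_odp->pfn_list[idx];

        if (!(pfn & HMM_PFN_VALID))
                return false;           /* page not mapped yet */
        if (need_write && !(pfn & HMM_PFN_WRITE))
                return false;           /* mapped read-only */

        return true;
}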
  

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index f2284d27229b..207a022156f0 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -79,6 +79,16 @@  static void rxe_init_device_param(struct rxe_dev *rxe)
 
 		/* IB_ODP_SUPPORT_IMPLICIT is not supported right now. */
 		rxe->attr.odp_caps.general_caps |= IB_ODP_SUPPORT;
+
+		rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;
+		rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_RECV;
+		rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
+
+		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;
+		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;
+		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;
+		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;
+		rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
 	}
 }
 
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
index 93247d123642..4b95c8c46bdc 100644
--- a/drivers/infiniband/sw/rxe/rxe_loc.h
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -206,6 +206,8 @@  static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp)
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
 			 u64 iova, int access_flags, struct rxe_mr *mr);
+int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
+		    enum rxe_mr_copy_dir dir);
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 static inline int
 rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
@@ -213,6 +215,12 @@  rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
 {
 	return -EOPNOTSUPP;
 }
+static inline int
+rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
+		int length, enum rxe_mr_copy_dir dir)
+{
+	return -EOPNOTSUPP;
+}
 
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index cd368cd096c8..0e3cda59d702 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -319,7 +319,7 @@  int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
 	}
 
 	if (mr->odp_enabled)
-		return -EOPNOTSUPP;
+		return rxe_odp_mr_copy(mr, iova, addr, length, dir);
 	else
 		return rxe_mr_copy_xarray(mr, iova, addr, length, dir);
 }
diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
index e5497d09c399..cbe5d0c3fcc4 100644
--- a/drivers/infiniband/sw/rxe/rxe_odp.c
+++ b/drivers/infiniband/sw/rxe/rxe_odp.c
@@ -174,3 +174,112 @@  int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
 
 	return err;
 }
+
+static inline bool rxe_is_pagefault_neccesary(struct ib_umem_odp *umem_odp,
+					      u64 iova, int length, u32 perm)
+{
+	int idx;
+	u64 addr;
+	bool need_fault = false;
+
+	addr = iova & (~(BIT(umem_odp->page_shift) - 1));
+
+	/* Skim through all pages that are to be accessed. */
+	while (addr < iova + length) {
+		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
+
+		if (!(umem_odp->dma_list[idx] & perm)) {
+			need_fault = true;
+			break;
+		}
+
+		addr += BIT(umem_odp->page_shift);
+	}
+	return need_fault;
+}
+
+/* umem mutex must be locked before entering this function. */
+static int rxe_odp_map_range(struct rxe_mr *mr, u64 iova, int length, u32 flags)
+{
+	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
+	const int max_tries = 3;
+	int cnt = 0;
+
+	int err;
+	u64 perm;
+	bool need_fault;
+
+	if (unlikely(length < 1)) {
+		mutex_unlock(&umem_odp->umem_mutex);
+		return -EINVAL;
+	}
+
+	perm = ODP_READ_ALLOWED_BIT;
+	if (!(flags & RXE_PAGEFAULT_RDONLY))
+		perm |= ODP_WRITE_ALLOWED_BIT;
+
+	/*
+	 * A successful return from rxe_odp_do_pagefault() does not guarantee
+	 * that all pages in the range became present. Recheck the DMA address
+	 * array, allowing max 3 tries for pagefault.
+	 */
+	while ((need_fault = rxe_is_pagefault_neccesary(umem_odp,
+							iova, length, perm))) {
+		if (cnt >= max_tries)
+			break;
+
+		mutex_unlock(&umem_odp->umem_mutex);
+
+		/* umem_mutex is locked on success. */
+		err = rxe_odp_do_pagefault(mr, iova, length, flags);
+		if (err < 0)
+			return err;
+
+		cnt++;
+	}
+
+	if (need_fault)
+		return -EFAULT;
+
+	return 0;
+}
+
+int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
+		    enum rxe_mr_copy_dir dir)
+{
+	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
+	u32 flags = 0;
+	int err;
+
+	if (unlikely(!mr->odp_enabled))
+		return -EOPNOTSUPP;
+
+	switch (dir) {
+	case RXE_TO_MR_OBJ:
+		break;
+
+	case RXE_FROM_MR_OBJ:
+		flags = RXE_PAGEFAULT_RDONLY;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	/* If pagefault is not required, umem mutex will be held until data
+	 * copy to the MR completes. Otherwise, it is released and locked
+	 * again in rxe_odp_map_range() to let invalidation handler do its
+	 * work meanwhile.
+	 */
+	mutex_lock(&umem_odp->umem_mutex);
+
+	err = rxe_odp_map_range(mr, iova, length, flags);
+	if (err)
+		return err;
+
+	err =  rxe_mr_copy_xarray(mr, iova, addr, length, dir);
+
+	mutex_unlock(&umem_odp->umem_mutex);
+
+	return err;
+}
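
For reference, a minimal libibverbs sketch (not part of the patch) showing how
an application could check the capability bits advertised by the rxe.c hunk
above and register an ODP-backed MR; reg_odp_mr() is an illustrative helper,
the rest is standard libibverbs API:

/* Sketch only: query the RC ODP caps and register an on-demand-paging MR. */
#include <infiniband/verbs.h>
#include <stdio.h>

static struct ibv_mr *reg_odp_mr(struct ibv_context *ctx, struct ibv_pd *pd,
                                 void *buf, size_t len)
{
        struct ibv_device_attr_ex attr = {};

        if (ibv_query_device_ex(ctx, NULL, &attr))
                return NULL;

        if (!(attr.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
            !(attr.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_WRITE)) {
                fprintf(stderr, "RC ODP Write is not supported on this device\n");
                return NULL;
        }

        /* IBV_ACCESS_ON_DEMAND makes the MR ODP-backed: pages are faulted in
         * on first access instead of being pinned at registration time.
         */
        return ibv_reg_mr(pd, buf, len,
                          IBV_ACCESS_ON_DEMAND | IBV_ACCESS_LOCAL_WRITE |
                          IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE);
}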