[RFC,V2,6/6] RDMA/rxe: Support PAGE_SIZE aligned MR

Message ID 20231103095549.490744-7-lizhijian@fujitsu.com
State New
Series rxe_map_mr_sg() fix cleanup and refactor

Commit Message

Zhijian Li (Fujitsu) Nov. 3, 2023, 9:55 a.m. UTC
  In order to support PAGE_SIZE aligned MRs, rxe_map_mr_sg() should be able
to split a large buffer into N PAGE_SIZE-sized entries stored in the xarray
page_list.

Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
---
 drivers/infiniband/sw/rxe/rxe_mr.c | 39 +++++++++++++++++++++++++-----
 1 file changed, 33 insertions(+), 6 deletions(-)
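For context, a minimal, hypothetical caller-side sketch of what the patch allows: mapping an SG list into a fast-reg MR with a page size that is PAGE_SIZE aligned but larger than PAGE_SIZE. The function name and the 2 * PAGE_SIZE value are illustrative assumptions, not part of the patch:

#include <linux/scatterlist.h>
#include <rdma/ib_verbs.h>

/* Hypothetical ULP-side sketch: before this series, rxe rejected any
 * ibmr->page_size other than PAGE_SIZE; with it, any PAGE_SIZE aligned
 * page size is accepted.
 */
static int example_map_mr(struct ib_pd *pd, struct scatterlist *sgl,
			  int sg_nents)
{
	struct ib_mr *mr;
	int n;

	mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, sg_nents);
	if (IS_ERR(mr))
		return PTR_ERR(mr);

	/* page size is a PAGE_SIZE aligned multiple, chosen for illustration */
	n = ib_map_mr_sg(mr, sgl, sg_nents, NULL, 2 * PAGE_SIZE);
	if (n != sg_nents) {
		ib_dereg_mr(mr);
		return n < 0 ? n : -EINVAL;
	}

	/* ... post IB_WR_REG_MR with mr, use it, then ib_dereg_mr(mr) ... */
	return 0;
}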
  

Comments

Bart Van Assche Nov. 3, 2023, 3:04 p.m. UTC | #1
On 11/3/23 02:55, Li Zhijian wrote:
> -	return ib_sg_to_pages(ibmr, sgl, sg_nents, sg_offset, rxe_set_page);
> +	for_each_sg(sgl, sg, sg_nents, i) {
> +		u64 dma_addr = sg_dma_address(sg) + sg_offset;
> +		unsigned int dma_len = sg_dma_len(sg) - sg_offset;
> +		u64 end_dma_addr = dma_addr + dma_len;
> +		u64 page_addr = dma_addr & PAGE_MASK;
> +
> +		if (sg_dma_len(sg) == 0) {
> +			rxe_dbg_mr(mr, "empty SGE\n");
> +			return -EINVAL;
> +		}
> +		do {
> +			int ret = rxe_store_page(mr, page_addr);
> +			if (ret)
> +				return ret;
> +
> +			page_addr += PAGE_SIZE;
> +		} while (page_addr < end_dma_addr);
> +		sg_offset = 0;
> +	}
> +
> +	return ib_sg_to_pages(ibmr, sgl, sg_nents, sg_offset_p, rxe_set_page);
>   }

Is this change necessary? There is already a loop in ib_sg_to_pages()
that splits SG entries that are larger than mr->page_size into entries
with size mr->page_size.

Bart.
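For reference, the splitting Bart refers to is the per-SGE loop inside ib_sg_to_pages() (drivers/infiniband/core/verbs.c). A simplified sketch, with the offset/length and descriptor-count bookkeeping of the real code omitted:

#include <linux/scatterlist.h>
#include <rdma/ib_verbs.h>

/* Simplified sketch of the splitting done by ib_sg_to_pages();
 * not the exact upstream code.
 */
static int sketch_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl,
			      int sg_nents, unsigned int sg_offset,
			      int (*set_page)(struct ib_mr *, u64))
{
	struct scatterlist *sg;
	int i, ret;

	for_each_sg(sgl, sg, sg_nents, i) {
		u64 dma_addr = sg_dma_address(sg) + sg_offset;
		u64 end_dma_addr = dma_addr + sg_dma_len(sg) - sg_offset;
		u64 page_addr = dma_addr & ~((u64)mr->page_size - 1);

		do {
			/* one callback per mr->page_size chunk, not per PAGE_SIZE */
			ret = set_page(mr, page_addr);
			if (ret)
				return ret;
			page_addr += mr->page_size;
		} while (page_addr < end_dma_addr);

		sg_offset = 0;
	}

	return 0;
}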
  
Zhijian Li (Fujitsu) Nov. 6, 2023, 3:07 a.m. UTC | #2
On 03/11/2023 23:04, Bart Van Assche wrote:
> 
> On 11/3/23 02:55, Li Zhijian wrote:
>> -    return ib_sg_to_pages(ibmr, sgl, sg_nents, sg_offset, rxe_set_page);
>> +    for_each_sg(sgl, sg, sg_nents, i) {
>> +        u64 dma_addr = sg_dma_address(sg) + sg_offset;
>> +        unsigned int dma_len = sg_dma_len(sg) - sg_offset;
>> +        u64 end_dma_addr = dma_addr + dma_len;
>> +        u64 page_addr = dma_addr & PAGE_MASK;
>> +
>> +        if (sg_dma_len(sg) == 0) {
>> +            rxe_dbg_mr(mr, "empty SGE\n");
>> +            return -EINVAL;
>> +        }
>> +        do {
>> +            int ret = rxe_store_page(mr, page_addr);
>> +            if (ret)
>> +                return ret;
>> +
>> +            page_addr += PAGE_SIZE;
>> +        } while (page_addr < end_dma_addr);
>> +        sg_offset = 0;
>> +    }
>> +
>> +    return ib_sg_to_pages(ibmr, sgl, sg_nents, sg_offset_p, rxe_set_page);
>>   }
> 
> Is this change necessary? There is already a loop in ib_sg_to_pages()
> that splits SG entries that are larger than mr->page_size into entries
> with size mr->page_size.

I see.

My thought was that we can only safely access the PAGE_SIZE memory range [page_va, page_va + PAGE_SIZE)
returned by kmap_local_page(page).
However, when mr->page_size is larger than PAGE_SIZE, we may end up accessing the following pages without mapping them.

Thanks
Zhijian
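To illustrate the constraint Zhijian describes: kmap_local_page() maps exactly one PAGE_SIZE page, so any access through the returned pointer must stay within that page. A minimal sketch of a copy done under such a mapping (the helper name is made up; this is not the actual rxe_mr_copy_xarray() code):

#include <linux/highmem.h>
#include <linux/string.h>

/* kmap_local_page() maps a single PAGE_SIZE page; accesses through va are
 * only valid in [va, va + PAGE_SIZE). If the xarray entries stood for
 * buffers of mr->page_size > PAGE_SIZE, a page_size-sized copy would run
 * past the mapping.
 */
static void copy_one_mapped_page(struct page *page, unsigned int offset,
				 void *dst, unsigned int len)
{
	void *va = kmap_local_page(page);

	/* caller must guarantee offset + len <= PAGE_SIZE */
	memcpy(dst, va + offset, len);
	kunmap_local(va);
}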
  

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index a038133e1322..3761740af986 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -193,9 +193,8 @@  int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr)
 	return err;
 }
 
-static int rxe_set_page(struct ib_mr *ibmr, u64 dma_addr)
+static int rxe_store_page(struct rxe_mr *mr, u64 dma_addr)
 {
-	struct rxe_mr *mr = to_rmr(ibmr);
 	struct page *page = ib_virt_dma_to_page(dma_addr);
 	bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT);
 	int err;
@@ -216,20 +215,48 @@  static int rxe_set_page(struct ib_mr *ibmr, u64 dma_addr)
 	return 0;
 }
 
+static int rxe_set_page(struct ib_mr *base_mr, u64 buf_addr)
+{
+	return 0;
+}
+
 int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sgl,
-		  int sg_nents, unsigned int *sg_offset)
+		  int sg_nents, unsigned int *sg_offset_p)
 {
 	struct rxe_mr *mr = to_rmr(ibmr);
+	struct scatterlist *sg;
+	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
+	int i;
 
-	if (ibmr->page_size != PAGE_SIZE) {
-		rxe_err_mr(mr, "Unsupport mr page size %x, expect PAGE_SIZE(%lx)\n",
+	if (!IS_ALIGNED(ibmr->page_size, PAGE_SIZE)) {
+		rxe_err_mr(mr, "Misaligned page size %x, expect PAGE_SIZE(%lx) aligned\n",
 			   ibmr->page_size, PAGE_SIZE);
 		return -EINVAL;
 	}
 
 	mr->nbuf = 0;
 
-	return ib_sg_to_pages(ibmr, sgl, sg_nents, sg_offset, rxe_set_page);
+	for_each_sg(sgl, sg, sg_nents, i) {
+		u64 dma_addr = sg_dma_address(sg) + sg_offset;
+		unsigned int dma_len = sg_dma_len(sg) - sg_offset;
+		u64 end_dma_addr = dma_addr + dma_len;
+		u64 page_addr = dma_addr & PAGE_MASK;
+
+		if (sg_dma_len(sg) == 0) {
+			rxe_dbg_mr(mr, "empty SGE\n");
+			return -EINVAL;
+		}
+		do {
+			int ret = rxe_store_page(mr, page_addr);
+			if (ret)
+				return ret;
+
+			page_addr += PAGE_SIZE;
+		} while (page_addr < end_dma_addr);
+		sg_offset = 0;
+	}
+
+	return ib_sg_to_pages(ibmr, sgl, sg_nents, sg_offset_p, rxe_set_page);
 }
 
 static int rxe_mr_copy_xarray(struct rxe_mr *mr, u64 iova, void *addr,