[drm-next,v2,05/16] drm: manager to keep track of GPUs VA mappings

Message ID 20230217134422.14116-6-dakr@redhat.com
State New
Headers
Series DRM GPUVA Manager & Nouveau VM_BIND UAPI |

Commit Message

Danilo Krummrich Feb. 17, 2023, 1:44 p.m. UTC
  Add infrastructure to keep track of GPU virtual address (VA) mappings
with a dedicated VA space manager implementation.

New UAPIs, motivated by Vulkan sparse memory bindings graphics drivers
start implementing, allow userspace applications to request multiple and
arbitrary GPU VA mappings of buffer objects. The DRM GPU VA manager is
intended to serve the following purposes in this context.

1) Provide infrastructure to track GPU VA allocations and mappings,
   making use of the maple_tree.

2) Generically connect GPU VA mappings to their backing buffers, in
   particular DRM GEM objects.

3) Provide a common implementation to perform more complex mapping
   operations on the GPU VA space. In particular splitting and merging
   of GPU VA mappings, e.g. for intersecting mapping requests or partial
   unmap requests.

Suggested-by: Dave Airlie <airlied@redhat.com>
Signed-off-by: Danilo Krummrich <dakr@redhat.com>
---
 Documentation/gpu/drm-mm.rst    |   31 +
 drivers/gpu/drm/Makefile        |    1 +
 drivers/gpu/drm/drm_gem.c       |    3 +
 drivers/gpu/drm/drm_gpuva_mgr.c | 1704 +++++++++++++++++++++++++++++++
 include/drm/drm_drv.h           |    6 +
 include/drm/drm_gem.h           |   75 ++
 include/drm/drm_gpuva_mgr.h     |  714 +++++++++++++
 7 files changed, 2534 insertions(+)
 create mode 100644 drivers/gpu/drm/drm_gpuva_mgr.c
 create mode 100644 include/drm/drm_gpuva_mgr.h
  

Comments

kernel test robot Feb. 18, 2023, 1:05 a.m. UTC | #1
Hi Danilo,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on 48075a66fca613477ac1969b576a93ef5db0164f]

url:    https://github.com/intel-lab-lkp/linux/commits/Danilo-Krummrich/drm-execution-context-for-GEM-buffers/20230217-215101
base:   48075a66fca613477ac1969b576a93ef5db0164f
patch link:    https://lore.kernel.org/r/20230217134422.14116-6-dakr%40redhat.com
patch subject: [PATCH drm-next v2 05/16] drm: manager to keep track of GPUs VA mappings
config: mips-allyesconfig (https://download.01.org/0day-ci/archive/20230218/202302180805.b0ab40V5-lkp@intel.com/config)
compiler: mips-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/00132cc92b6745cfd51c0d5df4c246a848f2ceaa
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Danilo-Krummrich/drm-execution-context-for-GEM-buffers/20230217-215101
        git checkout 00132cc92b6745cfd51c0d5df4c246a848f2ceaa
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=mips olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=mips SHELL=/bin/bash drivers/gpu/drm/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202302180805.b0ab40V5-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> drivers/gpu/drm/drm_gpuva_mgr.c:1383:5: warning: no previous prototype for 'drm_gpuva_sm_step' [-Wmissing-prototypes]
    1383 | int drm_gpuva_sm_step(struct drm_gpuva_op *__op, void *priv)
         |     ^~~~~~~~~~~~~~~~~
--
>> drivers/gpu/drm/drm_gpuva_mgr.c:529: warning: expecting prototype for drm_gpuva_remove_iter(). Prototype was for drm_gpuva_iter_remove() instead
   drivers/gpu/drm/drm_gpuva_mgr.c:549: warning: Excess function parameter 'addr' description in 'drm_gpuva_insert'
   drivers/gpu/drm/drm_gpuva_mgr.c:549: warning: Excess function parameter 'range' description in 'drm_gpuva_insert'
   drivers/gpu/drm/drm_gpuva_mgr.c:765: warning: Excess function parameter 'addr' description in 'drm_gpuva_region_insert'
   drivers/gpu/drm/drm_gpuva_mgr.c:765: warning: Excess function parameter 'range' description in 'drm_gpuva_region_insert'
   drivers/gpu/drm/drm_gpuva_mgr.c:1345: warning: Excess function parameter 'ops' description in 'drm_gpuva_sm_unmap'
   drivers/gpu/drm/drm_gpuva_mgr.c:1589: warning: Function parameter or member 'addr' not described in 'drm_gpuva_prefetch_ops_create'
   drivers/gpu/drm/drm_gpuva_mgr.c:1589: warning: Function parameter or member 'range' not described in 'drm_gpuva_prefetch_ops_create'
   drivers/gpu/drm/drm_gpuva_mgr.c:1589: warning: Excess function parameter 'req_addr' description in 'drm_gpuva_prefetch_ops_create'
   drivers/gpu/drm/drm_gpuva_mgr.c:1589: warning: Excess function parameter 'req_range' description in 'drm_gpuva_prefetch_ops_create'


vim +/drm_gpuva_sm_step +1383 drivers/gpu/drm/drm_gpuva_mgr.c

  1382	
> 1383	int drm_gpuva_sm_step(struct drm_gpuva_op *__op, void *priv)
  1384	{
  1385		struct {
  1386			struct drm_gpuva_manager *mgr;
  1387			struct drm_gpuva_ops *ops;
  1388		} *args = priv;
  1389		struct drm_gpuva_manager *mgr = args->mgr;
  1390		struct drm_gpuva_ops *ops = args->ops;
  1391		struct drm_gpuva_op *op;
  1392	
  1393		op = gpuva_op_alloc(mgr);
  1394		if (unlikely(!op))
  1395			goto err;
  1396	
  1397		memcpy(op, __op, sizeof(*op));
  1398	
  1399		if (op->op == DRM_GPUVA_OP_REMAP) {
  1400			struct drm_gpuva_op_remap *__r = &__op->remap;
  1401			struct drm_gpuva_op_remap *r = &op->remap;
  1402	
  1403			r->unmap = kmemdup(__r->unmap, sizeof(*r->unmap),
  1404					   GFP_KERNEL);
  1405			if (unlikely(!r->unmap))
  1406				goto err_free_op;
  1407	
  1408			if (__r->prev) {
  1409				r->prev = kmemdup(__r->prev, sizeof(*r->prev),
  1410						  GFP_KERNEL);
  1411				if (unlikely(!r->prev))
  1412					goto err_free_unmap;
  1413			}
  1414	
  1415			if (__r->next) {
  1416				r->next = kmemdup(__r->next, sizeof(*r->next),
  1417						  GFP_KERNEL);
  1418				if (unlikely(!r->next))
  1419					goto err_free_prev;
  1420			}
  1421		}
  1422	
  1423		list_add_tail(&op->entry, &ops->list);
  1424	
  1425		return 0;
  1426	
  1427	err_free_unmap:
  1428		kfree(op->remap.unmap);
  1429	err_free_prev:
  1430		kfree(op->remap.prev);
  1431	err_free_op:
  1432		gpuva_op_free(mgr, op);
  1433	err:
  1434		return -ENOMEM;
  1435	}
  1436
  
Liam R. Howlett Feb. 21, 2023, 6:20 p.m. UTC | #2
* Danilo Krummrich <dakr@redhat.com> [230217 08:45]:
> Add infrastructure to keep track of GPU virtual address (VA) mappings
> with a dedicated VA space manager implementation.
> 
> New UAPIs, motivated by Vulkan sparse memory bindings graphics drivers
> start implementing, allow userspace applications to request multiple and
> arbitrary GPU VA mappings of buffer objects. The DRM GPU VA manager is
> intended to serve the following purposes in this context.
> 
> 1) Provide infrastructure to track GPU VA allocations and mappings,
>    making use of the maple_tree.
> 
> 2) Generically connect GPU VA mappings to their backing buffers, in
>    particular DRM GEM objects.
> 
> 3) Provide a common implementation to perform more complex mapping
>    operations on the GPU VA space. In particular splitting and merging
>    of GPU VA mappings, e.g. for intersecting mapping requests or partial
>    unmap requests.
> 
> Suggested-by: Dave Airlie <airlied@redhat.com>
> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
> ---
>  Documentation/gpu/drm-mm.rst    |   31 +
>  drivers/gpu/drm/Makefile        |    1 +
>  drivers/gpu/drm/drm_gem.c       |    3 +
>  drivers/gpu/drm/drm_gpuva_mgr.c | 1704 +++++++++++++++++++++++++++++++
>  include/drm/drm_drv.h           |    6 +
>  include/drm/drm_gem.h           |   75 ++
>  include/drm/drm_gpuva_mgr.h     |  714 +++++++++++++
>  7 files changed, 2534 insertions(+)
>  create mode 100644 drivers/gpu/drm/drm_gpuva_mgr.c
>  create mode 100644 include/drm/drm_gpuva_mgr.h
> 
> diff --git a/Documentation/gpu/drm-mm.rst b/Documentation/gpu/drm-mm.rst
> index a52e6f4117d6..c9f120cfe730 100644
> --- a/Documentation/gpu/drm-mm.rst
> +++ b/Documentation/gpu/drm-mm.rst
> @@ -466,6 +466,37 @@ DRM MM Range Allocator Function References
>  .. kernel-doc:: drivers/gpu/drm/drm_mm.c
>     :export:
>  
...

> +
> +/**
> + * drm_gpuva_remove_iter - removes the iterators current element
> + * @it: the &drm_gpuva_iterator
> + *
> + * This removes the element the iterator currently points to.
> + */
> +void
> +drm_gpuva_iter_remove(struct drm_gpuva_iterator *it)
> +{
> +	mas_erase(&it->mas);
> +}
> +EXPORT_SYMBOL(drm_gpuva_iter_remove);
> +
> +/**
> + * drm_gpuva_insert - insert a &drm_gpuva
> + * @mgr: the &drm_gpuva_manager to insert the &drm_gpuva in
> + * @va: the &drm_gpuva to insert
> + * @addr: the start address of the GPU VA
> + * @range: the range of the GPU VA
> + *
> + * Insert a &drm_gpuva with a given address and range into a
> + * &drm_gpuva_manager.
> + *
> + * Returns: 0 on success, negative error code on failure.
> + */
> +int
> +drm_gpuva_insert(struct drm_gpuva_manager *mgr,
> +		 struct drm_gpuva *va)
> +{
> +	u64 addr = va->va.addr;
> +	u64 range = va->va.range;
> +	MA_STATE(mas, &mgr->va_mt, addr, addr + range - 1);
> +	struct drm_gpuva_region *reg = NULL;
> +	int ret;
> +
> +	if (unlikely(!drm_gpuva_in_mm_range(mgr, addr, range)))
> +		return -EINVAL;
> +
> +	if (unlikely(drm_gpuva_in_kernel_region(mgr, addr, range)))
> +		return -EINVAL;
> +
> +	if (mgr->flags & DRM_GPUVA_MANAGER_REGIONS) {
> +		reg = drm_gpuva_in_region(mgr, addr, range);
> +		if (unlikely(!reg))
> +			return -EINVAL;
> +	}
> +

-----

> +	if (unlikely(drm_gpuva_find_first(mgr, addr, range)))
> +		return -EEXIST;
> +
> +	ret = mas_store_gfp(&mas, va, GFP_KERNEL);

mas_walk() will set the internal maple state to the limits of what it
finds.  So, instead of an iterator, you can use the walk function and
ensure there is a large enough area in the existing NULL entry:

/*
 * Nothing at addr, mas now points to the location where the store would
 * happen
 */
if (mas_walk(&mas))
	return -EEXIST;

/* The NULL entry ends at mas.last, make sure there is room */
if (mas.last < (addr + range - 1))
	return -EEXIST;

/* Limit the store size to the correct end address, and store */
 mas.last = addr + range - 1;
 ret = mas_store_gfp(&mas, va, GFP_KERNEL);

> +	if (unlikely(ret))
> +		return ret;
> +
> +	va->mgr = mgr;
> +	va->region = reg;
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL(drm_gpuva_insert);
> +
> +/**
> + * drm_gpuva_remove - remove a &drm_gpuva
> + * @va: the &drm_gpuva to remove
> + *
> + * This removes the given &va from the underlying tree.
> + */
> +void
> +drm_gpuva_remove(struct drm_gpuva *va)
> +{
> +	MA_STATE(mas, &va->mgr->va_mt, va->va.addr, 0);
> +
> +	mas_erase(&mas);
> +}
> +EXPORT_SYMBOL(drm_gpuva_remove);
> +
...

> +/**
> + * drm_gpuva_find_first - find the first &drm_gpuva in the given range
> + * @mgr: the &drm_gpuva_manager to search in
> + * @addr: the &drm_gpuvas address
> + * @range: the &drm_gpuvas range
> + *
> + * Returns: the first &drm_gpuva within the given range
> + */
> +struct drm_gpuva *
> +drm_gpuva_find_first(struct drm_gpuva_manager *mgr,
> +		     u64 addr, u64 range)
> +{
> +	MA_STATE(mas, &mgr->va_mt, addr, 0);
> +
> +	return mas_find(&mas, addr + range - 1);
> +}
> +EXPORT_SYMBOL(drm_gpuva_find_first);
> +
> +/**
> + * drm_gpuva_find - find a &drm_gpuva
> + * @mgr: the &drm_gpuva_manager to search in
> + * @addr: the &drm_gpuvas address
> + * @range: the &drm_gpuvas range
> + *
> + * Returns: the &drm_gpuva at a given &addr and with a given &range

Note that mas_find() will continue upwards in the address space if there
isn't anything at @addr.  This means that &drm_gpuva may not be at
&addr.  If you want to check just at &addr, use mas_walk().

> + */
> +struct drm_gpuva *
> +drm_gpuva_find(struct drm_gpuva_manager *mgr,
> +	       u64 addr, u64 range)
> +{
> +	struct drm_gpuva *va;
> +
> +	va = drm_gpuva_find_first(mgr, addr, range);
> +	if (!va)
> +		goto out;
> +
> +	if (va->va.range != range)
> +		goto out;
> +
> +	return va;
> +
> +out:
> +	return NULL;
> +}
> +EXPORT_SYMBOL(drm_gpuva_find);
> +
> +/**
> + * drm_gpuva_find_prev - find the &drm_gpuva before the given address
> + * @mgr: the &drm_gpuva_manager to search in
> + * @start: the given GPU VA's start address
> + *
> + * Find the adjacent &drm_gpuva before the GPU VA with given &start address.
> + *
> + * Note that if there is any free space between the GPU VA mappings no mapping
> + * is returned.
> + *
> + * Returns: a pointer to the found &drm_gpuva or NULL if none was found
> + */
> +struct drm_gpuva *
> +drm_gpuva_find_prev(struct drm_gpuva_manager *mgr, u64 start)

find_prev() usually continues beyond 1 less than the address.  I found
this name confusing.  You may as well use mas_walk(), it would be
faster.

> +{
> +	MA_STATE(mas, &mgr->va_mt, start, 0);
> +
> +	if (start <= mgr->mm_start ||
> +	    start > (mgr->mm_start + mgr->mm_range))
> +		return NULL;
> +
> +	return mas_prev(&mas, start - 1);
> +}
> +EXPORT_SYMBOL(drm_gpuva_find_prev);
> +
> +/**
> + * drm_gpuva_find_next - find the &drm_gpuva after the given address
> + * @mgr: the &drm_gpuva_manager to search in
> + * @end: the given GPU VA's end address
> + *
> + * Find the adjacent &drm_gpuva after the GPU VA with given &end address.
> + *
> + * Note that if there is any free space between the GPU VA mappings no mapping
> + * is returned.
> + *
> + * Returns: a pointer to the found &drm_gpuva or NULL if none was found
> + */
> +struct drm_gpuva *
> +drm_gpuva_find_next(struct drm_gpuva_manager *mgr, u64 end)

This name is also a bit confusing for the same reason.  Again, it seems
worth just walking to end here.

> +{
> +	MA_STATE(mas, &mgr->va_mt, end - 1, 0);
> +
> +	if (end < mgr->mm_start ||
> +	    end >= (mgr->mm_start + mgr->mm_range))
> +		return NULL;
> +
> +	return mas_next(&mas, end);
> +}
> +EXPORT_SYMBOL(drm_gpuva_find_next);
> +
> +static int
> +__drm_gpuva_region_insert(struct drm_gpuva_manager *mgr,
> +			  struct drm_gpuva_region *reg)
> +{
> +	u64 addr = reg->va.addr;
> +	u64 range = reg->va.range;
> +	MA_STATE(mas, &mgr->region_mt, addr, addr + range - 1);
> +	int ret;
> +
> +	if (unlikely(!drm_gpuva_in_mm_range(mgr, addr, range)))
> +		return -EINVAL;
> +
> +	ret = mas_store_gfp(&mas, reg, GFP_KERNEL);
> +	if (unlikely(ret))
> +		return ret;
> +
> +	reg->mgr = mgr;
> +
> +	return 0;
> +}
> +
> +/**
> + * drm_gpuva_region_insert - insert a &drm_gpuva_region
> + * @mgr: the &drm_gpuva_manager to insert the &drm_gpuva in
> + * @reg: the &drm_gpuva_region to insert
> + * @addr: the start address of the GPU VA
> + * @range: the range of the GPU VA
> + *
> + * Insert a &drm_gpuva_region with a given address and range into a
> + * &drm_gpuva_manager.
> + *
> + * Returns: 0 on success, negative error code on failure.
> + */
> +int
> +drm_gpuva_region_insert(struct drm_gpuva_manager *mgr,
> +			struct drm_gpuva_region *reg)
> +{
> +	if (unlikely(!(mgr->flags & DRM_GPUVA_MANAGER_REGIONS)))
> +		return -EINVAL;
> +
> +	return __drm_gpuva_region_insert(mgr, reg);
> +}
> +EXPORT_SYMBOL(drm_gpuva_region_insert);
> +
> +static void
> +__drm_gpuva_region_remove(struct drm_gpuva_region *reg)
> +{
> +	struct drm_gpuva_manager *mgr = reg->mgr;
> +	MA_STATE(mas, &mgr->region_mt, reg->va.addr, 0);
> +
> +	mas_erase(&mas);
> +}
> +
> +/**
> + * drm_gpuva_region_remove - remove a &drm_gpuva_region
> + * @reg: the &drm_gpuva to remove
> + *
> + * This removes the given &reg from the underlying tree.
> + */
> +void
> +drm_gpuva_region_remove(struct drm_gpuva_region *reg)
> +{
> +	struct drm_gpuva_manager *mgr = reg->mgr;
> +
> +	if (unlikely(!(mgr->flags & DRM_GPUVA_MANAGER_REGIONS)))
> +		return;
> +
> +	if (unlikely(reg == &mgr->kernel_alloc_region)) {
> +		WARN(1, "Can't destroy kernel reserved region.\n");
> +		return;
> +	}
> +
> +	if (unlikely(!drm_gpuva_region_empty(reg)))
> +		WARN(1, "GPU VA region should be empty on destroy.\n");
> +
> +	__drm_gpuva_region_remove(reg);
> +}
> +EXPORT_SYMBOL(drm_gpuva_region_remove);
> +
> +/**
> + * drm_gpuva_region_empty - indicate whether a &drm_gpuva_region is empty
> + * @reg: the &drm_gpuva to destroy
> + *
> + * Returns: true if the &drm_gpuva_region is empty, false otherwise
> + */
> +bool
> +drm_gpuva_region_empty(struct drm_gpuva_region *reg)
> +{
> +	DRM_GPUVA_ITER(it, reg->mgr);
> +
> +	drm_gpuva_iter_for_each_range(it, reg->va.addr,
> +				      reg->va.addr +
> +				      reg->va.range)
> +		return false;
> +
> +	return true;
> +}
> +EXPORT_SYMBOL(drm_gpuva_region_empty);
> +
> +/**
> + * drm_gpuva_region_find_first - find the first &drm_gpuva_region in the given
> + * range
> + * @mgr: the &drm_gpuva_manager to search in
> + * @addr: the &drm_gpuva_regions address
> + * @range: the &drm_gpuva_regions range
> + *
> + * Returns: the first &drm_gpuva_region within the given range
> + */
> +struct drm_gpuva_region *
> +drm_gpuva_region_find_first(struct drm_gpuva_manager *mgr,
> +			    u64 addr, u64 range)
> +{
> +	MA_STATE(mas, &mgr->region_mt, addr, 0);
> +
> +	return mas_find(&mas, addr + range - 1);
> +}
> +EXPORT_SYMBOL(drm_gpuva_region_find_first);
> +
> +/**
> + * drm_gpuva_region_find - find a &drm_gpuva_region
> + * @mgr: the &drm_gpuva_manager to search in
> + * @addr: the &drm_gpuva_regions address
> + * @range: the &drm_gpuva_regions range
> + *
> + * Returns: the &drm_gpuva_region at a given &addr and with a given &range

Again, I'm not sure whether you want find-first or walk semantics here.
It sounds like you want the VMA spanning exactly addr to addr + range?

> + */
> +struct drm_gpuva_region *
> +drm_gpuva_region_find(struct drm_gpuva_manager *mgr,
> +		      u64 addr, u64 range)
> +{
> +	struct drm_gpuva_region *reg;
> +
> +	reg = drm_gpuva_region_find_first(mgr, addr, range);
> +	if (!reg)
> +		goto out;
> +
> +	if (reg->va.range != range)
> +		goto out;
> +
> +	return reg;
> +
> +out:
> +	return NULL;
> +}
> +EXPORT_SYMBOL(drm_gpuva_region_find);
> +

...
  
Christian König Feb. 22, 2023, 10:25 a.m. UTC | #3
Am 17.02.23 um 14:44 schrieb Danilo Krummrich:
> Add infrastructure to keep track of GPU virtual address (VA) mappings
> with a dedicated VA space manager implementation.
>
> New UAPIs, motivated by Vulkan sparse memory bindings graphics drivers
> start implementing, allow userspace applications to request multiple and
> arbitrary GPU VA mappings of buffer objects. The DRM GPU VA manager is
> intended to serve the following purposes in this context.
>
> 1) Provide infrastructure to track GPU VA allocations and mappings,
>     making use of the maple_tree.
>
> 2) Generically connect GPU VA mappings to their backing buffers, in
>     particular DRM GEM objects.
>
> 3) Provide a common implementation to perform more complex mapping
>     operations on the GPU VA space. In particular splitting and merging
>     of GPU VA mappings, e.g. for intersecting mapping requests or partial
>     unmap requests.
>
> Suggested-by: Dave Airlie <airlied@redhat.com>
> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
> ---
>   Documentation/gpu/drm-mm.rst    |   31 +
>   drivers/gpu/drm/Makefile        |    1 +
>   drivers/gpu/drm/drm_gem.c       |    3 +
>   drivers/gpu/drm/drm_gpuva_mgr.c | 1704 +++++++++++++++++++++++++++++++
>   include/drm/drm_drv.h           |    6 +
>   include/drm/drm_gem.h           |   75 ++
>   include/drm/drm_gpuva_mgr.h     |  714 +++++++++++++
>   7 files changed, 2534 insertions(+)
>   create mode 100644 drivers/gpu/drm/drm_gpuva_mgr.c
>   create mode 100644 include/drm/drm_gpuva_mgr.h
>
> diff --git a/Documentation/gpu/drm-mm.rst b/Documentation/gpu/drm-mm.rst
> index a52e6f4117d6..c9f120cfe730 100644
> --- a/Documentation/gpu/drm-mm.rst
> +++ b/Documentation/gpu/drm-mm.rst
> @@ -466,6 +466,37 @@ DRM MM Range Allocator Function References
>   .. kernel-doc:: drivers/gpu/drm/drm_mm.c
>      :export:
>   
> +DRM GPU VA Manager
> +==================
> +
> +Overview
> +--------
> +
> +.. kernel-doc:: drivers/gpu/drm/drm_gpuva_mgr.c
> +   :doc: Overview
> +
> +Split and Merge
> +---------------
> +
> +.. kernel-doc:: drivers/gpu/drm/drm_gpuva_mgr.c
> +   :doc: Split and Merge
> +
> +Locking
> +-------
> +
> +.. kernel-doc:: drivers/gpu/drm/drm_gpuva_mgr.c
> +   :doc: Locking
> +
> +
> +DRM GPU VA Manager Function References
> +--------------------------------------
> +
> +.. kernel-doc:: include/drm/drm_gpuva_mgr.h
> +   :internal:
> +
> +.. kernel-doc:: drivers/gpu/drm/drm_gpuva_mgr.c
> +   :export:
> +
>   DRM Buddy Allocator
>   ===================
>   
> diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile
> index d40defbb0347..4d098efffb98 100644
> --- a/drivers/gpu/drm/Makefile
> +++ b/drivers/gpu/drm/Makefile
> @@ -45,6 +45,7 @@ drm-y := \
>   	drm_vblank.o \
>   	drm_vblank_work.o \
>   	drm_vma_manager.o \
> +	drm_gpuva_mgr.o \
>   	drm_writeback.o
>   drm-$(CONFIG_DRM_LEGACY) += \
>   	drm_agpsupport.o \
> diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
> index 59a0bb5ebd85..65115fe88627 100644
> --- a/drivers/gpu/drm/drm_gem.c
> +++ b/drivers/gpu/drm/drm_gem.c
> @@ -164,6 +164,9 @@ void drm_gem_private_object_init(struct drm_device *dev,
>   	if (!obj->resv)
>   		obj->resv = &obj->_resv;
>   
> +	if (drm_core_check_feature(dev, DRIVER_GEM_GPUVA))
> +		drm_gem_gpuva_init(obj);
> +
>   	drm_vma_node_reset(&obj->vma_node);
>   	INIT_LIST_HEAD(&obj->lru_node);
>   }
> diff --git a/drivers/gpu/drm/drm_gpuva_mgr.c b/drivers/gpu/drm/drm_gpuva_mgr.c
> new file mode 100644
> index 000000000000..19f583704562
> --- /dev/null
> +++ b/drivers/gpu/drm/drm_gpuva_mgr.c
> @@ -0,0 +1,1704 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2022 Red Hat.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + * Authors:
> + *     Danilo Krummrich <dakr@redhat.com>
> + *
> + */
> +
> +#include <drm/drm_gem.h>
> +#include <drm/drm_gpuva_mgr.h>
> +
> +/**
> + * DOC: Overview
> + *
> + * The DRM GPU VA Manager, represented by struct drm_gpuva_manager keeps track
> + * of a GPU's virtual address (VA) space and manages the corresponding virtual
> + * mappings represented by &drm_gpuva objects. It also keeps track of the
> + * mapping's backing &drm_gem_object buffers.
> + *
> + * &drm_gem_object buffers maintain a list (and a corresponding list lock) of
> + * &drm_gpuva objects representing all existent GPU VA mappings using this
> + * &drm_gem_object as backing buffer.
> + *
> + * If the &DRM_GPUVA_MANAGER_REGIONS feature is enabled, a GPU VA mapping can
> + * only be created within a previously allocated &drm_gpuva_region, which
> + * represents a reserved portion of the GPU VA space. GPU VA mappings are not
> + * allowed to span over a &drm_gpuva_region's boundary.
> + *
> + * GPU VA regions can also be flagged as sparse, which allows drivers to create
> + * sparse mappings for a whole GPU VA region in order to support Vulkan
> + * 'Sparse Resources'.

Well, since we have now found that there is absolutely no technical
reason for having those regions, could we please drop them?

I don't really see a need for them any more.

Regards,
Christian.

> + *
> + * The GPU VA manager internally uses &maple_tree structures to manage the
> + * &drm_gpuva mappings and the &drm_gpuva_regions within a GPU's virtual address
> + * space.
> + *
> + * Besides the GPU VA space regions (&drm_gpuva_region) allocated by a driver
> + * the &drm_gpuva_manager contains a special region representing the portion of
> + * VA space reserved by the kernel. This node is initialized together with the
> + * GPU VA manager instance and removed when the GPU VA manager is destroyed.
> + *
> + * In a typical application drivers would embed struct drm_gpuva_manager,
> + * struct drm_gpuva_region and struct drm_gpuva within their own driver
> + * specific structures, there won't be any memory allocations of its own nor
> + * memory allocations of &drm_gpuva or &drm_gpuva_region entries.
> + */
> +
> +/**
> + * DOC: Split and Merge
> + *
> + * The DRM GPU VA manager also provides an algorithm implementing splitting and
> + * merging of existent GPU VA mappings with the ones that are requested to be
> + * mapped or unmapped. This feature is required by the Vulkan API to implement
> + * Vulkan 'Sparse Memory Bindings' - drivers UAPIs often refer to this as
> + * VM BIND.
> + *
> + * Drivers can call drm_gpuva_sm_map() to receive a sequence of callbacks
> + * containing map, unmap and remap operations for a given newly requested
> + * mapping. The sequence of callbacks represents the set of operations to
> + * execute in order to integrate the new mapping cleanly into the current state
> + * of the GPU VA space.
> + *
> + * Depending on how the new GPU VA mapping intersects with the existent mappings
> + * of the GPU VA space the &drm_gpuva_fn_ops callbacks contain an arbitrary
> + * amount of unmap operations, a maximum of two remap operations and a single
> + * map operation. The caller might receive no callback at all if no operation is
> + * required, e.g. if the requested mapping already exists in the exact same way.
> + *
> + * The single map operation, if existent, represents the original map operation
> + * requested by the caller. Please note that this operation might be altered
> + * comparing it with the original map operation, e.g. because it was merged with
> + * an already existent mapping. Hence, drivers must execute this map operation
> + * instead of the original one passed to drm_gpuva_sm_map().
> + *
> + * &drm_gpuva_op_unmap contains a 'keep' field, which indicates whether the
> + * &drm_gpuva to unmap is physically contiguous with the original mapping
> + * request. Optionally, if 'keep' is set, drivers may keep the actual page table
> + * entries for this &drm_gpuva, adding the missing page table entries only and
> + * update the &drm_gpuva_manager's view of things accordingly.
> + *
> + * Drivers may do the same optimization, namely delta page table updates, also
> + * for remap operations. This is possible since &drm_gpuva_op_remap consists of
> + * one unmap operation and one or two map operations, such that drivers can
> + * derive the page table update delta accordingly.
> + *
> + * Note that there can't be more than two existent mappings to split up, one at
> + * the beginning and one at the end of the new mapping, hence there is a
> + * maximum of two remap operations.
> + *
> + * Generally, the DRM GPU VA manager never merges mappings across the
> + * boundaries of &drm_gpuva_regions. This is the case since merging between
> + * GPU VA regions would result into unmap and map operations to be issued for
> + * both regions involved although the original mapping request was referred to
> + * one specific GPU VA region only. Since the other GPU VA region, the one not
> + * explicitly requested to be altered, might be in use by the GPU, we are not
> + * allowed to issue any map/unmap operations for this region.
> + *
> + * To update the &drm_gpuva_manager's view of the GPU VA space
> + * drm_gpuva_insert() and drm_gpuva_remove() should be used.
> + *
> + * Analogous to drm_gpuva_sm_map() drm_gpuva_sm_unmap() uses &drm_gpuva_fn_ops
> + * to call back into the driver in order to unmap a range of GPU VA space. The
> + * logic behind this function is way simpler though: For all existent mappings
> + * enclosed by the given range unmap operations are created. For mappings which
> + * are only partially located within the given range, remap operations are
> + * created such that those mappings are split up and re-mapped partially.
> + *
> + * The following diagram depicts the basic relationships of existent GPU VA
> + * mappings, a newly requested mapping and the resulting mappings as implemented
> + * by drm_gpuva_sm_map() - it doesn't cover any arbitrary combinations of these.
> + *
> + * 1) Requested mapping is identical, hence noop.
> + *
> + *    ::
> + *
> + *	     0     a     1
> + *	old: |-----------| (bo_offset=n)
> + *
> + *	     0     a     1
> + *	req: |-----------| (bo_offset=n)
> + *
> + *	     0     a     1
> + *	new: |-----------| (bo_offset=n)
> + *
> + *
> + * 2) Requested mapping is identical, except for the BO offset, hence replace
> + *    the mapping.
> + *
> + *    ::
> + *
> + *	     0     a     1
> + *	old: |-----------| (bo_offset=n)
> + *
> + *	     0     a     1
> + *	req: |-----------| (bo_offset=m)
> + *
> + *	     0     a     1
> + *	new: |-----------| (bo_offset=m)
> + *
> + *
> + * 3) Requested mapping is identical, except for the backing BO, hence replace
> + *    the mapping.
> + *
> + *    ::
> + *
> + *	     0     a     1
> + *	old: |-----------| (bo_offset=n)
> + *
> + *	     0     b     1
> + *	req: |-----------| (bo_offset=n)
> + *
> + *	     0     b     1
> + *	new: |-----------| (bo_offset=n)
> + *
> + *
> + * 4) Existent mapping is a left aligned subset of the requested one, hence
> + *    replace the existent one.
> + *
> + *    ::
> + *
> + *	     0  a  1
> + *	old: |-----|       (bo_offset=n)
> + *
> + *	     0     a     2
> + *	req: |-----------| (bo_offset=n)
> + *
> + *	     0     a     2
> + *	new: |-----------| (bo_offset=n)
> + *
> + *    .. note::
> + *       We expect to see the same result for a request with a different BO
> + *       and/or non-contiguous BO offset.
> + *
> + *
> + * 5) Requested mapping's range is a left aligned subset of the existent one,
> + *    but backed by a different BO. Hence, map the requested mapping and split
> + *    the existent one adjusting its BO offset.
> + *
> + *    ::
> + *
> + *	     0     a     2
> + *	old: |-----------| (bo_offset=n)
> + *
> + *	     0  b  1
> + *	req: |-----|       (bo_offset=n)
> + *
> + *	     0  b  1  a' 2
> + *	new: |-----|-----| (b.bo_offset=n, a.bo_offset=n+1)
> + *
> + *    .. note::
> + *       We expect to see the same result for a request with a different BO
> + *       and/or non-contiguous BO offset.
> + *
> + *
> + * 6) Existent mapping is a superset of the requested mapping, hence noop.
> + *
> + *    ::
> + *
> + *	     0     a     2
> + *	old: |-----------| (bo_offset=n)
> + *
> + *	     0  a  1
> + *	req: |-----|       (bo_offset=n)
> + *
> + *	     0     a     2
> + *	new: |-----------| (bo_offset=n)
> + *
> + *
> + * 7) Requested mapping's range is a right aligned subset of the existent one,
> + *    but backed by a different BO. Hence, map the requested mapping and split
> + *    the existent one, without adjusting the BO offset.
> + *
> + *    ::
> + *
> + *	     0     a     2
> + *	old: |-----------| (bo_offset=n)
> + *
> + *	           1  b  2
> + *	req:       |-----| (bo_offset=m)
> + *
> + *	     0  a  1  b  2
> + *	new: |-----|-----| (a.bo_offset=n,b.bo_offset=m)
> + *
> + *
> + * 8) Existent mapping is a superset of the requested mapping, hence noop.
> + *
> + *    ::
> + *
> + *	     0     a     2
> + *	old: |-----------| (bo_offset=n)
> + *
> + *	           1  a  2
> + *	req:       |-----| (bo_offset=n+1)
> + *
> + *	     0     a     2
> + *	new: |-----------| (bo_offset=n)
> + *
> + *
> + * 9) Existent mapping is overlapped at the end by the requested mapping backed
> + *    by a different BO. Hence, map the requested mapping and split up the
> + *    existent one, without adjusting the BO offset.
> + *
> + *    ::
> + *
> + *	     0     a     2
> + *	old: |-----------|       (bo_offset=n)
> + *
> + *	           1     b     3
> + *	req:       |-----------| (bo_offset=m)
> + *
> + *	     0  a  1     b     3
> + *	new: |-----|-----------| (a.bo_offset=n,b.bo_offset=m)
> + *
> + *
> + * 10) Existent mapping is overlapped by the requested mapping, both having the
> + *     same backing BO with a contiguous offset. Hence, merge both mappings.
> + *
> + *     ::
> + *
> + *	      0     a     2
> + *	 old: |-----------|       (bo_offset=n)
> + *
> + *	            1     a     3
> + *	 req:       |-----------| (bo_offset=n+1)
> + *
> + *	      0        a        3
> + *	 new: |-----------------| (bo_offset=n)
> + *
> + *
> + * 11) Requested mapping's range is a centered subset of the existent one
> + *     having a different backing BO. Hence, map the requested mapping and split
> + *     up the existent one in two mappings, adjusting the BO offset of the right
> + *     one accordingly.
> + *
> + *     ::
> + *
> + *	      0        a        3
> + *	 old: |-----------------| (bo_offset=n)
> + *
> + *	            1  b  2
> + *	 req:       |-----|       (bo_offset=m)
> + *
> + *	      0  a  1  b  2  a' 3
> + *	 new: |-----|-----|-----| (a.bo_offset=n,b.bo_offset=m,a'.bo_offset=n+2)
> + *
> + *
> + * 12) Requested mapping is a contiguous subset of the existent one, hence noop.
> + *
> + *     ::
> + *
> + *	      0        a        3
> + *	 old: |-----------------| (bo_offset=n)
> + *
> + *	            1  a  2
> + *	 req:       |-----|       (bo_offset=n+1)
> + *
> + *	      0        a        3
> + *	 new: |-----------------| (bo_offset=n)
> + *
> + *
> + * 13) Existent mapping is a right aligned subset of the requested one, hence
> + *     replace the existent one.
> + *
> + *     ::
> + *
> + *	            1  a  2
> + *	 old:       |-----| (bo_offset=n+1)
> + *
> + *	      0     a     2
> + *	 req: |-----------| (bo_offset=n)
> + *
> + *	      0     a     2
> + *	 new: |-----------| (bo_offset=n)
> + *
> + *     .. note::
> + *        We expect to see the same result for a request with a different bo
> + *        and/or non-contiguous bo_offset.
> + *
> + *
> + * 14) Existent mapping is a centered subset of the requested one, hence
> + *     replace the existent one.
> + *
> + *     ::
> + *
> + *	            1  a  2
> + *	 old:       |-----| (bo_offset=n+1)
> + *
> + *	      0        a        3
> + *	 req: |-----------------| (bo_offset=n)
> + *
> + *	      0        a        3
> + *	 new: |-----------------| (bo_offset=n)
> + *
> + *     .. note::
> + *        We expect to see the same result for a request with a different bo
> + *        and/or non-contiguous bo_offset.
> + *
> + *
> + * 15) Existent mapping is overlapped at the beginning by the requested mapping
> + *     backed by a different BO. Hence, map the requested mapping and split up
> + *     the existent one, adjusting its BO offset accordingly.
> + *
> + *     ::
> + *
> + *	            1     a     3
> + *	 old:       |-----------| (bo_offset=n)
> + *
> + *	      0     b     2
> + *	 req: |-----------|       (bo_offset=m)
> + *
> + *	      0     b     2  a' 3
> + *	 new: |-----------|-----| (b.bo_offset=m,a'.bo_offset=n+2)
> + *
> + *
> + * 16) Requested mapping fills the gap between two existent mappings all having
> + *     the same backing BO, such that all three have a contiguous BO offset.
> + *     Hence, merge all mappings.
> + *
> + *     ::
> + *
> + *	      0     a     1
> + *	 old: |-----------|                        (bo_offset=n)
> + *
> + *	                             2     a     3
> + *	 old':                       |-----------| (bo_offset=n+2)
> + *
> + *	                 1     a     2
> + *	 req:            |-----------|             (bo_offset=n+1)
> + *
> + *	                       a
> + *	 new: |----------------------------------| (bo_offset=n)
> + */
> +
> +/**
> + * DOC: Locking
> + *
> + * Generally, the GPU VA manager does not take care of locking itself, it is
> + * the driver's responsibility to take care of locking. Drivers might want to
> + * protect the following operations: inserting, removing and iterating
> + * &drm_gpuva and &drm_gpuva_region objects as well as generating all kinds of
> + * operations, such as split / merge or prefetch.
> + *
> + * The GPU VA manager also does not take care of the locking of the backing
> + * &drm_gem_object buffers' GPU VA lists by itself; drivers are responsible for
> + * enforcing mutual exclusion.
> + */
> +
> +
> +static int __drm_gpuva_region_insert(struct drm_gpuva_manager *mgr,
> +				     struct drm_gpuva_region *reg);
> +static void __drm_gpuva_region_remove(struct drm_gpuva_region *reg);
> +
> +/**
> + * drm_gpuva_manager_init - initialize a &drm_gpuva_manager
> + * @mgr: pointer to the &drm_gpuva_manager to initialize
> + * @name: the name of the GPU VA space, "unknown" is used if NULL
> + * @start_offset: the start offset of the GPU VA space
> + * @range: the size of the GPU VA space
> + * @reserve_offset: the start of the kernel reserved GPU VA area
> + * @reserve_range: the size of the kernel reserved GPU VA area
> + * @ops: &drm_gpuva_fn_ops called on &drm_gpuva_sm_map / &drm_gpuva_sm_unmap
> + * @flags: the feature flags of the &drm_gpuva_manager
> + *
> + * Every &drm_gpuva_manager has to be set up with this function before use.
> + * It initializes both maple trees, records the VA space boundaries, name,
> + * callbacks and flags, and inserts the kernel reserved area into the region
> + * tree.
> + *
> + * Note that @mgr must be cleared to 0 before calling this function. The
> + * string given as @name is only referenced, not copied, hence it is expected
> + * to be managed by the surrounding driver structures.
> + *
> + * The result of inserting the kernel reserved region is not checked.
> + */
> +void
> +drm_gpuva_manager_init(struct drm_gpuva_manager *mgr,
> +		       const char *name,
> +		       u64 start_offset, u64 range,
> +		       u64 reserve_offset, u64 reserve_range,
> +		       struct drm_gpuva_fn_ops *ops,
> +		       enum drm_gpuva_mgr_flags flags)
> +{
> +	struct drm_gpuva_region *kreg = &mgr->kernel_alloc_region;
> +
> +	mt_init_flags(&mgr->region_mt, MT_FLAGS_LOCK_NONE);
> +	mt_init_flags(&mgr->va_mt, MT_FLAGS_LOCK_NONE);
> +
> +	if (name)
> +		mgr->name = name;
> +	else
> +		mgr->name = "unknown";
> +
> +	mgr->flags = flags;
> +	mgr->ops = ops;
> +	mgr->mm_range = range;
> +	mgr->mm_start = start_offset;
> +
> +	/* Start from a clean region before describing the reserved area. */
> +	memset(kreg, 0, sizeof(*kreg));
> +	kreg->va.range = reserve_range;
> +	kreg->va.addr = reserve_offset;
> +
> +	__drm_gpuva_region_insert(mgr, kreg);
> +}
> +EXPORT_SYMBOL(drm_gpuva_manager_init);
> +
> +/**
> + * drm_gpuva_manager_destroy - cleanup a &drm_gpuva_manager
> + * @mgr: pointer to the &drm_gpuva_manager to clean up
> + *
> + * Removes the kernel reserved region (if it was inserted) and destroys both
> + * maple trees, warning if either of them still holds entries.
> + *
> + * Note that it is a bug to call this function on a manager that still
> + * holds GPU VA mappings.
> + */
> +void
> +drm_gpuva_manager_destroy(struct drm_gpuva_manager *mgr)
> +{
> +	mgr->name = NULL;
> +
> +	/* The region's manager pointer is only set once the region was
> +	 * successfully inserted. drm_gpuva_manager_init() does not report a
> +	 * failed insertion, hence don't try to erase a region that never made
> +	 * it into the tree; doing so would dereference a NULL manager.
> +	 */
> +	if (likely(mgr->kernel_alloc_region.mgr))
> +		__drm_gpuva_region_remove(&mgr->kernel_alloc_region);
> +
> +	WARN(!mtree_empty(&mgr->va_mt),
> +	     "GPUVA tree is not empty, potentially leaking memory.");
> +	__mt_destroy(&mgr->va_mt);
> +
> +	WARN(!mtree_empty(&mgr->region_mt),
> +	     "GPUVA region tree is not empty, potentially leaking memory.");
> +	__mt_destroy(&mgr->region_mt);
> +}
> +EXPORT_SYMBOL(drm_gpuva_manager_destroy);
> +
> +/* Check whether [addr, addr + range) lies entirely within the VA space
> + * managed by @mgr.
> + *
> + * Returns true only for a non-empty range that is fully contained in
> + * [mm_start, mm_start + mm_range). A merely overlapping range is rejected,
> + * since a mapping or region must never extend beyond the managed VA space.
> + * The comparison is done on offsets relative to mm_start, such that neither
> + * addr + range nor mm_start + mm_range can wrap around.
> + */
> +static inline bool
> +drm_gpuva_in_mm_range(struct drm_gpuva_manager *mgr, u64 addr, u64 range)
> +{
> +	u64 mm_start = mgr->mm_start;
> +	u64 mm_range = mgr->mm_range;
> +
> +	/* Empty ranges can't be stored, larger ones can't possibly fit. */
> +	if (!range || range > mm_range)
> +		return false;
> +
> +	if (addr < mm_start)
> +		return false;
> +
> +	return addr - mm_start <= mm_range - range;
> +}
> +
> +/* Check whether [addr, addr + range) intersects the kernel reserved area. */
> +static inline bool
> +drm_gpuva_in_kernel_region(struct drm_gpuva_manager *mgr, u64 addr, u64 range)
> +{
> +	u64 kernel_first = mgr->kernel_alloc_region.va.addr;
> +	u64 kernel_end = kernel_first + mgr->kernel_alloc_region.va.range;
> +
> +	/* Starts at or behind the end of the reserved area. */
> +	if (addr >= kernel_end)
> +		return false;
> +
> +	return addr + range > kernel_first;
> +}
> +
> +/* Look up the VA region, other than the kernel reserved one, which fully
> + * encloses [addr, addr + range). Returns NULL if there is no such region.
> + */
> +static struct drm_gpuva_region *
> +drm_gpuva_in_region(struct drm_gpuva_manager *mgr, u64 addr, u64 range)
> +{
> +	u64 end = addr + range;
> +	DRM_GPUVA_REGION_ITER(it, mgr);
> +
> +	drm_gpuva_iter_for_each_range(it, addr, addr + range) {
> +		struct drm_gpuva_region *cand = it.reg;
> +
> +		/* The kernel reserved region never qualifies. */
> +		if (cand == &mgr->kernel_alloc_region)
> +			continue;
> +
> +		/* The candidate must cover the range on both sides. */
> +		if (cand->va.addr > addr)
> +			continue;
> +
> +		if (cand->va.addr + cand->va.range < end)
> +			continue;
> +
> +		return cand;
> +	}
> +
> +	return NULL;
> +}
> +
> +/* Boolean wrapper around drm_gpuva_in_region(). */
> +static bool
> +drm_gpuva_in_any_region(struct drm_gpuva_manager *mgr, u64 addr, u64 range)
> +{
> +	return drm_gpuva_in_region(mgr, addr, range) != NULL;
> +}
> +
> +/**
> + * drm_gpuva_iter_remove - removes the iterator's current element
> + * @it: the &drm_gpuva_iterator
> + *
> + * This erases the entry the iterator's maple state currently points to.
> + */
> +void
> +drm_gpuva_iter_remove(struct drm_gpuva_iterator *it)
> +{
> +	mas_erase(&it->mas);
> +}
> +EXPORT_SYMBOL(drm_gpuva_iter_remove);
> +
> +/**
> + * drm_gpuva_insert - insert a &drm_gpuva
> + * @mgr: the &drm_gpuva_manager to insert the &drm_gpuva in
> + * @va: the &drm_gpuva to insert
> + *
> + * Insert a &drm_gpuva into a &drm_gpuva_manager, using the address and range
> + * stored in @va itself.
> + *
> + * The insertion is refused with -EINVAL if the range is empty or wraps
> + * around, is not accepted by the VA space check, intersects the kernel
> + * reserved area or, with DRM_GPUVA_MANAGER_REGIONS set, is not enclosed by a
> + * VA region. It is refused with -EEXIST if another mapping already exists
> + * within the range.
> + *
> + * Returns: 0 on success, negative error code on failure.
> + */
> +int
> +drm_gpuva_insert(struct drm_gpuva_manager *mgr,
> +		 struct drm_gpuva *va)
> +{
> +	u64 addr = va->va.addr;
> +	u64 range = va->va.range;
> +	MA_STATE(mas, &mgr->va_mt, addr, addr + range - 1);
> +	struct drm_gpuva_region *reg = NULL;
> +	int ret;
> +
> +	/* The last index stored above is addr + range - 1, which is only
> +	 * meaningful for a non-empty range that doesn't wrap around.
> +	 */
> +	if (unlikely(!range || addr + range < addr))
> +		return -EINVAL;
> +
> +	if (unlikely(!drm_gpuva_in_mm_range(mgr, addr, range)))
> +		return -EINVAL;
> +
> +	if (unlikely(drm_gpuva_in_kernel_region(mgr, addr, range)))
> +		return -EINVAL;
> +
> +	if (mgr->flags & DRM_GPUVA_MANAGER_REGIONS) {
> +		reg = drm_gpuva_in_region(mgr, addr, range);
> +		if (unlikely(!reg))
> +			return -EINVAL;
> +	}
> +
> +	if (unlikely(drm_gpuva_find_first(mgr, addr, range)))
> +		return -EEXIST;
> +
> +	ret = mas_store_gfp(&mas, va, GFP_KERNEL);
> +	if (unlikely(ret))
> +		return ret;
> +
> +	/* Only link the mapping to its manager and region once stored. */
> +	va->mgr = mgr;
> +	va->region = reg;
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL(drm_gpuva_insert);
> +
> +/**
> + * drm_gpuva_remove - remove a &drm_gpuva
> + * @va: the &drm_gpuva to remove
> + *
> + * This erases the entry found at the start address of @va from the tree of
> + * the &drm_gpuva_manager @va was inserted into.
> + */
> +void
> +drm_gpuva_remove(struct drm_gpuva *va)
> +{
> +	struct drm_gpuva_manager *mgr = va->mgr;
> +	MA_STATE(mas, &mgr->va_mt, va->va.addr, 0);
> +
> +	mas_erase(&mas);
> +}
> +EXPORT_SYMBOL(drm_gpuva_remove);
> +
> +/**
> + * drm_gpuva_link - link a &drm_gpuva
> + * @va: the &drm_gpuva to link
> + *
> + * This appends the given @va to the GPU VA list of the &drm_gem_object it is
> + * associated with. Nothing is done for a @va without a &drm_gem_object.
> + *
> + * This function expects the caller to protect the GEM's GPUVA list against
> + * concurrent access.
> + */
> +void
> +drm_gpuva_link(struct drm_gpuva *va)
> +{
> +	struct drm_gem_object *obj = va->gem.obj;
> +
> +	if (unlikely(!obj))
> +		return;
> +
> +	list_add_tail(&va->head, &obj->gpuva.list);
> +}
> +EXPORT_SYMBOL(drm_gpuva_link);
> +
> +/**
> + * drm_gpuva_unlink - unlink a &drm_gpuva
> + * @va: the &drm_gpuva to unlink
> + *
> + * This takes the given @va off the GPU VA list of the &drm_gem_object it is
> + * associated with. Nothing is done for a @va without a &drm_gem_object.
> + *
> + * This function expects the caller to protect the GEM's GPUVA list against
> + * concurrent access.
> + */
> +void
> +drm_gpuva_unlink(struct drm_gpuva *va)
> +{
> +	if (unlikely(!va->gem.obj))
> +		return;
> +
> +	list_del_init(&va->head);
> +}
> +EXPORT_SYMBOL(drm_gpuva_unlink);
> +
> +/**
> + * drm_gpuva_find_first - find the first &drm_gpuva in the given range
> + * @mgr: the &drm_gpuva_manager to search in
> + * @addr: the start address of the range to search
> + * @range: the size of the range to search
> + *
> + * Returns: the first &drm_gpuva found searching from @addr up to the last
> + * address of the range, or NULL if there is none
> + */
> +struct drm_gpuva *
> +drm_gpuva_find_first(struct drm_gpuva_manager *mgr,
> +		     u64 addr, u64 range)
> +{
> +	u64 last = addr + range - 1;
> +	MA_STATE(mas, &mgr->va_mt, addr, 0);
> +
> +	return mas_find(&mas, last);
> +}
> +EXPORT_SYMBOL(drm_gpuva_find_first);
> +
> +/**
> + * drm_gpuva_find - find a &drm_gpuva
> + * @mgr: the &drm_gpuva_manager to search in
> + * @addr: the &drm_gpuvas address
> + * @range: the &drm_gpuvas range
> + *
> + * Returns: the &drm_gpuva starting exactly at @addr and spanning exactly
> + * @range, or NULL if there is no such mapping
> + */
> +struct drm_gpuva *
> +drm_gpuva_find(struct drm_gpuva_manager *mgr,
> +	       u64 addr, u64 range)
> +{
> +	struct drm_gpuva *va;
> +
> +	va = drm_gpuva_find_first(mgr, addr, range);
> +	if (!va)
> +		return NULL;
> +
> +	/* drm_gpuva_find_first() returns the first mapping found within the
> +	 * searched range, which does not need to start at @addr. Hence, both
> +	 * the address and the range must be compared for an exact match.
> +	 */
> +	if (va->va.addr != addr || va->va.range != range)
> +		return NULL;
> +
> +	return va;
> +}
> +EXPORT_SYMBOL(drm_gpuva_find);
> +
> +/**
> + * drm_gpuva_find_prev - find the &drm_gpuva before the given address
> + * @mgr: the &drm_gpuva_manager to search in
> + * @start: the given GPU VA's start address
> + *
> + * Find the adjacent &drm_gpuva before the GPU VA with given @start address.
> + *
> + * Note that if there is any free space between the GPU VA mappings no mapping
> + * is returned.
> + *
> + * Returns: a pointer to the found &drm_gpuva or NULL if none was found
> + */
> +struct drm_gpuva *
> +drm_gpuva_find_prev(struct drm_gpuva_manager *mgr, u64 start)
> +{
> +	MA_STATE(mas, &mgr->va_mt, start, 0);
> +	u64 mm_end = mgr->mm_start + mgr->mm_range;
> +	bool has_prev = start > mgr->mm_start && start <= mm_end;
> +
> +	/* Nothing can precede an address at or outside the VA space bounds. */
> +	if (!has_prev)
> +		return NULL;
> +
> +	return mas_prev(&mas, start - 1);
> +}
> +EXPORT_SYMBOL(drm_gpuva_find_prev);
> +
> +/**
> + * drm_gpuva_find_next - find the &drm_gpuva after the given address
> + * @mgr: the &drm_gpuva_manager to search in
> + * @end: the given GPU VA's end address
> + *
> + * Find the adjacent &drm_gpuva after the GPU VA with given @end address.
> + *
> + * Note that if there is any free space between the GPU VA mappings no mapping
> + * is returned.
> + *
> + * Returns: a pointer to the found &drm_gpuva or NULL if none was found
> + */
> +struct drm_gpuva *
> +drm_gpuva_find_next(struct drm_gpuva_manager *mgr, u64 end)
> +{
> +	MA_STATE(mas, &mgr->va_mt, end - 1, 0);
> +	u64 mm_end = mgr->mm_start + mgr->mm_range;
> +	bool has_next = end >= mgr->mm_start && end < mm_end;
> +
> +	/* Nothing can follow an address at or outside the VA space bounds. */
> +	if (!has_next)
> +		return NULL;
> +
> +	return mas_next(&mas, end);
> +}
> +EXPORT_SYMBOL(drm_gpuva_find_next);
> +
> +/* Store @reg in the region tree of @mgr without checking whether the manager
> + * has regions enabled. Returns 0 on success, -EINVAL if the VA space check
> + * rejects the region, or the error returned by the maple tree store.
> + */
> +static int
> +__drm_gpuva_region_insert(struct drm_gpuva_manager *mgr,
> +			  struct drm_gpuva_region *reg)
> +{
> +	u64 first = reg->va.addr;
> +	u64 size = reg->va.range;
> +	MA_STATE(mas, &mgr->region_mt, first, first + size - 1);
> +	int err;
> +
> +	if (unlikely(!drm_gpuva_in_mm_range(mgr, first, size)))
> +		return -EINVAL;
> +
> +	err = mas_store_gfp(&mas, reg, GFP_KERNEL);
> +	if (likely(!err))
> +		reg->mgr = mgr;
> +
> +	return err;
> +}
> +
> +/**
> + * drm_gpuva_region_insert - insert a &drm_gpuva_region
> + * @mgr: the &drm_gpuva_manager to insert the &drm_gpuva_region in
> + * @reg: the &drm_gpuva_region to insert
> + *
> + * Insert a &drm_gpuva_region into a &drm_gpuva_manager, using the address and
> + * range stored in @reg itself. This is only permitted for managers having
> + * DRM_GPUVA_MANAGER_REGIONS set.
> + *
> + * Returns: 0 on success, negative error code on failure.
> + */
> +int
> +drm_gpuva_region_insert(struct drm_gpuva_manager *mgr,
> +			struct drm_gpuva_region *reg)
> +{
> +	bool regions_enabled = mgr->flags & DRM_GPUVA_MANAGER_REGIONS;
> +
> +	if (likely(regions_enabled))
> +		return __drm_gpuva_region_insert(mgr, reg);
> +
> +	return -EINVAL;
> +}
> +EXPORT_SYMBOL(drm_gpuva_region_insert);
> +
> +/* Erase the entry found at the start address of @reg from the region tree of
> + * the manager @reg belongs to; no further checks are done.
> + */
> +static void
> +__drm_gpuva_region_remove(struct drm_gpuva_region *reg)
> +{
> +	MA_STATE(mas, &reg->mgr->region_mt, reg->va.addr, 0);
> +
> +	mas_erase(&mas);
> +}
> +
> +/**
> + * drm_gpuva_region_remove - remove a &drm_gpuva_region
> + * @reg: the &drm_gpuva_region to remove
> + *
> + * This removes the given @reg from the region tree of its manager. Nothing is
> + * done if the manager does not have DRM_GPUVA_MANAGER_REGIONS set. Removing
> + * the kernel reserved region is refused with a warning. A region which still
> + * contains mappings triggers a warning, but is removed nevertheless.
> + */
> +void
> +drm_gpuva_region_remove(struct drm_gpuva_region *reg)
> +{
> +	struct drm_gpuva_manager *mgr = reg->mgr;
> +
> +	if (unlikely(!(mgr->flags & DRM_GPUVA_MANAGER_REGIONS)))
> +		return;
> +
> +	if (WARN(reg == &mgr->kernel_alloc_region,
> +		 "Can't destroy kernel reserved region.\n"))
> +		return;
> +
> +	WARN(!drm_gpuva_region_empty(reg),
> +	     "GPU VA region should be empty on destroy.\n");
> +
> +	__drm_gpuva_region_remove(reg);
> +}
> +EXPORT_SYMBOL(drm_gpuva_region_remove);
> +
> +/**
> + * drm_gpuva_region_empty - indicate whether a &drm_gpuva_region is empty
> + * @reg: the &drm_gpuva_region to check
> + *
> + * Returns: true if no &drm_gpuva is found iterating the range of the
> + * &drm_gpuva_region, false otherwise
> + */
> +bool
> +drm_gpuva_region_empty(struct drm_gpuva_region *reg)
> +{
> +	u64 first = reg->va.addr;
> +	u64 end = first + reg->va.range;
> +	DRM_GPUVA_ITER(it, reg->mgr);
> +
> +	/* A single hit is enough to know the region is in use. */
> +	drm_gpuva_iter_for_each_range(it, first, end) {
> +		return false;
> +	}
> +
> +	return true;
> +}
> +EXPORT_SYMBOL(drm_gpuva_region_empty);
> +
> +/**
> + * drm_gpuva_region_find_first - find the first &drm_gpuva_region in the given
> + * range
> + * @mgr: the &drm_gpuva_manager to search in
> + * @addr: the start address of the range to search
> + * @range: the size of the range to search
> + *
> + * Returns: the first &drm_gpuva_region found searching from @addr up to the
> + * last address of the range, or NULL if there is none
> + */
> +struct drm_gpuva_region *
> +drm_gpuva_region_find_first(struct drm_gpuva_manager *mgr,
> +			    u64 addr, u64 range)
> +{
> +	u64 last = addr + range - 1;
> +	MA_STATE(mas, &mgr->region_mt, addr, 0);
> +
> +	return mas_find(&mas, last);
> +}
> +EXPORT_SYMBOL(drm_gpuva_region_find_first);
> +
> +/**
> + * drm_gpuva_region_find - find a &drm_gpuva_region
> + * @mgr: the &drm_gpuva_manager to search in
> + * @addr: the &drm_gpuva_regions address
> + * @range: the &drm_gpuva_regions range
> + *
> + * Returns: the &drm_gpuva_region starting exactly at @addr and spanning
> + * exactly @range, or NULL if there is no such region
> + */
> +struct drm_gpuva_region *
> +drm_gpuva_region_find(struct drm_gpuva_manager *mgr,
> +		      u64 addr, u64 range)
> +{
> +	struct drm_gpuva_region *reg;
> +
> +	reg = drm_gpuva_region_find_first(mgr, addr, range);
> +	if (!reg)
> +		return NULL;
> +
> +	/* drm_gpuva_region_find_first() returns the first region found within
> +	 * the searched range, which does not need to start at @addr. Hence,
> +	 * both the address and the range must be compared for an exact match.
> +	 */
> +	if (reg->va.addr != addr || reg->va.range != range)
> +		return NULL;
> +
> +	return reg;
> +}
> +EXPORT_SYMBOL(drm_gpuva_region_find);
> +
> +/* Build a map operation on the stack and hand it to the @step callback. */
> +static int
> +op_map_cb(int (*step)(struct drm_gpuva_op *, void *),
> +	  void *priv,
> +	  u64 addr, u64 range,
> +	  struct drm_gem_object *obj, u64 offset)
> +{
> +	struct drm_gpuva_op op = {};
> +	struct drm_gpuva_op_map *m = &op.map;
> +
> +	m->va.addr = addr;
> +	m->va.range = range;
> +	m->gem.obj = obj;
> +	m->gem.offset = offset;
> +	op.op = DRM_GPUVA_OP_MAP;
> +
> +	return step(&op, priv);
> +}
> +
> +/* Build a remap operation on the stack, referencing the given @prev, @next
> + * and @unmap parts, and hand it to the @step callback.
> + */
> +static int
> +op_remap_cb(int (*step)(struct drm_gpuva_op *, void *),
> +	    void *priv,
> +	    struct drm_gpuva_op_map *prev,
> +	    struct drm_gpuva_op_map *next,
> +	    struct drm_gpuva_op_unmap *unmap)
> +{
> +	struct drm_gpuva_op op = {};
> +
> +	op.op = DRM_GPUVA_OP_REMAP;
> +	op.remap.prev = prev;
> +	op.remap.next = next;
> +	op.remap.unmap = unmap;
> +
> +	return step(&op, priv);
> +}
> +
> +/* Build an unmap operation for @va on the stack and hand it to the @step
> + * callback; @merge is passed on through the operation's keep field.
> + */
> +static int
> +op_unmap_cb(int (*step)(struct drm_gpuva_op *, void *),
> +	    void *priv,
> +	    struct drm_gpuva *va, bool merge)
> +{
> +	struct drm_gpuva_op op = {};
> +	struct drm_gpuva_op_unmap *u = &op.unmap;
> +
> +	u->va = va;
> +	u->keep = merge;
> +	op.op = DRM_GPUVA_OP_UNMAP;
> +
> +	return step(&op, priv);
> +}
> +
> +/* Mappings without a backing GEM object are never merged. */
> +static inline bool
> +gpuva_should_merge(struct drm_gpuva *va)
> +{
> +	return va->gem.obj != NULL;
> +}
> +
> +static int
> +__drm_gpuva_sm_map(struct drm_gpuva_manager *mgr,
> +		   struct drm_gpuva_fn_ops *ops, void *priv,
> +		   u64 req_addr, u64 req_range,
> +		   struct drm_gem_object *req_obj, u64 req_offset)
> +{
> +	DRM_GPUVA_ITER(it, mgr);
> +	int (*step)(struct drm_gpuva_op *, void *);
> +	struct drm_gpuva *va, *prev = NULL;
> +	u64 req_end = req_addr + req_range;
> +	bool skip_pmerge = false, skip_nmerge = false;
> +	int ret;
> +
> +	step = ops->sm_map_step;
> +
> +	if (unlikely(!drm_gpuva_in_mm_range(mgr, req_addr, req_range)))
> +		return -EINVAL;
> +
> +	if (unlikely(drm_gpuva_in_kernel_region(mgr, req_addr, req_range)))
> +		return -EINVAL;
> +
> +	if ((mgr->flags & DRM_GPUVA_MANAGER_REGIONS) &&
> +	    !drm_gpuva_in_any_region(mgr, req_addr, req_range))
> +		return -EINVAL;
> +
> +	drm_gpuva_iter_for_each_range(it, req_addr, req_end) {
> +		struct drm_gpuva *va = it.va;
> +		struct drm_gem_object *obj = va->gem.obj;
> +		u64 offset = va->gem.offset;
> +		u64 addr = va->va.addr;
> +		u64 range = va->va.range;
> +		u64 end = addr + range;
> +		bool merge = gpuva_should_merge(va);
> +
> +		/* Generally, we want to skip merging with potential mappings
> +		 * left and right of the requested one when we found a
> +		 * collision, since merging happens in this loop already.
> +		 * Hence, both skip flags are set for every mapping visited.
> +		 *
> +		 * However, there is one exception when the requested mapping
> +		 * spans into a free VM area. If this is the case we might
> +		 * still hit the boundary of another mapping before and/or
> +		 * after the free VM area. The branches below clear
> +		 * skip_pmerge when the first visited mapping starts after
> +		 * req_addr, and skip_nmerge when a mapping starting at or
> +		 * after req_addr ends before req_end.
> +		 *
> +		 * NOTE(review): both flags are set again on every further
> +		 * iteration, so a cleared flag only survives if the loop
> +		 * ends with that iteration - confirm this is intended.
> +		 */
> +		skip_pmerge = true;
> +		skip_nmerge = true;
> +
> +		if (addr == req_addr) {
> +			merge &= obj == req_obj &&
> +				 offset == req_offset;
> +
> +			if (end == req_end) {
> +				if (merge)
> +					goto done;
> +
> +				ret = op_unmap_cb(step, priv, va, false);
> +				if (ret)
> +					return ret;
> +				break;
> +			}
> +
> +			if (end < req_end) {
> +				skip_nmerge = false;
> +				ret = op_unmap_cb(step, priv, va, merge);
> +				if (ret)
> +					return ret;
> +				goto next;
> +			}
> +
> +			if (end > req_end) {
> +				struct drm_gpuva_op_map n = {
> +					.va.addr = req_end,
> +					.va.range = range - req_range,
> +					.gem.obj = obj,
> +					.gem.offset = offset + req_range,
> +				};
> +				struct drm_gpuva_op_unmap u = { .va = va };
> +
> +				if (merge)
> +					goto done;
> +
> +				ret = op_remap_cb(step, priv, NULL, &n, &u);
> +				if (ret)
> +					return ret;
> +				break;
> +			}
> +		} else if (addr < req_addr) {
> +			u64 ls_range = req_addr - addr;
> +			struct drm_gpuva_op_map p = {
> +				.va.addr = addr,
> +				.va.range = ls_range,
> +				.gem.obj = obj,
> +				.gem.offset = offset,
> +			};
> +			struct drm_gpuva_op_unmap u = { .va = va };
> +
> +			merge &= obj == req_obj &&
> +				 offset + ls_range == req_offset;
> +
> +			if (end == req_end) {
> +				if (merge)
> +					goto done;
> +
> +				ret = op_remap_cb(step, priv, &p, NULL, &u);
> +				if (ret)
> +					return ret;
> +				break;
> +			}
> +
> +			if (end < req_end) {
> +				u64 new_addr = addr;
> +				u64 new_range = req_range + ls_range;
> +				u64 new_offset = offset;
> +
> +				/* We validated that the requested mapping is
> +				 * within a single VA region already.
> +				 * Since it overlaps the current mapping (which
> +				 * can't cross a VA region boundary) we can be
> +				 * sure that we're still within the boundaries
> +				 * of the same VA region after merging.
> +				 *
> +				 * On merge, the request is extended to start
> +				 * at the current mapping's address and BO
> +				 * offset; req_end is left unchanged.
> +				 */
> +				if (merge) {
> +					req_offset = new_offset;
> +					req_addr = new_addr;
> +					req_range = new_range;
> +					ret = op_unmap_cb(step, priv, va, true);
> +					if (ret)
> +						return ret;
> +					goto next;
> +				}
> +
> +				ret = op_remap_cb(step, priv, &p, NULL, &u);
> +				if (ret)
> +					return ret;
> +				goto next;
> +			}
> +
> +			if (end > req_end) {
> +				struct drm_gpuva_op_map n = {
> +					.va.addr = req_end,
> +					.va.range = end - req_end,
> +					.gem.obj = obj,
> +					.gem.offset = offset + ls_range +
> +						      req_range,
> +				};
> +
> +				if (merge)
> +					goto done;
> +
> +				ret = op_remap_cb(step, priv, &p, &n, &u);
> +				if (ret)
> +					return ret;
> +				break;
> +			}
> +		} else if (addr > req_addr) {
> +			merge &= obj == req_obj &&
> +				 offset == req_offset +
> +					   (addr - req_addr);
> +
> +			if (!prev)
> +				skip_pmerge = false;
> +
> +			if (end == req_end) {
> +				ret = op_unmap_cb(step, priv, va, merge);
> +				if (ret)
> +					return ret;
> +				break;
> +			}
> +
> +			if (end < req_end) {
> +				skip_nmerge = false;
> +				ret = op_unmap_cb(step, priv, va, merge);
> +				if (ret)
> +					return ret;
> +				goto next;
> +			}
> +
> +			if (end > req_end) {
> +				struct drm_gpuva_op_map n = {
> +					.va.addr = req_end,
> +					.va.range = end - req_end,
> +					.gem.obj = obj,
> +					.gem.offset = offset + req_end - addr,
> +				};
> +				struct drm_gpuva_op_unmap u = { .va = va };
> +				u64 new_end = end;
> +				u64 new_range = new_end - req_addr;
> +
> +				/* We validated that the requested mapping is
> +				 * within a single VA region already.
> +				 * Since it overlaps the current mapping (which
> +				 * can't cross a VA region boundary) we can be
> +				 * sure that we're still within the boundaries
> +				 * of the same VA region after merging.
> +				 *
> +				 * On merge, the request is extended to end
> +				 * where the current mapping ends; req_addr is
> +				 * left unchanged.
> +				 */
> +				if (merge) {
> +					req_end = new_end;
> +					req_range = new_range;
> +					ret = op_unmap_cb(step, priv, va, true);
> +					if (ret)
> +						return ret;
> +					break;
> +				}
> +
> +				ret = op_remap_cb(step, priv, NULL, &n, &u);
> +				if (ret)
> +					return ret;
> +				break;
> +			}
> +		}
> +next:
> +		prev = va;
> +	}
> +
> +	va = skip_pmerge ? NULL : drm_gpuva_find_prev(mgr, req_addr);
> +	if (va) {
> +		struct drm_gem_object *obj = va->gem.obj;
> +		u64 offset = va->gem.offset;
> +		u64 addr = va->va.addr;
> +		u64 range = va->va.range;
> +		u64 new_offset = offset;
> +		u64 new_addr = addr;
> +		u64 new_range = req_range + range;
> +		bool merge = gpuva_should_merge(va) &&
> +			     obj == req_obj &&
> +			     offset + range == req_offset;
> +
> +		if (mgr->flags & DRM_GPUVA_MANAGER_REGIONS)
> +			merge &= drm_gpuva_in_any_region(mgr, new_addr,
> +							 new_range);
> +
> +		if (merge) {
> +			ret = op_unmap_cb(step, priv, va, true);
> +			if (ret)
> +				return ret;
> +
> +			req_offset = new_offset;
> +			req_addr = new_addr;
> +			req_range = new_range;
> +		}
> +	}
> +
> +	va = skip_nmerge ? NULL : drm_gpuva_find_next(mgr, req_end);
> +	if (va) {
> +		struct drm_gem_object *obj = va->gem.obj;
> +		u64 offset = va->gem.offset;
> +		u64 addr = va->va.addr;
> +		u64 range = va->va.range;
> +		u64 end = addr + range;
> +		u64 new_range = req_range + range;
> +		u64 new_end = end;
> +		bool merge = gpuva_should_merge(va) &&
> +			     obj == req_obj &&
> +			     offset == req_offset + req_range;
> +
> +		if (mgr->flags & DRM_GPUVA_MANAGER_REGIONS)
> +			merge &= drm_gpuva_in_any_region(mgr, req_addr,
> +							 new_range);
> +
> +		if (merge) {
> +			ret = op_unmap_cb(step, priv, va, true);
> +			if (ret)
> +				return ret;
> +
> +			req_range = new_range;
> +			req_end = new_end;
> +		}
> +	}
> +
> +	ret = op_map_cb(step, priv,
> +			req_addr, req_range,
> +			req_obj, req_offset);
> +	if (ret)
> +		return ret;
> +
> +done:
> +	return 0;
> +}
> +
> +/* Walk all mappings intersecting [req_addr, req_addr + req_range) and call
> + * the sm_unmap_step callback of @ops once per mapping.
> + *
> + * A mapping reaching out of the requested range on either side is reported as
> + * a remap operation, carrying the part(s) to keep as prev and/or next; a
> + * mapping entirely within the requested range is reported as a plain unmap
> + * operation. The first non-zero callback return value aborts the walk and is
> + * returned; otherwise 0 is returned.
> + */
> +static int
> +__drm_gpuva_sm_unmap(struct drm_gpuva_manager *mgr,
> +		     struct drm_gpuva_fn_ops *ops, void *priv,
> +		     u64 req_addr, u64 req_range)
> +{
> +	DRM_GPUVA_ITER(it, mgr);
> +	int (*step)(struct drm_gpuva_op *, void *);
> +	u64 req_end = req_addr + req_range;
> +	int ret;
> +
> +	step = ops->sm_unmap_step;
> +
> +	drm_gpuva_iter_for_each_range(it, req_addr, req_end) {
> +		struct drm_gpuva *va = it.va;
> +		struct drm_gpuva_op_map prev = {}, next = {};
> +		bool prev_split = false, next_split = false;
> +		struct drm_gem_object *obj = va->gem.obj;
> +		u64 offset = va->gem.offset;
> +		u64 addr = va->va.addr;
> +		u64 range = va->va.range;
> +		u64 end = addr + range;
> +
> +		/* The mapping starts before the requested range, keep its
> +		 * head.
> +		 */
> +		if (addr < req_addr) {
> +			prev.va.addr = addr;
> +			prev.va.range = req_addr - addr;
> +			prev.gem.obj = obj;
> +			prev.gem.offset = offset;
> +
> +			prev_split = true;
> +		}
> +
> +		/* The mapping ends after the requested range, keep its tail
> +		 * and advance the BO offset by the part cut off in front.
> +		 */
> +		if (end > req_end) {
> +			next.va.addr = req_end;
> +			next.va.range = end - req_end;
> +			next.gem.obj = obj;
> +			next.gem.offset = offset + (req_end - addr);
> +
> +			next_split = true;
> +		}
> +
> +		if (prev_split || next_split) {
> +			struct drm_gpuva_op_unmap unmap = { .va = va };
> +
> +			/* Only pass the parts which actually exist. A side
> +			 * without a split must be NULL, as it is for the remap
> +			 * operations generated by __drm_gpuva_sm_map(), rather
> +			 * than a zeroed map operation.
> +			 */
> +			ret = op_remap_cb(step, priv,
> +					  prev_split ? &prev : NULL,
> +					  next_split ? &next : NULL,
> +					  &unmap);
> +			if (ret)
> +				return ret;
> +		} else {
> +			ret = op_unmap_cb(step, priv, va, false);
> +			if (ret)
> +				return ret;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * drm_gpuva_sm_map - creates the &drm_gpuva_op split/merge steps
> + * @mgr: the &drm_gpuva_manager representing the GPU VA space
> + * @priv: pointer to a driver private data structure
> + * @req_addr: the start address of the new mapping
> + * @req_range: the range of the new mapping
> + * @req_obj: the &drm_gem_object to map
> + * @req_offset: the offset within the &drm_gem_object
> + *
> + * This function iterates the given range of the GPU VA space. It utilizes the
> + * &drm_gpuva_fn_ops to call back into the driver providing the split and merge
> + * steps.
> + *
> + * Drivers may use these callbacks to update the GPU VA space right away within
> + * the callback. In case the driver decides to copy and store the operations for
> + * later processing neither this function nor &drm_gpuva_sm_unmap is allowed to
> + * be called before the &drm_gpuva_manager's view of the GPU VA space was
> + * updated with the previous set of operations. To update the
> + * &drm_gpuva_manager's view of the GPU VA space drm_gpuva_insert(),
> + * drm_gpuva_destroy_locked() and/or drm_gpuva_destroy_unlocked() should be
> + * used.
> + *
> + * A sequence of callbacks can contain map, unmap and remap operations, but
> + * the sequence of callbacks might also be empty if no operation is required,
> + * e.g. if the requested mapping already exists in the exact same way.
> + *
> + * There can be an arbitrary amount of unmap operations, a maximum of two remap
> + * operations and a single map operation. The latter one, if existent,
> + * represents the original map operation requested by the caller. Please note
> + * that the map operation might have been modified, e.g. if it was merged with
> + * an existent mapping.
> + *
> + * Returns: 0 on success or a negative error code; -EINVAL if the manager has
> + * no sm_map_step callback
> + */
> +int
> +drm_gpuva_sm_map(struct drm_gpuva_manager *mgr, void *priv,
> +		 u64 req_addr, u64 req_range,
> +		 struct drm_gem_object *req_obj, u64 req_offset)
> +{
> +	struct drm_gpuva_fn_ops *ops = mgr->ops;
> +
> +	if (!ops)
> +		return -EINVAL;
> +
> +	if (!ops->sm_map_step)
> +		return -EINVAL;
> +
> +	return __drm_gpuva_sm_map(mgr, ops, priv,
> +				  req_addr, req_range,
> +				  req_obj, req_offset);
> +}
> +EXPORT_SYMBOL(drm_gpuva_sm_map);
> +
> +/**
> + * drm_gpuva_sm_unmap - creates the &drm_gpuva_ops to split on unmap
> + * @mgr: the &drm_gpuva_manager representing the GPU VA space
> + * @req_addr: the start address of the range to unmap
> + * @req_range: the range of the mappings to unmap
> + * @ops: the &drm_gpuva_fn_ops callbacks to provide the split/merge steps
> + * @priv: pointer to a driver private data structure
> + *
> + * This function iterates the given range of the GPU VA space. It utilizes the
> + * &drm_gpuva_fn_ops to call back into the driver providing the operations to
> + * unmap and, if required, split existent mappings.
> + *
> + * Drivers may use these callbacks to update the GPU VA space right away within
> + * the callback. In case the driver decides to copy and store the operations for
> + * later processing neither this function nor &drm_gpuva_sm_map is allowed to be
> + * called before the &drm_gpuva_manager's view of the GPU VA space was updated
> + * with the previous set of operations. To update the &drm_gpuva_manager's view
> + * of the GPU VA space drm_gpuva_insert(), drm_gpuva_destroy_locked() and/or
> + * drm_gpuva_destroy_unlocked() should be used.
> + *
> + * A sequence of callbacks can contain unmap and remap operations, depending on
> + * whether there are actual overlapping mappings to split.
> + *
> + * There can be an arbitrary amount of unmap operations and a maximum of two
> + * remap operations.
> + *
> + * Returns: 0 on success or a negative error code
> + */
> +int
> +drm_gpuva_sm_unmap(struct drm_gpuva_manager *mgr, void *priv,
> +		   u64 req_addr, u64 req_range)
> +{
> +	struct drm_gpuva_fn_ops *fn = mgr->ops;
> +
> +	/* Only run the split logic if an unmap step callback exists. */
> +	if (fn && fn->sm_unmap_step)
> +		return __drm_gpuva_sm_unmap(mgr, fn, priv,
> +					    req_addr, req_range);
> +
> +	return -EINVAL;
> +}
> +EXPORT_SYMBOL(drm_gpuva_sm_unmap);
> +
> +static struct drm_gpuva_op *
> +gpuva_op_alloc(struct drm_gpuva_manager *mgr)
> +{
> +	struct drm_gpuva_fn_ops *fn = mgr->ops;
> +
> +	/*
> +	 * Use the op_alloc callback of the manager's ops when there is one,
> +	 * otherwise fall back to a zeroed allocation. Either way the result
> +	 * is NULL on failure.
> +	 */
> +	if (fn && fn->op_alloc)
> +		return fn->op_alloc();
> +
> +	return kzalloc(sizeof(struct drm_gpuva_op), GFP_KERNEL);
> +}
> +
> +static void
> +gpuva_op_free(struct drm_gpuva_manager *mgr,
> +	      struct drm_gpuva_op *op)
> +{
> +	struct drm_gpuva_fn_ops *fn = mgr->ops;
> +
> +	/* No op_free callback: the op is released with kfree(). */
> +	if (!fn || !fn->op_free) {
> +		kfree(op);
> +		return;
> +	}
> +
> +	fn->op_free(op);
> +}
> +
> +/*
> + * drm_gpuva_sm_step - record a split/merge step in a &drm_gpuva_ops list
> + * @__op: the operation handed in by the split/merge code
> + * @priv: points to a { mgr, ops } pair as set up by the *_ops_create() callers
> + *
> + * Allocates a new &drm_gpuva_op, copies @__op into it and appends the copy to
> + * the list. For remap operations the unmap, prev and next sub-operations are
> + * duplicated as well.
> + *
> + * NOTE(review): this function is not static and no prototype for it is visible
> + * here, although the only visible user is gpuva_list_ops below - confirm
> + * whether it should be static.
> + *
> + * Returns: 0 on success, -ENOMEM if any allocation fails
> + */
> +int drm_gpuva_sm_step(struct drm_gpuva_op *__op, void *priv)
> +{
> +	struct {
> +		struct drm_gpuva_manager *mgr;
> +		struct drm_gpuva_ops *ops;
> +	} *args = priv;
> +	struct drm_gpuva_manager *mgr = args->mgr;
> +	struct drm_gpuva_ops *ops = args->ops;
> +	struct drm_gpuva_op *op;
> +
> +	op = gpuva_op_alloc(mgr);
> +	if (unlikely(!op))
> +		goto err;
> +
> +	memcpy(op, __op, sizeof(*op));
> +
> +	if (op->op == DRM_GPUVA_OP_REMAP) {
> +		struct drm_gpuva_op_remap *__r = &__op->remap;
> +		struct drm_gpuva_op_remap *r = &op->remap;
> +
> +		r->unmap = kmemdup(__r->unmap, sizeof(*r->unmap),
> +				   GFP_KERNEL);
> +		if (unlikely(!r->unmap))
> +			goto err_free_op;
> +
> +		if (__r->prev) {
> +			r->prev = kmemdup(__r->prev, sizeof(*r->prev),
> +					  GFP_KERNEL);
> +			if (unlikely(!r->prev))
> +				goto err_free_unmap;
> +		}
> +
> +		if (__r->next) {
> +			r->next = kmemdup(__r->next, sizeof(*r->next),
> +					  GFP_KERNEL);
> +			if (unlikely(!r->next))
> +				goto err_free_prev;
> +		}
> +	}
> +
> +	list_add_tail(&op->entry, &ops->list);
> +
> +	return 0;
> +
> +	/*
> +	 * Unwind in reverse order of allocation so that a failed next copy
> +	 * releases both the prev and the unmap copies. op->remap.prev is NULL
> +	 * when there was no prev part, which kfree() tolerates.
> +	 */
> +err_free_prev:
> +	kfree(op->remap.prev);
> +err_free_unmap:
> +	kfree(op->remap.unmap);
> +err_free_op:
> +	gpuva_op_free(mgr, op);
> +err:
> +	return -ENOMEM;
> +}
> +
> +/* Callback table routing both map and unmap steps to drm_gpuva_sm_step(). */
> +static struct drm_gpuva_fn_ops gpuva_list_ops = {
> +	.sm_map_step = drm_gpuva_sm_step,
> +	.sm_unmap_step = drm_gpuva_sm_step,
> +};
> +
> +/**
> + * drm_gpuva_sm_map_ops_create - creates the &drm_gpuva_ops to split and merge
> + * @mgr: the &drm_gpuva_manager representing the GPU VA space
> + * @req_addr: the start address of the new mapping
> + * @req_range: the range of the new mapping
> + * @req_obj: the &drm_gem_object to map
> + * @req_offset: the offset within the &drm_gem_object
> + *
> + * This function creates a list of operations to perform splitting and merging
> + * of existent mapping(s) with the newly requested one.
> + *
> + * The list can be iterated with &drm_gpuva_for_each_op and must be processed
> + * in the given order. It can contain map, unmap and remap operations, but it
> + * also can be empty if no operation is required, e.g. if the requested mapping
> + * already exists in the exact same way.
> + *
> + * There can be an arbitrary amount of unmap operations, a maximum of two remap
> + * operations and a single map operation. The latter one, if existent,
> + * represents the original map operation requested by the caller. Please note
> + * that the map operation might have been modified, e.g. if it was merged with
> + * an existent mapping.
> + *
> + * Note that before calling this function again with another mapping request it
> + * is necessary to update the &drm_gpuva_manager's view of the GPU VA space. The
> + * previously obtained operations must be either processed or abandoned. To
> + * update the &drm_gpuva_manager's view of the GPU VA space drm_gpuva_insert(),
> + * drm_gpuva_destroy_locked() and/or drm_gpuva_destroy_unlocked() should be
> + * used.
> + *
> + * After the caller finished processing the returned &drm_gpuva_ops, they must
> + * be freed with &drm_gpuva_ops_free.
> + *
> + * If creating the operations fails, all operations created up to that point
> + * are freed before returning.
> + *
> + * Returns: a pointer to the &drm_gpuva_ops on success, an ERR_PTR on failure
> + */
> +struct drm_gpuva_ops *
> +drm_gpuva_sm_map_ops_create(struct drm_gpuva_manager *mgr,
> +			    u64 req_addr, u64 req_range,
> +			    struct drm_gem_object *req_obj, u64 req_offset)
> +{
> +	struct drm_gpuva_ops *ops;
> +	struct {
> +		struct drm_gpuva_manager *mgr;
> +		struct drm_gpuva_ops *ops;
> +	} args;
> +	int ret;
> +
> +	ops = kzalloc(sizeof(*ops), GFP_KERNEL);
> +	if (unlikely(!ops))
> +		return ERR_PTR(-ENOMEM);
> +
> +	INIT_LIST_HEAD(&ops->list);
> +
> +	/* This pair is what drm_gpuva_sm_step() unpacks from its priv pointer. */
> +	args.mgr = mgr;
> +	args.ops = ops;
> +
> +	ret = __drm_gpuva_sm_map(mgr, &gpuva_list_ops, &args,
> +				 req_addr, req_range,
> +				 req_obj, req_offset);
> +	if (ret) {
> +		/*
> +		 * Steps that succeeded before the failure have already queued
> +		 * ops on ops->list; free those along with the container
> +		 * instead of only the container.
> +		 */
> +		drm_gpuva_ops_free(mgr, ops);
> +		return ERR_PTR(ret);
> +	}
> +
> +	return ops;
> +}
> +EXPORT_SYMBOL(drm_gpuva_sm_map_ops_create);
> +
> +/**
> + * drm_gpuva_sm_unmap_ops_create - creates the &drm_gpuva_ops to split on unmap
> + * @mgr: the &drm_gpuva_manager representing the GPU VA space
> + * @req_addr: the start address of the range to unmap
> + * @req_range: the range of the mappings to unmap
> + *
> + * This function creates a list of operations to perform unmapping and, if
> + * required, splitting of the mappings overlapping the unmap range.
> + *
> + * The list can be iterated with &drm_gpuva_for_each_op and must be processed
> + * in the given order. It can contain unmap and remap operations, depending on
> + * whether there are actual overlapping mappings to split.
> + *
> + * There can be an arbitrary amount of unmap operations and a maximum of two
> + * remap operations.
> + *
> + * Note that before calling this function again with another range to unmap it
> + * is necessary to update the &drm_gpuva_manager's view of the GPU VA space. The
> + * previously obtained operations must be processed or abandoned. To update the
> + * &drm_gpuva_manager's view of the GPU VA space drm_gpuva_insert(),
> + * drm_gpuva_destroy_locked() and/or drm_gpuva_destroy_unlocked() should be
> + * used.
> + *
> + * After the caller finished processing the returned &drm_gpuva_ops, they must
> + * be freed with &drm_gpuva_ops_free.
> + *
> + * If creating the operations fails, all operations created up to that point
> + * are freed before returning.
> + *
> + * Returns: a pointer to the &drm_gpuva_ops on success, an ERR_PTR on failure
> + */
> +struct drm_gpuva_ops *
> +drm_gpuva_sm_unmap_ops_create(struct drm_gpuva_manager *mgr,
> +			      u64 req_addr, u64 req_range)
> +{
> +	struct drm_gpuva_ops *ops;
> +	struct {
> +		struct drm_gpuva_manager *mgr;
> +		struct drm_gpuva_ops *ops;
> +	} args;
> +	int ret;
> +
> +	ops = kzalloc(sizeof(*ops), GFP_KERNEL);
> +	if (unlikely(!ops))
> +		return ERR_PTR(-ENOMEM);
> +
> +	INIT_LIST_HEAD(&ops->list);
> +
> +	/* This pair is what drm_gpuva_sm_step() unpacks from its priv pointer. */
> +	args.mgr = mgr;
> +	args.ops = ops;
> +
> +	ret = __drm_gpuva_sm_unmap(mgr, &gpuva_list_ops, &args,
> +				   req_addr, req_range);
> +	if (ret) {
> +		/*
> +		 * Steps that succeeded before the failure have already queued
> +		 * ops on ops->list; free those along with the container
> +		 * instead of only the container.
> +		 */
> +		drm_gpuva_ops_free(mgr, ops);
> +		return ERR_PTR(ret);
> +	}
> +
> +	return ops;
> +}
> +EXPORT_SYMBOL(drm_gpuva_sm_unmap_ops_create);
> +
> +/**
> + * drm_gpuva_prefetch_ops_create - creates the &drm_gpuva_ops to prefetch
> + * @mgr: the &drm_gpuva_manager representing the GPU VA space
> + * @addr: the start address of the range to prefetch
> + * @range: the range of the mappings to prefetch
> + *
> + * This function creates a list of prefetch operations, one for every
> + * &drm_gpuva found while iterating the given range.
> + *
> + * The list can be iterated with &drm_gpuva_for_each_op and must be processed
> + * in the given order. It contains prefetch operations only, and there can be
> + * an arbitrary amount of them.
> + *
> + * After the caller finished processing the returned &drm_gpuva_ops, they must
> + * be freed with &drm_gpuva_ops_free.
> + *
> + * Returns: a pointer to the &drm_gpuva_ops on success, an ERR_PTR on failure
> + */
> +struct drm_gpuva_ops *
> +drm_gpuva_prefetch_ops_create(struct drm_gpuva_manager *mgr,
> +			      u64 addr, u64 range)
> +{
> +	DRM_GPUVA_ITER(iter, mgr);
> +	struct drm_gpuva_ops *res;
> +
> +	res = kzalloc(sizeof(*res), GFP_KERNEL);
> +	if (!res)
> +		return ERR_PTR(-ENOMEM);
> +
> +	INIT_LIST_HEAD(&res->list);
> +
> +	drm_gpuva_iter_for_each_range(iter, addr, addr + range) {
> +		struct drm_gpuva_op *pf = gpuva_op_alloc(mgr);
> +
> +		if (!pf) {
> +			/* Drop everything queued so far, then the list. */
> +			drm_gpuva_ops_free(mgr, res);
> +			return ERR_PTR(-ENOMEM);
> +		}
> +
> +		pf->op = DRM_GPUVA_OP_PREFETCH;
> +		pf->prefetch.va = iter.va;
> +		list_add_tail(&pf->entry, &res->list);
> +	}
> +
> +	return res;
> +}
> +EXPORT_SYMBOL(drm_gpuva_prefetch_ops_create);
> +
> +/**
> + * drm_gpuva_gem_unmap_ops_create - creates the &drm_gpuva_ops to unmap a GEM
> + * @mgr: the &drm_gpuva_manager representing the GPU VA space
> + * @obj: the &drm_gem_object to unmap
> + *
> + * This function creates a list of operations to perform unmapping for every
> + * GPUVA attached to a GEM.
> + *
> + * The list can be iterated with &drm_gpuva_for_each_op and consists of an
> + * arbitrary amount of unmap operations.
> + *
> + * After the caller finished processing the returned &drm_gpuva_ops, they must
> + * be freed with &drm_gpuva_ops_free.
> + *
> + * It is the caller's responsibility to protect the GEM's GPUVA list against
> + * concurrent access.
> + *
> + * Returns: a pointer to the &drm_gpuva_ops on success, an ERR_PTR on failure
> + */
> +struct drm_gpuva_ops *
> +drm_gpuva_gem_unmap_ops_create(struct drm_gpuva_manager *mgr,
> +			       struct drm_gem_object *obj)
> +{
> +	struct drm_gpuva_ops *res;
> +	struct drm_gpuva *va;
> +
> +	res = kzalloc(sizeof(*res), GFP_KERNEL);
> +	if (!res)
> +		return ERR_PTR(-ENOMEM);
> +
> +	INIT_LIST_HEAD(&res->list);
> +
> +	drm_gem_for_each_gpuva(va, obj) {
> +		struct drm_gpuva_op *uop = gpuva_op_alloc(mgr);
> +
> +		if (!uop) {
> +			/* Drop everything queued so far, then the list. */
> +			drm_gpuva_ops_free(mgr, res);
> +			return ERR_PTR(-ENOMEM);
> +		}
> +
> +		uop->op = DRM_GPUVA_OP_UNMAP;
> +		uop->unmap.va = va;
> +		list_add_tail(&uop->entry, &res->list);
> +	}
> +
> +	return res;
> +}
> +EXPORT_SYMBOL(drm_gpuva_gem_unmap_ops_create);
> +
> +
> +/**
> + * drm_gpuva_ops_free - free the given &drm_gpuva_ops
> + * @mgr: the &drm_gpuva_manager the ops were created for
> + * @ops: the &drm_gpuva_ops to free
> + *
> + * Frees the given &drm_gpuva_ops structure including all the ops associated
> + * with it.
> + */
> +void
> +drm_gpuva_ops_free(struct drm_gpuva_manager *mgr,
> +		   struct drm_gpuva_ops *ops)
> +{
> +	struct drm_gpuva_op *cur, *tmp;
> +
> +	drm_gpuva_for_each_op_safe(cur, tmp, ops) {
> +		list_del(&cur->entry);
> +
> +		/* Remap ops carry separately allocated sub-operations. */
> +		if (cur->op == DRM_GPUVA_OP_REMAP) {
> +			struct drm_gpuva_op_remap *r = &cur->remap;
> +
> +			kfree(r->unmap);
> +			kfree(r->next);
> +			kfree(r->prev);
> +		}
> +
> +		gpuva_op_free(mgr, cur);
> +	}
> +
> +	kfree(ops);
> +}
> +EXPORT_SYMBOL(drm_gpuva_ops_free);
> diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h
> index 1d76d0686b03..4fe4a1552948 100644
> --- a/include/drm/drm_drv.h
> +++ b/include/drm/drm_drv.h
> @@ -104,6 +104,12 @@ enum drm_driver_feature {
>   	 * acceleration should be handled by two drivers that are connected using auxiliary bus.
>   	 */
>   	DRIVER_COMPUTE_ACCEL            = BIT(7),
> +	/**
> +	 * @DRIVER_GEM_GPUVA:
> +	 *
> +	 * Driver supports user defined GPU VA bindings for GEM objects.
> +	 */
> +	DRIVER_GEM_GPUVA		= BIT(8),
>   
>   	/* IMPORTANT: Below are all the legacy flags, add new ones above. */
>   
> diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
> index 772a4adf5287..4a3679034966 100644
> --- a/include/drm/drm_gem.h
> +++ b/include/drm/drm_gem.h
> @@ -36,6 +36,8 @@
>   
>   #include <linux/kref.h>
>   #include <linux/dma-resv.h>
> +#include <linux/list.h>
> +#include <linux/mutex.h>
>   
>   #include <drm/drm_vma_manager.h>
>   
> @@ -337,6 +339,17 @@ struct drm_gem_object {
>   	 */
>   	struct dma_resv _resv;
>   
> +	/**
> +	 * @gpuva:
> +	 *
> +	 * Provides the list and list mutex of GPU VAs attached to this
> +	 * GEM object.
> +	 */
> +	struct {
> +		struct list_head list;
> +		struct mutex mutex;
> +	} gpuva;
> +
>   	/**
>   	 * @funcs:
>   	 *
> @@ -479,4 +492,66 @@ void drm_gem_lru_move_tail(struct drm_gem_lru *lru, struct drm_gem_object *obj);
>   unsigned long drm_gem_lru_scan(struct drm_gem_lru *lru, unsigned nr_to_scan,
>   			       bool (*shrink)(struct drm_gem_object *obj));
>   
> +/**
> + * drm_gem_gpuva_init - initialize the gpuva list of a GEM object
> + * @obj: the &drm_gem_object
> + *
> + * This initializes the &drm_gem_object's &drm_gpuva list and the mutex
> + * protecting it.
> + *
> + * Calling this function is only necessary for drivers intending to support the
> + * &drm_driver_feature DRIVER_GEM_GPUVA.
> + */
> +static inline void drm_gem_gpuva_init(struct drm_gem_object *obj)
> +{
> +	/* Set up the lock first, then the (empty) list it protects. */
> +	mutex_init(&obj->gpuva.mutex);
> +	INIT_LIST_HEAD(&obj->gpuva.list);
> +}
> +
> +/**
> + * drm_gem_gpuva_lock - lock the GEM's gpuva list mutex
> + * @obj: the &drm_gem_object
> + *
> + * This locks the mutex protecting the &drm_gem_object's &drm_gpuva list.
> + */
> +static inline void drm_gem_gpuva_lock(struct drm_gem_object *obj)
> +{
> +	mutex_lock(&obj->gpuva.mutex);
> +}
> +
> +/**
> + * drm_gem_gpuva_unlock - unlock the GEM's gpuva list mutex
> + * @obj: the &drm_gem_object
> + *
> + * This unlocks the mutex protecting the &drm_gem_object's &drm_gpuva list,
> + * i.e. the same mutex that drm_gem_gpuva_lock() locks.
> + */
> +static inline void drm_gem_gpuva_unlock(struct drm_gem_object *obj)
> +{
> +	mutex_unlock(&obj->gpuva.mutex);
> +}
> +
> +/**
> + * drm_gem_for_each_gpuva - iterator to walk over a list of gpuvas
> + * @entry: &drm_gpuva structure to assign to in each iteration step
> + * @obj: the &drm_gem_object the &drm_gpuvas to walk are associated with
> + *
> + * This iterator walks over all &drm_gpuva structures associated with the
> + * &drm_gem_object.
> + */
> +#define drm_gem_for_each_gpuva(entry, obj) \
> +	list_for_each_entry(entry, &(obj)->gpuva.list, head)
> +
> +/**
> + * drm_gem_for_each_gpuva_safe - iterator to safely walk over a list of gpuvas
> + * @entry: &drm_gpuva structure to assign to in each iteration step
> + * @next: the next &drm_gpuva, used to store the next step
> + * @obj: the &drm_gem_object the &drm_gpuvas to walk are associated with
> + *
> + * This iterator walks over all &drm_gpuva structures associated with the
> + * &drm_gem_object. It is implemented with list_for_each_entry_safe(), hence
> + * it is safe against removal of elements.
> + */
> +#define drm_gem_for_each_gpuva_safe(entry, next, obj) \
> +	list_for_each_entry_safe(entry, next, &(obj)->gpuva.list, head)
> +
>   #endif /* __DRM_GEM_H__ */
> diff --git a/include/drm/drm_gpuva_mgr.h b/include/drm/drm_gpuva_mgr.h
> new file mode 100644
> index 000000000000..d245d01e37a9
> --- /dev/null
> +++ b/include/drm/drm_gpuva_mgr.h
> @@ -0,0 +1,714 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef __DRM_GPUVA_MGR_H__
> +#define __DRM_GPUVA_MGR_H__
> +
> +/*
> + * Copyright (c) 2022 Red Hat.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include <linux/maple_tree.h>
> +#include <linux/mm.h>
> +#include <linux/rbtree.h>
> +#include <linux/spinlock.h>
> +#include <linux/types.h>
> +
> +struct drm_gpuva_manager;
> +struct drm_gpuva_fn_ops;
> +
> +/**
> + * struct drm_gpuva_region - structure to track a portion of GPU VA space
> + *
> + * This structure represents a portion of a GPUs VA space and is associated
> + * with a &drm_gpuva_manager.
> + *
> + * GPU VA mappings, represented by &drm_gpuva objects, are restricted to be
> + * placed within a &drm_gpuva_region.
> + */
> +struct drm_gpuva_region {
> +	/**
> +	 * @mgr: the &drm_gpuva_manager this object is associated with
> +	 */
> +	struct drm_gpuva_manager *mgr;
> +
> +	/**
> +	 * @va: structure containing the address and range of the &drm_gpuva_region
> +	 */
> +	struct {
> +		/**
> +		 * @addr: the start address
> +		 */
> +		u64 addr;
> +
> +		/**
> +		 * @range: the range
> +		 */
> +		u64 range;
> +	} va;
> +
> +	/**
> +	 * @sparse: indicates whether this region is sparse
> +	 */
> +	bool sparse;
> +};
> +
> +int drm_gpuva_region_insert(struct drm_gpuva_manager *mgr,
> +			    struct drm_gpuva_region *reg);
> +void drm_gpuva_region_remove(struct drm_gpuva_region *reg);
> +
> +bool
> +drm_gpuva_region_empty(struct drm_gpuva_region *reg);
> +
> +struct drm_gpuva_region *
> +drm_gpuva_region_find(struct drm_gpuva_manager *mgr,
> +		      u64 addr, u64 range);
> +struct drm_gpuva_region *
> +drm_gpuva_region_find_first(struct drm_gpuva_manager *mgr,
> +			    u64 addr, u64 range);
> +
> +/**
> + * enum drm_gpuva_flags - flags for struct drm_gpuva
> + */
> +enum drm_gpuva_flags {
> +	/**
> +	 * @DRM_GPUVA_EVICTED:
> +	 *
> +	 * Flag indicating that the &drm_gpuva's backing GEM is evicted.
> +	 */
> +	DRM_GPUVA_EVICTED = (1 << 0),
> +
> +	/**
> +	 * @DRM_GPUVA_USERBITS: user defined bits
> +	 */
> +	DRM_GPUVA_USERBITS = (1 << 1),
> +};
> +
> +/**
> + * struct drm_gpuva - structure to track a GPU VA mapping
> + *
> + * This structure represents a GPU VA mapping and is associated with a
> + * &drm_gpuva_manager.
> + *
> + * Typically, this structure is embedded in bigger driver structures.
> + */
> +struct drm_gpuva {
> +	/**
> +	 * @mgr: the &drm_gpuva_manager this object is associated with
> +	 */
> +	struct drm_gpuva_manager *mgr;
> +
> +	/**
> +	 * @region: the &drm_gpuva_region the &drm_gpuva is mapped in
> +	 */
> +	struct drm_gpuva_region *region;
> +
> +	/**
> +	 * @head: the &list_head to attach this object to a &drm_gem_object
> +	 */
> +	struct list_head head;
> +
> +	/**
> +	 * @flags: the &drm_gpuva_flags for this mapping
> +	 */
> +	enum drm_gpuva_flags flags;
> +
> +	/**
> +	 * @va: structure containing the address and range of the &drm_gpuva
> +	 */
> +	struct {
> +		/**
> +		 * @addr: the start address
> +		 */
> +		u64 addr;
> +
> +		/**
> +		 * @range: the range
> +		 */
> +		u64 range;
> +	} va;
> +
> +	/**
> +	 * @gem: structure containing the &drm_gem_object and its offset
> +	 */
> +	struct {
> +		/**
> +		 * @offset: the offset within the &drm_gem_object
> +		 */
> +		u64 offset;
> +
> +		/**
> +		 * @obj: the mapped &drm_gem_object
> +		 */
> +		struct drm_gem_object *obj;
> +	} gem;
> +};
> +
> +void drm_gpuva_link(struct drm_gpuva *va);
> +void drm_gpuva_unlink(struct drm_gpuva *va);
> +
> +int drm_gpuva_insert(struct drm_gpuva_manager *mgr,
> +		     struct drm_gpuva *va);
> +void drm_gpuva_remove(struct drm_gpuva *va);
> +
> +struct drm_gpuva *drm_gpuva_find(struct drm_gpuva_manager *mgr,
> +				 u64 addr, u64 range);
> +struct drm_gpuva *drm_gpuva_find_first(struct drm_gpuva_manager *mgr,
> +				       u64 addr, u64 range);
> +struct drm_gpuva *drm_gpuva_find_prev(struct drm_gpuva_manager *mgr, u64 start);
> +struct drm_gpuva *drm_gpuva_find_next(struct drm_gpuva_manager *mgr, u64 end);
> +
> +/**
> + * drm_gpuva_evict - sets whether the backing GEM of this &drm_gpuva is evicted
> + * @va: the &drm_gpuva to set the evict flag for
> + * @evict: indicates whether the &drm_gpuva is evicted
> + */
> +static inline void drm_gpuva_evict(struct drm_gpuva *va, bool evict)
> +{
> +	/* Clear the flag when not evicted, set it otherwise. */
> +	if (!evict) {
> +		va->flags &= ~DRM_GPUVA_EVICTED;
> +		return;
> +	}
> +
> +	va->flags |= DRM_GPUVA_EVICTED;
> +}
> +
> +/**
> + * drm_gpuva_evicted - indicates whether the backing BO of this &drm_gpuva
> + * is evicted
> + * @va: the &drm_gpuva to check
> + */
> +static inline bool drm_gpuva_evicted(struct drm_gpuva *va)
> +{
> +	/* Collapse the masked flag word to a plain true/false. */
> +	return !!(va->flags & DRM_GPUVA_EVICTED);
> +}
> +
> +/**
> + * enum drm_gpuva_mgr_flags - the feature flags for the &drm_gpuva_manager
> + */
> +enum drm_gpuva_mgr_flags {
> +	/**
> +	 * @DRM_GPUVA_MANAGER_REGIONS:
> +	 *
> +	 * Enable the &drm_gpuva_manager to separately track &drm_gpuva_regions.
> +	 *
> +	 * &drm_gpuva_regions represent a reserved portion of VA space drivers
> +	 * can create mappings in. If regions are enabled, &drm_gpuvas can be
> +	 * created within an existing &drm_gpuva_region only and merge
> +	 * operations never indicate merging over region boundaries.
> +	 */
> +	DRM_GPUVA_MANAGER_REGIONS = (1 << 0),
> +};
> +
> +/**
> + * struct drm_gpuva_manager - DRM GPU VA Manager
> + *
> + * The DRM GPU VA Manager keeps track of a GPU's virtual address space by using
> + * &maple_tree structures. Typically, this structure is embedded in bigger
> + * driver structures.
> + *
> + * Drivers can pass addresses and ranges in an arbitrary unit, e.g. bytes or
> + * pages.
> + *
> + * There should be one manager instance per GPU virtual address space.
> + */
> +struct drm_gpuva_manager {
> +	/**
> +	 * @name: the name of the DRM GPU VA space
> +	 */
> +	const char *name;
> +
> +	/**
> +	 * @mm_start: start of the VA space
> +	 */
> +	u64 mm_start;
> +
> +	/**
> +	 * @mm_range: length of the VA space
> +	 */
> +	u64 mm_range;
> +
> +	/**
> +	 * @region_mt: the &maple_tree to track GPU VA regions
> +	 */
> +	struct maple_tree region_mt;
> +
> +	/**
> +	 * @va_mt: the &maple_tree to track GPU VA mappings
> +	 */
> +	struct maple_tree va_mt;
> +
> +	/**
> +	 * @kernel_alloc_region:
> +	 *
> +	 * &drm_gpuva_region representing the address space cutout reserved for
> +	 * the kernel
> +	 */
> +	struct drm_gpuva_region kernel_alloc_region;
> +
> +	/**
> +	 * @ops: &drm_gpuva_fn_ops providing the split/merge steps to drivers
> +	 */
> +	struct drm_gpuva_fn_ops *ops;
> +
> +	/**
> +	 * @flags: the feature flags of the &drm_gpuva_manager
> +	 */
> +	enum drm_gpuva_mgr_flags flags;
> +};
> +
> +void drm_gpuva_manager_init(struct drm_gpuva_manager *mgr,
> +			    const char *name,
> +			    u64 start_offset, u64 range,
> +			    u64 reserve_offset, u64 reserve_range,
> +			    struct drm_gpuva_fn_ops *ops,
> +			    enum drm_gpuva_mgr_flags flags);
> +void drm_gpuva_manager_destroy(struct drm_gpuva_manager *mgr);
> +
> +/**
> + * struct drm_gpuva_iterator - iterator for walking the internal (maple) tree
> + */
> +struct drm_gpuva_iterator {
> +	/**
> +	 * @mas: the maple tree iterator (maple advanced state)
> +	 */
> +	struct ma_state mas;
> +
> +	/**
> +	 * @mgr: the &drm_gpuva_manager to iterate
> +	 */
> +	struct drm_gpuva_manager *mgr;
> +
> +	union {
> +		/**
> +		 * @va: the current &drm_gpuva entry
> +		 */
> +		struct drm_gpuva *va;
> +
> +		/**
> +		 * @reg: the current &drm_gpuva_region entry
> +		 */
> +		struct drm_gpuva_region *reg;
> +
> +		/**
> +		 * @entry: the current entry
> +		 */
> +		void *entry;
> +	};
> +};
> +
> +void drm_gpuva_iter_remove(struct drm_gpuva_iterator *it);
> +
> +/**
> + * DRM_GPUVA_ITER - create an iterator structure to iterate the &drm_gpuva tree
> + * @name: the name of the &drm_gpuva_iterator to create
> + * @mgr__: the &drm_gpuva_manager to iterate
> + */
> +#define DRM_GPUVA_ITER(name, mgr__)				\
> +	struct drm_gpuva_iterator name = {			\
> +		.mas = __MA_STATE(&(mgr__)->va_mt, 0, 0),	\
> +		.mgr = mgr__,					\
> +		.va = NULL,					\
> +	}
> +
> +/**
> + * DRM_GPUVA_REGION_ITER - create an iterator structure to iterate the
> + * &drm_gpuva_region tree
> + * @name: the name of the &drm_gpuva_iterator to create
> + * @mgr__: the &drm_gpuva_manager to iterate
> + */
> +#define DRM_GPUVA_REGION_ITER(name, mgr__)			\
> +	struct drm_gpuva_iterator name = {			\
> +		.mas = __MA_STATE(&(mgr__)->region_mt, 0, 0),	\
> +		.mgr = mgr__,					\
> +		.reg = NULL,					\
> +	}
> +
> +/**
> + * drm_gpuva_iter_for_each_range - iterator to walk over a range of entries
> + * @it__: &drm_gpuva_iterator structure to assign to in each iteration step
> + * @start__: starting offset, the first entry will overlap this
> + * @end__: ending offset, the last entry will start before this (but may overlap)
> + *
> + * This function can be used to iterate both &drm_gpuva objects and
> + * &drm_gpuva_region objects.
> + *
> + * It is safe against the removal of elements using &drm_gpuva_iter_remove,
> + * however it is not safe against the removal of elements using
> + * &drm_gpuva_remove and &drm_gpuva_region_remove.
> + *
> + * Note that @end__ is evaluated on every iteration step, hence it should be
> + * free of side effects.
> + */
> +#define drm_gpuva_iter_for_each_range(it__, start__, end__) \
> +	for ((it__).mas.index = (start__), (it__).entry = mas_find(&(it__).mas, (end__) - 1); \
> +	     (it__).entry; (it__).entry = mas_find(&(it__).mas, (end__) - 1))
> +
> +/**
> + * drm_gpuva_iter_for_each - iterator to walk over all existing entries
> + * @it__: &drm_gpuva_iterator structure to assign to in each iteration step
> + *
> + * This function can be used to iterate both &drm_gpuva objects and
> + * &drm_gpuva_region objects.
> + *
> + * It is safe against the removal of elements using &drm_gpuva_iter_remove,
> + * however it is not safe against the removal of elements using
> + * &drm_gpuva_remove and &drm_gpuva_region_remove.
> + */
> +#define drm_gpuva_iter_for_each(it__) \
> +	drm_gpuva_iter_for_each_range(it__, (it__).mgr->mm_start, \
> +				      (it__).mgr->mm_start + (it__).mgr->mm_range)
> +
> +/**
> + * enum drm_gpuva_op_type - GPU VA operation type
> + *
> + * Operations to alter the GPU VA mappings tracked by the &drm_gpuva_manager.
> + */
> +enum drm_gpuva_op_type {
> +	/**
> +	 * @DRM_GPUVA_OP_MAP: the map op type
> +	 */
> +	DRM_GPUVA_OP_MAP,
> +
> +	/**
> +	 * @DRM_GPUVA_OP_REMAP: the remap op type
> +	 */
> +	DRM_GPUVA_OP_REMAP,
> +
> +	/**
> +	 * @DRM_GPUVA_OP_UNMAP: the unmap op type
> +	 */
> +	DRM_GPUVA_OP_UNMAP,
> +
> +	/**
> +	 * @DRM_GPUVA_OP_PREFETCH: the prefetch op type
> +	 */
> +	DRM_GPUVA_OP_PREFETCH,
> +};
> +
> +/**
> + * struct drm_gpuva_op_map - GPU VA map operation
> + *
> + * This structure represents a single map operation generated by the
> + * DRM GPU VA manager.
> + */
> +struct drm_gpuva_op_map {
> +	/**
> +	 * @va: structure containing address and range of a map
> +	 * operation
> +	 */
> +	struct {
> +		/**
> +		 * @addr: the base address of the new mapping
> +		 */
> +		u64 addr;
> +
> +		/**
> +		 * @range: the range of the new mapping
> +		 */
> +		u64 range;
> +	} va;
> +
> +	/**
> +	 * @gem: structure containing the &drm_gem_object and its offset
> +	 */
> +	struct {
> +		/**
> +		 * @offset: the offset within the &drm_gem_object
> +		 */
> +		u64 offset;
> +
> +		/**
> +		 * @obj: the &drm_gem_object to map
> +		 */
> +		struct drm_gem_object *obj;
> +	} gem;
> +};
> +
> +/**
> + * struct drm_gpuva_op_unmap - GPU VA unmap operation
> + *
> + * This structure represents a single unmap operation generated by the
> + * DRM GPU VA manager.
> + */
> +struct drm_gpuva_op_unmap {
> +	/**
> +	 * @va: the &drm_gpuva to unmap
> +	 */
> +	struct drm_gpuva *va;
> +
> +	/**
> +	 * @keep:
> +	 *
> +	 * Indicates whether this &drm_gpuva is physically contiguous with the
> +	 * original mapping request.
> +	 *
> +	 * Optionally, if &keep is set, drivers may keep the actual page table
> +	 * mappings for this &drm_gpuva, adding the missing page table entries
> +	 * only and update the &drm_gpuva_manager accordingly.
> +	 */
> +	bool keep;
> +};
> +
> +/**
> + * struct drm_gpuva_op_remap - GPU VA remap operation
> + *
> + * This represents a single remap operation generated by the DRM GPU VA manager.
> + *
> + * A remap operation is generated when an existing GPU VA mapping is split up
> + * by inserting a new GPU VA mapping or by partially unmapping existent
> + * mapping(s), hence it consists of a maximum of two map and one unmap
> + * operation.
> + *
> + * The @unmap operation takes care of removing the original existing mapping.
> + * @prev is used to remap the preceding part, @next the subsequent part.
> + *
> + * If either a new mapping's start address is aligned with the start address
> + * of the old mapping or the new mapping's end address is aligned with the
> + * end address of the old mapping, either @prev or @next is NULL.
> + *
> + * Note, the reason for a dedicated remap operation, rather than arbitrary
> + * unmap and map operations, is to give drivers the chance of extracting driver
> + * specific data for creating the new mappings from the unmap operation's
> + * &drm_gpuva structure which typically is embedded in larger driver specific
> + * structures.
> + */
> +struct drm_gpuva_op_remap {
> +	/**
> +	 * @prev: the preceding part of a split mapping
> +	 */
> +	struct drm_gpuva_op_map *prev;
> +
> +	/**
> +	 * @next: the subsequent part of a split mapping
> +	 */
> +	struct drm_gpuva_op_map *next;
> +
> +	/**
> +	 * @unmap: the unmap operation for the original existing mapping
> +	 */
> +	struct drm_gpuva_op_unmap *unmap;
> +};
> +
> +/**
> + * struct drm_gpuva_op_prefetch - GPU VA prefetch operation
> + *
> + * This structure represents a single prefetch operation generated by the
> + * DRM GPU VA manager.
> + */
> +struct drm_gpuva_op_prefetch {
> +	/**
> +	 * @va: the &drm_gpuva to prefetch
> +	 */
> +	struct drm_gpuva *va;
> +};
> +
> +/**
> + * struct drm_gpuva_op - GPU VA operation
> + *
> + * This structure represents a single generic operation.
> + *
> + * The particular type of the operation is defined by @op.
> + */
> +struct drm_gpuva_op {
> +	/**
> +	 * @entry:
> +	 *
> +	 * The &list_head used to distribute instances of this struct within
> +	 * &drm_gpuva_ops.
> +	 */
> +	struct list_head entry;
> +
> +	/**
> +	 * @op: the type of the operation
> +	 */
> +	enum drm_gpuva_op_type op;
> +
> +	union {
> +		/**
> +		 * @map: the map operation
> +		 */
> +		struct drm_gpuva_op_map map;
> +
> +		/**
> +		 * @remap: the remap operation
> +		 */
> +		struct drm_gpuva_op_remap remap;
> +
> +		/**
> +		 * @unmap: the unmap operation
> +		 */
> +		struct drm_gpuva_op_unmap unmap;
> +
> +		/**
> +		 * @prefetch: the prefetch operation
> +		 */
> +		struct drm_gpuva_op_prefetch prefetch;
> +	};
> +};
> +
> +/**
> + * struct drm_gpuva_ops - wraps a list of &drm_gpuva_op
> + */
> +struct drm_gpuva_ops {
> +	/**
> +	 * @list: the &list_head
> +	 */
> +	struct list_head list;
> +};
> +
> +/**
> + * drm_gpuva_for_each_op - iterator to walk over &drm_gpuva_ops
> + * @op: &drm_gpuva_op to assign in each iteration step
> + * @ops: &drm_gpuva_ops to walk
> + *
> + * This iterator walks over all ops within a given list of operations.
> + */
> +#define drm_gpuva_for_each_op(op, ops) list_for_each_entry(op, &(ops)->list, entry)
> +
> +/**
> + * drm_gpuva_for_each_op_safe - iterator to safely walk over &drm_gpuva_ops
> + * @op: &drm_gpuva_op to assign in each iteration step
> + * @next: &next &drm_gpuva_op to store the next step
> + * @ops: &drm_gpuva_ops to walk
> + *
> + * This iterator walks over all ops within a given list of operations. It is
> + * implemented with list_for_each_entry_safe(), so it is safe against removal
> + * of elements.
> + */
> +#define drm_gpuva_for_each_op_safe(op, next, ops) \
> +	list_for_each_entry_safe(op, next, &(ops)->list, entry)
> +
> +/**
> + * drm_gpuva_for_each_op_from_reverse - iterate backwards from the given point
> + * @op: &drm_gpuva_op to assign in each iteration step
> + * @ops: &drm_gpuva_ops to walk
> + *
> + * This iterator walks over all ops within a given list of operations beginning
> + * from the given operation in reverse order.
> + */
> +#define drm_gpuva_for_each_op_from_reverse(op, ops) \
> +	list_for_each_entry_from_reverse(op, &(ops)->list, entry)
> +
> +/**
> + * drm_gpuva_first_op - returns the first &drm_gpuva_op from &drm_gpuva_ops
> + * @ops: the &drm_gpuva_ops to get the first &drm_gpuva_op from
> + */
> +#define drm_gpuva_first_op(ops) \
> +	list_first_entry(&(ops)->list, struct drm_gpuva_op, entry)
> +
> +/**
> + * drm_gpuva_last_op - returns the last &drm_gpuva_op from &drm_gpuva_ops
> + * @ops: the &drm_gpuva_ops to get the last &drm_gpuva_op from
> + */
> +#define drm_gpuva_last_op(ops) \
> +	list_last_entry(&(ops)->list, struct drm_gpuva_op, entry)
> +
> +/**
> + * drm_gpuva_prev_op - previous &drm_gpuva_op in the list
> + * @op: the current &drm_gpuva_op
> + */
> +#define drm_gpuva_prev_op(op) list_prev_entry(op, entry)
> +
> +/**
> + * drm_gpuva_next_op - next &drm_gpuva_op in the list
> + * @op: the current &drm_gpuva_op
> + */
> +#define drm_gpuva_next_op(op) list_next_entry(op, entry)
> +
> +struct drm_gpuva_ops *
> +drm_gpuva_sm_map_ops_create(struct drm_gpuva_manager *mgr,
> +			    u64 addr, u64 range,
> +			    struct drm_gem_object *obj, u64 offset);
> +struct drm_gpuva_ops *
> +drm_gpuva_sm_unmap_ops_create(struct drm_gpuva_manager *mgr,
> +			      u64 addr, u64 range);
> +
> +struct drm_gpuva_ops *
> +drm_gpuva_prefetch_ops_create(struct drm_gpuva_manager *mgr,
> +				 u64 addr, u64 range);
> +
> +struct drm_gpuva_ops *
> +drm_gpuva_gem_unmap_ops_create(struct drm_gpuva_manager *mgr,
> +			       struct drm_gem_object *obj);
> +
> +void drm_gpuva_ops_free(struct drm_gpuva_manager *mgr,
> +			struct drm_gpuva_ops *ops);
> +
> +/**
> + * struct drm_gpuva_fn_ops - callbacks for split/merge steps
> + *
> + * This structure defines the callbacks used by &drm_gpuva_sm_map and
> + * &drm_gpuva_sm_unmap to provide the split/merge steps for map and unmap
> + * operations to drivers.
> + */
> +struct drm_gpuva_fn_ops {
> +	/**
> +	 * @op_alloc: called when the &drm_gpuva_manager allocates
> +	 * a struct drm_gpuva_op
> +	 *
> +	 * Some drivers may want to embed struct drm_gpuva_op into driver
> +	 * specific structures. By implementing this callback drivers can
> +	 * allocate memory accordingly.
> +	 *
> +	 * This callback is optional.
> +	 */
> +	struct drm_gpuva_op *(*op_alloc)(void);
> +
> +	/**
> +	 * @op_free: called when the &drm_gpuva_manager frees a
> +	 * struct drm_gpuva_op
> +	 *
> +	 * Some drivers may want to embed struct drm_gpuva_op into driver
> +	 * specific structures. By implementing this callback drivers can
> +	 * free the previously allocated memory accordingly.
> +	 *
> +	 * This callback is optional.
> +	 */
> +	void (*op_free)(struct drm_gpuva_op *op);
> +
> +	/**
> +	 * @sm_map_step: called from &drm_gpuva_sm_map providing the split and
> +	 * merge steps
> +	 *
> +	 * This callback provides a single split / merge step or, if no split
> +	 * and merge is indicated, the original map operation.
> +	 *
> +	 * The &priv pointer is equal to the one drivers pass to
> +	 * &drm_gpuva_sm_map.
> +	 */
> +	int (*sm_map_step)(struct drm_gpuva_op *op, void *priv);
> +
> +	/**
> +	 * @sm_unmap_step: called from &drm_gpuva_sm_unmap providing the split and
> +	 * merge steps
> +	 *
> +	 * This callback provides a single split step or, if no split is
> +	 * indicated, the plain unmap operations of the corresponding unmap
> +	 * range originally passed to &drm_gpuva_sm_unmap.
> +	 *
> +	 * The &priv pointer is equal to the one drivers pass to
> +	 * &drm_gpuva_sm_unmap.
> +	 */
> +	int (*sm_unmap_step)(struct drm_gpuva_op *op, void *priv);
> +};
> +
> +int drm_gpuva_sm_map(struct drm_gpuva_manager *mgr, void *priv,
> +		     u64 addr, u64 range,
> +		     struct drm_gem_object *obj, u64 offset);
> +
> +int drm_gpuva_sm_unmap(struct drm_gpuva_manager *mgr, void *priv,
> +		       u64 addr, u64 range);
> +
> +#endif /* __DRM_GPUVA_MGR_H__ */
  
Danilo Krummrich Feb. 22, 2023, 3:07 p.m. UTC | #4
On 2/22/23 11:25, Christian König wrote:
> Am 17.02.23 um 14:44 schrieb Danilo Krummrich:

<snip>

>> +/**
>> + * DOC: Overview
>> + *
>> + * The DRM GPU VA Manager, represented by struct drm_gpuva_manager 
>> keeps track
>> + * of a GPU's virtual address (VA) space and manages the 
>> corresponding virtual
>> + * mappings represented by &drm_gpuva objects. It also keeps track of 
>> the
>> + * mapping's backing &drm_gem_object buffers.
>> + *
>> + * &drm_gem_object buffers maintain a list (and a corresponding list 
>> lock) of
>> + * &drm_gpuva objects representing all existent GPU VA mappings using 
>> this
>> + * &drm_gem_object as backing buffer.
>> + *
>> + * If the &DRM_GPUVA_MANAGER_REGIONS feature is enabled, a GPU VA 
>> mapping can
>> + * only be created within a previously allocated &drm_gpuva_region, 
>> which
>> + * represents a reserved portion of the GPU VA space. GPU VA mappings 
>> are not
>> + * allowed to span over a &drm_gpuva_region's boundary.
>> + *
>> + * GPU VA regions can also be flagged as sparse, which allows drivers 
>> to create
>> + * sparse mappings for a whole GPU VA region in order to support Vulkan
>> + * 'Sparse Resources'.
> 
> Well since we have now found that there is absolutely no technical 
> reason for having those regions could we please drop them?

I disagree this was the outcome of our previous discussion.

In nouveau I still need them to track the separate sparse page tables 
and, as you confirmed previously, Nvidia cards are not the only cards 
supporting this feature.

The second reason is that with regions we can avoid merging between 
buffers, which saves some effort. However, I agree that this argument by 
itself probably doesn't hold too much, since you've pointed out in a 
previous mail that:

<cite>
1) If we merge and decide to only do that inside certain boundaries then 
those boundaries needs to be provided and checked against. This burns 
quite some CPU cycles

2) If we just merge what we can we might have extra page table updates 
which cost time and could result in undesired side effects.

3) If we don't merge at all we have additional housekeeping for the 
mappings and maybe hw restrictions.
</cite>

However, if a driver uses regions to track its separate sparse page 
tables anyway it gets 1) for free, which is a nice synergy.

I totally agree that regions aren't for everyone though. Hence, I made 
them an optional feature and by default regions are disabled. In order 
to use them drm_gpuva_manager_init() must be called with the 
DRM_GPUVA_MANAGER_REGIONS feature flag.

I really would not want to open code regions or have two GPUVA manager 
instances in nouveau to track sparse page tables. That would be really 
messy, hence I hope we can agree on this to be an optional feature.

> 
> I don't really see a need for them any more.
> 
> Regards,
> Christian.
>
  
Christian König Feb. 22, 2023, 3:14 p.m. UTC | #5
Am 22.02.23 um 16:07 schrieb Danilo Krummrich:
> On 2/22/23 11:25, Christian König wrote:
>> Am 17.02.23 um 14:44 schrieb Danilo Krummrich:
>
> <snip>
>
>>> +/**
>>> + * DOC: Overview
>>> + *
>>> + * The DRM GPU VA Manager, represented by struct drm_gpuva_manager 
>>> keeps track
>>> + * of a GPU's virtual address (VA) space and manages the 
>>> corresponding virtual
>>> + * mappings represented by &drm_gpuva objects. It also keeps track 
>>> of the
>>> + * mapping's backing &drm_gem_object buffers.
>>> + *
>>> + * &drm_gem_object buffers maintain a list (and a corresponding 
>>> list lock) of
>>> + * &drm_gpuva objects representing all existent GPU VA mappings 
>>> using this
>>> + * &drm_gem_object as backing buffer.
>>> + *
>>> + * If the &DRM_GPUVA_MANAGER_REGIONS feature is enabled, a GPU VA 
>>> mapping can
>>> + * only be created within a previously allocated &drm_gpuva_region, 
>>> which
>>> + * represents a reserved portion of the GPU VA space. GPU VA 
>>> mappings are not
>>> + * allowed to span over a &drm_gpuva_region's boundary.
>>> + *
>>> + * GPU VA regions can also be flagged as sparse, which allows 
>>> drivers to create
>>> + * sparse mappings for a whole GPU VA region in order to support 
>>> Vulkan
>>> + * 'Sparse Resources'.
>>
>> Well since we have now found that there is absolutely no technical 
>> reason for having those regions could we please drop them?
>
> I disagree this was the outcome of our previous discussion.
>
> In nouveau I still need them to track the separate sparse page tables 
> and, as you confirmed previously, Nvidia cards are not the only cards 
> supporting this feature.
>
> The second reason is that with regions we can avoid merging between 
> buffers, which saves some effort. However, I agree that this argument 
> by itself probably doesn't hold too much, since you've pointed out in 
> a previous mail that:
>
> <cite>
> 1) If we merge and decide to only do that inside certain boundaries 
> then those boundaries needs to be provided and checked against. This 
> burns quite some CPU cycles
>
> 2) If we just merge what we can we might have extra page table updates 
> which cost time and could result in undesired side effects.
>
> 3) If we don't merge at all we have additional housekeeping for the 
> mappings and maybe hw restrictions.
> </cite>
>
> However, if a driver uses regions to track its separate sparse page 
> tables anyway it gets 1) for free, which is a nice synergy.
>
> I totally agree that regions aren't for everyone though. Hence, I made 
> them an optional feature and by default regions are disabled. In order 
> to use them drm_gpuva_manager_init() must be called with the 
> DRM_GPUVA_MANAGER_REGIONS feature flag.
>
> I really would not want to open code regions or have two GPUVA manager 
> instances in nouveau to track sparse page tables. That would be really 
> messy, hence I hope we can agree on this to be an optional feature.

I absolutely don't think that this is a good idea then. This separate 
handling of sparse page tables is completely Nouveau specific.

Even when it's an optional feature, mixing this into the common handling is 
exactly what I pointed out as not properly separating between hardware 
specific and hardware agnostic functionality.

This is exactly the problem we ran into with TTM as well and I've spent 
a massive amount of time to clean that up again.

Regards,
Christian.

>
>>
>> I don't really see a need for them any more.
>>
>> Regards,
>> Christian.
>>
>
  
Danilo Krummrich Feb. 22, 2023, 4:40 p.m. UTC | #6
On 2/22/23 16:14, Christian König wrote:
> Am 22.02.23 um 16:07 schrieb Danilo Krummrich:
>> On 2/22/23 11:25, Christian König wrote:
>>> Am 17.02.23 um 14:44 schrieb Danilo Krummrich:
>>
>> <snip>
>>
>>>> +/**
>>>> + * DOC: Overview
>>>> + *
>>>> + * The DRM GPU VA Manager, represented by struct drm_gpuva_manager 
>>>> keeps track
>>>> + * of a GPU's virtual address (VA) space and manages the 
>>>> corresponding virtual
>>>> + * mappings represented by &drm_gpuva objects. It also keeps track 
>>>> of the
>>>> + * mapping's backing &drm_gem_object buffers.
>>>> + *
>>>> + * &drm_gem_object buffers maintain a list (and a corresponding 
>>>> list lock) of
>>>> + * &drm_gpuva objects representing all existent GPU VA mappings 
>>>> using this
>>>> + * &drm_gem_object as backing buffer.
>>>> + *
>>>> + * If the &DRM_GPUVA_MANAGER_REGIONS feature is enabled, a GPU VA 
>>>> mapping can
>>>> + * only be created within a previously allocated &drm_gpuva_region, 
>>>> which
>>>> + * represents a reserved portion of the GPU VA space. GPU VA 
>>>> mappings are not
>>>> + * allowed to span over a &drm_gpuva_region's boundary.
>>>> + *
>>>> + * GPU VA regions can also be flagged as sparse, which allows 
>>>> drivers to create
>>>> + * sparse mappings for a whole GPU VA region in order to support 
>>>> Vulkan
>>>> + * 'Sparse Resources'.
>>>
>>> Well since we have now found that there is absolutely no technical 
>>> reason for having those regions could we please drop them?
>>
>> I disagree this was the outcome of our previous discussion.
>>
>> In nouveau I still need them to track the separate sparse page tables 
>> and, as you confirmed previously, Nvidia cards are not the only cards 
>> supporting this feature.
>>
>> The second reason is that with regions we can avoid merging between 
>> buffers, which saves some effort. However, I agree that this argument 
>> by itself probably doesn't hold too much, since you've pointed out in 
>> a previous mail that:
>>
>> <cite>
>> 1) If we merge and decide to only do that inside certain boundaries 
>> then those boundaries needs to be provided and checked against. This 
>> burns quite some CPU cycles
>>
>> 2) If we just merge what we can we might have extra page table updates 
>> which cost time and could result in undesired side effects.
>>
>> 3) If we don't merge at all we have additional housekeeping for the 
>> mappings and maybe hw restrictions.
>> </cite>
>>
>> However, if a driver uses regions to track its separate sparse page 
>> tables anyway it gets 1) for free, which is a nice synergy.
>>
>> I totally agree that regions aren't for everyone though. Hence, I made 
>> them an optional feature and by default regions are disabled. In order 
>> to use them drm_gpuva_manager_init() must be called with the 
>> DRM_GPUVA_MANAGER_REGIONS feature flag.
>>
>> I really would not want to open code regions or have two GPUVA manager 
>> instances in nouveau to track sparse page tables. That would be really 
>> messy, hence I hope we can agree on this to be an optional feature.
> 
> I absolutely don't think that this is a good idea then. This separate 
> handling of sparse page tables is completely Nouveau specific.

Actually, I rely on what you said in a previous mail when I say it's, 
potentially, not specific to nouveau.

<cite>
This sounds similar to what AMD hw used to have up until gfx8 (I think), 
basically sparse resources where defined through a separate mechanism to 
the address resolution of the page tables. I won't rule out that other 
hardware has similar approaches.
</cite>

> 
> Even when it's optional feature mixing this into the common handling is 
> exactly what I pointed out as not properly separating between hardware 
> specific and hardware agnostic functionality.

Optionally having regions is *not* a hardware specific concept, drivers 
might use it for a hardware specific purpose though. Which potentially 
is the case for almost every DRM helper.

Drivers can use regions only for the sake of not merging between buffer 
boundaries as well. Some drivers might prefer this over "never merge" or 
"always merge", depending on the cost of re-organizing page tables for 
unnecessary splits/merges, without having the need of tracking separate 
sparse page tables.

It's just that I think *if* a driver needs to track separate sparse page 
tables anyways it's a nice synergy since then there is no extra cost for 
getting this optimization.

> 
> This is exactly the problem we ran into with TTM as well and I've spend 
> a massive amount of time to clean that up again. >

Admittedly, I don't know what problems you are referring to. However, I 
don't see which kind of trouble it could cause by allowing drivers to 
track regions optionally.

> Regards,
> Christian.
> 
>>
>>>
>>> I don't really see a need for them any more.
>>>
>>> Regards,
>>> Christian.
>>>
>>
>
  
Danilo Krummrich Feb. 22, 2023, 6:13 p.m. UTC | #7
On 2/21/23 19:20, Liam R. Howlett wrote:
> * Danilo Krummrich <dakr@redhat.com> [230217 08:45]:
>> Add infrastructure to keep track of GPU virtual address (VA) mappings
>> with a decicated VA space manager implementation.
>>
>> New UAPIs, motivated by Vulkan sparse memory bindings graphics drivers
>> start implementing, allow userspace applications to request multiple and
>> arbitrary GPU VA mappings of buffer objects. The DRM GPU VA manager is
>> intended to serve the following purposes in this context.
>>
>> 1) Provide infrastructure to track GPU VA allocations and mappings,
>>     making use of the maple_tree.
>>
>> 2) Generically connect GPU VA mappings to their backing buffers, in
>>     particular DRM GEM objects.
>>
>> 3) Provide a common implementation to perform more complex mapping
>>     operations on the GPU VA space. In particular splitting and merging
>>     of GPU VA mappings, e.g. for intersecting mapping requests or partial
>>     unmap requests.
>>
>> Suggested-by: Dave Airlie <airlied@redhat.com>
>> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
>> ---
>>   Documentation/gpu/drm-mm.rst    |   31 +
>>   drivers/gpu/drm/Makefile        |    1 +
>>   drivers/gpu/drm/drm_gem.c       |    3 +
>>   drivers/gpu/drm/drm_gpuva_mgr.c | 1704 +++++++++++++++++++++++++++++++
>>   include/drm/drm_drv.h           |    6 +
>>   include/drm/drm_gem.h           |   75 ++
>>   include/drm/drm_gpuva_mgr.h     |  714 +++++++++++++
>>   7 files changed, 2534 insertions(+)
>>   create mode 100644 drivers/gpu/drm/drm_gpuva_mgr.c
>>   create mode 100644 include/drm/drm_gpuva_mgr.h
>>
>> diff --git a/Documentation/gpu/drm-mm.rst b/Documentation/gpu/drm-mm.rst
>> index a52e6f4117d6..c9f120cfe730 100644
>> --- a/Documentation/gpu/drm-mm.rst
>> +++ b/Documentation/gpu/drm-mm.rst
>> @@ -466,6 +466,37 @@ DRM MM Range Allocator Function References
>>   .. kernel-doc:: drivers/gpu/drm/drm_mm.c
>>      :export:
>>   
> ...
> 
>> +
>> +/**
>> + * drm_gpuva_iter_remove - removes the iterator's current element
>> + * @it: the &drm_gpuva_iterator
>> + *
>> + * This removes the element the iterator currently points to.
>> + */
>> +void
>> +drm_gpuva_iter_remove(struct drm_gpuva_iterator *it)
>> +{
>> +	mas_erase(&it->mas);
>> +}
>> +EXPORT_SYMBOL(drm_gpuva_iter_remove);
>> +
>> +/**
>> + * drm_gpuva_insert - insert a &drm_gpuva
>> + * @mgr: the &drm_gpuva_manager to insert the &drm_gpuva in
>> + * @va: the &drm_gpuva to insert
>> + * @addr: the start address of the GPU VA
>> + * @range: the range of the GPU VA
>> + *
>> + * Insert a &drm_gpuva with a given address and range into a
>> + * &drm_gpuva_manager.
>> + *
>> + * Returns: 0 on success, negative error code on failure.
>> + */
>> +int
>> +drm_gpuva_insert(struct drm_gpuva_manager *mgr,
>> +		 struct drm_gpuva *va)
>> +{
>> +	u64 addr = va->va.addr;
>> +	u64 range = va->va.range;
>> +	MA_STATE(mas, &mgr->va_mt, addr, addr + range - 1);
>> +	struct drm_gpuva_region *reg = NULL;
>> +	int ret;
>> +
>> +	if (unlikely(!drm_gpuva_in_mm_range(mgr, addr, range)))
>> +		return -EINVAL;
>> +
>> +	if (unlikely(drm_gpuva_in_kernel_region(mgr, addr, range)))
>> +		return -EINVAL;
>> +
>> +	if (mgr->flags & DRM_GPUVA_MANAGER_REGIONS) {
>> +		reg = drm_gpuva_in_region(mgr, addr, range);
>> +		if (unlikely(!reg))
>> +			return -EINVAL;
>> +	}
>> +
> 
> -----
> 
>> +	if (unlikely(drm_gpuva_find_first(mgr, addr, range)))
>> +		return -EEXIST;
>> +
>> +	ret = mas_store_gfp(&mas, va, GFP_KERNEL);
> 
> mas_walk() will set the internal maple state to the limits to what it
> finds.  So, instead of an iterator, you can use the walk function and
> ensure there is a large enough area in the existing NULL:
> 
> /*
>   * Nothing at addr, mas now points to the location where the store would
>   * happen
>   */
> if (mas_walk(&mas))
> 	return -EEXIST;
> 
> /* The NULL entry ends at mas.last, make sure there is room */
> if (mas.last < (addr + range - 1))
> 	return -EEXIST;
> 
> /* Limit the store size to the correct end address, and store */
>   mas.last = addr + range - 1;
>   ret = mas_store_gfp(&mas, va, GFP_KERNEL);
> 

Would this variant be significantly more efficient?

Also, would this also work while already walking the tree?

To remove an entry while walking the tree I have a separate function 
drm_gpuva_iter_remove(). Would I need something similar for inserting 
entries?

I already provided this example in a separate mail thread, but it may 
make sense to move this to the mailing list:

In __drm_gpuva_sm_map() we're iterating a given range of the tree, where 
the given range is the size of the newly requested mapping. 
__drm_gpuva_sm_map() invokes a callback for each sub-operation that 
needs to be taken in order to fulfill this mapping request. In most 
cases such a callback just creates a drm_gpuva_op object and stores it 
in a list.

However, drivers can also implement the callback, such that they 
directly execute this operation within the callback.

Let's have a look at the following example:

      0     a     2
old: |-----------|       (bo_offset=n)

            1     b     3
req:       |-----------| (bo_offset=m)

      0  a' 1     b     3
new: |-----|-----------| (a.bo_offset=n,b.bo_offset=m)

This would result in the following operations.

__drm_gpuva_sm_map() finds entry "a" and calls back into the driver 
suggesting to re-map "a" with the new size. The driver removes entry "a" 
from the tree and adds "a'"

__drm_gpuva_sm_map(), ideally, continues the loop searching for nodes 
starting from the end of "a" (which is 2) till the end of the requested 
mapping "b" (which is 3). Since it doesn't find any other mapping within 
this range it calls back into the driver suggesting to finally map "b".

If there would have been another mapping between 2 and 3 it would have 
called back into the driver asking to unmap this mapping beforehand.

So, it boils down to re-mapping as described at the beginning (and 
analogously at the end) of a new mapping range and removing of entries 
that are enclosed by the new mapping range.

>> +	if (unlikely(ret))
>> +		return ret;
>> +
>> +	va->mgr = mgr;
>> +	va->region = reg;
>> +
>> +	return 0;
>> +}
>> +EXPORT_SYMBOL(drm_gpuva_insert);
>> +
>> +/**
>> + * drm_gpuva_remove - remove a &drm_gpuva
>> + * @va: the &drm_gpuva to remove
>> + *
>> + * This removes the given &va from the underlaying tree.
>> + */
>> +void
>> +drm_gpuva_remove(struct drm_gpuva *va)
>> +{
>> +	MA_STATE(mas, &va->mgr->va_mt, va->va.addr, 0);
>> +
>> +	mas_erase(&mas);
>> +}
>> +EXPORT_SYMBOL(drm_gpuva_remove);
>> +
> ...
> 
>> +/**
>> + * drm_gpuva_find_first - find the first &drm_gpuva in the given range
>> + * @mgr: the &drm_gpuva_manager to search in
>> + * @addr: the &drm_gpuvas address
>> + * @range: the &drm_gpuvas range
>> + *
>> + * Returns: the first &drm_gpuva within the given range
>> + */
>> +struct drm_gpuva *
>> +drm_gpuva_find_first(struct drm_gpuva_manager *mgr,
>> +		     u64 addr, u64 range)
>> +{
>> +	MA_STATE(mas, &mgr->va_mt, addr, 0);
>> +
>> +	return mas_find(&mas, addr + range - 1);
>> +}
>> +EXPORT_SYMBOL(drm_gpuva_find_first);
>> +
>> +/**
>> + * drm_gpuva_find - find a &drm_gpuva
>> + * @mgr: the &drm_gpuva_manager to search in
>> + * @addr: the &drm_gpuvas address
>> + * @range: the &drm_gpuvas range
>> + *
>> + * Returns: the &drm_gpuva at a given &addr and with a given &range
> 
> Note that mas_find() will continue upwards in the address space if there
> isn't anything at @addr.  This means that &drm_gpuva may not be at
> &addr.  If you want to check just at &addr, use mas_walk().

Good catch. drm_gpuva_find() should then either also check for 
'va->va.addr == addr' as well or, alternatively, use mas_walk(). As 
above, any reason to prefer mas_walk()?

> 
>> + */
>> +struct drm_gpuva *
>> +drm_gpuva_find(struct drm_gpuva_manager *mgr,
>> +	       u64 addr, u64 range)
>> +{
>> +	struct drm_gpuva *va;
>> +
>> +	va = drm_gpuva_find_first(mgr, addr, range);
>> +	if (!va)
>> +		goto out;
>> +
>> +	if (va->va.range != range)
>> +		goto out;
>> +
>> +	return va;
>> +
>> +out:
>> +	return NULL;
>> +}
>> +EXPORT_SYMBOL(drm_gpuva_find);
>> +
>> +/**
>> + * drm_gpuva_find_prev - find the &drm_gpuva before the given address
>> + * @mgr: the &drm_gpuva_manager to search in
>> + * @start: the given GPU VA's start address
>> + *
>> + * Find the adjacent &drm_gpuva before the GPU VA with given &start address.
>> + *
>> + * Note that if there is any free space between the GPU VA mappings no mapping
>> + * is returned.
>> + *
>> + * Returns: a pointer to the found &drm_gpuva or NULL if none was found
>> + */
>> +struct drm_gpuva *
>> +drm_gpuva_find_prev(struct drm_gpuva_manager *mgr, u64 start)
> 
> find_prev() usually continues beyond 1 less than the address. I found
> this name confusing. 

Don't really get that, mind explaining?

> You may as well use mas_walk(), it would be faster.

How would I use mas_walk() for that? If I understand it correctly, 
mas_walk() requires me to know that start address, which I don't know 
for the previous entry.

However, mas_walk() seems to be a good alternative to use for 
drm_gpuva_find_next().

>> +{
>> +	MA_STATE(mas, &mgr->va_mt, start, 0);
>> +
>> +	if (start <= mgr->mm_start ||
>> +	    start > (mgr->mm_start + mgr->mm_range))
>> +		return NULL;
>> +
>> +	return mas_prev(&mas, start - 1);
>> +}
>> +EXPORT_SYMBOL(drm_gpuva_find_prev);
>> +
>> +/**
>> + * drm_gpuva_find_next - find the &drm_gpuva after the given address
>> + * @mgr: the &drm_gpuva_manager to search in
>> + * @end: the given GPU VA's end address
>> + *
>> + * Find the adjacent &drm_gpuva after the GPU VA with given &end address.
>> + *
>> + * Note that if there is any free space between the GPU VA mappings no mapping
>> + * is returned.
>> + *
>> + * Returns: a pointer to the found &drm_gpuva or NULL if none was found
>> + */
>> +struct drm_gpuva *
>> +drm_gpuva_find_next(struct drm_gpuva_manager *mgr, u64 end)
> 
> This name is also a bit confusing for the same reason.  Again, it seems
> worth just walking to end here.
> 
>> +{
>> +	MA_STATE(mas, &mgr->va_mt, end - 1, 0);
>> +
>> +	if (end < mgr->mm_start ||
>> +	    end >= (mgr->mm_start + mgr->mm_range))
>> +		return NULL;
>> +
>> +	return mas_next(&mas, end);
>> +}
>> +EXPORT_SYMBOL(drm_gpuva_find_next);
>> +
>> +static int
>> +__drm_gpuva_region_insert(struct drm_gpuva_manager *mgr,
>> +			  struct drm_gpuva_region *reg)
>> +{
>> +	u64 addr = reg->va.addr;
>> +	u64 range = reg->va.range;
>> +	MA_STATE(mas, &mgr->region_mt, addr, addr + range - 1);
>> +	int ret;
>> +
>> +	if (unlikely(!drm_gpuva_in_mm_range(mgr, addr, range)))
>> +		return -EINVAL;
>> +
>> +	ret = mas_store_gfp(&mas, reg, GFP_KERNEL);
>> +	if (unlikely(ret))
>> +		return ret;
>> +
>> +	reg->mgr = mgr;
>> +
>> +	return 0;
>> +}
>> +
>> +/**
>> + * drm_gpuva_region_insert - insert a &drm_gpuva_region
>> + * @mgr: the &drm_gpuva_manager to insert the &drm_gpuva in
>> + * @reg: the &drm_gpuva_region to insert
>> + * @addr: the start address of the GPU VA
>> + * @range: the range of the GPU VA
>> + *
>> + * Insert a &drm_gpuva_region with a given address and range into a
>> + * &drm_gpuva_manager.
>> + *
>> + * Returns: 0 on success, negative error code on failure.
>> + */
>> +int
>> +drm_gpuva_region_insert(struct drm_gpuva_manager *mgr,
>> +			struct drm_gpuva_region *reg)
>> +{
>> +	if (unlikely(!(mgr->flags & DRM_GPUVA_MANAGER_REGIONS)))
>> +		return -EINVAL;
>> +
>> +	return __drm_gpuva_region_insert(mgr, reg);
>> +}
>> +EXPORT_SYMBOL(drm_gpuva_region_insert);
>> +
>> +static void
>> +__drm_gpuva_region_remove(struct drm_gpuva_region *reg)
>> +{
>> +	struct drm_gpuva_manager *mgr = reg->mgr;
>> +	MA_STATE(mas, &mgr->region_mt, reg->va.addr, 0);
>> +
>> +	mas_erase(&mas);
>> +}
>> +
>> +/**
>> + * drm_gpuva_region_remove - remove a &drm_gpuva_region
>> + * @reg: the &drm_gpuva_region to remove
>> + *
>> + * This removes the given &reg from the underlying tree.
>> + */
>> +void
>> +drm_gpuva_region_remove(struct drm_gpuva_region *reg)
>> +{
>> +	struct drm_gpuva_manager *mgr = reg->mgr;
>> +
>> +	if (unlikely(!(mgr->flags & DRM_GPUVA_MANAGER_REGIONS)))
>> +		return;
>> +
>> +	if (unlikely(reg == &mgr->kernel_alloc_region)) {
>> +		WARN(1, "Can't destroy kernel reserved region.\n");
>> +		return;
>> +	}
>> +
>> +	if (unlikely(!drm_gpuva_region_empty(reg)))
>> +		WARN(1, "GPU VA region should be empty on destroy.\n");
>> +
>> +	__drm_gpuva_region_remove(reg);
>> +}
>> +EXPORT_SYMBOL(drm_gpuva_region_remove);
>> +
>> +/**
>> + * drm_gpuva_region_empty - indicate whether a &drm_gpuva_region is empty
>> + * @reg: the &drm_gpuva_region to check
>> + *
>> + * Returns: true if the &drm_gpuva_region is empty, false otherwise
>> + */
>> +bool
>> +drm_gpuva_region_empty(struct drm_gpuva_region *reg)
>> +{
>> +	DRM_GPUVA_ITER(it, reg->mgr);
>> +
>> +	drm_gpuva_iter_for_each_range(it, reg->va.addr,
>> +				      reg->va.addr +
>> +				      reg->va.range)
>> +		return false;
>> +
>> +	return true;
>> +}
>> +EXPORT_SYMBOL(drm_gpuva_region_empty);
>> +
>> +/**
>> + * drm_gpuva_region_find_first - find the first &drm_gpuva_region in the given
>> + * range
>> + * @mgr: the &drm_gpuva_manager to search in
>> + * @addr: the &drm_gpuva_regions address
>> + * @range: the &drm_gpuva_regions range
>> + *
>> + * Returns: the first &drm_gpuva_region within the given range
>> + */
>> +struct drm_gpuva_region *
>> +drm_gpuva_region_find_first(struct drm_gpuva_manager *mgr,
>> +			    u64 addr, u64 range)
>> +{
>> +	MA_STATE(mas, &mgr->region_mt, addr, 0);
>> +
>> +	return mas_find(&mas, addr + range - 1);
>> +}
>> +EXPORT_SYMBOL(drm_gpuva_region_find_first);
>> +
>> +/**
>> + * drm_gpuva_region_find - find a &drm_gpuva_region
>> + * @mgr: the &drm_gpuva_manager to search in
>> + * @addr: the &drm_gpuva_regions address
>> + * @range: the &drm_gpuva_regions range
>> + *
>> + * Returns: the &drm_gpuva_region at a given &addr and with a given &range
> 
> again, I'm not sure you want to find first or walk here.. It sounds like
> you want exactly addr to addr + range VMA?

Exactly, same as above.

> 
>> + */
>> +struct drm_gpuva_region *
>> +drm_gpuva_region_find(struct drm_gpuva_manager *mgr,
>> +		      u64 addr, u64 range)
>> +{
>> +	struct drm_gpuva_region *reg;
>> +
>> +	reg = drm_gpuva_region_find_first(mgr, addr, range);
>> +	if (!reg)
>> +		goto out;
>> +
>> +	if (reg->va.range != range)
>> +		goto out;
>> +
>> +	return reg;
>> +
>> +out:
>> +	return NULL;
>> +}
>> +EXPORT_SYMBOL(drm_gpuva_region_find);
>> +
> 
> ...
>
  
Christian König Feb. 23, 2023, 7:06 a.m. UTC | #8
Am 22.02.23 um 17:40 schrieb Danilo Krummrich:
> On 2/22/23 16:14, Christian König wrote:
>> Am 22.02.23 um 16:07 schrieb Danilo Krummrich:
>>> On 2/22/23 11:25, Christian König wrote:
>>>> Am 17.02.23 um 14:44 schrieb Danilo Krummrich:
>>>
>>> <snip>
>>>
>>>>> +/**
>>>>> + * DOC: Overview
>>>>> + *
>>>>> + * The DRM GPU VA Manager, represented by struct 
>>>>> drm_gpuva_manager keeps track
>>>>> + * of a GPU's virtual address (VA) space and manages the 
>>>>> corresponding virtual
>>>>> + * mappings represented by &drm_gpuva objects. It also keeps 
>>>>> track of the
>>>>> + * mapping's backing &drm_gem_object buffers.
>>>>> + *
>>>>> + * &drm_gem_object buffers maintain a list (and a corresponding 
>>>>> list lock) of
>>>>> + * &drm_gpuva objects representing all existent GPU VA mappings 
>>>>> using this
>>>>> + * &drm_gem_object as backing buffer.
>>>>> + *
>>>>> + * If the &DRM_GPUVA_MANAGER_REGIONS feature is enabled, a GPU VA 
>>>>> mapping can
>>>>> + * only be created within a previously allocated 
>>>>> &drm_gpuva_region, which
>>>>> + * represents a reserved portion of the GPU VA space. GPU VA 
>>>>> mappings are not
>>>>> + * allowed to span over a &drm_gpuva_region's boundary.
>>>>> + *
>>>>> + * GPU VA regions can also be flagged as sparse, which allows 
>>>>> drivers to create
>>>>> + * sparse mappings for a whole GPU VA region in order to support 
>>>>> Vulkan
>>>>> + * 'Sparse Resources'.
>>>>
>>>> Well since we have now found that there is absolutely no technical 
>>>> reason for having those regions could we please drop them?
>>>
>>> I disagree this was the outcome of our previous discussion.
>>>
>>> In nouveau I still need them to track the separate sparse page 
>>> tables and, as you confirmed previously, Nvidia cards are not the 
>>> only cards supporting this feature.
>>>
>>> The second reason is that with regions we can avoid merging between 
>>> buffers, which saves some effort. However, I agree that this 
>>> argument by itself probably doesn't hold too much, since you've 
>>> pointed out in a previous mail that:
>>>
>>> <cite>
>>> 1) If we merge and decide to only do that inside certain boundaries 
>>> then those boundaries needs to be provided and checked against. This 
>>> burns quite some CPU cycles
>>>
>>> 2) If we just merge what we can we might have extra page table 
>>> updates which cost time and could result in undesired side effects.
>>>
>>> 3) If we don't merge at all we have additional housekeeping for the 
>>> mappings and maybe hw restrictions.
>>> </cite>
>>>
>>> However, if a driver uses regions to track its separate sparse page 
>>> tables anyway it gets 1) for free, which is a nice synergy.
>>>
>>> I totally agree that regions aren't for everyone though. Hence, I 
>>> made them an optional feature and by default regions are disabled. 
>>> In order to use them drm_gpuva_manager_init() must be called with 
>>> the DRM_GPUVA_MANAGER_REGIONS feature flag.
>>>
>>> I really would not want to open code regions or have two GPUVA 
>>> manager instances in nouveau to track sparse page tables. That would 
>>> be really messy, hence I hope we can agree on this to be an optional 
>>> feature.
>>
>> I absolutely don't think that this is a good idea then. This separate 
>> handling of sparse page tables is completely Nouveau specific.
>
> Actually, I rely on what you said in a previous mail when I say it's, 
> potentially, not specific to nouveau.
>
> <cite>
> This sounds similar to what AMD hw used to have up until gfx8 (I 
> think), basically sparse resources were defined through a separate 
> mechanism to the address resolution of the page tables. I won't rule 
> out that other hardware has similar approaches.
> </cite>

Ok, sounds like I didn't make my point clear here: AMD does have that 
same mechanism for older hw you try to implement here for Nouveau, but 
we have *abandoned* it because it is too much trouble and especially 
overhead to support! In other words we have said "Ok we would need two 
separate components to cleanly handle that, one for newer hw and one for 
older hw.".

What you now try to do is to write one component which works for both. 
We have already exercised this idea and came to the conclusion that it's 
not a good path to go down. So you're basically just repeating our mistake.

I mean if it's just for Nouveau then I would say feel free to do 
whatever you want, but since this component is supposed to be used by 
more drivers then I strongly think we need to tackle this from a 
different side.

>> Even when it's optional feature mixing this into the common handling 
>> is exactly what I pointed out as not properly separating between 
>> hardware specific and hardware agnostic functionality.
>
> Optionally having regions is *not* a hardware specific concept, 
> drivers might use it for a hardware specific purpose though. Which 
> potentially is the case for almost every DRM helper.
>
> Drivers can use regions only for the sake of not merging between 
> buffer boundaries as well. Some drivers might prefer this over "never 
> merge" or "always merge", depending on the cost of re-organizing page 
> tables for unnecessary splits/merges, without having the need of 
> tracking separate sparse page tables.
>
> Its just that I think *if* a driver needs to track separate sparse 
> page tables anyways its a nice synergy since then there is no extra 
> cost for getting this optimization.

Well exactly that's the point: I really don't believe that this comes 
without extra costs.

What we could maybe do is to have two separate functions, one for 
updating the data structures and one for merging. When you now call the 
merging function with a limit you don't get mappings merged over that 
limit and if you don't call the merging function at all you don't get 
merges.

But we should definitely not have the tracking of the ranges inside 
the common component. This is something separate.

>> This is exactly the problem we ran into with TTM as well and I've 
>> spend a massive amount of time to clean that up again. >
>
> Admittedly, I don't know what problems you are referring to. However, 
> I don't see which kind of trouble it could cause by allowing drivers 
> to track regions optionally.

Take a look at my 2020 presentation about TTM on FOSDEM.

Regards,
Christian.

>
>> Regards,
>> Christian.
>>
>>>
>>>>
>>>> I don't really see a need for them any more.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>
>>
>
  
Danilo Krummrich Feb. 23, 2023, 2:12 p.m. UTC | #9
On 2/23/23 08:06, Christian König wrote:
> Am 22.02.23 um 17:40 schrieb Danilo Krummrich:
>> On 2/22/23 16:14, Christian König wrote:
>>> Am 22.02.23 um 16:07 schrieb Danilo Krummrich:
>>>> On 2/22/23 11:25, Christian König wrote:
>>>>> Am 17.02.23 um 14:44 schrieb Danilo Krummrich:
>>>>
>>>> <snip>
>>>>
>>>>>> +/**
>>>>>> + * DOC: Overview
>>>>>> + *
>>>>>> + * The DRM GPU VA Manager, represented by struct 
>>>>>> drm_gpuva_manager keeps track
>>>>>> + * of a GPU's virtual address (VA) space and manages the 
>>>>>> corresponding virtual
>>>>>> + * mappings represented by &drm_gpuva objects. It also keeps 
>>>>>> track of the
>>>>>> + * mapping's backing &drm_gem_object buffers.
>>>>>> + *
>>>>>> + * &drm_gem_object buffers maintain a list (and a corresponding 
>>>>>> list lock) of
>>>>>> + * &drm_gpuva objects representing all existent GPU VA mappings 
>>>>>> using this
>>>>>> + * &drm_gem_object as backing buffer.
>>>>>> + *
>>>>>> + * If the &DRM_GPUVA_MANAGER_REGIONS feature is enabled, a GPU VA 
>>>>>> mapping can
>>>>>> + * only be created within a previously allocated 
>>>>>> &drm_gpuva_region, which
>>>>>> + * represents a reserved portion of the GPU VA space. GPU VA 
>>>>>> mappings are not
>>>>>> + * allowed to span over a &drm_gpuva_region's boundary.
>>>>>> + *
>>>>>> + * GPU VA regions can also be flagged as sparse, which allows 
>>>>>> drivers to create
>>>>>> + * sparse mappings for a whole GPU VA region in order to support 
>>>>>> Vulkan
>>>>>> + * 'Sparse Resources'.
>>>>>
>>>>> Well since we have now found that there is absolutely no technical 
>>>>> reason for having those regions could we please drop them?
>>>>
>>>> I disagree this was the outcome of our previous discussion.
>>>>
>>>> In nouveau I still need them to track the separate sparse page 
>>>> tables and, as you confirmed previously, Nvidia cards are not the 
>>>> only cards supporting this feature.
>>>>
>>>> The second reason is that with regions we can avoid merging between 
>>>> buffers, which saves some effort. However, I agree that this 
>>>> argument by itself probably doesn't hold too much, since you've 
>>>> pointed out in a previous mail that:
>>>>
>>>> <cite>
>>>> 1) If we merge and decide to only do that inside certain boundaries 
>>>> then those boundaries needs to be provided and checked against. This 
>>>> burns quite some CPU cycles
>>>>
>>>> 2) If we just merge what we can we might have extra page table 
>>>> updates which cost time and could result in undesired side effects.
>>>>
>>>> 3) If we don't merge at all we have additional housekeeping for the 
>>>> mappings and maybe hw restrictions.
>>>> </cite>
>>>>
>>>> However, if a driver uses regions to track its separate sparse page 
>>>> tables anyway it gets 1) for free, which is a nice synergy.
>>>>
>>>> I totally agree that regions aren't for everyone though. Hence, I 
>>>> made them an optional feature and by default regions are disabled. 
>>>> In order to use them drm_gpuva_manager_init() must be called with 
>>>> the DRM_GPUVA_MANAGER_REGIONS feature flag.
>>>>
>>>> I really would not want to open code regions or have two GPUVA 
>>>> manager instances in nouveau to track sparse page tables. That would 
>>>> be really messy, hence I hope we can agree on this to be an optional 
>>>> feature.
>>>
>>> I absolutely don't think that this is a good idea then. This separate 
>>> handling of sparse page tables is completely Nouveau specific.
>>
>> Actually, I rely on what you said in a previous mail when I say it's, 
>> potentially, not specific to nouveau.
>>
>> <cite>
>> This sounds similar to what AMD hw used to have up until gfx8 (I 
>> think), basically sparse resources were defined through a separate 
>> mechanism to the address resolution of the page tables. I won't rule 
>> out that other hardware has similar approaches.
>> </cite>
> 
> Ok, sounds like I didn't made my point here clear: AMD does have that 
> same mechanism for older hw you try to implement here for Nouveau, but 
> we have *abandoned* it because it is to much trouble and especially 
> overhead to support! In other words we have said "Ok we would need two 
> separate components to cleanly handle that, one for newer hw and one for 
> older hw.".

My point was more about the potential existence of other hardware having 
similar concepts.

I, personally, can't judge whether actually making use of having 
separate sparse page tables (or similar concepts) makes sense for other 
drivers or not. I think it depends on how the hardware works, which 
limitations it has in handling page tables, etc.

I definitely recognize your experience and that for AMD you decided it's 
not worth using a similar mechanism. I would definitely be interested in 
the details. Do you mind sharing them?

However, I think we need to differentiate between whether for AMD 
hardware you just found an approach that worked out better for your 
specific hardware or whether something is fundamentally broken with 
separate sparse page tables (or similar concepts) in general.

Do you think there is something fundamentally broken with such an 
approach? And if so, why?

> 
> What you now try to do is to write one component which works for both. 
> We have already exercised this idea and came to the conclusion that it's 
> not a good path to go down. So you're basically just repeating our mistake.
> 
> I mean if it's just for Nouveau then I would say feel free to do 
> whatever you want, but since this component is supposed to be used by 
> more drivers then I strongly think we need to tackle this from a 
> different side.
> 
>>> Even when it's optional feature mixing this into the common handling 
>>> is exactly what I pointed out as not properly separating between 
>>> hardware specific and hardware agnostic functionality.
>>
>> Optionally having regions is *not* a hardware specific concept, 
>> drivers might use it for a hardware specific purpose though. Which 
>> potentially is the case for almost every DRM helper.
>>
>> Drivers can use regions only for the sake of not merging between 
>> buffer boundaries as well. Some drivers might prefer this over "never 
>> merge" or "always merge", depending on the cost of re-organizing page 
>> tables for unnecessary splits/merges, without having the need of 
>> tracking separate sparse page tables.
>>
>> Its just that I think *if* a driver needs to track separate sparse 
>> page tables anyways its a nice synergy since then there is no extra 
>> cost for getting this optimization.
> 
> Well exactly that's the point: I really don't believe that this comes 
> without extra costs.

If you already have to store some information for purpose A and an 
optional purpose B requires the exact same information you would get B 
for free.

Which other costs would you see here?

> 
> What we could maybe do is to have an two separate functions, one for 
> updating the data structures and one for merging. When you now call the 
> merging function with a limit you don't get mappings merged over that 
> limit and if you don't call the merging function at all you don't get 
> merges.

Having a separate merging function would work. However, I am against an 
interface that takes limit parameters. Having such an interface signals 
general compliance with tracking regions to drivers, but without the 
offer to do this job in a generic way.

This sounds like a bad compromise to me. I think we should either accept 
that some drivers might have a purpose of tracking regions and hence 
*optionally* support them or have clear evidence that tracking regions 
never ever makes sense at all regardless of how a specific hardware 
handles its page tables.

Allowing drivers to set the merge strategy, however, is a good idea. I 
could also just add corresponding feature flags to let the driver pick.

> 
> But we should have definitely not have the tracking of the ranges inside 
> the common component. This is something separated.
> 
>>> This is exactly the problem we ran into with TTM as well and I've 
>>> spend a massive amount of time to clean that up again. >
>>
>> Admittedly, I don't know what problems you are referring to. However, 
>> I don't see which kind of trouble it could cause by allowing drivers 
>> to track regions optionally.
> 
> Take a look at my 2020 presentation about TTM on FOSDEM.
> 
> Regards,
> Christian.
> 
>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>>>
>>>>> I don't really see a need for them any more.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>
>>>
>>
>
  
Liam R. Howlett Feb. 23, 2023, 7:09 p.m. UTC | #10
* Danilo Krummrich <dakr@redhat.com> [230222 13:13]:
> On 2/21/23 19:20, Liam R. Howlett wrote:
> > * Danilo Krummrich <dakr@redhat.com> [230217 08:45]:
> > > Add infrastructure to keep track of GPU virtual address (VA) mappings
> > > > with a dedicated VA space manager implementation.
> > > 
> > > New UAPIs, motivated by Vulkan sparse memory bindings graphics drivers
> > > start implementing, allow userspace applications to request multiple and
> > > arbitrary GPU VA mappings of buffer objects. The DRM GPU VA manager is
> > > intended to serve the following purposes in this context.
> > > 
> > > 1) Provide infrastructure to track GPU VA allocations and mappings,
> > >     making use of the maple_tree.
> > > 
> > > 2) Generically connect GPU VA mappings to their backing buffers, in
> > >     particular DRM GEM objects.
> > > 
> > > 3) Provide a common implementation to perform more complex mapping
> > >     operations on the GPU VA space. In particular splitting and merging
> > >     of GPU VA mappings, e.g. for intersecting mapping requests or partial
> > >     unmap requests.
> > > 
> > > Suggested-by: Dave Airlie <airlied@redhat.com>
> > > Signed-off-by: Danilo Krummrich <dakr@redhat.com>
> > > ---
> > >   Documentation/gpu/drm-mm.rst    |   31 +
> > >   drivers/gpu/drm/Makefile        |    1 +
> > >   drivers/gpu/drm/drm_gem.c       |    3 +
> > >   drivers/gpu/drm/drm_gpuva_mgr.c | 1704 +++++++++++++++++++++++++++++++
> > >   include/drm/drm_drv.h           |    6 +
> > >   include/drm/drm_gem.h           |   75 ++
> > >   include/drm/drm_gpuva_mgr.h     |  714 +++++++++++++
> > >   7 files changed, 2534 insertions(+)
> > >   create mode 100644 drivers/gpu/drm/drm_gpuva_mgr.c
> > >   create mode 100644 include/drm/drm_gpuva_mgr.h
> > > 
> > > diff --git a/Documentation/gpu/drm-mm.rst b/Documentation/gpu/drm-mm.rst
> > > index a52e6f4117d6..c9f120cfe730 100644
> > > --- a/Documentation/gpu/drm-mm.rst
> > > +++ b/Documentation/gpu/drm-mm.rst
> > > @@ -466,6 +466,37 @@ DRM MM Range Allocator Function References
> > >   .. kernel-doc:: drivers/gpu/drm/drm_mm.c
> > >      :export:
> > ...
> > 
> > > +
> > > +/**
> > > > + * drm_gpuva_iter_remove - removes the iterator's current element
> > > + * @it: the &drm_gpuva_iterator
> > > + *
> > > + * This removes the element the iterator currently points to.
> > > + */
> > > +void
> > > +drm_gpuva_iter_remove(struct drm_gpuva_iterator *it)
> > > +{
> > > +	mas_erase(&it->mas);
> > > +}
> > > +EXPORT_SYMBOL(drm_gpuva_iter_remove);
> > > +
> > > +/**
> > > + * drm_gpuva_insert - insert a &drm_gpuva
> > > + * @mgr: the &drm_gpuva_manager to insert the &drm_gpuva in
> > > + * @va: the &drm_gpuva to insert
> > > + * @addr: the start address of the GPU VA
> > > + * @range: the range of the GPU VA
> > > + *
> > > + * Insert a &drm_gpuva with a given address and range into a
> > > + * &drm_gpuva_manager.
> > > + *
> > > + * Returns: 0 on success, negative error code on failure.
> > > + */
> > > +int
> > > +drm_gpuva_insert(struct drm_gpuva_manager *mgr,
> > > +		 struct drm_gpuva *va)
> > > +{
> > > +	u64 addr = va->va.addr;
> > > +	u64 range = va->va.range;
> > > +	MA_STATE(mas, &mgr->va_mt, addr, addr + range - 1);
> > > +	struct drm_gpuva_region *reg = NULL;
> > > +	int ret;
> > > +
> > > +	if (unlikely(!drm_gpuva_in_mm_range(mgr, addr, range)))
> > > +		return -EINVAL;
> > > +
> > > +	if (unlikely(drm_gpuva_in_kernel_region(mgr, addr, range)))
> > > +		return -EINVAL;
> > > +
> > > +	if (mgr->flags & DRM_GPUVA_MANAGER_REGIONS) {
> > > +		reg = drm_gpuva_in_region(mgr, addr, range);
> > > +		if (unlikely(!reg))
> > > +			return -EINVAL;
> > > +	}
> > > +
> > 
> > -----
> > 
> > > +	if (unlikely(drm_gpuva_find_first(mgr, addr, range)))
> > > +		return -EEXIST;
> > > +
> > > +	ret = mas_store_gfp(&mas, va, GFP_KERNEL);
> > 
> > mas_walk() will set the internal maple state to the limits to what it
> > finds.  So, instead of an iterator, you can use the walk function and
> > ensure there is a large enough area in the existing NULL:
> > 
> > /*
> >   * Nothing at addr, mas now points to the location where the store would
> >   * happen
> >   */
> > if (mas_walk(&mas))
> > 	return -EEXIST;
> > 
> > /* The NULL entry ends at mas.last, make sure there is room */
> > if (mas.last < (addr + range - 1))
> > 	return -EEXIST;
> > 
> > /* Limit the store size to the correct end address, and store */
> >   mas.last = addr + range - 1;
> >   ret = mas_store_gfp(&mas, va, GFP_KERNEL);
> > 
> 
> Would this variant be significantly more efficient?

Well, what you are doing is walking the tree to see if there's anything
there... then re-walking the tree to store it.  So, yes, it's much more
efficient..  However, writing is heavier.  How much of the time is spent
walking vs writing depends on the size of the tree, but it's rather easy
to do this in a single walk of the tree so why wouldn't you?

> 
> Also, would this also work while already walking the tree?

Yes, to an extent.  If you are at the correct location in the tree, you
can write to that location.  If you are not in the correct location and
try to write to the tree then things will go poorly..  In this scenario,
we are very much walking the tree and writing to it in two steps.

> 
> To remove an entry while walking the tree I have a separate function
> drm_gpuva_iter_remove(). Would I need something similar for inserting
> entries?

I saw that.  Your remove function uses the erase operation which is
implemented as a walk to that location and a store of a null over the
range that is returned.  You do not need a function to insert an entry
if the maple state is at the correct location, and that doesn't just
mean setting mas.index/mas.last to the correct value.  There is a node &
offset saved in the maple state that needs to be in the correct
location.  If you store to that node then the node may be replaced, so
other iterators that you have may become stale, but the one you used to
execute the store operation will now point to the new node with the new
entry.

> 
> I already provided this example in a separate mail thread, but it may make
> sense to move this to the mailing list:
> 
> In __drm_gpuva_sm_map() we're iterating a given range of the tree, where the
> given range is the size of the newly requested mapping. __drm_gpuva_sm_map()
> invokes a callback for each sub-operation that needs to be taken in order to
> fulfill this mapping request. In most cases such a callback just creates a
> drm_gpuva_op object and stores it in a list.
> 
> However, drivers can also implement the callback, such that they directly
> execute this operation within the callback.
> 
> Let's have a look at the following example:
> 
>      0     a     2
> old: |-----------|       (bo_offset=n)
> 
>            1     b     3
> req:       |-----------| (bo_offset=m)
> 
>      0  a' 1     b     3
> new: |-----|-----------| (a.bo_offset=n,b.bo_offset=m)
> 
> This would result in the following operations.
> 
> __drm_gpuva_sm_map() finds entry "a" and calls back into the driver
> suggesting to re-map "a" with the new size. The driver removes entry "a"
> from the tree and adds "a'"

What you have here won't work.  The driver will cause your iterator's
maple state to point to memory that is freed.  You will either need to
pass through your iterator so that the modifications can occur with that
maple state so it remains valid, or you will need to invalidate the
iterator on every modification by the driver.

I'm sure the first idea you have will be to invalidate the iterator, but
that is probably not the way to proceed.  Even ignoring the unclear
locking of two maple states trying to modify the tree, this is rather
inefficient - each invalidation means a re-walk of the tree.  You may as
well not use an iterator in this case.

Depending on how/when the lookups occur, you could still iterate over
the tree and let the driver modify the ending of "a", but leave the tree
alone and just store b over whatever - but the failure scenarios may
cause you grief.

If you pass the iterator through, then you can just use it to do your
writes and keep iterating as if nothing changed.

> 
> __drm_gpuva_sm_map(), ideally, continues the loop searching for nodes
> starting from the end of "a" (which is 2) till the end of the requested
> mapping "b" (which is 3). Since it doesn't find any other mapping within
> this range it calls back into the driver suggesting to finally map "b".
> 
> If there would have been another mapping between 2 and 3 it would have
> called back into the driver asking to unmap this mapping beforehand.
> 
> So, it boils down to re-mapping as described at the beginning (and
> analogously at the end) of a new mapping range and removing of entries that
> are enclosed by the new mapping range.

I assume the unmapped area is no longer needed, and the 're-map' is
really a removal of information?  Otherwise I'd suggest searching for a
gap which fits your request.  What you have here is a lot like
"MAP_FIXED" vs top-down/bottom-up search in the VMA code, this seems to
be like your __drm_gpuva_sm_map() and the drm mm range allocator with
DRM_MM_INSERT_LOW, and DRM_MM_INSERT_HIGH.

Why can these split/unmappings fail?  Is it because they are still
needed?

> 
> > > +	if (unlikely(ret))
> > > +		return ret;
> > > +
> > > +	va->mgr = mgr;
> > > +	va->region = reg;
> > > +
> > > +	return 0;
> > > +}
> > > +EXPORT_SYMBOL(drm_gpuva_insert);
> > > +
> > > +/**
> > > + * drm_gpuva_remove - remove a &drm_gpuva
> > > + * @va: the &drm_gpuva to remove
> > > + *
> > > + * This removes the given &va from the underlaying tree.
> > > + */
> > > +void
> > > +drm_gpuva_remove(struct drm_gpuva *va)
> > > +{
> > > +	MA_STATE(mas, &va->mgr->va_mt, va->va.addr, 0);
> > > +
> > > +	mas_erase(&mas);
> > > +}
> > > +EXPORT_SYMBOL(drm_gpuva_remove);
> > > +
> > ...
> > 
> > > +/**
> > > + * drm_gpuva_find_first - find the first &drm_gpuva in the given range
> > > + * @mgr: the &drm_gpuva_manager to search in
> > > + * @addr: the &drm_gpuvas address
> > > + * @range: the &drm_gpuvas range
> > > + *
> > > + * Returns: the first &drm_gpuva within the given range
> > > + */
> > > +struct drm_gpuva *
> > > +drm_gpuva_find_first(struct drm_gpuva_manager *mgr,
> > > +		     u64 addr, u64 range)
> > > +{
> > > +	MA_STATE(mas, &mgr->va_mt, addr, 0);
> > > +
> > > +	return mas_find(&mas, addr + range - 1);
> > > +}
> > > +EXPORT_SYMBOL(drm_gpuva_find_first);
> > > +
> > > +/**
> > > + * drm_gpuva_find - find a &drm_gpuva
> > > + * @mgr: the &drm_gpuva_manager to search in
> > > + * @addr: the &drm_gpuvas address
> > > + * @range: the &drm_gpuvas range
> > > + *
> > > + * Returns: the &drm_gpuva at a given &addr and with a given &range
> > 
> > Note that mas_find() will continue upwards in the address space if there
> > isn't anything at @addr.  This means that &drm_gpuva may not be at
> > &addr.  If you want to check just at &addr, use mas_walk().
> 
> Good catch. drm_gpuva_find() should then either also check for 'va->va.addr
> == addr' as well or, alternatively, use mas_walk(). As above, any reason to
> prefer mas_walk()?
> 
> > 
> > > + */
> > > +struct drm_gpuva *
> > > +drm_gpuva_find(struct drm_gpuva_manager *mgr,
> > > +	       u64 addr, u64 range)
> > > +{
> > > +	struct drm_gpuva *va;
> > > +
> > > +	va = drm_gpuva_find_first(mgr, addr, range);
> > > +	if (!va)
> > > +		goto out;
> > > +
> > > +	if (va->va.range != range)
> > > +		goto out;
> > > +
> > > +	return va;
> > > +
> > > +out:
> > > +	return NULL;
> > > +}
> > > +EXPORT_SYMBOL(drm_gpuva_find);
> > > +
> > > +/**
> > > + * drm_gpuva_find_prev - find the &drm_gpuva before the given address
> > > + * @mgr: the &drm_gpuva_manager to search in
> > > + * @start: the given GPU VA's start address
> > > + *
> > > + * Find the adjacent &drm_gpuva before the GPU VA with given &start address.
> > > + *
> > > + * Note that if there is any free space between the GPU VA mappings no mapping
> > > + * is returned.
> > > + *
> > > + * Returns: a pointer to the found &drm_gpuva or NULL if none was found
> > > + */
> > > +struct drm_gpuva *
> > > +drm_gpuva_find_prev(struct drm_gpuva_manager *mgr, u64 start)
> > 
> > find_prev() usually continues beyond 1 less than the address. I found
> > this name confusing.
> 
> Don't really get that, mind explaining?

When I ask for the previous one in a list or tree, I think the one
before.. but since you are limiting your search from start to start - 1,
you may as well walk to start - 1 and see if one exists.

Is that what you meant to do here?

> 
> > You may as well use mas_walk(), it would be faster.
> 
> How would I use mas_walk() for that? If I understand it correctly,
> mas_walk() requires me to know that start address, which I don't know for
> the previous entry.

mas_walk() walks to the value you specify and returns the entry at that
address, not necessarily the start address, but any address in the
range.

If you have a tree and store A = [0x1000 - 0x2000] and set your maple
state to walk to 0x1500, mas_walk() will return A, and the maple state
will have mas.index = 0x1000 and mas.last = 0x2000.

You have set the maple state to start at "start" and called
mas_prev(&mas, start - 1).  start - 1 is the lower limit, so the
internal implementation will walk to start then go to the previous entry
until start - 1.. it will stop at start - 1 and return NULL if there
isn't one there.

> 
> However, mas_walk() seems to be a good alternative to use for
> drm_gpuva_find_next().
> 
> > > +{
> > > +	MA_STATE(mas, &mgr->va_mt, start, 0);
> > > +
> > > +	if (start <= mgr->mm_start ||
> > > +	    start > (mgr->mm_start + mgr->mm_range))
> > > +		return NULL;
> > > +
> > > +	return mas_prev(&mas, start - 1);
> > > +}
> > > +EXPORT_SYMBOL(drm_gpuva_find_prev);
> > > +
> > > +/**
> > > + * drm_gpuva_find_next - find the &drm_gpuva after the given address
> > > + * @mgr: the &drm_gpuva_manager to search in
> > > + * @end: the given GPU VA's end address
> > > + *
> > > + * Find the adjacent &drm_gpuva after the GPU VA with given &end address.
> > > + *
> > > + * Note that if there is any free space between the GPU VA mappings no mapping
> > > + * is returned.
> > > + *
> > > + * Returns: a pointer to the found &drm_gpuva or NULL if none was found
> > > + */
> > > +struct drm_gpuva *
> > > +drm_gpuva_find_next(struct drm_gpuva_manager *mgr, u64 end)
> > 
> > This name is also a bit confusing for the same reason.  Again, it seems
> > worth just walking to end here.
> > 
> > > +{
> > > +	MA_STATE(mas, &mgr->va_mt, end - 1, 0);
> > > +
> > > +	if (end < mgr->mm_start ||
> > > +	    end >= (mgr->mm_start + mgr->mm_range))
> > > +		return NULL;
> > > +
> > > +	return mas_next(&mas, end);
> > > +}
> > > +EXPORT_SYMBOL(drm_gpuva_find_next);
> > > +
> > > +static int
> > > +__drm_gpuva_region_insert(struct drm_gpuva_manager *mgr,
> > > +			  struct drm_gpuva_region *reg)
> > > +{
> > > +	u64 addr = reg->va.addr;
> > > +	u64 range = reg->va.range;
> > > +	MA_STATE(mas, &mgr->region_mt, addr, addr + range - 1);
> > > +	int ret;
> > > +
> > > +	if (unlikely(!drm_gpuva_in_mm_range(mgr, addr, range)))
> > > +		return -EINVAL;
> > > +
> > > +	ret = mas_store_gfp(&mas, reg, GFP_KERNEL);
> > > +	if (unlikely(ret))
> > > +		return ret;
> > > +
> > > +	reg->mgr = mgr;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/**
> > > + * drm_gpuva_region_insert - insert a &drm_gpuva_region
> > > + * @mgr: the &drm_gpuva_manager to insert the &drm_gpuva in
> > > + * @reg: the &drm_gpuva_region to insert
> > > + * @addr: the start address of the GPU VA
> > > + * @range: the range of the GPU VA
> > > + *
> > > + * Insert a &drm_gpuva_region with a given address and range into a
> > > + * &drm_gpuva_manager.
> > > + *
> > > + * Returns: 0 on success, negative error code on failure.
> > > + */
> > > +int
> > > +drm_gpuva_region_insert(struct drm_gpuva_manager *mgr,
> > > +			struct drm_gpuva_region *reg)
> > > +{
> > > +	if (unlikely(!(mgr->flags & DRM_GPUVA_MANAGER_REGIONS)))
> > > +		return -EINVAL;
> > > +
> > > +	return __drm_gpuva_region_insert(mgr, reg);
> > > +}
> > > +EXPORT_SYMBOL(drm_gpuva_region_insert);
> > > +
> > > +static void
> > > +__drm_gpuva_region_remove(struct drm_gpuva_region *reg)
> > > +{
> > > +	struct drm_gpuva_manager *mgr = reg->mgr;
> > > +	MA_STATE(mas, &mgr->region_mt, reg->va.addr, 0);
> > > +
> > > +	mas_erase(&mas);
> > > +}
> > > +
> > > +/**
> > > + * drm_gpuva_region_remove - remove a &drm_gpuva_region
> > > + * @reg: the &drm_gpuva to remove
> > > + *
> > > + * This removes the given &reg from the underlaying tree.
> > > + */
> > > +void
> > > +drm_gpuva_region_remove(struct drm_gpuva_region *reg)
> > > +{
> > > +	struct drm_gpuva_manager *mgr = reg->mgr;
> > > +
> > > +	if (unlikely(!(mgr->flags & DRM_GPUVA_MANAGER_REGIONS)))
> > > +		return;
> > > +
> > > +	if (unlikely(reg == &mgr->kernel_alloc_region)) {
> > > +		WARN(1, "Can't destroy kernel reserved region.\n");
> > > +		return;
> > > +	}
> > > +
> > > +	if (unlikely(!drm_gpuva_region_empty(reg)))
> > > +		WARN(1, "GPU VA region should be empty on destroy.\n");
> > > +
> > > +	__drm_gpuva_region_remove(reg);
> > > +}
> > > +EXPORT_SYMBOL(drm_gpuva_region_remove);
> > > +
> > > +/**
> > > + * drm_gpuva_region_empty - indicate whether a &drm_gpuva_region is empty
> > > + * @reg: the &drm_gpuva to destroy
> > > + *
> > > + * Returns: true if the &drm_gpuva_region is empty, false otherwise
> > > + */
> > > +bool
> > > +drm_gpuva_region_empty(struct drm_gpuva_region *reg)
> > > +{
> > > +	DRM_GPUVA_ITER(it, reg->mgr);
> > > +
> > > +	drm_gpuva_iter_for_each_range(it, reg->va.addr,
> > > +				      reg->va.addr +
> > > +				      reg->va.range)
> > > +		return false;
> > > +
> > > +	return true;
> > > +}
> > > +EXPORT_SYMBOL(drm_gpuva_region_empty);
> > > +
> > > +/**
> > > + * drm_gpuva_region_find_first - find the first &drm_gpuva_region in the given
> > > + * range
> > > + * @mgr: the &drm_gpuva_manager to search in
> > > + * @addr: the &drm_gpuva_regions address
> > > + * @range: the &drm_gpuva_regions range
> > > + *
> > > + * Returns: the first &drm_gpuva_region within the given range
> > > + */
> > > +struct drm_gpuva_region *
> > > +drm_gpuva_region_find_first(struct drm_gpuva_manager *mgr,
> > > +			    u64 addr, u64 range)
> > > +{
> > > +	MA_STATE(mas, &mgr->region_mt, addr, 0);
> > > +
> > > +	return mas_find(&mas, addr + range - 1);
> > > +}
> > > +EXPORT_SYMBOL(drm_gpuva_region_find_first);
> > > +
> > > +/**
> > > + * drm_gpuva_region_find - find a &drm_gpuva_region
> > > + * @mgr: the &drm_gpuva_manager to search in
> > > + * @addr: the &drm_gpuva_regions address
> > > + * @range: the &drm_gpuva_regions range
> > > + *
> > > + * Returns: the &drm_gpuva_region at a given &addr and with a given &range
> > 
> > again, I'm not sure you want to find first or walk here.. It sounds like
> > you want exactly addr to addr + range VMA?
> 
> Exactly, same as above.

MA_STATE(mas, &mgr->region_mt, addr, addr);

reg = mas_walk(&mas);
if (!reg)
	return reg;

if ((mas.index != addr) | (mas.last != range - 1))
	return NULL;

return reg;

> 
> > 
> > > + */
> > > +struct drm_gpuva_region *
> > > +drm_gpuva_region_find(struct drm_gpuva_manager *mgr,
> > > +		      u64 addr, u64 range)
> > > +{
> > > +	struct drm_gpuva_region *reg;
> > > +
> > > +	reg = drm_gpuva_region_find_first(mgr, addr, range);

mas_find() will keep searching, so you may get a VMA that starts higher
than addr.

> > > +	if (!reg)
> > > +		goto out;
> > > +
> > > +	if (reg->va.range != range)
> > > +		goto out;
> > > +
> > > +	return reg;
> > > +
> > > +out:
> > > +	return NULL;
> > > +}
> > > +EXPORT_SYMBOL(drm_gpuva_region_find);
> > > +
> > 
> > ...
> > 
>
  
Danilo Krummrich Feb. 27, 2023, 12:23 p.m. UTC | #11
On 2/23/23 20:09, Liam R. Howlett wrote:
> * Danilo Krummrich <dakr@redhat.com> [230222 13:13]:
>> On 2/21/23 19:20, Liam R. Howlett wrote:
>>> * Danilo Krummrich <dakr@redhat.com> [230217 08:45]:
>>>> Add infrastructure to keep track of GPU virtual address (VA) mappings
>>>> with a dedicated VA space manager implementation.
>>>>
>>>> New UAPIs, motivated by Vulkan sparse memory bindings graphics drivers
>>>> start implementing, allow userspace applications to request multiple and
>>>> arbitrary GPU VA mappings of buffer objects. The DRM GPU VA manager is
>>>> intended to serve the following purposes in this context.
>>>>
>>>> 1) Provide infrastructure to track GPU VA allocations and mappings,
>>>>      making use of the maple_tree.
>>>>
>>>> 2) Generically connect GPU VA mappings to their backing buffers, in
>>>>      particular DRM GEM objects.
>>>>
>>>> 3) Provide a common implementation to perform more complex mapping
>>>>      operations on the GPU VA space. In particular splitting and merging
>>>>      of GPU VA mappings, e.g. for intersecting mapping requests or partial
>>>>      unmap requests.
>>>>
>>>> Suggested-by: Dave Airlie <airlied@redhat.com>
>>>> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
>>>> ---
>>>>    Documentation/gpu/drm-mm.rst    |   31 +
>>>>    drivers/gpu/drm/Makefile        |    1 +
>>>>    drivers/gpu/drm/drm_gem.c       |    3 +
>>>>    drivers/gpu/drm/drm_gpuva_mgr.c | 1704 +++++++++++++++++++++++++++++++
>>>>    include/drm/drm_drv.h           |    6 +
>>>>    include/drm/drm_gem.h           |   75 ++
>>>>    include/drm/drm_gpuva_mgr.h     |  714 +++++++++++++
>>>>    7 files changed, 2534 insertions(+)
>>>>    create mode 100644 drivers/gpu/drm/drm_gpuva_mgr.c
>>>>    create mode 100644 include/drm/drm_gpuva_mgr.h
>>>>
>>>> diff --git a/Documentation/gpu/drm-mm.rst b/Documentation/gpu/drm-mm.rst
>>>> index a52e6f4117d6..c9f120cfe730 100644
>>>> --- a/Documentation/gpu/drm-mm.rst
>>>> +++ b/Documentation/gpu/drm-mm.rst
>>>> @@ -466,6 +466,37 @@ DRM MM Range Allocator Function References
>>>>    .. kernel-doc:: drivers/gpu/drm/drm_mm.c
>>>>       :export:
>>> ...
>>>
>>>> +
>>>> +/**
>>>> + * drm_gpuva_remove_iter - removes the iterators current element
>>>> + * @it: the &drm_gpuva_iterator
>>>> + *
>>>> + * This removes the element the iterator currently points to.
>>>> + */
>>>> +void
>>>> +drm_gpuva_iter_remove(struct drm_gpuva_iterator *it)
>>>> +{
>>>> +	mas_erase(&it->mas);
>>>> +}
>>>> +EXPORT_SYMBOL(drm_gpuva_iter_remove);
>>>> +
>>>> +/**
>>>> + * drm_gpuva_insert - insert a &drm_gpuva
>>>> + * @mgr: the &drm_gpuva_manager to insert the &drm_gpuva in
>>>> + * @va: the &drm_gpuva to insert
>>>> + * @addr: the start address of the GPU VA
>>>> + * @range: the range of the GPU VA
>>>> + *
>>>> + * Insert a &drm_gpuva with a given address and range into a
>>>> + * &drm_gpuva_manager.
>>>> + *
>>>> + * Returns: 0 on success, negative error code on failure.
>>>> + */
>>>> +int
>>>> +drm_gpuva_insert(struct drm_gpuva_manager *mgr,
>>>> +		 struct drm_gpuva *va)
>>>> +{
>>>> +	u64 addr = va->va.addr;
>>>> +	u64 range = va->va.range;
>>>> +	MA_STATE(mas, &mgr->va_mt, addr, addr + range - 1);
>>>> +	struct drm_gpuva_region *reg = NULL;
>>>> +	int ret;
>>>> +
>>>> +	if (unlikely(!drm_gpuva_in_mm_range(mgr, addr, range)))
>>>> +		return -EINVAL;
>>>> +
>>>> +	if (unlikely(drm_gpuva_in_kernel_region(mgr, addr, range)))
>>>> +		return -EINVAL;
>>>> +
>>>> +	if (mgr->flags & DRM_GPUVA_MANAGER_REGIONS) {
>>>> +		reg = drm_gpuva_in_region(mgr, addr, range);
>>>> +		if (unlikely(!reg))
>>>> +			return -EINVAL;
>>>> +	}
>>>> +
>>>
>>> -----
>>>
>>>> +	if (unlikely(drm_gpuva_find_first(mgr, addr, range)))
>>>> +		return -EEXIST;
>>>> +
>>>> +	ret = mas_store_gfp(&mas, va, GFP_KERNEL);
>>>
>>> mas_walk() will set the internal maple state to the limits to what it
>>> finds.  So, instead of an iterator, you can use the walk function and
>>> ensure there is a large enough area in the existing NULL:
>>>
>>> /*
>>>    * Nothing at addr, mas now points to the location where the store would
>>>    * happen
>>>    */
>>> if (mas_walk(&mas))
>>> 	return -EEXIST;
>>>
>>> /* The NULL entry ends at mas.last, make sure there is room */
>>> if (mas.last < (addr + range - 1))
>>> 	return -EEXIST;
>>>
>>> /* Limit the store size to the correct end address, and store */
>>>    mas.last = addr + range - 1;
>>>    ret = mas_store_gfp(&mas, va, GFP_KERNEL);
>>>
>>
>> Would this variant be significantly more efficient?
> 
> Well, what you are doing is walking the tree to see if there's anything
> there... then re-walking the tree to store it.  So, yes, it's much more
> efficient..  However, writing is heavier.  How much of the time is spent
> walking vs writing depends on the size of the tree, but it's rather easy
> to do this in a single walk of the tree so why wouldn't you?

I will, I was just curious about how much of an impact it has.

> 
>>
>> Also, would this also work while already walking the tree?
> 
> Yes, to an extent.  If you are at the correct location in the tree, you
> can write to that location.  If you are not in the correct location and
> try to write to the tree then things will go poorly..  In this scenario,
> we are very much walking the tree and writing to it in two steps.
> 
>>
>> To remove an entry while walking the tree I have a separate function
>> drm_gpuva_iter_remove(). Would I need something similar for inserting
>> entries?
> 
> I saw that.  Your remove function uses the erase operation which is
> implemented as a walk to that location and a store of a null over the
> range that is returned.  You do not need a function to insert an entry
> if the maple state is at the correct location, and that doesn't just
> mean setting mas.index/mas.last to the correct value.  There is a node &
> offset saved in the maple state that needs to be in the correct
> location.  If you store to that node then the node may be replaced, so
> other iterators that you have may become stale, but the one you used
> to execute the store operation will now point to the new node with the new
> entry.
> 
>>
>> I already provided this example in a separate mail thread, but it may make
>> sense to move this to the mailing list:
>>
>> In __drm_gpuva_sm_map() we're iterating a given range of the tree, where the
>> given range is the size of the newly requested mapping. __drm_gpuva_sm_map()
>> invokes a callback for each sub-operation that needs to be taken in order to
>> fulfill this mapping request. In most cases such a callback just creates a
>> drm_gpuva_op object and stores it in a list.
>>
>> However, drivers can also implement the callback, such that they directly
>> execute this operation within the callback.
>>
>> Let's have a look at the following example:
>>
>>       0     a     2
>> old: |-----------|       (bo_offset=n)
>>
>>             1     b     3
>> req:       |-----------| (bo_offset=m)
>>
>>       0  a' 1     b     3
>> new: |-----|-----------| (a.bo_offset=n,b.bo_offset=m)
>>
>> This would result in the following operations.
>>
>> __drm_gpuva_sm_map() finds entry "a" and calls back into the driver
>> suggesting to re-map "a" with the new size. The driver removes entry "a"
>> from the tree and adds "a'"
> 
> What you have here won't work.  The driver will cause your iterators
> maple state to point to memory that is freed.  You will either need to
> pass through your iterator so that the modifications can occur with that
> maple state so it remains valid, or you will need to invalidate the
> iterator on every modification by the driver.
> 
> I'm sure the first idea you have will be to invalidate the iterator, but
> that is probably not the way to proceed.  Even ignoring the unclear
> locking of two maple states trying to modify the tree, this is rather
> inefficient - each invalidation means a re-walk of the tree.  You may as
> well not use an iterator in this case.
> 
> Depending on how/when the lookups occur, you could still iterate over
> the tree and let the driver modify the ending of "a", but leave the tree
> alone and just store b over whatever - but the failure scenarios may
> cause you grief.
> 
> If you pass the iterator through, then you can just use it to do your
> writes and keep iterating as if nothing changed.

Passing through the iterator clearly seems to be the way to go.

I assume that if the entry to insert isn't at the location of the 
iterator (as in the following example) we can just keep walking to this 
location by changing the index of the mas and calling mas_walk()? This 
would also imply that the "outer" tree walk continues after the entry we 
just inserted, right?

            1     a     3
old:       |-----------| (bo_offset=n)

      0     b     2
req: |-----------|       (bo_offset=m)

      0     b     2  a' 3
new: |-----------|-----| (b.bo_offset=m,a.bo_offset=n+2)

Again, after finding "a", we want to remove it and insert "a'" instead.

> 
>>
>> __drm_gpuva_sm_map(), ideally, continues the loop searching for nodes
>> starting from the end of "a" (which is 2) till the end of the requested
>> mapping "b" (which is 3). Since it doesn't find any other mapping within
>> this range it calls back into the driver suggesting to finally map "b".
>>
>> If there would have been another mapping between 2 and 3 it would have
>> called back into the driver asking to unmap this mapping beforehand.
>>
>> So, it boils down to re-mapping as described at the beginning (and
>> analogously at the end) of a new mapping range and removing of entries that
>> are enclosed by the new mapping range.
> 
> I assume the unmapped area is no longer needed, and the 're-map' is
> really a removal of information?  Otherwise I'd suggest searching for a
> gap which fits your request.  What you have here is a lot like
> "MAP_FIXED" vs top-down/bottom-up search in the VMA code, this seems to
> be like your __drm_gpuva_sm_map() and the drm mm range allocator with
> DRM_MM_INSERT_LOW, and DRM_MM_INSERT_HIGH.
> 
> Why can these split/unmappings fail?  Is it because they are still
> needed?
> 

You mean the check before the mas_*() operations in drm_gpuva_insert()?

Removing entries should never fail, inserting entries should fail when 
the caller tries to store to an area outside of the VA space (it doesn't 
necessarily span the whole 64-bit space), a kernel reserved area of the 
VA space, is not in any pre-allocated range of the VA space (if regions 
are enabled) or an entry already exists at that location.

>>
>>>> +	if (unlikely(ret))
>>>> +		return ret;
>>>> +
>>>> +	va->mgr = mgr;
>>>> +	va->region = reg;
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +EXPORT_SYMBOL(drm_gpuva_insert);
>>>> +
>>>> +/**
>>>> + * drm_gpuva_remove - remove a &drm_gpuva
>>>> + * @va: the &drm_gpuva to remove
>>>> + *
>>>> + * This removes the given &va from the underlaying tree.
>>>> + */
>>>> +void
>>>> +drm_gpuva_remove(struct drm_gpuva *va)
>>>> +{
>>>> +	MA_STATE(mas, &va->mgr->va_mt, va->va.addr, 0);
>>>> +
>>>> +	mas_erase(&mas);
>>>> +}
>>>> +EXPORT_SYMBOL(drm_gpuva_remove);
>>>> +
>>> ...
>>>
>>>> +/**
>>>> + * drm_gpuva_find_first - find the first &drm_gpuva in the given range
>>>> + * @mgr: the &drm_gpuva_manager to search in
>>>> + * @addr: the &drm_gpuvas address
>>>> + * @range: the &drm_gpuvas range
>>>> + *
>>>> + * Returns: the first &drm_gpuva within the given range
>>>> + */
>>>> +struct drm_gpuva *
>>>> +drm_gpuva_find_first(struct drm_gpuva_manager *mgr,
>>>> +		     u64 addr, u64 range)
>>>> +{
>>>> +	MA_STATE(mas, &mgr->va_mt, addr, 0);
>>>> +
>>>> +	return mas_find(&mas, addr + range - 1);
>>>> +}
>>>> +EXPORT_SYMBOL(drm_gpuva_find_first);
>>>> +
>>>> +/**
>>>> + * drm_gpuva_find - find a &drm_gpuva
>>>> + * @mgr: the &drm_gpuva_manager to search in
>>>> + * @addr: the &drm_gpuvas address
>>>> + * @range: the &drm_gpuvas range
>>>> + *
>>>> + * Returns: the &drm_gpuva at a given &addr and with a given &range
>>>
>>> Note that mas_find() will continue upwards in the address space if there
>>> isn't anything at @addr.  This means that &drm_gpuva may not be at
>>> &addr.  If you want to check just at &addr, use mas_walk().
>>
>> Good catch. drm_gpuva_find() should then either also check for 'va->va.addr
>> == addr' as well or, alternatively, use mas_walk(). As above, any reason to
>> prefer mas_walk()?
>>
>>>
>>>> + */
>>>> +struct drm_gpuva *
>>>> +drm_gpuva_find(struct drm_gpuva_manager *mgr,
>>>> +	       u64 addr, u64 range)
>>>> +{
>>>> +	struct drm_gpuva *va;
>>>> +
>>>> +	va = drm_gpuva_find_first(mgr, addr, range);
>>>> +	if (!va)
>>>> +		goto out;
>>>> +
>>>> +	if (va->va.range != range)
>>>> +		goto out;
>>>> +
>>>> +	return va;
>>>> +
>>>> +out:
>>>> +	return NULL;
>>>> +}
>>>> +EXPORT_SYMBOL(drm_gpuva_find);
>>>> +
>>>> +/**
>>>> + * drm_gpuva_find_prev - find the &drm_gpuva before the given address
>>>> + * @mgr: the &drm_gpuva_manager to search in
>>>> + * @start: the given GPU VA's start address
>>>> + *
>>>> + * Find the adjacent &drm_gpuva before the GPU VA with given &start address.
>>>> + *
>>>> + * Note that if there is any free space between the GPU VA mappings no mapping
>>>> + * is returned.
>>>> + *
>>>> + * Returns: a pointer to the found &drm_gpuva or NULL if none was found
>>>> + */
>>>> +struct drm_gpuva *
>>>> +drm_gpuva_find_prev(struct drm_gpuva_manager *mgr, u64 start)
>>>
>>> find_prev() usually continues beyond 1 less than the address. I found
>>> this name confusing.
>>
>> Don't really get that, mind explaining?
> 
> When I ask for the previous one in a list or tree, I think the one
> before.. but since you are limiting your search from start to start - 1,
> you may as well walk to start - 1 and see if one exists.
> 
> Is that what you meant to do here?

Yes, I want to know whether there is a previous entry which ends right 
before the current entry, without a gap between the two.

> 
>>
>>> You may as well use mas_walk(), it would be faster.
>>
>> How would I use mas_walk() for that? If I understand it correctly,
>> mas_walk() requires me to know that start address, which I don't know for
>> the previous entry.
> 
> mas_walk() walks to the value you specify and returns the entry at that
> address, not necessarily the start address, but any address in the
> range.
> 
> If you have a tree and store A = [0x1000 - 0x2000] and set your maple
> state to walk to 0x1500, mas_walk() will return A, and the maple state
> will have mas.index = 0x1000 and mas.last = 0x2000.
> 
> You have set the maple state to start at "start" and called
> mas_prev(&mas, start - 1).  start - 1 is the lower limit, so the
> internal implementation will walk to start then go to the previous entry
> until start - 1.. it will stop at start - 1 and return NULL if there
> isn't one there.

Thanks for the clarification and all the other very helpful comments and 
explanations!

- Danilo

> 
>>
>> However, mas_walk() seems to be a good alternative to use for
>> drm_gpuva_find_next().
>>
>>>> +{
>>>> +	MA_STATE(mas, &mgr->va_mt, start, 0);
>>>> +
>>>> +	if (start <= mgr->mm_start ||
>>>> +	    start > (mgr->mm_start + mgr->mm_range))
>>>> +		return NULL;
>>>> +
>>>> +	return mas_prev(&mas, start - 1);
>>>> +}
>>>> +EXPORT_SYMBOL(drm_gpuva_find_prev);
>>>> +
>>>> +/**
>>>> + * drm_gpuva_find_next - find the &drm_gpuva after the given address
>>>> + * @mgr: the &drm_gpuva_manager to search in
>>>> + * @end: the given GPU VA's end address
>>>> + *
>>>> + * Find the adjacent &drm_gpuva after the GPU VA with given &end address.
>>>> + *
>>>> + * Note that if there is any free space between the GPU VA mappings no mapping
>>>> + * is returned.
>>>> + *
>>>> + * Returns: a pointer to the found &drm_gpuva or NULL if none was found
>>>> + */
>>>> +struct drm_gpuva *
>>>> +drm_gpuva_find_next(struct drm_gpuva_manager *mgr, u64 end)
>>>
>>> This name is also a bit confusing for the same reason.  Again, it seems
>>> worth just walking to end here.
>>>
>>>> +{
>>>> +	MA_STATE(mas, &mgr->va_mt, end - 1, 0);
>>>> +
>>>> +	if (end < mgr->mm_start ||
>>>> +	    end >= (mgr->mm_start + mgr->mm_range))
>>>> +		return NULL;
>>>> +
>>>> +	return mas_next(&mas, end);
>>>> +}
>>>> +EXPORT_SYMBOL(drm_gpuva_find_next);
>>>> +
>>>> +static int
>>>> +__drm_gpuva_region_insert(struct drm_gpuva_manager *mgr,
>>>> +			  struct drm_gpuva_region *reg)
>>>> +{
>>>> +	u64 addr = reg->va.addr;
>>>> +	u64 range = reg->va.range;
>>>> +	MA_STATE(mas, &mgr->region_mt, addr, addr + range - 1);
>>>> +	int ret;
>>>> +
>>>> +	if (unlikely(!drm_gpuva_in_mm_range(mgr, addr, range)))
>>>> +		return -EINVAL;
>>>> +
>>>> +	ret = mas_store_gfp(&mas, reg, GFP_KERNEL);
>>>> +	if (unlikely(ret))
>>>> +		return ret;
>>>> +
>>>> +	reg->mgr = mgr;
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +/**
>>>> + * drm_gpuva_region_insert - insert a &drm_gpuva_region
>>>> + * @mgr: the &drm_gpuva_manager to insert the &drm_gpuva in
>>>> + * @reg: the &drm_gpuva_region to insert
>>>> + * @addr: the start address of the GPU VA
>>>> + * @range: the range of the GPU VA
>>>> + *
>>>> + * Insert a &drm_gpuva_region with a given address and range into a
>>>> + * &drm_gpuva_manager.
>>>> + *
>>>> + * Returns: 0 on success, negative error code on failure.
>>>> + */
>>>> +int
>>>> +drm_gpuva_region_insert(struct drm_gpuva_manager *mgr,
>>>> +			struct drm_gpuva_region *reg)
>>>> +{
>>>> +	if (unlikely(!(mgr->flags & DRM_GPUVA_MANAGER_REGIONS)))
>>>> +		return -EINVAL;
>>>> +
>>>> +	return __drm_gpuva_region_insert(mgr, reg);
>>>> +}
>>>> +EXPORT_SYMBOL(drm_gpuva_region_insert);
>>>> +
>>>> +static void
>>>> +__drm_gpuva_region_remove(struct drm_gpuva_region *reg)
>>>> +{
>>>> +	struct drm_gpuva_manager *mgr = reg->mgr;
>>>> +	MA_STATE(mas, &mgr->region_mt, reg->va.addr, 0);
>>>> +
>>>> +	mas_erase(&mas);
>>>> +}
>>>> +
>>>> +/**
>>>> + * drm_gpuva_region_remove - remove a &drm_gpuva_region
>>>> + * @reg: the &drm_gpuva to remove
>>>> + *
>>>> + * This removes the given &reg from the underlaying tree.
>>>> + */
>>>> +void
>>>> +drm_gpuva_region_remove(struct drm_gpuva_region *reg)
>>>> +{
>>>> +	struct drm_gpuva_manager *mgr = reg->mgr;
>>>> +
>>>> +	if (unlikely(!(mgr->flags & DRM_GPUVA_MANAGER_REGIONS)))
>>>> +		return;
>>>> +
>>>> +	if (unlikely(reg == &mgr->kernel_alloc_region)) {
>>>> +		WARN(1, "Can't destroy kernel reserved region.\n");
>>>> +		return;
>>>> +	}
>>>> +
>>>> +	if (unlikely(!drm_gpuva_region_empty(reg)))
>>>> +		WARN(1, "GPU VA region should be empty on destroy.\n");
>>>> +
>>>> +	__drm_gpuva_region_remove(reg);
>>>> +}
>>>> +EXPORT_SYMBOL(drm_gpuva_region_remove);
>>>> +
>>>> +/**
>>>> + * drm_gpuva_region_empty - indicate whether a &drm_gpuva_region is empty
>>>> + * @reg: the &drm_gpuva to destroy
>>>> + *
>>>> + * Returns: true if the &drm_gpuva_region is empty, false otherwise
>>>> + */
>>>> +bool
>>>> +drm_gpuva_region_empty(struct drm_gpuva_region *reg)
>>>> +{
>>>> +	DRM_GPUVA_ITER(it, reg->mgr);
>>>> +
>>>> +	drm_gpuva_iter_for_each_range(it, reg->va.addr,
>>>> +				      reg->va.addr +
>>>> +				      reg->va.range)
>>>> +		return false;
>>>> +
>>>> +	return true;
>>>> +}
>>>> +EXPORT_SYMBOL(drm_gpuva_region_empty);
>>>> +
>>>> +/**
>>>> + * drm_gpuva_region_find_first - find the first &drm_gpuva_region in the given
>>>> + * range
>>>> + * @mgr: the &drm_gpuva_manager to search in
>>>> + * @addr: the &drm_gpuva_regions address
>>>> + * @range: the &drm_gpuva_regions range
>>>> + *
>>>> + * Returns: the first &drm_gpuva_region within the given range
>>>> + */
>>>> +struct drm_gpuva_region *
>>>> +drm_gpuva_region_find_first(struct drm_gpuva_manager *mgr,
>>>> +			    u64 addr, u64 range)
>>>> +{
>>>> +	MA_STATE(mas, &mgr->region_mt, addr, 0);
>>>> +
>>>> +	return mas_find(&mas, addr + range - 1);
>>>> +}
>>>> +EXPORT_SYMBOL(drm_gpuva_region_find_first);
>>>> +
>>>> +/**
>>>> + * drm_gpuva_region_find - find a &drm_gpuva_region
>>>> + * @mgr: the &drm_gpuva_manager to search in
>>>> + * @addr: the &drm_gpuva_regions address
>>>> + * @range: the &drm_gpuva_regions range
>>>> + *
>>>> + * Returns: the &drm_gpuva_region at a given &addr and with a given &range
>>>
>>> again, I'm not sure you want to find first or walk here.. It sounds like
>>> you want exactly addr to addr + range VMA?
>>
>> Exactly, same as above.
> 
> MA_STATE(mas, &mgr->region_mt, addr, addr);
> 
> reg = mas_walk(&mas);
> if (!reg)
> 	return reg;
> 
> if ((mas.index != addr) | (mas.last != range - 1))
> 	return NULL;
> 
> return reg;
> 
>>
>>>
>>>> + */
>>>> +struct drm_gpuva_region *
>>>> +drm_gpuva_region_find(struct drm_gpuva_manager *mgr,
>>>> +		      u64 addr, u64 range)
>>>> +{
>>>> +	struct drm_gpuva_region *reg;
>>>> +
>>>> +	reg = drm_gpuva_region_find_first(mgr, addr, range);
> 
> mas_find() will keep searching, so you may get a VMA that starts higher
> than addr.
> 
>>>> +	if (!reg)
>>>> +		goto out;
>>>> +
>>>> +	if (reg->va.range != range)
>>>> +		goto out;
>>>> +
>>>> +	return reg;
>>>> +
>>>> +out:
>>>> +	return NULL;
>>>> +}
>>>> +EXPORT_SYMBOL(drm_gpuva_region_find);
>>>> +
>>>
>>> ...
>>>
>>
>
  
Danilo Krummrich Feb. 28, 2023, 2:17 a.m. UTC | #12
On Tue, Feb 21, 2023 at 01:20:50PM -0500, Liam R. Howlett wrote:
> * Danilo Krummrich <dakr@redhat.com> [230217 08:45]:
> > Add infrastructure to keep track of GPU virtual address (VA) mappings
> > with a dedicated VA space manager implementation.
> > 
> > New UAPIs, motivated by Vulkan sparse memory bindings graphics drivers
> > start implementing, allow userspace applications to request multiple and
> > arbitrary GPU VA mappings of buffer objects. The DRM GPU VA manager is
> > intended to serve the following purposes in this context.
> > 
> > 1) Provide infrastructure to track GPU VA allocations and mappings,
> >    making use of the maple_tree.
> > 
> > 2) Generically connect GPU VA mappings to their backing buffers, in
> >    particular DRM GEM objects.
> > 
> > 3) Provide a common implementation to perform more complex mapping
> >    operations on the GPU VA space. In particular splitting and merging
> >    of GPU VA mappings, e.g. for intersecting mapping requests or partial
> >    unmap requests.
> > 
> > Suggested-by: Dave Airlie <airlied@redhat.com>
> > Signed-off-by: Danilo Krummrich <dakr@redhat.com>
> > ---
> >  Documentation/gpu/drm-mm.rst    |   31 +
> >  drivers/gpu/drm/Makefile        |    1 +
> >  drivers/gpu/drm/drm_gem.c       |    3 +
> >  drivers/gpu/drm/drm_gpuva_mgr.c | 1704 +++++++++++++++++++++++++++++++
> >  include/drm/drm_drv.h           |    6 +
> >  include/drm/drm_gem.h           |   75 ++
> >  include/drm/drm_gpuva_mgr.h     |  714 +++++++++++++
> >  7 files changed, 2534 insertions(+)
> >  create mode 100644 drivers/gpu/drm/drm_gpuva_mgr.c
> >  create mode 100644 include/drm/drm_gpuva_mgr.h
> > 
> > diff --git a/Documentation/gpu/drm-mm.rst b/Documentation/gpu/drm-mm.rst
> > index a52e6f4117d6..c9f120cfe730 100644
> > --- a/Documentation/gpu/drm-mm.rst
> > +++ b/Documentation/gpu/drm-mm.rst
> > @@ -466,6 +466,37 @@ DRM MM Range Allocator Function References
> >  .. kernel-doc:: drivers/gpu/drm/drm_mm.c
> >     :export:
> >  
> ...
> 
> > +
> > +/**
> > + * drm_gpuva_remove_iter - removes the iterators current element
> > + * @it: the &drm_gpuva_iterator
> > + *
> > + * This removes the element the iterator currently points to.
> > + */
> > +void
> > +drm_gpuva_iter_remove(struct drm_gpuva_iterator *it)
> > +{
> > +	mas_erase(&it->mas);
> > +}
> > +EXPORT_SYMBOL(drm_gpuva_iter_remove);
> > +
> > +/**
> > + * drm_gpuva_insert - insert a &drm_gpuva
> > + * @mgr: the &drm_gpuva_manager to insert the &drm_gpuva in
> > + * @va: the &drm_gpuva to insert
> > + * @addr: the start address of the GPU VA
> > + * @range: the range of the GPU VA
> > + *
> > + * Insert a &drm_gpuva with a given address and range into a
> > + * &drm_gpuva_manager.
> > + *
> > + * Returns: 0 on success, negative error code on failure.
> > + */
> > +int
> > +drm_gpuva_insert(struct drm_gpuva_manager *mgr,
> > +		 struct drm_gpuva *va)
> > +{
> > +	u64 addr = va->va.addr;
> > +	u64 range = va->va.range;
> > +	MA_STATE(mas, &mgr->va_mt, addr, addr + range - 1);
> > +	struct drm_gpuva_region *reg = NULL;
> > +	int ret;
> > +
> > +	if (unlikely(!drm_gpuva_in_mm_range(mgr, addr, range)))
> > +		return -EINVAL;
> > +
> > +	if (unlikely(drm_gpuva_in_kernel_region(mgr, addr, range)))
> > +		return -EINVAL;
> > +
> > +	if (mgr->flags & DRM_GPUVA_MANAGER_REGIONS) {
> > +		reg = drm_gpuva_in_region(mgr, addr, range);
> > +		if (unlikely(!reg))
> > +			return -EINVAL;
> > +	}
> > +
> 
> -----
> 
> > +	if (unlikely(drm_gpuva_find_first(mgr, addr, range)))
> > +		return -EEXIST;
> > +
> > +	ret = mas_store_gfp(&mas, va, GFP_KERNEL);
> 
> mas_walk() will set the internal maple state to the limits to what it
> finds.  So, instead of an iterator, you can use the walk function and
> ensure there is a large enough area in the existing NULL:
> 
> /*
>  * Nothing at addr, mas now points to the location where the store would
>  * happen
>  */
> if (mas_walk(&mas))
> 	return -EEXIST;
> 

For some reason mas_walk() finds an entry and hence this function returns
-EEXIST for the following sequence of insertions.

A = [0xc0000 - 0xfffff]
B = [0x0 - 0xbffff]

Interestingly, inserting B before A works fine.

I attached a test module that reproduces the issue. I hope it's just a stupid
mistake I just can't spot though.

> /* The NULL entry ends at mas.last, make sure there is room */
> if (mas.last < (addr + range - 1))
> 	return -EEXIST;
> 
> /* Limit the store size to the correct end address, and store */
>  mas.last = addr + range - 1;
>  ret = mas_store_gfp(&mas, va, GFP_KERNEL);
>
  
Liam R. Howlett Feb. 28, 2023, 4:24 p.m. UTC | #13
* Danilo Krummrich <dakr@redhat.com> [230227 21:17]:
> On Tue, Feb 21, 2023 at 01:20:50PM -0500, Liam R. Howlett wrote:
> > * Danilo Krummrich <dakr@redhat.com> [230217 08:45]:
> > > Add infrastructure to keep track of GPU virtual address (VA) mappings
> > > with a dedicated VA space manager implementation.
> > > 
> > > New UAPIs, motivated by Vulkan sparse memory bindings graphics drivers
> > > start implementing, allow userspace applications to request multiple and
> > > arbitrary GPU VA mappings of buffer objects. The DRM GPU VA manager is
> > > intended to serve the following purposes in this context.
> > > 
> > > 1) Provide infrastructure to track GPU VA allocations and mappings,
> > >    making use of the maple_tree.
> > > 
> > > 2) Generically connect GPU VA mappings to their backing buffers, in
> > >    particular DRM GEM objects.
> > > 
> > > 3) Provide a common implementation to perform more complex mapping
> > >    operations on the GPU VA space. In particular splitting and merging
> > >    of GPU VA mappings, e.g. for intersecting mapping requests or partial
> > >    unmap requests.
> > > 
> > > Suggested-by: Dave Airlie <airlied@redhat.com>
> > > Signed-off-by: Danilo Krummrich <dakr@redhat.com>
> > > ---
> > >  Documentation/gpu/drm-mm.rst    |   31 +
> > >  drivers/gpu/drm/Makefile        |    1 +
> > >  drivers/gpu/drm/drm_gem.c       |    3 +
> > >  drivers/gpu/drm/drm_gpuva_mgr.c | 1704 +++++++++++++++++++++++++++++++
> > >  include/drm/drm_drv.h           |    6 +
> > >  include/drm/drm_gem.h           |   75 ++
> > >  include/drm/drm_gpuva_mgr.h     |  714 +++++++++++++
> > >  7 files changed, 2534 insertions(+)
> > >  create mode 100644 drivers/gpu/drm/drm_gpuva_mgr.c
> > >  create mode 100644 include/drm/drm_gpuva_mgr.h
> > > 
> > > diff --git a/Documentation/gpu/drm-mm.rst b/Documentation/gpu/drm-mm.rst
> > > index a52e6f4117d6..c9f120cfe730 100644
> > > --- a/Documentation/gpu/drm-mm.rst
> > > +++ b/Documentation/gpu/drm-mm.rst
> > > @@ -466,6 +466,37 @@ DRM MM Range Allocator Function References
> > >  .. kernel-doc:: drivers/gpu/drm/drm_mm.c
> > >     :export:
> > >  
> > ...
> > 
> > > +
> > > +/**
> > > + * drm_gpuva_remove_iter - removes the iterators current element
> > > + * @it: the &drm_gpuva_iterator
> > > + *
> > > + * This removes the element the iterator currently points to.
> > > + */
> > > +void
> > > +drm_gpuva_iter_remove(struct drm_gpuva_iterator *it)
> > > +{
> > > +	mas_erase(&it->mas);
> > > +}
> > > +EXPORT_SYMBOL(drm_gpuva_iter_remove);
> > > +
> > > +/**
> > > + * drm_gpuva_insert - insert a &drm_gpuva
> > > + * @mgr: the &drm_gpuva_manager to insert the &drm_gpuva in
> > > + * @va: the &drm_gpuva to insert
> > > + * @addr: the start address of the GPU VA
> > > + * @range: the range of the GPU VA
> > > + *
> > > + * Insert a &drm_gpuva with a given address and range into a
> > > + * &drm_gpuva_manager.
> > > + *
> > > + * Returns: 0 on success, negative error code on failure.
> > > + */
> > > +int
> > > +drm_gpuva_insert(struct drm_gpuva_manager *mgr,
> > > +		 struct drm_gpuva *va)
> > > +{
> > > +	u64 addr = va->va.addr;
> > > +	u64 range = va->va.range;
> > > +	MA_STATE(mas, &mgr->va_mt, addr, addr + range - 1);
> > > +	struct drm_gpuva_region *reg = NULL;
> > > +	int ret;
> > > +
> > > +	if (unlikely(!drm_gpuva_in_mm_range(mgr, addr, range)))
> > > +		return -EINVAL;
> > > +
> > > +	if (unlikely(drm_gpuva_in_kernel_region(mgr, addr, range)))
> > > +		return -EINVAL;
> > > +
> > > +	if (mgr->flags & DRM_GPUVA_MANAGER_REGIONS) {
> > > +		reg = drm_gpuva_in_region(mgr, addr, range);
> > > +		if (unlikely(!reg))
> > > +			return -EINVAL;
> > > +	}
> > > +
> > 
> > -----
> > 
> > > +	if (unlikely(drm_gpuva_find_first(mgr, addr, range)))
> > > +		return -EEXIST;
> > > +
> > > +	ret = mas_store_gfp(&mas, va, GFP_KERNEL);
> > 
> > mas_walk() will set the internal maple state to the limits to what it
> > finds.  So, instead of an iterator, you can use the walk function and
> > ensure there is a large enough area in the existing NULL:
> > 
> > /*
> >  * Nothing at addr, mas now points to the location where the store would
> >  * happen
> >  */
> > if (mas_walk(&mas))
> > 	return -EEXIST;
> > 
> 
> For some reason mas_walk() finds an entry and hence this function returns
> -EEXIST for the following sequence of insertions.
> 
> A = [0xc0000 - 0xfffff]
> B = [0x0 - 0xbffff]
> 
> Interestingly, inserting B before A works fine.
> 
> I attached a test module that reproduces the issue. I hope it's just a stupid
> mistake I just can't spot though.

This is probably my fault in how I explained things, I seem to have had
a bug in my code.

Let me try again.

mas_walk(&mas) will go to the range of mas.index
	It will set mas.index = range_start
	It will set mas.last = range_end
	It will return entry in that range.

Your code is walking to addr (0xc0000, say)
You get NULL
and the range is now: mas.index = 0, mas.last = ULONG_MAX

You set mas.last = 0xc0000 + 0x40000 -1
You store your va in the range of 0 - 0xfffff - This isn't what you want
to do and this is why you are seeing -EEXIST when inserting in this order.

In the reverse order, your lower limit is fine so it works out.

Try adding a check to ensure the lower range is still accurate as well:
        if (mas.index < addr)                                                                                           
                mas.index = addr;

If you compile with CONFIG_DEBUG_MAPLE_TREE, you can use mt_dump() to
dump the tree for debugging.

I also have some quality of life patches I'm developing to configure the
format of the dump (hex/dec) and a mas_dump() for more information as
well.

> 
> > /* The NULL entry ends at mas.last, make sure there is room */
> > if (mas.last < (addr + range - 1))
> > 	return -EEXIST;
> > 
> > /* Limit the store size to the correct end address, and store */
> >  mas.last = addr + range - 1;
> >  ret = mas_store_gfp(&mas, va, GFP_KERNEL);
> >
  
Liam R. Howlett March 2, 2023, 2:38 a.m. UTC | #14
* Danilo Krummrich <dakr@redhat.com> [230227 08:17]:

...
> > > Would this variant be significantly more efficient?
> > 
> > Well, what you are doing is walking the tree to see if there's anything
> > there... then re-walking the tree to store it.  So, yes, it's much more
> > efficient..  However, writing is heavier.  How much of the time is spent
> > walking vs writing depends on the size of the tree, but it's rather easy
> > to do this in a single walk of the tree so why wouldn't you?
> 
> I will, I was just curious about how much of an impact it has.
> 
> > 
> > > 
> > > Also, would this also work while already walking the tree?
> > 
> > Yes, to an extent.  If you are at the correct location in the tree, you
> > can write to that location.  If you are not in the correct location and
> > try to write to the tree then things will go poorly..  In this scenario,
> > we are very much walking the tree and writing to it in two steps.
> > 
> > > 
> > > To remove an entry while walking the tree I have a separate function
> > > drm_gpuva_iter_remove(). Would I need something similar for inserting
> > > entries?
> > 
> > I saw that.  Your remove function uses the erase operation which is
> > implemented as a walk to that location and a store of a null over the
> > range that is returned.  You do not need a function to insert an entry
> > if the maple state is at the correct location, and that doesn't just
> > mean setting mas.index/mas.last to the correct value.  There is a node &
> > offset saved in the maple state that needs to be in the correct
> > location.  If you store to that node then the node may be replaced, so
> > other iterators that you have may become stale, but the one you used
> > > to execute the store operation will now point to the new node with the new
> > entry.
> > 
> > > 
> > > > I already provided this example in a separate mail thread, but it may make
> > > sense to move this to the mailing list:
> > > 
> > > In __drm_gpuva_sm_map() we're iterating a given range of the tree, where the
> > > given range is the size of the newly requested mapping. __drm_gpuva_sm_map()
> > > invokes a callback for each sub-operation that needs to be taken in order to
> > > fulfill this mapping request. In most cases such a callback just creates a
> > > drm_gpuva_op object and stores it in a list.
> > > 
> > > However, drivers can also implement the callback, such that they directly
> > > execute this operation within the callback.
> > > 
> > > Let's have a look at the following example:
> > > 
> > >       0     a     2
> > > old: |-----------|       (bo_offset=n)
> > > 
> > >             1     b     3
> > > req:       |-----------| (bo_offset=m)
> > > 
> > >       0  a' 1     b     3
> > > new: |-----|-----------| (a.bo_offset=n,b.bo_offset=m)
> > > 
> > > This would result in the following operations.
> > > 
> > > __drm_gpuva_sm_map() finds entry "a" and calls back into the driver
> > > suggesting to re-map "a" with the new size. The driver removes entry "a"
> > > from the tree and adds "a'"
> > 
> > > What you have here won't work.  The driver will cause your iterator's
> > maple state to point to memory that is freed.  You will either need to
> > pass through your iterator so that the modifications can occur with that
> > maple state so it remains valid, or you will need to invalidate the
> > iterator on every modification by the driver.
> > 
> > I'm sure the first idea you have will be to invalidate the iterator, but
> > that is probably not the way to proceed.  Even ignoring the unclear
> > locking of two maple states trying to modify the tree, this is rather
> > inefficient - each invalidation means a re-walk of the tree.  You may as
> > well not use an iterator in this case.
> > 
> > Depending on how/when the lookups occur, you could still iterate over
> > the tree and let the driver modify the ending of "a", but leave the tree
> > alone and just store b over whatever - but the failure scenarios may
> > cause you grief.
> > 
> > If you pass the iterator through, then you can just use it to do your
> > writes and keep iterating as if nothing changed.
> 
> > Passing through the iterator clearly seems to be the way to go.
> 
> I assume that if the entry to insert isn't at the location of the iterator
> > (as in the following example) we can just keep walking to this location by
> changing the index of the mas and calling mas_walk()?

no.  You have to mas_set() to the value and walk from the top of the
tree.  mas_walk() walks down, not from side to side - well, it does go
forward within a node (increasing offset), but if you hit the node limit
then you have gotten yourself in trouble.

> This would also imply
> that the "outer" tree walk continues after the entry we just inserted,
> right?

I don't understand the "outer" tree walk statement.

> 
>            1     a     3
> old:       |-----------| (bo_offset=n)
> 
>      0     b     2
> req: |-----------|       (bo_offset=m)
> 
>      0     b     2  a' 3
> new: |-----------|-----| (b.bo_offset=m,a.bo_offset=n+2)
> 
> Again, after finding "a", we want to remove it and insert "a'" instead.

Ah, so you could walk to 0, see that it's NULL from 0 - 1, call
mas_next() and get "a" from 1 - 3, write "a'" from 2 - 3:

        0     1  a   2  a' 3
broken: |-----|------|-----| (a is broken in this 1/2 step)

mas_set_range(&mas, 0, 2); /* Resets the tree location to MAS_START */
mas_store(&mas, b);
        0     b     2  a' 3
new:    |-----------|-----| (b.bo_offset=m,a.bo_offset=n+2)


You can *probably* also get away with this:

walk to 0, see that it's NULL from 0 - 1, call mas_next() and get "a"
from 1 - 3, write "a'" from 2 - 3:

        0     1  a   2  a' 3
broken: |-----|------|-----| (a is broken in this 1/2 step)

mas_prev(&mas, 0); /* Looking at broken a from 1-2. */
mas_store(&mas, NULL); /* NULL is expanded on write to 0-2. */
            0    NULL   2  a' 3
broken':    |-----------|-----| (b.bo_offset=m,a.bo_offset=n+2)

mas_store(&mas, b);
        0     b     2  a' 3
new:    |-----------|-----| (b.bo_offset=m,a.bo_offset=n+2)

You may want to iterate backwards and do the writes as you go until you
have enough room.. it really depends how you want to go about doing
things.

> 
> > 
> > > 
> > > __drm_gpuva_sm_map(), ideally, continues the loop searching for nodes
> > > starting from the end of "a" (which is 2) till the end of the requested
> > > mapping "b" (which is 3). Since it doesn't find any other mapping within
> > > this range it calls back into the driver suggesting to finally map "b".
> > > 
> > > If there would have been another mapping between 2 and 3 it would have
> > > called back into the driver asking to unmap this mapping beforehand.
> > > 
> > > So, it boils down to re-mapping as described at the beginning (and
> > > analogously at the end) of a new mapping range and removing of entries that
> > > are enclosed by the new mapping range.
> > 
> > I assume the unmapped area is no longer needed, and the 're-map' is
> > really a removal of information?  Otherwise I'd suggest searching for a
> > gap which fits your request.  What you have here is a lot like
> > "MAP_FIXED" vs top-down/bottom-up search in the VMA code, this seems to
> > be like your __drm_gpuva_sm_map() and the drm mm range allocator with
> > DRM_MM_INSERT_LOW, and DRM_MM_INSERT_HIGH.
> > 
> > Why can these split/unmappings fail?  Is it because they are still
> > needed?
> > 
> 
> You mean the check before the mas_*() operations in drm_gpuva_insert()?

Yes, the callbacks.

> 
> Removing entries should never fail, inserting entries should fail when the
> caller tries to store to an area outside of the VA space (it doesn't
> necessarily span the whole 64-bit space), a kernel reserved area of the VA
> space, is not in any pre-allocated range of the VA space (if regions are
> enabled) or an entry already exists at that location.

In the mmap code, I have to deal with splitting the start/end VMA and
removing any VMAs in the way.  I do this by making a 'detached' tree
that is dealt with later, then just overwriting the area with one
mas_store() operation.  Would something like that work for you?

> 
> > > 
> > > > > +	if (unlikely(ret))
> > > > > +		return ret;
> > > > > +
> > > > > +	va->mgr = mgr;
> > > > > +	va->region = reg;
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +EXPORT_SYMBOL(drm_gpuva_insert);
> > > > > +
> > > > > +/**
> > > > > + * drm_gpuva_remove - remove a &drm_gpuva
> > > > > + * @va: the &drm_gpuva to remove
> > > > > + *
> > > > > > + * This removes the given &va from the underlying tree.
> > > > > + */
> > > > > +void
> > > > > +drm_gpuva_remove(struct drm_gpuva *va)
> > > > > +{
> > > > > +	MA_STATE(mas, &va->mgr->va_mt, va->va.addr, 0);
> > > > > +
> > > > > +	mas_erase(&mas);
> > > > > +}
> > > > > +EXPORT_SYMBOL(drm_gpuva_remove);
> > > > > +
> > > > ...
> > > > 
> > > > > +/**
> > > > > + * drm_gpuva_find_first - find the first &drm_gpuva in the given range
> > > > > + * @mgr: the &drm_gpuva_manager to search in
> > > > > + * @addr: the &drm_gpuvas address
> > > > > + * @range: the &drm_gpuvas range
> > > > > + *
> > > > > + * Returns: the first &drm_gpuva within the given range
> > > > > + */
> > > > > +struct drm_gpuva *
> > > > > +drm_gpuva_find_first(struct drm_gpuva_manager *mgr,
> > > > > +		     u64 addr, u64 range)
> > > > > +{
> > > > > +	MA_STATE(mas, &mgr->va_mt, addr, 0);
> > > > > +
> > > > > +	return mas_find(&mas, addr + range - 1);
> > > > > +}
> > > > > +EXPORT_SYMBOL(drm_gpuva_find_first);
> > > > > +
> > > > > +/**
> > > > > + * drm_gpuva_find - find a &drm_gpuva
> > > > > + * @mgr: the &drm_gpuva_manager to search in
> > > > > + * @addr: the &drm_gpuvas address
> > > > > + * @range: the &drm_gpuvas range
> > > > > + *
> > > > > + * Returns: the &drm_gpuva at a given &addr and with a given &range
> > > > 
> > > > Note that mas_find() will continue upwards in the address space if there
> > > > isn't anything at @addr.  This means that &drm_gpuva may not be at
> > > > &addr.  If you want to check just at &addr, use mas_walk().
> > > 
> > > Good catch. drm_gpuva_find() should then either also check for 'va->va.addr
> > > == addr' as well or, alternatively, use mas_walk(). As above, any reason to
> > > prefer mas_walk()?

I think I missed this question last time..

Internally, mas_find() is just a mas_walk() on the first call, then
mas_next() for each call after that.  If, during the mas_walk(), there
is no value at addr, it immediately calls mas_next() to get a value to
return.  It will continue upwards until the limit is reached (addr +
range - 1 in your case).

So if you only want to know if there is something at addr, then it's
best to use mas_walk() and keep things a bit more efficient.  Then you
can check mas.last for your end value.

If you do want the first VMA within the range passed in, then mas_find()
is the function you want.

> > > 
> > > > 
> > > > > + */
> > > > > +struct drm_gpuva *
> > > > > +drm_gpuva_find(struct drm_gpuva_manager *mgr,
> > > > > +	       u64 addr, u64 range)
> > > > > +{
> > > > > +	struct drm_gpuva *va;
> > > > > +
> > > > > +	va = drm_gpuva_find_first(mgr, addr, range);
> > > > > +	if (!va)
> > > > > +		goto out;
> > > > > +
> > > > > +	if (va->va.range != range)
> > > > > +		goto out;
> > > > > +
> > > > > +	return va;
> > > > > +
> > > > > +out:
> > > > > +	return NULL;
> > > > > +}
> > > > > +EXPORT_SYMBOL(drm_gpuva_find);
> > > > > +
> > > > > +/**
> > > > > + * drm_gpuva_find_prev - find the &drm_gpuva before the given address
> > > > > + * @mgr: the &drm_gpuva_manager to search in
> > > > > + * @start: the given GPU VA's start address
> > > > > + *
> > > > > + * Find the adjacent &drm_gpuva before the GPU VA with given &start address.
> > > > > + *
> > > > > + * Note that if there is any free space between the GPU VA mappings no mapping
> > > > > + * is returned.
> > > > > + *
> > > > > + * Returns: a pointer to the found &drm_gpuva or NULL if none was found
> > > > > + */
> > > > > +struct drm_gpuva *
> > > > > +drm_gpuva_find_prev(struct drm_gpuva_manager *mgr, u64 start)
> > > > 
> > > > find_prev() usually continues beyond 1 less than the address. I found
> > > > this name confusing.
> > > 
> > > Don't really get that, mind explaining?
> > 
> > When I ask for the previous one in a list or tree, I think the one
> > before.. but since you are limiting your search from start to start - 1,
> > you may as well walk to start - 1 and see if one exists.
> > 
> > Is that what you meant to do here?
> 
> Yes, I want to know whether there is a previous entry which ends right
> before the current entry, without a gap between the two.
> 
> > 
> > > 
> > > > You may as well use mas_walk(), it would be faster.
> > > 
> > > How would I use mas_walk() for that? If I understand it correctly,
> > > mas_walk() requires me to know that start address, which I don't know for
> > > the previous entry.
> > 
> > mas_walk() walks to the value you specify and returns the entry at that
> > address, not necessarily the start address, but any address in the
> > range.
> > 
> > If you have a tree and store A = [0x1000 - 0x2000] and set your maple
> > state to walk to 0x1500, mas_walk() will return A, and the maple state
> > will have mas.index = 0x1000 and mas.last = 0x2000.
> > 
> > You have set the maple state to start at "start" and called
> > mas_prev(&mas, start - 1).  start - 1 is the lower limit, so the
> > internal implementation will walk to start then go to the previous entry
> > until start - 1.. it will stop at start - 1 and return NULL if there
> > isn't one there.
> 
> Thanks for the clarification and all the other very helpful comments and
> explanations!
> 

Always glad to help.  The more users the tree has, the more I can see
where we may need to expand the interface to help others.

...
  
Danilo Krummrich March 6, 2023, 1:39 p.m. UTC | #15
On 2/28/23 17:24, Liam R. Howlett wrote:
> * Danilo Krummrich <dakr@redhat.com> [230227 21:17]:
>> On Tue, Feb 21, 2023 at 01:20:50PM -0500, Liam R. Howlett wrote:
>>> * Danilo Krummrich <dakr@redhat.com> [230217 08:45]:
>>>> Add infrastructure to keep track of GPU virtual address (VA) mappings
>>>> with a dedicated VA space manager implementation.
>>>>
>>>> New UAPIs, motivated by Vulkan sparse memory bindings graphics drivers
>>>> start implementing, allow userspace applications to request multiple and
>>>> arbitrary GPU VA mappings of buffer objects. The DRM GPU VA manager is
>>>> intended to serve the following purposes in this context.
>>>>
>>>> 1) Provide infrastructure to track GPU VA allocations and mappings,
>>>>     making use of the maple_tree.
>>>>
>>>> 2) Generically connect GPU VA mappings to their backing buffers, in
>>>>     particular DRM GEM objects.
>>>>
>>>> 3) Provide a common implementation to perform more complex mapping
>>>>     operations on the GPU VA space. In particular splitting and merging
>>>>     of GPU VA mappings, e.g. for intersecting mapping requests or partial
>>>>     unmap requests.
>>>>
>>>> Suggested-by: Dave Airlie <airlied@redhat.com>
>>>> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
>>>> ---
>>>>   Documentation/gpu/drm-mm.rst    |   31 +
>>>>   drivers/gpu/drm/Makefile        |    1 +
>>>>   drivers/gpu/drm/drm_gem.c       |    3 +
>>>>   drivers/gpu/drm/drm_gpuva_mgr.c | 1704 +++++++++++++++++++++++++++++++
>>>>   include/drm/drm_drv.h           |    6 +
>>>>   include/drm/drm_gem.h           |   75 ++
>>>>   include/drm/drm_gpuva_mgr.h     |  714 +++++++++++++
>>>>   7 files changed, 2534 insertions(+)
>>>>   create mode 100644 drivers/gpu/drm/drm_gpuva_mgr.c
>>>>   create mode 100644 include/drm/drm_gpuva_mgr.h
>>>>
>>>> diff --git a/Documentation/gpu/drm-mm.rst b/Documentation/gpu/drm-mm.rst
>>>> index a52e6f4117d6..c9f120cfe730 100644
>>>> --- a/Documentation/gpu/drm-mm.rst
>>>> +++ b/Documentation/gpu/drm-mm.rst
>>>> @@ -466,6 +466,37 @@ DRM MM Range Allocator Function References
>>>>   .. kernel-doc:: drivers/gpu/drm/drm_mm.c
>>>>      :export:
>>>>   
>>> ...
>>>
>>>> +
>>>> +/**
>>>> + * drm_gpuva_iter_remove - removes the iterator's current element
>>>> + * @it: the &drm_gpuva_iterator
>>>> + *
>>>> + * This removes the element the iterator currently points to.
>>>> + */
>>>> +void
>>>> +drm_gpuva_iter_remove(struct drm_gpuva_iterator *it)
>>>> +{
>>>> +	mas_erase(&it->mas);
>>>> +}
>>>> +EXPORT_SYMBOL(drm_gpuva_iter_remove);
>>>> +
>>>> +/**
>>>> + * drm_gpuva_insert - insert a &drm_gpuva
>>>> + * @mgr: the &drm_gpuva_manager to insert the &drm_gpuva in
>>>> + * @va: the &drm_gpuva to insert
>>>> + * @addr: the start address of the GPU VA
>>>> + * @range: the range of the GPU VA
>>>> + *
>>>> + * Insert a &drm_gpuva with a given address and range into a
>>>> + * &drm_gpuva_manager.
>>>> + *
>>>> + * Returns: 0 on success, negative error code on failure.
>>>> + */
>>>> +int
>>>> +drm_gpuva_insert(struct drm_gpuva_manager *mgr,
>>>> +		 struct drm_gpuva *va)
>>>> +{
>>>> +	u64 addr = va->va.addr;
>>>> +	u64 range = va->va.range;
>>>> +	MA_STATE(mas, &mgr->va_mt, addr, addr + range - 1);
>>>> +	struct drm_gpuva_region *reg = NULL;
>>>> +	int ret;
>>>> +
>>>> +	if (unlikely(!drm_gpuva_in_mm_range(mgr, addr, range)))
>>>> +		return -EINVAL;
>>>> +
>>>> +	if (unlikely(drm_gpuva_in_kernel_region(mgr, addr, range)))
>>>> +		return -EINVAL;
>>>> +
>>>> +	if (mgr->flags & DRM_GPUVA_MANAGER_REGIONS) {
>>>> +		reg = drm_gpuva_in_region(mgr, addr, range);
>>>> +		if (unlikely(!reg))
>>>> +			return -EINVAL;
>>>> +	}
>>>> +
>>>
>>> -----
>>>
>>>> +	if (unlikely(drm_gpuva_find_first(mgr, addr, range)))
>>>> +		return -EEXIST;
>>>> +
>>>> +	ret = mas_store_gfp(&mas, va, GFP_KERNEL);
>>>
>>> mas_walk() will set the internal maple state to the limits to what it
>>> finds.  So, instead of an iterator, you can use the walk function and
>>> ensure there is a large enough area in the existing NULL:
>>>
>>> /*
>>>   * Nothing at addr, mas now points to the location where the store would
>>>   * happen
>>>   */
>>> if (mas_walk(&mas))
>>> 	return -EEXIST;
>>>
>>
>> For some reason mas_walk() finds an entry and hence this function returns
>> -EEXIST for the following sequence of insertions.
>>
>> A = [0xc0000 - 0xfffff]
>> B = [0x0 - 0xbffff]
>>
>> Interestingly, inserting B before A works fine.
>>
>> I attached a test module that reproduces the issue. I hope it's just a stupid
>> mistake I just can't spot though.
> 
> This is probably my fault in how I explained things, I seem to have had
> a bug in my code.
> 
> Let me try again.
> 
> mas_walk(&mas) will go to the range of mas.index
> 	It will set mas.index = range_start
> 	It will set mas.last = range_end
> 	It will return entry in that range.
> 
> Your code is walking to addr (0xc0000, say)
> You get NULL
> and the range is now: mas.index = 0, mas.last = ULONG_MAX
> 
> You set mas.last = 0xc0000 + 0x40000 -1
> You store your va in the range of 0 - 0xfffff - This isn't what you want
> to do and this is why you are seeing it exists when done in this order.
> 
> In the reverse order, your lower limit is fine so it works out.
> 
> Try adding a check to ensure the lower range is still accurate as well:
>          if (mas.index < addr)
>                  mas.index = addr;
> 

Thanks for the clarification, this indeed fixes the issue.

> If you compile with CONFIG_DEBUG_MAPLE_TREE, you can use mt_dump() to
> dump the tree for debugging.

That's pretty handy.

> 
> I also have some quality of life patches I'm developing to configure the
> format of the dump (hex/dec) and a mas_dump() for more information as
> well.

Great, I tried it out and really missed a hex option. Are they in any 
git repository already, such that I could fetch them?

> 
>>
>>> /* The NULL entry ends at mas.last, make sure there is room */
>>> if (mas.last < (addr + range - 1))
>>> 	return -EEXIST;
>>>
>>> /* Limit the store size to the correct end address, and store */
>>>   mas.last = addr + range - 1;
>>>   ret = mas_store_gfp(&mas, va, GFP_KERNEL);
>>>
> 
>
  
Danilo Krummrich March 6, 2023, 3:46 p.m. UTC | #16
On 3/2/23 03:38, Liam R. Howlett wrote:
> * Danilo Krummrich <dakr@redhat.com> [230227 08:17]:
> 
> ...
>>>> Would this variant be significantly more efficient?
>>>
>>> Well, what you are doing is walking the tree to see if there's anything
>>> there... then re-walking the tree to store it.  So, yes, it's much more
>>> efficient..  However, writing is heavier.  How much of the time is spent
>>> walking vs writing depends on the size of the tree, but it's rather easy
>>> to do this in a single walk of the tree so why wouldn't you?
>>
>> I will, I was just curious about how much of an impact it has.
>>
>>>
>>>>
>>>> Also, would this also work while already walking the tree?
>>>
>>> Yes, to an extent.  If you are at the correct location in the tree, you
>>> can write to that location.  If you are not in the correct location and
>>> try to write to the tree then things will go poorly..  In this scenario,
>>> we are very much walking the tree and writing to it in two steps.
>>>
>>>>
>>>> To remove an entry while walking the tree I have a separate function
>>>> drm_gpuva_iter_remove(). Would I need something similar for inserting
>>>> entries?
>>>
>>> I saw that.  Your remove function uses the erase operation which is
>>> implemented as a walk to that location and a store of a null over the
>>> range that is returned.  You do not need a function to insert an entry
>>> if the maple state is at the correct location, and that doesn't just
>>> mean setting mas.index/mas.last to the correct value.  There is a node &
>>> offset saved in the maple state that needs to be in the correct
>>> location.  If you store to that node then the node may be replaced, so
>>> other iterators that you have may become stale, but the one you used
>>> to execute the store operation will now point to the new node with the new
>>> entry.
>>>
>>>>
>>>> I already provided this example in a separate mail thread, but it may make
>>>> sense to move this to the mailing list:
>>>>
>>>> In __drm_gpuva_sm_map() we're iterating a given range of the tree, where the
>>>> given range is the size of the newly requested mapping. __drm_gpuva_sm_map()
>>>> invokes a callback for each sub-operation that needs to be taken in order to
>>>> fulfill this mapping request. In most cases such a callback just creates a
>>>> drm_gpuva_op object and stores it in a list.
>>>>
>>>> However, drivers can also implement the callback, such that they directly
>>>> execute this operation within the callback.
>>>>
>>>> Let's have a look at the following example:
>>>>
>>>>        0     a     2
>>>> old: |-----------|       (bo_offset=n)
>>>>
>>>>              1     b     3
>>>> req:       |-----------| (bo_offset=m)
>>>>
>>>>        0  a' 1     b     3
>>>> new: |-----|-----------| (a.bo_offset=n,b.bo_offset=m)
>>>>
>>>> This would result in the following operations.
>>>>
>>>> __drm_gpuva_sm_map() finds entry "a" and calls back into the driver
>>>> suggesting to re-map "a" with the new size. The driver removes entry "a"
>>>> from the tree and adds "a'"
>>>
>>> What you have here won't work.  The driver will cause your iterator's
>>> maple state to point to memory that is freed.  You will either need to
>>> pass through your iterator so that the modifications can occur with that
>>> maple state so it remains valid, or you will need to invalidate the
>>> iterator on every modification by the driver.
>>>
>>> I'm sure the first idea you have will be to invalidate the iterator, but
>>> that is probably not the way to proceed.  Even ignoring the unclear
>>> locking of two maple states trying to modify the tree, this is rather
>>> inefficient - each invalidation means a re-walk of the tree.  You may as
>>> well not use an iterator in this case.
>>>
>>> Depending on how/when the lookups occur, you could still iterate over
>>> the tree and let the driver modify the ending of "a", but leave the tree
>>> alone and just store b over whatever - but the failure scenarios may
>>> cause you grief.
>>>
>>> If you pass the iterator through, then you can just use it to do your
>>> writes and keep iterating as if nothing changed.
>>
>> Passing through the iterator clearly seems to be the way to go.
>>
>> I assume that if the entry to insert isn't at the location of the iterator
>> (as in the following example) we can just keep walking to this location by
>> changing the index of the mas and calling mas_walk()?
> 
> no.  You have to mas_set() to the value and walk from the top of the
> tree.  mas_walk() walks down, not from side to side - well, it does go
> forward within a node (increasing offset), but if you hit the node limit
> then you have gotten yourself in trouble.
> 
>> This would also imply
>> that the "outer" tree walk continues after the entry we just inserted,
>> right?
> 
> I don't understand the "outer" tree walk statement.

I think I could have phrased this better. I just mean "my" iterator 
walking each tree entry rather than an internal tree walk, as it happens 
in e.g. mas_walk() or mas_find().

> 
>>
>>             1     a     3
>> old:       |-----------| (bo_offset=n)
>>
>>       0     b     2
>> req: |-----------|       (bo_offset=m)
>>
>>       0     b     2  a' 3
>> new: |-----------|-----| (b.bo_offset=m,a.bo_offset=n+2)
>>
>> Again, after finding "a", we want to remove it and insert "a'" instead.
> 
> Ah, so you could walk to 0, see that it's NULL from 0 - 1, call
> mas_next() and get "a" from 1 - 3, write "a'" from 2 - 3:
> 
>          0     1  a   2  a' 3
> broken: |-----|------|-----| (a is broken in this 1/2 step)
> 
> mas_set_range(&mas, 0, 2); /* Resets the tree location to MAS_START */
> mas_store(&mas, b);
>          0     b     2  a' 3
> new:    |-----------|-----| (b.bo_offset=m,a.bo_offset=n+2)
> 
> 
> You can *probably* also get away with this:
> 
> walk to 0, see that it's NULL from 0 - 1, call mas_next() and get "a"
> from 1 - 3, write "a'" from 2 - 3:
> 
>          0     1  a   2  a' 3
> broken: |-----|------|-----| (a is broken in this 1/2 step)
> 
> mas_prev(&mas, 0); /* Looking at broken a from 1-2. */
> mas_store(&mas, NULL); /* NULL is expanded on write to 0-2. */
>              0    NULL   2  a' 3
> broken':    |-----------|-----| (b.bo_offset=m,a.bo_offset=n+2)
> 
> mas_store(&mas, b);
>          0     b     2  a' 3
> new:    |-----------|-----| (b.bo_offset=m,a.bo_offset=n+2)
> 
> You may want to iterate backwards and do the writes as you go until you
> have enough room.. it really depends how you want to go about doing
> things.

I see, again thanks for explaining.

I think I would prefer to either (1) have a generic insert() function with 
a similar behavior as when iterating through a list or (2) have a 
function dedicated to the "split" use case.

1) When iterating the tree inserting entries at arbitrary locations 
should not influence the next iteration step. Unless the new entry 
really is the next entry, but that'd be optional. I don't see a use case 
for that.

2) Similar to how you broke it down above I could imagine a function 
dedicated to the split operation. This would be similar to what you 
mention for mmap below. However, it wouldn't be a single operation.

The GPUVA manager provides sub-operations to the driver for a single 
mapping request. Those can be an arbitrary amount of unmaps (for 
mappings "in the way", as you say below), one or two remaps (for splits 
at the beginning or end or both) and exactly one map (which is the last 
sub-operation adding the newly requested mapping).

Remaps consist of the mapping to unmap and one or two new mappings 
to map. The only case where a remap sub-op has two new mappings to map 
is when the newly requested mapping is enclosed by a single existing 
mapping. If we overlap a mapping at the beginning and another one at the 
end this would be two separate remap sub-ops. Of course, between the two 
remaps there could be an arbitrary amount of unmap sub-ops.

Unmap sub-ops are simple, I just need to remove a single entry in the 
tree. drm_gpuva_iter_remove() should be fine for that.

For remap sub-ops, I would need a function that removes an entry and 
then adds one or two new entries within the range of the removed one. 
The next loop iteration should then continue at the entry (if any) after 
the range of the removed one.

However, I'm unsure how to implement this. Would I need to just do a 
mas_store() of the new entry/entries (since the nodes should already be 
allocated) and then clean up the nodes that are left with mas_erase()?

Let's say there is an entry A = [0 - 5] and I want to replace it with B 
= [0 - 1] and C = [4 - 5].

Could I just store B and C and then somehow clean up the range [2 - 3]?

Maybe 1) would be the most flexible way, however, if 2) can be 
implemented more efficiently that's perfectly fine too.

> 
>>
>>>
>>>>
>>>> __drm_gpuva_sm_map(), ideally, continues the loop searching for nodes
>>>> starting from the end of "a" (which is 2) till the end of the requested
>>>> mapping "b" (which is 3). Since it doesn't find any other mapping within
>>>> this range it calls back into the driver suggesting to finally map "b".
>>>>
>>>> If there would have been another mapping between 2 and 3 it would have
>>>> called back into the driver asking to unmap this mapping beforehand.
>>>>
>>>> So, it boils down to re-mapping as described at the beginning (and
>>>> analogously at the end) of a new mapping range and removing of entries that
>>>> are enclosed by the new mapping range.
>>>
>>> I assume the unmapped area is no longer needed, and the 're-map' is
>>> really a removal of information?  Otherwise I'd suggest searching for a
>>> gap which fits your request.  What you have here is a lot like
>>> "MAP_FIXED" vs top-down/bottom-up search in the VMA code, this seems to
>>> be like your __drm_gpuva_sm_map() and the drm mm range allocator with
>>> DRM_MM_INSERT_LOW, and DRM_MM_INSERT_HIGH.
>>>
>>> Why can these split/unmappings fail?  Is it because they are still
>>> needed?
>>>
>>
>> You mean the check before the mas_*() operations in drm_gpuva_insert()?
> 
> Yes, the callbacks.
> 
>>
>> Removing entries should never fail, inserting entries should fail when the
>> caller tries to store to an area outside of the VA space (it doesn't
>> necessarily span the whole 64-bit space), a kernel reserved area of the VA
>> space, is not in any pre-allocated range of the VA space (if regions are
>> enabled) or an entry already exists at that location.
> 
> In the mmap code, I have to deal with splitting the start/end VMA and
> removing any VMAs in the way.  I do this by making a 'detached' tree
> that is dealt with later, then just overwriting the area with one
> mas_store() operation.  Would something like that work for you?

I think this is pretty much the same thing I want to do, hence this 
should work. However, this would require more state keeping for the 
whole iteration, I guess. Drivers shouldn't know how the GPUVA manager 
keeps track of mappings internally (and hence they shouldn't know about 
the maple tree). If I could get away with something similar to what I 
wrote above, I think I'd probably not add this extra complexity, unless 
there are relevant performance reasons to do so.

> 
>>
>>>>
>>>>>> +	if (unlikely(ret))
>>>>>> +		return ret;
>>>>>> +
>>>>>> +	va->mgr = mgr;
>>>>>> +	va->region = reg;
>>>>>> +
>>>>>> +	return 0;
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(drm_gpuva_insert);
>>>>>> +
>>>>>> +/**
>>>>>> + * drm_gpuva_remove - remove a &drm_gpuva
>>>>>> + * @va: the &drm_gpuva to remove
>>>>>> + *
>>>>>> + * This removes the given &va from the underlaying tree.
>>>>>> + */
>>>>>> +void
>>>>>> +drm_gpuva_remove(struct drm_gpuva *va)
>>>>>> +{
>>>>>> +	MA_STATE(mas, &va->mgr->va_mt, va->va.addr, 0);
>>>>>> +
>>>>>> +	mas_erase(&mas);
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(drm_gpuva_remove);
>>>>>> +
>>>>> ...
>>>>>
>>>>>> +/**
>>>>>> + * drm_gpuva_find_first - find the first &drm_gpuva in the given range
>>>>>> + * @mgr: the &drm_gpuva_manager to search in
>>>>>> + * @addr: the &drm_gpuvas address
>>>>>> + * @range: the &drm_gpuvas range
>>>>>> + *
>>>>>> + * Returns: the first &drm_gpuva within the given range
>>>>>> + */
>>>>>> +struct drm_gpuva *
>>>>>> +drm_gpuva_find_first(struct drm_gpuva_manager *mgr,
>>>>>> +		     u64 addr, u64 range)
>>>>>> +{
>>>>>> +	MA_STATE(mas, &mgr->va_mt, addr, 0);
>>>>>> +
>>>>>> +	return mas_find(&mas, addr + range - 1);
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(drm_gpuva_find_first);
>>>>>> +
>>>>>> +/**
>>>>>> + * drm_gpuva_find - find a &drm_gpuva
>>>>>> + * @mgr: the &drm_gpuva_manager to search in
>>>>>> + * @addr: the &drm_gpuvas address
>>>>>> + * @range: the &drm_gpuvas range
>>>>>> + *
>>>>>> + * Returns: the &drm_gpuva at a given &addr and with a given &range
>>>>>
>>>>> Note that mas_find() will continue upwards in the address space if there
>>>>> isn't anything at @addr.  This means that &drm_gpuva may not be at
>>>>> &addr.  If you want to check just at &addr, use mas_walk().
>>>>
>>>> Good catch. drm_gpuva_find() should then either also check for 'va->va.addr
>>>> == addr' as well or, alternatively, use mas_walk(). As above, any reason to
>>>> prefer mas_walk()?
> 
> I think I missed this question last time..
> 
> Internally, mas_find() is just a mas_walk() on the first call, then
> mas_next() for each call after that.  If, during the mas_walk(), there
> is no value at addr, it immediately calls mas_next() to get a value to
> return.  It will continue upwards until the limit is reached (addr +
> range - 1 in your case).
> 
> So if you only want to know if there is something at addr, then it's
> best to use mas_walk() and keep things a bit more efficient.  Then you
> can check mas.last for your end value.
> 
> If you do want the first VMA within the range passed in, then mas_find()
> is the function you want.
> 
>>>>
>>>>>
>>>>>> + */
>>>>>> +struct drm_gpuva *
>>>>>> +drm_gpuva_find(struct drm_gpuva_manager *mgr,
>>>>>> +	       u64 addr, u64 range)
>>>>>> +{
>>>>>> +	struct drm_gpuva *va;
>>>>>> +
>>>>>> +	va = drm_gpuva_find_first(mgr, addr, range);
>>>>>> +	if (!va)
>>>>>> +		goto out;
>>>>>> +
>>>>>> +	if (va->va.range != range)
>>>>>> +		goto out;
>>>>>> +
>>>>>> +	return va;
>>>>>> +
>>>>>> +out:
>>>>>> +	return NULL;
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(drm_gpuva_find);
>>>>>> +
>>>>>> +/**
>>>>>> + * drm_gpuva_find_prev - find the &drm_gpuva before the given address
>>>>>> + * @mgr: the &drm_gpuva_manager to search in
>>>>>> + * @start: the given GPU VA's start address
>>>>>> + *
>>>>>> + * Find the adjacent &drm_gpuva before the GPU VA with given &start address.
>>>>>> + *
>>>>>> + * Note that if there is any free space between the GPU VA mappings no mapping
>>>>>> + * is returned.
>>>>>> + *
>>>>>> + * Returns: a pointer to the found &drm_gpuva or NULL if none was found
>>>>>> + */
>>>>>> +struct drm_gpuva *
>>>>>> +drm_gpuva_find_prev(struct drm_gpuva_manager *mgr, u64 start)
>>>>>
>>>>> find_prev() usually continues beyond 1 less than the address. I found
>>>>> this name confusing.
>>>>
>>>> Don't really get that, mind explaining?
>>>
>>> When I ask for the previous one in a list or tree, I think the one
>>> before.. but since you are limiting your search from start to start - 1,
>>> you may as well walk to start - 1 and see if one exists.
>>>
>>> Is that what you meant to do here?
>>
>> Yes, I want to know whether there is a previous entry which ends right
>> before the current entry, without a gap between the two.
>>
>>>
>>>>
>>>>> You may as well use mas_walk(), it would be faster.
>>>>
>>>> How would I use mas_walk() for that? If I understand it correctly,
>>>> mas_walk() requires me to know that start address, which I don't know for
>>>> the previous entry.
>>>
>>> mas_walk() walks to the value you specify and returns the entry at that
>>> address, not necessarily the start address, but any address in the
>>> range.
>>>
>>> If you have a tree and store A = [0x1000 - 0x2000] and set your maple
>>> state to walk to 0x1500, mas_walk() will return A, and the maple state
>>> will have mas.index = 0x1000 and mas.last = 0x2000.
>>>
>>> You have set the maple state to start at "start" and called
>>> mas_prev(&mas, start - 1).  start - 1 is the lower limit, so the
>>> internal implementation will walk to start then go to the previous entry
>>> until start - 1.. it will stop at start - 1 and return NULL if there
>>> isn't one there.
>>
>> Thanks for the clarification and all the other very helpful comments and
>> explanations!
>>
> 
> Always glad to help.  The more users the tree has, the more I can see
> where we may need to expand the interface to help others.
> 
> ...
>
  
Liam R. Howlett March 7, 2023, 10:43 p.m. UTC | #17
* Danilo Krummrich <dakr@redhat.com> [230306 10:46]:
> On 3/2/23 03:38, Liam R. Howlett wrote:
> > * Danilo Krummrich <dakr@redhat.com> [230227 08:17]:
> > 
> > ...
> > > > > Would this variant be significantly more efficient?
> > > > 
> > > > Well, what you are doing is walking the tree to see if there's anything
> > > > there... then re-walking the tree to store it.  So, yes, it's much more
> > > > efficient..  However, writing is heavier.  How much of the time is spent
> > > > walking vs writing depends on the size of the tree, but it's rather easy
> > > > to do this in a single walk of the tree so why wouldn't you?
> > > 
> > > I will, I was just curious about how much of an impact it has.
> > > 
> > > > 
> > > > > 
> > > > > Also, would this also work while already walking the tree?
> > > > 
> > > > Yes, to an extent.  If you are at the correct location in the tree, you
> > > > can write to that location.  If you are not in the correct location and
> > > > try to write to the tree then things will go poorly..  In this scenario,
> > > > we are very much walking the tree and writing to it in two steps.
> > > > 
> > > > > 
> > > > > To remove an entry while walking the tree I have a separate function
> > > > > drm_gpuva_iter_remove(). Would I need something similar for inserting
> > > > > entries?
> > > > 
> > > > I saw that.  Your remove function uses the erase operation which is
> > > > implemented as a walk to that location and a store of a null over the
> > > > range that is returned.  You do not need a function to insert an entry
> > > > if the maple state is at the correct location, and that doesn't just
> > > > mean setting mas.index/mas.last to the correct value.  There is a node &
> > > > offset saved in the maple state that needs to be in the correct
> > > > location.  If you store to that node then the node may be replaced, so
> > > > other iterators that you have may become stale, but the one you used
> > > > > to execute the store operation will now point to the new node with the new
> > > > entry.
> > > > 
> > > > > 
> > > > > > I already provided this example in a separate mail thread, but it may make
> > > > > sense to move this to the mailing list:
> > > > > 
> > > > > In __drm_gpuva_sm_map() we're iterating a given range of the tree, where the
> > > > > given range is the size of the newly requested mapping. __drm_gpuva_sm_map()
> > > > > invokes a callback for each sub-operation that needs to be taken in order to
> > > > > fulfill this mapping request. In most cases such a callback just creates a
> > > > > drm_gpuva_op object and stores it in a list.
> > > > > 
> > > > > However, drivers can also implement the callback, such that they directly
> > > > > execute this operation within the callback.
> > > > > 
> > > > > Let's have a look at the following example:
> > > > > 
> > > > >        0     a     2
> > > > > old: |-----------|       (bo_offset=n)
> > > > > 
> > > > >              1     b     3
> > > > > req:       |-----------| (bo_offset=m)
> > > > > 
> > > > >        0  a' 1     b     3
> > > > > new: |-----|-----------| (a.bo_offset=n,b.bo_offset=m)
> > > > > 
> > > > > This would result in the following operations.
> > > > > 
> > > > > __drm_gpuva_sm_map() finds entry "a" and calls back into the driver
> > > > > suggesting to re-map "a" with the new size. The driver removes entry "a"
> > > > > from the tree and adds "a'"
> > > > 
> > > > What you have here won't work.  The driver will cause your iterators
> > > > maple state to point to memory that is freed.  You will either need to
> > > > pass through your iterator so that the modifications can occur with that
> > > > maple state so it remains valid, or you will need to invalidate the
> > > > iterator on every modification by the driver.
> > > > 
> > > > I'm sure the first idea you have will be to invalidate the iterator, but
> > > > that is probably not the way to proceed.  Even ignoring the unclear
> > > > locking of two maple states trying to modify the tree, this is rather
> > > > inefficient - each invalidation means a re-walk of the tree.  You may as
> > > > well not use an iterator in this case.
> > > > 
> > > > Depending on how/when the lookups occur, you could still iterate over
> > > > the tree and let the driver modify the ending of "a", but leave the tree
> > > > alone and just store b over whatever - but the failure scenarios may
> > > > cause you grief.
> > > > 
> > > > If you pass the iterator through, then you can just use it to do your
> > > > writes and keep iterating as if nothing changed.
> > > 
> > > > Passing through the iterator clearly seems to be the way to go.
> > > 
> > > I assume that if the entry to insert isn't at the location of the iterator
> > > > (as in the following example) we can just keep walking to this location by
> > > changing the index of the mas and calling mas_walk()?
> > 
> > no.  You have to mas_set() to the value and walk from the top of the
> > tree.  mas_walk() walks down, not from side to side - well, it does go
> > forward within a node (increasing offset), but if you hit the node limit
> > then you have gotten yourself in trouble.
> > 
> > > This would also imply
> > > that the "outer" tree walk continues after the entry we just inserted,
> > > right?
> > 
> > I don't understand the "outer" tree walk statement.
> 
> I think I could have phrased this better. I just mean "my" iterator walking
> each tree entry rather than an internal tree walk, as it happens in e.g.
> mas_walk() or mas_find().
> 
> > 
> > > 
> > >             1     a     3
> > > old:       |-----------| (bo_offset=n)
> > > 
> > >       0     b     2
> > > req: |-----------|       (bo_offset=m)
> > > 
> > >       0     b     2  a' 3
> > > new: |-----------|-----| (b.bo_offset=m,a.bo_offset=n+2)
> > > 
> > > Again, after finding "a", we want to remove it and insert "a'" instead.
> > 
> > Ah, so you could walk to 0, see that it's NULL from 0 - 1, call
> > mas_next() and get "a" from 1 - 3, write "a'" from 2 - 3:
> > 
> >          0     1  a   2  a' 3
> > broken: |-----|------|-----| (a is broken in this 1/2 step)
> > 
> > mas_set_range(&mas, 0, 2); /* Resets the tree location to MAS_START */
> > mas_store(&mas, b);
> >          0     b     2  a' 3
> > new:    |-----------|-----| (b.bo_offset=m,a.bo_offset=n+2)
> > 
> > 
> > You can *probably* also get away with this:
> > 
> > walk to 0, see that it's NULL from 0 - 1, call mas_next() and get "a"
> > from 1 - 3, write "a'" from 2 - 3:
> > 
> >          0     1  a   2  a' 3
> > broken: |-----|------|-----| (a is broken in this 1/2 step)
> > 
> > mas_prev(&mas, 0); /* Looking at broken a from 1-2.
> > mas_store(&mas, NULL); /* NULL is expanded on write to 0-2.
> >              0    NULL   2  a' 3
> > broken':    |-----------|-----| (b.bo_offset=m,a.bo_offset=n+2)
> > 
> > mas_store(&mas, b);
> >          0     b     2  a' 3
> > new:    |-----------|-----| (b.bo_offset=m,a.bo_offset=n+2)
> > 
> > You may want to iterate backwards and do the writes as you go until you
> > have enough room.. it really depends how you want to go about doing
> > things.
> 
> I see, again thanks for explaining.
> 
> I think I would prefer to either (1) have a generic insert() function with a
> similar behavior as when iterating through a list or (2) have a function
> dedicated to the "split" use case.
> 
> 1) When iterating the tree inserting entries at arbitrary locations should
> not influence the next iteration step. Unless the new entry really is the
> next entry, but that'd be optional. I don't see a use case for that.
> 
> 2) Similar to how you broke it down above I could imagine a function
> dedicated to the split operation. This would be similar to what you mention
> for mmap below. However, it wouldn't be a single operation.
> 
> The GPUVA manager provides sub-operations to the driver for a single mapping
> request. Those can be an arbitrary amount of unmaps (for mappings "in the
> way", as you say below), one or two remaps (for splits at the beginning or
> end or both) and exactly one map (which is the last sub-operation adding the
> newly requested mapping).
> 
> Remaps consist of the mapping to unmap and one or two new mappings to
> map. The only case where a remap sub-op has two new mappings to map is when
> the newly requested mapping is enclosed by a single existing mapping. If we
> overlap a mapping at the beginning and another one at the end this would be
> two separate remap sub-ops. Of course, between the two remaps there could be
> an arbitrary amount of unmap sub-ops.
> 
> Unmap sub-ops are simple, I just need to remove a single entry in the tree.
> drm_gpuva_iter_remove() should be fine for that.
> 
> For remap sub-ops, I would need a function that removes an entry and then
> adds one or two new entries within the range of the removed one. The next
> loop iteration should then continue at the entry (if any) after the range of
> the removed one.
> 
> However, I'm unsure how to implement this. Would I need to just do a
> mas_store() of the new entry/entries (since the nodes should already be
> allocated) and then clean up the nodes that are left with mas_erase()?
> 
> Let's say there is an entry A = [0 - 5] and I want to replace it with B = [0
> - 1] and C = [4 - 5].
> 
> Could I just store B and C and then somehow clean up the range [2 - 3]?

The most efficient way:
mas_set(&mas, 0);
// Walk down to 0
mas_walk(&mas);
// We are now pointing at A (index = 0, last = 5)
mas.last = 1;
// No walk here.
mas_store(&mas, B);
// Going to the next entry is very fast.
mas_next(&mas);
// We are now pointing at a fragment of A (index = 2, last = 5)
mas.last = 3;
// No walk here.
mas_store(&mas, NULL);
// Going to the next entry is very fast
mas_next(&mas);
// We are now pointing at a fragment of A (index = 4, last = 5)
mas_store(&mas, C);

Less efficient, but still fine:
// Walk down to 0 and store
mas_set_range(&mas, 0, 1);
mas_store(&mas, B);
// Reset to the top of the tree
mas_set_range(&mas, 4, 5);
// Walk down to 4 and store
mas_store(&mas, C);
// Reset to the top of the tree
mas_set_range(&mas, 2, 3);
// Walk down to 2 and store
mas_store(&mas, NULL);


> 
> Maybe 1) would be the most flexible way, however, if 2) can be implemented
> more efficiently that's perfectly fine too.

You can do anything you want, but the more you can use the same maple
state and save walking from the top the more efficient it will be.
Every level is another dereference down the tree..  We do have a
branching factor of 16 here, so I don't know the size of your tree and
how worth the effort it is for you.

> 
> > 
> > > 
> > > > 
> > > > > 
> > > > > __drm_gpuva_sm_map(), ideally, continues the loop searching for nodes
> > > > > starting from the end of "a" (which is 2) till the end of the requested
> > > > > mapping "b" (which is 3). Since it doesn't find any other mapping within
> > > > > this range it calls back into the driver suggesting to finally map "b".
> > > > > 
> > > > > If there would have been another mapping between 2 and 3 it would have
> > > > > called back into the driver asking to unmap this mapping beforehand.
> > > > > 
> > > > > So, it boils down to re-mapping as described at the beginning (and
> > > > > analogously at the end) of a new mapping range and removing of entries that
> > > > > are enclosed by the new mapping range.
> > > > 
> > > > I assume the unmapped area is no longer needed, and the 're-map' is
> > > > really a removal of information?  Otherwise I'd suggest searching for a
> > > > gap which fits your request.  What you have here is a lot like
> > > > "MAP_FIXED" vs top-down/bottom-up search in the VMA code, this seems to
> > > > be like your __drm_gpuva_sm_map() and the drm mm range allocator with
> > > > DRM_MM_INSERT_LOW, and DRM_MM_INSERT_HIGH.
> > > > 
> > > > Why can these split/unmappings fail?  Is it because they are still
> > > > needed?
> > > > 
> > > 
> > > You mean the check before the mas_*() operations in drm_gpuva_insert()?
> > 
> > Yes, the callbacks.
> > 
> > > 
> > > Removing entries should never fail, inserting entries should fail when the
> > > caller tries to store to an area outside of the VA space (it doesn't
> > > necessarily span the whole 64-bit space), a kernel reserved area of the VA
> > > space, is not in any pre-allocated range of the VA space (if regions are
> > > enabled) or an entry already exists at that location.
> > 
> > In the mmap code, I have to deal with splitting the start/end VMA and
> > removing any VMAs in the way.  I do this by making a 'detached' tree
> > that is dealt with later, then just overwriting the area with one
> > mas_store() operation.  Would something like that work for you?
> 
> I think this is pretty much the same thing I want to do, hence this should
> work. However, this would require more state keeping for the whole
> iteration, I guess. Drivers shouldn't know how the GPUVA manager keeps track
> of mappings internally (and hence they shouldn't know about the maple tree).
> If I could get away with something similar to what I wrote above, I think
> I'd probably not add this extra complexity, unless there are relevant
> performance reasons to do so.
> 

Well maybe you can tell your drivers that there's something in the way
and they can remove it from their end but not alter the tree.  Sort of
like a "Untracked" callback.

If you have a "This range has changed to X-Y" then you can use this
along with the "Untracked" to implement the above in a single
write.  Iterate through the area, call back to the driver to alter the
start range, then keep "Untracking" until you alter the end range.

...

Cheers,
Liam
  
Danilo Krummrich March 13, 2023, 11:46 p.m. UTC | #18
On 3/7/23 23:43, Liam R. Howlett wrote:
> * Danilo Krummrich <dakr@redhat.com> [230306 10:46]:
>> On 3/2/23 03:38, Liam R. Howlett wrote:
>>> * Danilo Krummrich <dakr@redhat.com> [230227 08:17]:
>>>
>>> ...
>>>>>> Would this variant be significantly more efficient?
>>>>>
>>>>> Well, what you are doing is walking the tree to see if there's anything
>>>>> there... then re-walking the tree to store it.  So, yes, it's much more
>>>>> efficient..  However, writing is heavier.  How much of the time is spent
>>>>> walking vs writing depends on the size of the tree, but it's rather easy
>>>>> to do this in a single walk of the tree so why wouldn't you?
>>>>
>>>> I will, I was just curious about how much of an impact it has.
>>>>
>>>>>
>>>>>>
>>>>>> Also, would this also work while already walking the tree?
>>>>>
>>>>> Yes, to an extent.  If you are at the correct location in the tree, you
>>>>> can write to that location.  If you are not in the correct location and
>>>>> try to write to the tree then things will go poorly..  In this scenario,
>>>>> we are very much walking the tree and writing to it in two steps.
>>>>>
>>>>>>
>>>>>> To remove an entry while walking the tree I have a separate function
>>>>>> drm_gpuva_iter_remove(). Would I need something similar for inserting
>>>>>> entries?
>>>>>
>>>>> I saw that.  Your remove function uses the erase operation which is
>>>>> implemented as a walk to that location and a store of a null over the
>>>>> range that is returned.  You do not need a function to insert an entry
>>>>> if the maple state is at the correct location, and that doesn't just
>>>>> mean setting mas.index/mas.last to the correct value.  There is a node &
>>>>> offset saved in the maple state that needs to be in the correct
>>>>> location.  If you store to that node then the node may be replaced, so
>>>>> other iterators that you have may become stale, but the one you used
>>>>> to execute the store operation will now point to the new node with the new
>>>>> entry.
>>>>>
>>>>>>
>>>>>> I already provided this example in a separate mail thread, but it may make
>>>>>> sense to move this to the mailing list:
>>>>>>
>>>>>> In __drm_gpuva_sm_map() we're iterating a given range of the tree, where the
>>>>>> given range is the size of the newly requested mapping. __drm_gpuva_sm_map()
>>>>>> invokes a callback for each sub-operation that needs to be taken in order to
>>>>>> fulfill this mapping request. In most cases such a callback just creates a
>>>>>> drm_gpuva_op object and stores it in a list.
>>>>>>
>>>>>> However, drivers can also implement the callback, such that they directly
>>>>>> execute this operation within the callback.
>>>>>>
>>>>>> Let's have a look at the following example:
>>>>>>
>>>>>>         0     a     2
>>>>>> old: |-----------|       (bo_offset=n)
>>>>>>
>>>>>>               1     b     3
>>>>>> req:       |-----------| (bo_offset=m)
>>>>>>
>>>>>>         0  a' 1     b     3
>>>>>> new: |-----|-----------| (a.bo_offset=n,b.bo_offset=m)
>>>>>>
>>>>>> This would result in the following operations.
>>>>>>
>>>>>> __drm_gpuva_sm_map() finds entry "a" and calls back into the driver
>>>>>> suggesting to re-map "a" with the new size. The driver removes entry "a"
>>>>>> from the tree and adds "a'"
>>>>>
>>>>> What you have here won't work.  The driver will cause your iterators
>>>>> maple state to point to memory that is freed.  You will either need to
>>>>> pass through your iterator so that the modifications can occur with that
>>>>> maple state so it remains valid, or you will need to invalidate the
>>>>> iterator on every modification by the driver.
>>>>>
>>>>> I'm sure the first idea you have will be to invalidate the iterator, but
>>>>> that is probably not the way to proceed.  Even ignoring the unclear
>>>>> locking of two maple states trying to modify the tree, this is rather
>>>>> inefficient - each invalidation means a re-walk of the tree.  You may as
>>>>> well not use an iterator in this case.
>>>>>
>>>>> Depending on how/when the lookups occur, you could still iterate over
>>>>> the tree and let the driver modify the ending of "a", but leave the tree
>>>>> alone and just store b over whatever - but the failure scenarios may
>>>>> cause you grief.
>>>>>
>>>>> If you pass the iterator through, then you can just use it to do your
>>>>> writes and keep iterating as if nothing changed.
>>>>
>>>> Passing through the iterator clearly seems to be the way to go.
>>>>
>>>> I assume that if the entry to insert isn't at the location of the iterator
>>>> (as in the following example) we can just keep walking to this location by
>>>> changing the index of the mas and calling mas_walk()?
>>>
>>> no.  You have to mas_set() to the value and walk from the top of the
>>> tree.  mas_walk() walks down, not from side to side - well, it does go
>>> forward within a node (increasing offset), but if you hit the node limit
>>> then you have gotten yourself in trouble.
>>>
>>>> This would also imply
>>>> that the "outer" tree walk continues after the entry we just inserted,
>>>> right?
>>>
>>> I don't understand the "outer" tree walk statement.
>>
>> I think I could have phrased this better. I just mean "my" iterator walking
>> each tree entry rather than an internal tree walk, as it happens in e.g.
>> mas_walk() or mas_find().
>>
>>>
>>>>
>>>>              1     a     3
>>>> old:       |-----------| (bo_offset=n)
>>>>
>>>>        0     b     2
>>>> req: |-----------|       (bo_offset=m)
>>>>
>>>>        0     b     2  a' 3
>>>> new: |-----------|-----| (b.bo_offset=m,a.bo_offset=n+2)
>>>>
>>>> Again, after finding "a", we want to remove it and insert "a'" instead.
>>>
>>> Ah, so you could walk to 0, see that it's NULL from 0 - 1, call
>>> mas_next() and get "a" from 1 - 3, write "a'" from 2 - 3:
>>>
>>>           0     1  a   2  a' 3
>>> broken: |-----|------|-----| (a is broken in this 1/2 step)
>>>
>>> mas_set_range(&mas, 0, 2); /* Resets the tree location to MAS_START */
>>> mas_store(&mas, b);
>>>           0     b     2  a' 3
>>> new:    |-----------|-----| (b.bo_offset=m,a.bo_offset=n+2)
>>>
>>>
>>> You can *probably* also get away with this:
>>>
>>> walk to 0, see that it's NULL from 0 - 1, call mas_next() and get "a"
>>> from 1 - 3, write "a'" from 2 - 3:
>>>
>>>           0     1  a   2  a' 3
>>> broken: |-----|------|-----| (a is broken in this 1/2 step)
>>>
>>> mas_prev(&mas, 0); /* Looking at broken a from 1-2.
>>> mas_store(&mas, NULL); /* NULL is expanded on write to 0-2.
>>>               0    NULL   2  a' 3
>>> broken':    |-----------|-----| (b.bo_offset=m,a.bo_offset=n+2)
>>>
>>> mas_store(&mas, b);
>>>           0     b     2  a' 3
>>> new:    |-----------|-----| (b.bo_offset=m,a.bo_offset=n+2)
>>>
>>> You may want to iterate backwards and do the writes as you go until you
>>> have enough room.. it really depends how you want to go about doing
>>> things.
>>
>> I see, again thanks for explaining.
>>
>> I think I would prefer to either (1) have a generic insert() function with a
>> similar behavior as when iterating through a list or (2) have a function
>> dedicated to the "split" use case.
>>
>> 1) When iterating the tree inserting entries at arbitrary locations should
>> not influence the next iteration step. Unless the new entry really is the
>> next entry, but that'd be optional. I don't see a use case for that.
>>
>> 2) Similar to how you broke it down above I could imagine a function
>> dedicated to the split operation. This would be similar to what you mention
>> for mmap below. However, it wouldn't be a single operation.
>>
>> The GPUVA manager provides sub-operations to the driver for a single mapping
>> request. Those can be an arbitrary amount of unmaps (for mappings "in the
>> way", as you say below), one or two remaps (for splits at the beginning or
>> end or both) and exactly one map (which is the last sub-operation adding the
>> newly requested mapping).
>>
>> Remaps consist of the mapping to unmap and one or two new mappings to
>> map. The only case where a remap sub-op has two new mappings to map is when
>> the newly requested mapping is enclosed by a single existing mapping. If we
>> overlap a mapping at the beginning and another one at the end this would be
>> two separate remap sub-ops. Of course, between the two remaps there could be
>> an arbitrary amount of unmap sub-ops.
>>
>> Unmap sub-ops are simple, I just need to remove a single entry in the tree.
>> drm_gpuva_iter_remove() should be fine for that.
>>
>> For remap sub-ops, I would need a function that removes an entry and then
>> adds one or two new entries within the range of the removed one. The next
>> loop iteration should then continue at the entry (if any) after the range of
>> the removed one.
>>
>> However, I'm unsure how to implement this. Would I need to just do a
>> mas_store() of the new entry/entries (since the nodes should already be
>> allocated) and then clean up the nodes that are left with mas_erase()?
>>
>> Let's say there is an entry A = [0 - 5] and I want to replace it with B = [0
>> - 1] and C = [4 - 5].
>>
>> Could I just store B and C and then somehow clean up the range [2 - 3]?
> 
> The most efficient way:
> mas_set(&mas, 0);
> // Walk down to 0
> mas_walk(&mas);
> // We are now pointing at A (index = 0, last = 5)
> mas.last = 1;
> // No walk here.
> mas_store(&mas, B);
> // Going to the next entry is very fast.
> mas_next(&mas)
> // We are now pointing at a fragment of A (index = 2, last = 5)
> mas.last = 3;
> // No walk here.
> mas_store(&mas, NULL);
> // Going to the next entry is very fast
> mas_next(&mas);
> // We are now pointing at a fragment of A (index = 4, last = 5)
> mas_store(&mas, C);
> 
> Less efficient, but still fine:
> // Walk down to 0 and store
> mas_set_range(&mas, 0, 1);
> mas_store(&mas, B);
> // Reset to the top of the tree
> mas_set_range(&mas, 4, 5);
> // Walk down to 4 and store
> mas_store(&mas, C);
> // Reset to the top of the tree
> mas_set_range(&mas, 2, 3);
> // Walk down to 2 and store
> mas_store(&mas, NULL);
> 
> 
>>
>> Maybe 1) would be the most flexible way, however, if 2) can be implemented
>> more efficiently that's perfectly fine too.
> 
> You can do anything you want, but the more you can use the same maple
> state and save walking from the top the more efficient it will be.
> Every level is another dereference down the tree..  We do have a
> branching factor of 16 here, so I don't know the size of your tree and
> how worth the effort it is for you.

I think it could be worth taking the first approach and providing 
functions that are tied specifically to the use cases of the GPUVA 
manager, rather than generalizing them too much and re-walking the tree 
more than necessary. I think the size of the tree can be up to a couple 
100k.

Since some operations may be executed from dma-fence signalling critical 
sections I have a use case for mas_preallocate(). I was wondering if I 
can ignore the "entry" argument of mas_preallocate() and just pass NULL, 
since it's actually never used. What's the purpose of this argument? Or 
is it a bug?

> 
>>
>>>
>>>>
>>>>>
>>>>>>
>>>>>> __drm_gpuva_sm_map(), ideally, continues the loop searching for nodes
>>>>>> starting from the end of "a" (which is 2) till the end of the requested
>>>>>> mapping "b" (which is 3). Since it doesn't find any other mapping within
>>>>>> this range it calls back into the driver suggesting to finally map "b".
>>>>>>
>>>>>> If there would have been another mapping between 2 and 3 it would have
>>>>>> called back into the driver asking to unmap this mapping beforehand.
>>>>>>
>>>>>> So, it boils down to re-mapping as described at the beginning (and
>>>>>> analogously at the end) of a new mapping range and removing of entries that
>>>>>> are enclosed by the new mapping range.
>>>>>
>>>>> I assume the unmapped area is no longer needed, and the 're-map' is
>>>>> really a removal of information?  Otherwise I'd suggest searching for a
>>>>> gap which fits your request.  What you have here is a lot like
>>>>> "MAP_FIXED" vs top-down/bottom-up search in the VMA code, this seems to
>>>>> be like your __drm_gpuva_sm_map() and the drm mm range allocator with
>>>>> DRM_MM_INSERT_LOW, and DRM_MM_INSERT_HIGH.
>>>>>
>>>>> Why can these split/unmappings fail?  Is it because they are still
>>>>> needed?
>>>>>
>>>>
>>>> You mean the check before the mas_*() operations in drm_gpuva_insert()?
>>>
>>> Yes, the callbacks.
>>>
>>>>
>>>> Removing entries should never fail, inserting entries should fail when the
>>>> caller tries to store to an area outside of the VA space (it doesn't
>>>> necessarily span the whole 64-bit space), a kernel reserved area of the VA
>>>> space, is not in any pre-allocated range of the VA space (if regions are
>>>> enabled) or an entry already exists at that location.
>>>
>>> In the mmap code, I have to deal with splitting the start/end VMA and
>>> removing any VMAs in the way.  I do this by making a 'detached' tree
>>> that is dealt with later, then just overwriting the area with one
>>> mas_store() operation.  Would something like that work for you?
>>
>> I think this is pretty much the same thing I want to do, hence this should
>> work. However, this would require more state keeping for the whole
>> iteration, I guess. Drivers shouldn't know how the GPUVA manager keeps track
>> of mappings internally (and hence they shouldn't know about the maple tree).
>> If I could get away with something similar to what I wrote above, I think
>> I'd probably not add this extra complexity, unless there are relevant
>> performance reasons to do so.
>>
> 
> Well maybe you can tell your drivers that there's something in the way
> and they can remove it from their end but not alter the tree.  Sort of
> like a "Untracked" callback.
> 
> If you have a "This range has changed to X-Y" then you can use this
> along with the "Untracked" to implement the above in a single
> write.  Iterate through the area , call back to the driver to alter the
> start range, then keep "Untracking" until you alter the end range.
> 
> ...
> 
> Cheers,
> Liam
>
  
Liam R. Howlett March 20, 2023, 7:16 p.m. UTC | #19
* Danilo Krummrich <dakr@redhat.com> [230313 19:46]:
> On 3/7/23 23:43, Liam R. Howlett wrote:
> > * Danilo Krummrich <dakr@redhat.com> [230306 10:46]:
> > > On 3/2/23 03:38, Liam R. Howlett wrote:
> > > > * Danilo Krummrich <dakr@redhat.com> [230227 08:17]:
> > > > 
> > > > ...
> > > > > > > Would this variant be significantly more efficient?
> > > > > > 
> > > > > > Well, what you are doing is walking the tree to see if there's anything
> > > > > > there... then re-walking the tree to store it.  So, yes, it's much more
> > > > > > efficient..  However, writing is heavier.  How much of the time is spent
> > > > > > walking vs writing depends on the size of the tree, but it's rather easy
> > > > > > to do this in a single walk of the tree so why wouldn't you?
> > > > > 
> > > > > I will, I was just curious about how much of an impact it has.
> > > > > 
> > > > > > 
> > > > > > > 
> > > > > > > Also, would this also work while already walking the tree?
> > > > > > 
> > > > > > Yes, to an extent.  If you are at the correct location in the tree, you
> > > > > > can write to that location.  If you are not in the correct location and
> > > > > > try to write to the tree then things will go poorly..  In this scenario,
> > > > > > we are very much walking the tree and writing to it in two steps.
> > > > > > 
> > > > > > > 
> > > > > > > To remove an entry while walking the tree I have a separate function
> > > > > > > drm_gpuva_iter_remove(). Would I need something similar for inserting
> > > > > > > entries?
> > > > > > 
> > > > > > I saw that.  Your remove function uses the erase operation which is
> > > > > > implemented as a walk to that location and a store of a null over the
> > > > > > range that is returned.  You do not need a function to insert an entry
> > > > > > if the maple state is at the correct location, and that doesn't just
> > > > > > mean setting mas.index/mas.last to the correct value.  There is a node &
> > > > > > offset saved in the maple state that needs to be in the correct
> > > > > > location.  If you store to that node then the node may be replaced, so
> > > > > > other iterators that you have may become stale, but the one you used
> > > > > > > to execute the store operation will now point to the new node with the new
> > > > > > entry.
> > > > > > 
> > > > > > > 
> > > > > > > > I already provided this example in a separate mail thread, but it may make
> > > > > > > sense to move this to the mailing list:
> > > > > > > 
> > > > > > > In __drm_gpuva_sm_map() we're iterating a given range of the tree, where the
> > > > > > > given range is the size of the newly requested mapping. __drm_gpuva_sm_map()
> > > > > > > invokes a callback for each sub-operation that needs to be taken in order to
> > > > > > > fulfill this mapping request. In most cases such a callback just creates a
> > > > > > > drm_gpuva_op object and stores it in a list.
> > > > > > > 
> > > > > > > However, drivers can also implement the callback, such that they directly
> > > > > > > execute this operation within the callback.
> > > > > > > 
> > > > > > > Let's have a look at the following example:
> > > > > > > 
> > > > > > >         0     a     2
> > > > > > > old: |-----------|       (bo_offset=n)
> > > > > > > 
> > > > > > >               1     b     3
> > > > > > > req:       |-----------| (bo_offset=m)
> > > > > > > 
> > > > > > >         0  a' 1     b     3
> > > > > > > new: |-----|-----------| (a.bo_offset=n,b.bo_offset=m)
> > > > > > > 
> > > > > > > This would result in the following operations.
> > > > > > > 
> > > > > > > __drm_gpuva_sm_map() finds entry "a" and calls back into the driver
> > > > > > > suggesting to re-map "a" with the new size. The driver removes entry "a"
> > > > > > > from the tree and adds "a'"
> > > > > > 
> > > > > > > What you have here won't work.  The driver will cause your iterator's
> > > > > > maple state to point to memory that is freed.  You will either need to
> > > > > > pass through your iterator so that the modifications can occur with that
> > > > > > maple state so it remains valid, or you will need to invalidate the
> > > > > > iterator on every modification by the driver.
> > > > > > 
> > > > > > I'm sure the first idea you have will be to invalidate the iterator, but
> > > > > > that is probably not the way to proceed.  Even ignoring the unclear
> > > > > > locking of two maple states trying to modify the tree, this is rather
> > > > > > inefficient - each invalidation means a re-walk of the tree.  You may as
> > > > > > well not use an iterator in this case.
> > > > > > 
> > > > > > Depending on how/when the lookups occur, you could still iterate over
> > > > > > the tree and let the driver modify the ending of "a", but leave the tree
> > > > > > alone and just store b over whatever - but the failure scenarios may
> > > > > > cause you grief.
> > > > > > 
> > > > > > If you pass the iterator through, then you can just use it to do your
> > > > > > writes and keep iterating as if nothing changed.
> > > > > 
> > > > > > Passing through the iterator clearly seems to be the way to go.
> > > > > 
> > > > > I assume that if the entry to insert isn't at the location of the iterator
> > > > > > (as in the following example) we can just keep walking to this location by
> > > > > changing the index of the mas and calling mas_walk()?
> > > > 
> > > > no.  You have to mas_set() to the value and walk from the top of the
> > > > tree.  mas_walk() walks down, not from side to side - well, it does go
> > > > forward within a node (increasing offset), but if you hit the node limit
> > > > then you have gotten yourself in trouble.
> > > > 
> > > > > This would also imply
> > > > > that the "outer" tree walk continues after the entry we just inserted,
> > > > > right?
> > > > 
> > > > I don't understand the "outer" tree walk statement.
> > > 
> > > I think I could have phrased this better. I just mean "my" iterator walking
> > > each tree entry rather than an internal tree walk, as it happens in e.g.
> > > mas_walk() or mas_find().
> > > 
> > > > 
> > > > > 
> > > > >              1     a     3
> > > > > old:       |-----------| (bo_offset=n)
> > > > > 
> > > > >        0     b     2
> > > > > req: |-----------|       (bo_offset=m)
> > > > > 
> > > > >        0     b     2  a' 3
> > > > > new: |-----------|-----| (b.bo_offset=m,a.bo_offset=n+2)
> > > > > 
> > > > > Again, after finding "a", we want to remove it and insert "a'" instead.
> > > > 
> > > > Ah, so you could walk to 0, see that it's NULL from 0 - 1, call
> > > > mas_next() and get "a" from 1 - 3, write "a'" from 2 - 3:
> > > > 
> > > >           0     1  a   2  a' 3
> > > > broken: |-----|------|-----| (a is broken in this 1/2 step)
> > > > 
> > > > mas_set_range(&mas, 0, 2); /* Resets the tree location to MAS_START */
> > > > mas_store(&mas, b);
> > > >           0     b     2  a' 3
> > > > new:    |-----------|-----| (b.bo_offset=m,a.bo_offset=n+2)
> > > > 
> > > > 
> > > > You can *probably* also get away with this:
> > > > 
> > > > walk to 0, see that it's NULL from 0 - 1, call mas_next() and get "a"
> > > > from 1 - 3, write "a'" from 2 - 3:
> > > > 
> > > >           0     1  a   2  a' 3
> > > > broken: |-----|------|-----| (a is broken in this 1/2 step)
> > > > 
> > > > mas_prev(&mas, 0); /* Looking at broken a from 1-2.
> > > > mas_store(&mas, NULL); /* NULL is expanded on write to 0-2.
> > > >               0    NULL   2  a' 3
> > > > broken':    |-----------|-----| (b.bo_offset=m,a.bo_offset=n+2)
> > > > 
> > > > mas_store(&mas, b);
> > > >           0     b     2  a' 3
> > > > new:    |-----------|-----| (b.bo_offset=m,a.bo_offset=n+2)
> > > > 
> > > > You may want to iterate backwards and do the writes as you go until you
> > > > have enough room.. it really depends how you want to go about doing
> > > > things.
> > > 
> > > I see, again thanks for explaining.
> > > 
> > > I think I would prefer to either (1) have generic insert() function with a
> > > similar behavior as when iterating through a list or (2) have a function
> > > dedicated to the "split" use case.
> > > 
> > > 1) When iterating the tree inserting entries at arbitrary locations should
> > > not influence the next iteration step. Unless the new entry really is the
> > > next entry, but that'd be optional. I don't see a use case for that.
> > > 
> > > 2) Similar to how you broke it down above I could imagine a function
> > > dedicated to the split operation. This would be similar to what you mention
> > > for mmap below. However, it wouldn't be a single operation.
> > > 
> > > The GPUVA manager provides sub-operations to the driver for a single mapping
> > > request. Those can be an arbitrary amount of unmaps (for mappings "in the
> > > way", as you say below), one or two remaps (for splits at the beginning or
> > > end or both) and exactly one map (which is the last sub-operation adding the
> > > newly requested mapping).
> > > 
> > > Remaps consist out of the mapping to unmap and one or two new mappings to
> > > map. The only case where a remap sub-op has two new mappings to map is when
> > > the newly requested mapping is enclosed by a single existing mapping. If we
> > > overlap a mapping at the beginning and another one at the end this would be
> > > two separate remap sub-ops. Of course, between the two remaps there could be
> > > an arbitrary amount of unmap sub-ops.
> > > 
> > > Unmap sub-ops are simple, I just need to remove a single entry in the tree.
> > > drm_gpuva_iter_remove() should be fine for that.
> > > 
> > > For remap sub-ops, I would need a function that removes an entry and then
> > > adds one or two new entries within the range of the removed one. The next
> > > > loop iteration should then continue at the entry (if any) after the range of
> > > the removed one.
> > > 
> > > However, I'm unsure how to implement this. Would I need to just do a
> > > mas_store() of the new entry/entries (since the nodes should already be
> > > allocated) and then clean up the nodes that are left with mas_erase()?
> > > 
> > > Let's say there is an entry A = [0 - 5] and I want to replace it with B = [0
> > > - 1] and C = [4 - 5].
> > > 
> > > Could I just store B and C and then somehow clean up the range [2 - 3]?
> > 
> > The most efficient way:
> > mas_set(&mas, 0);
> > // Walk down to 0
> > mas_walk(&mas);
> > // We are now pointing at A (index = 0, last = 5)
> > mas.last = 1;
> > // No walk here.
> > mas_store(&mas, B);
> > // Going to the next entry is very fast.
> > mas_next(&mas)
> > // We are now pointing at a fragment of A (index = 2, last = 5)
> > mas.last = 3;
> > // No walk here.
> > mas_store(&mas, NULL);
> > // Going to the next entry is very fast
> > mas_next(&mas);
> > // We are now pointing at a fragment of A (index = 4, last = 5)
> > mas_store(&mas, C);
> > 
> > Less efficient, but still fine:
> > // Walk down to 0 and store
> > mas_set_range(&mas, 0, 1);
> > mas_store(&mas, B);
> > // Reset to the top of the tree
> > mas_set_range(&mas, 4, 5);
> > // Walk down to 4 and store
> > mas_store(&mas, C);
> > // Reset to the top of the tree
> > mas_set_range(&mas, 2, 3);
> > // Walk down to 2 and store
> > mas_store(&mas, NULL);
> > 
> > 
> > > 
> > > Maybe 1) would be the most flexible way, however, if 2) can be implemented
> > > more efficiently that's perfectly fine too.
> > 
> > You can do anything you want, but the more you can use the same maple
> > state and save walking from the top the more efficient it will be.
> > Every level is another dereference down the tree..  We do have a
> > branching factor of 16 here, so I don't know the size of your tree and
> > how worth the effort it is for you.
> 
> I think it could be worth taking the first approach and providing functions
> that are tied specifically to the use cases of the GPUVA manager, rather
> than generalizing them too much and re-walk the tree more than necessary. I
> think the size of the tree can be up to a couple 100k.

A couple 100k VMAs?  As in 2 trees of 100k VMAs or 200k VMAs in a single
tree?  So that's 5 dereferences to walk from the root to the VMA.

> 
> Since some operations may be executed from dma-fence signalling critical
> sections I have a use case for mas_preallocate(). I was wondering if I can
> ignore the "entry" argument of mas_preallocate() and just pass NULL, since
> it's actually never used. What's the purpose of this argument? Or is it bug?

It existed to optimize the preallocations, but that functionality was
never completed.  It is slated to be dropped by a patch [1] in the
mm-unstable branch.  I am not sure it's worth doing the optimization
after the zeroing fix [2] of the maple nodes.  If you find the
preallocations are too large and causing issues, we can revisit.. but
with a 5 level tree, we will allocate 16 nodes and almost always have
extras - we get 16 nodes per page.

If you have sparse data, then I would start to get concerned after ~524K
VMAs, then we'd be looking for 2 pages.  More compact data can run up to
~1.04M before needing 2 pages.  Then again, two pages doesn't seem like
a lot for such a large task.

How sparse is your data, on average?

[1] https://lore.kernel.org/all/20230110154211.1758562-1-vernon2gm@gmail.com/T/#u
[2] https://lore.kernel.org/all/20230105160427.2988454-1-Liam.Howlett@oracle.com/ 

Thanks,
Liam

...
  

Patch

diff --git a/Documentation/gpu/drm-mm.rst b/Documentation/gpu/drm-mm.rst
index a52e6f4117d6..c9f120cfe730 100644
--- a/Documentation/gpu/drm-mm.rst
+++ b/Documentation/gpu/drm-mm.rst
@@ -466,6 +466,37 @@  DRM MM Range Allocator Function References
 .. kernel-doc:: drivers/gpu/drm/drm_mm.c
    :export:
 
+DRM GPU VA Manager
+==================
+
+Overview
+--------
+
+.. kernel-doc:: drivers/gpu/drm/drm_gpuva_mgr.c
+   :doc: Overview
+
+Split and Merge
+---------------
+
+.. kernel-doc:: drivers/gpu/drm/drm_gpuva_mgr.c
+   :doc: Split and Merge
+
+Locking
+-------
+
+.. kernel-doc:: drivers/gpu/drm/drm_gpuva_mgr.c
+   :doc: Locking
+
+
+DRM GPU VA Manager Function References
+--------------------------------------
+
+.. kernel-doc:: include/drm/drm_gpuva_mgr.h
+   :internal:
+
+.. kernel-doc:: drivers/gpu/drm/drm_gpuva_mgr.c
+   :export:
+
 DRM Buddy Allocator
 ===================
 
diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile
index d40defbb0347..4d098efffb98 100644
--- a/drivers/gpu/drm/Makefile
+++ b/drivers/gpu/drm/Makefile
@@ -45,6 +45,7 @@  drm-y := \
 	drm_vblank.o \
 	drm_vblank_work.o \
 	drm_vma_manager.o \
+	drm_gpuva_mgr.o \
 	drm_writeback.o
 drm-$(CONFIG_DRM_LEGACY) += \
 	drm_agpsupport.o \
diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
index 59a0bb5ebd85..65115fe88627 100644
--- a/drivers/gpu/drm/drm_gem.c
+++ b/drivers/gpu/drm/drm_gem.c
@@ -164,6 +164,9 @@  void drm_gem_private_object_init(struct drm_device *dev,
 	if (!obj->resv)
 		obj->resv = &obj->_resv;
 
+	if (drm_core_check_feature(dev, DRIVER_GEM_GPUVA))
+		drm_gem_gpuva_init(obj);
+
 	drm_vma_node_reset(&obj->vma_node);
 	INIT_LIST_HEAD(&obj->lru_node);
 }
diff --git a/drivers/gpu/drm/drm_gpuva_mgr.c b/drivers/gpu/drm/drm_gpuva_mgr.c
new file mode 100644
index 000000000000..19f583704562
--- /dev/null
+++ b/drivers/gpu/drm/drm_gpuva_mgr.c
@@ -0,0 +1,1704 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *     Danilo Krummrich <dakr@redhat.com>
+ *
+ */
+
+#include <drm/drm_gem.h>
+#include <drm/drm_gpuva_mgr.h>
+
+/**
+ * DOC: Overview
+ *
+ * The DRM GPU VA Manager, represented by struct drm_gpuva_manager keeps track
+ * of a GPU's virtual address (VA) space and manages the corresponding virtual
+ * mappings represented by &drm_gpuva objects. It also keeps track of the
+ * mapping's backing &drm_gem_object buffers.
+ *
+ * &drm_gem_object buffers maintain a list (and a corresponding list lock) of
+ * &drm_gpuva objects representing all existent GPU VA mappings using this
+ * &drm_gem_object as backing buffer.
+ *
+ * If the &DRM_GPUVA_MANAGER_REGIONS feature is enabled, a GPU VA mapping can
+ * only be created within a previously allocated &drm_gpuva_region, which
+ * represents a reserved portion of the GPU VA space. GPU VA mappings are not
+ * allowed to span over a &drm_gpuva_region's boundary.
+ *
+ * GPU VA regions can also be flagged as sparse, which allows drivers to create
+ * sparse mappings for a whole GPU VA region in order to support Vulkan
+ * 'Sparse Resources'.
+ *
+ * The GPU VA manager internally uses &maple_tree structures to manage the
+ * &drm_gpuva mappings and the &drm_gpuva_regions within a GPU's virtual address
+ * space.
+ *
+ * Besides the GPU VA space regions (&drm_gpuva_region) allocated by a driver
+ * the &drm_gpuva_manager contains a special region representing the portion of
+ * VA space reserved by the kernel. This node is initialized together with the
+ * GPU VA manager instance and removed when the GPU VA manager is destroyed.
+ *
+ * In a typical application drivers would embed struct drm_gpuva_manager,
+ * struct drm_gpuva_region and struct drm_gpuva within their own driver
+ * specific structures, there won't be any memory allocations of its own nor
+ * memory allocations of &drm_gpuva or &drm_gpuva_region entries.
+ */
+
+/**
+ * DOC: Split and Merge
+ *
+ * The DRM GPU VA manager also provides an algorithm implementing splitting and
+ * merging of existent GPU VA mappings with the ones that are requested to be
+ * mapped or unmapped. This feature is required by the Vulkan API to implement
+ * Vulkan 'Sparse Memory Bindings' - drivers UAPIs often refer to this as
+ * VM BIND.
+ *
+ * Drivers can call drm_gpuva_sm_map() to receive a sequence of callbacks
+ * containing map, unmap and remap operations for a given newly requested
+ * mapping. The sequence of callbacks represents the set of operations to
+ * execute in order to integrate the new mapping cleanly into the current state
+ * of the GPU VA space.
+ *
+ * Depending on how the new GPU VA mapping intersects with the existent mappings
+ * of the GPU VA space the &drm_gpuva_fn_ops callbacks contain an arbitrary
+ * amount of unmap operations, a maximum of two remap operations and a single
+ * map operation. The caller might receive no callback at all if no operation is
+ * required, e.g. if the requested mapping already exists in the exact same way.
+ *
+ * The single map operation, if existent, represents the original map operation
+ * requested by the caller. Please note that this operation might be altered
+ * comparing it with the original map operation, e.g. because it was merged with
+ * an already existent mapping. Hence, drivers must execute this map operation
+ * instead of the original one passed to drm_gpuva_sm_map().
+ *
+ * &drm_gpuva_op_unmap contains a 'keep' field, which indicates whether the
+ * &drm_gpuva to unmap is physically contiguous with the original mapping
+ * request. Optionally, if 'keep' is set, drivers may keep the actual page table
+ * entries for this &drm_gpuva, adding the missing page table entries only and
+ * update the &drm_gpuva_manager's view of things accordingly.
+ *
+ * Drivers may do the same optimization, namely delta page table updates, also
+ * for remap operations. This is possible since &drm_gpuva_op_remap consists of
+ * one unmap operation and one or two map operations, such that drivers can
+ * derive the page table update delta accordingly.
+ *
+ * Note that there can't be more than two existent mappings to split up, one at
+ * the beginning and one at the end of the new mapping, hence there is a
+ * maximum of two remap operations.
+ *
+ * Generally, the DRM GPU VA manager never merges mappings across the
+ * boundaries of &drm_gpuva_regions. This is the case since merging between
+ * GPU VA regions would result into unmap and map operations to be issued for
+ * both regions involved although the original mapping request was referred to
+ * one specific GPU VA region only. Since the other GPU VA region, the one not
+ * explicitly requested to be altered, might be in use by the GPU, we are not
+ * allowed to issue any map/unmap operations for this region.
+ *
+ * To update the &drm_gpuva_manager's view of the GPU VA space
+ * drm_gpuva_insert() and drm_gpuva_remove() should be used.
+ *
+ * Analogous to drm_gpuva_sm_map() drm_gpuva_sm_unmap() uses &drm_gpuva_fn_ops
+ * to call back into the driver in order to unmap a range of GPU VA space. The
+ * logic behind this function is way simpler though: For all existent mappings
+ * enclosed by the given range unmap operations are created. For mappings which
+ * are only partially located within the given range, remap operations are
+ * created such that those mappings are split up and re-mapped partially.
+ *
+ * The following diagram depicts the basic relationships of existent GPU VA
+ * mappings, a newly requested mapping and the resulting mappings as implemented
+ * by drm_gpuva_sm_map() - it doesn't cover any arbitrary combinations of these.
+ *
+ * 1) Requested mapping is identical, hence noop.
+ *
+ *    ::
+ *
+ *	     0     a     1
+ *	old: |-----------| (bo_offset=n)
+ *
+ *	     0     a     1
+ *	req: |-----------| (bo_offset=n)
+ *
+ *	     0     a     1
+ *	new: |-----------| (bo_offset=n)
+ *
+ *
+ * 2) Requested mapping is identical, except for the BO offset, hence replace
+ *    the mapping.
+ *
+ *    ::
+ *
+ *	     0     a     1
+ *	old: |-----------| (bo_offset=n)
+ *
+ *	     0     a     1
+ *	req: |-----------| (bo_offset=m)
+ *
+ *	     0     a     1
+ *	new: |-----------| (bo_offset=m)
+ *
+ *
+ * 3) Requested mapping is identical, except for the backing BO, hence replace
+ *    the mapping.
+ *
+ *    ::
+ *
+ *	     0     a     1
+ *	old: |-----------| (bo_offset=n)
+ *
+ *	     0     b     1
+ *	req: |-----------| (bo_offset=n)
+ *
+ *	     0     b     1
+ *	new: |-----------| (bo_offset=n)
+ *
+ *
+ * 4) Existent mapping is a left aligned subset of the requested one, hence
+ *    replace the existent one.
+ *
+ *    ::
+ *
+ *	     0  a  1
+ *	old: |-----|       (bo_offset=n)
+ *
+ *	     0     a     2
+ *	req: |-----------| (bo_offset=n)
+ *
+ *	     0     a     2
+ *	new: |-----------| (bo_offset=n)
+ *
+ *    .. note::
+ *       We expect to see the same result for a request with a different BO
+ *       and/or non-contiguous BO offset.
+ *
+ *
+ * 5) Requested mapping's range is a left aligned subset of the existent one,
+ *    but backed by a different BO. Hence, map the requested mapping and split
+ *    the existent one adjusting its BO offset.
+ *
+ *    ::
+ *
+ *	     0     a     2
+ *	old: |-----------| (bo_offset=n)
+ *
+ *	     0  b  1
+ *	req: |-----|       (bo_offset=n)
+ *
+ *	     0  b  1  a' 2
+ *	new: |-----|-----| (b.bo_offset=n, a.bo_offset=n+1)
+ *
+ *    .. note::
+ *       We expect to see the same result for a request with a different BO
+ *       and/or non-contiguous BO offset.
+ *
+ *
+ * 6) Existent mapping is a superset of the requested mapping, hence noop.
+ *
+ *    ::
+ *
+ *	     0     a     2
+ *	old: |-----------| (bo_offset=n)
+ *
+ *	     0  a  1
+ *	req: |-----|       (bo_offset=n)
+ *
+ *	     0     a     2
+ *	new: |-----------| (bo_offset=n)
+ *
+ *
+ * 7) Requested mapping's range is a right aligned subset of the existent one,
+ *    but backed by a different BO. Hence, map the requested mapping and split
+ *    the existent one, without adjusting the BO offset.
+ *
+ *    ::
+ *
+ *	     0     a     2
+ *	old: |-----------| (bo_offset=n)
+ *
+ *	           1  b  2
+ *	req:       |-----| (bo_offset=m)
+ *
+ *	     0  a  1  b  2
+ *	new: |-----|-----| (a.bo_offset=n,b.bo_offset=m)
+ *
+ *
+ * 8) Existent mapping is a superset of the requested mapping, hence noop.
+ *
+ *    ::
+ *
+ *	     0     a     2
+ *	old: |-----------| (bo_offset=n)
+ *
+ *	           1  a  2
+ *	req:       |-----| (bo_offset=n+1)
+ *
+ *	     0     a     2
+ *	new: |-----------| (bo_offset=n)
+ *
+ *
+ * 9) Existent mapping is overlapped at the end by the requested mapping backed
+ *    by a different BO. Hence, map the requested mapping and split up the
+ *    existent one, without adjusting the BO offset.
+ *
+ *    ::
+ *
+ *	     0     a     2
+ *	old: |-----------|       (bo_offset=n)
+ *
+ *	           1     b     3
+ *	req:       |-----------| (bo_offset=m)
+ *
+ *	     0  a  1     b     3
+ *	new: |-----|-----------| (a.bo_offset=n,b.bo_offset=m)
+ *
+ *
+ * 10) Existent mapping is overlapped by the requested mapping, both having the
+ *     same backing BO with a contiguous offset. Hence, merge both mappings.
+ *
+ *     ::
+ *
+ *	      0     a     2
+ *	 old: |-----------|       (bo_offset=n)
+ *
+ *	            1     a     3
+ *	 req:       |-----------| (bo_offset=n+1)
+ *
+ *	      0        a        3
+ *	 new: |-----------------| (bo_offset=n)
+ *
+ *
+ * 11) Requested mapping's range is a centered subset of the existent one
+ *     having a different backing BO. Hence, map the requested mapping and split
+ *     up the existent one in two mappings, adjusting the BO offset of the right
+ *     one accordingly.
+ *
+ *     ::
+ *
+ *	      0        a        3
+ *	 old: |-----------------| (bo_offset=n)
+ *
+ *	            1  b  2
+ *	 req:       |-----|       (bo_offset=m)
+ *
+ *	      0  a  1  b  2  a' 3
+ *	 new: |-----|-----|-----| (a.bo_offset=n,b.bo_offset=m,a'.bo_offset=n+2)
+ *
+ *
+ * 12) Requested mapping is a contiguous subset of the existent one, hence noop.
+ *
+ *     ::
+ *
+ *	      0        a        3
+ *	 old: |-----------------| (bo_offset=n)
+ *
+ *	            1  a  2
+ *	 req:       |-----|       (bo_offset=n+1)
+ *
+ *	      0        a        3
+ *	 new: |-----------------| (bo_offset=n)
+ *
+ *
+ * 13) Existent mapping is a right aligned subset of the requested one, hence
+ *     replace the existent one.
+ *
+ *     ::
+ *
+ *	            1  a  2
+ *	 old:       |-----| (bo_offset=n+1)
+ *
+ *	      0     a     2
+ *	 req: |-----------| (bo_offset=n)
+ *
+ *	      0     a     2
+ *	 new: |-----------| (bo_offset=n)
+ *
+ *     .. note::
+ *        We expect to see the same result for a request with a different bo
+ *        and/or non-contiguous bo_offset.
+ *
+ *
+ * 14) Existent mapping is a centered subset of the requested one, hence
+ *     replace the existent one.
+ *
+ *     ::
+ *
+ *	            1  a  2
+ *	 old:       |-----| (bo_offset=n+1)
+ *
+ *	      0        a       3
+ *	 req: |----------------| (bo_offset=n)
+ *
+ *	      0        a       3
+ *	 new: |----------------| (bo_offset=n)
+ *
+ *     .. note::
+ *        We expect to see the same result for a request with a different bo
+ *        and/or non-contiguous bo_offset.
+ *
+ *
+ * 15) Existent mapping is overlapped at the beginning by the requested mapping
+ *     backed by a different BO. Hence, map the requested mapping and split up
+ *     the existent one, adjusting its BO offset accordingly.
+ *
+ *     ::
+ *
+ *	            1     a     3
+ *	 old:       |-----------| (bo_offset=n)
+ *
+ *	      0     b     2
+ *	 req: |-----------|       (bo_offset=m)
+ *
+ *	      0     b     2  a' 3
+ *	 new: |-----------|-----| (b.bo_offset=m,a'.bo_offset=n+1)
+ *
+ *
+ * 16) Requested mapping fills the gap between two existent mappings all having
+ *     the same backing BO, such that all three have a contiguous BO offset.
+ *     Hence, merge all mappings.
+ *
+ *     ::
+ *
+ *	      0     a     1
+ *	 old: |-----------|                        (bo_offset=n)
+ *
+ *	                             2     a     3
+ *	 old':                       |-----------| (bo_offset=n+2)
+ *
+ *	                 1     a     2
+ *	 req:            |-----------|             (bo_offset=n+1)
+ *
+ *	                       a
+ *	 new: |----------------------------------| (bo_offset=n)
+ */
+
+/**
+ * DOC: Locking
+ *
+ * Generally, the GPU VA manager does not take care of locking itself; it is
+ * the driver's responsibility to take care of locking. Drivers might want to
+ * protect the following operations: inserting, removing and iterating
+ * &drm_gpuva and &drm_gpuva_region objects as well as generating all kinds of
+ * operations, such as split / merge or prefetch.
+ *
+ * The GPU VA manager also does not take care of the locking of the backing
+ * &drm_gem_object buffers' GPU VA lists by itself; drivers are responsible for
+ * enforcing mutual exclusion.
+ */
+
+
+/*
+ * Forward declarations: drm_gpuva_manager_init() and
+ * drm_gpuva_manager_destroy() call these helpers before their definitions
+ * further down in this file.
+ */
+static int __drm_gpuva_region_insert(struct drm_gpuva_manager *mgr,
+				     struct drm_gpuva_region *reg);
+static void __drm_gpuva_region_remove(struct drm_gpuva_region *reg);
+
+/**
+ * drm_gpuva_manager_init - initialize a &drm_gpuva_manager
+ * @mgr: pointer to the &drm_gpuva_manager to initialize
+ * @name: the name of the GPU VA space
+ * @start_offset: the start offset of the GPU VA space
+ * @range: the size of the GPU VA space
+ * @reserve_offset: the start of the kernel reserved GPU VA area
+ * @reserve_range: the size of the kernel reserved GPU VA area
+ * @ops: &drm_gpuva_fn_ops called on &drm_gpuva_sm_map / &drm_gpuva_sm_unmap
+ * @flags: the feature flags of the &drm_gpuva_manager
+ *
+ * The &drm_gpuva_manager must be initialized with this function before use.
+ *
+ * Note that @mgr must be cleared to 0 before calling this function. The given
+ * &name is expected to be managed by the surrounding driver structures.
+ *
+ * The kernel reserved area is only inserted into the region tree if
+ * @reserve_range is non-zero. Since this function cannot return an error, a
+ * failure to insert the kernel reserved area is reported through WARN().
+ */
+void
+drm_gpuva_manager_init(struct drm_gpuva_manager *mgr,
+		       const char *name,
+		       u64 start_offset, u64 range,
+		       u64 reserve_offset, u64 reserve_range,
+		       struct drm_gpuva_fn_ops *ops,
+		       enum drm_gpuva_mgr_flags flags)
+{
+	struct drm_gpuva_region *kreg = &mgr->kernel_alloc_region;
+	int ret;
+
+	mt_init_flags(&mgr->region_mt, MT_FLAGS_LOCK_NONE);
+	mt_init_flags(&mgr->va_mt, MT_FLAGS_LOCK_NONE);
+
+	mgr->mm_start = start_offset;
+	mgr->mm_range = range;
+
+	mgr->name = name ? name : "unknown";
+	mgr->ops = ops;
+	mgr->flags = flags;
+
+	memset(kreg, 0, sizeof(*kreg));
+	kreg->va.addr = reserve_offset;
+	kreg->va.range = reserve_range;
+
+	/*
+	 * __drm_gpuva_region_remove() resolves the region tree through
+	 * reg->mgr. Set the back pointer unconditionally, such that tearing
+	 * down the manager never dereferences a NULL manager, even if the
+	 * kernel reserved region was not inserted below.
+	 */
+	kreg->mgr = mgr;
+
+	/*
+	 * Without a reserved area there is nothing to track; an empty range
+	 * must not be stored, since its last index (addr + range - 1) would
+	 * precede its first index.
+	 */
+	if (!reserve_range)
+		return;
+
+	ret = __drm_gpuva_region_insert(mgr, kreg);
+	WARN(ret, "Failed to insert kernel reserved GPU VA region: %d\n", ret);
+}
+EXPORT_SYMBOL(drm_gpuva_manager_init);
+
+/**
+ * drm_gpuva_manager_destroy - cleanup a &drm_gpuva_manager
+ * @mgr: pointer to the &drm_gpuva_manager to clean up
+ *
+ * Note that it is a bug to call this function on a manager that still
+ * holds GPU VA mappings.
+ */
+void
+drm_gpuva_manager_destroy(struct drm_gpuva_manager *mgr)
+{
+	struct drm_gpuva_region *kreg = &mgr->kernel_alloc_region;
+
+	mgr->name = NULL;
+
+	/*
+	 * drm_gpuva_manager_init() ignores a failing insertion of the kernel
+	 * reserved region, in which case kreg->mgr is left NULL and
+	 * __drm_gpuva_region_remove() would dereference it. Hence, only
+	 * remove the region if it is actually present in the region tree.
+	 */
+	if (mtree_load(&mgr->region_mt, kreg->va.addr) == kreg)
+		__drm_gpuva_region_remove(kreg);
+
+	WARN(!mtree_empty(&mgr->va_mt),
+	     "GPUVA tree is not empty, potentially leaking memory.");
+	__mt_destroy(&mgr->va_mt);
+
+	WARN(!mtree_empty(&mgr->region_mt),
+	     "GPUVA region tree is not empty, potentially leaking memory.");
+	__mt_destroy(&mgr->region_mt);
+}
+EXPORT_SYMBOL(drm_gpuva_manager_destroy);
+
+/*
+ * Check whether the range [addr, addr + range) lies entirely within the VA
+ * space managed by @mgr, i.e. [mm_start, mm_start + mm_range).
+ *
+ * Callers use a false return value to reject a request with -EINVAL, hence
+ * a range that merely overlaps the managed space must not pass.
+ */
+static inline bool
+drm_gpuva_in_mm_range(struct drm_gpuva_manager *mgr, u64 addr, u64 range)
+{
+	u64 end = addr + range;
+	u64 mm_start = mgr->mm_start;
+	u64 mm_end = mm_start + mgr->mm_range;
+
+	/* A range wrapping around the u64 address space is never valid. */
+	if (end < addr)
+		return false;
+
+	return addr >= mm_start && end <= mm_end;
+}
+
+/*
+ * Check whether the range [addr, addr + range) overlaps the kernel reserved
+ * region of @mgr. An empty kernel reserved region (range of zero) can't be
+ * overlapped by anything.
+ */
+static inline bool
+drm_gpuva_in_kernel_region(struct drm_gpuva_manager *mgr, u64 addr, u64 range)
+{
+	u64 end = addr + range;
+	u64 kstart = mgr->kernel_alloc_region.va.addr;
+	u64 krange = mgr->kernel_alloc_region.va.range;
+	u64 kend = kstart + krange;
+
+	/*
+	 * Without the krange check a request spanning across kstart would be
+	 * reported as overlapping although nothing is reserved.
+	 */
+	return krange && addr < kend && kstart < end;
+}
+
+/*
+ * Look up the VA region which fully contains the range [addr, addr + range).
+ * The kernel reserved region is never returned. Returns NULL if none of the
+ * regions visited by the iterator contains the whole range.
+ */
+static struct drm_gpuva_region *
+drm_gpuva_in_region(struct drm_gpuva_manager *mgr, u64 addr, u64 range)
+{
+	u64 end = addr + range;
+	DRM_GPUVA_REGION_ITER(it, mgr);
+
+	drm_gpuva_iter_for_each_range(it, addr, end) {
+		struct drm_gpuva_region *cur = it.reg;
+		u64 cur_start = cur->va.addr;
+		u64 cur_end = cur_start + cur->va.range;
+
+		/* The region must start at or before the requested range... */
+		if (cur_start > addr)
+			continue;
+
+		/* ...and must end at or after it. */
+		if (cur_end < end)
+			continue;
+
+		if (cur == &mgr->kernel_alloc_region)
+			continue;
+
+		return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Boolean wrapper around drm_gpuva_in_region(): true if a region containing
+ * the range [addr, addr + range) was found.
+ */
+static bool
+drm_gpuva_in_any_region(struct drm_gpuva_manager *mgr, u64 addr, u64 range)
+{
+	struct drm_gpuva_region *reg = drm_gpuva_in_region(mgr, addr, range);
+
+	return reg != NULL;
+}
+
+/**
+ * drm_gpuva_iter_remove - removes the iterator's current element
+ * @it: the &drm_gpuva_iterator
+ *
+ * This removes the element the iterator currently points to by erasing it
+ * from the tree through the iterator's maple state.
+ */
+void
+drm_gpuva_iter_remove(struct drm_gpuva_iterator *it)
+{
+	mas_erase(&it->mas);
+}
+EXPORT_SYMBOL(drm_gpuva_iter_remove);
+
+/**
+ * drm_gpuva_insert - insert a &drm_gpuva
+ * @mgr: the &drm_gpuva_manager to insert the &drm_gpuva in
+ * @va: the &drm_gpuva to insert
+ *
+ * Insert a &drm_gpuva into a &drm_gpuva_manager. The address and range of the
+ * mapping are taken from @va, hence both must be set up by the caller before
+ * calling this function.
+ *
+ * The insertion is rejected with -EINVAL if the range is empty, fails the
+ * manager's VA space check, overlaps the kernel reserved region or, with
+ * %DRM_GPUVA_MANAGER_REGIONS set, is not contained in a &drm_gpuva_region.
+ * It is rejected with -EEXIST if a mapping already exists within the range.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+int
+drm_gpuva_insert(struct drm_gpuva_manager *mgr,
+		 struct drm_gpuva *va)
+{
+	u64 addr = va->va.addr;
+	u64 range = va->va.range;
+	MA_STATE(mas, &mgr->va_mt, addr, addr + range - 1);
+	struct drm_gpuva_region *reg = NULL;
+	int ret;
+
+	/*
+	 * For an empty range the last index of the maple state above
+	 * (addr + range - 1) precedes its first index; never store that.
+	 */
+	if (unlikely(!range))
+		return -EINVAL;
+
+	if (unlikely(!drm_gpuva_in_mm_range(mgr, addr, range)))
+		return -EINVAL;
+
+	if (unlikely(drm_gpuva_in_kernel_region(mgr, addr, range)))
+		return -EINVAL;
+
+	if (mgr->flags & DRM_GPUVA_MANAGER_REGIONS) {
+		reg = drm_gpuva_in_region(mgr, addr, range);
+		if (unlikely(!reg))
+			return -EINVAL;
+	}
+
+	if (unlikely(drm_gpuva_find_first(mgr, addr, range)))
+		return -EEXIST;
+
+	ret = mas_store_gfp(&mas, va, GFP_KERNEL);
+	if (unlikely(ret))
+		return ret;
+
+	/* Only link the mapping to its manager / region once it is stored. */
+	va->mgr = mgr;
+	va->region = reg;
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_gpuva_insert);
+
+/**
+ * drm_gpuva_remove - remove a &drm_gpuva
+ * @va: the &drm_gpuva to remove
+ *
+ * This removes the given &va from the underlying tree of the
+ * &drm_gpuva_manager it was inserted into (&va->mgr).
+ */
+void
+drm_gpuva_remove(struct drm_gpuva *va)
+{
+	/* Position the maple state at the mapping's start address. */
+	MA_STATE(mas, &va->mgr->va_mt, va->va.addr, 0);
+
+	mas_erase(&mas);
+}
+EXPORT_SYMBOL(drm_gpuva_remove);
+
+/**
+ * drm_gpuva_link - link a &drm_gpuva
+ * @va: the &drm_gpuva to link
+ *
+ * Appends the given &va to the GPU VA list of the &drm_gem_object backing it.
+ * Mappings without a backing &drm_gem_object are left untouched.
+ *
+ * The caller is expected to protect the GEM's GPUVA list against concurrent
+ * access.
+ */
+void
+drm_gpuva_link(struct drm_gpuva *va)
+{
+	struct drm_gem_object *obj = va->gem.obj;
+
+	if (unlikely(!obj))
+		return;
+
+	list_add_tail(&va->head, &obj->gpuva.list);
+}
+EXPORT_SYMBOL(drm_gpuva_link);
+
+/**
+ * drm_gpuva_unlink - unlink a &drm_gpuva
+ * @va: the &drm_gpuva to unlink
+ *
+ * Takes the given &va off the GPU VA list of the &drm_gem_object backing it.
+ * Mappings without a backing &drm_gem_object are left untouched.
+ *
+ * The caller is expected to protect the GEM's GPUVA list against concurrent
+ * access.
+ */
+void
+drm_gpuva_unlink(struct drm_gpuva *va)
+{
+	if (unlikely(!va->gem.obj))
+		return;
+
+	list_del_init(&va->head);
+}
+EXPORT_SYMBOL(drm_gpuva_unlink);
+
+/**
+ * drm_gpuva_find_first - find the first &drm_gpuva in the given range
+ * @mgr: the &drm_gpuva_manager to search in
+ * @addr: the start address of the range to search
+ * @range: the size of the range to search
+ *
+ * Returns: the first &drm_gpuva found within the given range, NULL if there
+ * is none
+ */
+struct drm_gpuva *
+drm_gpuva_find_first(struct drm_gpuva_manager *mgr,
+		     u64 addr, u64 range)
+{
+	/* Last address which still belongs to the searched range. */
+	u64 last = addr + range - 1;
+	MA_STATE(mas, &mgr->va_mt, addr, 0);
+	struct drm_gpuva *va;
+
+	va = mas_find(&mas, last);
+
+	return va;
+}
+EXPORT_SYMBOL(drm_gpuva_find_first);
+
+/**
+ * drm_gpuva_find - find a &drm_gpuva
+ * @mgr: the &drm_gpuva_manager to search in
+ * @addr: the &drm_gpuvas address
+ * @range: the &drm_gpuvas range
+ *
+ * Returns: the &drm_gpuva starting exactly at @addr and spanning exactly
+ * @range, NULL if there is no such mapping
+ */
+struct drm_gpuva *
+drm_gpuva_find(struct drm_gpuva_manager *mgr,
+	       u64 addr, u64 range)
+{
+	struct drm_gpuva *va;
+
+	va = drm_gpuva_find_first(mgr, addr, range);
+	if (!va)
+		return NULL;
+
+	/*
+	 * drm_gpuva_find_first() returns any mapping found within the range;
+	 * a mapping of the same size, but with a different start address, is
+	 * not the mapping the caller asked for.
+	 */
+	if (va->va.addr != addr || va->va.range != range)
+		return NULL;
+
+	return va;
+}
+EXPORT_SYMBOL(drm_gpuva_find);
+
+/**
+ * drm_gpuva_find_prev - find the &drm_gpuva before the given address
+ * @mgr: the &drm_gpuva_manager to search in
+ * @start: the given GPU VA's start address
+ *
+ * Find the &drm_gpuva directly adjacent to, and located before, the GPU VA
+ * with the given &start address.
+ *
+ * Note that no mapping is returned if there is any free space between the
+ * GPU VA mappings.
+ *
+ * Returns: a pointer to the found &drm_gpuva or NULL if none was found
+ */
+struct drm_gpuva *
+drm_gpuva_find_prev(struct drm_gpuva_manager *mgr, u64 start)
+{
+	u64 mm_start = mgr->mm_start;
+	u64 mm_end = mm_start + mgr->mm_range;
+	MA_STATE(mas, &mgr->va_mt, start, 0);
+
+	/* Nothing can precede the very beginning of the VA space. */
+	if (start <= mm_start)
+		return NULL;
+
+	if (start > mm_end)
+		return NULL;
+
+	return mas_prev(&mas, start - 1);
+}
+EXPORT_SYMBOL(drm_gpuva_find_prev);
+
+/**
+ * drm_gpuva_find_next - find the &drm_gpuva after the given address
+ * @mgr: the &drm_gpuva_manager to search in
+ * @end: the given GPU VA's end address
+ *
+ * Find the &drm_gpuva directly adjacent to, and located after, the GPU VA
+ * with the given &end address.
+ *
+ * Note that no mapping is returned if there is any free space between the
+ * GPU VA mappings.
+ *
+ * Returns: a pointer to the found &drm_gpuva or NULL if none was found
+ */
+struct drm_gpuva *
+drm_gpuva_find_next(struct drm_gpuva_manager *mgr, u64 end)
+{
+	u64 mm_start = mgr->mm_start;
+	u64 mm_end = mm_start + mgr->mm_range;
+	MA_STATE(mas, &mgr->va_mt, end - 1, 0);
+
+	if (end < mm_start)
+		return NULL;
+
+	/* Nothing can follow the very end of the VA space. */
+	if (end >= mm_end)
+		return NULL;
+
+	return mas_next(&mas, end);
+}
+EXPORT_SYMBOL(drm_gpuva_find_next);
+
+/*
+ * Store @reg in the region tree of @mgr, using the address and range set up
+ * in @reg by the caller.
+ *
+ * Returns 0 on success, -EINVAL if the range is empty or fails the manager's
+ * VA space check, -EEXIST if the range collides with a region already present
+ * in the tree, or the error returned by mas_store_gfp().
+ */
+static int
+__drm_gpuva_region_insert(struct drm_gpuva_manager *mgr,
+			  struct drm_gpuva_region *reg)
+{
+	u64 addr = reg->va.addr;
+	u64 range = reg->va.range;
+	MA_STATE(mas, &mgr->region_mt, addr, addr + range - 1);
+	int ret;
+
+	/*
+	 * For an empty range the last index of the maple state above
+	 * (addr + range - 1) precedes its first index; never store that.
+	 */
+	if (unlikely(!range))
+		return -EINVAL;
+
+	if (unlikely(!drm_gpuva_in_mm_range(mgr, addr, range)))
+		return -EINVAL;
+
+	/*
+	 * Storing over an occupied range would silently replace (parts of)
+	 * the regions already stored there, while their owners still hold
+	 * references to them. Reject collisions like drm_gpuva_insert() does.
+	 */
+	if (unlikely(drm_gpuva_region_find_first(mgr, addr, range)))
+		return -EEXIST;
+
+	ret = mas_store_gfp(&mas, reg, GFP_KERNEL);
+	if (unlikely(ret))
+		return ret;
+
+	reg->mgr = mgr;
+
+	return 0;
+}
+
+/**
+ * drm_gpuva_region_insert - insert a &drm_gpuva_region
+ * @mgr: the &drm_gpuva_manager to insert the &drm_gpuva_region in
+ * @reg: the &drm_gpuva_region to insert
+ *
+ * Insert a &drm_gpuva_region into a &drm_gpuva_manager. The address and range
+ * of the region are taken from @reg.
+ *
+ * Fails with -EINVAL if the &drm_gpuva_manager was not initialized with
+ * %DRM_GPUVA_MANAGER_REGIONS.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+int
+drm_gpuva_region_insert(struct drm_gpuva_manager *mgr,
+			struct drm_gpuva_region *reg)
+{
+	if (unlikely(!(mgr->flags & DRM_GPUVA_MANAGER_REGIONS)))
+		return -EINVAL;
+
+	return __drm_gpuva_region_insert(mgr, reg);
+}
+EXPORT_SYMBOL(drm_gpuva_region_insert);
+
+/*
+ * Erase @reg from the region tree of the manager it is linked to (reg->mgr).
+ * No checks are performed here; reg->mgr must be valid.
+ */
+static void
+__drm_gpuva_region_remove(struct drm_gpuva_region *reg)
+{
+	struct drm_gpuva_manager *mgr = reg->mgr;
+	/* Position the maple state at the region's start address. */
+	MA_STATE(mas, &mgr->region_mt, reg->va.addr, 0);
+
+	mas_erase(&mas);
+}
+
+/**
+ * drm_gpuva_region_remove - remove a &drm_gpuva_region
+ * @reg: the &drm_gpuva_region to remove
+ *
+ * This removes the given &reg from the underlying tree. Nothing is done if
+ * the manager was not set up with %DRM_GPUVA_MANAGER_REGIONS or if &reg is
+ * the kernel reserved region; the latter additionally triggers a warning.
+ * Removing a region which is not empty triggers a warning as well, but the
+ * region is removed nevertheless.
+ */
+void
+drm_gpuva_region_remove(struct drm_gpuva_region *reg)
+{
+	struct drm_gpuva_manager *mgr = reg->mgr;
+	bool regions_enabled = mgr->flags & DRM_GPUVA_MANAGER_REGIONS;
+
+	if (unlikely(!regions_enabled))
+		return;
+
+	if (WARN(reg == &mgr->kernel_alloc_region,
+		 "Can't destroy kernel reserved region.\n"))
+		return;
+
+	WARN(!drm_gpuva_region_empty(reg),
+	     "GPU VA region should be empty on destroy.\n");
+
+	__drm_gpuva_region_remove(reg);
+}
+EXPORT_SYMBOL(drm_gpuva_region_remove);
+
+/**
+ * drm_gpuva_region_empty - indicate whether a &drm_gpuva_region is empty
+ * @reg: the &drm_gpuva_region to check
+ *
+ * Returns: true if no &drm_gpuva is found within the range of the
+ * &drm_gpuva_region, false otherwise
+ */
+bool
+drm_gpuva_region_empty(struct drm_gpuva_region *reg)
+{
+	u64 start = reg->va.addr;
+	u64 end = start + reg->va.range;
+	DRM_GPUVA_ITER(it, reg->mgr);
+
+	drm_gpuva_iter_for_each_range(it, start, end) {
+		/* A single mapping is enough to call the region non-empty. */
+		return false;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL(drm_gpuva_region_empty);
+
+/**
+ * drm_gpuva_region_find_first - find the first &drm_gpuva_region in the given
+ * range
+ * @mgr: the &drm_gpuva_manager to search in
+ * @addr: the start address of the range to search
+ * @range: the size of the range to search
+ *
+ * Returns: the first &drm_gpuva_region found within the given range, NULL if
+ * there is none
+ */
+struct drm_gpuva_region *
+drm_gpuva_region_find_first(struct drm_gpuva_manager *mgr,
+			    u64 addr, u64 range)
+{
+	/* Last address which still belongs to the searched range. */
+	u64 last = addr + range - 1;
+	MA_STATE(mas, &mgr->region_mt, addr, 0);
+	struct drm_gpuva_region *reg;
+
+	reg = mas_find(&mas, last);
+
+	return reg;
+}
+EXPORT_SYMBOL(drm_gpuva_region_find_first);
+
+/**
+ * drm_gpuva_region_find - find a &drm_gpuva_region
+ * @mgr: the &drm_gpuva_manager to search in
+ * @addr: the &drm_gpuva_regions address
+ * @range: the &drm_gpuva_regions range
+ *
+ * Returns: the &drm_gpuva_region starting exactly at @addr and spanning
+ * exactly @range, NULL if there is no such region
+ */
+struct drm_gpuva_region *
+drm_gpuva_region_find(struct drm_gpuva_manager *mgr,
+		      u64 addr, u64 range)
+{
+	struct drm_gpuva_region *reg;
+
+	reg = drm_gpuva_region_find_first(mgr, addr, range);
+	if (!reg)
+		return NULL;
+
+	/*
+	 * drm_gpuva_region_find_first() returns any region found within the
+	 * range; a region of the same size, but with a different start
+	 * address, is not the region the caller asked for.
+	 */
+	if (reg->va.addr != addr || reg->va.range != range)
+		return NULL;
+
+	return reg;
+}
+EXPORT_SYMBOL(drm_gpuva_region_find);
+
+/*
+ * Build a map operation on the stack from the given VA range and GEM backing
+ * and pass it to the @step callback. Returns whatever @step returns.
+ */
+static int
+op_map_cb(int (*step)(struct drm_gpuva_op *, void *),
+	  void *priv,
+	  u64 addr, u64 range,
+	  struct drm_gem_object *obj, u64 offset)
+{
+	struct drm_gpuva_op op = {};
+	struct drm_gpuva_op_map *m = &op.map;
+
+	op.op = DRM_GPUVA_OP_MAP;
+
+	m->va.addr = addr;
+	m->va.range = range;
+
+	m->gem.obj = obj;
+	m->gem.offset = offset;
+
+	return step(&op, priv);
+}
+
+/*
+ * Build a remap operation on the stack, referencing the given @prev, @next
+ * and @unmap descriptions, and pass it to the @step callback. Returns
+ * whatever @step returns.
+ */
+static int
+op_remap_cb(int (*step)(struct drm_gpuva_op *, void *),
+	    void *priv,
+	    struct drm_gpuva_op_map *prev,
+	    struct drm_gpuva_op_map *next,
+	    struct drm_gpuva_op_unmap *unmap)
+{
+	struct drm_gpuva_op op = {};
+
+	op.op = DRM_GPUVA_OP_REMAP;
+	op.remap.prev = prev;
+	op.remap.next = next;
+	op.remap.unmap = unmap;
+
+	return step(&op, priv);
+}
+
+/*
+ * Build an unmap operation for @va on the stack and pass it to the @step
+ * callback; @merge is handed to the callback as the operation's keep flag.
+ * Returns whatever @step returns.
+ */
+static int
+op_unmap_cb(int (*step)(struct drm_gpuva_op *, void *),
+	    void *priv,
+	    struct drm_gpuva *va, bool merge)
+{
+	struct drm_gpuva_op op = {};
+	struct drm_gpuva_op_unmap *u = &op.unmap;
+
+	op.op = DRM_GPUVA_OP_UNMAP;
+
+	u->va = va;
+	u->keep = merge;
+
+	return step(&op, priv);
+}
+
+/* Mappings without a backing GEM object are never merge candidates. */
+static inline bool
+gpuva_should_merge(struct drm_gpuva *va)
+{
+	return va->gem.obj != NULL;
+}
+
+/*
+ * Generate the split / merge steps for a new mapping request.
+ *
+ * All existing mappings found within the requested range are visited and,
+ * through ops->sm_map_step, the unmap / remap steps required to make room for
+ * the request are emitted. Afterwards the mappings adjacent to the request are
+ * checked for a possible merge, and finally a single map step for the
+ * (possibly enlarged) request is emitted.
+ *
+ * No step is emitted for the request itself if an existing mapping already
+ * covers it with the same GEM object and a matching offset (the "goto done"
+ * paths below).
+ *
+ * Returns 0 on success, -EINVAL if the request is rejected by the range
+ * checks, or the first non-zero value returned by the step callback.
+ */
+static int
+__drm_gpuva_sm_map(struct drm_gpuva_manager *mgr,
+		   struct drm_gpuva_fn_ops *ops, void *priv,
+		   u64 req_addr, u64 req_range,
+		   struct drm_gem_object *req_obj, u64 req_offset)
+{
+	DRM_GPUVA_ITER(it, mgr);
+	int (*step)(struct drm_gpuva_op *, void *);
+	struct drm_gpuva *va, *prev = NULL;
+	u64 req_end = req_addr + req_range;
+	bool skip_pmerge = false, skip_nmerge = false;
+	int ret;
+
+	step = ops->sm_map_step;
+
+	if (unlikely(!drm_gpuva_in_mm_range(mgr, req_addr, req_range)))
+		return -EINVAL;
+
+	if (unlikely(drm_gpuva_in_kernel_region(mgr, req_addr, req_range)))
+		return -EINVAL;
+
+	if ((mgr->flags & DRM_GPUVA_MANAGER_REGIONS) &&
+	    !drm_gpuva_in_any_region(mgr, req_addr, req_range))
+		return -EINVAL;
+
+	drm_gpuva_iter_for_each_range(it, req_addr, req_end) {
+		/* Shadows the function scope va, which is only used after
+		 * this loop.
+		 */
+		struct drm_gpuva *va = it.va;
+		struct drm_gem_object *obj = va->gem.obj;
+		u64 offset = va->gem.offset;
+		u64 addr = va->va.addr;
+		u64 range = va->va.range;
+		u64 end = addr + range;
+		bool merge = gpuva_should_merge(va);
+
+		/* Generally, we want to skip merging with potential mappings
+		 * left and right of the requested one when we found a
+		 * collision, since merging happens in this loop already.
+		 *
+		 * However, there is one exception when the requested mapping
+		 * spans into a free VM area. If this is the case we might
+		 * still hit the boundary of another mapping before and/or
+		 * after the free VM area.
+		 */
+		skip_pmerge = true;
+		skip_nmerge = true;
+
+		/* The existing mapping starts exactly at the requested
+		 * address.
+		 */
+		if (addr == req_addr) {
+			merge &= obj == req_obj &&
+				 offset == req_offset;
+
+			if (end == req_end) {
+				if (merge)
+					goto done;
+
+				ret = op_unmap_cb(step, priv, va, false);
+				if (ret)
+					return ret;
+				break;
+			}
+
+			if (end < req_end) {
+				skip_nmerge = false;
+				ret = op_unmap_cb(step, priv, va, merge);
+				if (ret)
+					return ret;
+				goto next;
+			}
+
+			if (end > req_end) {
+				/* n describes the part of the existing mapping
+				 * located behind the end of the request.
+				 */
+				struct drm_gpuva_op_map n = {
+					.va.addr = req_end,
+					.va.range = range - req_range,
+					.gem.obj = obj,
+					.gem.offset = offset + req_range,
+				};
+				struct drm_gpuva_op_unmap u = { .va = va };
+
+				if (merge)
+					goto done;
+
+				ret = op_remap_cb(step, priv, NULL, &n, &u);
+				if (ret)
+					return ret;
+				break;
+			}
+		/* The existing mapping starts before the requested address; p
+		 * describes the part located in front of the request.
+		 */
+		} else if (addr < req_addr) {
+			u64 ls_range = req_addr - addr;
+			struct drm_gpuva_op_map p = {
+				.va.addr = addr,
+				.va.range = ls_range,
+				.gem.obj = obj,
+				.gem.offset = offset,
+			};
+			struct drm_gpuva_op_unmap u = { .va = va };
+
+			merge &= obj == req_obj &&
+				 offset + ls_range == req_offset;
+
+			if (end == req_end) {
+				if (merge)
+					goto done;
+
+				ret = op_remap_cb(step, priv, &p, NULL, &u);
+				if (ret)
+					return ret;
+				break;
+			}
+
+			if (end < req_end) {
+				u64 new_addr = addr;
+				u64 new_range = req_range + ls_range;
+				u64 new_offset = offset;
+
+				/* We validated that the requested mapping is
+				 * within a single VA region already.
+				 * Since it overlaps the current mapping (which
+				 * can't cross a VA region boundary) we can be
+				 * sure that we're still within the boundaries
+				 * of the same VA region after merging.
+				 */
+				if (merge) {
+					/* Extend the request to the left, such
+					 * that it starts where the existing
+					 * mapping starts.
+					 */
+					req_offset = new_offset;
+					req_addr = new_addr;
+					req_range = new_range;
+					ret = op_unmap_cb(step, priv, va, true);
+					if (ret)
+						return ret;
+					goto next;
+				}
+
+				ret = op_remap_cb(step, priv, &p, NULL, &u);
+				if (ret)
+					return ret;
+				goto next;
+			}
+
+			if (end > req_end) {
+				/* n describes the part of the existing mapping
+				 * located behind the end of the request.
+				 */
+				struct drm_gpuva_op_map n = {
+					.va.addr = req_end,
+					.va.range = end - req_end,
+					.gem.obj = obj,
+					.gem.offset = offset + ls_range +
+						      req_range,
+				};
+
+				if (merge)
+					goto done;
+
+				ret = op_remap_cb(step, priv, &p, &n, &u);
+				if (ret)
+					return ret;
+				break;
+			}
+		/* The existing mapping starts behind the requested address. */
+		} else if (addr > req_addr) {
+			merge &= obj == req_obj &&
+				 offset == req_offset +
+					   (addr - req_addr);
+
+			/* No mapping was visited before this one, hence the
+			 * request starts in a free area and might still touch
+			 * a mapping in front of it.
+			 */
+			if (!prev)
+				skip_pmerge = false;
+
+			if (end == req_end) {
+				ret = op_unmap_cb(step, priv, va, merge);
+				if (ret)
+					return ret;
+				break;
+			}
+
+			if (end < req_end) {
+				skip_nmerge = false;
+				ret = op_unmap_cb(step, priv, va, merge);
+				if (ret)
+					return ret;
+				goto next;
+			}
+
+			if (end > req_end) {
+				/* n describes the part of the existing mapping
+				 * located behind the end of the request.
+				 */
+				struct drm_gpuva_op_map n = {
+					.va.addr = req_end,
+					.va.range = end - req_end,
+					.gem.obj = obj,
+					.gem.offset = offset + req_end - addr,
+				};
+				struct drm_gpuva_op_unmap u = { .va = va };
+				u64 new_end = end;
+				u64 new_range = new_end - req_addr;
+
+				/* We validated that the requested mapping is
+				 * within a single VA region already.
+				 * Since it overlaps the current mapping (which
+				 * can't cross a VA region boundary) we can be
+				 * sure that we're still within the boundaries
+				 * of the same VA region after merging.
+				 */
+				if (merge) {
+					/* Extend the request to the right, such
+					 * that it ends where the existing
+					 * mapping ends.
+					 */
+					req_end = new_end;
+					req_range = new_range;
+					ret = op_unmap_cb(step, priv, va, true);
+					if (ret)
+						return ret;
+					break;
+				}
+
+				ret = op_remap_cb(step, priv, NULL, &n, &u);
+				if (ret)
+					return ret;
+				break;
+			}
+		}
+next:
+		prev = va;
+	}
+
+	/* Unless skipped, try to merge the request with the mapping adjacent
+	 * to its start address.
+	 */
+	va = skip_pmerge ? NULL : drm_gpuva_find_prev(mgr, req_addr);
+	if (va) {
+		struct drm_gem_object *obj = va->gem.obj;
+		u64 offset = va->gem.offset;
+		u64 addr = va->va.addr;
+		u64 range = va->va.range;
+		u64 new_offset = offset;
+		u64 new_addr = addr;
+		u64 new_range = req_range + range;
+		bool merge = gpuva_should_merge(va) &&
+			     obj == req_obj &&
+			     offset + range == req_offset;
+
+		/* The merged mapping must still fit into a single region. */
+		if (mgr->flags & DRM_GPUVA_MANAGER_REGIONS)
+			merge &= drm_gpuva_in_any_region(mgr, new_addr,
+							 new_range);
+
+		if (merge) {
+			ret = op_unmap_cb(step, priv, va, true);
+			if (ret)
+				return ret;
+
+			req_offset = new_offset;
+			req_addr = new_addr;
+			req_range = new_range;
+		}
+	}
+
+	/* Unless skipped, try to merge the request with the mapping adjacent
+	 * to its end address.
+	 */
+	va = skip_nmerge ? NULL : drm_gpuva_find_next(mgr, req_end);
+	if (va) {
+		struct drm_gem_object *obj = va->gem.obj;
+		u64 offset = va->gem.offset;
+		u64 addr = va->va.addr;
+		u64 range = va->va.range;
+		u64 end = addr + range;
+		u64 new_range = req_range + range;
+		u64 new_end = end;
+		bool merge = gpuva_should_merge(va) &&
+			     obj == req_obj &&
+			     offset == req_offset + req_range;
+
+		/* The merged mapping must still fit into a single region. */
+		if (mgr->flags & DRM_GPUVA_MANAGER_REGIONS)
+			merge &= drm_gpuva_in_any_region(mgr, req_addr,
+							 new_range);
+
+		if (merge) {
+			ret = op_unmap_cb(step, priv, va, true);
+			if (ret)
+				return ret;
+
+			req_range = new_range;
+			req_end = new_end;
+		}
+	}
+
+	/* Finally, emit the map step for the (possibly enlarged) request. */
+	ret = op_map_cb(step, priv,
+			req_addr, req_range,
+			req_obj, req_offset);
+	if (ret)
+		return ret;
+
+done:
+	return 0;
+}
+
+/*
+ * Generate the steps to unmap the range [req_addr, req_addr + req_range).
+ *
+ * For every existing mapping found within the range ops->sm_unmap_step is
+ * called with either an unmap operation, if the mapping lies entirely within
+ * the range, or a remap operation, if a part of the mapping in front of
+ * and / or behind the range must be preserved.
+ *
+ * Returns 0 on success or the first non-zero value returned by the step
+ * callback.
+ */
+static int
+__drm_gpuva_sm_unmap(struct drm_gpuva_manager *mgr,
+		     struct drm_gpuva_fn_ops *ops, void *priv,
+		     u64 req_addr, u64 req_range)
+{
+	DRM_GPUVA_ITER(it, mgr);
+	int (*step)(struct drm_gpuva_op *, void *);
+	u64 req_end = req_addr + req_range;
+	int ret;
+
+	step = ops->sm_unmap_step;
+
+	drm_gpuva_iter_for_each_range(it, req_addr, req_end) {
+		struct drm_gpuva *va = it.va;
+		struct drm_gpuva_op_map prev = {}, next = {};
+		bool prev_split = false, next_split = false;
+		struct drm_gem_object *obj = va->gem.obj;
+		u64 offset = va->gem.offset;
+		u64 addr = va->va.addr;
+		u64 range = va->va.range;
+		u64 end = addr + range;
+
+		/* The part in front of the unmap range must be preserved. */
+		if (addr < req_addr) {
+			prev.va.addr = addr;
+			prev.va.range = req_addr - addr;
+			prev.gem.obj = obj;
+			prev.gem.offset = offset;
+
+			prev_split = true;
+		}
+
+		/* The part behind the unmap range must be preserved. */
+		if (end > req_end) {
+			next.va.addr = req_end;
+			next.va.range = end - req_end;
+			next.gem.obj = obj;
+			next.gem.offset = offset + (req_end - addr);
+
+			next_split = true;
+		}
+
+		if (prev_split || next_split) {
+			struct drm_gpuva_op_unmap unmap = { .va = va };
+
+			/*
+			 * Only pass the parts which actually exist. Like in
+			 * __drm_gpuva_sm_map(), a missing part is indicated by
+			 * NULL rather than by a zeroed mapping description.
+			 */
+			ret = op_remap_cb(step, priv,
+					  prev_split ? &prev : NULL,
+					  next_split ? &next : NULL,
+					  &unmap);
+			if (ret)
+				return ret;
+		} else {
+			ret = op_unmap_cb(step, priv, va, false);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * drm_gpuva_sm_map - creates the &drm_gpuva_op split/merge steps
+ * @mgr: the &drm_gpuva_manager representing the GPU VA space
+ * @priv: pointer to a driver private data structure
+ * @req_addr: the start address of the new mapping
+ * @req_range: the range of the new mapping
+ * @req_obj: the &drm_gem_object to map
+ * @req_offset: the offset within the &drm_gem_object
+ *
+ * This function iterates the given range of the GPU VA space. It utilizes the
+ * &drm_gpuva_fn_ops to call back into the driver providing the split and merge
+ * steps.
+ *
+ * Drivers may use these callbacks to update the GPU VA space right away within
+ * the callback. In case the driver decides to copy and store the operations for
+ * later processing neither this function nor &drm_gpuva_sm_unmap is allowed to
+ * be called before the &drm_gpuva_manager's view of the GPU VA space was
+ * updated with the previous set of operations. To update the
+ * &drm_gpuva_manager's view of the GPU VA space drm_gpuva_insert(),
+ * drm_gpuva_destroy_locked() and/or drm_gpuva_destroy_unlocked() should be
+ * used.
+ *
+ * A sequence of callbacks can contain map, unmap and remap operations, but
+ * the sequence of callbacks might also be empty if no operation is required,
+ * e.g. if the requested mapping already exists in the exact same way.
+ *
+ * There can be an arbitrary amount of unmap operations, a maximum of two remap
+ * operations and a single map operation. The latter one, if existent,
+ * represents the original map operation requested by the caller. Please note
+ * that the map operation might have been modified, e.g. if it was merged with
+ * an existent mapping.
+ *
+ * Returns: 0 on success or a negative error code; -EINVAL if the
+ * &drm_gpuva_manager does not provide a &drm_gpuva_fn_ops.sm_map_step callback
+ */
+int
+drm_gpuva_sm_map(struct drm_gpuva_manager *mgr, void *priv,
+		 u64 req_addr, u64 req_range,
+		 struct drm_gem_object *req_obj, u64 req_offset)
+{
+	struct drm_gpuva_fn_ops *ops = mgr->ops;
+
+	if (ops && ops->sm_map_step)
+		return __drm_gpuva_sm_map(mgr, ops, priv,
+					  req_addr, req_range,
+					  req_obj, req_offset);
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL(drm_gpuva_sm_map);
+
+/**
+ * drm_gpuva_sm_unmap - creates the &drm_gpuva_ops to split on unmap
+ * @mgr: the &drm_gpuva_manager representing the GPU VA space
+ * @priv: pointer to a driver private data structure
+ * @req_addr: the start address of the range to unmap
+ * @req_range: the range of the mappings to unmap
+ *
+ * This function iterates the given range of the GPU VA space. It utilizes the
+ * &drm_gpuva_fn_ops of the &drm_gpuva_manager to call back into the driver
+ * providing the operations to unmap and, if required, split existent mappings.
+ *
+ * Drivers may use these callbacks to update the GPU VA space right away within
+ * the callback. In case the driver decides to copy and store the operations for
+ * later processing neither this function nor &drm_gpuva_sm_map is allowed to be
+ * called before the &drm_gpuva_manager's view of the GPU VA space was updated
+ * with the previous set of operations. To update the &drm_gpuva_manager's view
+ * of the GPU VA space drm_gpuva_insert(), drm_gpuva_destroy_locked() and/or
+ * drm_gpuva_destroy_unlocked() should be used.
+ *
+ * A sequence of callbacks can contain unmap and remap operations, depending on
+ * whether there are actual overlapping mappings to split.
+ *
+ * There can be an arbitrary amount of unmap operations and a maximum of two
+ * remap operations.
+ *
+ * Returns: 0 on success or a negative error code; -EINVAL if the
+ * &drm_gpuva_manager does not provide a &drm_gpuva_fn_ops.sm_unmap_step
+ * callback
+ */
+int
+drm_gpuva_sm_unmap(struct drm_gpuva_manager *mgr, void *priv,
+		   u64 req_addr, u64 req_range)
+{
+	struct drm_gpuva_fn_ops *ops = mgr->ops;
+
+	if (ops && ops->sm_unmap_step)
+		return __drm_gpuva_sm_unmap(mgr, ops, priv,
+					    req_addr, req_range);
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL(drm_gpuva_sm_unmap);
+
+/*
+ * Allocate a &drm_gpuva_op, preferring the driver's op_alloc callback, if the
+ * manager provides one, over a zeroed kernel allocation. Returns NULL on
+ * failure.
+ */
+static struct drm_gpuva_op *
+gpuva_op_alloc(struct drm_gpuva_manager *mgr)
+{
+	struct drm_gpuva_fn_ops *fn = mgr->ops;
+
+	if (!fn || !fn->op_alloc)
+		return kzalloc(sizeof(struct drm_gpuva_op), GFP_KERNEL);
+
+	return fn->op_alloc();
+}
+
+/*
+ * Free a &drm_gpuva_op, preferring the driver's op_free callback, if the
+ * manager provides one, over kfree().
+ */
+static void
+gpuva_op_free(struct drm_gpuva_manager *mgr,
+	      struct drm_gpuva_op *op)
+{
+	struct drm_gpuva_fn_ops *fn = mgr->ops;
+
+	if (!fn || !fn->op_free) {
+		kfree(op);
+		return;
+	}
+
+	fn->op_free(op);
+}
+
+/*
+ * Step callback used to collect operations in a list.
+ *
+ * @priv points to a structure holding the &drm_gpuva_manager and the
+ * &drm_gpuva_ops the operation is appended to. The operation handed in by the
+ * caller is copied into a newly allocated &drm_gpuva_op; for remap operations
+ * the referenced unmap, prev and next descriptions are duplicated as well.
+ *
+ * Returns 0 on success, -ENOMEM if any of the allocations failed.
+ */
+int drm_gpuva_sm_step(struct drm_gpuva_op *__op, void *priv)
+{
+	struct {
+		struct drm_gpuva_manager *mgr;
+		struct drm_gpuva_ops *ops;
+	} *args = priv;
+	struct drm_gpuva_manager *mgr = args->mgr;
+	struct drm_gpuva_ops *ops = args->ops;
+	struct drm_gpuva_op *op;
+
+	op = gpuva_op_alloc(mgr);
+	if (unlikely(!op))
+		goto err;
+
+	memcpy(op, __op, sizeof(*op));
+
+	if (op->op == DRM_GPUVA_OP_REMAP) {
+		struct drm_gpuva_op_remap *__r = &__op->remap;
+		struct drm_gpuva_op_remap *r = &op->remap;
+
+		/*
+		 * The memcpy() above only copied the pointers, which refer to
+		 * memory owned by the caller; duplicate what they point to.
+		 */
+		r->unmap = kmemdup(__r->unmap, sizeof(*r->unmap),
+				   GFP_KERNEL);
+		if (unlikely(!r->unmap))
+			goto err_free_op;
+
+		if (__r->prev) {
+			r->prev = kmemdup(__r->prev, sizeof(*r->prev),
+					  GFP_KERNEL);
+			if (unlikely(!r->prev))
+				goto err_free_unmap;
+		}
+
+		if (__r->next) {
+			r->next = kmemdup(__r->next, sizeof(*r->next),
+					  GFP_KERNEL);
+			if (unlikely(!r->next))
+				goto err_free_prev;
+		}
+	}
+
+	list_add_tail(&op->entry, &ops->list);
+
+	return 0;
+
+	/*
+	 * Unwind in reverse order of allocation, such that a failure to
+	 * duplicate next releases both prev and unmap.
+	 */
+err_free_prev:
+	kfree(op->remap.prev);
+err_free_unmap:
+	kfree(op->remap.unmap);
+err_free_op:
+	gpuva_op_free(mgr, op);
+err:
+	return -ENOMEM;
+}
+
+/*
+ * Callbacks which collect every generated operation in a list through
+ * drm_gpuva_sm_step(), instead of handing them to a driver callback.
+ */
+static struct drm_gpuva_fn_ops gpuva_list_ops = {
+	.sm_map_step = drm_gpuva_sm_step,
+	.sm_unmap_step = drm_gpuva_sm_step,
+};
+
+/**
+ * drm_gpuva_sm_map_ops_create - creates the &drm_gpuva_ops to split and merge
+ * @mgr: the &drm_gpuva_manager representing the GPU VA space
+ * @req_addr: the start address of the new mapping
+ * @req_range: the range of the new mapping
+ * @req_obj: the &drm_gem_object to map
+ * @req_offset: the offset within the &drm_gem_object
+ *
+ * This function creates a list of operations to perform splitting and merging
+ * of existent mapping(s) with the newly requested one.
+ *
+ * The list can be iterated with &drm_gpuva_for_each_op and must be processed
+ * in the given order. It can contain map, unmap and remap operations, but it
+ * also can be empty if no operation is required, e.g. if the requested mapping
+ * already exists in the exact same way.
+ *
+ * There can be an arbitrary amount of unmap operations, a maximum of two remap
+ * operations and a single map operation. The latter one, if existent,
+ * represents the original map operation requested by the caller. Please note
+ * that the map operation might have been modified, e.g. if it was merged with
+ * an existent mapping.
+ *
+ * Note that before calling this function again with another mapping request it
+ * is necessary to update the &drm_gpuva_manager's view of the GPU VA space. The
+ * previously obtained operations must be either processed or abandoned. To
+ * update the &drm_gpuva_manager's view of the GPU VA space drm_gpuva_insert(),
+ * drm_gpuva_destroy_locked() and/or drm_gpuva_destroy_unlocked() should be
+ * used.
+ *
+ * After the caller finished processing the returned &drm_gpuva_ops, they must
+ * be freed with &drm_gpuva_ops_free.
+ *
+ * Returns: a pointer to the &drm_gpuva_ops on success, an ERR_PTR on failure
+ */
+struct drm_gpuva_ops *
+drm_gpuva_sm_map_ops_create(struct drm_gpuva_manager *mgr,
+			    u64 req_addr, u64 req_range,
+			    struct drm_gem_object *req_obj, u64 req_offset)
+{
+	struct drm_gpuva_ops *ops;
+	struct {
+		struct drm_gpuva_manager *mgr;
+		struct drm_gpuva_ops *ops;
+	} args;
+	int ret;
+
+	ops = kzalloc(sizeof(*ops), GFP_KERNEL);
+	if (unlikely(!ops))
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&ops->list);
+
+	args.mgr = mgr;
+	args.ops = ops;
+
+	ret = __drm_gpuva_sm_map(mgr, &gpuva_list_ops, &args,
+				 req_addr, req_range,
+				 req_obj, req_offset);
+	if (ret) {
+		struct drm_gpuva_op *op, *tmp;
+
+		/*
+		 * A failing step may be preceded by steps which succeeded;
+		 * the operations queued by drm_gpuva_sm_step() so far,
+		 * including the duplicates held by remap operations, must be
+		 * released together with the list head.
+		 */
+		list_for_each_entry_safe(op, tmp, &ops->list, entry) {
+			list_del(&op->entry);
+
+			if (op->op == DRM_GPUVA_OP_REMAP) {
+				kfree(op->remap.prev);
+				kfree(op->remap.next);
+				kfree(op->remap.unmap);
+			}
+
+			gpuva_op_free(mgr, op);
+		}
+
+		kfree(ops);
+		return ERR_PTR(ret);
+	}
+
+	return ops;
+}
+EXPORT_SYMBOL(drm_gpuva_sm_map_ops_create);
+
+/**
+ * drm_gpuva_sm_unmap_ops_create - creates the &drm_gpuva_ops to split on unmap
+ * @mgr: the &drm_gpuva_manager representing the GPU VA space
+ * @req_addr: the start address of the range to unmap
+ * @req_range: the range of the mappings to unmap
+ *
+ * This function creates a list of operations to perform unmapping and, if
+ * required, splitting of the mappings overlapping the unmap range.
+ *
+ * The list can be iterated with &drm_gpuva_for_each_op and must be processed
+ * in the given order. It can contain unmap and remap operations, depending on
+ * whether there are actual overlapping mappings to split.
+ *
+ * There can be an arbitrary amount of unmap operations and a maximum of two
+ * remap operations.
+ *
+ * Note that before calling this function again with another range to unmap it
+ * is necessary to update the &drm_gpuva_manager's view of the GPU VA space. The
+ * previously obtained operations must be processed or abandoned. To update the
+ * &drm_gpuva_manager's view of the GPU VA space drm_gpuva_insert(),
+ * drm_gpuva_destroy_locked() and/or drm_gpuva_destroy_unlocked() should be
+ * used.
+ *
+ * After the caller finished processing the returned &drm_gpuva_ops, they must
+ * be freed with &drm_gpuva_ops_free.
+ *
+ * On failure nothing is handed to the caller: every operation that was already
+ * queued before the error occurred is released together with the list itself.
+ *
+ * Returns: a pointer to the &drm_gpuva_ops on success, an ERR_PTR on failure
+ */
+struct drm_gpuva_ops *
+drm_gpuva_sm_unmap_ops_create(struct drm_gpuva_manager *mgr,
+			      u64 req_addr, u64 req_range)
+{
+	struct drm_gpuva_ops *ops;
+	struct {
+		struct drm_gpuva_manager *mgr;
+		struct drm_gpuva_ops *ops;
+	} args;
+	int ret;
+
+	ops = kzalloc(sizeof(*ops), GFP_KERNEL);
+	if (unlikely(!ops))
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&ops->list);
+
+	/* Private data passed through to the step callbacks. */
+	args.mgr = mgr;
+	args.ops = ops;
+
+	ret = __drm_gpuva_sm_unmap(mgr, &gpuva_list_ops, &args,
+				   req_addr, req_range);
+	if (ret)
+		goto err_free_ops;
+
+	return ops;
+
+err_free_ops:
+	/*
+	 * Earlier steps may already have queued ops on ops->list; a plain
+	 * kfree(ops) would leak them (and the remap allocations they own).
+	 */
+	drm_gpuva_ops_free(mgr, ops);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(drm_gpuva_sm_unmap_ops_create);
+
+/**
+ * drm_gpuva_prefetch_ops_create - creates the &drm_gpuva_ops to prefetch
+ * @mgr: the &drm_gpuva_manager representing the GPU VA space
+ * @addr: the start address of the range to prefetch
+ * @range: the range of the mappings to prefetch
+ *
+ * Builds a list holding one prefetch operation for every &drm_gpuva the
+ * iterator yields for the range starting at @addr and spanning @range.
+ *
+ * The list can be iterated with &drm_gpuva_for_each_op and must be processed
+ * in the given order. It contains prefetch operations only; their number is
+ * arbitrary.
+ *
+ * Once the caller is done processing the returned &drm_gpuva_ops, they must
+ * be freed with &drm_gpuva_ops_free.
+ *
+ * Returns: a pointer to the &drm_gpuva_ops on success, an ERR_PTR on failure
+ */
+struct drm_gpuva_ops *
+drm_gpuva_prefetch_ops_create(struct drm_gpuva_manager *mgr,
+			      u64 addr, u64 range)
+{
+	DRM_GPUVA_ITER(it, mgr);
+	struct drm_gpuva_ops *ops = kzalloc(sizeof(*ops), GFP_KERNEL);
+	struct drm_gpuva_op *prefetch_op;
+
+	if (!ops)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&ops->list);
+
+	drm_gpuva_iter_for_each_range(it, addr, addr + range) {
+		prefetch_op = gpuva_op_alloc(mgr);
+		if (!prefetch_op) {
+			/* Drops the ops queued so far as well as @ops itself. */
+			drm_gpuva_ops_free(mgr, ops);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		prefetch_op->op = DRM_GPUVA_OP_PREFETCH;
+		prefetch_op->prefetch.va = it.va;
+		list_add_tail(&prefetch_op->entry, &ops->list);
+	}
+
+	return ops;
+}
+EXPORT_SYMBOL(drm_gpuva_prefetch_ops_create);
+
+/**
+ * drm_gpuva_gem_unmap_ops_create - creates the &drm_gpuva_ops to unmap a GEM
+ * @mgr: the &drm_gpuva_manager representing the GPU VA space
+ * @obj: the &drm_gem_object to unmap
+ *
+ * Builds a list holding one unmap operation for every &drm_gpuva found on the
+ * GPUVA list of @obj.
+ *
+ * The list can be iterated with &drm_gpuva_for_each_op and consists of an
+ * arbitrary amount of unmap operations.
+ *
+ * Once the caller is done processing the returned &drm_gpuva_ops, they must
+ * be freed with &drm_gpuva_ops_free.
+ *
+ * It is the caller's responsibility to protect the GEM's GPUVA list against
+ * concurrent access.
+ *
+ * Returns: a pointer to the &drm_gpuva_ops on success, an ERR_PTR on failure
+ */
+struct drm_gpuva_ops *
+drm_gpuva_gem_unmap_ops_create(struct drm_gpuva_manager *mgr,
+			       struct drm_gem_object *obj)
+{
+	struct drm_gpuva_ops *ops = kzalloc(sizeof(*ops), GFP_KERNEL);
+	struct drm_gpuva_op *unmap_op;
+	struct drm_gpuva *va;
+
+	if (!ops)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&ops->list);
+
+	drm_gem_for_each_gpuva(va, obj) {
+		unmap_op = gpuva_op_alloc(mgr);
+		if (!unmap_op) {
+			/* Drops the ops queued so far as well as @ops itself. */
+			drm_gpuva_ops_free(mgr, ops);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		unmap_op->op = DRM_GPUVA_OP_UNMAP;
+		unmap_op->unmap.va = va;
+		list_add_tail(&unmap_op->entry, &ops->list);
+	}
+
+	return ops;
+}
+EXPORT_SYMBOL(drm_gpuva_gem_unmap_ops_create);
+
+
+/**
+ * drm_gpuva_ops_free - free the given &drm_gpuva_ops
+ * @mgr: the &drm_gpuva_manager the ops were created for
+ * @ops: the &drm_gpuva_ops to free
+ *
+ * Unlinks and releases every &drm_gpuva_op queued on @ops and finally frees
+ * @ops itself. For remap operations the separately allocated prev, next and
+ * unmap parts are freed as well.
+ */
+void
+drm_gpuva_ops_free(struct drm_gpuva_manager *mgr,
+		   struct drm_gpuva_ops *ops)
+{
+	struct drm_gpuva_op *cur, *tmp;
+
+	drm_gpuva_for_each_op_safe(cur, tmp, ops) {
+		const bool is_remap = cur->op == DRM_GPUVA_OP_REMAP;
+
+		list_del(&cur->entry);
+
+		if (is_remap) {
+			kfree(cur->remap.unmap);
+			kfree(cur->remap.next);
+			kfree(cur->remap.prev);
+		}
+
+		gpuva_op_free(mgr, cur);
+	}
+
+	kfree(ops);
+}
+EXPORT_SYMBOL(drm_gpuva_ops_free);
diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h
index 1d76d0686b03..4fe4a1552948 100644
--- a/include/drm/drm_drv.h
+++ b/include/drm/drm_drv.h
@@ -104,6 +104,12 @@  enum drm_driver_feature {
 	 * acceleration should be handled by two drivers that are connected using auxiliary bus.
 	 */
 	DRIVER_COMPUTE_ACCEL            = BIT(7),
+	/**
+	 * @DRIVER_GEM_GPUVA:
+	 *
+	 * Driver supports user defined GPU VA bindings for GEM objects.
+	 */
+	DRIVER_GEM_GPUVA		= BIT(8),
 
 	/* IMPORTANT: Below are all the legacy flags, add new ones above. */
 
diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
index 772a4adf5287..4a3679034966 100644
--- a/include/drm/drm_gem.h
+++ b/include/drm/drm_gem.h
@@ -36,6 +36,8 @@ 
 
 #include <linux/kref.h>
 #include <linux/dma-resv.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
 
 #include <drm/drm_vma_manager.h>
 
@@ -337,6 +339,17 @@  struct drm_gem_object {
 	 */
 	struct dma_resv _resv;
 
+	/**
+	 * @gpuva:
+	 *
+	 * Provides the list and list mutex of GPU VAs attached to this
+	 * GEM object.
+	 */
+	struct {
+		struct list_head list;
+		struct mutex mutex;
+	} gpuva;
+
 	/**
 	 * @funcs:
 	 *
@@ -479,4 +492,66 @@  void drm_gem_lru_move_tail(struct drm_gem_lru *lru, struct drm_gem_object *obj);
 unsigned long drm_gem_lru_scan(struct drm_gem_lru *lru, unsigned nr_to_scan,
 			       bool (*shrink)(struct drm_gem_object *obj));
 
+/**
+ * drm_gem_gpuva_init - initialize the gpuva list of a GEM object
+ * @obj: the &drm_gem_object
+ *
+ * Sets up the mutex guarding the &drm_gem_object's &drm_gpuva list and
+ * initializes the list itself as empty.
+ *
+ * Only drivers intending to support the &drm_driver_feature DRIVER_GEM_GPUVA
+ * need to call this function.
+ */
+static inline void drm_gem_gpuva_init(struct drm_gem_object *obj)
+{
+	mutex_init(&obj->gpuva.mutex);
+	INIT_LIST_HEAD(&obj->gpuva.list);
+}
+
+/**
+ * drm_gem_gpuva_lock - lock the GEM's gpuva list mutex
+ * @obj: the &drm_gem_object
+ *
+ * This locks the mutex protecting the &drm_gem_object's &drm_gpuva list.
+ */
+static inline void drm_gem_gpuva_lock(struct drm_gem_object *obj)
+{
+	mutex_lock(&obj->gpuva.mutex);
+}
+
+/**
+ * drm_gem_gpuva_unlock - unlock the GEM's gpuva list mutex
+ * @obj: the &drm_gem_object
+ *
+ * This unlocks the mutex protecting the &drm_gem_object's &drm_gpuva list;
+ * it is the counterpart of drm_gem_gpuva_lock().
+ */
+static inline void drm_gem_gpuva_unlock(struct drm_gem_object *obj)
+{
+	mutex_unlock(&obj->gpuva.mutex);
+}
+
+/**
+ * drm_gem_for_each_gpuva - iterator to walk over a list of gpuvas
+ * @entry: &drm_gpuva structure to assign to in each iteration step
+ * @obj: the &drm_gem_object the &drm_gpuvas to walk are associated with
+ *
+ * This iterator walks over all &drm_gpuva structures associated with the
+ * &drm_gem_object. It is implemented with list_for_each_entry(), hence it is
+ * not safe against removal of elements while iterating.
+ */
+#define drm_gem_for_each_gpuva(entry, obj) \
+	list_for_each_entry(entry, &(obj)->gpuva.list, head)
+
+/**
+ * drm_gem_for_each_gpuva_safe - iterator to safely walk over a list of gpuvas
+ * @entry: &drm_gpuva structure to assign to in each iteration step
+ * @next: another &drm_gpuva pointer used to store the next step
+ * @obj: the &drm_gem_object the &drm_gpuvas to walk are associated with
+ *
+ * This iterator walks over all &drm_gpuva structures associated with the
+ * &drm_gem_object. It is implemented with list_for_each_entry_safe(), hence
+ * it is safe against removal of elements.
+ */
+#define drm_gem_for_each_gpuva_safe(entry, next, obj) \
+	list_for_each_entry_safe(entry, next, &(obj)->gpuva.list, head)
+
 #endif /* __DRM_GEM_H__ */
diff --git a/include/drm/drm_gpuva_mgr.h b/include/drm/drm_gpuva_mgr.h
new file mode 100644
index 000000000000..d245d01e37a9
--- /dev/null
+++ b/include/drm/drm_gpuva_mgr.h
@@ -0,0 +1,714 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __DRM_GPUVA_MGR_H__
+#define __DRM_GPUVA_MGR_H__
+
+/*
+ * Copyright (c) 2022 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <linux/maple_tree.h>
+#include <linux/mm.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+struct drm_gpuva_manager;
+struct drm_gpuva_fn_ops;
+
+/**
+ * struct drm_gpuva_region - structure to track a portion of GPU VA space
+ *
+ * This structure represents a portion of a GPU's VA space and is associated
+ * with a &drm_gpuva_manager.
+ *
+ * GPU VA mappings, represented by &drm_gpuva objects, are restricted to be
+ * placed within a &drm_gpuva_region.
+ */
+struct drm_gpuva_region {
+	/**
+	 * @mgr: the &drm_gpuva_manager this object is associated with
+	 */
+	struct drm_gpuva_manager *mgr;
+
+	/**
+	 * @va: structure containing the address and range of the &drm_gpuva_region
+	 */
+	struct {
+		/**
+		 * @addr: the start address
+		 */
+		u64 addr;
+
+		/**
+		 * @range: the range
+		 */
+		u64 range;
+	} va;
+
+	/**
+	 * @sparse: indicates whether this region is sparse
+	 */
+	bool sparse;
+};
+
+int drm_gpuva_region_insert(struct drm_gpuva_manager *mgr,
+			    struct drm_gpuva_region *reg);
+void drm_gpuva_region_remove(struct drm_gpuva_region *reg);
+
+bool
+drm_gpuva_region_empty(struct drm_gpuva_region *reg);
+
+struct drm_gpuva_region *
+drm_gpuva_region_find(struct drm_gpuva_manager *mgr,
+		      u64 addr, u64 range);
+struct drm_gpuva_region *
+drm_gpuva_region_find_first(struct drm_gpuva_manager *mgr,
+			    u64 addr, u64 range);
+
+/**
+ * enum drm_gpuva_flags - flags for struct drm_gpuva
+ */
+enum drm_gpuva_flags {
+	/**
+	 * @DRM_GPUVA_EVICTED:
+	 *
+	 * Flag indicating that the &drm_gpuva's backing GEM is evicted.
+	 */
+	DRM_GPUVA_EVICTED = (1 << 0),
+
+	/**
+	 * @DRM_GPUVA_USERBITS: user defined bits
+	 */
+	DRM_GPUVA_USERBITS = (1 << 1),
+};
+
+/**
+ * struct drm_gpuva - structure to track a GPU VA mapping
+ *
+ * This structure represents a GPU VA mapping and is associated with a
+ * &drm_gpuva_manager.
+ *
+ * Typically, this structure is embedded in bigger driver structures.
+ */
+struct drm_gpuva {
+	/**
+	 * @mgr: the &drm_gpuva_manager this object is associated with
+	 */
+	struct drm_gpuva_manager *mgr;
+
+	/**
+	 * @region: the &drm_gpuva_region the &drm_gpuva is mapped in
+	 */
+	struct drm_gpuva_region *region;
+
+	/**
+	 * @head: the &list_head to attach this object to a &drm_gem_object
+	 */
+	struct list_head head;
+
+	/**
+	 * @flags: the &drm_gpuva_flags for this mapping
+	 */
+	enum drm_gpuva_flags flags;
+
+	/**
+	 * @va: structure containing the address and range of the &drm_gpuva
+	 */
+	struct {
+		/**
+		 * @addr: the start address
+		 */
+		u64 addr;
+
+		/**
+		 * @range: the range
+		 */
+		u64 range;
+	} va;
+
+	/**
+	 * @gem: structure containing the &drm_gem_object and its offset
+	 */
+	struct {
+		/**
+		 * @offset: the offset within the &drm_gem_object
+		 */
+		u64 offset;
+
+		/**
+		 * @obj: the mapped &drm_gem_object
+		 */
+		struct drm_gem_object *obj;
+	} gem;
+};
+
+void drm_gpuva_link(struct drm_gpuva *va);
+void drm_gpuva_unlink(struct drm_gpuva *va);
+
+int drm_gpuva_insert(struct drm_gpuva_manager *mgr,
+		     struct drm_gpuva *va);
+void drm_gpuva_remove(struct drm_gpuva *va);
+
+struct drm_gpuva *drm_gpuva_find(struct drm_gpuva_manager *mgr,
+				 u64 addr, u64 range);
+struct drm_gpuva *drm_gpuva_find_first(struct drm_gpuva_manager *mgr,
+				       u64 addr, u64 range);
+struct drm_gpuva *drm_gpuva_find_prev(struct drm_gpuva_manager *mgr, u64 start);
+struct drm_gpuva *drm_gpuva_find_next(struct drm_gpuva_manager *mgr, u64 end);
+
+/**
+ * drm_gpuva_evict - sets whether the backing GEM of this &drm_gpuva is evicted
+ * @va: the &drm_gpuva to set the evict flag for
+ * @evict: true to set DRM_GPUVA_EVICTED in the flags of @va, false to clear it
+ */
+static inline void drm_gpuva_evict(struct drm_gpuva *va, bool evict)
+{
+	if (!evict) {
+		va->flags &= ~DRM_GPUVA_EVICTED;
+		return;
+	}
+
+	va->flags |= DRM_GPUVA_EVICTED;
+}
+
+/**
+ * drm_gpuva_evicted - indicates whether the backing BO of this &drm_gpuva
+ * is evicted
+ * @va: the &drm_gpuva to check
+ *
+ * Returns: true if DRM_GPUVA_EVICTED is set in the flags of @va, false
+ * otherwise
+ */
+static inline bool drm_gpuva_evicted(struct drm_gpuva *va)
+{
+	return !!(va->flags & DRM_GPUVA_EVICTED);
+}
+
+/**
+ * enum drm_gpuva_mgr_flags - the feature flags for the &drm_gpuva_manager
+ */
+enum drm_gpuva_mgr_flags {
+	/**
+	 * @DRM_GPUVA_MANAGER_REGIONS:
+	 *
+	 * Enable the &drm_gpuva_manager to separately track &drm_gpuva_regions.
+	 *
+	 * &drm_gpuva_regions represent a reserved portion of VA space drivers
+	 * can create mappings in. If regions are enabled, &drm_gpuvas can be
+	 * created within an existing &drm_gpuva_region only and merge
+	 * operations never indicate merging over region boundaries.
+	 */
+	DRM_GPUVA_MANAGER_REGIONS = (1 << 0),
+};
+
+/**
+ * struct drm_gpuva_manager - DRM GPU VA Manager
+ *
+ * The DRM GPU VA Manager keeps track of a GPU's virtual address space by using
+ * &maple_tree structures. Typically, this structure is embedded in bigger
+ * driver structures.
+ *
+ * Drivers can pass addresses and ranges in an arbitrary unit, e.g. bytes or
+ * pages.
+ *
+ * There should be one manager instance per GPU virtual address space.
+ */
+struct drm_gpuva_manager {
+	/**
+	 * @name: the name of the DRM GPU VA space
+	 */
+	const char *name;
+
+	/**
+	 * @mm_start: start of the VA space
+	 */
+	u64 mm_start;
+
+	/**
+	 * @mm_range: length of the VA space
+	 */
+	u64 mm_range;
+
+	/**
+	 * @region_mt: the &maple_tree to track GPU VA regions
+	 */
+	struct maple_tree region_mt;
+
+	/**
+	 * @va_mt: the &maple_tree to track GPU VA mappings
+	 */
+	struct maple_tree va_mt;
+
+	/**
+	 * @kernel_alloc_region:
+	 *
+	 * &drm_gpuva_region representing the address space cutout reserved for
+	 * the kernel
+	 */
+	struct drm_gpuva_region kernel_alloc_region;
+
+	/**
+	 * @ops: &drm_gpuva_fn_ops providing the split/merge steps to drivers
+	 */
+	struct drm_gpuva_fn_ops *ops;
+
+	/**
+	 * @flags: the feature flags of the &drm_gpuva_manager
+	 */
+	enum drm_gpuva_mgr_flags flags;
+};
+
+void drm_gpuva_manager_init(struct drm_gpuva_manager *mgr,
+			    const char *name,
+			    u64 start_offset, u64 range,
+			    u64 reserve_offset, u64 reserve_range,
+			    struct drm_gpuva_fn_ops *ops,
+			    enum drm_gpuva_mgr_flags flags);
+void drm_gpuva_manager_destroy(struct drm_gpuva_manager *mgr);
+
+/**
+ * struct drm_gpuva_iterator - iterator for walking the internal (maple) tree
+ */
+struct drm_gpuva_iterator {
+	/**
+	 * @mas: the maple tree iterator (maple advanced state)
+	 */
+	struct ma_state mas;
+
+	/**
+	 * @mgr: the &drm_gpuva_manager to iterate
+	 */
+	struct drm_gpuva_manager *mgr;
+
+	union {
+		/**
+		 * @va: the current &drm_gpuva entry
+		 */
+		struct drm_gpuva *va;
+
+		/**
+		 * @reg: the current &drm_gpuva_region entry
+		 */
+		struct drm_gpuva_region *reg;
+
+		/**
+		 * @entry: the current entry
+		 */
+		void *entry;
+	};
+};
+
+void drm_gpuva_iter_remove(struct drm_gpuva_iterator *it);
+
+/**
+ * DRM_GPUVA_ITER - create an iterator structure to iterate the &drm_gpuva tree
+ * @name: the name of the &drm_gpuva_iterator to create
+ * @mgr__: the &drm_gpuva_manager to iterate
+ *
+ * Declares a &drm_gpuva_iterator called @name whose maple state refers to the
+ * manager's va_mt tree. Note that @mgr__ is evaluated more than once.
+ */
+#define DRM_GPUVA_ITER(name, mgr__)				\
+	struct drm_gpuva_iterator name = {			\
+		.mas = __MA_STATE(&(mgr__)->va_mt, 0, 0),	\
+		.mgr = (mgr__),					\
+		.va = NULL,					\
+	}
+
+/**
+ * DRM_GPUVA_REGION_ITER - create an iterator structure to iterate the
+ * &drm_gpuva_region tree
+ * @name: the name of the &drm_gpuva_iterator to create
+ * @mgr__: the &drm_gpuva_manager to iterate
+ *
+ * Declares a &drm_gpuva_iterator called @name whose maple state refers to the
+ * manager's region_mt tree. Note that @mgr__ is evaluated more than once.
+ */
+#define DRM_GPUVA_REGION_ITER(name, mgr__)			\
+	struct drm_gpuva_iterator name = {			\
+		.mas = __MA_STATE(&(mgr__)->region_mt, 0, 0),	\
+		.mgr = (mgr__),					\
+		.reg = NULL,					\
+	}
+
+/**
+ * drm_gpuva_iter_for_each_range - iterator to walk over a range of entries
+ * @it__: &drm_gpuva_iterator structure to assign to in each iteration step
+ * @start__: starting offset, the first entry will overlap this
+ * @end__: ending offset, the last entry will start before this (but may overlap)
+ *
+ * This function can be used to iterate both &drm_gpuva objects and
+ * &drm_gpuva_region objects.
+ *
+ * It is safe against the removal of elements using &drm_gpuva_iter_remove,
+ * however it is not safe against the removal of elements using
+ * &drm_gpuva_remove and &drm_gpuva_region_remove.
+ *
+ * Note that @it__ and @end__ are evaluated more than once, hence they must
+ * not have side effects.
+ */
+#define drm_gpuva_iter_for_each_range(it__, start__, end__) \
+	for ((it__).mas.index = (start__), (it__).entry = mas_find(&(it__).mas, (end__) - 1); \
+	     (it__).entry; (it__).entry = mas_find(&(it__).mas, (end__) - 1))
+
+/**
+ * drm_gpuva_iter_for_each - iterator to walk over all existing entries
+ * @it__: &drm_gpuva_iterator structure to assign to in each iteration step
+ *
+ * This function can be used to iterate both &drm_gpuva objects and
+ * &drm_gpuva_region objects. The walk covers the manager's VA space from
+ * mm_start up to mm_start + mm_range.
+ *
+ * It is safe against the removal of elements using &drm_gpuva_iter_remove,
+ * however it is not safe against the removal of elements using
+ * &drm_gpuva_remove and &drm_gpuva_region_remove.
+ */
+#define drm_gpuva_iter_for_each(it__) \
+	drm_gpuva_iter_for_each_range(it__, (it__).mgr->mm_start, \
+				      (it__).mgr->mm_start + (it__).mgr->mm_range)
+
+/**
+ * enum drm_gpuva_op_type - GPU VA operation type
+ *
+ * Operations to alter the GPU VA mappings tracked by the &drm_gpuva_manager.
+ */
+enum drm_gpuva_op_type {
+	/**
+	 * @DRM_GPUVA_OP_MAP: the map op type
+	 */
+	DRM_GPUVA_OP_MAP,
+
+	/**
+	 * @DRM_GPUVA_OP_REMAP: the remap op type
+	 */
+	DRM_GPUVA_OP_REMAP,
+
+	/**
+	 * @DRM_GPUVA_OP_UNMAP: the unmap op type
+	 */
+	DRM_GPUVA_OP_UNMAP,
+
+	/**
+	 * @DRM_GPUVA_OP_PREFETCH: the prefetch op type
+	 */
+	DRM_GPUVA_OP_PREFETCH,
+};
+
+/**
+ * struct drm_gpuva_op_map - GPU VA map operation
+ *
+ * This structure represents a single map operation generated by the
+ * DRM GPU VA manager.
+ */
+struct drm_gpuva_op_map {
+	/**
+	 * @va: structure containing address and range of a map
+	 * operation
+	 */
+	struct {
+		/**
+		 * @addr: the base address of the new mapping
+		 */
+		u64 addr;
+
+		/**
+		 * @range: the range of the new mapping
+		 */
+		u64 range;
+	} va;
+
+	/**
+	 * @gem: structure containing the &drm_gem_object and its offset
+	 */
+	struct {
+		/**
+		 * @offset: the offset within the &drm_gem_object
+		 */
+		u64 offset;
+
+		/**
+		 * @obj: the &drm_gem_object to map
+		 */
+		struct drm_gem_object *obj;
+	} gem;
+};
+
+/**
+ * struct drm_gpuva_op_unmap - GPU VA unmap operation
+ *
+ * This structure represents a single unmap operation generated by the
+ * DRM GPU VA manager.
+ */
+struct drm_gpuva_op_unmap {
+	/**
+	 * @va: the &drm_gpuva to unmap
+	 */
+	struct drm_gpuva *va;
+
+	/**
+	 * @keep:
+	 *
+	 * Indicates whether this &drm_gpuva is physically contiguous with the
+	 * original mapping request.
+	 *
+	 * Optionally, if &keep is set, drivers may keep the actual page table
+	 * mappings for this &drm_gpuva, adding the missing page table entries
+	 * only and update the &drm_gpuva_manager accordingly.
+	 */
+	bool keep;
+};
+
+/**
+ * struct drm_gpuva_op_remap - GPU VA remap operation
+ *
+ * This represents a single remap operation generated by the DRM GPU VA manager.
+ *
+ * A remap operation is generated when an existing GPU VA mapping is split up
+ * by inserting a new GPU VA mapping or by partially unmapping existent
+ * mapping(s), hence it consists of a maximum of two map and one unmap
+ * operation.
+ *
+ * The @unmap operation takes care of removing the original existing mapping.
+ * @prev is used to remap the preceding part, @next the subsequent part.
+ *
+ * If either a new mapping's start address is aligned with the start address
+ * of the old mapping or the new mapping's end address is aligned with the
+ * end address of the old mapping, either @prev or @next is NULL.
+ *
+ * Note, the reason for a dedicated remap operation, rather than arbitrary
+ * unmap and map operations, is to give drivers the chance of extracting driver
+ * specific data for creating the new mappings from the unmap operation's
+ * &drm_gpuva structure which typically is embedded in larger driver specific
+ * structures.
+ */
+struct drm_gpuva_op_remap {
+	/**
+	 * @prev: the preceding part of a split mapping
+	 */
+	struct drm_gpuva_op_map *prev;
+
+	/**
+	 * @next: the subsequent part of a split mapping
+	 */
+	struct drm_gpuva_op_map *next;
+
+	/**
+	 * @unmap: the unmap operation for the original existing mapping
+	 */
+	struct drm_gpuva_op_unmap *unmap;
+};
+
+/**
+ * struct drm_gpuva_op_prefetch - GPU VA prefetch operation
+ *
+ * This structure represents a single prefetch operation generated by the
+ * DRM GPU VA manager.
+ */
+struct drm_gpuva_op_prefetch {
+	/**
+	 * @va: the &drm_gpuva to prefetch
+	 */
+	struct drm_gpuva *va;
+};
+
+/**
+ * struct drm_gpuva_op - GPU VA operation
+ *
+ * This structure represents a single generic operation.
+ *
+ * The particular type of the operation is defined by @op.
+ */
+struct drm_gpuva_op {
+	/**
+	 * @entry:
+	 *
+	 * The &list_head used to distribute instances of this struct within
+	 * &drm_gpuva_ops.
+	 */
+	struct list_head entry;
+
+	/**
+	 * @op: the type of the operation
+	 */
+	enum drm_gpuva_op_type op;
+
+	union {
+		/**
+		 * @map: the map operation
+		 */
+		struct drm_gpuva_op_map map;
+
+		/**
+		 * @remap: the remap operation
+		 */
+		struct drm_gpuva_op_remap remap;
+
+		/**
+		 * @unmap: the unmap operation
+		 */
+		struct drm_gpuva_op_unmap unmap;
+
+		/**
+		 * @prefetch: the prefetch operation
+		 */
+		struct drm_gpuva_op_prefetch prefetch;
+	};
+};
+
+/**
+ * struct drm_gpuva_ops - wraps a list of &drm_gpuva_op
+ */
+struct drm_gpuva_ops {
+	/**
+	 * @list: the &list_head
+	 */
+	struct list_head list;
+};
+
+/**
+ * drm_gpuva_for_each_op - iterator to walk over &drm_gpuva_ops
+ * @op: &drm_gpuva_op to assign in each iteration step
+ * @ops: &drm_gpuva_ops to walk
+ *
+ * This iterator walks over all ops within a given list of operations.
+ */
+#define drm_gpuva_for_each_op(op, ops) list_for_each_entry(op, &(ops)->list, entry)
+
+/**
+ * drm_gpuva_for_each_op_safe - iterator to safely walk over &drm_gpuva_ops
+ * @op: &drm_gpuva_op to assign in each iteration step
+ * @next: &next &drm_gpuva_op to store the next step
+ * @ops: &drm_gpuva_ops to walk
+ *
+ * This iterator walks over all ops within a given list of operations. It is
+ * implemented with list_for_each_entry_safe(), so safe against removal of elements.
+ */
+#define drm_gpuva_for_each_op_safe(op, next, ops) \
+	list_for_each_entry_safe(op, next, &(ops)->list, entry)
+
+/**
+ * drm_gpuva_for_each_op_from_reverse - iterate backwards from the given point
+ * @op: &drm_gpuva_op to assign in each iteration step
+ * @ops: &drm_gpuva_ops to walk
+ *
+ * This iterator walks over all ops within a given list of operations beginning
+ * from the given operation in reverse order.
+ */
+#define drm_gpuva_for_each_op_from_reverse(op, ops) \
+	list_for_each_entry_from_reverse(op, &(ops)->list, entry)
+
+/**
+ * drm_gpuva_first_op - returns the first &drm_gpuva_op from &drm_gpuva_ops
+ * @ops: the &drm_gpuva_ops to get the first &drm_gpuva_op from
+ */
+#define drm_gpuva_first_op(ops) \
+	list_first_entry(&(ops)->list, struct drm_gpuva_op, entry)
+
+/**
+ * drm_gpuva_last_op - returns the last &drm_gpuva_op from &drm_gpuva_ops
+ * @ops: the &drm_gpuva_ops to get the last &drm_gpuva_op from
+ */
+#define drm_gpuva_last_op(ops) \
+	list_last_entry(&(ops)->list, struct drm_gpuva_op, entry)
+
+/**
+ * drm_gpuva_prev_op - previous &drm_gpuva_op in the list
+ * @op: the current &drm_gpuva_op
+ */
+#define drm_gpuva_prev_op(op) list_prev_entry(op, entry)
+
+/**
+ * drm_gpuva_next_op - next &drm_gpuva_op in the list
+ * @op: the current &drm_gpuva_op
+ */
+#define drm_gpuva_next_op(op) list_next_entry(op, entry)
+
+struct drm_gpuva_ops *
+drm_gpuva_sm_map_ops_create(struct drm_gpuva_manager *mgr,
+			    u64 addr, u64 range,
+			    struct drm_gem_object *obj, u64 offset);
+struct drm_gpuva_ops *
+drm_gpuva_sm_unmap_ops_create(struct drm_gpuva_manager *mgr,
+			      u64 addr, u64 range);
+
+struct drm_gpuva_ops *
+drm_gpuva_prefetch_ops_create(struct drm_gpuva_manager *mgr,
+				 u64 addr, u64 range);
+
+struct drm_gpuva_ops *
+drm_gpuva_gem_unmap_ops_create(struct drm_gpuva_manager *mgr,
+			       struct drm_gem_object *obj);
+
+void drm_gpuva_ops_free(struct drm_gpuva_manager *mgr,
+			struct drm_gpuva_ops *ops);
+
+/**
+ * struct drm_gpuva_fn_ops - callbacks for split/merge steps
+ *
+ * This structure defines the callbacks used by &drm_gpuva_sm_map and
+ * &drm_gpuva_sm_unmap to provide the split/merge steps for map and unmap
+ * operations to drivers.
+ */
+struct drm_gpuva_fn_ops {
+	/**
+	 * @op_alloc: called when the &drm_gpuva_manager allocates
+	 * a struct drm_gpuva_op
+	 *
+	 * Some drivers may want to embed struct drm_gpuva_op into driver
+	 * specific structures. By implementing this callback drivers can
+	 * allocate memory accordingly.
+	 *
+	 * This callback is optional.
+	 */
+	struct drm_gpuva_op *(*op_alloc)(void);
+
+	/**
+	 * @op_free: called when the &drm_gpuva_manager frees a
+	 * struct drm_gpuva_op
+	 *
+	 * Some drivers may want to embed struct drm_gpuva_op into driver
+	 * specific structures. By implementing this callback drivers can
+	 * free the previously allocated memory accordingly.
+	 *
+	 * This callback is optional.
+	 */
+	void (*op_free)(struct drm_gpuva_op *op);
+
+	/**
+	 * @sm_map_step: called from &drm_gpuva_sm_map providing the split and
+	 * merge steps
+	 *
+	 * This callback provides a single split / merge step or, if no split
+	 * and merge is indicated, the original map operation.
+	 *
+	 * The &priv pointer is equal to the one drivers pass to
+	 * &drm_gpuva_sm_map.
+	 */
+	int (*sm_map_step)(struct drm_gpuva_op *op, void *priv);
+
+	/**
+	 * @sm_unmap_step: called from &drm_gpuva_sm_map providing the split and
+	 * merge steps
+	 *
+	 * This callback provides a single split step or, if no split is
+	 * indicated, the plain unmap operations of the corresponding unmap
+	 * range originally passed to &drm_gpuva_sm_unmap.
+	 *
+	 * The &priv pointer is equal to the one drivers pass to
+	 * &drm_gpuva_sm_unmap.
+	 */
+	int (*sm_unmap_step)(struct drm_gpuva_op *op, void *priv);
+};
+
+int drm_gpuva_sm_map(struct drm_gpuva_manager *mgr, void *priv,
+		     u64 addr, u64 range,
+		     struct drm_gem_object *obj, u64 offset);
+
+int drm_gpuva_sm_unmap(struct drm_gpuva_manager *mgr, void *priv,
+		       u64 addr, u64 range);
+
+#endif /* __DRM_GPUVA_MGR_H__ */