[RFC,16/28] arm64: RME: Allow populating initial contents
Commit Message
The VMM needs to populate the realm with some data before starting (e.g.
a kernel and initrd). This is measured by the RMM and used as part of
the attestation later on.
Signed-off-by: Steven Price <steven.price@arm.com>
---
arch/arm64/kvm/rme.c | 366 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 366 insertions(+)
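For context, here is a minimal sketch of how a VMM might invoke this interface
from userspace. The argument struct layout mirrors the fields that
kvm_populate_realm() reads below; the capability constants come from this RFC
series' uapi headers (not mainline), and treating cap->args[0] as the
sub-command selector is an assumption based on how kvm_realm_enable_cap() is
extended by this patch:

	/*
	 * Hypothetical VMM-side sketch, not part of the patch. Call this after
	 * copying the kernel/initrd into the memory backing the region, while
	 * the realm is still in REALM_STATE_NEW.
	 */
	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>	/* assumed to include the RME additions */

	struct kvm_cap_arm_rme_populate_realm_args {
		uint64_t populate_ipa_base;	/* must be PAGE_SIZE aligned */
		uint64_t populate_ipa_size;	/* must be PAGE_SIZE aligned */
	};

	static int realm_populate(int vm_fd, uint64_t base, uint64_t size)
	{
		struct kvm_cap_arm_rme_populate_realm_args args = {
			.populate_ipa_base = base,
			.populate_ipa_size = size,
		};
		struct kvm_enable_cap cap = {
			.cap = KVM_CAP_ARM_RME,	/* assumed capability number */
			.args = {
				KVM_CAP_ARM_RME_POPULATE_REALM,
				(uintptr_t)&args,	/* read via cap->args[1] */
			},
		};

		return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
	}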
Comments
On Fri, 27 Jan 2023 11:29:20 +0000
Steven Price <steven.price@arm.com> wrote:
> The VMM needs to populate the realm with some data before starting (e.g.
> a kernel and initrd). This is measured by the RMM and used as part of
> the attestation later on.
>
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> arch/arm64/kvm/rme.c | 366 +++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 366 insertions(+)
>
> diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c
> index 16e0bfea98b1..3405b43e1421 100644
> --- a/arch/arm64/kvm/rme.c
> +++ b/arch/arm64/kvm/rme.c
> @@ -4,6 +4,7 @@
> */
>
> #include <linux/kvm_host.h>
> +#include <linux/hugetlb.h>
>
> #include <asm/kvm_emulate.h>
> #include <asm/kvm_mmu.h>
> @@ -426,6 +427,359 @@ void kvm_realm_unmap_range(struct kvm *kvm, unsigned long ipa, u64 size)
> }
> }
>
> +static int realm_create_protected_data_page(struct realm *realm,
> + unsigned long ipa,
> + struct page *dst_page,
> + struct page *tmp_page)
> +{
> + phys_addr_t dst_phys, tmp_phys;
> + int ret;
> +
> + copy_page(page_address(tmp_page), page_address(dst_page));
> +
> + dst_phys = page_to_phys(dst_page);
> + tmp_phys = page_to_phys(tmp_page);
> +
> + if (rmi_granule_delegate(dst_phys))
> + return -ENXIO;
> +
> + ret = rmi_data_create(dst_phys, virt_to_phys(realm->rd), ipa, tmp_phys,
> + RMI_MEASURE_CONTENT);
> +
> + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
> + /* Create missing RTTs and retry */
> + int level = RMI_RETURN_INDEX(ret);
> +
> + ret = realm_create_rtt_levels(realm, ipa, level,
> + RME_RTT_MAX_LEVEL, NULL);
> + if (ret)
> + goto err;
> +
> + ret = rmi_data_create(dst_phys, virt_to_phys(realm->rd), ipa,
> + tmp_phys, RMI_MEASURE_CONTENT);
> + }
> +
> + if (ret)
> + goto err;
> +
> + return 0;
> +
> +err:
> + if (WARN_ON(rmi_granule_undelegate(dst_phys))) {
> + /* Page can't be returned to NS world so is lost */
> + get_page(dst_page);
> + }
> + return -ENXIO;
> +}
> +
> +static int fold_rtt(phys_addr_t rd, unsigned long addr, int level,
> + struct realm *realm)
> +{
> + struct rtt_entry rtt;
> + phys_addr_t rtt_addr;
> +
> + if (rmi_rtt_read_entry(rd, addr, level, &rtt))
> + return -ENXIO;
> +
> + if (rtt.state != RMI_TABLE)
> + return -EINVAL;
> +
> + rtt_addr = rmi_rtt_get_phys(&rtt);
> + if (rmi_rtt_fold(rtt_addr, rd, addr, level + 1))
> + return -ENXIO;
> +
> + free_delegated_page(realm, rtt_addr);
> +
> + return 0;
> +}
> +
> +int realm_map_protected(struct realm *realm,
> + unsigned long hva,
> + unsigned long base_ipa,
> + struct page *dst_page,
> + unsigned long map_size,
> + struct kvm_mmu_memory_cache *memcache)
> +{
> + phys_addr_t dst_phys = page_to_phys(dst_page);
> + phys_addr_t rd = virt_to_phys(realm->rd);
> + unsigned long phys = dst_phys;
> + unsigned long ipa = base_ipa;
> + unsigned long size;
> + int map_level;
> + int ret = 0;
> +
> + if (WARN_ON(!IS_ALIGNED(ipa, map_size)))
> + return -EINVAL;
> +
> + switch (map_size) {
> + case PAGE_SIZE:
> + map_level = 3;
> + break;
> + case RME_L2_BLOCK_SIZE:
> + map_level = 2;
> + break;
> + default:
> + return -EINVAL;
> + }
> +
> + if (map_level < RME_RTT_MAX_LEVEL) {
> + /*
> + * A temporary RTT is needed during the map, precreate it,
> + * however if there is an error (e.g. missing parent tables)
> + * this will be handled below.
> + */
> + realm_create_rtt_levels(realm, ipa, map_level,
> + RME_RTT_MAX_LEVEL, memcache);
> + }
> +
> + for (size = 0; size < map_size; size += PAGE_SIZE) {
> + if (rmi_granule_delegate(phys)) {
> + struct rtt_entry rtt;
> +
> + /*
> + * It's possible we raced with another VCPU on the same
> + * fault. If the entry exists and matches then exit
> + * early and assume the other VCPU will handle the
> + * mapping.
> + */
> + if (rmi_rtt_read_entry(rd, ipa, RME_RTT_MAX_LEVEL, &rtt))
> + goto err;
> +
> + // FIXME: For a block mapping this could race at level
> + // 2 or 3...
> + if (WARN_ON((rtt.walk_level != RME_RTT_MAX_LEVEL ||
> + rtt.state != RMI_ASSIGNED ||
> + rtt.desc != phys))) {
> + goto err;
> + }
> +
> + return 0;
> + }
> +
> + ret = rmi_data_create_unknown(phys, rd, ipa);
> +
> + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
> + /* Create missing RTTs and retry */
> + int level = RMI_RETURN_INDEX(ret);
> +
> + ret = realm_create_rtt_levels(realm, ipa, level,
> + RME_RTT_MAX_LEVEL,
> + memcache);
> + WARN_ON(ret);
> + if (ret)
> + goto err_undelegate;
> +
> + ret = rmi_data_create_unknown(phys, rd, ipa);
> + }
> + WARN_ON(ret);
> +
> + if (ret)
> + goto err_undelegate;
> +
> + phys += PAGE_SIZE;
> + ipa += PAGE_SIZE;
> + }
> +
> + if (map_size == RME_L2_BLOCK_SIZE)
> + ret = fold_rtt(rd, base_ipa, map_level, realm);
> + if (WARN_ON(ret))
> + goto err;
> +
> + return 0;
> +
> +err_undelegate:
> + if (WARN_ON(rmi_granule_undelegate(phys))) {
> + /* Page can't be returned to NS world so is lost */
> + get_page(phys_to_page(phys));
> + }
> +err:
> + while (size > 0) {
> + phys -= PAGE_SIZE;
> + size -= PAGE_SIZE;
> + ipa -= PAGE_SIZE;
> +
> + rmi_data_destroy(rd, ipa);
> +
> + if (WARN_ON(rmi_granule_undelegate(phys))) {
> + /* Page can't be returned to NS world so is lost */
> + get_page(phys_to_page(phys));
> + }
> + }
> + return -ENXIO;
> +}
> +
There seems to be no caller of the function above. Better to move it to the
related patch.
> +static int populate_par_region(struct kvm *kvm,
> + phys_addr_t ipa_base,
> + phys_addr_t ipa_end)
> +{
> + struct realm *realm = &kvm->arch.realm;
> + struct kvm_memory_slot *memslot;
> + gfn_t base_gfn, end_gfn;
> + int idx;
> + phys_addr_t ipa;
> + int ret = 0;
> + struct page *tmp_page;
> + phys_addr_t rd = virt_to_phys(realm->rd);
> +
> + base_gfn = gpa_to_gfn(ipa_base);
> + end_gfn = gpa_to_gfn(ipa_end);
> +
> + idx = srcu_read_lock(&kvm->srcu);
> + memslot = gfn_to_memslot(kvm, base_gfn);
> + if (!memslot) {
> + ret = -EFAULT;
> + goto out;
> + }
> +
> + /* We require the region to be contained within a single memslot */
> + if (memslot->base_gfn + memslot->npages < end_gfn) {
> + ret = -EINVAL;
> + goto out;
> + }
> +
> + tmp_page = alloc_page(GFP_KERNEL);
> + if (!tmp_page) {
> + ret = -ENOMEM;
> + goto out;
> + }
> +
> + mmap_read_lock(current->mm);
> +
> + ipa = ipa_base;
> +
> + while (ipa < ipa_end) {
> + struct vm_area_struct *vma;
> + unsigned long map_size;
> + unsigned int vma_shift;
> + unsigned long offset;
> + unsigned long hva;
> + struct page *page;
> + kvm_pfn_t pfn;
> + int level;
> +
> + hva = gfn_to_hva_memslot(memslot, gpa_to_gfn(ipa));
> + vma = vma_lookup(current->mm, hva);
> + if (!vma) {
> + ret = -EFAULT;
> + break;
> + }
> +
> + if (is_vm_hugetlb_page(vma))
> + vma_shift = huge_page_shift(hstate_vma(vma));
> + else
> + vma_shift = PAGE_SHIFT;
> +
> + map_size = 1 << vma_shift;
> +
> + /*
> + * FIXME: This causes over mapping, but there's no good
> + * solution here with the ABI as it stands
> + */
> + ipa = ALIGN_DOWN(ipa, map_size);
> +
> + switch (map_size) {
> + case RME_L2_BLOCK_SIZE:
> + level = 2;
> + break;
> + case PAGE_SIZE:
> + level = 3;
> + break;
> + default:
> + WARN_ONCE(1, "Unsupported vma_shift %d", vma_shift);
> + ret = -EFAULT;
> + break;
> + }
> +
> + pfn = gfn_to_pfn_memslot(memslot, gpa_to_gfn(ipa));
> +
> + if (is_error_pfn(pfn)) {
> + ret = -EFAULT;
> + break;
> + }
> +
> + ret = rmi_rtt_init_ripas(rd, ipa, level);
> + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
> + ret = realm_create_rtt_levels(realm, ipa,
> + RMI_RETURN_INDEX(ret),
> + level, NULL);
> + if (ret)
> + break;
> + ret = rmi_rtt_init_ripas(rd, ipa, level);
> + if (ret) {
> + ret = -ENXIO;
> + break;
> + }
> + }
> +
> + if (level < RME_RTT_MAX_LEVEL) {
> + /*
> + * A temporary RTT is needed during the map, precreate
> + * it, however if there is an error (e.g. missing
> + * parent tables) this will be handled in the
> + * realm_create_protected_data_page() call.
> + */
> + realm_create_rtt_levels(realm, ipa, level,
> + RME_RTT_MAX_LEVEL, NULL);
> + }
> +
> + page = pfn_to_page(pfn);
> +
> + for (offset = 0; offset < map_size && !ret;
> + offset += PAGE_SIZE, page++) {
> + phys_addr_t page_ipa = ipa + offset;
> +
> + ret = realm_create_protected_data_page(realm, page_ipa,
> + page, tmp_page);
> + }
> + if (ret)
> + goto err_release_pfn;
> +
> + if (level == 2) {
> + ret = fold_rtt(rd, ipa, level, realm);
> + if (ret)
> + goto err_release_pfn;
> + }
> +
> + ipa += map_size;
> + kvm_set_pfn_accessed(pfn);
> + kvm_set_pfn_dirty(pfn);
kvm_release_pfn_dirty() already calls kvm_set_pfn_{accessed,dirty}().
> + kvm_release_pfn_dirty(pfn);
> +err_release_pfn:
> + if (ret) {
> + kvm_release_pfn_clean(pfn);
> + break;
> + }
> + }
> +
> + mmap_read_unlock(current->mm);
> + __free_page(tmp_page);
> +
> +out:
> + srcu_read_unlock(&kvm->srcu, idx);
> + return ret;
> +}
> +
> +static int kvm_populate_realm(struct kvm *kvm,
> + struct kvm_cap_arm_rme_populate_realm_args *args)
> +{
> + phys_addr_t ipa_base, ipa_end;
> +
Check kvm_is_realm(kvm) here or in kvm_realm_enable_cap().
> + if (kvm_realm_state(kvm) != REALM_STATE_NEW)
> + return -EBUSY;
Maybe -EINVAL? The realm hasn't been created yet (RMI_REALM_CREATE has not
been called). Userspace shouldn't reach this path.
> +
> + if (!IS_ALIGNED(args->populate_ipa_base, PAGE_SIZE) ||
> + !IS_ALIGNED(args->populate_ipa_size, PAGE_SIZE))
> + return -EINVAL;
> +
> + ipa_base = args->populate_ipa_base;
> + ipa_end = ipa_base + args->populate_ipa_size;
> +
> + if (ipa_end < ipa_base)
> + return -EINVAL;
> +
> + return populate_par_region(kvm, ipa_base, ipa_end);
> +}
> +
> static int set_ipa_state(struct kvm_vcpu *vcpu,
> unsigned long ipa,
> unsigned long end,
> @@ -748,6 +1102,18 @@ int kvm_realm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
> r = kvm_init_ipa_range_realm(kvm, &args);
> break;
> }
> + case KVM_CAP_ARM_RME_POPULATE_REALM: {
> + struct kvm_cap_arm_rme_populate_realm_args args;
> + void __user *argp = u64_to_user_ptr(cap->args[1]);
> +
> + if (copy_from_user(&args, argp, sizeof(args))) {
> + r = -EFAULT;
> + break;
> + }
> +
> + r = kvm_populate_realm(kvm, &args);
> + break;
> + }
> default:
> r = -EINVAL;
> break;
On 06/03/2023 17:34, Zhi Wang wrote:
> On Fri, 27 Jan 2023 11:29:20 +0000
> Steven Price <steven.price@arm.com> wrote:
>
>> The VMM needs to populate the realm with some data before starting (e.g.
>> a kernel and initrd). This is measured by the RMM and used as part of
>> the attestation later on.
>>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>> arch/arm64/kvm/rme.c | 366 +++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 366 insertions(+)
>>
>> diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c
>> index 16e0bfea98b1..3405b43e1421 100644
>> --- a/arch/arm64/kvm/rme.c
>> +++ b/arch/arm64/kvm/rme.c
>> @@ -4,6 +4,7 @@
>> */
>>
>> #include <linux/kvm_host.h>
>> +#include <linux/hugetlb.h>
>>
>> #include <asm/kvm_emulate.h>
>> #include <asm/kvm_mmu.h>
>> @@ -426,6 +427,359 @@ void kvm_realm_unmap_range(struct kvm *kvm, unsigned long ipa, u64 size)
>> }
>> }
>>
>> +static int realm_create_protected_data_page(struct realm *realm,
>> + unsigned long ipa,
>> + struct page *dst_page,
>> + struct page *tmp_page)
>> +{
>> + phys_addr_t dst_phys, tmp_phys;
>> + int ret;
>> +
>> + copy_page(page_address(tmp_page), page_address(dst_page));
>> +
>> + dst_phys = page_to_phys(dst_page);
>> + tmp_phys = page_to_phys(tmp_page);
>> +
>> + if (rmi_granule_delegate(dst_phys))
>> + return -ENXIO;
>> +
>> + ret = rmi_data_create(dst_phys, virt_to_phys(realm->rd), ipa, tmp_phys,
>> + RMI_MEASURE_CONTENT);
>> +
>> + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
>> + /* Create missing RTTs and retry */
>> + int level = RMI_RETURN_INDEX(ret);
>> +
>> + ret = realm_create_rtt_levels(realm, ipa, level,
>> + RME_RTT_MAX_LEVEL, NULL);
>> + if (ret)
>> + goto err;
>> +
>> + ret = rmi_data_create(dst_phys, virt_to_phys(realm->rd), ipa,
>> + tmp_phys, RMI_MEASURE_CONTENT);
>> + }
>> +
>> + if (ret)
>> + goto err;
>> +
>> + return 0;
>> +
>> +err:
>> + if (WARN_ON(rmi_granule_undelegate(dst_phys))) {
>> + /* Page can't be returned to NS world so is lost */
>> + get_page(dst_page);
>> + }
>> + return -ENXIO;
>> +}
>> +
>> +static int fold_rtt(phys_addr_t rd, unsigned long addr, int level,
>> + struct realm *realm)
>> +{
>> + struct rtt_entry rtt;
>> + phys_addr_t rtt_addr;
>> +
>> + if (rmi_rtt_read_entry(rd, addr, level, &rtt))
>> + return -ENXIO;
>> +
>> + if (rtt.state != RMI_TABLE)
>> + return -EINVAL;
>> +
>> + rtt_addr = rmi_rtt_get_phys(&rtt);
>> + if (rmi_rtt_fold(rtt_addr, rd, addr, level + 1))
>> + return -ENXIO;
>> +
>> + free_delegated_page(realm, rtt_addr);
>> +
>> + return 0;
>> +}
>> +
>> +int realm_map_protected(struct realm *realm,
>> + unsigned long hva,
>> + unsigned long base_ipa,
>> + struct page *dst_page,
>> + unsigned long map_size,
>> + struct kvm_mmu_memory_cache *memcache)
>> +{
>> + phys_addr_t dst_phys = page_to_phys(dst_page);
>> + phys_addr_t rd = virt_to_phys(realm->rd);
>> + unsigned long phys = dst_phys;
>> + unsigned long ipa = base_ipa;
>> + unsigned long size;
>> + int map_level;
>> + int ret = 0;
>> +
>> + if (WARN_ON(!IS_ALIGNED(ipa, map_size)))
>> + return -EINVAL;
>> +
>> + switch (map_size) {
>> + case PAGE_SIZE:
>> + map_level = 3;
>> + break;
>> + case RME_L2_BLOCK_SIZE:
>> + map_level = 2;
>> + break;
>> + default:
>> + return -EINVAL;
>> + }
>> +
>> + if (map_level < RME_RTT_MAX_LEVEL) {
>> + /*
>> + * A temporary RTT is needed during the map, precreate it,
>> + * however if there is an error (e.g. missing parent tables)
>> + * this will be handled below.
>> + */
>> + realm_create_rtt_levels(realm, ipa, map_level,
>> + RME_RTT_MAX_LEVEL, memcache);
>> + }
>> +
>> + for (size = 0; size < map_size; size += PAGE_SIZE) {
>> + if (rmi_granule_delegate(phys)) {
>> + struct rtt_entry rtt;
>> +
>> + /*
>> + * It's possible we raced with another VCPU on the same
>> + * fault. If the entry exists and matches then exit
>> + * early and assume the other VCPU will handle the
>> + * mapping.
>> + */
>> + if (rmi_rtt_read_entry(rd, ipa, RME_RTT_MAX_LEVEL, &rtt))
>> + goto err;
>> +
>> + // FIXME: For a block mapping this could race at level
>> + // 2 or 3...
>> + if (WARN_ON((rtt.walk_level != RME_RTT_MAX_LEVEL ||
>> + rtt.state != RMI_ASSIGNED ||
>> + rtt.desc != phys))) {
>> + goto err;
>> + }
>> +
>> + return 0;
>> + }
>> +
>> + ret = rmi_data_create_unknown(phys, rd, ipa);
>> +
>> + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
>> + /* Create missing RTTs and retry */
>> + int level = RMI_RETURN_INDEX(ret);
>> +
>> + ret = realm_create_rtt_levels(realm, ipa, level,
>> + RME_RTT_MAX_LEVEL,
>> + memcache);
>> + WARN_ON(ret);
>> + if (ret)
>> + goto err_undelegate;
>> +
>> + ret = rmi_data_create_unknown(phys, rd, ipa);
>> + }
>> + WARN_ON(ret);
>> +
>> + if (ret)
>> + goto err_undelegate;
>> +
>> + phys += PAGE_SIZE;
>> + ipa += PAGE_SIZE;
>> + }
>> +
>> + if (map_size == RME_L2_BLOCK_SIZE)
>> + ret = fold_rtt(rd, base_ipa, map_level, realm);
>> + if (WARN_ON(ret))
>> + goto err;
>> +
>> + return 0;
>> +
>> +err_undelegate:
>> + if (WARN_ON(rmi_granule_undelegate(phys))) {
>> + /* Page can't be returned to NS world so is lost */
>> + get_page(phys_to_page(phys));
>> + }
>> +err:
>> + while (size > 0) {
>> + phys -= PAGE_SIZE;
>> + size -= PAGE_SIZE;
>> + ipa -= PAGE_SIZE;
>> +
>> + rmi_data_destroy(rd, ipa);
>> +
>> + if (WARN_ON(rmi_granule_undelegate(phys))) {
>> + /* Page can't be returned to NS world so is lost */
>> + get_page(phys_to_page(phys));
>> + }
>> + }
>> + return -ENXIO;
>> +}
>> +
>
> There seems to be no caller of the function above. Better to move it to the
> related patch.
Indeed this should really be in the next patch - will move as it's very
confusing having it in this patch (sorry about that).
>> +static int populate_par_region(struct kvm *kvm,
>> + phys_addr_t ipa_base,
>> + phys_addr_t ipa_end)
>> +{
>> + struct realm *realm = &kvm->arch.realm;
>> + struct kvm_memory_slot *memslot;
>> + gfn_t base_gfn, end_gfn;
>> + int idx;
>> + phys_addr_t ipa;
>> + int ret = 0;
>> + struct page *tmp_page;
>> + phys_addr_t rd = virt_to_phys(realm->rd);
>> +
>> + base_gfn = gpa_to_gfn(ipa_base);
>> + end_gfn = gpa_to_gfn(ipa_end);
>> +
>> + idx = srcu_read_lock(&kvm->srcu);
>> + memslot = gfn_to_memslot(kvm, base_gfn);
>> + if (!memslot) {
>> + ret = -EFAULT;
>> + goto out;
>> + }
>> +
>> + /* We require the region to be contained within a single memslot */
>> + if (memslot->base_gfn + memslot->npages < end_gfn) {
>> + ret = -EINVAL;
>> + goto out;
>> + }
>> +
>> + tmp_page = alloc_page(GFP_KERNEL);
>> + if (!tmp_page) {
>> + ret = -ENOMEM;
>> + goto out;
>> + }
>> +
>> + mmap_read_lock(current->mm);
>> +
>> + ipa = ipa_base;
>> +
>> + while (ipa < ipa_end) {
>> + struct vm_area_struct *vma;
>> + unsigned long map_size;
>> + unsigned int vma_shift;
>> + unsigned long offset;
>> + unsigned long hva;
>> + struct page *page;
>> + kvm_pfn_t pfn;
>> + int level;
>> +
>> + hva = gfn_to_hva_memslot(memslot, gpa_to_gfn(ipa));
>> + vma = vma_lookup(current->mm, hva);
>> + if (!vma) {
>> + ret = -EFAULT;
>> + break;
>> + }
>> +
>> + if (is_vm_hugetlb_page(vma))
>> + vma_shift = huge_page_shift(hstate_vma(vma));
>> + else
>> + vma_shift = PAGE_SHIFT;
>> +
>> + map_size = 1 << vma_shift;
>> +
>> + /*
>> + * FIXME: This causes over mapping, but there's no good
>> + * solution here with the ABI as it stands
>> + */
>> + ipa = ALIGN_DOWN(ipa, map_size);
>> +
>> + switch (map_size) {
>> + case RME_L2_BLOCK_SIZE:
>> + level = 2;
>> + break;
>> + case PAGE_SIZE:
>> + level = 3;
>> + break;
>> + default:
>> + WARN_ONCE(1, "Unsupported vma_shift %d", vma_shift);
>> + ret = -EFAULT;
>> + break;
>> + }
>> +
>> + pfn = gfn_to_pfn_memslot(memslot, gpa_to_gfn(ipa));
>> +
>> + if (is_error_pfn(pfn)) {
>> + ret = -EFAULT;
>> + break;
>> + }
>> +
>> + ret = rmi_rtt_init_ripas(rd, ipa, level);
>> + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
>> + ret = realm_create_rtt_levels(realm, ipa,
>> + RMI_RETURN_INDEX(ret),
>> + level, NULL);
>> + if (ret)
>> + break;
>> + ret = rmi_rtt_init_ripas(rd, ipa, level);
>> + if (ret) {
>> + ret = -ENXIO;
>> + break;
>> + }
>> + }
>> +
>> + if (level < RME_RTT_MAX_LEVEL) {
>> + /*
>> + * A temporary RTT is needed during the map, precreate
>> + * it, however if there is an error (e.g. missing
>> + * parent tables) this will be handled in the
>> + * realm_create_protected_data_page() call.
>> + */
>> + realm_create_rtt_levels(realm, ipa, level,
>> + RME_RTT_MAX_LEVEL, NULL);
>> + }
>> +
>> + page = pfn_to_page(pfn);
>> +
>> + for (offset = 0; offset < map_size && !ret;
>> + offset += PAGE_SIZE, page++) {
>> + phys_addr_t page_ipa = ipa + offset;
>> +
>> + ret = realm_create_protected_data_page(realm, page_ipa,
>> + page, tmp_page);
>> + }
>> + if (ret)
>> + goto err_release_pfn;
>> +
>> + if (level == 2) {
>> + ret = fold_rtt(rd, ipa, level, realm);
>> + if (ret)
>> + goto err_release_pfn;
>> + }
>> +
>> + ipa += map_size;
>
>> + kvm_set_pfn_accessed(pfn);
>> + kvm_set_pfn_dirty(pfn);
>
> kvm_release_pfn_dirty() already calls kvm_set_pfn_{accessed,dirty}().
Will remove those calls.
>> + kvm_release_pfn_dirty(pfn);
>> +err_release_pfn:
>> + if (ret) {
>> + kvm_release_pfn_clean(pfn);
>> + break;
>> + }
>> + }
>> +
>> + mmap_read_unlock(current->mm);
>> + __free_page(tmp_page);
>> +
>> +out:
>> + srcu_read_unlock(&kvm->srcu, idx);
>> + return ret;
>> +}
>> +
>> +static int kvm_populate_realm(struct kvm *kvm,
>> + struct kvm_cap_arm_rme_populate_realm_args *args)
>> +{
>> + phys_addr_t ipa_base, ipa_end;
>> +
>
> Check kvm_is_realm(kvm) here or in kvm_realm_enable_cap().
I'm going to update kvm_vm_ioctl_enable_cap() to check kvm_is_realm() so
we won't get here.
>> + if (kvm_realm_state(kvm) != REALM_STATE_NEW)
>> + return -EBUSY;
>
> Maybe -EINVAL? The realm hasn't been created yet (RMI_REALM_CREATE has not
> been called). Userspace shouldn't reach this path.
Well user space can attempt to populate in the ACTIVE state - which is
where the idea of 'busy' comes from. Admittedly it's a little confusing
when RMI_REALM_CREATE hasn't been called.
I'm not particularly bothered about the return code, but it's useful to
have a different code from -EINVAL, as the problem isn't an invalid argument
but a call made at the wrong time. I can't immediately see a better error
code though.
Steve
>> +
>> + if (!IS_ALIGNED(args->populate_ipa_base, PAGE_SIZE) ||
>> + !IS_ALIGNED(args->populate_ipa_size, PAGE_SIZE))
>> + return -EINVAL;
>> +
>> + ipa_base = args->populate_ipa_base;
>> + ipa_end = ipa_base + args->populate_ipa_size;
>> +
>> + if (ipa_end < ipa_base)
>> + return -EINVAL;
>> +
>> + return populate_par_region(kvm, ipa_base, ipa_end);
>> +}
>> +
>> static int set_ipa_state(struct kvm_vcpu *vcpu,
>> unsigned long ipa,
>> unsigned long end,
>> @@ -748,6 +1102,18 @@ int kvm_realm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
>> r = kvm_init_ipa_range_realm(kvm, &args);
>> break;
>> }
>> + case KVM_CAP_ARM_RME_POPULATE_REALM: {
>> + struct kvm_cap_arm_rme_populate_realm_args args;
>> + void __user *argp = u64_to_user_ptr(cap->args[1]);
>> +
>> + if (copy_from_user(&args, argp, sizeof(args))) {
>> + r = -EFAULT;
>> + break;
>> + }
>> +
>> + r = kvm_populate_realm(kvm, &args);
>> + break;
>> + }
>> default:
>> r = -EINVAL;
>> break;
>
On Fri, 10 Mar 2023 15:47:16 +0000
Steven Price <steven.price@arm.com> wrote:
> On 06/03/2023 17:34, Zhi Wang wrote:
> > On Fri, 27 Jan 2023 11:29:20 +0000
> > Steven Price <steven.price@arm.com> wrote:
> >
> >> The VMM needs to populate the realm with some data before starting (e.g.
> >> a kernel and initrd). This is measured by the RMM and used as part of
> >> the attestation later on.
> >>
> >> Signed-off-by: Steven Price <steven.price@arm.com>
> >> ---
> >> arch/arm64/kvm/rme.c | 366 +++++++++++++++++++++++++++++++++++++++++++
> >> 1 file changed, 366 insertions(+)
> >>
> >> diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c
> >> index 16e0bfea98b1..3405b43e1421 100644
> >> --- a/arch/arm64/kvm/rme.c
> >> +++ b/arch/arm64/kvm/rme.c
> >> @@ -4,6 +4,7 @@
> >> */
> >>
> >> #include <linux/kvm_host.h>
> >> +#include <linux/hugetlb.h>
> >>
> >> #include <asm/kvm_emulate.h>
> >> #include <asm/kvm_mmu.h>
> >> @@ -426,6 +427,359 @@ void kvm_realm_unmap_range(struct kvm *kvm, unsigned long ipa, u64 size)
> >> }
> >> }
> >>
> >> +static int realm_create_protected_data_page(struct realm *realm,
> >> + unsigned long ipa,
> >> + struct page *dst_page,
> >> + struct page *tmp_page)
> >> +{
> >> + phys_addr_t dst_phys, tmp_phys;
> >> + int ret;
> >> +
> >> + copy_page(page_address(tmp_page), page_address(dst_page));
> >> +
> >> + dst_phys = page_to_phys(dst_page);
> >> + tmp_phys = page_to_phys(tmp_page);
> >> +
> >> + if (rmi_granule_delegate(dst_phys))
> >> + return -ENXIO;
> >> +
> >> + ret = rmi_data_create(dst_phys, virt_to_phys(realm->rd), ipa, tmp_phys,
> >> + RMI_MEASURE_CONTENT);
> >> +
> >> + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
> >> + /* Create missing RTTs and retry */
> >> + int level = RMI_RETURN_INDEX(ret);
> >> +
> >> + ret = realm_create_rtt_levels(realm, ipa, level,
> >> + RME_RTT_MAX_LEVEL, NULL);
> >> + if (ret)
> >> + goto err;
> >> +
> >> + ret = rmi_data_create(dst_phys, virt_to_phys(realm->rd), ipa,
> >> + tmp_phys, RMI_MEASURE_CONTENT);
> >> + }
> >> +
> >> + if (ret)
> >> + goto err;
> >> +
> >> + return 0;
> >> +
> >> +err:
> >> + if (WARN_ON(rmi_granule_undelegate(dst_phys))) {
> >> + /* Page can't be returned to NS world so is lost */
> >> + get_page(dst_page);
> >> + }
> >> + return -ENXIO;
> >> +}
> >> +
> >> +static int fold_rtt(phys_addr_t rd, unsigned long addr, int level,
> >> + struct realm *realm)
> >> +{
> >> + struct rtt_entry rtt;
> >> + phys_addr_t rtt_addr;
> >> +
> >> + if (rmi_rtt_read_entry(rd, addr, level, &rtt))
> >> + return -ENXIO;
> >> +
> >> + if (rtt.state != RMI_TABLE)
> >> + return -EINVAL;
> >> +
> >> + rtt_addr = rmi_rtt_get_phys(&rtt);
> >> + if (rmi_rtt_fold(rtt_addr, rd, addr, level + 1))
> >> + return -ENXIO;
> >> +
> >> + free_delegated_page(realm, rtt_addr);
> >> +
> >> + return 0;
> >> +}
> >> +
> >> +int realm_map_protected(struct realm *realm,
> >> + unsigned long hva,
> >> + unsigned long base_ipa,
> >> + struct page *dst_page,
> >> + unsigned long map_size,
> >> + struct kvm_mmu_memory_cache *memcache)
> >> +{
> >> + phys_addr_t dst_phys = page_to_phys(dst_page);
> >> + phys_addr_t rd = virt_to_phys(realm->rd);
> >> + unsigned long phys = dst_phys;
> >> + unsigned long ipa = base_ipa;
> >> + unsigned long size;
> >> + int map_level;
> >> + int ret = 0;
> >> +
> >> + if (WARN_ON(!IS_ALIGNED(ipa, map_size)))
> >> + return -EINVAL;
> >> +
> >> + switch (map_size) {
> >> + case PAGE_SIZE:
> >> + map_level = 3;
> >> + break;
> >> + case RME_L2_BLOCK_SIZE:
> >> + map_level = 2;
> >> + break;
> >> + default:
> >> + return -EINVAL;
> >> + }
> >> +
> >> + if (map_level < RME_RTT_MAX_LEVEL) {
> >> + /*
> >> + * A temporary RTT is needed during the map, precreate it,
> >> + * however if there is an error (e.g. missing parent tables)
> >> + * this will be handled below.
> >> + */
> >> + realm_create_rtt_levels(realm, ipa, map_level,
> >> + RME_RTT_MAX_LEVEL, memcache);
> >> + }
> >> +
> >> + for (size = 0; size < map_size; size += PAGE_SIZE) {
> >> + if (rmi_granule_delegate(phys)) {
> >> + struct rtt_entry rtt;
> >> +
> >> + /*
> >> + * It's possible we raced with another VCPU on the same
> >> + * fault. If the entry exists and matches then exit
> >> + * early and assume the other VCPU will handle the
> >> + * mapping.
> >> + */
> >> + if (rmi_rtt_read_entry(rd, ipa, RME_RTT_MAX_LEVEL, &rtt))
> >> + goto err;
> >> +
> >> + // FIXME: For a block mapping this could race at level
> >> + // 2 or 3...
> >> + if (WARN_ON((rtt.walk_level != RME_RTT_MAX_LEVEL ||
> >> + rtt.state != RMI_ASSIGNED ||
> >> + rtt.desc != phys))) {
> >> + goto err;
> >> + }
> >> +
> >> + return 0;
> >> + }
> >> +
> >> + ret = rmi_data_create_unknown(phys, rd, ipa);
> >> +
> >> + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
> >> + /* Create missing RTTs and retry */
> >> + int level = RMI_RETURN_INDEX(ret);
> >> +
> >> + ret = realm_create_rtt_levels(realm, ipa, level,
> >> + RME_RTT_MAX_LEVEL,
> >> + memcache);
> >> + WARN_ON(ret);
> >> + if (ret)
> >> + goto err_undelegate;
> >> +
> >> + ret = rmi_data_create_unknown(phys, rd, ipa);
> >> + }
> >> + WARN_ON(ret);
> >> +
> >> + if (ret)
> >> + goto err_undelegate;
> >> +
> >> + phys += PAGE_SIZE;
> >> + ipa += PAGE_SIZE;
> >> + }
> >> +
> >> + if (map_size == RME_L2_BLOCK_SIZE)
> >> + ret = fold_rtt(rd, base_ipa, map_level, realm);
> >> + if (WARN_ON(ret))
> >> + goto err;
> >> +
> >> + return 0;
> >> +
> >> +err_undelegate:
> >> + if (WARN_ON(rmi_granule_undelegate(phys))) {
> >> + /* Page can't be returned to NS world so is lost */
> >> + get_page(phys_to_page(phys));
> >> + }
> >> +err:
> >> + while (size > 0) {
> >> + phys -= PAGE_SIZE;
> >> + size -= PAGE_SIZE;
> >> + ipa -= PAGE_SIZE;
> >> +
> >> + rmi_data_destroy(rd, ipa);
> >> +
> >> + if (WARN_ON(rmi_granule_undelegate(phys))) {
> >> + /* Page can't be returned to NS world so is lost */
> >> + get_page(phys_to_page(phys));
> >> + }
> >> + }
> >> + return -ENXIO;
> >> +}
> >> +
> >
> > There seems to be no caller of the function above. Better to move it to the
> > related patch.
>
> Indeed this should really be in the next patch - will move as it's very
> confusing having it in this patch (sorry about that).
>
> >> +static int populate_par_region(struct kvm *kvm,
> >> + phys_addr_t ipa_base,
> >> + phys_addr_t ipa_end)
> >> +{
> >> + struct realm *realm = &kvm->arch.realm;
> >> + struct kvm_memory_slot *memslot;
> >> + gfn_t base_gfn, end_gfn;
> >> + int idx;
> >> + phys_addr_t ipa;
> >> + int ret = 0;
> >> + struct page *tmp_page;
> >> + phys_addr_t rd = virt_to_phys(realm->rd);
> >> +
> >> + base_gfn = gpa_to_gfn(ipa_base);
> >> + end_gfn = gpa_to_gfn(ipa_end);
> >> +
> >> + idx = srcu_read_lock(&kvm->srcu);
> >> + memslot = gfn_to_memslot(kvm, base_gfn);
> >> + if (!memslot) {
> >> + ret = -EFAULT;
> >> + goto out;
> >> + }
> >> +
> >> + /* We require the region to be contained within a single memslot */
> >> + if (memslot->base_gfn + memslot->npages < end_gfn) {
> >> + ret = -EINVAL;
> >> + goto out;
> >> + }
> >> +
> >> + tmp_page = alloc_page(GFP_KERNEL);
> >> + if (!tmp_page) {
> >> + ret = -ENOMEM;
> >> + goto out;
> >> + }
> >> +
> >> + mmap_read_lock(current->mm);
> >> +
> >> + ipa = ipa_base;
> >> +
> >> + while (ipa < ipa_end) {
> >> + struct vm_area_struct *vma;
> >> + unsigned long map_size;
> >> + unsigned int vma_shift;
> >> + unsigned long offset;
> >> + unsigned long hva;
> >> + struct page *page;
> >> + kvm_pfn_t pfn;
> >> + int level;
> >> +
> >> + hva = gfn_to_hva_memslot(memslot, gpa_to_gfn(ipa));
> >> + vma = vma_lookup(current->mm, hva);
> >> + if (!vma) {
> >> + ret = -EFAULT;
> >> + break;
> >> + }
> >> +
> >> + if (is_vm_hugetlb_page(vma))
> >> + vma_shift = huge_page_shift(hstate_vma(vma));
> >> + else
> >> + vma_shift = PAGE_SHIFT;
> >> +
> >> + map_size = 1 << vma_shift;
> >> +
> >> + /*
> >> + * FIXME: This causes over mapping, but there's no good
> >> + * solution here with the ABI as it stands
> >> + */
> >> + ipa = ALIGN_DOWN(ipa, map_size);
> >> +
> >> + switch (map_size) {
> >> + case RME_L2_BLOCK_SIZE:
> >> + level = 2;
> >> + break;
> >> + case PAGE_SIZE:
> >> + level = 3;
> >> + break;
> >> + default:
> >> + WARN_ONCE(1, "Unsupported vma_shift %d", vma_shift);
> >> + ret = -EFAULT;
> >> + break;
> >> + }
> >> +
> >> + pfn = gfn_to_pfn_memslot(memslot, gpa_to_gfn(ipa));
> >> +
> >> + if (is_error_pfn(pfn)) {
> >> + ret = -EFAULT;
> >> + break;
> >> + }
> >> +
> >> + ret = rmi_rtt_init_ripas(rd, ipa, level);
> >> + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
> >> + ret = realm_create_rtt_levels(realm, ipa,
> >> + RMI_RETURN_INDEX(ret),
> >> + level, NULL);
> >> + if (ret)
> >> + break;
> >> + ret = rmi_rtt_init_ripas(rd, ipa, level);
> >> + if (ret) {
> >> + ret = -ENXIO;
> >> + break;
> >> + }
> >> + }
> >> +
> >> + if (level < RME_RTT_MAX_LEVEL) {
> >> + /*
> >> + * A temporary RTT is needed during the map, precreate
> >> + * it, however if there is an error (e.g. missing
> >> + * parent tables) this will be handled in the
> >> + * realm_create_protected_data_page() call.
> >> + */
> >> + realm_create_rtt_levels(realm, ipa, level,
> >> + RME_RTT_MAX_LEVEL, NULL);
> >> + }
> >> +
> >> + page = pfn_to_page(pfn);
> >> +
> >> + for (offset = 0; offset < map_size && !ret;
> >> + offset += PAGE_SIZE, page++) {
> >> + phys_addr_t page_ipa = ipa + offset;
> >> +
> >> + ret = realm_create_protected_data_page(realm, page_ipa,
> >> + page, tmp_page);
> >> + }
> >> + if (ret)
> >> + goto err_release_pfn;
> >> +
> >> + if (level == 2) {
> >> + ret = fold_rtt(rd, ipa, level, realm);
> >> + if (ret)
> >> + goto err_release_pfn;
> >> + }
> >> +
> >> + ipa += map_size;
> >
> >> + kvm_set_pfn_accessed(pfn);
> >> + kvm_set_pfn_dirty(pfn);
> >
> > kvm_release_pfn_dirty() already calls kvm_set_pfn_{accessed,dirty}().
>
> Will remove those calls.
>
> >> + kvm_release_pfn_dirty(pfn);
> >> +err_release_pfn:
> >> + if (ret) {
> >> + kvm_release_pfn_clean(pfn);
> >> + break;
> >> + }
> >> + }
> >> +
> >> + mmap_read_unlock(current->mm);
> >> + __free_page(tmp_page);
> >> +
> >> +out:
> >> + srcu_read_unlock(&kvm->srcu, idx);
> >> + return ret;
> >> +}
> >> +
> >> +static int kvm_populate_realm(struct kvm *kvm,
> >> + struct kvm_cap_arm_rme_populate_realm_args *args)
> >> +{
> >> + phys_addr_t ipa_base, ipa_end;
> >> +
> >
> > Check kvm_is_realm(kvm) here or in kvm_realm_enable_cap().
>
> I'm going to update kvm_vm_ioctl_enable_cap() to check kvm_is_realm() so
> we won't get here.
>
> >> + if (kvm_realm_state(kvm) != REALM_STATE_NEW)
> >> + return -EBUSY;
> >
> > Maybe -EINVAL? The realm hasn't been created yet (RMI_REALM_CREATE has not
> > been called). Userspace shouldn't reach this path.
>
> Well user space can attempt to populate in the ACTIVE state - which is
> where the idea of 'busy' comes from. Admittedly it's a little confusing
> when RMI_REALM_CREATE hasn't been called.
>
> I'm not particularly bothered about the return code, but it's useful to
> have a different code from -EINVAL, as the problem isn't an invalid argument
> but a call made at the wrong time. I can't immediately see a better error
> code though.
>
The reason I feel -EBUSY is a little bit off is that EBUSY usually indicates
something is already initialized and currently running when another calling
path wants to operate on it.
I took a look at the ioctls in arch/arm64/kvm/arm.c. It seems people have
different opinions on calling an execution path at the wrong time:
For example:

long kvm_arch_vcpu_ioctl()
...
	case KVM_GET_REG_LIST: {
		struct kvm_reg_list __user *user_list = argp;
		struct kvm_reg_list reg_list;
		unsigned n;

		r = -ENOEXEC;
		if (unlikely(!kvm_vcpu_initialized(vcpu)))
			break;

		r = -EPERM;
		if (!kvm_arm_vcpu_is_finalized(vcpu))
			break;

If we have to choose one, I prefer -ENOEXEC, as -EPERM seems stranger. But
personally my vote goes to -EINVAL.
> Steve
>
> >> +
> >> + if (!IS_ALIGNED(args->populate_ipa_base, PAGE_SIZE) ||
> >> + !IS_ALIGNED(args->populate_ipa_size, PAGE_SIZE))
> >> + return -EINVAL;
> >> +
> >> + ipa_base = args->populate_ipa_base;
> >> + ipa_end = ipa_base + args->populate_ipa_size;
> >> +
> >> + if (ipa_end < ipa_base)
> >> + return -EINVAL;
> >> +
> >> + return populate_par_region(kvm, ipa_base, ipa_end);
> >> +}
> >> +
> >> static int set_ipa_state(struct kvm_vcpu *vcpu,
> >> unsigned long ipa,
> >> unsigned long end,
> >> @@ -748,6 +1102,18 @@ int kvm_realm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
> >> r = kvm_init_ipa_range_realm(kvm, &args);
> >> break;
> >> }
> >> + case KVM_CAP_ARM_RME_POPULATE_REALM: {
> >> + struct kvm_cap_arm_rme_populate_realm_args args;
> >> + void __user *argp = u64_to_user_ptr(cap->args[1]);
> >> +
> >> + if (copy_from_user(&args, argp, sizeof(args))) {
> >> + r = -EFAULT;
> >> + break;
> >> + }
> >> +
> >> + r = kvm_populate_realm(kvm, &args);
> >> + break;
> >> + }
> >> default:
> >> r = -EINVAL;
> >> break;
> >
>
On 14/03/2023 15:31, Zhi Wang wrote:
> On Fri, 10 Mar 2023 15:47:16 +0000
> Steven Price <steven.price@arm.com> wrote:
>
>> On 06/03/2023 17:34, Zhi Wang wrote:
>>> On Fri, 27 Jan 2023 11:29:20 +0000
>>> Steven Price <steven.price@arm.com> wrote:
<snip>
>>>> + if (kvm_realm_state(kvm) != REALM_STATE_NEW)
>>>> + return -EBUSY;
>>>
>>> Maybe -EINVAL? The realm hasn't been created yet (RMI_REALM_CREATE has not
>>> been called). Userspace shouldn't reach this path.
>>
>> Well user space can attempt to populate in the ACTIVE state - which is
>> where the idea of 'busy' comes from. Admittedly it's a little confusing
>> when RMI_REALM_CREATE hasn't been called.
>>
>> I'm not particularly bothered about the return code, but it's useful to
>> have a different code from -EINVAL, as the problem isn't an invalid argument
>> but a call made at the wrong time. I can't immediately see a better error
>> code though.
>>
> The reason I feel -EBUSY is a little bit off is that EBUSY usually indicates
> something is already initialized and currently running when another calling
> path wants to operate on it.
>
> I took a look at the ioctls in arch/arm64/kvm/arm.c. It seems people have
> different opinions on calling an execution path at the wrong time:
>
> For example:
>
> long kvm_arch_vcpu_ioctl()
> ...
> 	case KVM_GET_REG_LIST: {
> 		struct kvm_reg_list __user *user_list = argp;
> 		struct kvm_reg_list reg_list;
> 		unsigned n;
>
> 		r = -ENOEXEC;
> 		if (unlikely(!kvm_vcpu_initialized(vcpu)))
> 			break;
>
> 		r = -EPERM;
> 		if (!kvm_arm_vcpu_is_finalized(vcpu))
> 			break;
>
> If we have to choose one, I prefer -ENOEXEC, as -EPERM seems stranger. But
> personally my vote goes to -EINVAL.
Ok, I think you've convinced me - I'll change to -EINVAL. It is invalid
use of the API and none of the other error codes seem a great fit.
Although I do wish Linux had more descriptive error codes - I often end
up peppering the kernel with a few printks when using a new API to find
out what I'm doing wrong.
Steve
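For reference, pulling together the changes agreed in this thread, the checks
might end up looking something like the sketch below (not part of the posted
patch; the error code for the kvm_is_realm() guard is chosen here purely for
illustration):

	/* In kvm_vm_ioctl_enable_cap(), before dispatching realm caps: */
	if (!kvm_is_realm(kvm))
		return -EINVAL;

	/* In kvm_populate_realm(), replacing the -EBUSY return: */
	if (kvm_realm_state(kvm) != REALM_STATE_NEW)
		return -EINVAL;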
>> Steve
>>
>>>> +
>>>> + if (!IS_ALIGNED(args->populate_ipa_base, PAGE_SIZE) ||
>>>> + !IS_ALIGNED(args->populate_ipa_size, PAGE_SIZE))
>>>> + return -EINVAL;
>>>> +
>>>> + ipa_base = args->populate_ipa_base;
>>>> + ipa_end = ipa_base + args->populate_ipa_size;
>>>> +
>>>> + if (ipa_end < ipa_base)
>>>> + return -EINVAL;
>>>> +
>>>> + return populate_par_region(kvm, ipa_base, ipa_end);
>>>> +}
>>>> +
>>>> static int set_ipa_state(struct kvm_vcpu *vcpu,
>>>> unsigned long ipa,
>>>> unsigned long end,
>>>> @@ -748,6 +1102,18 @@ int kvm_realm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
>>>> r = kvm_init_ipa_range_realm(kvm, &args);
>>>> break;
>>>> }
>>>> + case KVM_CAP_ARM_RME_POPULATE_REALM: {
>>>> + struct kvm_cap_arm_rme_populate_realm_args args;
>>>> + void __user *argp = u64_to_user_ptr(cap->args[1]);
>>>> +
>>>> + if (copy_from_user(&args, argp, sizeof(args))) {
>>>> + r = -EFAULT;
>>>> + break;
>>>> + }
>>>> +
>>>> + r = kvm_populate_realm(kvm, &args);
>>>> + break;
>>>> + }
>>>> default:
>>>> r = -EINVAL;
>>>> break;
>>>
>>
>
@@ -4,6 +4,7 @@
  */
 
 #include <linux/kvm_host.h>
+#include <linux/hugetlb.h>
 
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_mmu.h>
@@ -426,6 +427,359 @@ void kvm_realm_unmap_range(struct kvm *kvm, unsigned long ipa, u64 size)
 	}
 }
 
+static int realm_create_protected_data_page(struct realm *realm,
+					    unsigned long ipa,
+					    struct page *dst_page,
+					    struct page *tmp_page)
+{
+	phys_addr_t dst_phys, tmp_phys;
+	int ret;
+
+	copy_page(page_address(tmp_page), page_address(dst_page));
+
+	dst_phys = page_to_phys(dst_page);
+	tmp_phys = page_to_phys(tmp_page);
+
+	if (rmi_granule_delegate(dst_phys))
+		return -ENXIO;
+
+	ret = rmi_data_create(dst_phys, virt_to_phys(realm->rd), ipa, tmp_phys,
+			      RMI_MEASURE_CONTENT);
+
+	if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
+		/* Create missing RTTs and retry */
+		int level = RMI_RETURN_INDEX(ret);
+
+		ret = realm_create_rtt_levels(realm, ipa, level,
+					      RME_RTT_MAX_LEVEL, NULL);
+		if (ret)
+			goto err;
+
+		ret = rmi_data_create(dst_phys, virt_to_phys(realm->rd), ipa,
+				      tmp_phys, RMI_MEASURE_CONTENT);
+	}
+
+	if (ret)
+		goto err;
+
+	return 0;
+
+err:
+	if (WARN_ON(rmi_granule_undelegate(dst_phys))) {
+		/* Page can't be returned to NS world so is lost */
+		get_page(dst_page);
+	}
+	return -ENXIO;
+}
+
+static int fold_rtt(phys_addr_t rd, unsigned long addr, int level,
+		    struct realm *realm)
+{
+	struct rtt_entry rtt;
+	phys_addr_t rtt_addr;
+
+	if (rmi_rtt_read_entry(rd, addr, level, &rtt))
+		return -ENXIO;
+
+	if (rtt.state != RMI_TABLE)
+		return -EINVAL;
+
+	rtt_addr = rmi_rtt_get_phys(&rtt);
+	if (rmi_rtt_fold(rtt_addr, rd, addr, level + 1))
+		return -ENXIO;
+
+	free_delegated_page(realm, rtt_addr);
+
+	return 0;
+}
+
+int realm_map_protected(struct realm *realm,
+			unsigned long hva,
+			unsigned long base_ipa,
+			struct page *dst_page,
+			unsigned long map_size,
+			struct kvm_mmu_memory_cache *memcache)
+{
+	phys_addr_t dst_phys = page_to_phys(dst_page);
+	phys_addr_t rd = virt_to_phys(realm->rd);
+	unsigned long phys = dst_phys;
+	unsigned long ipa = base_ipa;
+	unsigned long size;
+	int map_level;
+	int ret = 0;
+
+	if (WARN_ON(!IS_ALIGNED(ipa, map_size)))
+		return -EINVAL;
+
+	switch (map_size) {
+	case PAGE_SIZE:
+		map_level = 3;
+		break;
+	case RME_L2_BLOCK_SIZE:
+		map_level = 2;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (map_level < RME_RTT_MAX_LEVEL) {
+		/*
+		 * A temporary RTT is needed during the map, precreate it,
+		 * however if there is an error (e.g. missing parent tables)
+		 * this will be handled below.
+		 */
+		realm_create_rtt_levels(realm, ipa, map_level,
+					RME_RTT_MAX_LEVEL, memcache);
+	}
+
+	for (size = 0; size < map_size; size += PAGE_SIZE) {
+		if (rmi_granule_delegate(phys)) {
+			struct rtt_entry rtt;
+
+			/*
+			 * It's possible we raced with another VCPU on the same
+			 * fault. If the entry exists and matches then exit
+			 * early and assume the other VCPU will handle the
+			 * mapping.
+			 */
+			if (rmi_rtt_read_entry(rd, ipa, RME_RTT_MAX_LEVEL, &rtt))
+				goto err;
+
+			// FIXME: For a block mapping this could race at level
+			// 2 or 3...
+			if (WARN_ON((rtt.walk_level != RME_RTT_MAX_LEVEL ||
+				     rtt.state != RMI_ASSIGNED ||
+				     rtt.desc != phys))) {
+				goto err;
+			}
+
+			return 0;
+		}
+
+		ret = rmi_data_create_unknown(phys, rd, ipa);
+
+		if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
+			/* Create missing RTTs and retry */
+			int level = RMI_RETURN_INDEX(ret);
+
+			ret = realm_create_rtt_levels(realm, ipa, level,
+						      RME_RTT_MAX_LEVEL,
+						      memcache);
+			WARN_ON(ret);
+			if (ret)
+				goto err_undelegate;
+
+			ret = rmi_data_create_unknown(phys, rd, ipa);
+		}
+		WARN_ON(ret);
+
+		if (ret)
+			goto err_undelegate;
+
+		phys += PAGE_SIZE;
+		ipa += PAGE_SIZE;
+	}
+
+	if (map_size == RME_L2_BLOCK_SIZE)
+		ret = fold_rtt(rd, base_ipa, map_level, realm);
+	if (WARN_ON(ret))
+		goto err;
+
+	return 0;
+
+err_undelegate:
+	if (WARN_ON(rmi_granule_undelegate(phys))) {
+		/* Page can't be returned to NS world so is lost */
+		get_page(phys_to_page(phys));
+	}
+err:
+	while (size > 0) {
+		phys -= PAGE_SIZE;
+		size -= PAGE_SIZE;
+		ipa -= PAGE_SIZE;
+
+		rmi_data_destroy(rd, ipa);
+
+		if (WARN_ON(rmi_granule_undelegate(phys))) {
+			/* Page can't be returned to NS world so is lost */
+			get_page(phys_to_page(phys));
+		}
+	}
+	return -ENXIO;
+}
+
+static int populate_par_region(struct kvm *kvm,
+			       phys_addr_t ipa_base,
+			       phys_addr_t ipa_end)
+{
+	struct realm *realm = &kvm->arch.realm;
+	struct kvm_memory_slot *memslot;
+	gfn_t base_gfn, end_gfn;
+	int idx;
+	phys_addr_t ipa;
+	int ret = 0;
+	struct page *tmp_page;
+	phys_addr_t rd = virt_to_phys(realm->rd);
+
+	base_gfn = gpa_to_gfn(ipa_base);
+	end_gfn = gpa_to_gfn(ipa_end);
+
+	idx = srcu_read_lock(&kvm->srcu);
+	memslot = gfn_to_memslot(kvm, base_gfn);
+	if (!memslot) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* We require the region to be contained within a single memslot */
+	if (memslot->base_gfn + memslot->npages < end_gfn) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	tmp_page = alloc_page(GFP_KERNEL);
+	if (!tmp_page) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	mmap_read_lock(current->mm);
+
+	ipa = ipa_base;
+
+	while (ipa < ipa_end) {
+		struct vm_area_struct *vma;
+		unsigned long map_size;
+		unsigned int vma_shift;
+		unsigned long offset;
+		unsigned long hva;
+		struct page *page;
+		kvm_pfn_t pfn;
+		int level;
+
+		hva = gfn_to_hva_memslot(memslot, gpa_to_gfn(ipa));
+		vma = vma_lookup(current->mm, hva);
+		if (!vma) {
+			ret = -EFAULT;
+			break;
+		}
+
+		if (is_vm_hugetlb_page(vma))
+			vma_shift = huge_page_shift(hstate_vma(vma));
+		else
+			vma_shift = PAGE_SHIFT;
+
+		map_size = 1 << vma_shift;
+
+		/*
+		 * FIXME: This causes over mapping, but there's no good
+		 * solution here with the ABI as it stands
+		 */
+		ipa = ALIGN_DOWN(ipa, map_size);
+
+		switch (map_size) {
+		case RME_L2_BLOCK_SIZE:
+			level = 2;
+			break;
+		case PAGE_SIZE:
+			level = 3;
+			break;
+		default:
+			WARN_ONCE(1, "Unsupported vma_shift %d", vma_shift);
+			ret = -EFAULT;
+			break;
+		}
+
+		pfn = gfn_to_pfn_memslot(memslot, gpa_to_gfn(ipa));
+
+		if (is_error_pfn(pfn)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = rmi_rtt_init_ripas(rd, ipa, level);
+		if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
+			ret = realm_create_rtt_levels(realm, ipa,
+						      RMI_RETURN_INDEX(ret),
+						      level, NULL);
+			if (ret)
+				break;
+			ret = rmi_rtt_init_ripas(rd, ipa, level);
+			if (ret) {
+				ret = -ENXIO;
+				break;
+			}
+		}
+
+		if (level < RME_RTT_MAX_LEVEL) {
+			/*
+			 * A temporary RTT is needed during the map, precreate
+			 * it, however if there is an error (e.g. missing
+			 * parent tables) this will be handled in the
+			 * realm_create_protected_data_page() call.
+			 */
+			realm_create_rtt_levels(realm, ipa, level,
+						RME_RTT_MAX_LEVEL, NULL);
+		}
+
+		page = pfn_to_page(pfn);
+
+		for (offset = 0; offset < map_size && !ret;
+		     offset += PAGE_SIZE, page++) {
+			phys_addr_t page_ipa = ipa + offset;
+
+			ret = realm_create_protected_data_page(realm, page_ipa,
+							       page, tmp_page);
+		}
+		if (ret)
+			goto err_release_pfn;
+
+		if (level == 2) {
+			ret = fold_rtt(rd, ipa, level, realm);
+			if (ret)
+				goto err_release_pfn;
+		}
+
+		ipa += map_size;
+		kvm_set_pfn_accessed(pfn);
+		kvm_set_pfn_dirty(pfn);
+		kvm_release_pfn_dirty(pfn);
+err_release_pfn:
+		if (ret) {
+			kvm_release_pfn_clean(pfn);
+			break;
+		}
+	}
+
+	mmap_read_unlock(current->mm);
+	__free_page(tmp_page);
+
+out:
+	srcu_read_unlock(&kvm->srcu, idx);
+	return ret;
+}
+
+static int kvm_populate_realm(struct kvm *kvm,
+			      struct kvm_cap_arm_rme_populate_realm_args *args)
+{
+	phys_addr_t ipa_base, ipa_end;
+
+	if (kvm_realm_state(kvm) != REALM_STATE_NEW)
+		return -EBUSY;
+
+	if (!IS_ALIGNED(args->populate_ipa_base, PAGE_SIZE) ||
+	    !IS_ALIGNED(args->populate_ipa_size, PAGE_SIZE))
+		return -EINVAL;
+
+	ipa_base = args->populate_ipa_base;
+	ipa_end = ipa_base + args->populate_ipa_size;
+
+	if (ipa_end < ipa_base)
+		return -EINVAL;
+
+	return populate_par_region(kvm, ipa_base, ipa_end);
+}
+
 static int set_ipa_state(struct kvm_vcpu *vcpu,
 			 unsigned long ipa,
 			 unsigned long end,
@@ -748,6 +1102,18 @@ int kvm_realm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 		r = kvm_init_ipa_range_realm(kvm, &args);
 		break;
 	}
+	case KVM_CAP_ARM_RME_POPULATE_REALM: {
+		struct kvm_cap_arm_rme_populate_realm_args args;
+		void __user *argp = u64_to_user_ptr(cap->args[1]);
+
+		if (copy_from_user(&args, argp, sizeof(args))) {
+			r = -EFAULT;
+			break;
+		}
+
+		r = kvm_populate_realm(kvm, &args);
+		break;
+	}
 	default:
 		r = -EINVAL;
 		break;
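A note on the other cleanup agreed above: kvm_release_pfn_dirty() already marks
the pfn both accessed and dirty, so the success path at the end of the populate
loop can drop the two explicit calls. A sketch of the revised loop tail:

		ipa += map_size;
		kvm_release_pfn_dirty(pfn);
err_release_pfn:
		if (ret) {
			kvm_release_pfn_clean(pfn);
			break;
		}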