[v3,1/9] KVM: s390: Extend MEM_OP ioctl by storage key checked cmpxchg
Commit Message
User space can use the MEM_OP ioctl to make storage key checked reads
and writes to the guest; however, it has no way of performing atomic,
key checked accesses to the guest.
Extend the MEM_OP ioctl in order to allow for this, by adding a cmpxchg
mode. For now, support this mode for absolute accesses only.
This mode can be used, for example, to set the device-state-change
indicator and the adapter-local-summary indicator atomically.
Signed-off-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
---
include/uapi/linux/kvm.h | 5 ++
arch/s390/kvm/gaccess.h | 3 ++
arch/s390/kvm/gaccess.c | 101 +++++++++++++++++++++++++++++++++++++++
arch/s390/kvm/kvm-s390.c | 35 +++++++++++++-
4 files changed, 142 insertions(+), 2 deletions(-)
Comments
On Thu, Nov 17, 2022 at 11:17:50PM +0100, Janis Schoetterl-Glausch wrote:
> User space can use the MEM_OP ioctl to make storage key checked reads
> and writes to the guest, however, it has no way of performing atomic,
> key checked, accesses to the guest.
> Extend the MEM_OP ioctl in order to allow for this, by adding a cmpxchg
> mode. For now, support this mode for absolute accesses only.
>
> This mode can be use, for example, to set the device-state-change
> indicator and the adapter-local-summary indicator atomically.
>
> Signed-off-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
> ---
> include/uapi/linux/kvm.h | 5 ++
> arch/s390/kvm/gaccess.h | 3 ++
> arch/s390/kvm/gaccess.c | 101 +++++++++++++++++++++++++++++++++++++++
> arch/s390/kvm/kvm-s390.c | 35 +++++++++++++-
> 4 files changed, 142 insertions(+), 2 deletions(-)
>
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 0d5d4419139a..1f36be5493e6 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -588,6 +588,8 @@ struct kvm_s390_mem_op {
> struct {
> __u8 ar; /* the access register number */
> __u8 key; /* access key, ignored if flag unset */
> + __u8 pad1[6]; /* ignored */
> + __u64 old_p; /* ignored if flag unset */
Just one comment: the suffix "_p" for pointer is quite unusual within
the kernel. This also would be the first of its kind within kvm.h.
Usually there is either no suffix or "_addr".
So for consistency reasons I would suggest to change this to one of
the common variants.
The code itself looks good from my point of view, even though for the
sake of simplicity I would have put the complete sign/zero extended
128 bit old value into the structure, instead of having a pointer to
the value. Imho that would simplify the interface. Also alignment, as
pointed out previously, really doesn't matter for this use case.
But you had already something like that previously and changed it, so
no reason to go back and forth. Not really important.
On 18/11/2022 11.12, Heiko Carstens wrote:
> On Thu, Nov 17, 2022 at 11:17:50PM +0100, Janis Schoetterl-Glausch wrote:
>> User space can use the MEM_OP ioctl to make storage key checked reads
>> and writes to the guest, however, it has no way of performing atomic,
>> key checked, accesses to the guest.
>> Extend the MEM_OP ioctl in order to allow for this, by adding a cmpxchg
>> mode. For now, support this mode for absolute accesses only.
>>
>> This mode can be use, for example, to set the device-state-change
>> indicator and the adapter-local-summary indicator atomically.
>>
>> Signed-off-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
>> ---
>> include/uapi/linux/kvm.h | 5 ++
>> arch/s390/kvm/gaccess.h | 3 ++
>> arch/s390/kvm/gaccess.c | 101 +++++++++++++++++++++++++++++++++++++++
>> arch/s390/kvm/kvm-s390.c | 35 +++++++++++++-
>> 4 files changed, 142 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index 0d5d4419139a..1f36be5493e6 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -588,6 +588,8 @@ struct kvm_s390_mem_op {
>> struct {
>> __u8 ar; /* the access register number */
>> __u8 key; /* access key, ignored if flag unset */
>> + __u8 pad1[6]; /* ignored */
>> + __u64 old_p; /* ignored if flag unset */
>
> Just one comment: the suffix "_p" for pointer is quite unusual within
> the kernel. This also would be the first of its kind within kvm.h.
> Usually there is either no suffix or "_addr".
> So for consistency reasons I would suggest to change this to one of
> the common variants.
>
> The code itself looks good from my point of view, even though for the
> sake of simplicity I would have put the complete sign/zero extended
> 128 bit old value into the structure, instead of having a pointer to
> the value.
See
https://lore.kernel.org/kvm/37197cfe-d109-332f-089b-266d7e8e23f8@redhat.com/
... it would break the "IOW" definition of the ioctl. It can be done, but
that confuses tools like valgrind, as far as I know. So I think the idea
with the pointer is better in this case.
Thomas
On Fri, Nov 18, 2022 at 03:37:26PM +0100, Thomas Huth wrote:
> > > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > > index 0d5d4419139a..1f36be5493e6 100644
> > > --- a/include/uapi/linux/kvm.h
> > > +++ b/include/uapi/linux/kvm.h
> > > @@ -588,6 +588,8 @@ struct kvm_s390_mem_op {
> > > struct {
> > > __u8 ar; /* the access register number */
> > > __u8 key; /* access key, ignored if flag unset */
> > > + __u8 pad1[6]; /* ignored */
> > > + __u64 old_p; /* ignored if flag unset */
> >
> > Just one comment: the suffix "_p" for pointer is quite unusual within
> > the kernel. This also would be the first of its kind within kvm.h.
> > Usually there is either no suffix or "_addr".
> > So for consistency reasons I would suggest to change this to one of
> > the common variants.
> >
> > The code itself looks good from my point of view, even though for the
> > sake of simplicity I would have put the complete sign/zero extended
> > 128 bit old value into the structure, instead of having a pointer to
> > the value.
>
> See
> https://lore.kernel.org/kvm/37197cfe-d109-332f-089b-266d7e8e23f8@redhat.com/
> ... it would break the "IOW" definition of the ioctl. It can be done, but
> that confuses tools like valgrind, as far as I know. So I think the idea
> with the pointer is better in this case.
Ah right, I forgot about that. Then let's do it this way.
On Fri, 2022-11-18 at 11:12 +0100, Heiko Carstens wrote:
> On Thu, Nov 17, 2022 at 11:17:50PM +0100, Janis Schoetterl-Glausch wrote:
> > User space can use the MEM_OP ioctl to make storage key checked reads
> > and writes to the guest, however, it has no way of performing atomic,
> > key checked, accesses to the guest.
> > Extend the MEM_OP ioctl in order to allow for this, by adding a cmpxchg
> > mode. For now, support this mode for absolute accesses only.
> >
> > This mode can be use, for example, to set the device-state-change
> > indicator and the adapter-local-summary indicator atomically.
> >
> > Signed-off-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
> > ---
> > include/uapi/linux/kvm.h | 5 ++
> > arch/s390/kvm/gaccess.h | 3 ++
> > arch/s390/kvm/gaccess.c | 101 +++++++++++++++++++++++++++++++++++++++
> > arch/s390/kvm/kvm-s390.c | 35 +++++++++++++-
> > 4 files changed, 142 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index 0d5d4419139a..1f36be5493e6 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -588,6 +588,8 @@ struct kvm_s390_mem_op {
> > struct {
> > __u8 ar; /* the access register number */
> > __u8 key; /* access key, ignored if flag unset */
> > + __u8 pad1[6]; /* ignored */
> > + __u64 old_p; /* ignored if flag unset */
>
> Just one comment: the suffix "_p" for pointer is quite unusual within
> the kernel. This also would be the first of its kind within kvm.h.
> Usually there is either no suffix or "_addr".
> So for consistency reasons I would suggest to change this to one of
> the common variants.
>
Thanks, good point.
[...]
On Thu, 17 Nov 2022 23:17:50 +0100
Janis Schoetterl-Glausch <scgl@linux.ibm.com> wrote:
> User space can use the MEM_OP ioctl to make storage key checked reads
> and writes to the guest, however, it has no way of performing atomic,
> key checked, accesses to the guest.
> Extend the MEM_OP ioctl in order to allow for this, by adding a cmpxchg
> mode. For now, support this mode for absolute accesses only.
>
> This mode can be use, for example, to set the device-state-change
> indicator and the adapter-local-summary indicator atomically.
>
> Signed-off-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
> ---
> include/uapi/linux/kvm.h | 5 ++
> arch/s390/kvm/gaccess.h | 3 ++
> arch/s390/kvm/gaccess.c | 101 +++++++++++++++++++++++++++++++++++++++
> arch/s390/kvm/kvm-s390.c | 35 +++++++++++++-
> 4 files changed, 142 insertions(+), 2 deletions(-)
>
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 0d5d4419139a..1f36be5493e6 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -588,6 +588,8 @@ struct kvm_s390_mem_op {
> struct {
> __u8 ar; /* the access register number */
> __u8 key; /* access key, ignored if flag unset */
> + __u8 pad1[6]; /* ignored */
> + __u64 old_p; /* ignored if flag unset */
> };
> __u32 sida_offset; /* offset into the sida */
> __u8 reserved[32]; /* ignored */
> @@ -604,6 +606,9 @@ struct kvm_s390_mem_op {
> #define KVM_S390_MEMOP_F_CHECK_ONLY (1ULL << 0)
> #define KVM_S390_MEMOP_F_INJECT_EXCEPTION (1ULL << 1)
> #define KVM_S390_MEMOP_F_SKEY_PROTECTION (1ULL << 2)
> +#define KVM_S390_MEMOP_F_CMPXCHG (1ULL << 3)
> +/* Non program exception return codes (pgm codes are 16 bit) */
> +#define KVM_S390_MEMOP_R_NO_XCHG ((1 << 16) + 0)
are you planning to have further *_R_* macros in the near future?
if not, remove the + 0
if yes, move the (1 << 16) to a macro, so it becomes something like
(KVM_S390_MEMOP_R_BASE + 0)
(maybe you can find a better/shorter name)
>
> /* for KVM_INTERRUPT */
> struct kvm_interrupt {
> diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
> index 9408d6cc8e2c..92a3b9fb31ec 100644
> --- a/arch/s390/kvm/gaccess.h
> +++ b/arch/s390/kvm/gaccess.h
> @@ -206,6 +206,9 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
> int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
> void *data, unsigned long len, enum gacc_mode mode);
>
> +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len,
> + __uint128_t *old, __uint128_t new, u8 access_key);
> +
> /**
> * write_guest_with_key - copy data from kernel space to guest space
> * @vcpu: virtual cpu
> diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
> index 0243b6e38d36..be042865d8a1 100644
> --- a/arch/s390/kvm/gaccess.c
> +++ b/arch/s390/kvm/gaccess.c
> @@ -1161,6 +1161,107 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
> return rc;
> }
>
> +/**
> + * cmpxchg_guest_abs_with_key() - Perform cmpxchg on guest absolute address.
> + * @kvm: Virtual machine instance.
> + * @gpa: Absolute guest address of the location to be changed.
> + * @len: Operand length of the cmpxchg, required: 1 <= len <= 16. Providing a
> + * non power of two will result in failure.
> + * @old_p: Pointer to old value. If the location at @gpa contains this value, the
> + * exchange will succeed. After calling cmpxchg_guest_abs_with_key() *@old
> + * contains the value at @gpa before the attempt to exchange the value.
> + * @new: The value to place at @gpa.
> + * @access_key: The access key to use for the guest access.
> + *
> + * Atomically exchange the value at @gpa by @new, if it contains *@old.
> + * Honors storage keys.
> + *
> + * Return: * 0: successful exchange
> + * * 1: exchange unsuccessful
> + * * a program interruption code indicating the reason cmpxchg could
> + * not be attempted
> + * * -EINVAL: address misaligned or len not power of two
> + * * -EAGAIN: transient failure (len 1 or 2)
please also document -EOPNOTSUPP
> + */
> +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len,
> + __uint128_t *old_p, __uint128_t new,
> + u8 access_key)
> +{
> + gfn_t gfn = gpa >> PAGE_SHIFT;
> + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
exchange the above two lines (reverse christmas tree)
> + bool writable;
> + hva_t hva;
> + int ret;
> +
> + if (!IS_ALIGNED(gpa, len))
> + return -EINVAL;
> +
> + hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
> + if (kvm_is_error_hva(hva))
> + return PGM_ADDRESSING;
> + /*
> + * Check if it's a read-only memslot, even though that cannot occur
> + * since those are unsupported.
> + * Don't try to actually handle that case.
> + */
> + if (!writable)
> + return -EOPNOTSUPP;
either you document this, or you return something else (like -EINVAL)
> +
> + hva += offset_in_page(gpa);
> + switch (len) {
> + case 1: {
> + u8 old;
> +
> + ret = cmpxchg_user_key((u8 *)hva, &old, *old_p, new, access_key);
> + ret = ret < 0 ? ret : old != *old_p;
> + *old_p = old;
> + break;
> + }
> + case 2: {
> + u16 old;
> +
> + ret = cmpxchg_user_key((u16 *)hva, &old, *old_p, new, access_key);
> + ret = ret < 0 ? ret : old != *old_p;
> + *old_p = old;
> + break;
> + }
> + case 4: {
> + u32 old;
> +
> + ret = cmpxchg_user_key((u32 *)hva, &old, *old_p, new, access_key);
> + ret = ret < 0 ? ret : old != *old_p;
> + *old_p = old;
> + break;
> + }
> + case 8: {
> + u64 old;
> +
> + ret = cmpxchg_user_key((u64 *)hva, &old, *old_p, new, access_key);
> + ret = ret < 0 ? ret : old != *old_p;
> + *old_p = old;
> + break;
> + }
> + case 16: {
> + __uint128_t old;
> +
> + ret = cmpxchg_user_key((__uint128_t *)hva, &old, *old_p, new, access_key);
> + ret = ret < 0 ? ret : old != *old_p;
> + *old_p = old;
> + break;
I really dislike repeating the same code 5 times, but I guess there was
no other way?
> + }
> + default:
> + return -EINVAL;
> + }
> + mark_page_dirty_in_slot(kvm, slot, gfn);
> + /*
> + * Assume that the fault is caused by protection, either key protection
> + * or user page write protection.
> + */
> + if (ret == -EFAULT)
> + ret = PGM_PROTECTION;
> + return ret;
> +}
> +
> /**
> * guest_translate_address_with_key - translate guest logical into guest absolute address
> * @vcpu: virtual cpu
> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
> index 45d4b8182b07..2410b4044283 100644
> --- a/arch/s390/kvm/kvm-s390.c
> +++ b/arch/s390/kvm/kvm-s390.c
> @@ -576,7 +576,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> case KVM_CAP_S390_VCPU_RESETS:
> case KVM_CAP_SET_GUEST_DEBUG:
> case KVM_CAP_S390_DIAG318:
> - case KVM_CAP_S390_MEM_OP_EXTENSION:
> r = 1;
> break;
> case KVM_CAP_SET_GUEST_DEBUG2:
> @@ -590,6 +589,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> case KVM_CAP_S390_MEM_OP:
> r = MEM_OP_MAX_SIZE;
> break;
> + case KVM_CAP_S390_MEM_OP_EXTENSION:
> + /*
> + * Flag bits indicating which extensions are supported.
> + * The first extension doesn't use a flag, but pretend it does,
> + * this way that can be changed in the future.
> + */
> + r = 0x3;
> + break;
> case KVM_CAP_NR_VCPUS:
> case KVM_CAP_MAX_VCPUS:
> case KVM_CAP_MAX_VCPU_ID:
> @@ -2714,12 +2721,19 @@ static bool access_key_invalid(u8 access_key)
> static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop)
> {
> void __user *uaddr = (void __user *)mop->buf;
> + void __user *old_p = (void __user *)mop->old_p;
> + union {
> + __uint128_t quad;
> + char raw[sizeof(__uint128_t)];
> + } old = { .quad = 0}, new = { .quad = 0 };
> + unsigned int off_in_quad = sizeof(__uint128_t) - mop->size;
> u64 supported_flags;
> void *tmpbuf = NULL;
> int r, srcu_idx;
>
> supported_flags = KVM_S390_MEMOP_F_SKEY_PROTECTION
> - | KVM_S390_MEMOP_F_CHECK_ONLY;
> + | KVM_S390_MEMOP_F_CHECK_ONLY
> + | KVM_S390_MEMOP_F_CMPXCHG;
> if (mop->flags & ~supported_flags || !mop->size)
> return -EINVAL;
> if (mop->size > MEM_OP_MAX_SIZE)
> @@ -2741,6 +2755,15 @@ static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop)
> } else {
> mop->key = 0;
> }
> + if (mop->flags & KVM_S390_MEMOP_F_CMPXCHG) {
add a quick comment here to explain that this check validates
off_in_quad, and that it does not do a full validation of mop->size,
which will happen in cmpxchg_guest_abs_with_key.
> + if (mop->size > sizeof(new))
> + return -EINVAL;
> + /* off_in_quad has been validated */
> + if (copy_from_user(&new.raw[off_in_quad], uaddr, mop->size))
> + return -EFAULT;
> + if (copy_from_user(&old.raw[off_in_quad], old_p, mop->size))
> + return -EFAULT;
> + }
> if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) {
> tmpbuf = vmalloc(mop->size);
> if (!tmpbuf)
> @@ -2771,6 +2794,14 @@ static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop)
> case KVM_S390_MEMOP_ABSOLUTE_WRITE: {
> if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
> r = check_gpa_range(kvm, mop->gaddr, mop->size, GACC_STORE, mop->key);
> + } else if (mop->flags & KVM_S390_MEMOP_F_CMPXCHG) {
> + r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size,
> + &old.quad, new.quad, mop->key);
> + if (r == 1) {
> + r = KVM_S390_MEMOP_R_NO_XCHG;
I wonder if you could simplify things by returning
KVM_S390_MEMOP_R_NO_XCHG directly instead of 1
> + if (copy_to_user(old_p, &old.raw[off_in_quad], mop->size))
> + r = -EFAULT;
> + }
> } else {
> if (copy_from_user(tmpbuf, uaddr, mop->size)) {
> r = -EFAULT;
On Thu, 2022-12-01 at 17:15 +0100, Claudio Imbrenda wrote:
> On Thu, 17 Nov 2022 23:17:50 +0100
> Janis Schoetterl-Glausch <scgl@linux.ibm.com> wrote:
>
> > User space can use the MEM_OP ioctl to make storage key checked reads
> > and writes to the guest, however, it has no way of performing atomic,
> > key checked, accesses to the guest.
> > Extend the MEM_OP ioctl in order to allow for this, by adding a cmpxchg
> > mode. For now, support this mode for absolute accesses only.
> >
> > This mode can be use, for example, to set the device-state-change
> > indicator and the adapter-local-summary indicator atomically.
> >
> > Signed-off-by: Janis Schoetterl-Glausch <scgl@linux.ibm.com>
> > ---
> > include/uapi/linux/kvm.h | 5 ++
> > arch/s390/kvm/gaccess.h | 3 ++
> > arch/s390/kvm/gaccess.c | 101 +++++++++++++++++++++++++++++++++++++++
> > arch/s390/kvm/kvm-s390.c | 35 +++++++++++++-
> > 4 files changed, 142 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index 0d5d4419139a..1f36be5493e6 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -588,6 +588,8 @@ struct kvm_s390_mem_op {
> > struct {
> > __u8 ar; /* the access register number */
> > __u8 key; /* access key, ignored if flag unset */
> > + __u8 pad1[6]; /* ignored */
> > + __u64 old_p; /* ignored if flag unset */
> > };
> > __u32 sida_offset; /* offset into the sida */
> > __u8 reserved[32]; /* ignored */
> > @@ -604,6 +606,9 @@ struct kvm_s390_mem_op {
> > #define KVM_S390_MEMOP_F_CHECK_ONLY (1ULL << 0)
> > #define KVM_S390_MEMOP_F_INJECT_EXCEPTION (1ULL << 1)
> > #define KVM_S390_MEMOP_F_SKEY_PROTECTION (1ULL << 2)
> > +#define KVM_S390_MEMOP_F_CMPXCHG (1ULL << 3)
> > +/* Non program exception return codes (pgm codes are 16 bit) */
> > +#define KVM_S390_MEMOP_R_NO_XCHG ((1 << 16) + 0)
>
> are you planning to have further *_R_* macros in the near future?
> if not, remove the + 0
No, we can indeed just add it back if there ever are additional ones.
> if yes, move the (1 << 16) to a macro, so it becomes something like
> (KVM_S390_MEMOP_R_BASE + 0)
>
> (maybe you can find a better/shorter name)
>
> >
> > /* for KVM_INTERRUPT */
> > struct kvm_interrupt {
> > diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
> > index 9408d6cc8e2c..92a3b9fb31ec 100644
> > --- a/arch/s390/kvm/gaccess.h
> > +++ b/arch/s390/kvm/gaccess.h
> > @@ -206,6 +206,9 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
> > int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
> > void *data, unsigned long len, enum gacc_mode mode);
> >
> > +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len,
> > + __uint128_t *old, __uint128_t new, u8 access_key);
> > +
> > /**
> > * write_guest_with_key - copy data from kernel space to guest space
> > * @vcpu: virtual cpu
> > diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
> > index 0243b6e38d36..be042865d8a1 100644
> > --- a/arch/s390/kvm/gaccess.c
> > +++ b/arch/s390/kvm/gaccess.c
> > @@ -1161,6 +1161,107 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
> > return rc;
> > }
> >
> > +/**
> > + * cmpxchg_guest_abs_with_key() - Perform cmpxchg on guest absolute address.
> > + * @kvm: Virtual machine instance.
> > + * @gpa: Absolute guest address of the location to be changed.
> > + * @len: Operand length of the cmpxchg, required: 1 <= len <= 16. Providing a
> > + * non power of two will result in failure.
> > + * @old_p: Pointer to old value. If the location at @gpa contains this value, the
> > + * exchange will succeed. After calling cmpxchg_guest_abs_with_key() *@old
> > + * contains the value at @gpa before the attempt to exchange the value.
> > + * @new: The value to place at @gpa.
> > + * @access_key: The access key to use for the guest access.
> > + *
> > + * Atomically exchange the value at @gpa by @new, if it contains *@old.
> > + * Honors storage keys.
> > + *
> > + * Return: * 0: successful exchange
> > + * * 1: exchange unsuccessful
> > + * * a program interruption code indicating the reason cmpxchg could
> > + * not be attempted
> > + * * -EINVAL: address misaligned or len not power of two
> > + * * -EAGAIN: transient failure (len 1 or 2)
>
> please also document -EOPNOTSUPP
I'd add "* -EOPNOTSUPP: should never occur", then, that ok with you?
>
> > + */
> > +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len,
> > + __uint128_t *old_p, __uint128_t new,
> > + u8 access_key)
> > +{
> > + gfn_t gfn = gpa >> PAGE_SHIFT;
> > + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
>
> exchange the above two lines (reverse christmas tree)
Is this a hard requirement? Since there is a dependency.
If I do the initialization further down, the order wouldn't actually change.
>
> > + bool writable;
> > + hva_t hva;
> > + int ret;
> > +
> > + if (!IS_ALIGNED(gpa, len))
> > + return -EINVAL;
> > +
> > + hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
> > + if (kvm_is_error_hva(hva))
> > + return PGM_ADDRESSING;
> > + /*
> > + * Check if it's a read-only memslot, even though that cannot occur
> > + * since those are unsupported.
> > + * Don't try to actually handle that case.
> > + */
> > + if (!writable)
> > + return -EOPNOTSUPP;
>
> either you document this, or you return something else (like -EINVAL)
>
> > +
> > + hva += offset_in_page(gpa);
> > + switch (len) {
> > + case 1: {
> > + u8 old;
> > +
> > + ret = cmpxchg_user_key((u8 *)hva, &old, *old_p, new, access_key);
> > + ret = ret < 0 ? ret : old != *old_p;
> > + *old_p = old;
> > + break;
> > + }
> > + case 2: {
> > + u16 old;
> > +
> > + ret = cmpxchg_user_key((u16 *)hva, &old, *old_p, new, access_key);
> > + ret = ret < 0 ? ret : old != *old_p;
> > + *old_p = old;
> > + break;
> > + }
> > + case 4: {
> > + u32 old;
> > +
> > + ret = cmpxchg_user_key((u32 *)hva, &old, *old_p, new, access_key);
> > + ret = ret < 0 ? ret : old != *old_p;
> > + *old_p = old;
> > + break;
> > + }
> > + case 8: {
> > + u64 old;
> > +
> > + ret = cmpxchg_user_key((u64 *)hva, &old, *old_p, new, access_key);
> > + ret = ret < 0 ? ret : old != *old_p;
> > + *old_p = old;
> > + break;
> > + }
> > + case 16: {
> > + __uint128_t old;
> > +
> > + ret = cmpxchg_user_key((__uint128_t *)hva, &old, *old_p, new, access_key);
> > + ret = ret < 0 ? ret : old != *old_p;
> > + *old_p = old;
> > + break;
>
> I really dislike repeating the same code 5 times, but I guess there was
> no other way?
I could use the function called by cmpxchg_user_key directly, but Heiko won't agree to that.
A macro would work too, of course, but I'm not sure I'd prefer that.
>
> > + }
> > + default:
> > + return -EINVAL;
> > + }
> > + mark_page_dirty_in_slot(kvm, slot, gfn);
> > + /*
> > + * Assume that the fault is caused by protection, either key protection
> > + * or user page write protection.
> > + */
> > + if (ret == -EFAULT)
> > + ret = PGM_PROTECTION;
> > + return ret;
> > +}
> > +
> > /**
> > * guest_translate_address_with_key - translate guest logical into guest absolute address
> > * @vcpu: virtual cpu
> > diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
> > index 45d4b8182b07..2410b4044283 100644
> > --- a/arch/s390/kvm/kvm-s390.c
> > +++ b/arch/s390/kvm/kvm-s390.c
> > @@ -576,7 +576,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> > case KVM_CAP_S390_VCPU_RESETS:
> > case KVM_CAP_SET_GUEST_DEBUG:
> > case KVM_CAP_S390_DIAG318:
> > - case KVM_CAP_S390_MEM_OP_EXTENSION:
> > r = 1;
> > break;
> > case KVM_CAP_SET_GUEST_DEBUG2:
> > @@ -590,6 +589,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> > case KVM_CAP_S390_MEM_OP:
> > r = MEM_OP_MAX_SIZE;
> > break;
> > + case KVM_CAP_S390_MEM_OP_EXTENSION:
> > + /*
> > + * Flag bits indicating which extensions are supported.
> > + * The first extension doesn't use a flag, but pretend it does,
> > + * this way that can be changed in the future.
> > + */
> > + r = 0x3;
> > + break;
> > case KVM_CAP_NR_VCPUS:
> > case KVM_CAP_MAX_VCPUS:
> > case KVM_CAP_MAX_VCPU_ID:
> > @@ -2714,12 +2721,19 @@ static bool access_key_invalid(u8 access_key)
> > static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop)
> > {
> > void __user *uaddr = (void __user *)mop->buf;
> > + void __user *old_p = (void __user *)mop->old_p;
> > + union {
> > + __uint128_t quad;
> > + char raw[sizeof(__uint128_t)];
> > + } old = { .quad = 0}, new = { .quad = 0 };
> > + unsigned int off_in_quad = sizeof(__uint128_t) - mop->size;
> > u64 supported_flags;
> > void *tmpbuf = NULL;
> > int r, srcu_idx;
> >
> > supported_flags = KVM_S390_MEMOP_F_SKEY_PROTECTION
> > - | KVM_S390_MEMOP_F_CHECK_ONLY;
> > + | KVM_S390_MEMOP_F_CHECK_ONLY
> > + | KVM_S390_MEMOP_F_CMPXCHG;
> > if (mop->flags & ~supported_flags || !mop->size)
> > return -EINVAL;
> > if (mop->size > MEM_OP_MAX_SIZE)
> > @@ -2741,6 +2755,15 @@ static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop)
> > } else {
> > mop->key = 0;
> > }
> > + if (mop->flags & KVM_S390_MEMOP_F_CMPXCHG) {
>
> add a quick comment here to explain that this check validates
> off_in_quad, and that it does not do a full validation of mop->size,
> which will happen in cmpxchg_guest_abs_with_key.
Will do.
>
> > + if (mop->size > sizeof(new))
> > + return -EINVAL;
> > + /* off_in_quad has been validated */
> > + if (copy_from_user(&new.raw[off_in_quad], uaddr, mop->size))
> > + return -EFAULT;
> > + if (copy_from_user(&old.raw[off_in_quad], old_p, mop->size))
> > + return -EFAULT;
> > + }
> > if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) {
> > tmpbuf = vmalloc(mop->size);
> > if (!tmpbuf)
> > @@ -2771,6 +2794,14 @@ static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop)
> > case KVM_S390_MEMOP_ABSOLUTE_WRITE: {
> > if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
> > r = check_gpa_range(kvm, mop->gaddr, mop->size, GACC_STORE, mop->key);
> > + } else if (mop->flags & KVM_S390_MEMOP_F_CMPXCHG) {
> > + r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size,
> > + &old.quad, new.quad, mop->key);
> > + if (r == 1) {
> > + r = KVM_S390_MEMOP_R_NO_XCHG;
>
> I wonder if you could not simplify things by returning directly
> KVM_S390_MEMOP_R_NO_XCHG instead of 1
To me it feels like KVM_S390_MEMOP_R_NO_XCHG is api surface and should be referenced here.
cmpxchg_guest_abs_with_key isn't mem op specific
(of course that's the only thing it is currently used for).
>
> > + if (copy_to_user(old_p, &old.raw[off_in_quad], mop->size))
> > + r = -EFAULT;
> > + }
> > } else {
> > if (copy_from_user(tmpbuf, uaddr, mop->size)) {
> > r = -EFAULT;
>
On Thu, 01 Dec 2022 18:44:56 +0100
Janis Schoetterl-Glausch <scgl@linux.ibm.com> wrote:
> >
> > please also document -EOPNOTSUPP
>
> I'd add "* -EOPNOTSUPP: should never occur", then, that ok with you?
no, also explain in which conditions it is returned
something like:
* -EOPNOTSUPP: if the memslot is not writable (should never occur)
> >
> > > + */
> > > +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len,
> > > + __uint128_t *old_p, __uint128_t new,
> > > + u8 access_key)
> > > +{
> > > + gfn_t gfn = gpa >> PAGE_SHIFT;
> > > + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
> >
> > exchange the above two lines (reverse christmas tree)
>
> Is this a hard requirement? Since there is a dependency.
> If I do the initialization further down, the order wouldn't actually change.
ahhhhh right, I had missed that
keep it as it is, of course
[...]
> > I really dislike repeating the same code 5 times, but I guess there was
> > no other way?
>
> I could use the function called by cmpxchg_user_key directly, but Heiko won't agree to that.
> A macro would work too, of course, not sure if I prefer that tho.
ok so there is no other way, let's keep it as it is
[...]
> To me it feels like KVM_S390_MEMOP_R_NO_XCHG is api surface and should be referenced here.
> cmpxchg_guest_abs_with_key isn't mem op specific
> (of course that's the only thing it is currently used for).
fair enough
> >
> > > + if (copy_to_user(old_p, &old.raw[off_in_quad], mop->size))
> > > + r = -EFAULT;
> > > + }
> > > } else {
> > > if (copy_from_user(tmpbuf, uaddr, mop->size)) {
> > > r = -EFAULT;
> >
>
@@ -588,6 +588,8 @@ struct kvm_s390_mem_op {
struct {
__u8 ar; /* the access register number */
__u8 key; /* access key, ignored if flag unset */
+ __u8 pad1[6]; /* ignored */
+ __u64 old_p; /* ignored if flag unset */
};
__u32 sida_offset; /* offset into the sida */
__u8 reserved[32]; /* ignored */
@@ -604,6 +606,9 @@ struct kvm_s390_mem_op {
#define KVM_S390_MEMOP_F_CHECK_ONLY (1ULL << 0)
#define KVM_S390_MEMOP_F_INJECT_EXCEPTION (1ULL << 1)
#define KVM_S390_MEMOP_F_SKEY_PROTECTION (1ULL << 2)
+#define KVM_S390_MEMOP_F_CMPXCHG (1ULL << 3)
+/* Non program exception return codes (pgm codes are 16 bit) */
+#define KVM_S390_MEMOP_R_NO_XCHG ((1 << 16) + 0)
/* for KVM_INTERRUPT */
struct kvm_interrupt {
@@ -206,6 +206,9 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
void *data, unsigned long len, enum gacc_mode mode);
+int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len,
+ __uint128_t *old, __uint128_t new, u8 access_key);
+
/**
* write_guest_with_key - copy data from kernel space to guest space
* @vcpu: virtual cpu
@@ -1161,6 +1161,107 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
return rc;
}
+/**
+ * cmpxchg_guest_abs_with_key() - Perform cmpxchg on guest absolute address.
+ * @kvm: Virtual machine instance.
+ * @gpa: Absolute guest address of the location to be changed. Must be aligned
+ * to @len.
+ * @len: Operand length of the cmpxchg in bytes. Only 1, 2, 4, 8 and 16 are
+ * handled; any other value results in failure.
+ * @old_p: Pointer to old value. If the location at @gpa contains this value, the
+ * exchange will succeed. After calling cmpxchg_guest_abs_with_key() *@old_p
+ * contains the value at @gpa before the attempt to exchange the value.
+ * @new: The value to place at @gpa.
+ * @access_key: The access key to use for the guest access.
+ *
+ * Atomically exchange the value at @gpa by @new, if it contains *@old_p.
+ * Honors storage keys.
+ *
+ * The guest address is resolved to a host virtual address through its
+ * memslot and the exchange is performed with cmpxchg_user_key() on an operand
+ * of the width selected by @len. Whenever an exchange was attempted (i.e. @len
+ * was one of the handled sizes), the page containing @gpa is marked dirty,
+ * independent of whether the exchange succeeded.
+ *
+ * NOTE(review): *@old_p is stored to after every attempt, including when
+ * cmpxchg_user_key() reports an error; whether the stored value is meaningful
+ * in that case depends on cmpxchg_user_key() - confirm before relying on it.
+ *
+ * Return: * 0: successful exchange
+ * * 1: exchange unsuccessful (the value found at @gpa differed from *@old_p)
+ * * a program interruption code indicating the reason cmpxchg could
+ * not be attempted: PGM_ADDRESSING if @gpa does not resolve to a valid
+ * host address, PGM_PROTECTION if the access faulted (-EFAULT)
+ * * -EINVAL: address misaligned or len not one of 1, 2, 4, 8, 16
+ * * -EOPNOTSUPP: the memslot backing @gpa is not writable
+ * * -EAGAIN: transient failure (len 1 or 2)
+ */
+int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len,
+ __uint128_t *old_p, __uint128_t new,
+ u8 access_key)
+{
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+ bool writable;
+ hva_t hva;
+ int ret;
+
+ if (!IS_ALIGNED(gpa, len))
+ return -EINVAL;
+
+ hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
+ if (kvm_is_error_hva(hva))
+ return PGM_ADDRESSING;
+ /*
+ * Check if it's a read-only memslot, even though that cannot occur
+ * since those are unsupported.
+ * Don't try to actually handle that case.
+ */
+ if (!writable)
+ return -EOPNOTSUPP;
+
+ hva += offset_in_page(gpa);
+ switch (len) {
+ case 1: {
+ u8 old;
+
+ ret = cmpxchg_user_key((u8 *)hva, &old, *old_p, new, access_key);
+ ret = ret < 0 ? ret : old != *old_p;
+ *old_p = old;
+ break;
+ }
+ case 2: {
+ u16 old;
+
+ ret = cmpxchg_user_key((u16 *)hva, &old, *old_p, new, access_key);
+ ret = ret < 0 ? ret : old != *old_p;
+ *old_p = old;
+ break;
+ }
+ case 4: {
+ u32 old;
+
+ ret = cmpxchg_user_key((u32 *)hva, &old, *old_p, new, access_key);
+ ret = ret < 0 ? ret : old != *old_p;
+ *old_p = old;
+ break;
+ }
+ case 8: {
+ u64 old;
+
+ ret = cmpxchg_user_key((u64 *)hva, &old, *old_p, new, access_key);
+ ret = ret < 0 ? ret : old != *old_p;
+ *old_p = old;
+ break;
+ }
+ case 16: {
+ __uint128_t old;
+
+ ret = cmpxchg_user_key((__uint128_t *)hva, &old, *old_p, new, access_key);
+ ret = ret < 0 ? ret : old != *old_p;
+ *old_p = old;
+ break;
+ }
+ default:
+ return -EINVAL;
+ }
+ mark_page_dirty_in_slot(kvm, slot, gfn);
+ /*
+ * Assume that the fault is caused by protection, either key protection
+ * or user page write protection.
+ * Other negative error codes are passed through to the caller unchanged.
+ */
+ if (ret == -EFAULT)
+ ret = PGM_PROTECTION;
+ return ret;
+}
+
/**
* guest_translate_address_with_key - translate guest logical into guest absolute address
* @vcpu: virtual cpu
@@ -576,7 +576,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_VCPU_RESETS:
case KVM_CAP_SET_GUEST_DEBUG:
case KVM_CAP_S390_DIAG318:
- case KVM_CAP_S390_MEM_OP_EXTENSION:
r = 1;
break;
case KVM_CAP_SET_GUEST_DEBUG2:
@@ -590,6 +589,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_MEM_OP:
r = MEM_OP_MAX_SIZE;
break;
+ case KVM_CAP_S390_MEM_OP_EXTENSION:
+ /*
+ * Flag bits indicating which extensions are supported.
+ * The first extension doesn't use a flag, but pretend it does,
+ * so that this can be changed in the future.
+ */
+ r = 0x3;
+ break;
case KVM_CAP_NR_VCPUS:
case KVM_CAP_MAX_VCPUS:
case KVM_CAP_MAX_VCPU_ID:
@@ -2714,12 +2721,19 @@ static bool access_key_invalid(u8 access_key)
static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop)
{
void __user *uaddr = (void __user *)mop->buf;
+ void __user *old_p = (void __user *)mop->old_p;
+ union {
+ __uint128_t quad;
+ char raw[sizeof(__uint128_t)];
+ } old = { .quad = 0}, new = { .quad = 0 };
+ unsigned int off_in_quad = sizeof(__uint128_t) - mop->size;
u64 supported_flags;
void *tmpbuf = NULL;
int r, srcu_idx;
supported_flags = KVM_S390_MEMOP_F_SKEY_PROTECTION
- | KVM_S390_MEMOP_F_CHECK_ONLY;
+ | KVM_S390_MEMOP_F_CHECK_ONLY
+ | KVM_S390_MEMOP_F_CMPXCHG;
if (mop->flags & ~supported_flags || !mop->size)
return -EINVAL;
if (mop->size > MEM_OP_MAX_SIZE)
@@ -2741,6 +2755,15 @@ static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop)
} else {
mop->key = 0;
}
+ if (mop->flags & KVM_S390_MEMOP_F_CMPXCHG) {
+ if (mop->size > sizeof(new))
+ return -EINVAL;
+ /* off_in_quad has been validated */
+ if (copy_from_user(&new.raw[off_in_quad], uaddr, mop->size))
+ return -EFAULT;
+ if (copy_from_user(&old.raw[off_in_quad], old_p, mop->size))
+ return -EFAULT;
+ }
if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) {
tmpbuf = vmalloc(mop->size);
if (!tmpbuf)
@@ -2771,6 +2794,14 @@ static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop)
case KVM_S390_MEMOP_ABSOLUTE_WRITE: {
if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
r = check_gpa_range(kvm, mop->gaddr, mop->size, GACC_STORE, mop->key);
+ } else if (mop->flags & KVM_S390_MEMOP_F_CMPXCHG) {
+ r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size,
+ &old.quad, new.quad, mop->key);
+ if (r == 1) {
+ r = KVM_S390_MEMOP_R_NO_XCHG;
+ if (copy_to_user(old_p, &old.raw[off_in_quad], mop->size))
+ r = -EFAULT;
+ }
} else {
if (copy_from_user(tmpbuf, uaddr, mop->size)) {
r = -EFAULT;