[v3,2/6] cxl/mbox: Add GET_POISON_LIST mailbox command

Message ID 46c7c7339224744fce424b196da3e5566effec17.1668115235.git.alison.schofield@intel.com
State New
Headers
Series CXL Poison List Retrieval & Tracing |

Commit Message

Alison Schofield Nov. 11, 2022, 3:12 a.m. UTC
  From: Alison Schofield <alison.schofield@intel.com>

CXL devices maintain a list of locations that are poisoned or result
in poison if the addresses are accessed by the host.

Per the spec (CXL 3.0 8.2.9.8.4.1), the device returns this Poison
list as a set of  Media Error Records that include the source of the
error, the starting device physical address and length. The length is
the number of adjacent DPAs in the record and is in units of 64 bytes.

Retrieve the list and log each Media Error Record as a trace event of
type 'cxl_poison'.

When the poison list is requested by region, include the region name
and uuid in the trace event.

Signed-off-by: Alison Schofield <alison.schofield@intel.com>
---
 drivers/cxl/core/mbox.c | 81 +++++++++++++++++++++++++++++++++++++++++
 drivers/cxl/cxlmem.h    | 37 +++++++++++++++++++
 2 files changed, 118 insertions(+)
  

Comments

Jonathan Cameron Nov. 16, 2022, 12:41 p.m. UTC | #1
On Thu, 10 Nov 2022 19:12:40 -0800
alison.schofield@intel.com wrote:

> From: Alison Schofield <alison.schofield@intel.com>
> 
> CXL devices maintain a list of locations that are poisoned or result
> in poison if the addresses are accessed by the host.
> 
> Per the spec (CXL 3.0 8.2.9.8.4.1), the device returns this Poison
> list as a set of  Media Error Records that include the source of the
> error, the starting device physical address and length. The length is
> the number of adjacent DPAs in the record and is in units of 64 bytes.
> 
> Retrieve the list and log each Media Error Record as a trace event of
> type 'cxl_poison'.
> 
> When the poison list is requested by region, include the region name
> and uuid in the trace event.
> 
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Hi Alison,

I've forgotten most of previous discussions around versions of this series
so I may well repeat things that were covered earlier!

A few things inline.

Thanks,

Jonathan


> ---
>  drivers/cxl/core/mbox.c | 81 +++++++++++++++++++++++++++++++++++++++++
>  drivers/cxl/cxlmem.h    | 37 +++++++++++++++++++
>  2 files changed, 118 insertions(+)
> 
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 0c90f13870a4..88f034e97812 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -9,6 +9,9 @@
>  
>  #include "core.h"
>  
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/cxl.h>
> +
>  static bool cxl_raw_allow_all;
>  
>  /**
> @@ -752,6 +755,7 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
>  {
>  	/* See CXL 2.0 Table 175 Identify Memory Device Output Payload */
>  	struct cxl_mbox_identify id;
> +	__le32 val = 0;
>  	int rc;
>  
>  	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_IDENTIFY, NULL, 0, &id,
> @@ -771,6 +775,9 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
>  	cxlds->lsa_size = le32_to_cpu(id.lsa_size);
>  	memcpy(cxlds->firmware_version, id.fw_revision, sizeof(id.fw_revision));
>  
> +	memcpy(&val, id.poison_list_max_mer, 3);

This is ugly.  I've lost track of last discussion about get_unaligned_le24()
and using it on elements of a packed structure.  At very least can we
do a memcpy to a u8[3] array and then use get_unaligned_le24() on that if
we can't use it directly on the structure element?


> +	cxlds->poison_max = min_t(u32, le32_to_cpu(val), CXL_POISON_LIST_MAX);
> +
>  	return 0;
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL);
> @@ -835,6 +842,79 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds)
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, CXL);
>  
> +int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
> +		       struct cxl_region *cxlr)
> +{
> +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> +	const char *memdev_name = dev_name(&cxlmd->dev);
Could just do this where it's used rather than here.

> +	const char *pcidev_name = dev_name(cxlds->dev);
Same with this.

> +	struct cxl_mbox_poison_payload_out *po;
> +	struct cxl_mbox_poison_payload_in pi;
> +	int nr_records = 0;
> +	int rc;
> +
> +	po = kvmalloc(cxlds->payload_size, GFP_KERNEL);
> +	if (!po)
> +		return -ENOMEM;
> +
> +	pi.offset = cpu_to_le64(offset);
> +	pi.length = cpu_to_le64(len);
> +
> +	rc = mutex_lock_interruptible(&cxlds->poison_list_mutex);
> +	if (rc)
> +		goto out;
> +
> +	do {
> +		rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi,
> +				       sizeof(pi), po, cxlds->payload_size);
> +		if (rc)
> +			break;
> +
> +		if (trace_cxl_poison_enabled())
> +			cxl_trace_poison(po, cxlr, memdev_name, pcidev_name);
> +
> +		/* Protect against an uncleared _FLAG_MORE */
> +		nr_records = nr_records + le16_to_cpu(po->count);
> +		if (nr_records >= cxlds->poison_max) {
> +			dev_dbg(&cxlmd->dev, "Max Error Records reached: %d\n",
> +				nr_records);
> +			break;
> +		}
> +	} while (po->flags & CXL_POISON_FLAG_MORE);
> +
> +	mutex_unlock(&cxlds->poison_list_mutex);
> +out:
> +	kvfree(po);
> +	return rc;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison, CXL);
> +
>  struct cxl_dev_state *cxl_dev_state_create(struct device *dev)
>  {
>  	struct cxl_dev_state *cxlds;
> @@ -846,6 +926,7 @@ struct cxl_dev_state *cxl_dev_state_create(struct device *dev)
>  	}
>  
>  	mutex_init(&cxlds->mbox_mutex);
> +	mutex_init(&cxlds->poison_list_mutex);
>  	cxlds->dev = dev;
>  
>  	return cxlds;
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 669868cc1553..49d891347e39 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -192,6 +192,8 @@ struct cxl_endpoint_dvsec_info {
>   *                (CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register)
>   * @lsa_size: Size of Label Storage Area
>   *                (CXL 2.0 8.2.9.5.1.1 Identify Memory Device)
> + * @poison_max: maximum media error records held in device cache
For consistency of capitalization: Maximum 
> + * @poison_list_mutex: Mutex to synchronize poison list retrieval
>   * @mbox_mutex: Mutex to synchronize mailbox access.
>   * @firmware_version: Firmware version for the memory device.
>   * @enabled_cmds: Hardware commands found enabled in CEL.
> @@ -224,6 +226,8 @@ struct cxl_dev_state {
>  

...
  
Alison Schofield Nov. 17, 2022, 11:55 p.m. UTC | #2
On Wed, Nov 16, 2022 at 12:41:18PM +0000, Jonathan Cameron wrote:
> On Thu, 10 Nov 2022 19:12:40 -0800
> alison.schofield@intel.com wrote:
> 
> > From: Alison Schofield <alison.schofield@intel.com>
> > 
> > CXL devices maintain a list of locations that are poisoned or result
> > in poison if the addresses are accessed by the host.
> > 
> > Per the spec (CXL 3.0 8.2.9.8.4.1), the device returns this Poison
> > list as a set of  Media Error Records that include the source of the
> > error, the starting device physical address and length. The length is
> > the number of adjacent DPAs in the record and is in units of 64 bytes.
> > 
> > Retrieve the list and log each Media Error Record as a trace event of
> > type 'cxl_poison'.
> > 
> > When the poison list is requested by region, include the region name
> > and uuid in the trace event.
> > 
> > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> Hi Alison,
> 
> I've forgotten most of previous discussions around versions of this series
> so I may well repeat things that were covered earlier!
> 
> A few things inline.
> 
> Thanks,
> 
> Jonathan
Thanks Jonathan. Got 'em all.
> 
> 
> > ---
snip
> > 
> >  
> > +	memcpy(&val, id.poison_list_max_mer, 3);
> 
> This is ugly.  I've lost track of last discussion about get_unaligned_le24()
> and using it on elements of a packed structure.  At very least can we
> do a memcpy to a u8[3] array and then use get_unaligned_le24() on that if
> we can't use it directly on the structure element?
> 

Done, like this -
val = get_unaligned_le24(id.poison_list_max_mer);

> 
snip
> > +
> > +int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
> > +		       struct cxl_region *cxlr)
> > +{
> > +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> > +	const char *memdev_name = dev_name(&cxlmd->dev);
> Could just do this where it's used rather than here.
> 
> > +	const char *pcidev_name = dev_name(cxlds->dev);
> Same with this.
> 

Done.

> 
snip
> ...
> 
>
  
Dan Williams Dec. 7, 2022, 2:41 a.m. UTC | #3
alison.schofield@ wrote:
> From: Alison Schofield <alison.schofield@intel.com>
> 
> CXL devices maintain a list of locations that are poisoned or result
> in poison if the addresses are accessed by the host.
> 
> Per the spec (CXL 3.0 8.2.9.8.4.1), the device returns this Poison
> list as a set of  Media Error Records that include the source of the
> error, the starting device physical address and length. The length is
> the number of adjacent DPAs in the record and is in units of 64 bytes.
> 
> Retrieve the list and log each Media Error Record as a trace event of
> type 'cxl_poison'.
> 
> When the poison list is requested by region, include the region name
> and uuid in the trace event.
> 
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> ---
>  drivers/cxl/core/mbox.c | 81 +++++++++++++++++++++++++++++++++++++++++
>  drivers/cxl/cxlmem.h    | 37 +++++++++++++++++++
>  2 files changed, 118 insertions(+)
> 
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 0c90f13870a4..88f034e97812 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -9,6 +9,9 @@
>  
>  #include "core.h"
>  
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/cxl.h>
> +
>  static bool cxl_raw_allow_all;
>  
>  /**
> @@ -752,6 +755,7 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
>  {
>  	/* See CXL 2.0 Table 175 Identify Memory Device Output Payload */
>  	struct cxl_mbox_identify id;
> +	__le32 val = 0;
>  	int rc;
>  
>  	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_IDENTIFY, NULL, 0, &id,
> @@ -771,6 +775,9 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
>  	cxlds->lsa_size = le32_to_cpu(id.lsa_size);
>  	memcpy(cxlds->firmware_version, id.fw_revision, sizeof(id.fw_revision));
>  
> +	memcpy(&val, id.poison_list_max_mer, 3);

I see Jonathan already commented on the need for get_unaligned_le24()
here, seconded.


> +	cxlds->poison_max = min_t(u32, le32_to_cpu(val), CXL_POISON_LIST_MAX);
> +
>  	return 0;
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL);
> @@ -835,6 +842,79 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds)
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, CXL);
>  
> +static void cxl_trace_poison(struct cxl_mbox_poison_payload_out *po,
> +			     struct cxl_region *cxlr,
> +			     const char *memdev_name,
> +			     const char *pcidev_name)

Type-safety please. Pass a 'struct cxl_memdev *' and 'struct pci_dev *'.
Might need to be 'struct device *' instead of 'struct pci_dev *'
depending on if this needs to be called from cxl_test, but anything is
better than a non-specific string.

> +{
> +	const char *region_name = cxlr ? dev_name(&cxlr->dev) : NULL;

...and push this conversion into the trace point.

> +	struct cxl_region_params *p = cxlr ? &cxlr->params : NULL;
> +	uuid_t *uuid = p ? &p->uuid : NULL;
> +	u64 addr, dpa, overflow_t = 0;
> +	u8 source;
> +	u32 len;
> +
> +	if (po->flags & CXL_POISON_FLAG_OVERFLOW)
> +		overflow_t = le64_to_cpu(po->overflow_timestamp);
> +
> +	for (int i = 0; i < le16_to_cpu(po->count); i++) {
> +		len = le32_to_cpu(po->record[i].length) * CXL_POISON_LEN_MULT;
> +		addr = le64_to_cpu(po->record[i].address);
> +		source = addr & CXL_POISON_SOURCE_MASK;
> +		dpa = addr & CXL_POISON_START_MASK;
> +
> +		trace_cxl_poison(memdev_name, pcidev_name, region_name, uuid,
> +				 dpa, len, source, po->flags, overflow_t);
> +	}
> +}
> +
> +int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
> +		       struct cxl_region *cxlr)
> +{
> +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> +	const char *memdev_name = dev_name(&cxlmd->dev);
> +	const char *pcidev_name = dev_name(cxlds->dev);
> +	struct cxl_mbox_poison_payload_out *po;
> +	struct cxl_mbox_poison_payload_in pi;
> +	int nr_records = 0;
> +	int rc;
> +
> +	po = kvmalloc(cxlds->payload_size, GFP_KERNEL);
> +	if (!po)
> +		return -ENOMEM;
> +
> +	pi.offset = cpu_to_le64(offset);
> +	pi.length = cpu_to_le64(len);
> +
> +	rc = mutex_lock_interruptible(&cxlds->poison_list_mutex);

So I do not know what this mutex is protecting if there is an allocation
per cxl_mem_get_poison() invocation. Although I suspect that's somewhat
wasteful. Just allocate one buffer at the beginning of time and then use
the lock to protect that buffer.

Although, I wonder if this and Event handling should share locks and one
preallocated buffer? Otherwise I do think it is important for Events and
Poison handling to be able to make forward progress without needing to
allocate up to a megabyte of memory at runtime. The other payload_size
allocations are for one-off things that run at the beginning of time,
but Poison and Events run repeatedly.

> +	if (rc)
> +		goto out;
> +
> +	do {
> +		rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi,
> +				       sizeof(pi), po, cxlds->payload_size);
> +		if (rc)
> +			break;
> +
> +		if (trace_cxl_poison_enabled())
> +			cxl_trace_poison(po, cxlr, memdev_name, pcidev_name);
> +
> +		/* Protect against an uncleared _FLAG_MORE */
> +		nr_records = nr_records + le16_to_cpu(po->count);
> +		if (nr_records >= cxlds->poison_max) {
> +			dev_dbg(&cxlmd->dev, "Max Error Records reached: %d\n",
> +				nr_records);
> +			break;
> +		}
> +	} while (po->flags & CXL_POISON_FLAG_MORE);
> +
> +	mutex_unlock(&cxlds->poison_list_mutex);
> +out:
> +	kvfree(po);
> +	return rc;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison, CXL);
> +
>  struct cxl_dev_state *cxl_dev_state_create(struct device *dev)
>  {
>  	struct cxl_dev_state *cxlds;
> @@ -846,6 +926,7 @@ struct cxl_dev_state *cxl_dev_state_create(struct device *dev)
>  	}
>  
>  	mutex_init(&cxlds->mbox_mutex);
> +	mutex_init(&cxlds->poison_list_mutex);
>  	cxlds->dev = dev;
>  
>  	return cxlds;
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 669868cc1553..49d891347e39 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -192,6 +192,8 @@ struct cxl_endpoint_dvsec_info {
>   *                (CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register)
>   * @lsa_size: Size of Label Storage Area
>   *                (CXL 2.0 8.2.9.5.1.1 Identify Memory Device)
> + * @poison_max: maximum media error records held in device cache
> + * @poison_list_mutex: Mutex to synchronize poison list retrieval
>   * @mbox_mutex: Mutex to synchronize mailbox access.
>   * @firmware_version: Firmware version for the memory device.
>   * @enabled_cmds: Hardware commands found enabled in CEL.
> @@ -224,6 +226,8 @@ struct cxl_dev_state {
>  
>  	size_t payload_size;
>  	size_t lsa_size;
> +	u32 poison_max;
> +	struct mutex poison_list_mutex;  /* Protect reads of poison list */
>  	struct mutex mbox_mutex; /* Protects device mailbox and firmware */
>  	char firmware_version[0x10];
>  	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
> @@ -349,6 +353,37 @@ struct cxl_mbox_set_partition_info {
>  
>  /* Get Poison List  CXL 3.0 Spec 8.2.9.8.4.1 */
>  
> +struct cxl_mbox_poison_payload_in {
> +	__le64 offset;
> +	__le64 length;
> +} __packed;
> +
> +struct cxl_mbox_poison_payload_out {
> +	u8 flags;
> +	u8 rsvd1;
> +	__le64 overflow_timestamp;
> +	__le16 count;
> +	u8 rsvd2[0x14];

Let's use decimal values for size.

> +	struct cxl_poison_record {
> +		__le64 address;
> +		__le32 length;
> +		__le32 rsvd;
> +	} __packed record[];
> +} __packed;
> +
> +/*
> + * Get Poison List address field encodes the starting
> + * address of poison, and the source of the poison.
> + */
> +#define CXL_POISON_START_MASK		GENMASK_ULL(63, 6)
> +#define CXL_POISON_SOURCE_MASK		GENMASK(2, 0)
> +
> +/* Get Poison List record length is in units of 64 bytes */
> +#define CXL_POISON_LEN_MULT	64
> +
> +/* Kernel defined maximum for a list of poison errors */
> +#define CXL_POISON_LIST_MAX	1024
> +
>  /* Get Poison List: Payload out flags */
>  #define CXL_POISON_FLAG_MORE            BIT(0)
>  #define CXL_POISON_FLAG_OVERFLOW        BIT(1)
> @@ -395,6 +430,8 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds);
>  struct cxl_dev_state *cxl_dev_state_create(struct device *dev);
>  void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
>  void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
> +int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
> +		       struct cxl_region *cxlr);
>  #ifdef CONFIG_CXL_SUSPEND
>  void cxl_mem_active_inc(void);
>  void cxl_mem_active_dec(void);
> -- 
> 2.37.3
>
  
Alison Schofield Dec. 7, 2022, 4:10 p.m. UTC | #4
On Tue, Dec 06, 2022 at 06:41:34PM -0800, Dan Williams wrote:
> alison.schofield@ wrote:
> > From: Alison Schofield <alison.schofield@intel.com>
> > 
> > CXL devices maintain a list of locations that are poisoned or result
> > in poison if the addresses are accessed by the host.
> > 
> > Per the spec (CXL 3.0 8.2.9.8.4.1), the device returns this Poison
> > list as a set of  Media Error Records that include the source of the
> > error, the starting device physical address and length. The length is
> > the number of adjacent DPAs in the record and is in units of 64 bytes.
> > 
> > Retrieve the list and log each Media Error Record as a trace event of
> > type 'cxl_poison'.
> > 
> > When the poison list is requested by region, include the region name
> > and uuid in the trace event.
> > 
> > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > ---
> >  drivers/cxl/core/mbox.c | 81 +++++++++++++++++++++++++++++++++++++++++
> >  drivers/cxl/cxlmem.h    | 37 +++++++++++++++++++
> >  2 files changed, 118 insertions(+)
> > 
> > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> > index 0c90f13870a4..88f034e97812 100644
> > --- a/drivers/cxl/core/mbox.c
> > +++ b/drivers/cxl/core/mbox.c
> > @@ -9,6 +9,9 @@
> >  
> >  #include "core.h"
> >  
> > +#define CREATE_TRACE_POINTS
> > +#include <trace/events/cxl.h>
> > +
> >  static bool cxl_raw_allow_all;
> >  
> >  /**
> > @@ -752,6 +755,7 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
> >  {
> >  	/* See CXL 2.0 Table 175 Identify Memory Device Output Payload */
> >  	struct cxl_mbox_identify id;
> > +	__le32 val = 0;
> >  	int rc;
> >  
> >  	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_IDENTIFY, NULL, 0, &id,
> > @@ -771,6 +775,9 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
> >  	cxlds->lsa_size = le32_to_cpu(id.lsa_size);
> >  	memcpy(cxlds->firmware_version, id.fw_revision, sizeof(id.fw_revision));
> >  
> > +	memcpy(&val, id.poison_list_max_mer, 3);
> 
> I see Jonathan already commented on the need for get_unaligned_le24()
> here, seconded.
> 
Got it.

> 
> > +	cxlds->poison_max = min_t(u32, le32_to_cpu(val), CXL_POISON_LIST_MAX);
> > +
> >  	return 0;
> >  }
> >  EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL);
> > @@ -835,6 +842,79 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds)
> >  }
> >  EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, CXL);
> >  
> > +static void cxl_trace_poison(struct cxl_mbox_poison_payload_out *po,
> > +			     struct cxl_region *cxlr,
> > +			     const char *memdev_name,
> > +			     const char *pcidev_name)
> 
> Type-safety please. Pass a 'struct cxl_memdev *' and 'struct pci_dev *'.
> Might need to be 'struct device *' instead of 'struct pci_dev *'
> depending on if this needs to be called from cxl_test, but anything is
> better than a non-specific string.
> 
Got it.

> > +{
> > +	const char *region_name = cxlr ? dev_name(&cxlr->dev) : NULL;
> 
> ...and push this conversion into the trace point.
> 
Got that, and also pushed the decode of struct cxl_poison_record to
the trace point like this:

TP_PROTO(const struct device *memdev, const struct pci_dev *pcidev,
                     const struct cxl_region *region,
                     const struct cxl_poison_record *record,
                     u8 flags, __le64 overflow_t),


> > +	struct cxl_region_params *p = cxlr ? &cxlr->params : NULL;
> > +	uuid_t *uuid = p ? &p->uuid : NULL;
> > +	u64 addr, dpa, overflow_t = 0;
> > +	u8 source;
> > +	u32 len;
> > +
> > +	if (po->flags & CXL_POISON_FLAG_OVERFLOW)
> > +		overflow_t = le64_to_cpu(po->overflow_timestamp);
> > +
> > +	for (int i = 0; i < le16_to_cpu(po->count); i++) {
> > +		len = le32_to_cpu(po->record[i].length) * CXL_POISON_LEN_MULT;
> > +		addr = le64_to_cpu(po->record[i].address);
> > +		source = addr & CXL_POISON_SOURCE_MASK;
> > +		dpa = addr & CXL_POISON_START_MASK;
> > +
> > +		trace_cxl_poison(memdev_name, pcidev_name, region_name, uuid,
> > +				 dpa, len, source, po->flags, overflow_t);
> > +	}
> > +}
> > +
> > +int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
> > +		       struct cxl_region *cxlr)
> > +{
> > +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> > +	const char *memdev_name = dev_name(&cxlmd->dev);
> > +	const char *pcidev_name = dev_name(cxlds->dev);
> > +	struct cxl_mbox_poison_payload_out *po;
> > +	struct cxl_mbox_poison_payload_in pi;
> > +	int nr_records = 0;
> > +	int rc;
> > +
> > +	po = kvmalloc(cxlds->payload_size, GFP_KERNEL);
> > +	if (!po)
> > +		return -ENOMEM;
> > +
> > +	pi.offset = cpu_to_le64(offset);
> > +	pi.length = cpu_to_le64(len);
> > +
> > +	rc = mutex_lock_interruptible(&cxlds->poison_list_mutex);
> 
> So I do not know what this mutex is protecting if there is an allocation
> per cxl_mem_get_poison() invocation. Although I suspect that's somewhat
> wasteful. Just allocate one buffer at the beginning of time and then use
> the lock to protect that buffer.

Intent was a single lock on the device to protect the state of the
poison list retrieval - do not allow > 1 reader. With > 1 reader
software may not know if it retrieved the complete list.

I'm not understanding about protecting a buffer, instead of protecting
the state. Here I try to protect the state.

> 
> Although, I wonder if this and Event handling should share locks and one
> preallocated buffer? Otherwise I do think it is important for Events and
> Poison handling to be able to make forward progress without needing to
> allocate up to a megabyte of memory at runtime. The other payload_size
> allocations are for one-off things that run at the beginning of time,
> but Poison and Events run repeatedly.
> 
> > 
snip
  
Dan Williams Dec. 7, 2022, 9:39 p.m. UTC | #5
Alison Schofield wrote:
> On Tue, Dec 06, 2022 at 06:41:34PM -0800, Dan Williams wrote:
> > alison.schofield@ wrote:
[..]
> > > +int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
> > > +		       struct cxl_region *cxlr)
> > > +{
> > > +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> > > +	const char *memdev_name = dev_name(&cxlmd->dev);
> > > +	const char *pcidev_name = dev_name(cxlds->dev);
> > > +	struct cxl_mbox_poison_payload_out *po;
> > > +	struct cxl_mbox_poison_payload_in pi;
> > > +	int nr_records = 0;
> > > +	int rc;
> > > +
> > > +	po = kvmalloc(cxlds->payload_size, GFP_KERNEL);
> > > +	if (!po)
> > > +		return -ENOMEM;
> > > +
> > > +	pi.offset = cpu_to_le64(offset);
> > > +	pi.length = cpu_to_le64(len);
> > > +
> > > +	rc = mutex_lock_interruptible(&cxlds->poison_list_mutex);
> > 
> > So I do not know what this mutex is protecting if there is an allocation
> > per cxl_mem_get_poison() invocation. Although I suspect that's somewhat
> > wasteful. Just allocate one buffer at the beginning of time and then use
> > the lock to protect that buffer.
> 
> Intent was a single lock on the device to protect the state of the
> poison list retrieval - do not allow > 1 reader. With > 1 reader
> software may not know if it retrieved the complete list.
> 
> I'm not understanding about protecting a buffer, instead of protecting
> the state. Here I try to protect the state.

Ah, sorry I read cxlds->poison_list_mutex and assumed it was serializing
access to the buffer, not a state machine. I do think it would be
worthwhile to make this a self contained structure with its own kdoc to
explain what is going on, something like:

diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index ab138004f644..02697d1d951c 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -193,6 +193,19 @@ struct cxl_endpoint_dvsec_info {
        struct range dvsec_range[2];
 };
 
+/**
+ * struct cxl_poison_state - summary
+ * @payload: ...
+ * @lock: ...
+ *
+ * A bit more description of why state needs to be held over successive
+ * mbox commands...
+ */
+struct cxl_poison_state {
+       void *payload;
+       struct mutex lock;
+};
+
 /**
  * struct cxl_dev_state - The driver device state
  *
@@ -210,6 +223,7 @@ struct cxl_endpoint_dvsec_info {
  * @lsa_size: Size of Label Storage Area
  *                (CXL 2.0 8.2.9.5.1.1 Identify Memory Device)
  * @mbox_mutex: Mutex to synchronize mailbox access.
+ * @poison: Poison list retrieval and tracing
  * @firmware_version: Firmware version for the memory device.
  * @enabled_cmds: Hardware commands found enabled in CEL.
  * @exclusive_cmds: Commands that are kernel-internal only
@@ -244,6 +258,7 @@ struct cxl_dev_state {
        size_t payload_size;
        size_t lsa_size;
        struct mutex mbox_mutex; /* Protects device mailbox and firmware */
+       struct cxl_poison_state poison;
        char firmware_version[0x10];
        DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
        DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
  
Alison Schofield Dec. 8, 2022, 3:47 a.m. UTC | #6
On Wed, Dec 07, 2022 at 01:39:58PM -0800, Dan Williams wrote:
> Alison Schofield wrote:
> > On Tue, Dec 06, 2022 at 06:41:34PM -0800, Dan Williams wrote:
> > > alison.schofield@ wrote:
> [..]
> > > > +int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
> > > > +		       struct cxl_region *cxlr)
> > > > +{
> > > > +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> > > > +	const char *memdev_name = dev_name(&cxlmd->dev);
> > > > +	const char *pcidev_name = dev_name(cxlds->dev);
> > > > +	struct cxl_mbox_poison_payload_out *po;
> > > > +	struct cxl_mbox_poison_payload_in pi;
> > > > +	int nr_records = 0;
> > > > +	int rc;
> > > > +
> > > > +	po = kvmalloc(cxlds->payload_size, GFP_KERNEL);
> > > > +	if (!po)
> > > > +		return -ENOMEM;
> > > > +
> > > > +	pi.offset = cpu_to_le64(offset);
> > > > +	pi.length = cpu_to_le64(len);
> > > > +
> > > > +	rc = mutex_lock_interruptible(&cxlds->poison_list_mutex);
> > > 
> > > So I do not know what this mutex is protecting if there is an allocation
> > > per cxl_mem_get_poison() invocation. Although I suspect that's somewhat
> > > wasteful. Just allocate one buffer at the beginning of time and then use
> > > the lock to protect that buffer.
> > 
> > Intent was a single lock on the device to protect the state of the
> > poison list retrieval - do not allow > 1 reader. With > 1 reader
> > software may not know if it retrieved the complete list.
> > 
> > I'm not understanding about protecting a buffer, instead of protecting
> > the state. Here I try to protect the state.
> 
> Ah, sorry I read cxlds->poison_list_mutex and assumed it was serializing
> access to the buffer, not a state machine. I do think it would be
> worthwhile to make this a self contained structure with its own kdoc to
> explain what is going on, something like:
> 

OK thanks, it's getting through to me now. 
Will do !

> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index ab138004f644..02697d1d951c 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -193,6 +193,19 @@ struct cxl_endpoint_dvsec_info {
>         struct range dvsec_range[2];
>  };
>  
> +/**
> + * struct cxl_poison_state - summary
> + * @payload: ...
> + * @lock: ...
> + *
> + * A bit more description of why state needs to be held over successive
> + * mbox commands...
> + */
> +struct cxl_poison_state {
> +       void *payload;
> +       struct mutex lock;
> +};
> +
>  /**
>   * struct cxl_dev_state - The driver device state
>   *
> @@ -210,6 +223,7 @@ struct cxl_endpoint_dvsec_info {
>   * @lsa_size: Size of Label Storage Area
>   *                (CXL 2.0 8.2.9.5.1.1 Identify Memory Device)
>   * @mbox_mutex: Mutex to synchronize mailbox access.
> + * @poison: Poison list retrieval and tracing
>   * @firmware_version: Firmware version for the memory device.
>   * @enabled_cmds: Hardware commands found enabled in CEL.
>   * @exclusive_cmds: Commands that are kernel-internal only
> @@ -244,6 +258,7 @@ struct cxl_dev_state {
>         size_t payload_size;
>         size_t lsa_size;
>         struct mutex mbox_mutex; /* Protects device mailbox and firmware */
> +       struct cxl_poison_state poison;
>         char firmware_version[0x10];
>         DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
>         DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
  

Patch

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 0c90f13870a4..88f034e97812 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -9,6 +9,9 @@ 
 
 #include "core.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/cxl.h>
+
 static bool cxl_raw_allow_all;
 
 /**
@@ -752,6 +755,7 @@  int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
 {
 	/* See CXL 2.0 Table 175 Identify Memory Device Output Payload */
 	struct cxl_mbox_identify id;
+	__le32 val = 0;
 	int rc;
 
 	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_IDENTIFY, NULL, 0, &id,
@@ -771,6 +775,9 @@  int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
 	cxlds->lsa_size = le32_to_cpu(id.lsa_size);
 	memcpy(cxlds->firmware_version, id.fw_revision, sizeof(id.fw_revision));
 
+	memcpy(&val, id.poison_list_max_mer, 3);
+	cxlds->poison_max = min_t(u32, le32_to_cpu(val), CXL_POISON_LIST_MAX);
+
 	return 0;
 }
 EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL);
@@ -835,6 +842,79 @@  int cxl_mem_create_range_info(struct cxl_dev_state *cxlds)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, CXL);
 
+static void cxl_trace_poison(struct cxl_mbox_poison_payload_out *po,
+			     struct cxl_region *cxlr,
+			     const char *memdev_name,
+			     const char *pcidev_name)
+{
+	const char *region_name = cxlr ? dev_name(&cxlr->dev) : NULL;
+	struct cxl_region_params *p = cxlr ? &cxlr->params : NULL;
+	uuid_t *uuid = p ? &p->uuid : NULL;
+	u64 addr, dpa, overflow_t = 0;
+	u8 source;
+	u32 len;
+
+	if (po->flags & CXL_POISON_FLAG_OVERFLOW)
+		overflow_t = le64_to_cpu(po->overflow_timestamp);
+
+	for (int i = 0; i < le16_to_cpu(po->count); i++) {
+		len = le32_to_cpu(po->record[i].length) * CXL_POISON_LEN_MULT;
+		addr = le64_to_cpu(po->record[i].address);
+		source = addr & CXL_POISON_SOURCE_MASK;
+		dpa = addr & CXL_POISON_START_MASK;
+
+		trace_cxl_poison(memdev_name, pcidev_name, region_name, uuid,
+				 dpa, len, source, po->flags, overflow_t);
+	}
+}
+
+int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
+		       struct cxl_region *cxlr)
+{
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	const char *memdev_name = dev_name(&cxlmd->dev);
+	const char *pcidev_name = dev_name(cxlds->dev);
+	struct cxl_mbox_poison_payload_out *po;
+	struct cxl_mbox_poison_payload_in pi;
+	int nr_records = 0;
+	int rc;
+
+	po = kvmalloc(cxlds->payload_size, GFP_KERNEL);
+	if (!po)
+		return -ENOMEM;
+
+	pi.offset = cpu_to_le64(offset);
+	pi.length = cpu_to_le64(len);
+
+	rc = mutex_lock_interruptible(&cxlds->poison_list_mutex);
+	if (rc)
+		goto out;
+
+	do {
+		rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi,
+				       sizeof(pi), po, cxlds->payload_size);
+		if (rc)
+			break;
+
+		if (trace_cxl_poison_enabled())
+			cxl_trace_poison(po, cxlr, memdev_name, pcidev_name);
+
+		/* Protect against an uncleared _FLAG_MORE */
+		nr_records = nr_records + le16_to_cpu(po->count);
+		if (nr_records >= cxlds->poison_max) {
+			dev_dbg(&cxlmd->dev, "Max Error Records reached: %d\n",
+				nr_records);
+			break;
+		}
+	} while (po->flags & CXL_POISON_FLAG_MORE);
+
+	mutex_unlock(&cxlds->poison_list_mutex);
+out:
+	kvfree(po);
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison, CXL);
+
 struct cxl_dev_state *cxl_dev_state_create(struct device *dev)
 {
 	struct cxl_dev_state *cxlds;
@@ -846,6 +926,7 @@  struct cxl_dev_state *cxl_dev_state_create(struct device *dev)
 	}
 
 	mutex_init(&cxlds->mbox_mutex);
+	mutex_init(&cxlds->poison_list_mutex);
 	cxlds->dev = dev;
 
 	return cxlds;
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 669868cc1553..49d891347e39 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -192,6 +192,8 @@  struct cxl_endpoint_dvsec_info {
  *                (CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register)
  * @lsa_size: Size of Label Storage Area
  *                (CXL 2.0 8.2.9.5.1.1 Identify Memory Device)
+ * @poison_max: maximum media error records held in device cache
+ * @poison_list_mutex: Mutex to synchronize poison list retrieval
  * @mbox_mutex: Mutex to synchronize mailbox access.
  * @firmware_version: Firmware version for the memory device.
  * @enabled_cmds: Hardware commands found enabled in CEL.
@@ -224,6 +226,8 @@  struct cxl_dev_state {
 
 	size_t payload_size;
 	size_t lsa_size;
+	u32 poison_max;
+	struct mutex poison_list_mutex;  /* Protect reads of poison list */
 	struct mutex mbox_mutex; /* Protects device mailbox and firmware */
 	char firmware_version[0x10];
 	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
@@ -349,6 +353,37 @@  struct cxl_mbox_set_partition_info {
 
 /* Get Poison List  CXL 3.0 Spec 8.2.9.8.4.1 */
 
+struct cxl_mbox_poison_payload_in {
+	__le64 offset;
+	__le64 length;
+} __packed;
+
+struct cxl_mbox_poison_payload_out {
+	u8 flags;
+	u8 rsvd1;
+	__le64 overflow_timestamp;
+	__le16 count;
+	u8 rsvd2[0x14];
+	struct cxl_poison_record {
+		__le64 address;
+		__le32 length;
+		__le32 rsvd;
+	} __packed record[];
+} __packed;
+
+/*
+ * Get Poison List address field encodes the starting
+ * address of poison, and the source of the poison.
+ */
+#define CXL_POISON_START_MASK		GENMASK_ULL(63, 6)
+#define CXL_POISON_SOURCE_MASK		GENMASK(2, 0)
+
+/* Get Poison List record length is in units of 64 bytes */
+#define CXL_POISON_LEN_MULT	64
+
+/* Kernel defined maximum for a list of poison errors */
+#define CXL_POISON_LIST_MAX	1024
+
 /* Get Poison List: Payload out flags */
 #define CXL_POISON_FLAG_MORE            BIT(0)
 #define CXL_POISON_FLAG_OVERFLOW        BIT(1)
@@ -395,6 +430,8 @@  int cxl_mem_create_range_info(struct cxl_dev_state *cxlds);
 struct cxl_dev_state *cxl_dev_state_create(struct device *dev);
 void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
 void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
+int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
+		       struct cxl_region *cxlr);
 #ifdef CONFIG_CXL_SUSPEND
 void cxl_mem_active_inc(void);
 void cxl_mem_active_dec(void);