[v8,2/6] cxl/trace: Add TRACE support for CXL media-error records

Message ID a77d5513c55177361749a27544409cb6a0c94dd5.1678468593.git.alison.schofield@intel.com
State New
Headers
Series CXL Poison List Retrieval & Tracing |

Commit Message

Alison Schofield March 10, 2023, 5:54 p.m. UTC
  From: Alison Schofield <alison.schofield@intel.com>

CXL devices may support the retrieval of a device poison list.
Add a new trace event that the CXL subsystem may use to log
the media-error records returned in the poison list.

Log each media-error record as a trace event of type 'cxl_poison'.

Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/cxl/core/mbox.c  |  4 +-
 drivers/cxl/core/trace.h | 84 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+), 1 deletion(-)
  

Comments

Ira Weiny March 13, 2023, 10:47 p.m. UTC | #1
alison.schofield@ wrote:
> From: Alison Schofield <alison.schofield@intel.com>
> 
> CXL devices may support the retrieval of a device poison list.
> Add a new trace event that the CXL subsystem may use to log
> the media-error records returned in the poison list.
> 
> Log each media-error record as a trace event of type 'cxl_poison'.
> 
> Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> ---
>  drivers/cxl/core/mbox.c  |  4 +-
>  drivers/cxl/core/trace.h | 84 ++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 87 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 0da0a30511f2..77fc811bdfed 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -1147,7 +1147,9 @@ int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
>  		if (rc)
>  			break;
>  
> -		/* TODO TRACE the media error records */
> +		for (int i = 0; i < le16_to_cpu(po->count); i++)
> +			trace_cxl_poison(cxlmd, cxlr, &po->record[i],
> +					 po->flags, po->overflow_t);
>  
>  		/* Protect against an uncleared _FLAG_MORE */
>  		nr_records = nr_records + le16_to_cpu(po->count);
> diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
> index 9b8d3d997834..33a22d26e742 100644
> --- a/drivers/cxl/core/trace.h
> +++ b/drivers/cxl/core/trace.h
> @@ -7,6 +7,7 @@
>  #define _CXL_EVENTS_H
>  
>  #include <linux/tracepoint.h>
> +#include <linux/pci.h>
>  #include <asm-generic/unaligned.h>
>  
>  #include <cxl.h>
> @@ -600,6 +601,89 @@ TRACE_EVENT(cxl_memory_module,
>  	)
>  );
>  
> +#define __show_poison_source(source)                          \
> +	__print_symbolic(source,                              \
> +		{ CXL_POISON_SOURCE_UNKNOWN,   "Unknown"  },  \
> +		{ CXL_POISON_SOURCE_EXTERNAL,  "External" },  \
> +		{ CXL_POISON_SOURCE_INTERNAL,  "Internal" },  \
> +		{ CXL_POISON_SOURCE_INJECTED,  "Injected" },  \
> +		{ CXL_POISON_SOURCE_VENDOR,    "Vendor"   })
> +
> +#define show_poison_source(source)			     \
> +	(((source > CXL_POISON_SOURCE_INJECTED) &&	     \
> +	 (source != CXL_POISON_SOURCE_VENDOR)) ? "Reserved"  \
> +	 : __show_poison_source(source))
> +
> +#define show_poison_flags(flags)                             \
> +	__print_flags(flags, "|",                            \
> +		{ CXL_POISON_FLAG_MORE,      "More"     },   \
> +		{ CXL_POISON_FLAG_OVERFLOW,  "Overflow"  },  \
> +		{ CXL_POISON_FLAG_SCANNING,  "Scanning"  })
> +
> +#define __cxl_poison_addr(record)					\
> +	(le64_to_cpu(record->address))
> +#define cxl_poison_record_dpa(record)					\
> +	(__cxl_poison_addr(record) & CXL_POISON_START_MASK)
> +#define cxl_poison_record_source(record)				\
> +	(__cxl_poison_addr(record)  & CXL_POISON_SOURCE_MASK)
> +#define cxl_poison_record_length(record)				\
> +	(le32_to_cpu(record->length) * CXL_POISON_LEN_MULT)
> +#define cxl_poison_overflow(flags, time)				\
> +	(flags & CXL_POISON_FLAG_OVERFLOW ? le64_to_cpu(time) : 0)
> +
> +TRACE_EVENT(cxl_poison,
> +
> +	TP_PROTO(struct cxl_memdev *cxlmd, struct cxl_region *region,
> +		 const struct cxl_poison_record *record,
> +		 u8 flags, __le64 overflow_t),

FWIW I made event overflow a separate trace event.

Will this make all of the poisons in a single GetPoison command marked
with overflow in the trace buffer?

Ira

> +
> +	TP_ARGS(cxlmd, region, record, flags, overflow_t),
> +
> +	TP_STRUCT__entry(
> +		__string(memdev, dev_name(&cxlmd->dev))
> +		__string(host, dev_name(cxlmd->dev.parent))
> +		__field(u64, serial)
> +		__string(region, region)
> +		__field(u64, overflow_t)
> +		__field(u64, dpa)
> +		__field(u32, length)
> +		__array(char, uuid, 16)
> +		__field(u8, source)
> +		__field(u8, flags)
> +	    ),
> +
> +	TP_fast_assign(
> +		__assign_str(memdev, dev_name(&cxlmd->dev));
> +		__assign_str(host, dev_name(cxlmd->dev.parent));
> +		__entry->serial = cxlmd->cxlds->serial;
> +		__entry->overflow_t = cxl_poison_overflow(flags, overflow_t);
> +		__entry->dpa = cxl_poison_record_dpa(record);
> +		__entry->length = cxl_poison_record_length(record);
> +		__entry->source = cxl_poison_record_source(record);
> +		__entry->flags = flags;
> +		if (region) {
> +			__assign_str(region, dev_name(&region->dev));
> +			memcpy(__entry->uuid, &region->params.uuid, 16);
> +		} else {
> +			__assign_str(region, "");
> +			memset(__entry->uuid, 0, 16);
> +		}
> +	    ),
> +
> +	TP_printk("memdev=%s host=%s serial=%lld region=%s region_uuid=%pU dpa=0x%llx length=0x%x source=%s flags=%s overflow_time=%llu",
> +		__get_str(memdev),
> +		__get_str(host),
> +		__entry->serial,
> +		__get_str(region),
> +		__entry->uuid,
> +		__entry->dpa,
> +		__entry->length,
> +		show_poison_source(__entry->source),
> +		show_poison_flags(__entry->flags),
> +		__entry->overflow_t
> +	)
> +);
> +
>  #endif /* _CXL_EVENTS_H */
>  
>  #define TRACE_INCLUDE_FILE trace
> -- 
> 2.37.3
>
  
Alison Schofield March 14, 2023, 12:21 a.m. UTC | #2
On Mon, Mar 13, 2023 at 03:47:35PM -0700, Ira Weiny wrote:
> alison.schofield@ wrote:
> > From: Alison Schofield <alison.schofield@intel.com>
> > 
> > CXL devices may support the retrieval of a device poison list.
> > Add a new trace event that the CXL subsystem may use to log
> > the media-error records returned in the poison list.
> > 
> > Log each media-error record as a trace event of type 'cxl_poison'.
> > 
> > Signed-off-by: Alison Schofield <alison.schofield@intel.com>
> > Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > ---
> >  drivers/cxl/core/mbox.c  |  4 +-
> >  drivers/cxl/core/trace.h | 84 ++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 87 insertions(+), 1 deletion(-)

snip

> > 
> > +#define cxl_poison_overflow(flags, time)				\
> > +	(flags & CXL_POISON_FLAG_OVERFLOW ? le64_to_cpu(time) : 0)
> > +
> > +TRACE_EVENT(cxl_poison,
> > +
> > +	TP_PROTO(struct cxl_memdev *cxlmd, struct cxl_region *region,
> > +		 const struct cxl_poison_record *record,
> > +		 u8 flags, __le64 overflow_t),
> 
> FWIW I made event overflow a separate trace event.
> 
> Will this make all of the poisons in a single GetPoison command marked
> with overflow in the trace buffer?

Yes. Every record returned within a poison payload, gets the same flags and
overflow_t reported in its trace events. 

I took a peek at what you did.  (Perhaps we should have called that
cxl_event_overflow). I don't think the poison reporting allows a
similar, singular overflow trace event. The overflow setting means
the device has overflowed its poison list and the list may be
incomplete. I think we repeat the overflow state on every cxl_poison
trace event until the overflow status goes away. (Scan Media)

Alison


> 
> Ira
> 
> > +
snip

> 
>
  

Patch

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 0da0a30511f2..77fc811bdfed 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -1147,7 +1147,9 @@  int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
 		if (rc)
 			break;
 
-		/* TODO TRACE the media error records */
+		for (int i = 0; i < le16_to_cpu(po->count); i++)
+			trace_cxl_poison(cxlmd, cxlr, &po->record[i],
+					 po->flags, po->overflow_t);
 
 		/* Protect against an uncleared _FLAG_MORE */
 		nr_records = nr_records + le16_to_cpu(po->count);
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index 9b8d3d997834..33a22d26e742 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -7,6 +7,7 @@ 
 #define _CXL_EVENTS_H
 
 #include <linux/tracepoint.h>
+#include <linux/pci.h>
 #include <asm-generic/unaligned.h>
 
 #include <cxl.h>
@@ -600,6 +601,89 @@  TRACE_EVENT(cxl_memory_module,
 	)
 );
 
+#define __show_poison_source(source)                          \
+	__print_symbolic(source,                              \
+		{ CXL_POISON_SOURCE_UNKNOWN,   "Unknown"  },  \
+		{ CXL_POISON_SOURCE_EXTERNAL,  "External" },  \
+		{ CXL_POISON_SOURCE_INTERNAL,  "Internal" },  \
+		{ CXL_POISON_SOURCE_INJECTED,  "Injected" },  \
+		{ CXL_POISON_SOURCE_VENDOR,    "Vendor"   })
+
+#define show_poison_source(source)			     \
+	(((source > CXL_POISON_SOURCE_INJECTED) &&	     \
+	 (source != CXL_POISON_SOURCE_VENDOR)) ? "Reserved"  \
+	 : __show_poison_source(source))
+
+#define show_poison_flags(flags)                             \
+	__print_flags(flags, "|",                            \
+		{ CXL_POISON_FLAG_MORE,      "More"     },   \
+		{ CXL_POISON_FLAG_OVERFLOW,  "Overflow"  },  \
+		{ CXL_POISON_FLAG_SCANNING,  "Scanning"  })
+
+#define __cxl_poison_addr(record)					\
+	(le64_to_cpu(record->address))
+#define cxl_poison_record_dpa(record)					\
+	(__cxl_poison_addr(record) & CXL_POISON_START_MASK)
+#define cxl_poison_record_source(record)				\
+	(__cxl_poison_addr(record)  & CXL_POISON_SOURCE_MASK)
+#define cxl_poison_record_length(record)				\
+	(le32_to_cpu(record->length) * CXL_POISON_LEN_MULT)
+#define cxl_poison_overflow(flags, time)				\
+	(flags & CXL_POISON_FLAG_OVERFLOW ? le64_to_cpu(time) : 0)
+
+TRACE_EVENT(cxl_poison,
+
+	TP_PROTO(struct cxl_memdev *cxlmd, struct cxl_region *region,
+		 const struct cxl_poison_record *record,
+		 u8 flags, __le64 overflow_t),
+
+	TP_ARGS(cxlmd, region, record, flags, overflow_t),
+
+	TP_STRUCT__entry(
+		__string(memdev, dev_name(&cxlmd->dev))
+		__string(host, dev_name(cxlmd->dev.parent))
+		__field(u64, serial)
+		__string(region, region)
+		__field(u64, overflow_t)
+		__field(u64, dpa)
+		__field(u32, length)
+		__array(char, uuid, 16)
+		__field(u8, source)
+		__field(u8, flags)
+	    ),
+
+	TP_fast_assign(
+		__assign_str(memdev, dev_name(&cxlmd->dev));
+		__assign_str(host, dev_name(cxlmd->dev.parent));
+		__entry->serial = cxlmd->cxlds->serial;
+		__entry->overflow_t = cxl_poison_overflow(flags, overflow_t);
+		__entry->dpa = cxl_poison_record_dpa(record);
+		__entry->length = cxl_poison_record_length(record);
+		__entry->source = cxl_poison_record_source(record);
+		__entry->flags = flags;
+		if (region) {
+			__assign_str(region, dev_name(&region->dev));
+			memcpy(__entry->uuid, &region->params.uuid, 16);
+		} else {
+			__assign_str(region, "");
+			memset(__entry->uuid, 0, 16);
+		}
+	    ),
+
+	TP_printk("memdev=%s host=%s serial=%lld region=%s region_uuid=%pU dpa=0x%llx length=0x%x source=%s flags=%s overflow_time=%llu",
+		__get_str(memdev),
+		__get_str(host),
+		__entry->serial,
+		__get_str(region),
+		__entry->uuid,
+		__entry->dpa,
+		__entry->length,
+		show_poison_source(__entry->source),
+		show_poison_flags(__entry->flags),
+		__entry->overflow_t
+	)
+);
+
 #endif /* _CXL_EVENTS_H */
 
 #define TRACE_INCLUDE_FILE trace