[v9,2/3] PCI/DOE: Expose the DOE features via sysfs

Message ID 20231013034158.2745127-2-alistair.francis@wdc.com
State New
Headers
Series [v9,1/3] PCI/DOE: Rename DOE protocol to feature |

Commit Message

Alistair Francis Oct. 13, 2023, 3:41 a.m. UTC
  The PCIe 6 specification added support for the Data Object Exchange (DOE).
When DOE is supported the DOE Discovery Feature must be implemented per
PCIe r6.1 sec 6.30.1.1. The protocol allows a requester to obtain
information about the other DOE features supported by the device.

The kernel is already querying the DOE features supported and cacheing
the values. Expose the values in sysfs to allow user space to
determine which DOE features are supported by the PCIe device.

By exposing the information to userspace tools like lspci can relay the
information to users. By listing all of the supported features we can
allow userspace to parse the list, which might include
vendor specific features as well as yet to be supported features.

Signed-off-by: Alistair Francis <alistair.francis@wdc.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
v9:
 - Add a teardown function
 - Rename functions to be clearer
 - Tidy up the commit message
 - Remove #ifdef from header
v8:
 - Inlucde an example in the docs
 - Fixup removing a file that wasn't added
 - Remove a blank line
v7:
 - Fixup the #ifdefs to keep the test robot happy
v6:
 - Use "feature" instead of protocol
 - Don't use any devm_* functions
 - Add two more patches to the series
v5:
 - Return the file name as the file contents
 - Code cleanups and simplifications
v4:
 - Fixup typos in the documentation
 - Make it clear that the file names contain the information
 - Small code cleanups
 - Remove most #ifdefs
 - Remove extra NULL assignment
v3:
 - Expose each DOE feature as a separate file
v2:
 - Add documentation
 - Code cleanups

 Documentation/ABI/testing/sysfs-bus-pci |  24 +++++
 drivers/pci/doe.c                       | 134 ++++++++++++++++++++++++
 drivers/pci/pci-sysfs.c                 |  14 +++
 drivers/pci/pci.h                       |   1 +
 include/linux/pci-doe.h                 |   2 +
 5 files changed, 175 insertions(+)
  

Comments

Lukas Wunner Oct. 17, 2023, 8:34 a.m. UTC | #1
On Fri, Oct 13, 2023 at 01:41:57PM +1000, Alistair Francis wrote:
> +#ifdef CONFIG_SYSFS
> +static umode_t pci_doe_sysfs_attr_is_visible(struct kobject *kobj,
> +					     struct attribute *a, int n)
> +{
> +	struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
> +	struct pci_doe_mb *doe_mb;
> +	unsigned long index, j;
> +	void *entry;
> +
> +	xa_for_each(&pdev->doe_mbs, index, doe_mb) {
> +		xa_for_each(&doe_mb->feats, j, entry)
> +			return a->mode;
> +	}
> +
> +	return 0;
> +}

Out of curiosity:  Does this method prevent creation of a
"doe_features" directory for devices which don't have any
DOE mailboxes?

(If it does, a code comment explaining that might be helpful.)


> +const struct attribute_group pci_dev_doe_feature_group = {
> +	.name	= "doe_features",
> +	.attrs	= pci_dev_doe_feature_attrs,
> +	.is_visible = pci_doe_sysfs_attr_is_visible,
> +};

Nit:  Wondering why the "=" is aligned for .name and .attrs
but not for .is_visible?


> +static void pci_doe_sysfs_feature_remove(struct pci_dev *pdev,
> +					 struct pci_doe_mb *doe_mb)
> +{
> +	struct device *dev = &pdev->dev;
> +	struct device_attribute *attrs = doe_mb->sysfs_attrs;
> +	unsigned long i;
> +	void *entry;

Nit:  Inverse Christmas tree?


> +static int pci_doe_sysfs_feature_populate(struct pci_dev *pdev,
> +					  struct pci_doe_mb *doe_mb)
> +{
> +	struct device *dev = &pdev->dev;
> +	struct device_attribute *attrs;
> +	unsigned long num_features = 0;
> +	unsigned long vid, type;
> +	unsigned long i;
> +	void *entry;
> +	int ret;
> +
> +	xa_for_each(&doe_mb->feats, i, entry)
> +		num_features++;
> +
> +	attrs = kcalloc(num_features, sizeof(*attrs), GFP_KERNEL);
> +	if (!attrs)
> +		return -ENOMEM;
> +
> +	doe_mb->sysfs_attrs = attrs;
> +	xa_for_each(&doe_mb->feats, i, entry) {
> +		sysfs_attr_init(&attrs[i].attr);
> +		vid = xa_to_value(entry) >> 8;
> +		type = xa_to_value(entry) & 0xFF;
> +		attrs[i].attr.name = kasprintf(GFP_KERNEL,
> +					       "0x%04lX:%02lX", vid, type);

Nit:  Capital X conversion specifier will result in upper case hex
characters in filename and contents, whereas existing sysfs attributes
such as "vendor", "device" contain lower case hex characters.

Might want to consider lower case x for consistency.


> +void pci_doe_sysfs_teardown(struct pci_dev *pdev)
> +{
> +	struct pci_doe_mb *doe_mb;
> +	unsigned long index;
> +
> +	xa_for_each(&pdev->doe_mbs, index, doe_mb) {
> +		pci_doe_sysfs_feature_remove(pdev, doe_mb);
> +	}

Nit:  Curly braces not necessary.


> @@ -1153,6 +1154,10 @@ static void pci_remove_resource_files(struct pci_dev *pdev)
>  {
>  	int i;
>  
> +	if (IS_ENABLED(CONFIG_PCI_DOE)) {
> +		pci_doe_sysfs_teardown(pdev);
> +	}

Nit:  Curly braces not necessary.


> @@ -1230,6 +1235,12 @@ static int pci_create_resource_files(struct pci_dev *pdev)
>  	int i;
>  	int retval;
>  
> +	if (IS_ENABLED(CONFIG_PCI_DOE)) {
> +		retval = pci_doe_sysfs_init(pdev);
> +		if (retval)
> +			return retval;
> +	}
> +
>  	/* Expose the PCI resources from this device as files */
>  	for (i = 0; i < PCI_STD_NUM_BARS; i++) {

I think this needs to be added to pci_create_sysfs_dev_files() instead.

pci_create_resource_files() only deals with creation of resource files,
as the name implies, which is unrelated to DOE features.

Worse, pci_create_resource_files() is also called from
pci_dev_resource_resize_attr(), i.e. every time user space
writes to the "resource##n##_resize" attributes.

Similarly, the call to pci_doe_sysfs_teardown() belongs in
pci_remove_sysfs_dev_files().

Thanks,

Lukas
  
Bjorn Helgaas Oct. 19, 2023, 4:58 p.m. UTC | #2
On Fri, Oct 13, 2023 at 01:41:57PM +1000, Alistair Francis wrote:
> The PCIe 6 specification added support for the Data Object Exchange (DOE).
> When DOE is supported the DOE Discovery Feature must be implemented per
> PCIe r6.1 sec 6.30.1.1. The protocol allows a requester to obtain
> information about the other DOE features supported by the device.
> ...

> +static umode_t pci_doe_sysfs_attr_is_visible(struct kobject *kobj,
> +					     struct attribute *a, int n)
> +{
> +	struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
> +	struct pci_doe_mb *doe_mb;
> +	unsigned long index, j;
> +	void *entry;
> +
> +	xa_for_each(&pdev->doe_mbs, index, doe_mb) {
> +		xa_for_each(&doe_mb->feats, j, entry)
> +			return a->mode;
> +	}
> +
> +	return 0;

The nested loops that don't test anything look a little weird and
maybe I'm missing something, but this looks like it returns a->mode if
any mailbox with a feature exists, and 0 otherwise.

Is that the same as this:

  if (pdev->doe_mbs)
    return a->mode;

  return 0;

since it sounds like a mailbox must support at least one feature?

> +}
> +
> +static struct attribute *pci_dev_doe_feature_attrs[] = {
> +	NULL,
> +};
> +
> +const struct attribute_group pci_dev_doe_feature_group = {
> +	.name	= "doe_features",
> +	.attrs	= pci_dev_doe_feature_attrs,
> +	.is_visible = pci_doe_sysfs_attr_is_visible,
> +};
> +
> +static ssize_t pci_doe_sysfs_feature_show(struct device *dev,
> +					  struct device_attribute *attr,
> +					  char *buf)
> +{
> +	return sysfs_emit(buf, "%s\n", attr->attr.name);
> +}
> +
> +static void pci_doe_sysfs_feature_remove(struct pci_dev *pdev,
> +					 struct pci_doe_mb *doe_mb)
> +{
> +	struct device *dev = &pdev->dev;
> +	struct device_attribute *attrs = doe_mb->sysfs_attrs;
> +	unsigned long i;
> +	void *entry;
> +
> +	if (!attrs)
> +		return;
> +
> +	doe_mb->sysfs_attrs = NULL;
> +	xa_for_each(&doe_mb->feats, i, entry) {
> +		if (attrs[i].show)
> +			sysfs_remove_file_from_group(&dev->kobj, &attrs[i].attr,
> +						     pci_dev_doe_feature_group.name);
> +		kfree(attrs[i].attr.name);
> +	}
> +	kfree(attrs);
> +}
> +
> +static int pci_doe_sysfs_feature_populate(struct pci_dev *pdev,
> +					  struct pci_doe_mb *doe_mb)
> +{
> +	struct device *dev = &pdev->dev;
> +	struct device_attribute *attrs;
> +	unsigned long num_features = 0;
> +	unsigned long vid, type;
> +	unsigned long i;
> +	void *entry;
> +	int ret;
> +
> +	xa_for_each(&doe_mb->feats, i, entry)
> +		num_features++;
> +
> +	attrs = kcalloc(num_features, sizeof(*attrs), GFP_KERNEL);
> +	if (!attrs)
> +		return -ENOMEM;
> +
> +	doe_mb->sysfs_attrs = attrs;
> +	xa_for_each(&doe_mb->feats, i, entry) {
> +		sysfs_attr_init(&attrs[i].attr);
> +		vid = xa_to_value(entry) >> 8;
> +		type = xa_to_value(entry) & 0xFF;
> +		attrs[i].attr.name = kasprintf(GFP_KERNEL,
> +					       "0x%04lX:%02lX", vid, type);

What's the rationale for using "0x" on the vendor ID but not on the
type?  "0x1234:10" hints that the "10" might be decimal since it lacks
"0x".

Suggest lower-case "%04lx:%02lx" either way.

FWIW, there's no "0x" prefix on the hex vendor IDs in "lspci -n"
output and dmesg messages like this:

  pci 0000:01:00.0: [10de:13b6] type 00

> +		if (!attrs[i].attr.name) {
> +			ret = -ENOMEM;
> +			goto fail;
> +		}
> +
> +		attrs[i].attr.mode = 0444;
> +		attrs[i].show = pci_doe_sysfs_feature_show;
> +
> +		ret = sysfs_add_file_to_group(&dev->kobj, &attrs[i].attr,
> +					      pci_dev_doe_feature_group.name);
> +		if (ret) {
> +			attrs[i].show = NULL;
> +			goto fail;
> +		}
> +	}
> +
> +	return 0;
> +
> +fail:
> +	pci_doe_sysfs_feature_remove(pdev, doe_mb);
> +	return ret;

Not sure we should treat this failure this seriously.  Looks like it
will prevent creation of the BAR resource files, and possibly even
abort pci_sysfs_init() early.  I think the pci_dev will still be
usable (lacking DOE sysfs) even if this fails.

> +}
> +
> +void pci_doe_sysfs_teardown(struct pci_dev *pdev)
> +{
> +	struct pci_doe_mb *doe_mb;
> +	unsigned long index;
> +
> +	xa_for_each(&pdev->doe_mbs, index, doe_mb) {
> +		pci_doe_sysfs_feature_remove(pdev, doe_mb);
> +	}
> +}
> +
> +int pci_doe_sysfs_init(struct pci_dev *pdev)
> +{
> +	struct pci_doe_mb *doe_mb;
> +	unsigned long index;
> +	int ret;
> +
> +	xa_for_each(&pdev->doe_mbs, index, doe_mb) {
> +		ret = pci_doe_sysfs_feature_populate(pdev, doe_mb);
> +		if (ret)
> +			return ret;
> +	}

I agree with Lukas that pci_create_resource_files() is not the right
place to call this.

I try hard to avoid calling *anything* from the
pci_create_sysfs_dev_files() path because it has the nasty
"sysfs_initialized" check and the associated pci_sysfs_init()
initcall.

It's so much cleaner when we can set up static attributes that are
automatically added in the device_add() path.  I don't know whether
that's possible.  I see lots of discussion with Greg KH that might be
related, but I'm not sure.

I do know that we enumerate the mailboxes and features during
pci_init_capabilities(), which happens before device_add(), so the
information about which attributes should be present is at least
*available* early enough:

  pci_host_probe
    pci_scan_root_bus_bridge
      ...
        pci_scan_single_device
          pci_device_add
            pci_init_capabilities
              pci_doe_init
                while (pci_find_next_ext_capability(PCI_EXT_CAP_ID_DOE))
                  pci_doe_create_mb
                    pci_doe_cache_features
                      pci_doe_discovery
                      xa_insert(&doe_mb->feats)   <--
            device_add
              device_add_attrs
                device_add_groups
    pci_bus_add_devices
      pci_bus_add_device
        pci_create_sysfs_dev_files
          ...
            pci_doe_sysfs_init                    <--
              xa_for_each(&pdev->doe_mbs)
                pci_doe_sysfs_feature_populate
                  xa_for_each(&doe_mb->feats)
                    sysfs_add_file_to_group(pci_dev_doe_feature_group.name)

Is it feasible to build an attribute group in pci_doe_init() and add
it to dev->groups so device_add() will automatically add them?

It looks like __power_supply_register() does something like that:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/power/supply/power_supply_core.c?id=v6.5#n1375

> --- a/include/linux/pci-doe.h
> +++ b/include/linux/pci-doe.h
> @@ -22,4 +22,6 @@ int pci_doe(struct pci_doe_mb *doe_mb, u16 vendor, u8 type,
>  	    const void *request, size_t request_sz,
>  	    void *response, size_t response_sz);
>  
> +int pci_doe_sysfs_init(struct pci_dev *pci_dev);
> +void pci_doe_sysfs_teardown(struct pci_dev *pdev);

These declarations look like they should be in drivers/pci/pci.h as
well.  I don't think anything outside the PCI core should need these.

Bjorn
  
Lukas Wunner Oct. 19, 2023, 7:32 p.m. UTC | #3
On Thu, Oct 19, 2023 at 11:58:29AM -0500, Bjorn Helgaas wrote:
> On Fri, Oct 13, 2023 at 01:41:57PM +1000, Alistair Francis wrote:
> > +	xa_for_each(&pdev->doe_mbs, index, doe_mb) {
> > +		xa_for_each(&doe_mb->feats, j, entry)
> > +			return a->mode;
> > +	}
> > +
> > +	return 0;
> 
> The nested loops that don't test anything look a little weird and
> maybe I'm missing something, but this looks like it returns a->mode if
> any mailbox with a feature exists, and 0 otherwise.
> 
> Is that the same as this:
> 
>   if (pdev->doe_mbs)
>     return a->mode;
> 
>   return 0;
> 
> since it sounds like a mailbox must support at least one feature?

In theory it's the same, in practice there *might* be non-compliant
devices which lack support for the discovery feature.


> > +		attrs[i].attr.name = kasprintf(GFP_KERNEL,
> > +					       "0x%04lX:%02lX", vid, type);
> 
> What's the rationale for using "0x" on the vendor ID but not on the
> type?  "0x1234:10" hints that the "10" might be decimal since it lacks
> "0x".
> 
> Suggest lower-case "%04lx:%02lx" either way.
> 
> FWIW, there's no "0x" prefix on the hex vendor IDs in "lspci -n"
> output and dmesg messages like this:
> 
>   pci 0000:01:00.0: [10de:13b6] type 00

The existing attributes "vendor", "device" etc do emit the "0x".

From drivers/pci/pci-sysfs.c:

pci_config_attr(vendor, "0x%04x\n");
pci_config_attr(device, "0x%04x\n");
pci_config_attr(subsystem_vendor, "0x%04x\n");
pci_config_attr(subsystem_device, "0x%04x\n");
pci_config_attr(revision, "0x%02x\n");
pci_config_attr(class, "0x%06x\n");


> I try hard to avoid calling *anything* from the
> pci_create_sysfs_dev_files() path because it has the nasty
> "sysfs_initialized" check and the associated pci_sysfs_init()
> initcall.

What's the purpose of sysfs_initialized anyway?

It was introduced by this historic commit:
https://git.kernel.org/tglx/history/c/f6d553444da2

Can PCI_ROM_RESOURCEs appear after device enumeration but before
the late_initcall stage?

If sysfs_initialized is only needed for PCI_ROM_RESOURCEs, can we
constrain pci_sysfs_init() to those and avoid creating all the
other runtime sysfs attributes in the initcall?

Thanks,

Lukas
  
Bjorn Helgaas Oct. 19, 2023, 8:01 p.m. UTC | #4
On Thu, Oct 19, 2023 at 09:32:46PM +0200, Lukas Wunner wrote:
> On Thu, Oct 19, 2023 at 11:58:29AM -0500, Bjorn Helgaas wrote:
> > On Fri, Oct 13, 2023 at 01:41:57PM +1000, Alistair Francis wrote:
> > > +	xa_for_each(&pdev->doe_mbs, index, doe_mb) {
> > > +		xa_for_each(&doe_mb->feats, j, entry)
> > > +			return a->mode;
> > > +	}
> > > +
> > > +	return 0;
> > 
> > The nested loops that don't test anything look a little weird and
> > maybe I'm missing something, but this looks like it returns a->mode if
> > any mailbox with a feature exists, and 0 otherwise.
> > 
> > Is that the same as this:
> > 
> >   if (pdev->doe_mbs)
> >     return a->mode;
> > 
> >   return 0;
> > 
> > since it sounds like a mailbox must support at least one feature?
> 
> In theory it's the same, in practice there *might* be non-compliant
> devices which lack support for the discovery feature.

Is there any point in setting ->doe_mbs if there's no feature?

> > > +		attrs[i].attr.name = kasprintf(GFP_KERNEL,
> > > +					       "0x%04lX:%02lX", vid, type);
> > 
> > What's the rationale for using "0x" on the vendor ID but not on the
> > type?  "0x1234:10" hints that the "10" might be decimal since it lacks
> > "0x".

This is my main question.  Seems like it should be both or neither.

> > I try hard to avoid calling *anything* from the
> > pci_create_sysfs_dev_files() path because it has the nasty
> > "sysfs_initialized" check and the associated pci_sysfs_init()
> > initcall.
> 
> What's the purpose of sysfs_initialized anyway?
> 
> It was introduced by this historic commit:
> https://git.kernel.org/tglx/history/c/f6d553444da2
> 
> Can PCI_ROM_RESOURCEs appear after device enumeration but before
> the late_initcall stage?
> 
> If sysfs_initialized is only needed for PCI_ROM_RESOURCEs, can we
> constrain pci_sysfs_init() to those and avoid creating all the
> other runtime sysfs attributes in the initcall?

I think pci_sysfs_init() is already constrained to only the BARs and
ROM.  Constraining it to only the ROM would be an improvement, but I'd
really like to get rid of it altogether.  Krzysztof W. moved a lot of
stuff out of pci_sysfs_init() a while ago, but the BARs are harder
because of some arch/alpha wrinkles, IIRC.

I think the reason for pci_sysfs_init() exists in the first place is
because those resources may be assigned after pci_device_add(), and
(my memory is hazy here) it seems like changing the size of binary
attributes is hard, which might fit with the
pci_remove_resource_files() and pci_create_resource_files() in the
resource##n##_resize_store() macro:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/pci/pci-sysfs.c?id=v6.5#n1440

Bjorn
  
Alistair Francis Nov. 3, 2023, 1:27 a.m. UTC | #5
On Tue, Oct 17, 2023 at 6:34 PM Lukas Wunner <lukas@wunner.de> wrote:
>
> On Fri, Oct 13, 2023 at 01:41:57PM +1000, Alistair Francis wrote:
> > +#ifdef CONFIG_SYSFS
> > +static umode_t pci_doe_sysfs_attr_is_visible(struct kobject *kobj,
> > +                                          struct attribute *a, int n)
> > +{
> > +     struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
> > +     struct pci_doe_mb *doe_mb;
> > +     unsigned long index, j;
> > +     void *entry;
> > +
> > +     xa_for_each(&pdev->doe_mbs, index, doe_mb) {
> > +             xa_for_each(&doe_mb->feats, j, entry)
> > +                     return a->mode;
> > +     }
> > +
> > +     return 0;
> > +}
>
> Out of curiosity:  Does this method prevent creation of a
> "doe_features" directory for devices which don't have any
> DOE mailboxes?

It does once this patch (or something similar) is applied:

https://lkml.org/lkml/2022/8/24/607

GKH and I are working on getting a patch like that working and merged

Alistair

>
> (If it does, a code comment explaining that might be helpful.)
>
>
> > +const struct attribute_group pci_dev_doe_feature_group = {
> > +     .name   = "doe_features",
> > +     .attrs  = pci_dev_doe_feature_attrs,
> > +     .is_visible = pci_doe_sysfs_attr_is_visible,
> > +};
>
> Nit:  Wondering why the "=" is aligned for .name and .attrs
> but not for .is_visible?
>
>
> > +static void pci_doe_sysfs_feature_remove(struct pci_dev *pdev,
> > +                                      struct pci_doe_mb *doe_mb)
> > +{
> > +     struct device *dev = &pdev->dev;
> > +     struct device_attribute *attrs = doe_mb->sysfs_attrs;
> > +     unsigned long i;
> > +     void *entry;
>
> Nit:  Inverse Christmas tree?
>
>
> > +static int pci_doe_sysfs_feature_populate(struct pci_dev *pdev,
> > +                                       struct pci_doe_mb *doe_mb)
> > +{
> > +     struct device *dev = &pdev->dev;
> > +     struct device_attribute *attrs;
> > +     unsigned long num_features = 0;
> > +     unsigned long vid, type;
> > +     unsigned long i;
> > +     void *entry;
> > +     int ret;
> > +
> > +     xa_for_each(&doe_mb->feats, i, entry)
> > +             num_features++;
> > +
> > +     attrs = kcalloc(num_features, sizeof(*attrs), GFP_KERNEL);
> > +     if (!attrs)
> > +             return -ENOMEM;
> > +
> > +     doe_mb->sysfs_attrs = attrs;
> > +     xa_for_each(&doe_mb->feats, i, entry) {
> > +             sysfs_attr_init(&attrs[i].attr);
> > +             vid = xa_to_value(entry) >> 8;
> > +             type = xa_to_value(entry) & 0xFF;
> > +             attrs[i].attr.name = kasprintf(GFP_KERNEL,
> > +                                            "0x%04lX:%02lX", vid, type);
>
> Nit:  Capital X conversion specifier will result in upper case hex
> characters in filename and contents, whereas existing sysfs attributes
> such as "vendor", "device" contain lower case hex characters.
>
> Might want to consider lower case x for consistency.
>
>
> > +void pci_doe_sysfs_teardown(struct pci_dev *pdev)
> > +{
> > +     struct pci_doe_mb *doe_mb;
> > +     unsigned long index;
> > +
> > +     xa_for_each(&pdev->doe_mbs, index, doe_mb) {
> > +             pci_doe_sysfs_feature_remove(pdev, doe_mb);
> > +     }
>
> Nit:  Curly braces not necessary.
>
>
> > @@ -1153,6 +1154,10 @@ static void pci_remove_resource_files(struct pci_dev *pdev)
> >  {
> >       int i;
> >
> > +     if (IS_ENABLED(CONFIG_PCI_DOE)) {
> > +             pci_doe_sysfs_teardown(pdev);
> > +     }
>
> Nit:  Curly braces not necessary.
>
>
> > @@ -1230,6 +1235,12 @@ static int pci_create_resource_files(struct pci_dev *pdev)
> >       int i;
> >       int retval;
> >
> > +     if (IS_ENABLED(CONFIG_PCI_DOE)) {
> > +             retval = pci_doe_sysfs_init(pdev);
> > +             if (retval)
> > +                     return retval;
> > +     }
> > +
> >       /* Expose the PCI resources from this device as files */
> >       for (i = 0; i < PCI_STD_NUM_BARS; i++) {
>
> I think this needs to be added to pci_create_sysfs_dev_files() instead.
>
> pci_create_resource_files() only deals with creation of resource files,
> as the name implies, which is unrelated to DOE features.
>
> Worse, pci_create_resource_files() is also called from
> pci_dev_resource_resize_attr(), i.e. every time user space
> writes to the "resource##n##_resize" attributes.
>
> Similarly, the call to pci_doe_sysfs_teardown() belongs in
> pci_remove_sysfs_dev_files().
>
> Thanks,
>
> Lukas
  
Alistair Francis Nov. 3, 2023, 2:17 a.m. UTC | #6
On Fri, Oct 20, 2023 at 2:58 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> On Fri, Oct 13, 2023 at 01:41:57PM +1000, Alistair Francis wrote:
> > The PCIe 6 specification added support for the Data Object Exchange (DOE).
> > When DOE is supported the DOE Discovery Feature must be implemented per
> > PCIe r6.1 sec 6.30.1.1. The protocol allows a requester to obtain
> > information about the other DOE features supported by the device.
> > ...
>
> > +static umode_t pci_doe_sysfs_attr_is_visible(struct kobject *kobj,
> > +                                          struct attribute *a, int n)
> > +{
> > +     struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
> > +     struct pci_doe_mb *doe_mb;
> > +     unsigned long index, j;
> > +     void *entry;
> > +
> > +     xa_for_each(&pdev->doe_mbs, index, doe_mb) {
> > +             xa_for_each(&doe_mb->feats, j, entry)
> > +                     return a->mode;
> > +     }
> > +
> > +     return 0;
>
> The nested loops that don't test anything look a little weird and
> maybe I'm missing something, but this looks like it returns a->mode if
> any mailbox with a feature exists, and 0 otherwise.
>
> Is that the same as this:
>
>   if (pdev->doe_mbs)
>     return a->mode;
>
>   return 0;
>
> since it sounds like a mailbox must support at least one feature?

I don't think this is the exact same.

pdev->doe_mbs exist (created by xa_init()) even if there are no
features supported.

I do think it's important we make sure DOE features exist before we
show the property.

>
> > +}
> > +
> > +static struct attribute *pci_dev_doe_feature_attrs[] = {
> > +     NULL,
> > +};
> > +
> > +const struct attribute_group pci_dev_doe_feature_group = {
> > +     .name   = "doe_features",
> > +     .attrs  = pci_dev_doe_feature_attrs,
> > +     .is_visible = pci_doe_sysfs_attr_is_visible,
> > +};
> > +
> > +static ssize_t pci_doe_sysfs_feature_show(struct device *dev,
> > +                                       struct device_attribute *attr,
> > +                                       char *buf)
> > +{
> > +     return sysfs_emit(buf, "%s\n", attr->attr.name);
> > +}
> > +
> > +static void pci_doe_sysfs_feature_remove(struct pci_dev *pdev,
> > +                                      struct pci_doe_mb *doe_mb)
> > +{
> > +     struct device *dev = &pdev->dev;
> > +     struct device_attribute *attrs = doe_mb->sysfs_attrs;
> > +     unsigned long i;
> > +     void *entry;
> > +
> > +     if (!attrs)
> > +             return;
> > +
> > +     doe_mb->sysfs_attrs = NULL;
> > +     xa_for_each(&doe_mb->feats, i, entry) {
> > +             if (attrs[i].show)
> > +                     sysfs_remove_file_from_group(&dev->kobj, &attrs[i].attr,
> > +                                                  pci_dev_doe_feature_group.name);
> > +             kfree(attrs[i].attr.name);
> > +     }
> > +     kfree(attrs);
> > +}
> > +
> > +static int pci_doe_sysfs_feature_populate(struct pci_dev *pdev,
> > +                                       struct pci_doe_mb *doe_mb)
> > +{
> > +     struct device *dev = &pdev->dev;
> > +     struct device_attribute *attrs;
> > +     unsigned long num_features = 0;
> > +     unsigned long vid, type;
> > +     unsigned long i;
> > +     void *entry;
> > +     int ret;
> > +
> > +     xa_for_each(&doe_mb->feats, i, entry)
> > +             num_features++;
> > +
> > +     attrs = kcalloc(num_features, sizeof(*attrs), GFP_KERNEL);
> > +     if (!attrs)
> > +             return -ENOMEM;
> > +
> > +     doe_mb->sysfs_attrs = attrs;
> > +     xa_for_each(&doe_mb->feats, i, entry) {
> > +             sysfs_attr_init(&attrs[i].attr);
> > +             vid = xa_to_value(entry) >> 8;
> > +             type = xa_to_value(entry) & 0xFF;
> > +             attrs[i].attr.name = kasprintf(GFP_KERNEL,
> > +                                            "0x%04lX:%02lX", vid, type);
>
> What's the rationale for using "0x" on the vendor ID but not on the
> type?  "0x1234:10" hints that the "10" might be decimal since it lacks
> "0x".
>
> Suggest lower-case "%04lx:%02lx" either way.

Fixed!

>
> FWIW, there's no "0x" prefix on the hex vendor IDs in "lspci -n"
> output and dmesg messages like this:
>
>   pci 0000:01:00.0: [10de:13b6] type 00
>
> > +             if (!attrs[i].attr.name) {
> > +                     ret = -ENOMEM;
> > +                     goto fail;
> > +             }
> > +
> > +             attrs[i].attr.mode = 0444;
> > +             attrs[i].show = pci_doe_sysfs_feature_show;
> > +
> > +             ret = sysfs_add_file_to_group(&dev->kobj, &attrs[i].attr,
> > +                                           pci_dev_doe_feature_group.name);
> > +             if (ret) {
> > +                     attrs[i].show = NULL;
> > +                     goto fail;
> > +             }
> > +     }
> > +
> > +     return 0;
> > +
> > +fail:
> > +     pci_doe_sysfs_feature_remove(pdev, doe_mb);
> > +     return ret;
>
> Not sure we should treat this failure this seriously.  Looks like it
> will prevent creation of the BAR resource files, and possibly even
> abort pci_sysfs_init() early.  I think the pci_dev will still be
> usable (lacking DOE sysfs) even if this fails.

I can change the call in pci_create_resource_files() to not return?

>
> > +}
> > +
> > +void pci_doe_sysfs_teardown(struct pci_dev *pdev)
> > +{
> > +     struct pci_doe_mb *doe_mb;
> > +     unsigned long index;
> > +
> > +     xa_for_each(&pdev->doe_mbs, index, doe_mb) {
> > +             pci_doe_sysfs_feature_remove(pdev, doe_mb);
> > +     }
> > +}
> > +
> > +int pci_doe_sysfs_init(struct pci_dev *pdev)
> > +{
> > +     struct pci_doe_mb *doe_mb;
> > +     unsigned long index;
> > +     int ret;
> > +
> > +     xa_for_each(&pdev->doe_mbs, index, doe_mb) {
> > +             ret = pci_doe_sysfs_feature_populate(pdev, doe_mb);
> > +             if (ret)
> > +                     return ret;
> > +     }
>
> I agree with Lukas that pci_create_resource_files() is not the right
> place to call this.
>
> I try hard to avoid calling *anything* from the
> pci_create_sysfs_dev_files() path because it has the nasty
> "sysfs_initialized" check and the associated pci_sysfs_init()
> initcall.
>
> It's so much cleaner when we can set up static attributes that are
> automatically added in the device_add() path.  I don't know whether
> that's possible.  I see lots of discussion with Greg KH that might be
> related, but I'm not sure.

I don't think it's possible, at least not that I or anyone else has
been able to figure out yet.

>
> I do know that we enumerate the mailboxes and features during
> pci_init_capabilities(), which happens before device_add(), so the
> information about which attributes should be present is at least
> *available* early enough:
>
>   pci_host_probe
>     pci_scan_root_bus_bridge
>       ...
>         pci_scan_single_device
>           pci_device_add
>             pci_init_capabilities
>               pci_doe_init
>                 while (pci_find_next_ext_capability(PCI_EXT_CAP_ID_DOE))
>                   pci_doe_create_mb
>                     pci_doe_cache_features
>                       pci_doe_discovery
>                       xa_insert(&doe_mb->feats)   <--
>             device_add
>               device_add_attrs
>                 device_add_groups
>     pci_bus_add_devices
>       pci_bus_add_device
>         pci_create_sysfs_dev_files
>           ...
>             pci_doe_sysfs_init                    <--
>               xa_for_each(&pdev->doe_mbs)
>                 pci_doe_sysfs_feature_populate
>                   xa_for_each(&doe_mb->feats)
>                     sysfs_add_file_to_group(pci_dev_doe_feature_group.name)
>
> Is it feasible to build an attribute group in pci_doe_init() and add
> it to dev->groups so device_add() will automatically add them?

That doesn't work as the sysfs_add_file_to_group() function will seg
fault when trying to find the parent as I don't think it exists yet.

[    0.767581] BUG: kernel NULL pointer dereference, address: 0000000000000008
[    0.767835] #PF: supervisor read access in kernel mode
[    0.767835] #PF: error_code(0x0000) - not-present page
[    0.767835] PGD 0 P4D 0
[    0.767835] Oops: 0000 [#1] PREEMPT SMP NOPTI
[    0.767835] CPU: 0 PID: 1 Comm: swapper/0 Not tainted
6.6.0-10270-g5dda351a02c8-dirty #10
[    0.767835] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009),
BIOS rel-1.16.2-14-g1e1da7a96300-prebuilt.qemu.org 04/01/2014
[    0.767835] RIP: 0010:kernfs_find_and_get_ns+0x10/0x70
[    0.767835] Code: 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 90 90 90
90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 41 55 49 89 d5 41 54 49 89
f4 55 53 <48> 8b 0
[    0.767835] RSP: 0018:ffff96f9c00138a8 EFLAGS: 00000246
[    0.767835] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000030
[    0.767835] RDX: 0000000000000000 RSI: ffffffffafec53d9 RDI: 0000000000000000
[    0.767835] RBP: ffff957b4180e0b8 R08: 0000000000000000 R09: 0000000000ffff10
[    0.767835] R10: 0000000000000000 R11: ffffffffaf677c80 R12: ffffffffafec53d9
[    0.767835] R13: 0000000000000000 R14: ffff957b413c1ea0 R15: ffff957b4180e000
[    0.767835] FS:  0000000000000000(0000) GS:ffff957bbdc00000(0000)
knlGS:0000000000000000
[    0.767835] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    0.767835] CR2: 0000000000000008 CR3: 000000004442e000 CR4: 00000000000006f0
[    0.767835] Call Trace:
[    0.767835]  <TASK>
[    0.767835]  ? __die+0x1e/0x60
[    0.767835]  ? page_fault_oops+0x17c/0x470
[    0.767835]  ? search_module_extables+0x14/0x50
[    0.767835]  ? exc_page_fault+0x67/0x150
[    0.767835]  ? asm_exc_page_fault+0x26/0x30
[    0.767835]  ? __pfx_pci_mmcfg_read+0x10/0x10
[    0.767835]  ? kernfs_find_and_get_ns+0x10/0x70
[    0.767835]  ? kasprintf+0x5a/0x80
[    0.767835]  sysfs_add_file_to_group+0x4c/0x110
[    0.767835]  pci_doe_sysfs_init+0x13b/0x240
[    0.767835]  pci_device_add+0x1d7/0x620
[    0.767835]  pci_scan_single_device+0xc8/0x100
[    0.767835]  pci_scan_slot+0x6f/0x1e0
[    0.767835]  pci_scan_child_bus_extend+0x30/0x210
[    0.767835]  pci_scan_bridge_extend+0x5f4/0x710
[    0.767835]  pci_scan_child_bus_extend+0xc2/0x210
[    0.767835]  acpi_pci_root_create+0x283/0x2f0
[    0.767835]  pci_acpi_scan_root+0x199/0x200
[    0.767835]  acpi_pci_root_add+0x1ba/0x370
[    0.767835]  acpi_bus_attach+0x140/0x260
[    0.767835]  ? __pfx_acpi_dev_for_one_check+0x10/0x10
[    0.767835]  device_for_each_child+0x68/0xa0
[    0.767835]  acpi_dev_for_each_child+0x37/0x60
[    0.767835]  ? __pfx_acpi_bus_attach+0x10/0x10
[    0.767835]  acpi_bus_attach+0x21e/0x260
[    0.767835]  ? __pfx_acpi_dev_for_one_check+0x10/0x10
[    0.767835]  device_for_each_child+0x68/0xa0
[    0.767835]  acpi_dev_for_each_child+0x37/0x60
[    0.767835]  ? __pfx_acpi_bus_attach+0x10/0x10
[    0.767835]  acpi_bus_attach+0x21e/0x260
[    0.767835]  acpi_bus_scan+0x6b/0x1e0
[    0.767835]  acpi_scan_init+0xdc/0x290
[    0.767835]  acpi_init+0x22b/0x500
[    0.767835]  ? __pfx_acpi_init+0x10/0x10
[    0.767835]  do_one_initcall+0x56/0x220
[    0.767835]  kernel_init_freeable+0x19e/0x2d0
[    0.767835]  ? __pfx_kernel_init+0x10/0x10
[    0.767835]  kernel_init+0x15/0x1b0
[    0.767835]  ret_from_fork+0x2f/0x50
[    0.767835]  ? __pfx_kernel_init+0x10/0x10
[    0.767835]  ret_from_fork_asm+0x1b/0x30

I can move this to pci_create_sysfs_dev_files() instead if that's at
least better?

>
> It looks like __power_supply_register() does something like that:
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/power/supply/power_supply_core.c?id=v6.5#n1375
>
> > --- a/include/linux/pci-doe.h
> > +++ b/include/linux/pci-doe.h
> > @@ -22,4 +22,6 @@ int pci_doe(struct pci_doe_mb *doe_mb, u16 vendor, u8 type,
> >           const void *request, size_t request_sz,
> >           void *response, size_t response_sz);
> >
> > +int pci_doe_sysfs_init(struct pci_dev *pci_dev);
> > +void pci_doe_sysfs_teardown(struct pci_dev *pdev);
>
> These declarations look like they should be in drivers/pci/pci.h as
> well.  I don't think anything outside the PCI core should need these.

I will move these.

Alistair

>
> Bjorn
  

Patch

diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index ecf47559f495..66d9d66e3584 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -500,3 +500,27 @@  Description:
 		console drivers from the device.  Raw users of pci-sysfs
 		resourceN attributes must be terminated prior to resizing.
 		Success of the resizing operation is not guaranteed.
+
+What:		/sys/bus/pci/devices/.../doe_features
+Date:		October 2023
+Contact:	Linux PCI developers <linux-pci@vger.kernel.org>
+Description:
+		This directory contains a list of the supported
+		Data Object Exchange (DOE) features. The feature values are in
+		the file name. The contents of each file are the same as the
+		name.
+
+		The value comes from the device and specifies the vendor and
+		data object type supported. The lower (RHS of the colon) is
+		the data object type in hex. The upper (LHS of the colon)
+		is the vendor ID.
+
+		As all DOE devices must support the DOE discovery protocol, if
+		DOE is supported you will at least see this file, with this
+		contents
+
+		# cat doe_features/0x0001:00
+		0x0001:00
+
+		If the device supports other protocols you will see other files
+		as well.
diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c
index db2a86edf2e1..1ffc5441e6d1 100644
--- a/drivers/pci/doe.c
+++ b/drivers/pci/doe.c
@@ -47,6 +47,7 @@ 
  * @wq: Wait queue for work item
  * @work_queue: Queue of pci_doe_work items
  * @flags: Bit array of PCI_DOE_FLAG_* flags
+ * @sysfs_attrs: Array of sysfs device attributes
  */
 struct pci_doe_mb {
 	struct pci_dev *pdev;
@@ -56,6 +57,10 @@  struct pci_doe_mb {
 	wait_queue_head_t wq;
 	struct workqueue_struct *work_queue;
 	unsigned long flags;
+
+#ifdef CONFIG_SYSFS
+	struct device_attribute *sysfs_attrs;
+#endif
 };
 
 struct pci_doe_feature {
@@ -92,6 +97,135 @@  struct pci_doe_task {
 	struct pci_doe_mb *doe_mb;
 };
 
+#ifdef CONFIG_SYSFS
+static umode_t pci_doe_sysfs_attr_is_visible(struct kobject *kobj,
+					     struct attribute *a, int n)
+{
+	struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
+	struct pci_doe_mb *doe_mb;
+	unsigned long index, j;
+	void *entry;
+
+	xa_for_each(&pdev->doe_mbs, index, doe_mb) {
+		xa_for_each(&doe_mb->feats, j, entry)
+			return a->mode;
+	}
+
+	return 0;
+}
+
+static struct attribute *pci_dev_doe_feature_attrs[] = {
+	NULL,
+};
+
+const struct attribute_group pci_dev_doe_feature_group = {
+	.name	= "doe_features",
+	.attrs	= pci_dev_doe_feature_attrs,
+	.is_visible = pci_doe_sysfs_attr_is_visible,
+};
+
+static ssize_t pci_doe_sysfs_feature_show(struct device *dev,
+					  struct device_attribute *attr,
+					  char *buf)
+{
+	return sysfs_emit(buf, "%s\n", attr->attr.name);
+}
+
+static void pci_doe_sysfs_feature_remove(struct pci_dev *pdev,
+					 struct pci_doe_mb *doe_mb)
+{
+	struct device *dev = &pdev->dev;
+	struct device_attribute *attrs = doe_mb->sysfs_attrs;
+	unsigned long i;
+	void *entry;
+
+	if (!attrs)
+		return;
+
+	doe_mb->sysfs_attrs = NULL;
+	xa_for_each(&doe_mb->feats, i, entry) {
+		if (attrs[i].show)
+			sysfs_remove_file_from_group(&dev->kobj, &attrs[i].attr,
+						     pci_dev_doe_feature_group.name);
+		kfree(attrs[i].attr.name);
+	}
+	kfree(attrs);
+}
+
+static int pci_doe_sysfs_feature_populate(struct pci_dev *pdev,
+					  struct pci_doe_mb *doe_mb)
+{
+	struct device *dev = &pdev->dev;
+	struct device_attribute *attrs;
+	unsigned long num_features = 0;
+	unsigned long vid, type;
+	unsigned long i;
+	void *entry;
+	int ret;
+
+	xa_for_each(&doe_mb->feats, i, entry)
+		num_features++;
+
+	attrs = kcalloc(num_features, sizeof(*attrs), GFP_KERNEL);
+	if (!attrs)
+		return -ENOMEM;
+
+	doe_mb->sysfs_attrs = attrs;
+	xa_for_each(&doe_mb->feats, i, entry) {
+		sysfs_attr_init(&attrs[i].attr);
+		vid = xa_to_value(entry) >> 8;
+		type = xa_to_value(entry) & 0xFF;
+		attrs[i].attr.name = kasprintf(GFP_KERNEL,
+					       "0x%04lX:%02lX", vid, type);
+		if (!attrs[i].attr.name) {
+			ret = -ENOMEM;
+			goto fail;
+		}
+
+		attrs[i].attr.mode = 0444;
+		attrs[i].show = pci_doe_sysfs_feature_show;
+
+		ret = sysfs_add_file_to_group(&dev->kobj, &attrs[i].attr,
+					      pci_dev_doe_feature_group.name);
+		if (ret) {
+			attrs[i].show = NULL;
+			goto fail;
+		}
+	}
+
+	return 0;
+
+fail:
+	pci_doe_sysfs_feature_remove(pdev, doe_mb);
+	return ret;
+}
+
+void pci_doe_sysfs_teardown(struct pci_dev *pdev)
+{
+	struct pci_doe_mb *doe_mb;
+	unsigned long index;
+
+	xa_for_each(&pdev->doe_mbs, index, doe_mb) {
+		pci_doe_sysfs_feature_remove(pdev, doe_mb);
+	}
+}
+
+int pci_doe_sysfs_init(struct pci_dev *pdev)
+{
+	struct pci_doe_mb *doe_mb;
+	unsigned long index;
+	int ret;
+
+	xa_for_each(&pdev->doe_mbs, index, doe_mb) {
+		ret = pci_doe_sysfs_feature_populate(pdev, doe_mb);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+#endif
+
 static int pci_doe_wait(struct pci_doe_mb *doe_mb, unsigned long timeout)
 {
 	if (wait_event_timeout(doe_mb->wq,
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index d9eede2dbc0e..64bf765df5e2 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -16,6 +16,7 @@ 
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/pci.h>
+#include <linux/pci-doe.h>
 #include <linux/stat.h>
 #include <linux/export.h>
 #include <linux/topology.h>
@@ -1153,6 +1154,10 @@  static void pci_remove_resource_files(struct pci_dev *pdev)
 {
 	int i;
 
+	if (IS_ENABLED(CONFIG_PCI_DOE)) {
+		pci_doe_sysfs_teardown(pdev);
+	}
+
 	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
 		struct bin_attribute *res_attr;
 
@@ -1230,6 +1235,12 @@  static int pci_create_resource_files(struct pci_dev *pdev)
 	int i;
 	int retval;
 
+	if (IS_ENABLED(CONFIG_PCI_DOE)) {
+		retval = pci_doe_sysfs_init(pdev);
+		if (retval)
+			return retval;
+	}
+
 	/* Expose the PCI resources from this device as files */
 	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
 
@@ -1655,6 +1666,9 @@  static const struct attribute_group *pci_dev_attr_groups[] = {
 #endif
 #ifdef CONFIG_PCIEASPM
 	&aspm_ctrl_attr_group,
+#endif
+#ifdef CONFIG_PCI_DOE
+	&pci_dev_doe_feature_group,
 #endif
 	NULL,
 };
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 39a8932dc340..83065e07c491 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -186,6 +186,7 @@  extern const struct attribute_group *pci_dev_groups[];
 extern const struct attribute_group *pcibus_groups[];
 extern const struct device_type pci_dev_type;
 extern const struct attribute_group *pci_bus_groups[];
+extern const struct attribute_group pci_dev_doe_feature_group;
 
 extern unsigned long pci_hotplug_io_size;
 extern unsigned long pci_hotplug_mmio_size;
diff --git a/include/linux/pci-doe.h b/include/linux/pci-doe.h
index 1f14aed4354b..c5529ae604ff 100644
--- a/include/linux/pci-doe.h
+++ b/include/linux/pci-doe.h
@@ -22,4 +22,6 @@  int pci_doe(struct pci_doe_mb *doe_mb, u16 vendor, u8 type,
 	    const void *request, size_t request_sz,
 	    void *response, size_t response_sz);
 
+int pci_doe_sysfs_init(struct pci_dev *pci_dev);
+void pci_doe_sysfs_teardown(struct pci_dev *pdev);
 #endif