On Wed, Jan 04, 2023 at 10:27:33AM +0530, Rajat Khandelwal wrote:
> Hi Bjorn,
>
> Thanks for the acknowledgement.
>
> On 1/4/2023 12:44 AM, Bjorn Helgaas wrote:
> > [+cc Paul, Sasha, Leon, Frederick]
> >
> > (Please cc folks who have commented on previous versions of your
> > patch.)
> >
> > On Tue, Jan 03, 2023 at 10:25:48PM +0530, Rajat Khandelwal wrote:
> > > There are many instances where correctable errors tend to inundate
> > > the message buffer. We observe such instances during thunderbolt PCIe
> > > tunneling.
<...>
> > > [54982.838808] igc 0000:2b:00.0: device [8086:5502] error status/mask=00001000/00002000
> > > [54982.838817] igc 0000:2b:00.0: [12] Timeout
> > Please remove the timestamps; they don't contribute to understanding
> > the problem.
>
> --> Sure.
Please don't add "-->" or any marker to replies. It breaks mail color
scheme.
>
> >
> > > This gets repeated continuously, thus inundating the buffer.
> > Did you verify that we actually clear the Correctable Error Status
> > register?
>
> --> This patch targets only rate limiting the correctable errors since they are
> non-fatal, and they kind of inundate the CPU logs, particularly during thunderbolt
> connections. It doesn't have an impact anywhere else.
> As per your suggestion in the igc patch, I found rate limiting as a doable option
> currently. Have eradicated any kind of masking the bits.
You didn't answer on the asked question. "Did you verify that we actually clear
the Correctable Error Status register?".
Thanks
@@ -684,23 +684,24 @@ static void __aer_print_error(struct pci_dev *dev,
{
const char **strings;
unsigned long status = info->status & ~info->mask;
- const char *level, *errmsg;
+ const char *errmsg;
int i;
- if (info->severity == AER_CORRECTABLE) {
+ if (info->severity == AER_CORRECTABLE)
strings = aer_correctable_error_string;
- level = KERN_WARNING;
- } else {
+ else
strings = aer_uncorrectable_error_string;
- level = KERN_ERR;
- }
for_each_set_bit(i, &status, 32) {
errmsg = strings[i];
if (!errmsg)
errmsg = "Unknown Error Bit";
- pci_printk(level, dev, " [%2d] %-22s%s\n", i, errmsg,
+ if (info->severity == AER_CORRECTABLE)
+ pci_warn_ratelimited(dev, " [%2d] %-22s%s\n", i, errmsg,
+ info->first_error == i ? " (First)" : "");
+ else
+ pci_err(dev, " [%2d] %-22s%s\n", i, errmsg,
info->first_error == i ? " (First)" : "");
}
pci_dev_aer_stats_incr(dev, info);
@@ -710,7 +711,6 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
{
int layer, agent;
int id = ((dev->bus->number << 8) | dev->devfn);
- const char *level;
if (!info->status) {
pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
@@ -721,14 +718,21 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
layer = AER_GET_LAYER_ERROR(info->severity, info->status);
agent = AER_GET_AGENT(info->severity, info->status);
- level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR;
+ if (info->severity == AER_CORRECTABLE) {
+ pci_warn_ratelimited(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
+ aer_error_severity_string[info->severity],
+ aer_error_layer[layer], aer_agent_string[agent]);
- pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
- aer_error_severity_string[info->severity],
- aer_error_layer[layer], aer_agent_string[agent]);
+ pci_warn_ratelimited(dev, " device [%04x:%04x] error status/mask=%08x/%08x\n",
+ dev->vendor, dev->device, info->status, info->mask);
+ } else {
+ pci_err(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
+ aer_error_severity_string[info->severity],
+ aer_error_layer[layer], aer_agent_string[agent]);
- pci_printk(level, dev, " device [%04x:%04x] error status/mask=%08x/%08x\n",
- dev->vendor, dev->device, info->status, info->mask);
+ pci_err(dev, " device [%04x:%04x] error status/mask=%08x/%08x\n",
+ dev->vendor, dev->device, info->status, info->mask);
+ }
__aer_print_error(dev, info);
@@ -748,11 +755,19 @@ static void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
u8 bus = info->id >> 8;
u8 devfn = info->id & 0xff;
- pci_info(dev, "%s%s error received: %04x:%02x:%02x.%d\n",
- info->multi_error_valid ? "Multiple " : "",
- aer_error_severity_string[info->severity],
- pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn),
- PCI_FUNC(devfn));
+ if (info->severity == AER_CORRECTABLE)
+ pci_info_ratelimited(dev, "%s%s error received: %04x:%02x:%02x.%d\n",
+ info->multi_error_valid ? "Multiple " : "",
+ aer_error_severity_string[info->severity],
+ pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn),
+ PCI_FUNC(devfn));
+ else
+ pci_info(dev, "%s%s error received: %04x:%02x:%02x.%d\n",
+ info->multi_error_valid ? "Multiple " : "",
+ aer_error_severity_string[info->severity],
+ pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn),
+ PCI_FUNC(devfn));
+
}
#ifdef CONFIG_ACPI_APEI_PCIEAER
@@ -2491,6 +2491,9 @@ void pci_uevent_ers(struct pci_dev *pdev, enum pci_ers_result err_type);
#define pci_info_ratelimited(pdev, fmt, arg...) \
dev_info_ratelimited(&(pdev)->dev, fmt, ##arg)
+#define pci_warn_ratelimited(pdev, fmt, arg...) \
+ dev_warn_ratelimited(&(pdev)->dev, fmt, ##arg)
+
#define pci_WARN(pdev, condition, fmt, arg...) \
WARN(condition, "%s %s: " fmt, \
dev_driver_string(&(pdev)->dev), pci_name(pdev), ##arg)