From 95d69308ce00f5d78fd3908529cb750720b42d9f Mon Sep 17 00:00:00 2001 From: Yury Murashka Date: Wed, 27 May 2026 19:45:15 +0000 Subject: [PATCH 1/2] Add noaer_recovery pci kernel boot option AER error recovery is part of the AER error handling subsystem in the Linux kernel. AER is enabled by default in the SONiC Linux kernel. The default Linux behavior is incompatible with Arista chassis hardware architecture. Enabling AER recovery on large modular systems with a complex PCIe tree could cause unexpected behavior and side effects. It would be nice to have an option to disable AER recovery on some chassis. Add pci=noaer_recovery kernel boot option to disable AER error recovery when an uncorrectable error is reported. Signed-off-by: Yury Murashka --- ...iver-arista-pci-aer-disable-recovery.patch | 104 ++++++++++++++++++ patches-sonic/series | 1 + 2 files changed, 105 insertions(+) create mode 100644 patches-sonic/driver-arista-pci-aer-disable-recovery.patch diff --git a/patches-sonic/driver-arista-pci-aer-disable-recovery.patch b/patches-sonic/driver-arista-pci-aer-disable-recovery.patch new file mode 100644 index 000000000..b477b0f7c --- /dev/null +++ b/patches-sonic/driver-arista-pci-aer-disable-recovery.patch @@ -0,0 +1,104 @@ +From: yurypm +Date: Mon, 25 May 2026 13:45:51 +0000 +Subject: Add noaer_recovery pci kernel boot option + +AER error recovery is part of the AER error handling subsystem in +the Linux kernel. AER is enabled by default in the SONiC Linux +kernel. The default Linux behavior is incompatible with Arista +chassis hardware architecture. Enabling AER recovery on large +modular systems with a complex PCIe tree could cause unexpected +behavior and side effects. It would be nice to have an option to +disable AER recovery on some chassis. + +Add pci=noaer_recovery kernel boot option to disable AER error +recovery when an uncorrectable error is reported. + +Signed-off-by: Yury Murashka +--- + Documentation/admin-guide/kernel-parameters.txt | 4 ++++ + drivers/pci/pci.c | 2 ++ + drivers/pci/pci.h | 2 ++ + drivers/pci/pcie/err.c | 15 +++++++++++++++ + 4 files changed, 23 insertions(+) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index f402bba..2bbd7ab 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4483,6 +4483,10 @@ + noaer [PCIE] If the PCIEAER kernel config parameter is + enabled, this kernel boot option can be used to + disable the use of PCIE advanced error reporting. ++ noaer_recovery [PCIE] If the PCIEAER kernel config parameter is ++ enabled, this kernel boot option can be used to ++ disable AER error recovery when an uncorrectable ++ error is reported. + nodomains [PCI] Disable support for multiple PCI + root domains (aka PCI segments, in ACPI-speak). + nommconf [X86] Disable use of MMCONFIG for PCI +diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c +index 51a09e4..77d0786 100644 +--- a/drivers/pci/pci.c ++++ b/drivers/pci/pci.c +@@ -6896,6 +6896,8 @@ static int __init pci_setup(char *str) + pcie_ats_disabled = true; + } else if (!strcmp(str, "noaer")) { + pci_no_aer(); ++ } else if (!strcmp(str, "noaer_recovery")) { ++ pci_no_aer_recovery(); + } else if (!strcmp(str, "earlydump")) { + pci_early_dump = true; + } else if (!strncmp(str, "realloc=", 8)) { +diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h +index 65df6d2..551b6e8 100644 +--- a/drivers/pci/pci.h ++++ b/drivers/pci/pci.h +@@ -826,6 +826,7 @@ static inline void of_pci_remove_node(struct pci_dev *pdev) { } + + #ifdef CONFIG_PCIEAER + void pci_no_aer(void); ++void pci_no_aer_recovery(void); + void pci_aer_init(struct pci_dev *dev); + void pci_aer_exit(struct pci_dev *dev); + extern const struct attribute_group aer_stats_attr_group; +@@ -836,6 +837,7 @@ void pci_save_aer_state(struct pci_dev *dev); + void pci_restore_aer_state(struct pci_dev *dev); + #else + static inline void pci_no_aer(void) { } ++static inline void pci_no_aer_recovery(void) { } + static inline void pci_aer_init(struct pci_dev *d) { } + static inline void pci_aer_exit(struct pci_dev *d) { } + static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { } +diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c +index 3109077..bb5ec0c 100644 +--- a/drivers/pci/pcie/err.c ++++ b/drivers/pci/pcie/err.c +@@ -21,6 +21,13 @@ + #include "portdrv.h" + #include "../pci.h" + ++static int pcie_aer_recovery_disable = 0; ++ ++void pci_no_aer_recovery(void) ++{ ++ pcie_aer_recovery_disable = 1; ++} ++ + static pci_ers_result_t merge_result(enum pci_ers_result orig, + enum pci_ers_result new) + { +@@ -197,6 +204,14 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, + pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER; + struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); + ++ if (pcie_aer_recovery_disable) { ++ if (host->native_aer || pcie_ports_native) { ++ pcie_clear_device_status(dev); ++ pci_aer_clear_nonfatal_status(dev); ++ } ++ return status; ++ } ++ + /* + * If the error was detected by a Root Port, Downstream Port, RCEC, + * or RCiEP, recovery runs on the device itself. For Ports, that diff --git a/patches-sonic/series b/patches-sonic/series index b84d84667..278799ccf 100644 --- a/patches-sonic/series +++ b/patches-sonic/series @@ -11,6 +11,7 @@ driver-arista-pci-reassign-pref-mem.patch driver-arista-mmcblk-not-working-on-AMD-platforms.patch driver-arista-restrict-eMMC-drive-to-50Mhz-from-userland.patch driver-arista-i2c-designware-shutdown.patch +driver-arista-pci-aer-disable-recovery.patch driver-support-sff-8436-eeprom.patch driver-support-sff-8436-eeprom-update.patch driver-sff-8436-use-nvmem-framework.patch From 282220c3a45de5ad84324e35042a35bcc791d487 Mon Sep 17 00:00:00 2001 From: Yury Murashka Date: Wed, 27 May 2026 19:45:24 +0000 Subject: [PATCH 2/2] Add nodpc pci kernel boot option PCI DPC (Downstream Port Containment) is enabled by default in the SONiC Linux kernel. DPC support can be advertised by PCIe devices, but it might not be fully supported in the firmware. The default Linux behavior is incompatible with Arista chassis hardware architecture. Enabling DPC could cause unexpected behavior and side effects. It would be nice to have an option to disable DPC on some chassis. Add pci=nodpc kernel boot option to disable PCI DPC. Signed-off-by: Yury Murashka --- .../driver-arista-pci-dpc-disable.patch | 121 ++++++++++++++++++ patches-sonic/series | 1 + 2 files changed, 122 insertions(+) create mode 100644 patches-sonic/driver-arista-pci-dpc-disable.patch diff --git a/patches-sonic/driver-arista-pci-dpc-disable.patch b/patches-sonic/driver-arista-pci-dpc-disable.patch new file mode 100644 index 000000000..26504c41a --- /dev/null +++ b/patches-sonic/driver-arista-pci-dpc-disable.patch @@ -0,0 +1,121 @@ +From: yurypm +Date: Mon, 25 May 2026 13:45:50 +0000 +Subject: Add nodpc pci kernel boot option + +PCI DPC (Downstream Port Containment) is enabled by default in the +SONiC Linux kernel. DPC support can be advertised by PCIe devices, +but it might not be fully supported in the firmware. The default Linux +behavior is incompatible with Arista chassis hardware architecture. +Enabling DPC could cause unexpected behavior and side effects. It +would be nice to have an option to disable DPC on some chassis. + +Add pci=nodpc kernel boot option to disable PCI DPC. + +Signed-off-by: Yury Murashka +--- + Documentation/admin-guide/kernel-parameters.txt | 3 +++ + drivers/pci/pci.c | 2 ++ + drivers/pci/pci.h | 2 ++ + drivers/pci/pcie/dpc.c | 16 +++++++++++++--- + 4 files changed, 20 insertions(+), 3 deletions(-) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 2bbd7ab..068891e 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4480,6 +4480,9 @@ + through ports 0xC000-0xCFFF). + See http://wiki.osdev.org/PCI for more info + on the configuration access mechanisms. ++ nodpc [PCIE] If the PCIE_DPC kernel config parameter is ++ enabled, this kernel boot option can be used to ++ disable the use of PCIE DPC. + noaer [PCIE] If the PCIEAER kernel config parameter is + enabled, this kernel boot option can be used to + disable the use of PCIE advanced error reporting. +diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c +index 77d0786..f6a4c2f 100644 +--- a/drivers/pci/pci.c ++++ b/drivers/pci/pci.c +@@ -6894,6 +6894,8 @@ static int __init pci_setup(char *str) + } else if (!strncmp(str, "noats", 5)) { + pr_info("PCIe: ATS is disabled\n"); + pcie_ats_disabled = true; ++ } else if (!strcmp(str, "nodpc")) { ++ pci_no_dpc(); + } else if (!strcmp(str, "noaer")) { + pci_no_aer(); + } else if (!strcmp(str, "noaer_recovery")) { +diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h +index 551b6e8..bece8b9 100644 +--- a/drivers/pci/pci.h ++++ b/drivers/pci/pci.h +@@ -535,6 +535,7 @@ struct rcec_ea { + #endif + + #ifdef CONFIG_PCIE_DPC ++void pci_no_dpc(void); + void pci_save_dpc_state(struct pci_dev *dev); + void pci_restore_dpc_state(struct pci_dev *dev); + void pci_dpc_init(struct pci_dev *pdev); +@@ -542,6 +543,7 @@ void dpc_process_error(struct pci_dev *pdev); + pci_ers_result_t dpc_reset_link(struct pci_dev *pdev); + bool pci_dpc_recovered(struct pci_dev *pdev); + #else ++static inline void pci_no_dpc(void) { } + static inline void pci_save_dpc_state(struct pci_dev *dev) { } + static inline void pci_restore_dpc_state(struct pci_dev *dev) { } + static inline void pci_dpc_init(struct pci_dev *pdev) { } +diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c +index cdc5431..8eb2a1f 100644 +--- a/drivers/pci/pcie/dpc.c ++++ b/drivers/pci/pcie/dpc.c +@@ -43,12 +43,19 @@ static const char * const rp_pio_error_string[] = { + "Memory Request Completion Timeout", /* Bit Position 18 */ + }; + ++static int pcie_dpc_disable = 0; ++ ++void pci_no_dpc(void) ++{ ++ pcie_dpc_disable = 1; ++} ++ + void pci_save_dpc_state(struct pci_dev *dev) + { + struct pci_cap_saved_state *save_state; + u16 *cap; + +- if (!pci_is_pcie(dev)) ++ if (pcie_dpc_disable || !pci_is_pcie(dev)) + return; + + save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_DPC); +@@ -64,7 +71,7 @@ void pci_restore_dpc_state(struct pci_dev *dev) + struct pci_cap_saved_state *save_state; + u16 *cap; + +- if (!pci_is_pcie(dev)) ++ if (pcie_dpc_disable || !pci_is_pcie(dev)) + return; + + save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_DPC); +@@ -104,7 +111,7 @@ bool pci_dpc_recovered(struct pci_dev *pdev) + { + struct pci_host_bridge *host; + +- if (!pdev->dpc_cap) ++ if (pcie_dpc_disable || !pdev->dpc_cap) + return false; + + /* +@@ -398,6 +405,9 @@ void pci_dpc_init(struct pci_dev *pdev) + { + u16 cap; + ++ if (pcie_dpc_disable) ++ return; ++ + pdev->dpc_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DPC); + if (!pdev->dpc_cap) + return; diff --git a/patches-sonic/series b/patches-sonic/series index 278799ccf..8b133188b 100644 --- a/patches-sonic/series +++ b/patches-sonic/series @@ -12,6 +12,7 @@ driver-arista-mmcblk-not-working-on-AMD-platforms.patch driver-arista-restrict-eMMC-drive-to-50Mhz-from-userland.patch driver-arista-i2c-designware-shutdown.patch driver-arista-pci-aer-disable-recovery.patch +driver-arista-pci-dpc-disable.patch driver-support-sff-8436-eeprom.patch driver-support-sff-8436-eeprom-update.patch driver-sff-8436-use-nvmem-framework.patch