From 891ea92882bc2a18474b0d73d92db9513eff4120 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Fri, 6 Mar 2026 23:41:52 +0000 Subject: [PATCH 01/11] [chassisd] Persist DPU reboot cause on EMPTY-to-Offline transition When chassisd restarts (e.g. during config reload) while a DPU is offline, STATE_DB is flushed so the previous operational status is EMPTY. The existing code only persists reboot cause on a non-EMPTY non-Offline to Offline transition, causing the new reboot cause to be lost if it changed while chassisd was down. Add handling for the EMPTY-to-Offline case: compare the current reboot cause from the platform with the last persisted one, and record it if they differ. Skip the check on first boot (no previous-reboot-cause.json) to avoid duplicates. Fixes: sonic-net/sonic-buildimage#24275 Signed-off-by: Vasundhara Volam --- sonic-chassisd/scripts/chassisd | 46 +++++++--- sonic-chassisd/tests/test_chassisd.py | 123 ++++++++++++++++++++++---- 2 files changed, 138 insertions(+), 31 deletions(-) diff --git a/sonic-chassisd/scripts/chassisd b/sonic-chassisd/scripts/chassisd index 6db144b31..43b703be3 100755 --- a/sonic-chassisd/scripts/chassisd +++ b/sonic-chassisd/scripts/chassisd @@ -288,14 +288,14 @@ class ModuleUpdater(logger.Logger): self.chassis_state_db = daemon_base.db_connect("CHASSIS_STATE_DB") if self._is_supervisor(): - self.asic_table = swsscommon.Table(self.chassis_state_db, + self.asic_table = swsscommon.Table(self.chassis_state_db, CHASSIS_FABRIC_ASIC_INFO_TABLE) else: - self.asic_table = swsscommon.Table(self.chassis_state_db, + self.asic_table = swsscommon.Table(self.chassis_state_db, CHASSIS_ASIC_INFO_TABLE) self.hostname_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_HOSTNAME_TABLE) - self.module_reboot_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_REBOOT_INFO_TABLE) + self.module_reboot_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_REBOOT_INFO_TABLE) self.down_modules = {} self.chassis_app_db_clean_sha = None @@ -306,7 +306,7 @@ class ModuleUpdater(logger.Logger): field = line.split('=')[0].strip() if field == "linecard_reboot_timeout": self.linecard_reboot_timeout = int(line.split('=')[1].strip()) - + self.midplane_initialized = try_get(chassis.init_midplane_switch, default=False) if not self.midplane_initialized: self.log_error("Chassisd midplane intialization failed") @@ -408,7 +408,7 @@ class ModuleUpdater(logger.Logger): else: if self.phy_entity_table.get(key) is not None: self.phy_entity_table._del(key) - + # Construct key for down_modules dict. Example down_modules key format: LINE-CARD0| fvs = self.hostname_table.get(key) if isinstance(fvs, list) and fvs[0] is True: @@ -422,7 +422,7 @@ class ModuleUpdater(logger.Logger): if prev_status == ModuleBase.MODULE_STATUS_ONLINE: notOnlineModules.append(key) # Record the time when the module down was detected to track the - # module down time. Used for chassis db cleanup for all asics of the module if the module is down for a + # module down time. Used for chassis db cleanup for all asics of the module if the module is down for a # long time like 30 mins. # All down modules including supervisor are added to the down modules dictionary. This is to help # identifying module operational status change. But the clean up will not be attempted for supervisor @@ -458,12 +458,12 @@ class ModuleUpdater(logger.Logger): self.asic_table.set(asic_key, asic_fvs) # In line card push the hostname of the module and num_asics to the chassis state db. - # The hostname is used as key to access chassis app db entries + # The hostname is used as key to access chassis app db entries if not self._is_supervisor(): module_info_dict = self._get_module_info(my_index) hostname_key = "{}{}".format(ModuleBase.MODULE_TYPE_LINE, int(self.my_slot) - 1) hostname = try_get(device_info.get_hostname, default="None") - hostname_fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_INFO_SLOT_FIELD, str(self.my_slot)), + hostname_fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_INFO_SLOT_FIELD, str(self.my_slot)), (CHASSIS_MODULE_INFO_HOSTNAME_FIELD, hostname), (CHASSIS_MODULE_INFO_NUM_ASICS_FIELD, str(len(module_info_dict[CHASSIS_MODULE_INFO_ASICS])))]) self.hostname_table.set(hostname_key, hostname_fvs) @@ -521,12 +521,12 @@ class ModuleUpdater(logger.Logger): if fvs[CHASSIS_MODULE_REBOOT_REBOOT_FIELD] == "expected": return True return False - + def module_reboot_set_time(self, key): time_now = time.time() fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD, str(time_now))]) self.module_reboot_table.set(key,fvs) - + def is_module_reboot_system_up_expired(self, key): fvs = self.module_reboot_table.get(key) if isinstance(fvs, list) and fvs[0] is True: @@ -538,7 +538,7 @@ class ModuleUpdater(logger.Logger): self.module_reboot_table._del(key) return True return False - + def check_midplane_reachability(self): if not self.midplane_initialized: return @@ -585,7 +585,7 @@ class ModuleUpdater(logger.Logger): elif midplane_access is False and current_midplane_state == 'False': if self.is_module_reboot_system_up_expired(module_key): self.log_warning("Unexpected: Module {} (Slot {}) midplane connectivity is not restored in {} seconds".format(module_key, module.get_slot(), self.linecard_reboot_timeout)) - + # Update db with midplane information fvs = swsscommon.FieldValuePairs([(CHASSIS_MIDPLANE_INFO_IP_FIELD, midplane_ip), (CHASSIS_MIDPLANE_INFO_ACCESS_FIELD, str(midplane_access))]) @@ -810,6 +810,28 @@ class SmartSwitchModuleUpdater(ModuleUpdater): # publish reboot cause to db self.update_dpu_reboot_cause_to_db(key) + # DPU discovered offline after chassisd restart (e.g. config reload + # during an ongoing DPU reboot). prev_status is EMPTY because + # STATE_DB was flushed. Check whether the reboot cause changed + # since the last persisted one and, if so, record it. + elif prev_status == ModuleBase.MODULE_STATUS_EMPTY and current_status == str(ModuleBase.MODULE_STATUS_OFFLINE): + reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause) + stored_cause, _ = self.retrieve_dpu_reboot_info(key) + + if stored_cause is not None: + if isinstance(reboot_cause, (tuple, list)): + current_cause = reboot_cause[0] + else: + current_cause = reboot_cause + + if current_cause and current_cause != stored_cause: + self.log_notice( + f"{key}: Reboot cause changed while chassisd was down " + f"(stored: {stored_cause}, current: {current_cause})") + self.persist_dpu_reboot_time(key) + self.persist_dpu_reboot_cause(reboot_cause, key) + self.update_dpu_reboot_cause_to_db(key) + elif (prev_status == ModuleBase.MODULE_STATUS_EMPTY or prev_status == str(ModuleBase.MODULE_STATUS_OFFLINE)) and current_status != str(ModuleBase.MODULE_STATUS_OFFLINE): self.log_notice(f"{key} operational status transitioning to online") diff --git a/sonic-chassisd/tests/test_chassisd.py b/sonic-chassisd/tests/test_chassisd.py index d65ec6fbd..addbf4025 100644 --- a/sonic-chassisd/tests/test_chassisd.py +++ b/sonic-chassisd/tests/test_chassisd.py @@ -178,7 +178,7 @@ def test_moduleupdater_check_phyentity_entry_after_fabric_removal(): module_updater.module_db_update() fvs = module_updater.phy_entity_table.get(name) assert fvs == None - + def test_smartswitch_moduleupdater_check_valid_fields(): chassis = MockSmartSwitchChassis() index = 0 @@ -284,6 +284,91 @@ def test_online_transition_skips_reboot_update(): mock_persist.assert_not_called() mock_update.assert_not_called() +def test_empty_to_offline_persists_changed_reboot_cause(): + """ + If chassisd restarts (e.g. config reload) while a DPU is offline, + prev_status will be EMPTY (STATE_DB flushed) and current status OFFLINE. + If the reboot cause changed since the last persisted one, the new cause + must be recorded. Regression test for issue #24275. + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") + # DPU is currently offline + module.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + # Ensure module_table is empty so prev_status == EMPTY (simulates STATE_DB flush) + + with patch.object(module, 'get_reboot_cause', return_value=("Power Loss", "power auxiliary outage or reload")), \ + patch.object(updater, 'retrieve_dpu_reboot_info', + return_value=("Reboot", "2025_10_10_07_36_12")), \ + patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \ + patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ + patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: + + updater.module_db_update() + + # Reboot cause changed ("Reboot" → "Power Loss") so it must be persisted + mock_persist_time.assert_called_once_with(name) + mock_persist_cause.assert_called_once() + mock_update_db.assert_called_once_with(name) + +def test_empty_to_offline_skips_same_reboot_cause(): + """ + After chassisd restart, if the DPU is offline but the reboot cause + has NOT changed, we must NOT create a duplicate entry. + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") + module.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + + with patch.object(module, 'get_reboot_cause', return_value=("Reboot", "Reset from Main board")), \ + patch.object(updater, 'retrieve_dpu_reboot_info', + return_value=("Reboot", "2025_10_10_07_36_12")), \ + patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \ + patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ + patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: + + updater.module_db_update() + + # Same reboot cause — no persistence expected + mock_persist_time.assert_not_called() + mock_persist_cause.assert_not_called() + mock_update_db.assert_not_called() + +def test_empty_to_offline_skips_first_boot(): + """ + On first boot there is no previous-reboot-cause.json so stored_cause + is None. The EMPTY→OFFLINE path must NOT persist in that case. + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") + module.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + + with patch.object(module, 'get_reboot_cause', return_value=("Power Loss", "N/A")), \ + patch.object(updater, 'retrieve_dpu_reboot_info', + return_value=(None, None)), \ + patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \ + patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ + patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: + + updater.module_db_update() + + # No stored cause (first boot) — should NOT persist + mock_persist_time.assert_not_called() + mock_persist_cause.assert_not_called() + mock_update_db.assert_not_called() + def test_retrieve_dpu_reboot_info_success(): class DummyChassis: def get_num_modules(self): return 0 @@ -833,7 +918,7 @@ def test_moduleupdater_check_string_slot(): midplane_table = module_updater.midplane_table #Check only one entry in database assert 1 == midplane_table.size() - + def test_midplane_presence_modules(): chassis = MockChassis() @@ -1039,7 +1124,7 @@ def lc_mock_open(*args, **kwargs): @patch('os.path.isfile', MagicMock(return_value=True)) def test_midplane_presence_modules_linecard_reboot(): chassis = MockChassis() - + #Supervisor index = 0 name = "SUPERVISOR0" @@ -1104,7 +1189,7 @@ def test_midplane_presence_modules_linecard_reboot(): assert module.get_midplane_ip() == fvs[CHASSIS_MIDPLANE_INFO_IP_FIELD] assert str(module.is_midplane_reachable()) == fvs[CHASSIS_MIDPLANE_INFO_ACCESS_FIELD] - + #Set access of line-card to Down (to mock midplane connectivity state change) module.set_midplane_reachable(False) # set expected reboot of linecard @@ -1144,9 +1229,9 @@ def test_midplane_presence_modules_linecard_reboot(): if isinstance(fvs, list): fvs = dict(fvs[-1]) assert module.get_midplane_ip() == fvs[CHASSIS_MIDPLANE_INFO_IP_FIELD] - assert str(module.is_midplane_reachable()) == fvs[CHASSIS_MIDPLANE_INFO_ACCESS_FIELD] - assert module_updater.linecard_reboot_timeout == 240 - + assert str(module.is_midplane_reachable()) == fvs[CHASSIS_MIDPLANE_INFO_ACCESS_FIELD] + assert module_updater.linecard_reboot_timeout == 240 + def test_midplane_presence_supervisor(): chassis = MockChassis() @@ -1454,7 +1539,7 @@ def test_daemon_run_smartswitch(): def test_set_initial_dpu_admin_state_up(): """Test set_initial_dpu_admin_state when admin state is up""" chassis = MockSmartSwitchChassis() - + # DPU0 details index = 0 name = "DPU0" @@ -1469,12 +1554,12 @@ def test_set_initial_dpu_admin_state_up(): status = ModuleBase.MODULE_STATUS_ONLINE module.set_oper_status(status) chassis.module_list.append(module) - + # Supervisor ModuleUpdater module_updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) module_updater.module_db_update() module_updater.modules_num_update() - + # ChassisdDaemon setup daemon_chassisd = ChassisdDaemon(SYSLOG_IDENTIFIER, chassis) daemon_chassisd.module_updater = module_updater @@ -1505,7 +1590,7 @@ def test_set_initial_dpu_admin_state_up(): def test_set_initial_dpu_admin_state_empty_offline(): """Test set_initial_dpu_admin_state when admin state is empty and operational state is offline""" chassis = MockSmartSwitchChassis() - + # DPU0 details index = 0 name = "DPU0" @@ -1520,12 +1605,12 @@ def test_set_initial_dpu_admin_state_empty_offline(): status = ModuleBase.MODULE_STATUS_OFFLINE module.set_oper_status(status) chassis.module_list.append(module) - + # Supervisor ModuleUpdater module_updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) module_updater.module_db_update() module_updater.modules_num_update() - + # ChassisdDaemon setup daemon_chassisd = ChassisdDaemon(SYSLOG_IDENTIFIER, chassis) daemon_chassisd.module_updater = module_updater @@ -1607,7 +1692,7 @@ def test_set_initial_dpu_admin_state_empty_not_offline(): def test_set_initial_dpu_admin_state_exception(): """Test set_initial_dpu_admin_state handles exceptions gracefully""" chassis = MockSmartSwitchChassis() - + # DPU0 details index = 0 name = "DPU0" @@ -1816,7 +1901,7 @@ def test_chassis_db_cleanup(): # Mock hostname table update for the line card LINE-CARD0 hostname = "lc1-host-00" num_asics = 1 - hostname_fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_INFO_SLOT_FIELD, str(lc_slot)), + hostname_fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_INFO_SLOT_FIELD, str(lc_slot)), (CHASSIS_MODULE_INFO_HOSTNAME_FIELD, hostname), (CHASSIS_MODULE_INFO_NUM_ASICS_FIELD, str(num_asics))]) sup_module_updater.hostname_table.set(lc_name, hostname_fvs) @@ -1849,7 +1934,7 @@ def test_chassis_db_cleanup(): # Mock >= CHASSIS_DB_CLEANUP_MODULE_DOWN_PERIOD module down period for LINE-CARD1 down_module_key = lc2_name+"|" assert down_module_key not in sup_module_updater.down_modules.keys() - + sup_module_updater.module_down_chassis_db_cleanup() def test_chassis_db_bootup_with_empty_slot(): @@ -1895,7 +1980,7 @@ def test_chassis_db_bootup_with_empty_slot(): # Supervisor ModuleUpdater sup_module_updater = ModuleUpdater(SYSLOG_IDENTIFIER, chassis, sup_slot, sup_slot) sup_module_updater.modules_num_update() - + sup_module_updater.module_db_update() # check LC1 STATUS ONLINE in module table @@ -1904,14 +1989,14 @@ def test_chassis_db_bootup_with_empty_slot(): fvs = dict(fvs[-1]) assert ModuleBase.MODULE_STATUS_ONLINE == fvs[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD] - # check LC2 STATUS EMPTY in module table + # check LC2 STATUS EMPTY in module table fvs = sup_module_updater.module_table.get(lc2_name) if isinstance(fvs, list): fvs = dict(fvs[-1]) assert ModuleBase.MODULE_STATUS_EMPTY == fvs[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD] # Both should no tbe in down_module keys. - + down_module_lc1_key = lc_name+"|" assert down_module_lc1_key not in sup_module_updater.down_modules.keys() down_module_lc2_key = lc_name+"|" From e9d2092d823429b6e8e046ec09971508185249f5 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Wed, 25 Mar 2026 18:59:35 +0000 Subject: [PATCH 02/11] [chassisd] Defer DPU reboot-cause query to online transition Do not call get_reboot_cause while a DPU is powered off / offline. Instead, persist only the reboot time on the Known->Offline transition and defer the reboot-cause query until the DPU comes back online. For EMPTY->Offline (chassisd restart while DPU is down), defer entirely and only persist if the cause actually changed once the DPU is online. Remove the now-unused _record_reboot helper and its tests. Fixes: https://github.com/sonic-net/sonic-buildimage/issues/26254 Signed-off-by: Vasundhara Volam --- sonic-chassisd/scripts/chassisd | 168 ++++++++++++++------------ sonic-chassisd/tests/test_chassisd.py | 101 ++++++++++++++-- 2 files changed, 179 insertions(+), 90 deletions(-) diff --git a/sonic-chassisd/scripts/chassisd b/sonic-chassisd/scripts/chassisd index 43b703be3..23b7ca749 100755 --- a/sonic-chassisd/scripts/chassisd +++ b/sonic-chassisd/scripts/chassisd @@ -719,6 +719,7 @@ class SmartSwitchModuleUpdater(ModuleUpdater): if not self.midplane_initialized: self.log_error("Chassisd midplane intialization failed") + self._pending_reboot_check = {} self.dpu_reboot_timeout = DEFAULT_DPU_REBOOT_TIMEOUT if os.path.isfile(PLATFORM_JSON_FILE): try: @@ -775,91 +776,102 @@ class SmartSwitchModuleUpdater(ModuleUpdater): self.log_error(f"{module}: Failed to read previous-reboot-cause.json: {e}") return None, None - def module_db_update(self): - for module_index in range(0, self.num_modules): - module_info_dict = self._get_module_info(module_index) - if module_info_dict is not None: - key = module_info_dict[CHASSIS_MODULE_INFO_NAME_FIELD] + def _extract_cause(self, reboot_cause): + """Extract the scalar cause string from a reboot_cause that may be a tuple, list, or string.""" + if isinstance(reboot_cause, (tuple, list)): + return reboot_cause[0] + return reboot_cause + + def _handle_module_status_change(self, key, module_index, prev_status, current_status): + """Handle DPU operational-status transitions and reboot-cause tracking.""" + was_known = prev_status != ModuleBase.MODULE_STATUS_EMPTY + was_offline = prev_status == str(ModuleBase.MODULE_STATUS_OFFLINE) + now_offline = current_status == str(ModuleBase.MODULE_STATUS_OFFLINE) + + # Transition to offline from a known non-offline state. + # Persist the down-time but defer get_reboot_cause until the DPU + # is back online — querying a powered-off DPU is unreliable. + if was_known and not was_offline and now_offline: + self.log_notice(f"{key} operational status transitioning to offline") + self.persist_dpu_reboot_time(key) + self._pending_reboot_check[key] = 'reboot' + return - if not key.startswith(ModuleBase.MODULE_TYPE_DPU): - self.log_error("Incorrect module-name {}. Should start with {} ".format(key, - ModuleBase.MODULE_TYPE_DPU)) - continue + # DPU discovered offline after chassisd restart (e.g. config reload + # during an ongoing DPU reboot). prev_status is EMPTY because + # STATE_DB was flushed. Defer reboot-cause check until DPU is online. + if not was_known and now_offline: + self._pending_reboot_check[key] = 'restart' + return - fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_INFO_DESC_FIELD, module_info_dict[CHASSIS_MODULE_INFO_DESC_FIELD]), - (CHASSIS_MODULE_INFO_SLOT_FIELD, module_info_dict[CHASSIS_MODULE_INFO_SLOT_FIELD]), - (CHASSIS_MODULE_INFO_OPERSTATUS_FIELD, module_info_dict[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD]), - (CHASSIS_MODULE_INFO_SERIAL_FIELD, module_info_dict[CHASSIS_MODULE_INFO_SERIAL_FIELD])]) + # Transition to online from EMPTY or OFFLINE + if (not was_known or was_offline) and not now_offline: + self.log_notice(f"{key} operational status transitioning to online") + + deferred = self._pending_reboot_check.pop(key, None) + + reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause) + current_cause = self._extract_cause(reboot_cause) + stored_cause, stored_time_str = self.retrieve_dpu_reboot_info(key) + + # Deferred from Online→Offline: the DPU went down and is now back. + # Reboot time was already persisted; unconditionally record the cause. + if deferred == 'reboot': + self.persist_dpu_reboot_cause(reboot_cause, key) + self.update_dpu_reboot_cause_to_db(key) + return + + # Deferred from EMPTY→Offline: chassisd restarted while DPU was + # offline, now it is online so we can read the real reboot cause. + # Only persist if the cause actually changed. + if deferred == 'restart': + if stored_cause is not None and current_cause and current_cause != stored_cause: + self.log_notice( + f"{key}: Reboot cause changed while chassisd was down " + f"(stored: {stored_cause}, current: {current_cause})") + self.persist_dpu_reboot_time(key) + self.persist_dpu_reboot_cause(reboot_cause, key) + self.update_dpu_reboot_cause_to_db(key) + return - # Get a copy of the previous operational status of the module - prev_status = self.get_module_current_status(key) - self.module_table.set(key, fvs) + is_reboot = False + if current_cause and stored_cause and stored_time_str: + try: + stored_dt = datetime.strptime(stored_time_str, "%Y_%m_%d_%H_%M_%S").replace(tzinfo=timezone.utc) + delta_sec = (datetime.now(timezone.utc) - stored_dt).total_seconds() - # Get a copy of the current operational status of the module - current_status = module_info_dict[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD] + if current_cause == stored_cause and delta_sec < MAX_DPU_REBOOT_DURATION: + self.log_info(f"{key}: is_reboot=True \u2014 same reboot cause within {int(delta_sec)}s") + is_reboot = True + except Exception as e: + self.log_error(f"{key}: Reboot cause/time comparison failed: {e}") - # Operational status transitioning to offline - if prev_status != ModuleBase.MODULE_STATUS_EMPTY and prev_status != str(ModuleBase.MODULE_STATUS_OFFLINE) and current_status == str(ModuleBase.MODULE_STATUS_OFFLINE): - self.log_notice("{} operational status transitioning to offline".format(key)) + if not is_reboot and (stored_time_str is not None or self._is_first_boot(key)): + self.persist_dpu_reboot_cause(reboot_cause, key) + self.update_dpu_reboot_cause_to_db(key) - # Persist dpu down time - self.persist_dpu_reboot_time(key) - # persist reboot cause - reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause) - self.persist_dpu_reboot_cause(reboot_cause, key) - # publish reboot cause to db - self.update_dpu_reboot_cause_to_db(key) + def module_db_update(self): + for module_index in range(0, self.num_modules): + module_info_dict = self._get_module_info(module_index) + if module_info_dict is None: + continue + + key = module_info_dict[CHASSIS_MODULE_INFO_NAME_FIELD] + if not key.startswith(ModuleBase.MODULE_TYPE_DPU): + self.log_error("Incorrect module-name {}. Should start with {} ".format(key, + ModuleBase.MODULE_TYPE_DPU)) + continue + + fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_INFO_DESC_FIELD, module_info_dict[CHASSIS_MODULE_INFO_DESC_FIELD]), + (CHASSIS_MODULE_INFO_SLOT_FIELD, module_info_dict[CHASSIS_MODULE_INFO_SLOT_FIELD]), + (CHASSIS_MODULE_INFO_OPERSTATUS_FIELD, module_info_dict[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD]), + (CHASSIS_MODULE_INFO_SERIAL_FIELD, module_info_dict[CHASSIS_MODULE_INFO_SERIAL_FIELD])]) + + prev_status = self.get_module_current_status(key) + self.module_table.set(key, fvs) + current_status = module_info_dict[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD] - # DPU discovered offline after chassisd restart (e.g. config reload - # during an ongoing DPU reboot). prev_status is EMPTY because - # STATE_DB was flushed. Check whether the reboot cause changed - # since the last persisted one and, if so, record it. - elif prev_status == ModuleBase.MODULE_STATUS_EMPTY and current_status == str(ModuleBase.MODULE_STATUS_OFFLINE): - reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause) - stored_cause, _ = self.retrieve_dpu_reboot_info(key) - - if stored_cause is not None: - if isinstance(reboot_cause, (tuple, list)): - current_cause = reboot_cause[0] - else: - current_cause = reboot_cause - - if current_cause and current_cause != stored_cause: - self.log_notice( - f"{key}: Reboot cause changed while chassisd was down " - f"(stored: {stored_cause}, current: {current_cause})") - self.persist_dpu_reboot_time(key) - self.persist_dpu_reboot_cause(reboot_cause, key) - self.update_dpu_reboot_cause_to_db(key) - - elif (prev_status == ModuleBase.MODULE_STATUS_EMPTY or prev_status == str(ModuleBase.MODULE_STATUS_OFFLINE)) and current_status != str(ModuleBase.MODULE_STATUS_OFFLINE): - self.log_notice(f"{key} operational status transitioning to online") - - reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause) - if isinstance(reboot_cause, (tuple, list)): - current_cause = reboot_cause[0] - else: - current_cause = reboot_cause - - stored_cause, stored_time_str = self.retrieve_dpu_reboot_info(key) - - is_reboot = False - if current_cause and stored_cause and stored_time_str: - try: - stored_dt = datetime.strptime(stored_time_str, "%Y_%m_%d_%H_%M_%S").replace(tzinfo=timezone.utc) - now = datetime.now(timezone.utc) - delta_sec = (now - stored_dt).total_seconds() - - if current_cause == stored_cause and delta_sec < MAX_DPU_REBOOT_DURATION: - self.log_info(f"{key}: is_reboot=True — same reboot cause within {int(delta_sec)}s") - is_reboot = True - except Exception as e: - self.log_error(f"{key}: Reboot cause/time comparison failed: {e}") - - if not is_reboot and (stored_time_str is not None or self._is_first_boot(key)): - # persist reboot cause and publish to db - self.persist_dpu_reboot_cause(reboot_cause, key) - self.update_dpu_reboot_cause_to_db(key) + self._handle_module_status_change(key, module_index, prev_status, current_status) def _get_module_info(self, module_index): """ diff --git a/sonic-chassisd/tests/test_chassisd.py b/sonic-chassisd/tests/test_chassisd.py index addbf4025..81d5440ae 100644 --- a/sonic-chassisd/tests/test_chassisd.py +++ b/sonic-chassisd/tests/test_chassisd.py @@ -224,37 +224,83 @@ def test_smartswitch_moduleupdater_status_transitions(): # Create the updater module_updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + # Initial poll to populate prev_status as ONLINE + module_updater.module_db_update() + # Mock dependent methods with patch.object(module_updater, 'retrieve_dpu_reboot_info', return_value=("Switch rebooted DPU", "2023_01_01_00_00_00")) as mock_reboot_info, \ patch.object(module_updater, '_is_first_boot', return_value=False) as mock_is_first_boot, \ patch.object(module_updater, 'persist_dpu_reboot_cause') as mock_persist_reboot_cause, \ patch.object(module_updater, 'update_dpu_reboot_cause_to_db') as mock_update_reboot_db, \ + patch.object(module_updater, 'persist_dpu_reboot_time') as mock_persist_time, \ patch("os.makedirs") as mock_makedirs, \ patch("builtins.open", mock_open()) as mock_file, \ patch.object(module_updater, '_get_history_path', return_value="/tmp/prev_reboot_time.txt") as mock_get_history_path: - # Transition from ONLINE to OFFLINE + # Transition from ONLINE to OFFLINE — should persist time but NOT + # query reboot cause (DPU is powered off). Issue #26254. offline_status = ModuleBase.MODULE_STATUS_OFFLINE module.set_oper_status(offline_status) module_updater.module_db_update() assert module.get_oper_status() == offline_status + mock_persist_time.assert_called_once_with(name) + mock_persist_reboot_cause.assert_not_called() + mock_update_reboot_db.assert_not_called() + assert name in module_updater._pending_reboot_check # Reset mocks for next transition mock_file.reset_mock() mock_makedirs.reset_mock() mock_persist_reboot_cause.reset_mock() mock_update_reboot_db.reset_mock() + mock_persist_time.reset_mock() - # Ensure ONLINE transition is handled correctly + # Ensure ONLINE transition persists the deferred reboot cause online_status = ModuleBase.MODULE_STATUS_ONLINE module.set_oper_status(online_status) module_updater.module_db_update() assert module.get_oper_status() == online_status + assert name not in module_updater._pending_reboot_check # Validate mock calls for ONLINE transition mock_persist_reboot_cause.assert_called_once() mock_update_reboot_db.assert_called_once() +def test_admin_shutdown_does_not_query_reboot_cause(): + """ + When a DPU is powered off via 'config chassis modules shutdown', + chassisd must NOT call get_reboot_cause while the DPU is offline. + Regression test for issue #26254. + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + + # First poll: DPU is online + updater.module_db_update() + + with patch.object(module, 'get_reboot_cause') as mock_get_cause, \ + patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \ + patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ + patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: + + # Admin shutdown: DPU goes offline + module.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE) + updater.module_db_update() + + # get_reboot_cause must NOT be called while DPU is off + mock_get_cause.assert_not_called() + # But reboot time should be persisted + mock_persist_time.assert_called_once_with(name) + # Cause should be deferred + mock_persist_cause.assert_not_called() + mock_update_db.assert_not_called() + assert name in updater._pending_reboot_check + def test_online_transition_skips_reboot_update(): chassis = MockSmartSwitchChassis() index = 0 @@ -288,8 +334,9 @@ def test_empty_to_offline_persists_changed_reboot_cause(): """ If chassisd restarts (e.g. config reload) while a DPU is offline, prev_status will be EMPTY (STATE_DB flushed) and current status OFFLINE. - If the reboot cause changed since the last persisted one, the new cause - must be recorded. Regression test for issue #24275. + The EMPTY→OFFLINE transition defers get_reboot_cause until the DPU + comes online. When it does come online and the cause differs from + the stored one, the new cause must be recorded. """ chassis = MockSmartSwitchChassis() name = "DPU0" @@ -299,7 +346,6 @@ def test_empty_to_offline_persists_changed_reboot_cause(): chassis.module_list.append(module) updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) - # Ensure module_table is empty so prev_status == EMPTY (simulates STATE_DB flush) with patch.object(module, 'get_reboot_cause', return_value=("Power Loss", "power auxiliary outage or reload")), \ patch.object(updater, 'retrieve_dpu_reboot_info', @@ -308,17 +354,26 @@ def test_empty_to_offline_persists_changed_reboot_cause(): patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: + # EMPTY→OFFLINE: should defer, not persist yet updater.module_db_update() + assert name in updater._pending_reboot_check + mock_persist_time.assert_not_called() + mock_persist_cause.assert_not_called() + mock_update_db.assert_not_called() - # Reboot cause changed ("Reboot" → "Power Loss") so it must be persisted + # DPU comes online — deferred check fires, cause changed + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + updater.module_db_update() + assert name not in updater._pending_reboot_check mock_persist_time.assert_called_once_with(name) - mock_persist_cause.assert_called_once() + mock_persist_cause.assert_called_once_with(("Power Loss", "power auxiliary outage or reload"), name) mock_update_db.assert_called_once_with(name) def test_empty_to_offline_skips_same_reboot_cause(): """ - After chassisd restart, if the DPU is offline but the reboot cause - has NOT changed, we must NOT create a duplicate entry. + After chassisd restart, if the DPU is offline the EMPTY→OFFLINE + transition defers the check. When the DPU comes online with the + same reboot cause, we must NOT create a duplicate entry. """ chassis = MockSmartSwitchChassis() name = "DPU0" @@ -335,9 +390,14 @@ def test_empty_to_offline_skips_same_reboot_cause(): patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: + # EMPTY→OFFLINE: deferred updater.module_db_update() + assert name in updater._pending_reboot_check - # Same reboot cause — no persistence expected + # DPU comes online — same cause, no persistence + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + updater.module_db_update() + assert name not in updater._pending_reboot_check mock_persist_time.assert_not_called() mock_persist_cause.assert_not_called() mock_update_db.assert_not_called() @@ -345,7 +405,8 @@ def test_empty_to_offline_skips_same_reboot_cause(): def test_empty_to_offline_skips_first_boot(): """ On first boot there is no previous-reboot-cause.json so stored_cause - is None. The EMPTY→OFFLINE path must NOT persist in that case. + is None. The EMPTY→OFFLINE transition defers, and when the DPU + comes online the deferred path must NOT persist (stored_cause is None). """ chassis = MockSmartSwitchChassis() name = "DPU0" @@ -362,9 +423,14 @@ def test_empty_to_offline_skips_first_boot(): patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: + # EMPTY→OFFLINE: deferred updater.module_db_update() + assert name in updater._pending_reboot_check - # No stored cause (first boot) — should NOT persist + # DPU comes online — no stored cause (first boot), should NOT persist + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + updater.module_db_update() + assert name not in updater._pending_reboot_check mock_persist_time.assert_not_called() mock_persist_cause.assert_not_called() mock_update_db.assert_not_called() @@ -393,6 +459,17 @@ def init_midplane_switch(self): return False # required for SmartSwitchModuleUp assert cause is None assert time_str is None +def test_extract_cause_tuple(): + class DummyChassis: + def get_num_modules(self): return 0 + def init_midplane_switch(self): return False + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, DummyChassis()) + assert updater._extract_cause(("Power Loss", "auxiliary outage")) == "Power Loss" + assert updater._extract_cause(["Reboot", "reset"]) == "Reboot" + assert updater._extract_cause("Watchdog") == "Watchdog" + assert updater._extract_cause(None) is None + def test_smartswitch_moduleupdater_check_invalid_name(): chassis = MockSmartSwitchChassis() index = 0 From 1371c10f06fb13a38968af4ba6f535afc4a9850c Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Mon, 30 Mar 2026 19:28:55 +0000 Subject: [PATCH 03/11] [chassisd] Skip duplicate reboot cause on back-to-back DPU reboot The deferred=='reboot' path (Online->Offline->Online) was unconditionally persisting the reboot cause. For back-to-back reboots where the cause and timestamp are within MAX_DPU_REBOOT_DURATION, this created duplicate history entries. Add is_reboot detection to the deferred=='reboot' path: if the current cause matches the stored cause and the time delta is within the threshold, skip persist_dpu_reboot_cause but still update the DB to keep it in sync. Add unit test test_deferred_reboot_skips_back_to_back_same_cause to cover the new behavior. Signed-off-by: Vasundhara Volam --- sonic-chassisd/scripts/chassisd | 13 +++++++- sonic-chassisd/tests/test_chassisd.py | 46 +++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/sonic-chassisd/scripts/chassisd b/sonic-chassisd/scripts/chassisd index 23b7ca749..67521e326 100755 --- a/sonic-chassisd/scripts/chassisd +++ b/sonic-chassisd/scripts/chassisd @@ -815,8 +815,19 @@ class SmartSwitchModuleUpdater(ModuleUpdater): stored_cause, stored_time_str = self.retrieve_dpu_reboot_info(key) # Deferred from Online→Offline: the DPU went down and is now back. - # Reboot time was already persisted; unconditionally record the cause. + # Reboot time was already persisted. Skip persisting if this is a + # back-to-back reboot with the same cause (is_reboot detection). if deferred == 'reboot': + if current_cause and stored_cause and stored_time_str: + try: + stored_dt = datetime.strptime(stored_time_str, "%Y_%m_%d_%H_%M_%S").replace(tzinfo=timezone.utc) + delta_sec = (datetime.now(timezone.utc) - stored_dt).total_seconds() + if current_cause == stored_cause and delta_sec < MAX_DPU_REBOOT_DURATION: + self.log_info(f"{key}: is_reboot=True — same reboot cause within {int(delta_sec)}s") + self.update_dpu_reboot_cause_to_db(key) + return + except Exception as e: + self.log_error(f"{key}: Reboot cause/time comparison failed: {e}") self.persist_dpu_reboot_cause(reboot_cause, key) self.update_dpu_reboot_cause_to_db(key) return diff --git a/sonic-chassisd/tests/test_chassisd.py b/sonic-chassisd/tests/test_chassisd.py index 81d5440ae..a6ac48b07 100644 --- a/sonic-chassisd/tests/test_chassisd.py +++ b/sonic-chassisd/tests/test_chassisd.py @@ -330,6 +330,52 @@ def test_online_transition_skips_reboot_update(): mock_persist.assert_not_called() mock_update.assert_not_called() +def test_deferred_reboot_skips_back_to_back_same_cause(): + """ + Back-to-back reboot: DPU goes ONLINE→OFFLINE→ONLINE twice quickly. + On the second ONLINE transition the deferred=='reboot' path should + detect that the reboot cause is the same and the time delta is within + MAX_DPU_REBOOT_DURATION, triggering is_reboot=True and skipping + persist_dpu_reboot_cause. + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + + # First poll: establish ONLINE prev_status + updater.module_db_update() + + # Stored reboot info: same cause, very recent timestamp + recent_time = datetime.now(timezone.utc).strftime("%Y_%m_%d_%H_%M_%S") + + with patch.object(module, 'get_reboot_cause', return_value="Switch rebooted DPU"), \ + patch.object(updater, 'retrieve_dpu_reboot_info', + return_value=("Switch rebooted DPU", recent_time)), \ + patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \ + patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ + patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: + + # ONLINE→OFFLINE: deferred='reboot', time persisted + module.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE) + updater.module_db_update() + mock_persist_time.assert_called_once_with(name) + assert updater._pending_reboot_check[name] == 'reboot' + + mock_persist_time.reset_mock() + + # OFFLINE→ONLINE: deferred='reboot' fires, but same cause + recent + # timestamp → is_reboot=True, so persist_dpu_reboot_cause is skipped + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + updater.module_db_update() + assert name not in updater._pending_reboot_check + mock_persist_cause.assert_not_called() + # DB should still be updated to keep it in sync + mock_update_db.assert_called_once_with(name) + def test_empty_to_offline_persists_changed_reboot_cause(): """ If chassisd restarts (e.g. config reload) while a DPU is offline, From 2cff18202996ae9238c565293cb31217ac9200be Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Mon, 11 May 2026 22:47:01 +0000 Subject: [PATCH 04/11] =?UTF-8?q?[chassisd]=20Fix=20spurious=20reboot-caus?= =?UTF-8?q?e=20persist=20on=20EMPTY=E2=86=92ONLINE=20transition?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When chassisd restarts (e.g. config reload) and a DPU is already online, the EMPTY→ONLINE transition fell through to the generic is_reboot path. If the stored reboot timestamp was >800s old, is_reboot=False and the code spuriously persisted a new reboot cause. This poisoned stored data, causing subsequent real reboots to be falsely skipped. Fix: short-circuit the EMPTY→ONLINE (no deferred) path — just repopulate CHASSIS_STATE_DB from on-disk history. On first boot, persist the initial cause. This prevents spurious updates to all DPUs after config reload. Also change log_info to log_notice for is_reboot=True messages so they appear in syslog at the NOTICE level, fixing test_ss_module_db_update test 6 as reported by chartsai-nvidia. Signed-off-by: Vasundhara Volam --- sonic-chassisd/scripts/chassisd | 17 +++++- sonic-chassisd/tests/test_chassisd.py | 80 ++++++++++++++++++++++----- 2 files changed, 82 insertions(+), 15 deletions(-) diff --git a/sonic-chassisd/scripts/chassisd b/sonic-chassisd/scripts/chassisd index f2887d326..a1d2559e2 100755 --- a/sonic-chassisd/scripts/chassisd +++ b/sonic-chassisd/scripts/chassisd @@ -809,6 +809,18 @@ class SmartSwitchModuleUpdater(ModuleUpdater): deferred = self._pending_reboot_check.pop(key, None) + # EMPTY→ONLINE with no deferred: chassisd restarted while the + # DPU was already online — no reboot happened during this cycle. + # On first boot persist the initial cause; otherwise just + # repopulate CHASSIS_STATE_DB from on-disk data. + if deferred is None and not was_known: + if self._is_first_boot(key): + reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause) + self.persist_dpu_reboot_time(key) + self.persist_dpu_reboot_cause(reboot_cause, key) + self.update_dpu_reboot_cause_to_db(key) + return + reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause) current_cause = self._extract_cause(reboot_cause) stored_cause, stored_time_str = self.retrieve_dpu_reboot_info(key) @@ -822,7 +834,7 @@ class SmartSwitchModuleUpdater(ModuleUpdater): stored_dt = datetime.strptime(stored_time_str, "%Y_%m_%d_%H_%M_%S").replace(tzinfo=timezone.utc) delta_sec = (datetime.now(timezone.utc) - stored_dt).total_seconds() if current_cause == stored_cause and delta_sec < MAX_DPU_REBOOT_DURATION: - self.log_info(f"{key}: is_reboot=True — same reboot cause within {int(delta_sec)}s") + self.log_notice(f"{key}: is_reboot=True — same reboot cause within {int(delta_sec)}s") self.update_dpu_reboot_cause_to_db(key) return except Exception as e: @@ -844,6 +856,7 @@ class SmartSwitchModuleUpdater(ModuleUpdater): self.update_dpu_reboot_cause_to_db(key) return + # OFFLINE→ONLINE with no deferred (edge case). is_reboot = False if current_cause and stored_cause and stored_time_str: try: @@ -851,7 +864,7 @@ class SmartSwitchModuleUpdater(ModuleUpdater): delta_sec = (datetime.now(timezone.utc) - stored_dt).total_seconds() if current_cause == stored_cause and delta_sec < MAX_DPU_REBOOT_DURATION: - self.log_info(f"{key}: is_reboot=True \u2014 same reboot cause within {int(delta_sec)}s") + self.log_notice(f"{key}: is_reboot=True — same reboot cause within {int(delta_sec)}s") is_reboot = True except Exception as e: self.log_error(f"{key}: Reboot cause/time comparison failed: {e}") diff --git a/sonic-chassisd/tests/test_chassisd.py b/sonic-chassisd/tests/test_chassisd.py index 594b4925b..c16ee8357 100644 --- a/sonic-chassisd/tests/test_chassisd.py +++ b/sonic-chassisd/tests/test_chassisd.py @@ -302,33 +302,87 @@ def test_admin_shutdown_does_not_query_reboot_cause(): assert name in updater._pending_reboot_check def test_online_transition_skips_reboot_update(): + """ + EMPTY→ONLINE with no deferred and no first-boot marker: chassisd + restarted while the DPU was already online. No reboot happened, + so persist_dpu_reboot_cause must NOT be called. + update_dpu_reboot_cause_to_db SHOULD be called to repopulate + CHASSIS_STATE_DB from on-disk history files. + """ chassis = MockSmartSwitchChassis() index = 0 name = "DPU0" module = MockModule(index, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN123") - module.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE) + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) chassis.module_list.append(module) updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) - # Mock the module going ONLINE + with patch.object(updater, '_is_first_boot', return_value=False), \ + patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist, \ + patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update: + + updater.module_db_update() + + # No persist — no reboot happened + mock_persist.assert_not_called() + # DB should be repopulated from on-disk history + mock_update.assert_called_once_with(name) + +def test_empty_to_online_first_boot_persists(): + """ + On first boot, EMPTY→ONLINE with no deferred should persist the + initial reboot cause (the _is_first_boot marker file is present). + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) - with patch.object(updater, 'retrieve_dpu_reboot_info', - return_value=("Switch rebooted DPU", datetime.now(timezone.utc).strftime("%Y_%m_%d_%H_%M_%S"))), \ - patch.object(module, 'get_reboot_cause', return_value="Switch rebooted DPU"), \ + with patch.object(module, 'get_reboot_cause', return_value=("Power Loss", "N/A")), \ + patch.object(updater, '_is_first_boot', return_value=True), \ + patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \ + patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ + patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: + + updater.module_db_update() + + mock_persist_time.assert_called_once_with(name) + mock_persist_cause.assert_called_once_with(("Power Loss", "N/A"), name) + mock_update_db.assert_called_once_with(name) + +def test_empty_to_online_no_spurious_persist(): + """ + After config reload (no DPU reboot), DPU is immediately online. + EMPTY→ONLINE with no deferred must NOT persist a new reboot cause. + Regression test for chartsai-nvidia's bug report on PR #771. + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + + with patch.object(module, 'get_reboot_cause', return_value="Non-Hardware"), \ + patch.object(updater, 'retrieve_dpu_reboot_info', + return_value=("Non-Hardware", "2025_01_01_00_00_00")), \ patch.object(updater, '_is_first_boot', return_value=False), \ - patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist, \ - patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update, \ - patch("builtins.open", mock_open()), \ - patch("os.makedirs"), \ - patch.object(updater, '_get_history_path', return_value="/tmp/fake.json"): + patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \ + patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ + patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: updater.module_db_update() - # Ensure no reboot update due to is_reboot = True - mock_persist.assert_not_called() - mock_update.assert_not_called() + # No persist — DPU didn't reboot, chassisd just restarted + mock_persist_time.assert_not_called() + mock_persist_cause.assert_not_called() + # DB should still be repopulated + mock_update_db.assert_called_once_with(name) def test_deferred_reboot_skips_back_to_back_same_cause(): """ From a6fb958ff3fd01b84f987db118e76835b9ba04fd Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Tue, 12 May 2026 20:02:59 +0000 Subject: [PATCH 05/11] [chassisd] Replace timeout-based is_reboot with reboot-time comparison MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the MAX_DPU_REBOOT_DURATION (800s) timeout used to detect duplicate reboot-cause persistence with a direct comparison of the stored cause timestamp vs the reboot execution time from prev_reboot_time.txt. New logic: - If stored_cause_time >= reboot_time: cause already persisted → skip - If stored_cause_time < reboot_time: new reboot, not yet persisted → persist This eliminates the arbitrary timeout and correctly handles: - Back-to-back reboots (each gets its own entry) - Config reload after cause already persisted (no duplicate) - Any reboot duration (no 800s ceiling) Signed-off-by: Vasundhara Volam --- sonic-chassisd/scripts/chassisd | 41 ++++++--------- sonic-chassisd/tests/test_chassisd.py | 74 ++++++++++++++++++++++----- 2 files changed, 75 insertions(+), 40 deletions(-) diff --git a/sonic-chassisd/scripts/chassisd b/sonic-chassisd/scripts/chassisd index a1d2559e2..020751fe9 100755 --- a/sonic-chassisd/scripts/chassisd +++ b/sonic-chassisd/scripts/chassisd @@ -826,19 +826,14 @@ class SmartSwitchModuleUpdater(ModuleUpdater): stored_cause, stored_time_str = self.retrieve_dpu_reboot_info(key) # Deferred from Online→Offline: the DPU went down and is now back. - # Reboot time was already persisted. Skip persisting if this is a - # back-to-back reboot with the same cause (is_reboot detection). + # Reboot time was already persisted. Skip persisting if the + # cause was already recorded after this reboot's execution time. if deferred == 'reboot': - if current_cause and stored_cause and stored_time_str: - try: - stored_dt = datetime.strptime(stored_time_str, "%Y_%m_%d_%H_%M_%S").replace(tzinfo=timezone.utc) - delta_sec = (datetime.now(timezone.utc) - stored_dt).total_seconds() - if current_cause == stored_cause and delta_sec < MAX_DPU_REBOOT_DURATION: - self.log_notice(f"{key}: is_reboot=True — same reboot cause within {int(delta_sec)}s") - self.update_dpu_reboot_cause_to_db(key) - return - except Exception as e: - self.log_error(f"{key}: Reboot cause/time comparison failed: {e}") + reboot_time = self.retrieve_dpu_reboot_time(key) + if stored_time_str and reboot_time and stored_time_str >= reboot_time: + self.log_notice(f"{key}: Reboot cause already persisted after reboot at {reboot_time}") + self.update_dpu_reboot_cause_to_db(key) + return self.persist_dpu_reboot_cause(reboot_cause, key) self.update_dpu_reboot_cause_to_db(key) return @@ -853,23 +848,17 @@ class SmartSwitchModuleUpdater(ModuleUpdater): f"(stored: {stored_cause}, current: {current_cause})") self.persist_dpu_reboot_time(key) self.persist_dpu_reboot_cause(reboot_cause, key) - self.update_dpu_reboot_cause_to_db(key) + self.update_dpu_reboot_cause_to_db(key) return # OFFLINE→ONLINE with no deferred (edge case). - is_reboot = False - if current_cause and stored_cause and stored_time_str: - try: - stored_dt = datetime.strptime(stored_time_str, "%Y_%m_%d_%H_%M_%S").replace(tzinfo=timezone.utc) - delta_sec = (datetime.now(timezone.utc) - stored_dt).total_seconds() - - if current_cause == stored_cause and delta_sec < MAX_DPU_REBOOT_DURATION: - self.log_notice(f"{key}: is_reboot=True — same reboot cause within {int(delta_sec)}s") - is_reboot = True - except Exception as e: - self.log_error(f"{key}: Reboot cause/time comparison failed: {e}") - - if not is_reboot and (stored_time_str is not None or self._is_first_boot(key)): + already_persisted = False + reboot_time = self.retrieve_dpu_reboot_time(key) + if stored_time_str and reboot_time and stored_time_str >= reboot_time: + self.log_notice(f"{key}: Reboot cause already persisted after reboot at {reboot_time}") + already_persisted = True + + if not already_persisted and (stored_time_str is not None or self._is_first_boot(key)): self.persist_dpu_reboot_cause(reboot_cause, key) self.update_dpu_reboot_cause_to_db(key) diff --git a/sonic-chassisd/tests/test_chassisd.py b/sonic-chassisd/tests/test_chassisd.py index c16ee8357..6da0d4a8a 100644 --- a/sonic-chassisd/tests/test_chassisd.py +++ b/sonic-chassisd/tests/test_chassisd.py @@ -229,6 +229,7 @@ def test_smartswitch_moduleupdater_status_transitions(): # Mock dependent methods with patch.object(module_updater, 'retrieve_dpu_reboot_info', return_value=("Switch rebooted DPU", "2023_01_01_00_00_00")) as mock_reboot_info, \ + patch.object(module_updater, 'retrieve_dpu_reboot_time', return_value="2025_01_01_00_00_00"), \ patch.object(module_updater, '_is_first_boot', return_value=False) as mock_is_first_boot, \ patch.object(module_updater, 'persist_dpu_reboot_cause') as mock_persist_reboot_cause, \ patch.object(module_updater, 'update_dpu_reboot_cause_to_db') as mock_update_reboot_db, \ @@ -384,13 +385,11 @@ def test_empty_to_online_no_spurious_persist(): # DB should still be repopulated mock_update_db.assert_called_once_with(name) -def test_deferred_reboot_skips_back_to_back_same_cause(): +def test_deferred_reboot_skips_already_persisted(): """ - Back-to-back reboot: DPU goes ONLINE→OFFLINE→ONLINE twice quickly. - On the second ONLINE transition the deferred=='reboot' path should - detect that the reboot cause is the same and the time delta is within - MAX_DPU_REBOOT_DURATION, triggering is_reboot=True and skipping - persist_dpu_reboot_cause. + Deferred='reboot' path: if the stored reboot cause timestamp is at + or after the reboot execution time (from prev_reboot_time.txt), the + cause was already persisted for this reboot — skip duplicate persist. """ chassis = MockSmartSwitchChassis() name = "DPU0" @@ -403,12 +402,14 @@ def test_deferred_reboot_skips_back_to_back_same_cause(): # First poll: establish ONLINE prev_status updater.module_db_update() - # Stored reboot info: same cause, very recent timestamp - recent_time = datetime.now(timezone.utc).strftime("%Y_%m_%d_%H_%M_%S") + # Stored cause timestamp is AFTER the reboot time → already persisted + reboot_time = "2025_06_01_10_00_00" + stored_time = "2025_06_01_10_05_00" with patch.object(module, 'get_reboot_cause', return_value="Switch rebooted DPU"), \ patch.object(updater, 'retrieve_dpu_reboot_info', - return_value=("Switch rebooted DPU", recent_time)), \ + return_value=("Switch rebooted DPU", stored_time)), \ + patch.object(updater, 'retrieve_dpu_reboot_time', return_value=reboot_time), \ patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \ patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: @@ -421,8 +422,7 @@ def test_deferred_reboot_skips_back_to_back_same_cause(): mock_persist_time.reset_mock() - # OFFLINE→ONLINE: deferred='reboot' fires, but same cause + recent - # timestamp → is_reboot=True, so persist_dpu_reboot_cause is skipped + # OFFLINE→ONLINE: stored_time >= reboot_time → already persisted, skip module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) updater.module_db_update() assert name not in updater._pending_reboot_check @@ -430,6 +430,50 @@ def test_deferred_reboot_skips_back_to_back_same_cause(): # DB should still be updated to keep it in sync mock_update_db.assert_called_once_with(name) +def test_deferred_reboot_persists_new_reboot(): + """ + Deferred='reboot' path: if the stored reboot cause timestamp is + BEFORE the reboot execution time, this is a new reboot whose cause + hasn't been persisted yet — persist it. + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + + # First poll: establish ONLINE prev_status + updater.module_db_update() + + # Stored cause timestamp is BEFORE the reboot time → new reboot + reboot_time = "2025_06_01_10_00_00" + stored_time = "2025_05_01_08_00_00" + + with patch.object(module, 'get_reboot_cause', return_value="Switch rebooted DPU"), \ + patch.object(updater, 'retrieve_dpu_reboot_info', + return_value=("Switch rebooted DPU", stored_time)), \ + patch.object(updater, 'retrieve_dpu_reboot_time', return_value=reboot_time), \ + patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \ + patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ + patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: + + # ONLINE→OFFLINE: deferred='reboot', time persisted + module.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE) + updater.module_db_update() + mock_persist_time.assert_called_once_with(name) + assert updater._pending_reboot_check[name] == 'reboot' + + mock_persist_time.reset_mock() + + # OFFLINE→ONLINE: stored_time < reboot_time → persist + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + updater.module_db_update() + assert name not in updater._pending_reboot_check + mock_persist_cause.assert_called_once() + mock_update_db.assert_called_once_with(name) + def test_empty_to_offline_persists_changed_reboot_cause(): """ If chassisd restarts (e.g. config reload) while a DPU is offline, @@ -494,13 +538,14 @@ def test_empty_to_offline_skips_same_reboot_cause(): updater.module_db_update() assert name in updater._pending_reboot_check - # DPU comes online — same cause, no persistence + # DPU comes online — same cause, no new persistence module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) updater.module_db_update() assert name not in updater._pending_reboot_check mock_persist_time.assert_not_called() mock_persist_cause.assert_not_called() - mock_update_db.assert_not_called() + # DB should still be repopulated (STATE_DB was flushed) + mock_update_db.assert_called_once_with(name) def test_empty_to_offline_skips_first_boot(): """ @@ -533,7 +578,8 @@ def test_empty_to_offline_skips_first_boot(): assert name not in updater._pending_reboot_check mock_persist_time.assert_not_called() mock_persist_cause.assert_not_called() - mock_update_db.assert_not_called() + # DB should still be repopulated (STATE_DB was flushed) + mock_update_db.assert_called_once_with(name) def test_retrieve_dpu_reboot_info_success(): class DummyChassis: From 917266a4f83acb67cecb421738ec8b00c6d4fc8a Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Mon, 18 May 2026 18:26:39 +0000 Subject: [PATCH 06/11] [chassisd] Add testbed script for DPU reboot cause persistence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add end-to-end testbed verification script that covers: - Normal ONLINE→OFFLINE→ONLINE transition records reboot cause - Config reload during DPU reboot (deferred EMPTY→OFFLINE→ONLINE check) - Config reload with DPU already offline (no duplicate) - Reboot cause history preserved across config reload - Deferred check with same cause skips duplicate - is_reboot=True back-to-back reboot skips duplicate persist Signed-off-by: Vasundhara Volam --- .../test_dpu_reboot_cause_persistence.sh | 777 ++++++++++++++++++ 1 file changed, 777 insertions(+) create mode 100755 sonic-chassisd/tests/testbed/test_dpu_reboot_cause_persistence.sh diff --git a/sonic-chassisd/tests/testbed/test_dpu_reboot_cause_persistence.sh b/sonic-chassisd/tests/testbed/test_dpu_reboot_cause_persistence.sh new file mode 100755 index 000000000..c1b1564d0 --- /dev/null +++ b/sonic-chassisd/tests/testbed/test_dpu_reboot_cause_persistence.sh @@ -0,0 +1,777 @@ +#!/bin/bash +# +# Testbed verification script for SmartSwitchModuleUpdater.module_db_update +# +# Covers: +# Test 1: Normal ONLINE→OFFLINE→ONLINE transition records reboot cause +# Test 2: Config reload during DPU reboot (EMPTY→OFFLINE→ONLINE deferred check) +# Test 3: Config reload with DPU already offline (same cause — no duplicate) +# Test 4: Reboot cause history is not lost after config reload +# Test 5: Deferred EMPTY→OFFLINE→ONLINE with same cause (no duplicate) +# Test 6: Back-to-back reboot skips duplicate persist +# + +SCRIPT_NAME=$(basename "$0") + +usage() { + cat </sonic-chassisd/scripts/chassisd admin@:/tmp/chassisd_patched + +Options: + -d DPU DPU name to test (default: DPU0) + -p PATH Path to patched chassisd on the switch (default: /tmp/chassisd_patched) + -t SECONDS Delay between DPU reboot and config reload in Test 2 (default: 10) + -h Show this help message and exit + +Tests: + 1. Normal ONLINE → OFFLINE → ONLINE + Reboots the DPU normally and verifies reboot cause is recorded, + history file count increases, DB entries exist, and syslog shows + both offline and online transition messages. + + 2. Config reload during DPU reboot (deferred check) + Reboots the DPU, then issues 'config reload -y' while the DPU is + still rebooting. Chassisd restarts with an empty STATE_DB and + discovers the DPU offline (EMPTY→OFFLINE). The deferred check + should fire when the DPU comes online, persisting the updated + reboot cause. + + 3. Config reload with DPU already stable offline + Shuts down the DPU (admin down), then issues config reload. + Chassisd sees EMPTY→OFFLINE but the DPU stays offline, so + no reboot cause should be persisted. Verifies no duplicate + entries are created, including after the DPU is brought back. + + 4. Reboot cause history preserved across config reload + Issues config reload with the DPU online and verifies on-disk + history files survive the STATE_DB flush and that CHASSIS_STATE_DB + entries are repopulated. + + 5. Deferred check with same reboot cause + Issues config reload (no DPU reboot) so the DPU cycles through + EMPTY→OFFLINE→ONLINE with the same reboot cause. Verifies that + the deferred check detects the cause is unchanged and does NOT + create a duplicate entry. + + 6. Back-to-back rapid reboot + Reboots the DPU twice in quick succession. The second online + transition should detect the cause was already persisted and skip + creating a duplicate reboot cause entry. + +Examples: + sudo bash $SCRIPT_NAME + sudo bash $SCRIPT_NAME -d DPU1 + sudo bash $SCRIPT_NAME -d DPU0 -p /tmp/my_chassisd -t 15 + +EOF + exit 0 +} + +# --------------------------------------------------------------------------- +# Parse arguments +# --------------------------------------------------------------------------- +DPU="DPU0" +CONFIG_RELOAD_DELAY=10 +PATCHED_CHASSISD="/tmp/chassisd_patched" + +while getopts "d:p:t:h" opt; do + case $opt in + d) DPU="$OPTARG" ;; + p) PATCHED_CHASSISD="$OPTARG" ;; + t) CONFIG_RELOAD_DELAY="$OPTARG" ;; + h) usage ;; + *) usage ;; + esac +done +shift $((OPTIND - 1)) + +set -euo pipefail + +CHASSISD_PATH="/usr/local/bin/chassisd" +BACKUP_CHASSISD="/tmp/chassisd_original" +PMON_CONTAINER="pmon" +REBOOT_CAUSE_DIR="/host/reboot-cause/module" +MAX_HISTORY_FILES=10 +PASSED=0 +FAILED=0 +SKIPPED=0 + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +NC='\033[0m' + +log_info() { echo -e "${GREEN}[INFO]${NC} $*"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +log_error() { echo -e "${RED}[ERROR]${NC} $*"; } +log_step() { echo -e "\n${CYAN}=== $* ===${NC}"; } + +pass() { echo -e " ${GREEN}PASS${NC}: $*"; (( PASSED++ )) || true; } +fail() { echo -e " ${RED}FAIL${NC}: $*"; (( FAILED++ )) || true; } +skip() { echo -e " ${YELLOW}SKIP${NC}: $*"; (( SKIPPED++ )) || true; } + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- +find_chassisd_path() { + if ! docker ps --format '{{.Names}}' | grep -q "^${PMON_CONTAINER}$"; then + log_error "pmon container is not running" + return 1 + fi + for candidate in \ + "/usr/local/bin/chassisd" \ + "$(docker exec "$PMON_CONTAINER" which chassisd 2>/dev/null || true)" \ + "$(docker exec "$PMON_CONTAINER" find /usr -name chassisd 2>/dev/null | head -1 || true)"; do + if [[ -n "$candidate" ]] && docker exec "$PMON_CONTAINER" test -f "$candidate" 2>/dev/null; then + CHASSISD_PATH="$candidate" + return 0 + fi + done + log_error "Could not find chassisd inside $PMON_CONTAINER container" + return 1 +} + +wait_for_system_ready() { + local timeout=${1:-300} + local elapsed=0 + log_info "Waiting for system ready (timeout: ${timeout}s)..." + while (( elapsed < timeout )); do + if show system-health summary 2>/dev/null | grep -q "System is ready"; then + log_info "System is ready after ${elapsed}s" + return 0 + fi + sleep 5 + (( elapsed += 5 )) + done + log_warn "System not ready after ${timeout}s, proceeding anyway" + return 1 +} + +wait_for_dpu_online() { + local dpu=$1 + local timeout=${2:-600} + local elapsed=0 + log_info "Waiting for $dpu to come online (timeout: ${timeout}s)..." + while (( elapsed < timeout )); do + local status + status=$(sonic-db-cli STATE_DB HGET "CHASSIS_MODULE_TABLE|${dpu}" "oper_status" 2>/dev/null || true) + if [[ "$status" == "Online" || "$status" == "Partial Online" ]]; then + log_info "$dpu is online (status: $status) after ${elapsed}s" + return 0 + fi + sleep 10 + (( elapsed += 10 )) + done + log_error "$dpu did not come online within ${timeout}s" + return 1 +} + +wait_for_dpu_offline() { + local dpu=$1 + local timeout=${2:-300} + local elapsed=0 + log_info "Waiting for $dpu to go offline (timeout: ${timeout}s)..." + while (( elapsed < timeout )); do + local status + status=$(sonic-db-cli STATE_DB HGET "CHASSIS_MODULE_TABLE|${dpu}" "oper_status" 2>/dev/null || true) + if [[ "$status" == "Offline" ]]; then + log_info "$dpu is offline after ${elapsed}s" + return 0 + fi + sleep 5 + (( elapsed += 5 )) + done + log_error "$dpu did not go offline within ${timeout}s" + return 1 +} + +get_dpu_status() { + sonic-db-cli STATE_DB HGET "CHASSIS_MODULE_TABLE|${1}" "oper_status" 2>/dev/null || echo "Unknown" +} + +get_latest_reboot_cause() { + local dpu=$1 + show reboot-cause all 2>/dev/null | grep -i "$dpu" | head -1 || echo "(none)" +} + +get_reboot_history_count() { + local dpu=$1 + local dir="${REBOOT_CAUSE_DIR}/${dpu,,}/history" + if [[ -d "$dir" ]]; then + find "$dir" -name '*_reboot_cause.json' 2>/dev/null | wc -l + else + echo 0 + fi +} + +get_previous_reboot_cause_json() { + local dpu=$1 + local path="${REBOOT_CAUSE_DIR}/${dpu,,}/previous-reboot-cause.json" + if [[ -L "$path" ]] || [[ -f "$path" ]]; then + cat "$path" 2>/dev/null || echo "{}" + else + echo "(not found)" + fi +} + +get_db_reboot_cause_keys() { + local dpu=$1 + sonic-db-cli CHASSIS_STATE_DB KEYS "REBOOT_CAUSE|${dpu}|*" 2>/dev/null || true +} + +snapshot_reboot_state() { + local label=$1 + log_step "Snapshot: $label" + echo "--- DPU status ---" + get_dpu_status "$DPU" + echo "" + echo "--- show reboot-cause all ---" + show reboot-cause all 2>/dev/null || true + echo "" + echo "--- show reboot-cause history $DPU ---" + show reboot-cause history "$DPU" 2>/dev/null || true + echo "" + echo "--- previous-reboot-cause.json ---" + get_previous_reboot_cause_json "$DPU" + echo "" + echo "--- History file count ---" + get_reboot_history_count "$DPU" + echo "" + echo "--- CHASSIS_STATE_DB keys ---" + get_db_reboot_cause_keys "$DPU" + echo "" +} + +check_syslog_for() { + local pattern=$1 + local since=${2:-"10 minutes ago"} + local result="" + # Try journalctl filtered by chassisd identifier + result=$(journalctl -t chassisd --since "$since" 2>/dev/null | grep -i "$pattern" | tail -5) + if [[ -n "$result" ]]; then echo "$result"; return; fi + # Try journalctl unfiltered (in case identifier differs) + result=$(journalctl --since "$since" 2>/dev/null | grep -i "chassisd" | grep -i "$pattern" | tail -5) + if [[ -n "$result" ]]; then echo "$result"; return; fi + # Try host /var/log/syslog filtered by chassisd (SONiC forwards container logs here) + result=$(grep -i "chassisd" /var/log/syslog 2>/dev/null | grep -i "$pattern" | tail -5) + if [[ -n "$result" ]]; then echo "$result"; return; fi + # Try inside pmon container + result=$(docker exec "$PMON_CONTAINER" cat /var/log/syslog 2>/dev/null | grep -i "$pattern" | tail -5) + if [[ -n "$result" ]]; then echo "$result"; return; fi + # Try supervisord stdout log inside pmon container + result=$(docker exec "$PMON_CONTAINER" cat /var/log/supervisor/supervisord.log 2>/dev/null | grep -i "$pattern" | tail -5) + if [[ -n "$result" ]]; then echo "$result"; return; fi +} + +restart_chassisd() { + log_info "Restarting pmon (chassisd)..." + systemctl restart pmon + sleep 15 + log_info "pmon restarted" +} + +# --------------------------------------------------------------------------- +# Pre-flight checks +# --------------------------------------------------------------------------- +log_step "Pre-flight checks" + +if [[ $EUID -ne 0 ]]; then + log_error "This script must be run as root (sudo)" + exit 1 +fi + +find_chassisd_path +log_info "Found chassisd at: $PMON_CONTAINER:$CHASSISD_PATH" + +if [[ ! -f "$PATCHED_CHASSISD" ]]; then + log_error "Patched chassisd not found at $PATCHED_CHASSISD" + log_error "Please copy it first:" + log_error " scp /sonic-chassisd/scripts/chassisd admin@:$PATCHED_CHASSISD" + log_error "Run '$SCRIPT_NAME -h' for help." + exit 1 +fi + +# Verify DPU exists +DPU_STATUS=$(get_dpu_status "$DPU") +if [[ "$DPU_STATUS" == "Unknown" ]]; then + log_error "$DPU not found in STATE_DB. Is this a smartswitch platform?" + exit 1 +fi +log_info "$DPU current status: $DPU_STATUS" + +# --------------------------------------------------------------------------- +# Install patched chassisd +# --------------------------------------------------------------------------- +log_step "Installing patched chassisd" + +docker cp "$PMON_CONTAINER:$CHASSISD_PATH" "$BACKUP_CHASSISD" +log_info "Backup saved to $BACKUP_CHASSISD" + +docker cp "$PATCHED_CHASSISD" "$PMON_CONTAINER:$CHASSISD_PATH" +log_info "Patched chassisd installed" + +restart_chassisd + +log_step "Ensuring $DPU is online before tests" +wait_for_dpu_online "$DPU" 600 || { + log_error "Cannot proceed: $DPU is not online" + exit 1 +} + +# Record initial state +INITIAL_HISTORY_COUNT=$(get_reboot_history_count "$DPU") +log_info "Initial history file count: $INITIAL_HISTORY_COUNT" + +TIMESTAMP_START=$(date '+%Y-%m-%d %H:%M:%S') + +# =========================================================================== +# Test 1: Normal ONLINE → OFFLINE → ONLINE transition +# =========================================================================== +log_step "Test 1: Normal ONLINE → OFFLINE → ONLINE reboot cause recording" + +snapshot_reboot_state "Test 1 — BEFORE" + +CAUSE_BEFORE=$(get_previous_reboot_cause_json "$DPU") +HISTORY_BEFORE=$(get_reboot_history_count "$DPU") + +log_info "Issuing 'reboot -d $DPU'..." +reboot -d "$DPU" & + +log_info "Waiting for $DPU to go offline..." +wait_for_dpu_offline "$DPU" 300 || { + fail "Test 1: $DPU did not go offline after reboot" +} + +log_info "Waiting for $DPU to come back online..." +wait_for_dpu_online "$DPU" 600 || { + fail "Test 1: $DPU did not come back online" +} + +# Give chassisd a few poll cycles +sleep 30 + +snapshot_reboot_state "Test 1 — AFTER" + +CAUSE_AFTER=$(get_previous_reboot_cause_json "$DPU") +HISTORY_AFTER=$(get_reboot_history_count "$DPU") + +# Verify reboot cause was recorded +if [[ "$CAUSE_AFTER" != "$CAUSE_BEFORE" ]]; then + pass "Test 1: Reboot cause updated (normal transition)" +else + fail "Test 1: Reboot cause did NOT change after normal reboot" +fi + +# Verify history file count increased (capped at MAX_HISTORY_FILES) +if (( HISTORY_BEFORE < MAX_HISTORY_FILES )); then + if (( HISTORY_AFTER > HISTORY_BEFORE )); then + pass "Test 1: History file count increased ($HISTORY_BEFORE -> $HISTORY_AFTER)" + else + fail "Test 1: History file count did not increase ($HISTORY_BEFORE -> $HISTORY_AFTER)" + fi +else + if (( HISTORY_AFTER == MAX_HISTORY_FILES )); then + pass "Test 1: History file count at max ($HISTORY_AFTER), oldest rotated out" + else + fail "Test 1: History file count unexpected ($HISTORY_BEFORE -> $HISTORY_AFTER)" + fi +fi + +# Verify DB entry exists +DB_KEYS=$(get_db_reboot_cause_keys "$DPU") +if [[ -n "$DB_KEYS" ]]; then + pass "Test 1: CHASSIS_STATE_DB has reboot cause entries" +else + fail "Test 1: No reboot cause entries in CHASSIS_STATE_DB" +fi + +# Check syslog for the offline transition log +OFFLINE_LOG=$(check_syslog_for "operational status transitioning to offline" "$TIMESTAMP_START") +if [[ -n "$OFFLINE_LOG" ]]; then + pass "Test 1: Syslog shows offline transition" +else + fail "Test 1: No offline transition logged in syslog" +fi + +# Check syslog for the online transition log +ONLINE_LOG=$(check_syslog_for "operational status transitioning to online" "$TIMESTAMP_START") +if [[ -n "$ONLINE_LOG" ]]; then + pass "Test 1: Syslog shows online transition" +else + fail "Test 1: No online transition logged in syslog" +fi + +# =========================================================================== +# Test 2: Config reload during DPU reboot (deferred EMPTY→OFFLINE→ONLINE) +# =========================================================================== +log_step "Test 2: Config reload during DPU reboot (deferred cause check)" + +log_info "Ensuring $DPU is online..." +wait_for_dpu_online "$DPU" 600 || { + fail "Test 2: $DPU not online, cannot proceed" +} + +TIMESTAMP_T2=$(date '+%Y-%m-%d %H:%M:%S') + +CAUSE_BEFORE_T2=$(get_previous_reboot_cause_json "$DPU") +HISTORY_BEFORE_T2=$(get_reboot_history_count "$DPU") + +log_info "Issuing 'reboot -d $DPU'..." +reboot -d "$DPU" & + +log_info "Waiting ${CONFIG_RELOAD_DELAY}s before config reload..." +sleep "$CONFIG_RELOAD_DELAY" + +log_info "Issuing 'config reload -y' (STATE_DB will be flushed, chassisd restarts)..." +config reload -y &>/dev/null & + +log_info "Waiting for system to recover..." +sleep 30 +wait_for_system_ready 300 || true + +# After config reload, chassisd restarts. It will see EMPTY→OFFLINE for the +# DPU (STATE_DB was flushed). With the deferred check, it should NOT call +# get_reboot_cause now — it should wait until the DPU comes online. + +log_info "Waiting for $DPU to come back online..." +wait_for_dpu_online "$DPU" 600 || { + fail "Test 2: $DPU did not come back online after config reload + reboot" +} + +# Give chassisd poll cycles to detect the online transition and run deferred check +sleep 30 + +snapshot_reboot_state "Test 2 — AFTER" + +CAUSE_AFTER_T2=$(get_previous_reboot_cause_json "$DPU") +HISTORY_AFTER_T2=$(get_reboot_history_count "$DPU") + +# The deferred check should have detected the cause changed and persisted it +if [[ "$CAUSE_AFTER_T2" != "$CAUSE_BEFORE_T2" ]]; then + pass "Test 2: Reboot cause updated after deferred check (config reload during reboot)" +else + # The cause might be the same type ("Power Loss") but the timestamp should differ. + # Check history count instead (capped at MAX_HISTORY_FILES). + if (( HISTORY_BEFORE_T2 < MAX_HISTORY_FILES && HISTORY_AFTER_T2 > HISTORY_BEFORE_T2 )); then + pass "Test 2: New history entry created after deferred check" + elif (( HISTORY_BEFORE_T2 >= MAX_HISTORY_FILES && HISTORY_AFTER_T2 == MAX_HISTORY_FILES )); then + pass "Test 2: History at max ($MAX_HISTORY_FILES), oldest rotated (deferred check)" + else + fail "Test 2: Reboot cause NOT updated after config reload during reboot" + fi +fi + +# Check for syslog messages. +# The deferred=='reboot' path (Online→Offline→Online after config reload) +# will persist the cause unless it was already persisted after the reboot time. +# - "Reboot cause changed while chassisd was down" only appears for deferred=='restart'. +# - "Reboot cause already persisted" appears when the cause was already recorded. +# Neither is guaranteed in Test 2, so treat as informational. +DEFERRED_LOG=$(check_syslog_for "Reboot cause changed while chassisd was down" "$TIMESTAMP_T2") +ALREADY_PERSISTED_LOG=$(check_syslog_for "Reboot cause already persisted after reboot" "$TIMESTAMP_T2") +if [[ -n "$DEFERRED_LOG" ]]; then + pass "Test 2: Syslog shows deferred reboot cause detection (restart path)" + log_info " $DEFERRED_LOG" +elif [[ -n "$ALREADY_PERSISTED_LOG" ]]; then + pass "Test 2: Syslog shows cause already persisted (same cause, recent — expected)" + log_info " $ALREADY_PERSISTED_LOG" +else + log_info "Test 2: No deferred-check syslog (expected for normal deferred reboot path)" +fi + +# Check that the online transition was logged after config reload. +ONLINE_LOG_T2=$(check_syslog_for "operational status transitioning to online" "$TIMESTAMP_T2") +ONLINE_COUNT=$(echo "$ONLINE_LOG_T2" | grep -c . 2>/dev/null || echo 0) +log_info "Online transition log count since test start: $ONLINE_COUNT" + +# =========================================================================== +# Test 3: Config reload with DPU already stable offline (same cause) +# =========================================================================== +log_step "Test 3: Config reload with DPU already stable offline (no duplicate)" + +log_info "Ensuring $DPU is online..." +wait_for_dpu_online "$DPU" 600 || { + skip "Test 3: $DPU not online, skipping" +} + +# Reboot the DPU and let it fully come back to create a known reboot cause +log_info "Rebooting $DPU and waiting for full cycle..." +reboot -d "$DPU" & +wait_for_dpu_offline "$DPU" 300 || true +wait_for_dpu_online "$DPU" 600 || { + fail "Test 3: $DPU did not recover" +} +sleep 30 + +# Now power off DPU so it stays offline +log_info "Powering off $DPU (it will stay offline)..." +config chassis modules shutdown "$DPU" 2>/dev/null || \ + sonic-db-cli CONFIG_DB HSET "CHASSIS_MODULE|$DPU" "admin_status" "down" 2>/dev/null || true +sleep 30 + +TIMESTAMP_T3=$(date '+%Y-%m-%d %H:%M:%S') +HISTORY_BEFORE_T3=$(get_reboot_history_count "$DPU") +CAUSE_BEFORE_T3=$(get_previous_reboot_cause_json "$DPU") + +# Config reload — STATE_DB flushed, chassisd will see EMPTY→OFFLINE for DPU +# But DPU stays offline this time (admin down), so the deferred check should +# remain pending and no reboot cause should be persisted. +log_info "Issuing 'config reload -y'..." +config reload -y &>/dev/null & +sleep 30 +wait_for_system_ready 300 || true + +# DPU should still be offline (admin down) +sleep 30 +DPU_STATUS_T3=$(get_dpu_status "$DPU") +log_info "$DPU status after config reload: $DPU_STATUS_T3" + +HISTORY_AFTER_T3=$(get_reboot_history_count "$DPU") +CAUSE_AFTER_T3=$(get_previous_reboot_cause_json "$DPU") + +if (( HISTORY_AFTER_T3 == HISTORY_BEFORE_T3 )) || (( HISTORY_BEFORE_T3 >= MAX_HISTORY_FILES && HISTORY_AFTER_T3 == MAX_HISTORY_FILES )); then + pass "Test 3: No duplicate history entry created (DPU stayed offline)" +else + fail "Test 3: Unexpected history entry created while DPU stayed offline ($HISTORY_BEFORE_T3 -> $HISTORY_AFTER_T3)" +fi + +if [[ "$CAUSE_AFTER_T3" == "$CAUSE_BEFORE_T3" ]]; then + pass "Test 3: previous-reboot-cause.json unchanged" +else + fail "Test 3: previous-reboot-cause.json changed unexpectedly" +fi + +# Bring DPU back up for next test. +# Note: the deferred flag may still be set from the config reload above. +# When DPU comes online, the deferred check will fire but the cause should +# be the same (no new reboot happened), so nothing should be persisted. +log_info "Bringing $DPU back online..." +config chassis modules startup "$DPU" 2>/dev/null || \ + sonic-db-cli CONFIG_DB HSET "CHASSIS_MODULE|$DPU" "admin_status" "up" 2>/dev/null || true +wait_for_dpu_online "$DPU" 600 || log_warn "$DPU did not come back online" +sleep 30 + +# Verify deferred flag did not cause a spurious persist +HISTORY_AFTER_T3_ONLINE=$(get_reboot_history_count "$DPU") +if (( HISTORY_AFTER_T3_ONLINE == HISTORY_BEFORE_T3 )) || (( HISTORY_BEFORE_T3 >= MAX_HISTORY_FILES && HISTORY_AFTER_T3_ONLINE == MAX_HISTORY_FILES )); then + pass "Test 3: No spurious persist when DPU came back online (deferred same cause)" +else + fail "Test 3: Deferred check created unexpected entry when DPU came online ($HISTORY_BEFORE_T3 -> $HISTORY_AFTER_T3_ONLINE)" +fi + +# =========================================================================== +# Test 4: Reboot cause history preserved across config reload +# =========================================================================== +log_step "Test 4: Reboot cause history preserved across config reload" + +HISTORY_BEFORE_T4=$(get_reboot_history_count "$DPU") +DB_KEYS_BEFORE_T4=$(get_db_reboot_cause_keys "$DPU" | wc -l) + +log_info "History files before: $HISTORY_BEFORE_T4, DB keys before: $DB_KEYS_BEFORE_T4" + +log_info "Issuing 'config reload -y'..." +config reload -y &>/dev/null & +sleep 30 +wait_for_system_ready 300 || true +wait_for_dpu_online "$DPU" 600 || true +sleep 30 + +HISTORY_AFTER_T4=$(get_reboot_history_count "$DPU") +DB_KEYS_AFTER_T4=$(get_db_reboot_cause_keys "$DPU" | wc -l) + +log_info "History files after: $HISTORY_AFTER_T4, DB keys after: $DB_KEYS_AFTER_T4" + +# History files on disk should be preserved (they live under /host, not in STATE_DB) +# When at max, count stays the same; otherwise it should not decrease. +if (( HISTORY_AFTER_T4 >= HISTORY_BEFORE_T4 )) || (( HISTORY_BEFORE_T4 >= MAX_HISTORY_FILES && HISTORY_AFTER_T4 == MAX_HISTORY_FILES )); then + pass "Test 4: History files preserved across config reload" +else + fail "Test 4: History files lost after config reload ($HISTORY_BEFORE_T4 -> $HISTORY_AFTER_T4)" +fi + +# DB entries should be repopulated by update_dpu_reboot_cause_to_db +if (( DB_KEYS_AFTER_T4 > 0 )); then + pass "Test 4: CHASSIS_STATE_DB reboot cause entries present after config reload" +else + fail "Test 4: No CHASSIS_STATE_DB entries after config reload" +fi + +# =========================================================================== +# Test 5: Deferred EMPTY→OFFLINE→ONLINE with same cause (no duplicate) +# =========================================================================== +log_step "Test 5: Deferred check with same reboot cause (no duplicate entry)" + +log_info "Ensuring $DPU is online..." +wait_for_dpu_online "$DPU" 600 || { + skip "Test 5: $DPU not online, skipping" +} + +# Reboot DPU, let it fully cycle to establish a known reboot cause +log_info "Rebooting $DPU for a clean reboot cause baseline..." +reboot -d "$DPU" & +wait_for_dpu_offline "$DPU" 300 || true +wait_for_dpu_online "$DPU" 600 || { + fail "Test 5: $DPU did not recover from baseline reboot" +} +sleep 30 + +TIMESTAMP_T5=$(date '+%Y-%m-%d %H:%M:%S') +HISTORY_BEFORE_T5=$(get_reboot_history_count "$DPU") +CAUSE_BEFORE_T5=$(get_previous_reboot_cause_json "$DPU") +log_info "Baseline cause: $CAUSE_BEFORE_T5" +log_info "Baseline history count: $HISTORY_BEFORE_T5" + +# Config reload — no DPU reboot this time, DPU just stays online then +# becomes EMPTY→OFFLINE→ONLINE as pmon/chassisd restarts. +# The deferred path fires, but the cause is the SAME as stored (no new reboot +# happened), so nothing should be persisted. +log_info "Issuing 'config reload -y' (no DPU reboot — same cause expected)..." +config reload -y &>/dev/null & +sleep 30 +wait_for_system_ready 300 || true +wait_for_dpu_online "$DPU" 600 || { + fail "Test 5: $DPU did not come back online after config reload" +} +sleep 30 + +HISTORY_AFTER_T5=$(get_reboot_history_count "$DPU") +CAUSE_AFTER_T5=$(get_previous_reboot_cause_json "$DPU") + +if (( HISTORY_AFTER_T5 == HISTORY_BEFORE_T5 )) || (( HISTORY_BEFORE_T5 >= MAX_HISTORY_FILES && HISTORY_AFTER_T5 == MAX_HISTORY_FILES )); then + pass "Test 5: No duplicate history entry (same cause after deferred check)" +else + fail "Test 5: Unexpected history entry created ($HISTORY_BEFORE_T5 -> $HISTORY_AFTER_T5)" +fi + +if [[ "$CAUSE_AFTER_T5" == "$CAUSE_BEFORE_T5" ]]; then + pass "Test 5: previous-reboot-cause.json unchanged (same cause)" +else + fail "Test 5: previous-reboot-cause.json changed unexpectedly" +fi + +# Check syslog — should NOT show "Reboot cause changed while chassisd was down" +DEFERRED_LOG_T5=$(check_syslog_for "Reboot cause changed while chassisd was down" "$TIMESTAMP_T5") +if [[ -z "$DEFERRED_LOG_T5" ]]; then + pass "Test 5: No deferred-cause-changed syslog (expected — cause is the same)" +else + fail "Test 5: Unexpected 'Reboot cause changed' in syslog: $DEFERRED_LOG_T5" +fi + +# =========================================================================== +# Test 6: Back-to-back reboot skips duplicate persist +# =========================================================================== +log_step "Test 6: Back-to-back rapid reboot skips duplicate persist" + +log_info "Ensuring $DPU is online..." +wait_for_dpu_online "$DPU" 600 || { + skip "Test 6: $DPU not online, skipping" +} + +# Reboot the DPU and let it fully cycle (creates a reboot cause entry) +log_info "First reboot: establishing reboot cause..." +reboot -d "$DPU" & +wait_for_dpu_offline "$DPU" 300 || true +wait_for_dpu_online "$DPU" 600 || { + fail "Test 6: $DPU did not recover from first reboot" +} +sleep 30 + +TIMESTAMP_T6=$(date '+%Y-%m-%d %H:%M:%S') +HISTORY_BEFORE_T6=$(get_reboot_history_count "$DPU") +log_info "History count after first reboot: $HISTORY_BEFORE_T6" + +# Immediately reboot again — chassisd should detect the cause was already +# persisted after this reboot time and skip the duplicate. +log_info "Second reboot: rapid back-to-back (duplicate skip expected)..." +reboot -d "$DPU" & +wait_for_dpu_offline "$DPU" 300 || true +wait_for_dpu_online "$DPU" 600 || { + fail "Test 6: $DPU did not recover from second reboot" +} +sleep 30 + +HISTORY_AFTER_T6=$(get_reboot_history_count "$DPU") +log_info "History count after second reboot: $HISTORY_AFTER_T6" + +# The OFFLINE→ONLINE transition persists a new entry (from _record_reboot on +# the offline transition). But the online transition should detect the cause +# was already persisted and skip the duplicate. So history should increase by +# exactly 1 (from the offline detection), not 2. +if (( HISTORY_BEFORE_T6 < MAX_HISTORY_FILES )); then + EXPECTED_T6=$(( HISTORY_BEFORE_T6 + 1 )) + if (( HISTORY_AFTER_T6 == EXPECTED_T6 )); then + pass "Test 6: History increased by exactly 1 (duplicate skipped)" + elif (( HISTORY_AFTER_T6 == HISTORY_BEFORE_T6 )); then + pass "Test 6: No new history entry (duplicate detected at both transitions)" + else + fail "Test 6: Expected $EXPECTED_T6 history entries, got $HISTORY_AFTER_T6" + fi +else + if (( HISTORY_AFTER_T6 == MAX_HISTORY_FILES )); then + pass "Test 6: History at max ($MAX_HISTORY_FILES), rotation handled correctly" + else + fail "Test 6: Expected $MAX_HISTORY_FILES history entries at max, got $HISTORY_AFTER_T6" + fi +fi + +# Check syslog for duplicate-skip detection (informational). +# In the back-to-back case, each reboot has a distinct reboot_time so both +# legitimately persist a new cause. The "already persisted" message only +# fires when the same reboot's cause was somehow recorded twice (e.g. race). +IS_REBOOT_LOG_T6=$(check_syslog_for "Reboot cause already persisted after reboot" "$TIMESTAMP_T6") +if [[ -n "$IS_REBOOT_LOG_T6" ]]; then + pass "Test 6: Syslog confirms duplicate reboot cause skipped" + log_info " $IS_REBOOT_LOG_T6" +else + log_info "Test 6: No 'already persisted' syslog (expected — each reboot has unique time)" +fi + +# =========================================================================== +# Summary +# =========================================================================== +log_step "Test Summary" + +TOTAL=$(( PASSED + FAILED + SKIPPED )) +echo "" +echo -e " Total: $TOTAL" +echo -e " ${GREEN}Passed: $PASSED${NC}" +echo -e " ${RED}Failed: $FAILED${NC}" +echo -e " ${YELLOW}Skipped: $SKIPPED${NC}" +echo "" + +if (( FAILED > 0 )); then + log_error "Some tests FAILED. Check syslog for chassisd messages:" + log_error " grep -i 'chassisd\\|reboot.cause\\|transitioning' /var/log/syslog | tail -30" +fi + +# --------------------------------------------------------------------------- +# Cleanup prompt +# --------------------------------------------------------------------------- +echo "" +read -p "Restore original chassisd? [y/N] " -r +if [[ "$REPLY" =~ ^[Yy]$ ]]; then + docker cp "$BACKUP_CHASSISD" "$PMON_CONTAINER:$CHASSISD_PATH" + restart_chassisd + log_info "Original chassisd restored" +else + log_info "Patched chassisd left in place at $PMON_CONTAINER:$CHASSISD_PATH" + log_info "To restore later: docker cp $BACKUP_CHASSISD $PMON_CONTAINER:$CHASSISD_PATH && systemctl restart pmon" +fi + +log_step "Done" + +if (( FAILED > 0 )); then + exit 1 +fi +exit 0 From 1627248cf6d001e498931ece9b80d1dbf93a8ff9 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Sat, 23 May 2026 02:02:58 +0000 Subject: [PATCH 07/11] [chassisd] Fix same-cause reboot not persisted in deferred='restart' path The deferred='restart' path (EMPTY->Offline->Online after config reload) only compared cause strings to decide whether to persist. Back-to-back reboots that produce the same cause type (e.g. 'Non-Hardware') were silently dropped because current_cause == stored_cause. Fix: use timestamp comparison (same as deferred='reboot' path). If prev_reboot_time.txt records a reboot execution time newer than the stored cause timestamp in previous-reboot-cause.json, persist regardless of whether the cause string changed. Also persist the initial cause on first boot via this path (stored_cause is None), aligning with the EMPTY->ONLINE (no deferred) first-boot behavior. Signed-off-by: Vasundhara Volam --- sonic-chassisd/scripts/chassisd | 25 ++- sonic-chassisd/tests/test_chassisd.py | 62 ++++++- .../test_dpu_reboot_cause_persistence.sh | 168 ++++++++++++++---- 3 files changed, 212 insertions(+), 43 deletions(-) diff --git a/sonic-chassisd/scripts/chassisd b/sonic-chassisd/scripts/chassisd index 020751fe9..7f50042c7 100755 --- a/sonic-chassisd/scripts/chassisd +++ b/sonic-chassisd/scripts/chassisd @@ -840,13 +840,30 @@ class SmartSwitchModuleUpdater(ModuleUpdater): # Deferred from EMPTY→Offline: chassisd restarted while DPU was # offline, now it is online so we can read the real reboot cause. - # Only persist if the cause actually changed. + # Persist if: (a) a reboot execution time exists that is newer than + # the stored cause timestamp (same logic as deferred=='reboot'), or + # (b) the cause string itself changed. This handles back-to-back + # reboots that produce the same cause type. if deferred == 'restart': - if stored_cause is not None and current_cause and current_cause != stored_cause: + reboot_time = self.retrieve_dpu_reboot_time(key) + should_persist = False + if reboot_time and stored_time_str and stored_time_str < reboot_time: + self.log_notice( + f"{key}: New reboot detected (stored time {stored_time_str} " + f"< reboot time {reboot_time})") + should_persist = True + elif stored_cause is not None and current_cause and current_cause != stored_cause: self.log_notice( f"{key}: Reboot cause changed while chassisd was down " f"(stored: {stored_cause}, current: {current_cause})") - self.persist_dpu_reboot_time(key) + should_persist = True + elif stored_cause is None and current_cause: + # First reboot ever — no stored cause yet + should_persist = True + + if should_persist: + if not reboot_time: + self.persist_dpu_reboot_time(key) self.persist_dpu_reboot_cause(reboot_cause, key) self.update_dpu_reboot_cause_to_db(key) return @@ -860,7 +877,7 @@ class SmartSwitchModuleUpdater(ModuleUpdater): if not already_persisted and (stored_time_str is not None or self._is_first_boot(key)): self.persist_dpu_reboot_cause(reboot_cause, key) - self.update_dpu_reboot_cause_to_db(key) + self.update_dpu_reboot_cause_to_db(key) def module_db_update(self): for module_index in range(0, self.num_modules): diff --git a/sonic-chassisd/tests/test_chassisd.py b/sonic-chassisd/tests/test_chassisd.py index c424f5db4..aea5921be 100644 --- a/sonic-chassisd/tests/test_chassisd.py +++ b/sonic-chassisd/tests/test_chassisd.py @@ -491,6 +491,7 @@ def test_empty_to_offline_persists_changed_reboot_cause(): with patch.object(module, 'get_reboot_cause', return_value=("Power Loss", "power auxiliary outage or reload")), \ patch.object(updater, 'retrieve_dpu_reboot_info', return_value=("Reboot", "2025_10_10_07_36_12")), \ + patch.object(updater, 'retrieve_dpu_reboot_time', return_value=None), \ patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \ patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: @@ -510,11 +511,53 @@ def test_empty_to_offline_persists_changed_reboot_cause(): mock_persist_cause.assert_called_once_with(("Power Loss", "power auxiliary outage or reload"), name) mock_update_db.assert_called_once_with(name) -def test_empty_to_offline_skips_same_reboot_cause(): +def test_empty_to_offline_persists_same_cause_new_reboot(): """ After chassisd restart, if the DPU is offline the EMPTY→OFFLINE transition defers the check. When the DPU comes online with the - same reboot cause, we must NOT create a duplicate entry. + same reboot cause but a reboot_time newer than stored_time, we MUST + persist — this is a new reboot that produced the same cause type. + Fixes sonic-net/sonic-buildimage#24275. + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") + module.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + + # Stored cause is from a prior reboot, reboot_time is from the current reboot + stored_time = "2025_10_10_07_36_12" + reboot_time = "2025_10_10_07_40_34" + + with patch.object(module, 'get_reboot_cause', return_value=("Reboot", "Reset from Main board")), \ + patch.object(updater, 'retrieve_dpu_reboot_info', + return_value=("Reboot", stored_time)), \ + patch.object(updater, 'retrieve_dpu_reboot_time', return_value=reboot_time), \ + patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \ + patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ + patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: + + # EMPTY→OFFLINE: deferred + updater.module_db_update() + assert name in updater._pending_reboot_check + + # DPU comes online — same cause but reboot_time > stored_time → persist + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + updater.module_db_update() + assert name not in updater._pending_reboot_check + mock_persist_time.assert_not_called() # reboot_time already exists + mock_persist_cause.assert_called_once() + mock_update_db.assert_called_once_with(name) + + +def test_empty_to_offline_skips_same_cause_no_new_reboot(): + """ + After chassisd restart, if the DPU is offline the EMPTY→OFFLINE + transition defers the check. When the DPU comes online with the + same reboot cause and NO reboot_time (or stored_time >= reboot_time), + we must NOT create a duplicate entry — no new reboot happened. """ chassis = MockSmartSwitchChassis() name = "DPU0" @@ -527,6 +570,7 @@ def test_empty_to_offline_skips_same_reboot_cause(): with patch.object(module, 'get_reboot_cause', return_value=("Reboot", "Reset from Main board")), \ patch.object(updater, 'retrieve_dpu_reboot_info', return_value=("Reboot", "2025_10_10_07_36_12")), \ + patch.object(updater, 'retrieve_dpu_reboot_time', return_value=None), \ patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \ patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: @@ -535,7 +579,7 @@ def test_empty_to_offline_skips_same_reboot_cause(): updater.module_db_update() assert name in updater._pending_reboot_check - # DPU comes online — same cause, no new persistence + # DPU comes online — same cause, no reboot_time → skip module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) updater.module_db_update() assert name not in updater._pending_reboot_check @@ -544,11 +588,11 @@ def test_empty_to_offline_skips_same_reboot_cause(): # DB should still be repopulated (STATE_DB was flushed) mock_update_db.assert_called_once_with(name) -def test_empty_to_offline_skips_first_boot(): +def test_empty_to_offline_persists_first_boot(): """ On first boot there is no previous-reboot-cause.json so stored_cause is None. The EMPTY→OFFLINE transition defers, and when the DPU - comes online the deferred path must NOT persist (stored_cause is None). + comes online the deferred path persists the initial cause. """ chassis = MockSmartSwitchChassis() name = "DPU0" @@ -561,6 +605,7 @@ def test_empty_to_offline_skips_first_boot(): with patch.object(module, 'get_reboot_cause', return_value=("Power Loss", "N/A")), \ patch.object(updater, 'retrieve_dpu_reboot_info', return_value=(None, None)), \ + patch.object(updater, 'retrieve_dpu_reboot_time', return_value=None), \ patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \ patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: @@ -569,13 +614,12 @@ def test_empty_to_offline_skips_first_boot(): updater.module_db_update() assert name in updater._pending_reboot_check - # DPU comes online — no stored cause (first boot), should NOT persist + # DPU comes online — first boot, persist the initial cause module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) updater.module_db_update() assert name not in updater._pending_reboot_check - mock_persist_time.assert_not_called() - mock_persist_cause.assert_not_called() - # DB should still be repopulated (STATE_DB was flushed) + mock_persist_time.assert_called_once_with(name) + mock_persist_cause.assert_called_once() mock_update_db.assert_called_once_with(name) def test_retrieve_dpu_reboot_info_success(): diff --git a/sonic-chassisd/tests/testbed/test_dpu_reboot_cause_persistence.sh b/sonic-chassisd/tests/testbed/test_dpu_reboot_cause_persistence.sh index c1b1564d0..35f601146 100755 --- a/sonic-chassisd/tests/testbed/test_dpu_reboot_cause_persistence.sh +++ b/sonic-chassisd/tests/testbed/test_dpu_reboot_cause_persistence.sh @@ -5,7 +5,7 @@ # Covers: # Test 1: Normal ONLINE→OFFLINE→ONLINE transition records reboot cause # Test 2: Config reload during DPU reboot (EMPTY→OFFLINE→ONLINE deferred check) -# Test 3: Config reload with DPU already offline (same cause — no duplicate) +# Test 3: Config reload with DPU already offline (admin-shutdown power cycle) # Test 4: Reboot cause history is not lost after config reload # Test 5: Deferred EMPTY→OFFLINE→ONLINE with same cause (no duplicate) # Test 6: Back-to-back reboot skips duplicate persist @@ -95,7 +95,7 @@ while getopts "d:p:t:h" opt; do done shift $((OPTIND - 1)) -set -euo pipefail +set -uo pipefail CHASSISD_PATH="/usr/local/bin/chassisd" BACKUP_CHASSISD="/tmp/chassisd_original" @@ -225,6 +225,15 @@ get_previous_reboot_cause_json() { fi } +extract_cause_fields() { + # Extract only "cause" and "comment" from JSON (ignore time/name which change on every persist) + local json=$1 + local cause comment + cause=$(echo "$json" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('cause',''))" 2>/dev/null || echo "") + comment=$(echo "$json" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('comment',''))" 2>/dev/null || echo "") + echo "${cause}|${comment}" +} + get_db_reboot_cause_keys() { local dpu=$1 sonic-db-cli CHASSIS_STATE_DB KEYS "REBOOT_CAUSE|${dpu}|*" 2>/dev/null || true @@ -263,9 +272,41 @@ check_syslog_for() { # Try journalctl unfiltered (in case identifier differs) result=$(journalctl --since "$since" 2>/dev/null | grep -i "chassisd" | grep -i "$pattern" | tail -5) if [[ -n "$result" ]]; then echo "$result"; return; fi - # Try host /var/log/syslog filtered by chassisd (SONiC forwards container logs here) - result=$(grep -i "chassisd" /var/log/syslog 2>/dev/null | grep -i "$pattern" | tail -5) - if [[ -n "$result" ]]; then echo "$result"; return; fi + # Try host /var/log/syslog filtered by chassisd with time filtering. + # SONiC syslog format: "2026 May 23 20:15:26.123456 hostname ..." + if [[ -f /var/log/syslog ]]; then + # Convert 'since' to numeric timestamp (YYYYMMDDHHmmss) for comparison + local since_numeric + since_numeric=$(date -d "$since" +%Y%m%d%H%M%S 2>/dev/null || echo 0) + if [[ "$since_numeric" != "0" && ${#since_numeric} -eq 14 ]]; then + # Pure awk comparison — no external commands per line + result=$(grep -i "chassisd" /var/log/syslog 2>/dev/null | grep -i "$pattern" | \ + awk -v since_ts="$since_numeric" ' + BEGIN { + mon["Jan"]=1; mon["Feb"]=2; mon["Mar"]=3; mon["Apr"]=4 + mon["May"]=5; mon["Jun"]=6; mon["Jul"]=7; mon["Aug"]=8 + mon["Sep"]=9; mon["Oct"]=10; mon["Nov"]=11; mon["Dec"]=12 + } + { + # Parse "YYYY Mon DD HH:MM:SS.usec" from fields 1-4 + year = $1+0; m = mon[$2]+0; day = $3+0 + split($4, t, /[:.]/) + h = t[1]+0; mi = t[2]+0; s = t[3]+0 + if (year > 0 && m > 0) { + line_ts = sprintf("%04d%02d%02d%02d%02d%02d", year, m, day, h, mi, s) + if (line_ts >= since_ts) print + } + }' | tail -5) + if [[ -n "$result" ]]; then echo "$result"; return; fi + # Time filtering was applied — trust the result (even if empty). + # Do NOT fall through to unfiltered grep, which would defeat + # "should not find" assertions in Test 5. + return + fi + # since_numeric could not be computed — use unfiltered grep as last resort + result=$(grep -i "chassisd" /var/log/syslog 2>/dev/null | grep -i "$pattern" | tail -5) + if [[ -n "$result" ]]; then echo "$result"; return; fi + fi # Try inside pmon container result=$(docker exec "$PMON_CONTAINER" cat /var/log/syslog 2>/dev/null | grep -i "$pattern" | tail -5) if [[ -n "$result" ]]; then echo "$result"; return; fi @@ -540,34 +581,82 @@ log_info "$DPU status after config reload: $DPU_STATUS_T3" HISTORY_AFTER_T3=$(get_reboot_history_count "$DPU") CAUSE_AFTER_T3=$(get_previous_reboot_cause_json "$DPU") -if (( HISTORY_AFTER_T3 == HISTORY_BEFORE_T3 )) || (( HISTORY_BEFORE_T3 >= MAX_HISTORY_FILES && HISTORY_AFTER_T3 == MAX_HISTORY_FILES )); then - pass "Test 3: No duplicate history entry created (DPU stayed offline)" -else - fail "Test 3: Unexpected history entry created while DPU stayed offline ($HISTORY_BEFORE_T3 -> $HISTORY_AFTER_T3)" -fi - -if [[ "$CAUSE_AFTER_T3" == "$CAUSE_BEFORE_T3" ]]; then - pass "Test 3: previous-reboot-cause.json unchanged" +# Note: admin-shutdown is a RUNTIME command — not persisted to config_db. +# After config reload, DPU comes back online (config says admin_status=up). +# The deferred check fires immediately and detects the admin-shutdown power cycle +# via prev_reboot_time.txt timestamp > stored cause timestamp. +# So either: +# - DPU is still offline: no new entry yet (deferred hasn't fired) +# - DPU is already online: new entry persisted (deferred already fired) +if [[ "$DPU_STATUS_T3" == *"Online"* ]]; then + # DPU already came back online — deferred check already fired and persisted + if (( HISTORY_BEFORE_T3 >= MAX_HISTORY_FILES )); then + if (( HISTORY_AFTER_T3 == MAX_HISTORY_FILES )); then + pass "Test 3: History at max after deferred persist (rotation handled)" + else + fail "Test 3: Unexpected history count ($HISTORY_BEFORE_T3 -> $HISTORY_AFTER_T3)" + fi + else + if (( HISTORY_AFTER_T3 == HISTORY_BEFORE_T3 + 1 )); then + pass "Test 3: New entry persisted (DPU came online after config reload)" + else + fail "Test 3: Expected history+1 ($HISTORY_BEFORE_T3 -> $HISTORY_AFTER_T3)" + fi + fi + if [[ "$CAUSE_AFTER_T3" != "$CAUSE_BEFORE_T3" ]]; then + pass "Test 3: previous-reboot-cause.json updated (admin-shutdown is real power cycle)" + else + fail "Test 3: previous-reboot-cause.json should have changed (DPU already online)" + fi else - fail "Test 3: previous-reboot-cause.json changed unexpectedly" + # DPU shows offline at check time. However, admin_status=up was restored + # by config reload (runtime shutdown is not persisted), so DPU is being + # brought back up. It may have briefly come online (firing the deferred + # check and persisting) before going to Offline again, or chassisd may + # not have seen the transition yet. Accept BOTH outcomes. + if (( HISTORY_AFTER_T3 == HISTORY_BEFORE_T3 )) || (( HISTORY_BEFORE_T3 >= MAX_HISTORY_FILES && HISTORY_AFTER_T3 == MAX_HISTORY_FILES )); then + pass "Test 3: History count stable while DPU shows offline ($HISTORY_AFTER_T3)" + elif (( HISTORY_AFTER_T3 == HISTORY_BEFORE_T3 + 1 )); then + pass "Test 3: Deferred already fired (DPU briefly came online)" + else + fail "Test 3: Unexpected history count while DPU offline ($HISTORY_BEFORE_T3 -> $HISTORY_AFTER_T3)" + fi + if [[ "$CAUSE_AFTER_T3" == "$CAUSE_BEFORE_T3" ]]; then + pass "Test 3: previous-reboot-cause.json unchanged (deferred still pending)" + else + # Deferred already fired — DPU briefly came online during the wait + pass "Test 3: previous-reboot-cause.json updated (deferred fired early)" + fi fi # Bring DPU back up for next test. -# Note: the deferred flag may still be set from the config reload above. -# When DPU comes online, the deferred check will fire but the cause should -# be the same (no new reboot happened), so nothing should be persisted. +# Note: the deferred flag is set from the config reload above (deferred='restart'). +# When DPU comes online, the deferred check will fire and detect that +# prev_reboot_time.txt (written during admin-shutdown) is newer than the stored +# cause timestamp. Since admin-shutdown + startup is a real power cycle, +# a new reboot cause entry SHOULD be persisted. log_info "Bringing $DPU back online..." config chassis modules startup "$DPU" 2>/dev/null || \ sonic-db-cli CONFIG_DB HSET "CHASSIS_MODULE|$DPU" "admin_status" "up" 2>/dev/null || true wait_for_dpu_online "$DPU" 600 || log_warn "$DPU did not come back online" sleep 30 -# Verify deferred flag did not cause a spurious persist +# The deferred check should persist a new entry (admin-shutdown is a real power cycle) HISTORY_AFTER_T3_ONLINE=$(get_reboot_history_count "$DPU") -if (( HISTORY_AFTER_T3_ONLINE == HISTORY_BEFORE_T3 )) || (( HISTORY_BEFORE_T3 >= MAX_HISTORY_FILES && HISTORY_AFTER_T3_ONLINE == MAX_HISTORY_FILES )); then - pass "Test 3: No spurious persist when DPU came back online (deferred same cause)" +if (( HISTORY_BEFORE_T3 < MAX_HISTORY_FILES )); then + if (( HISTORY_AFTER_T3_ONLINE == HISTORY_BEFORE_T3 + 1 )); then + pass "Test 3: New entry persisted when DPU came back (admin-shutdown is a real power cycle)" + elif (( HISTORY_AFTER_T3_ONLINE == HISTORY_BEFORE_T3 )); then + fail "Test 3: No new entry — deferred check should detect power cycle via timestamp" + else + fail "Test 3: Unexpected history count ($HISTORY_BEFORE_T3 -> $HISTORY_AFTER_T3_ONLINE)" + fi else - fail "Test 3: Deferred check created unexpected entry when DPU came online ($HISTORY_BEFORE_T3 -> $HISTORY_AFTER_T3_ONLINE)" + if (( HISTORY_AFTER_T3_ONLINE == MAX_HISTORY_FILES )); then + pass "Test 3: History at max ($MAX_HISTORY_FILES), rotation handled" + else + fail "Test 3: Unexpected history count at max ($HISTORY_AFTER_T3_ONLINE)" + fi fi # =========================================================================== @@ -654,13 +743,26 @@ else fail "Test 5: Unexpected history entry created ($HISTORY_BEFORE_T5 -> $HISTORY_AFTER_T5)" fi +# Compare cause/comment fields (ignoring timestamp which may differ on re-persist) +CAUSE_FIELDS_BEFORE_T5=$(extract_cause_fields "$CAUSE_BEFORE_T5") +CAUSE_FIELDS_AFTER_T5=$(extract_cause_fields "$CAUSE_AFTER_T5") + if [[ "$CAUSE_AFTER_T5" == "$CAUSE_BEFORE_T5" ]]; then - pass "Test 5: previous-reboot-cause.json unchanged (same cause)" + pass "Test 5: previous-reboot-cause.json unchanged (same cause, no re-persist)" +elif [[ "$CAUSE_FIELDS_AFTER_T5" == "$CAUSE_FIELDS_BEFORE_T5" ]]; then + # Same cause/comment but different timestamp — chassisd re-persisted unnecessarily + # This is acceptable behavior (not a correctness bug) but worth noting + log_warn "Test 5: previous-reboot-cause.json timestamp changed (same cause re-persisted)" + log_info " Before: $CAUSE_BEFORE_T5" + log_info " After: $CAUSE_AFTER_T5" + pass "Test 5: Reboot cause type unchanged (same cause after deferred check)" else - fail "Test 5: previous-reboot-cause.json changed unexpectedly" + fail "Test 5: Reboot cause type changed unexpectedly" + log_info " Before: $CAUSE_BEFORE_T5" + log_info " After: $CAUSE_AFTER_T5" fi -# Check syslog — should NOT show "Reboot cause changed while chassisd was down" +# Check syslog — should NOT show "Reboot cause changed" or "New reboot detected" DEFERRED_LOG_T5=$(check_syslog_for "Reboot cause changed while chassisd was down" "$TIMESTAMP_T5") if [[ -z "$DEFERRED_LOG_T5" ]]; then pass "Test 5: No deferred-cause-changed syslog (expected — cause is the same)" @@ -668,6 +770,13 @@ else fail "Test 5: Unexpected 'Reboot cause changed' in syslog: $DEFERRED_LOG_T5" fi +NEW_REBOOT_LOG_T5=$(check_syslog_for "New reboot detected" "$TIMESTAMP_T5") +if [[ -z "$NEW_REBOOT_LOG_T5" ]]; then + pass "Test 5: No 'New reboot detected' syslog (expected — no prev_reboot_time.txt)" +else + fail "Test 5: Unexpected 'New reboot detected' in syslog: $NEW_REBOOT_LOG_T5" +fi + # =========================================================================== # Test 6: Back-to-back reboot skips duplicate persist # =========================================================================== @@ -704,16 +813,15 @@ sleep 30 HISTORY_AFTER_T6=$(get_reboot_history_count "$DPU") log_info "History count after second reboot: $HISTORY_AFTER_T6" -# The OFFLINE→ONLINE transition persists a new entry (from _record_reboot on -# the offline transition). But the online transition should detect the cause -# was already persisted and skip the duplicate. So history should increase by -# exactly 1 (from the offline detection), not 2. +# The second reboot has a distinct reboot_time (T2 > T1 from first reboot), +# so the deferred='reboot' path correctly detects it as a new event and +# persists it. History should increase by exactly 1 from the second reboot. if (( HISTORY_BEFORE_T6 < MAX_HISTORY_FILES )); then EXPECTED_T6=$(( HISTORY_BEFORE_T6 + 1 )) if (( HISTORY_AFTER_T6 == EXPECTED_T6 )); then - pass "Test 6: History increased by exactly 1 (duplicate skipped)" + pass "Test 6: History increased by exactly 1 (second reboot correctly persisted)" elif (( HISTORY_AFTER_T6 == HISTORY_BEFORE_T6 )); then - pass "Test 6: No new history entry (duplicate detected at both transitions)" + fail "Test 6: No new history entry — second reboot should be persisted" else fail "Test 6: Expected $EXPECTED_T6 history entries, got $HISTORY_AFTER_T6" fi From f23e6a4c537dd1bf4bc396453b729e6a6028a453 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Tue, 26 May 2026 18:38:12 +0000 Subject: [PATCH 08/11] [chassisd] Improve readability: transition predicates, named constants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review feedback from gpunathilell: - Add DEFERRED_DPU_REBOOT / DEFERRED_CHASSISD_RESTART constants to replace ambiguous 'reboot' / 'restart' string literals in _pending_reboot_check - Extract transition predicates into named helper methods: _is_known_to_offline(), _is_empty_to_offline(), _is_back_to_online() matching the documented Transition Matrix - Replace 'was_known' (used only negated) with positive-sense 'prev_status_empty' for clarity Pure readability refactor — no logic changes. Signed-off-by: Vasundhara Volam --- sonic-chassisd/scripts/chassisd | 44 +++++++++++++++++++-------- sonic-chassisd/tests/test_chassisd.py | 8 ++--- 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/sonic-chassisd/scripts/chassisd b/sonic-chassisd/scripts/chassisd index 7f50042c7..676f01eee 100755 --- a/sonic-chassisd/scripts/chassisd +++ b/sonic-chassisd/scripts/chassisd @@ -78,6 +78,11 @@ CHASSIS_MODULE_INFO_HOSTNAME_FIELD = 'hostname' CHASSIS_MODULE_REBOOT_INFO_TABLE = 'CHASSIS_MODULE_REBOOT_INFO_TABLE' CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD = 'timestamp' CHASSIS_MODULE_REBOOT_REBOOT_FIELD = 'reboot' + +# Deferred reboot-cause check reasons stored in _pending_reboot_check +DEFERRED_DPU_REBOOT = 'dpu_reboot' # chassisd witnessed Online→Offline +DEFERRED_CHASSISD_RESTART = 'chassisd_restart' # chassisd found DPU already offline after restart + DEFAULT_LINECARD_REBOOT_TIMEOUT = 180 DEFAULT_DPU_REBOOT_TIMEOUT = 360 MAX_DPU_REBOOT_DURATION = 800 @@ -781,30 +786,45 @@ class SmartSwitchModuleUpdater(ModuleUpdater): return reboot_cause[0] return reboot_cause + def _is_known_to_offline(self, prev_status, current_status): + """Online (or any non-EMPTY non-OFFLINE) → OFFLINE — chassisd witnessed it.""" + return (prev_status != ModuleBase.MODULE_STATUS_EMPTY + and prev_status != str(ModuleBase.MODULE_STATUS_OFFLINE) + and current_status == str(ModuleBase.MODULE_STATUS_OFFLINE)) + + def _is_empty_to_offline(self, prev_status, current_status): + """EMPTY → OFFLINE — chassisd just (re)started, found DPU already offline.""" + return (prev_status == ModuleBase.MODULE_STATUS_EMPTY + and current_status == str(ModuleBase.MODULE_STATUS_OFFLINE)) + + def _is_back_to_online(self, prev_status, current_status): + """EMPTY/OFFLINE → ONLINE — DPU usable again, safe to call get_reboot_cause().""" + return (prev_status in (ModuleBase.MODULE_STATUS_EMPTY, + str(ModuleBase.MODULE_STATUS_OFFLINE)) + and current_status != str(ModuleBase.MODULE_STATUS_OFFLINE)) + def _handle_module_status_change(self, key, module_index, prev_status, current_status): """Handle DPU operational-status transitions and reboot-cause tracking.""" - was_known = prev_status != ModuleBase.MODULE_STATUS_EMPTY - was_offline = prev_status == str(ModuleBase.MODULE_STATUS_OFFLINE) - now_offline = current_status == str(ModuleBase.MODULE_STATUS_OFFLINE) + prev_status_empty = prev_status == ModuleBase.MODULE_STATUS_EMPTY # Transition to offline from a known non-offline state. # Persist the down-time but defer get_reboot_cause until the DPU # is back online — querying a powered-off DPU is unreliable. - if was_known and not was_offline and now_offline: + if self._is_known_to_offline(prev_status, current_status): self.log_notice(f"{key} operational status transitioning to offline") self.persist_dpu_reboot_time(key) - self._pending_reboot_check[key] = 'reboot' + self._pending_reboot_check[key] = DEFERRED_DPU_REBOOT return # DPU discovered offline after chassisd restart (e.g. config reload # during an ongoing DPU reboot). prev_status is EMPTY because # STATE_DB was flushed. Defer reboot-cause check until DPU is online. - if not was_known and now_offline: - self._pending_reboot_check[key] = 'restart' + if self._is_empty_to_offline(prev_status, current_status): + self._pending_reboot_check[key] = DEFERRED_CHASSISD_RESTART return # Transition to online from EMPTY or OFFLINE - if (not was_known or was_offline) and not now_offline: + if self._is_back_to_online(prev_status, current_status): self.log_notice(f"{key} operational status transitioning to online") deferred = self._pending_reboot_check.pop(key, None) @@ -813,7 +833,7 @@ class SmartSwitchModuleUpdater(ModuleUpdater): # DPU was already online — no reboot happened during this cycle. # On first boot persist the initial cause; otherwise just # repopulate CHASSIS_STATE_DB from on-disk data. - if deferred is None and not was_known: + if deferred is None and prev_status_empty: if self._is_first_boot(key): reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause) self.persist_dpu_reboot_time(key) @@ -828,7 +848,7 @@ class SmartSwitchModuleUpdater(ModuleUpdater): # Deferred from Online→Offline: the DPU went down and is now back. # Reboot time was already persisted. Skip persisting if the # cause was already recorded after this reboot's execution time. - if deferred == 'reboot': + if deferred == DEFERRED_DPU_REBOOT: reboot_time = self.retrieve_dpu_reboot_time(key) if stored_time_str and reboot_time and stored_time_str >= reboot_time: self.log_notice(f"{key}: Reboot cause already persisted after reboot at {reboot_time}") @@ -841,10 +861,10 @@ class SmartSwitchModuleUpdater(ModuleUpdater): # Deferred from EMPTY→Offline: chassisd restarted while DPU was # offline, now it is online so we can read the real reboot cause. # Persist if: (a) a reboot execution time exists that is newer than - # the stored cause timestamp (same logic as deferred=='reboot'), or + # the stored cause timestamp (same logic as DEFERRED_DPU_REBOOT), or # (b) the cause string itself changed. This handles back-to-back # reboots that produce the same cause type. - if deferred == 'restart': + if deferred == DEFERRED_CHASSISD_RESTART: reboot_time = self.retrieve_dpu_reboot_time(key) should_persist = False if reboot_time and stored_time_str and stored_time_str < reboot_time: diff --git a/sonic-chassisd/tests/test_chassisd.py b/sonic-chassisd/tests/test_chassisd.py index aea5921be..ff12c45ab 100644 --- a/sonic-chassisd/tests/test_chassisd.py +++ b/sonic-chassisd/tests/test_chassisd.py @@ -411,11 +411,11 @@ def test_deferred_reboot_skips_already_persisted(): patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: - # ONLINE→OFFLINE: deferred='reboot', time persisted + # ONLINE→OFFLINE: deferred=DEFERRED_DPU_REBOOT, time persisted module.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE) updater.module_db_update() mock_persist_time.assert_called_once_with(name) - assert updater._pending_reboot_check[name] == 'reboot' + assert updater._pending_reboot_check[name] == DEFERRED_DPU_REBOOT mock_persist_time.reset_mock() @@ -456,11 +456,11 @@ def test_deferred_reboot_persists_new_reboot(): patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: - # ONLINE→OFFLINE: deferred='reboot', time persisted + # ONLINE→OFFLINE: deferred=DEFERRED_DPU_REBOOT, time persisted module.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE) updater.module_db_update() mock_persist_time.assert_called_once_with(name) - assert updater._pending_reboot_check[name] == 'reboot' + assert updater._pending_reboot_check[name] == DEFERRED_DPU_REBOOT mock_persist_time.reset_mock() From 15d07b5010e62acd2b5522b662465bad7dbabb49 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Thu, 28 May 2026 05:05:08 +0000 Subject: [PATCH 09/11] [chassisd] Remove prev_reboot_time.txt generation from chassisd prev_reboot_time.txt is now exclusively written by module_base.py at command invocation time (reboot/set_admin_state), ensuring a single source of truth and eliminating the race window between DPU power-off and chassisd's poll detection. Depends-On: sonic-net/sonic-platform-common#680 Signed-off-by: Vasundhara Volam --- sonic-chassisd/scripts/chassisd | 17 ++----------- sonic-chassisd/tests/test_chassisd.py | 36 +++------------------------ 2 files changed, 6 insertions(+), 47 deletions(-) diff --git a/sonic-chassisd/scripts/chassisd b/sonic-chassisd/scripts/chassisd index 676f01eee..a340ef29f 100755 --- a/sonic-chassisd/scripts/chassisd +++ b/sonic-chassisd/scripts/chassisd @@ -808,11 +808,10 @@ class SmartSwitchModuleUpdater(ModuleUpdater): prev_status_empty = prev_status == ModuleBase.MODULE_STATUS_EMPTY # Transition to offline from a known non-offline state. - # Persist the down-time but defer get_reboot_cause until the DPU - # is back online — querying a powered-off DPU is unreliable. + # Defer get_reboot_cause until the DPU is back online — + # querying a powered-off DPU is unreliable. if self._is_known_to_offline(prev_status, current_status): self.log_notice(f"{key} operational status transitioning to offline") - self.persist_dpu_reboot_time(key) self._pending_reboot_check[key] = DEFERRED_DPU_REBOOT return @@ -836,7 +835,6 @@ class SmartSwitchModuleUpdater(ModuleUpdater): if deferred is None and prev_status_empty: if self._is_first_boot(key): reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause) - self.persist_dpu_reboot_time(key) self.persist_dpu_reboot_cause(reboot_cause, key) self.update_dpu_reboot_cause_to_db(key) return @@ -882,8 +880,6 @@ class SmartSwitchModuleUpdater(ModuleUpdater): should_persist = True if should_persist: - if not reboot_time: - self.persist_dpu_reboot_time(key) self.persist_dpu_reboot_cause(reboot_cause, key) self.update_dpu_reboot_cause_to_db(key) return @@ -1017,15 +1013,6 @@ class SmartSwitchModuleUpdater(ModuleUpdater): except FileNotFoundError: return False - def persist_dpu_reboot_time(self, module): - """Persist the current reboot time to a file.""" - time_str = self._get_current_time_str() - path = os.path.join(MODULE_REBOOT_CAUSE_DIR, module.lower(), "prev_reboot_time.txt") - - os.makedirs(os.path.dirname(path), exist_ok=True) - with open(path, 'w') as f: - f.write(time_str) - def retrieve_dpu_reboot_time(self, module): """Retrieve the persisted reboot time from a file.""" path = os.path.join(MODULE_REBOOT_CAUSE_DIR, module.lower(), "prev_reboot_time.txt") diff --git a/sonic-chassisd/tests/test_chassisd.py b/sonic-chassisd/tests/test_chassisd.py index ff12c45ab..4ee2fa5bb 100644 --- a/sonic-chassisd/tests/test_chassisd.py +++ b/sonic-chassisd/tests/test_chassisd.py @@ -230,18 +230,16 @@ def test_smartswitch_moduleupdater_status_transitions(): patch.object(module_updater, '_is_first_boot', return_value=False) as mock_is_first_boot, \ patch.object(module_updater, 'persist_dpu_reboot_cause') as mock_persist_reboot_cause, \ patch.object(module_updater, 'update_dpu_reboot_cause_to_db') as mock_update_reboot_db, \ - patch.object(module_updater, 'persist_dpu_reboot_time') as mock_persist_time, \ patch("os.makedirs") as mock_makedirs, \ patch("builtins.open", mock_open()) as mock_file, \ patch.object(module_updater, '_get_history_path', return_value="/tmp/prev_reboot_time.txt") as mock_get_history_path: - # Transition from ONLINE to OFFLINE — should persist time but NOT - # query reboot cause (DPU is powered off). Issue #26254. + # Transition from ONLINE to OFFLINE — should defer reboot cause + # query until DPU is back online (DPU is powered off). Issue #26254. offline_status = ModuleBase.MODULE_STATUS_OFFLINE module.set_oper_status(offline_status) module_updater.module_db_update() assert module.get_oper_status() == offline_status - mock_persist_time.assert_called_once_with(name) mock_persist_reboot_cause.assert_not_called() mock_update_reboot_db.assert_not_called() assert name in module_updater._pending_reboot_check @@ -251,7 +249,6 @@ def test_smartswitch_moduleupdater_status_transitions(): mock_makedirs.reset_mock() mock_persist_reboot_cause.reset_mock() mock_update_reboot_db.reset_mock() - mock_persist_time.reset_mock() # Ensure ONLINE transition persists the deferred reboot cause online_status = ModuleBase.MODULE_STATUS_ONLINE @@ -282,7 +279,6 @@ def test_admin_shutdown_does_not_query_reboot_cause(): updater.module_db_update() with patch.object(module, 'get_reboot_cause') as mock_get_cause, \ - patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \ patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: @@ -292,8 +288,6 @@ def test_admin_shutdown_does_not_query_reboot_cause(): # get_reboot_cause must NOT be called while DPU is off mock_get_cause.assert_not_called() - # But reboot time should be persisted - mock_persist_time.assert_called_once_with(name) # Cause should be deferred mock_persist_cause.assert_not_called() mock_update_db.assert_not_called() @@ -342,13 +336,11 @@ def test_empty_to_online_first_boot_persists(): with patch.object(module, 'get_reboot_cause', return_value=("Power Loss", "N/A")), \ patch.object(updater, '_is_first_boot', return_value=True), \ - patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \ patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: updater.module_db_update() - mock_persist_time.assert_called_once_with(name) mock_persist_cause.assert_called_once_with(("Power Loss", "N/A"), name) mock_update_db.assert_called_once_with(name) @@ -370,14 +362,12 @@ def test_empty_to_online_no_spurious_persist(): patch.object(updater, 'retrieve_dpu_reboot_info', return_value=("Non-Hardware", "2025_01_01_00_00_00")), \ patch.object(updater, '_is_first_boot', return_value=False), \ - patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \ patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: updater.module_db_update() # No persist — DPU didn't reboot, chassisd just restarted - mock_persist_time.assert_not_called() mock_persist_cause.assert_not_called() # DB should still be repopulated mock_update_db.assert_called_once_with(name) @@ -407,18 +397,14 @@ def test_deferred_reboot_skips_already_persisted(): patch.object(updater, 'retrieve_dpu_reboot_info', return_value=("Switch rebooted DPU", stored_time)), \ patch.object(updater, 'retrieve_dpu_reboot_time', return_value=reboot_time), \ - patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \ patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: - # ONLINE→OFFLINE: deferred=DEFERRED_DPU_REBOOT, time persisted + # ONLINE→OFFLINE: deferred=DEFERRED_DPU_REBOOT module.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE) updater.module_db_update() - mock_persist_time.assert_called_once_with(name) assert updater._pending_reboot_check[name] == DEFERRED_DPU_REBOOT - mock_persist_time.reset_mock() - # OFFLINE→ONLINE: stored_time >= reboot_time → already persisted, skip module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) updater.module_db_update() @@ -452,18 +438,14 @@ def test_deferred_reboot_persists_new_reboot(): patch.object(updater, 'retrieve_dpu_reboot_info', return_value=("Switch rebooted DPU", stored_time)), \ patch.object(updater, 'retrieve_dpu_reboot_time', return_value=reboot_time), \ - patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \ patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: - # ONLINE→OFFLINE: deferred=DEFERRED_DPU_REBOOT, time persisted + # ONLINE→OFFLINE: deferred=DEFERRED_DPU_REBOOT module.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE) updater.module_db_update() - mock_persist_time.assert_called_once_with(name) assert updater._pending_reboot_check[name] == DEFERRED_DPU_REBOOT - mock_persist_time.reset_mock() - # OFFLINE→ONLINE: stored_time < reboot_time → persist module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) updater.module_db_update() @@ -492,14 +474,12 @@ def test_empty_to_offline_persists_changed_reboot_cause(): patch.object(updater, 'retrieve_dpu_reboot_info', return_value=("Reboot", "2025_10_10_07_36_12")), \ patch.object(updater, 'retrieve_dpu_reboot_time', return_value=None), \ - patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \ patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: # EMPTY→OFFLINE: should defer, not persist yet updater.module_db_update() assert name in updater._pending_reboot_check - mock_persist_time.assert_not_called() mock_persist_cause.assert_not_called() mock_update_db.assert_not_called() @@ -507,7 +487,6 @@ def test_empty_to_offline_persists_changed_reboot_cause(): module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) updater.module_db_update() assert name not in updater._pending_reboot_check - mock_persist_time.assert_called_once_with(name) mock_persist_cause.assert_called_once_with(("Power Loss", "power auxiliary outage or reload"), name) mock_update_db.assert_called_once_with(name) @@ -535,7 +514,6 @@ def test_empty_to_offline_persists_same_cause_new_reboot(): patch.object(updater, 'retrieve_dpu_reboot_info', return_value=("Reboot", stored_time)), \ patch.object(updater, 'retrieve_dpu_reboot_time', return_value=reboot_time), \ - patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \ patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: @@ -547,7 +525,6 @@ def test_empty_to_offline_persists_same_cause_new_reboot(): module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) updater.module_db_update() assert name not in updater._pending_reboot_check - mock_persist_time.assert_not_called() # reboot_time already exists mock_persist_cause.assert_called_once() mock_update_db.assert_called_once_with(name) @@ -571,7 +548,6 @@ def test_empty_to_offline_skips_same_cause_no_new_reboot(): patch.object(updater, 'retrieve_dpu_reboot_info', return_value=("Reboot", "2025_10_10_07_36_12")), \ patch.object(updater, 'retrieve_dpu_reboot_time', return_value=None), \ - patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \ patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: @@ -583,7 +559,6 @@ def test_empty_to_offline_skips_same_cause_no_new_reboot(): module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) updater.module_db_update() assert name not in updater._pending_reboot_check - mock_persist_time.assert_not_called() mock_persist_cause.assert_not_called() # DB should still be repopulated (STATE_DB was flushed) mock_update_db.assert_called_once_with(name) @@ -606,7 +581,6 @@ def test_empty_to_offline_persists_first_boot(): patch.object(updater, 'retrieve_dpu_reboot_info', return_value=(None, None)), \ patch.object(updater, 'retrieve_dpu_reboot_time', return_value=None), \ - patch.object(updater, 'persist_dpu_reboot_time') as mock_persist_time, \ patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: @@ -618,7 +592,6 @@ def test_empty_to_offline_persists_first_boot(): module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) updater.module_db_update() assert name not in updater._pending_reboot_check - mock_persist_time.assert_called_once_with(name) mock_persist_cause.assert_called_once() mock_update_db.assert_called_once_with(name) @@ -1061,7 +1034,6 @@ def test_smartswitch_module_db_update(): # Call the function to test module_updater.persist_dpu_reboot_cause(reboot_cause, key) module_updater._is_first_boot(name) - module_updater.persist_dpu_reboot_time(name) module_updater.update_dpu_reboot_cause_to_db(name) From 5403e8129727488388c72ccb3fd894307b34890f Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Fri, 29 May 2026 21:48:26 +0000 Subject: [PATCH 10/11] [chassisd] Add fallback prev_reboot_time.txt generation for spontaneous DPU crashes When module_base.py has not created prev_reboot_time.txt (e.g. spontaneous DPU crash/kernel panic with no command invocation), chassisd generates it as a fallback on the Online->Offline transition. Detection: compare the time in prev_reboot_time.txt with the stored reboot cause timestamp from previous-reboot-cause.json. If the file does not exist or its time is <= the stored cause time (stale from a prior reboot), write the current time. If module_base.py already created it (file time > stored cause time), skip to avoid overwriting the more accurate command-time. Signed-off-by: Vasundhara Volam --- sonic-chassisd/scripts/chassisd | 25 +++++++++ sonic-chassisd/tests/test_chassisd.py | 73 +++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) diff --git a/sonic-chassisd/scripts/chassisd b/sonic-chassisd/scripts/chassisd index a340ef29f..7338d868d 100755 --- a/sonic-chassisd/scripts/chassisd +++ b/sonic-chassisd/scripts/chassisd @@ -803,6 +803,30 @@ class SmartSwitchModuleUpdater(ModuleUpdater): str(ModuleBase.MODULE_STATUS_OFFLINE)) and current_status != str(ModuleBase.MODULE_STATUS_OFFLINE)) + def persist_dpu_reboot_time_if_needed(self, module): + """Persist the current reboot time to prev_reboot_time.txt if module_base.py + has not already created it for this reboot cycle. + + Detection: compare the time in prev_reboot_time.txt with the stored reboot + cause timestamp (from previous-reboot-cause.json). If the file does not + exist, or its time is <= the stored cause time (stale from a prior reboot), + write the current time. + """ + file_time = self.retrieve_dpu_reboot_time(module) + _, stored_time_str = self.retrieve_dpu_reboot_info(module) + + if file_time is not None and (stored_time_str is None or file_time > stored_time_str): + # File already has a time newer than the last stored cause — + # module_base.py already created it for this reboot cycle. + return + + # Write current time + time_str = self._get_current_time_str() + path = os.path.join(MODULE_REBOOT_CAUSE_DIR, module.lower(), "prev_reboot_time.txt") + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, 'w') as f: + f.write(time_str) + def _handle_module_status_change(self, key, module_index, prev_status, current_status): """Handle DPU operational-status transitions and reboot-cause tracking.""" prev_status_empty = prev_status == ModuleBase.MODULE_STATUS_EMPTY @@ -812,6 +836,7 @@ class SmartSwitchModuleUpdater(ModuleUpdater): # querying a powered-off DPU is unreliable. if self._is_known_to_offline(prev_status, current_status): self.log_notice(f"{key} operational status transitioning to offline") + self.persist_dpu_reboot_time_if_needed(key) self._pending_reboot_check[key] = DEFERRED_DPU_REBOOT return diff --git a/sonic-chassisd/tests/test_chassisd.py b/sonic-chassisd/tests/test_chassisd.py index 4ee2fa5bb..c76e9d048 100644 --- a/sonic-chassisd/tests/test_chassisd.py +++ b/sonic-chassisd/tests/test_chassisd.py @@ -280,6 +280,7 @@ def test_admin_shutdown_does_not_query_reboot_cause(): with patch.object(module, 'get_reboot_cause') as mock_get_cause, \ patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ + patch.object(updater, 'persist_dpu_reboot_time_if_needed') as mock_persist_time, \ patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: # Admin shutdown: DPU goes offline @@ -293,6 +294,77 @@ def test_admin_shutdown_does_not_query_reboot_cause(): mock_update_db.assert_not_called() assert name in updater._pending_reboot_check +def test_persist_dpu_reboot_time_if_needed_creates_file_when_missing(): + """ + When prev_reboot_time.txt does not exist (module_base.py hasn't created it), + persist_dpu_reboot_time_if_needed should write the current time. + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + + with patch.object(updater, 'retrieve_dpu_reboot_time', return_value=None), \ + patch.object(updater, 'retrieve_dpu_reboot_info', return_value=(None, None)), \ + patch("os.makedirs") as mock_makedirs, \ + patch("builtins.open", mock_open()) as mock_file: + + updater.persist_dpu_reboot_time_if_needed(name) + + mock_makedirs.assert_called_once() + mock_file.assert_called_once() + +def test_persist_dpu_reboot_time_if_needed_skips_when_file_newer(): + """ + When prev_reboot_time.txt already has a time newer than the stored + reboot cause timestamp, module_base.py already created it — do not overwrite. + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + + # File time is newer than stored cause time — module_base.py already handled it + with patch.object(updater, 'retrieve_dpu_reboot_time', return_value="2025_06_01_12_00_00"), \ + patch.object(updater, 'retrieve_dpu_reboot_info', return_value=("Power Loss", "2025_06_01_10_00_00")), \ + patch("os.makedirs") as mock_makedirs, \ + patch("builtins.open", mock_open()) as mock_file: + + updater.persist_dpu_reboot_time_if_needed(name) + + # Should NOT write — file is already up to date + mock_file.assert_not_called() + +def test_persist_dpu_reboot_time_if_needed_writes_when_file_stale(): + """ + When prev_reboot_time.txt has a time <= the stored reboot cause timestamp, + the file is stale from a prior reboot — write a new time. + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + + # File time is older than stored cause time — stale, needs update + with patch.object(updater, 'retrieve_dpu_reboot_time', return_value="2025_05_01_08_00_00"), \ + patch.object(updater, 'retrieve_dpu_reboot_info', return_value=("Power Loss", "2025_06_01_10_00_00")), \ + patch("os.makedirs") as mock_makedirs, \ + patch("builtins.open", mock_open()) as mock_file: + + updater.persist_dpu_reboot_time_if_needed(name) + + mock_makedirs.assert_called_once() + mock_file.assert_called_once() + def test_online_transition_skips_reboot_update(): """ EMPTY→ONLINE with no deferred and no first-boot marker: chassisd @@ -398,6 +470,7 @@ def test_deferred_reboot_skips_already_persisted(): return_value=("Switch rebooted DPU", stored_time)), \ patch.object(updater, 'retrieve_dpu_reboot_time', return_value=reboot_time), \ patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ + patch.object(updater, 'persist_dpu_reboot_time_if_needed') as mock_persist_time, \ patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: # ONLINE→OFFLINE: deferred=DEFERRED_DPU_REBOOT From 5e726a20ac18721268d227e668c8bf947c1f5f7d Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Fri, 29 May 2026 23:33:02 +0000 Subject: [PATCH 11/11] [chassisd] Move testbed script to separate PR The DPU reboot-cause testbed verification script is decoupled into a separate PR to keep this change focused on the core logic fix. Signed-off-by: Vasundhara Volam --- .../test_dpu_reboot_cause_persistence.sh | 885 ------------------ 1 file changed, 885 deletions(-) delete mode 100755 sonic-chassisd/tests/testbed/test_dpu_reboot_cause_persistence.sh diff --git a/sonic-chassisd/tests/testbed/test_dpu_reboot_cause_persistence.sh b/sonic-chassisd/tests/testbed/test_dpu_reboot_cause_persistence.sh deleted file mode 100755 index 35f601146..000000000 --- a/sonic-chassisd/tests/testbed/test_dpu_reboot_cause_persistence.sh +++ /dev/null @@ -1,885 +0,0 @@ -#!/bin/bash -# -# Testbed verification script for SmartSwitchModuleUpdater.module_db_update -# -# Covers: -# Test 1: Normal ONLINE→OFFLINE→ONLINE transition records reboot cause -# Test 2: Config reload during DPU reboot (EMPTY→OFFLINE→ONLINE deferred check) -# Test 3: Config reload with DPU already offline (admin-shutdown power cycle) -# Test 4: Reboot cause history is not lost after config reload -# Test 5: Deferred EMPTY→OFFLINE→ONLINE with same cause (no duplicate) -# Test 6: Back-to-back reboot skips duplicate persist -# - -SCRIPT_NAME=$(basename "$0") - -usage() { - cat </sonic-chassisd/scripts/chassisd admin@:/tmp/chassisd_patched - -Options: - -d DPU DPU name to test (default: DPU0) - -p PATH Path to patched chassisd on the switch (default: /tmp/chassisd_patched) - -t SECONDS Delay between DPU reboot and config reload in Test 2 (default: 10) - -h Show this help message and exit - -Tests: - 1. Normal ONLINE → OFFLINE → ONLINE - Reboots the DPU normally and verifies reboot cause is recorded, - history file count increases, DB entries exist, and syslog shows - both offline and online transition messages. - - 2. Config reload during DPU reboot (deferred check) - Reboots the DPU, then issues 'config reload -y' while the DPU is - still rebooting. Chassisd restarts with an empty STATE_DB and - discovers the DPU offline (EMPTY→OFFLINE). The deferred check - should fire when the DPU comes online, persisting the updated - reboot cause. - - 3. Config reload with DPU already stable offline - Shuts down the DPU (admin down), then issues config reload. - Chassisd sees EMPTY→OFFLINE but the DPU stays offline, so - no reboot cause should be persisted. Verifies no duplicate - entries are created, including after the DPU is brought back. - - 4. Reboot cause history preserved across config reload - Issues config reload with the DPU online and verifies on-disk - history files survive the STATE_DB flush and that CHASSIS_STATE_DB - entries are repopulated. - - 5. Deferred check with same reboot cause - Issues config reload (no DPU reboot) so the DPU cycles through - EMPTY→OFFLINE→ONLINE with the same reboot cause. Verifies that - the deferred check detects the cause is unchanged and does NOT - create a duplicate entry. - - 6. Back-to-back rapid reboot - Reboots the DPU twice in quick succession. The second online - transition should detect the cause was already persisted and skip - creating a duplicate reboot cause entry. - -Examples: - sudo bash $SCRIPT_NAME - sudo bash $SCRIPT_NAME -d DPU1 - sudo bash $SCRIPT_NAME -d DPU0 -p /tmp/my_chassisd -t 15 - -EOF - exit 0 -} - -# --------------------------------------------------------------------------- -# Parse arguments -# --------------------------------------------------------------------------- -DPU="DPU0" -CONFIG_RELOAD_DELAY=10 -PATCHED_CHASSISD="/tmp/chassisd_patched" - -while getopts "d:p:t:h" opt; do - case $opt in - d) DPU="$OPTARG" ;; - p) PATCHED_CHASSISD="$OPTARG" ;; - t) CONFIG_RELOAD_DELAY="$OPTARG" ;; - h) usage ;; - *) usage ;; - esac -done -shift $((OPTIND - 1)) - -set -uo pipefail - -CHASSISD_PATH="/usr/local/bin/chassisd" -BACKUP_CHASSISD="/tmp/chassisd_original" -PMON_CONTAINER="pmon" -REBOOT_CAUSE_DIR="/host/reboot-cause/module" -MAX_HISTORY_FILES=10 -PASSED=0 -FAILED=0 -SKIPPED=0 - -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -CYAN='\033[0;36m' -NC='\033[0m' - -log_info() { echo -e "${GREEN}[INFO]${NC} $*"; } -log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } -log_error() { echo -e "${RED}[ERROR]${NC} $*"; } -log_step() { echo -e "\n${CYAN}=== $* ===${NC}"; } - -pass() { echo -e " ${GREEN}PASS${NC}: $*"; (( PASSED++ )) || true; } -fail() { echo -e " ${RED}FAIL${NC}: $*"; (( FAILED++ )) || true; } -skip() { echo -e " ${YELLOW}SKIP${NC}: $*"; (( SKIPPED++ )) || true; } - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- -find_chassisd_path() { - if ! docker ps --format '{{.Names}}' | grep -q "^${PMON_CONTAINER}$"; then - log_error "pmon container is not running" - return 1 - fi - for candidate in \ - "/usr/local/bin/chassisd" \ - "$(docker exec "$PMON_CONTAINER" which chassisd 2>/dev/null || true)" \ - "$(docker exec "$PMON_CONTAINER" find /usr -name chassisd 2>/dev/null | head -1 || true)"; do - if [[ -n "$candidate" ]] && docker exec "$PMON_CONTAINER" test -f "$candidate" 2>/dev/null; then - CHASSISD_PATH="$candidate" - return 0 - fi - done - log_error "Could not find chassisd inside $PMON_CONTAINER container" - return 1 -} - -wait_for_system_ready() { - local timeout=${1:-300} - local elapsed=0 - log_info "Waiting for system ready (timeout: ${timeout}s)..." - while (( elapsed < timeout )); do - if show system-health summary 2>/dev/null | grep -q "System is ready"; then - log_info "System is ready after ${elapsed}s" - return 0 - fi - sleep 5 - (( elapsed += 5 )) - done - log_warn "System not ready after ${timeout}s, proceeding anyway" - return 1 -} - -wait_for_dpu_online() { - local dpu=$1 - local timeout=${2:-600} - local elapsed=0 - log_info "Waiting for $dpu to come online (timeout: ${timeout}s)..." - while (( elapsed < timeout )); do - local status - status=$(sonic-db-cli STATE_DB HGET "CHASSIS_MODULE_TABLE|${dpu}" "oper_status" 2>/dev/null || true) - if [[ "$status" == "Online" || "$status" == "Partial Online" ]]; then - log_info "$dpu is online (status: $status) after ${elapsed}s" - return 0 - fi - sleep 10 - (( elapsed += 10 )) - done - log_error "$dpu did not come online within ${timeout}s" - return 1 -} - -wait_for_dpu_offline() { - local dpu=$1 - local timeout=${2:-300} - local elapsed=0 - log_info "Waiting for $dpu to go offline (timeout: ${timeout}s)..." - while (( elapsed < timeout )); do - local status - status=$(sonic-db-cli STATE_DB HGET "CHASSIS_MODULE_TABLE|${dpu}" "oper_status" 2>/dev/null || true) - if [[ "$status" == "Offline" ]]; then - log_info "$dpu is offline after ${elapsed}s" - return 0 - fi - sleep 5 - (( elapsed += 5 )) - done - log_error "$dpu did not go offline within ${timeout}s" - return 1 -} - -get_dpu_status() { - sonic-db-cli STATE_DB HGET "CHASSIS_MODULE_TABLE|${1}" "oper_status" 2>/dev/null || echo "Unknown" -} - -get_latest_reboot_cause() { - local dpu=$1 - show reboot-cause all 2>/dev/null | grep -i "$dpu" | head -1 || echo "(none)" -} - -get_reboot_history_count() { - local dpu=$1 - local dir="${REBOOT_CAUSE_DIR}/${dpu,,}/history" - if [[ -d "$dir" ]]; then - find "$dir" -name '*_reboot_cause.json' 2>/dev/null | wc -l - else - echo 0 - fi -} - -get_previous_reboot_cause_json() { - local dpu=$1 - local path="${REBOOT_CAUSE_DIR}/${dpu,,}/previous-reboot-cause.json" - if [[ -L "$path" ]] || [[ -f "$path" ]]; then - cat "$path" 2>/dev/null || echo "{}" - else - echo "(not found)" - fi -} - -extract_cause_fields() { - # Extract only "cause" and "comment" from JSON (ignore time/name which change on every persist) - local json=$1 - local cause comment - cause=$(echo "$json" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('cause',''))" 2>/dev/null || echo "") - comment=$(echo "$json" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('comment',''))" 2>/dev/null || echo "") - echo "${cause}|${comment}" -} - -get_db_reboot_cause_keys() { - local dpu=$1 - sonic-db-cli CHASSIS_STATE_DB KEYS "REBOOT_CAUSE|${dpu}|*" 2>/dev/null || true -} - -snapshot_reboot_state() { - local label=$1 - log_step "Snapshot: $label" - echo "--- DPU status ---" - get_dpu_status "$DPU" - echo "" - echo "--- show reboot-cause all ---" - show reboot-cause all 2>/dev/null || true - echo "" - echo "--- show reboot-cause history $DPU ---" - show reboot-cause history "$DPU" 2>/dev/null || true - echo "" - echo "--- previous-reboot-cause.json ---" - get_previous_reboot_cause_json "$DPU" - echo "" - echo "--- History file count ---" - get_reboot_history_count "$DPU" - echo "" - echo "--- CHASSIS_STATE_DB keys ---" - get_db_reboot_cause_keys "$DPU" - echo "" -} - -check_syslog_for() { - local pattern=$1 - local since=${2:-"10 minutes ago"} - local result="" - # Try journalctl filtered by chassisd identifier - result=$(journalctl -t chassisd --since "$since" 2>/dev/null | grep -i "$pattern" | tail -5) - if [[ -n "$result" ]]; then echo "$result"; return; fi - # Try journalctl unfiltered (in case identifier differs) - result=$(journalctl --since "$since" 2>/dev/null | grep -i "chassisd" | grep -i "$pattern" | tail -5) - if [[ -n "$result" ]]; then echo "$result"; return; fi - # Try host /var/log/syslog filtered by chassisd with time filtering. - # SONiC syslog format: "2026 May 23 20:15:26.123456 hostname ..." - if [[ -f /var/log/syslog ]]; then - # Convert 'since' to numeric timestamp (YYYYMMDDHHmmss) for comparison - local since_numeric - since_numeric=$(date -d "$since" +%Y%m%d%H%M%S 2>/dev/null || echo 0) - if [[ "$since_numeric" != "0" && ${#since_numeric} -eq 14 ]]; then - # Pure awk comparison — no external commands per line - result=$(grep -i "chassisd" /var/log/syslog 2>/dev/null | grep -i "$pattern" | \ - awk -v since_ts="$since_numeric" ' - BEGIN { - mon["Jan"]=1; mon["Feb"]=2; mon["Mar"]=3; mon["Apr"]=4 - mon["May"]=5; mon["Jun"]=6; mon["Jul"]=7; mon["Aug"]=8 - mon["Sep"]=9; mon["Oct"]=10; mon["Nov"]=11; mon["Dec"]=12 - } - { - # Parse "YYYY Mon DD HH:MM:SS.usec" from fields 1-4 - year = $1+0; m = mon[$2]+0; day = $3+0 - split($4, t, /[:.]/) - h = t[1]+0; mi = t[2]+0; s = t[3]+0 - if (year > 0 && m > 0) { - line_ts = sprintf("%04d%02d%02d%02d%02d%02d", year, m, day, h, mi, s) - if (line_ts >= since_ts) print - } - }' | tail -5) - if [[ -n "$result" ]]; then echo "$result"; return; fi - # Time filtering was applied — trust the result (even if empty). - # Do NOT fall through to unfiltered grep, which would defeat - # "should not find" assertions in Test 5. - return - fi - # since_numeric could not be computed — use unfiltered grep as last resort - result=$(grep -i "chassisd" /var/log/syslog 2>/dev/null | grep -i "$pattern" | tail -5) - if [[ -n "$result" ]]; then echo "$result"; return; fi - fi - # Try inside pmon container - result=$(docker exec "$PMON_CONTAINER" cat /var/log/syslog 2>/dev/null | grep -i "$pattern" | tail -5) - if [[ -n "$result" ]]; then echo "$result"; return; fi - # Try supervisord stdout log inside pmon container - result=$(docker exec "$PMON_CONTAINER" cat /var/log/supervisor/supervisord.log 2>/dev/null | grep -i "$pattern" | tail -5) - if [[ -n "$result" ]]; then echo "$result"; return; fi -} - -restart_chassisd() { - log_info "Restarting pmon (chassisd)..." - systemctl restart pmon - sleep 15 - log_info "pmon restarted" -} - -# --------------------------------------------------------------------------- -# Pre-flight checks -# --------------------------------------------------------------------------- -log_step "Pre-flight checks" - -if [[ $EUID -ne 0 ]]; then - log_error "This script must be run as root (sudo)" - exit 1 -fi - -find_chassisd_path -log_info "Found chassisd at: $PMON_CONTAINER:$CHASSISD_PATH" - -if [[ ! -f "$PATCHED_CHASSISD" ]]; then - log_error "Patched chassisd not found at $PATCHED_CHASSISD" - log_error "Please copy it first:" - log_error " scp /sonic-chassisd/scripts/chassisd admin@:$PATCHED_CHASSISD" - log_error "Run '$SCRIPT_NAME -h' for help." - exit 1 -fi - -# Verify DPU exists -DPU_STATUS=$(get_dpu_status "$DPU") -if [[ "$DPU_STATUS" == "Unknown" ]]; then - log_error "$DPU not found in STATE_DB. Is this a smartswitch platform?" - exit 1 -fi -log_info "$DPU current status: $DPU_STATUS" - -# --------------------------------------------------------------------------- -# Install patched chassisd -# --------------------------------------------------------------------------- -log_step "Installing patched chassisd" - -docker cp "$PMON_CONTAINER:$CHASSISD_PATH" "$BACKUP_CHASSISD" -log_info "Backup saved to $BACKUP_CHASSISD" - -docker cp "$PATCHED_CHASSISD" "$PMON_CONTAINER:$CHASSISD_PATH" -log_info "Patched chassisd installed" - -restart_chassisd - -log_step "Ensuring $DPU is online before tests" -wait_for_dpu_online "$DPU" 600 || { - log_error "Cannot proceed: $DPU is not online" - exit 1 -} - -# Record initial state -INITIAL_HISTORY_COUNT=$(get_reboot_history_count "$DPU") -log_info "Initial history file count: $INITIAL_HISTORY_COUNT" - -TIMESTAMP_START=$(date '+%Y-%m-%d %H:%M:%S') - -# =========================================================================== -# Test 1: Normal ONLINE → OFFLINE → ONLINE transition -# =========================================================================== -log_step "Test 1: Normal ONLINE → OFFLINE → ONLINE reboot cause recording" - -snapshot_reboot_state "Test 1 — BEFORE" - -CAUSE_BEFORE=$(get_previous_reboot_cause_json "$DPU") -HISTORY_BEFORE=$(get_reboot_history_count "$DPU") - -log_info "Issuing 'reboot -d $DPU'..." -reboot -d "$DPU" & - -log_info "Waiting for $DPU to go offline..." -wait_for_dpu_offline "$DPU" 300 || { - fail "Test 1: $DPU did not go offline after reboot" -} - -log_info "Waiting for $DPU to come back online..." -wait_for_dpu_online "$DPU" 600 || { - fail "Test 1: $DPU did not come back online" -} - -# Give chassisd a few poll cycles -sleep 30 - -snapshot_reboot_state "Test 1 — AFTER" - -CAUSE_AFTER=$(get_previous_reboot_cause_json "$DPU") -HISTORY_AFTER=$(get_reboot_history_count "$DPU") - -# Verify reboot cause was recorded -if [[ "$CAUSE_AFTER" != "$CAUSE_BEFORE" ]]; then - pass "Test 1: Reboot cause updated (normal transition)" -else - fail "Test 1: Reboot cause did NOT change after normal reboot" -fi - -# Verify history file count increased (capped at MAX_HISTORY_FILES) -if (( HISTORY_BEFORE < MAX_HISTORY_FILES )); then - if (( HISTORY_AFTER > HISTORY_BEFORE )); then - pass "Test 1: History file count increased ($HISTORY_BEFORE -> $HISTORY_AFTER)" - else - fail "Test 1: History file count did not increase ($HISTORY_BEFORE -> $HISTORY_AFTER)" - fi -else - if (( HISTORY_AFTER == MAX_HISTORY_FILES )); then - pass "Test 1: History file count at max ($HISTORY_AFTER), oldest rotated out" - else - fail "Test 1: History file count unexpected ($HISTORY_BEFORE -> $HISTORY_AFTER)" - fi -fi - -# Verify DB entry exists -DB_KEYS=$(get_db_reboot_cause_keys "$DPU") -if [[ -n "$DB_KEYS" ]]; then - pass "Test 1: CHASSIS_STATE_DB has reboot cause entries" -else - fail "Test 1: No reboot cause entries in CHASSIS_STATE_DB" -fi - -# Check syslog for the offline transition log -OFFLINE_LOG=$(check_syslog_for "operational status transitioning to offline" "$TIMESTAMP_START") -if [[ -n "$OFFLINE_LOG" ]]; then - pass "Test 1: Syslog shows offline transition" -else - fail "Test 1: No offline transition logged in syslog" -fi - -# Check syslog for the online transition log -ONLINE_LOG=$(check_syslog_for "operational status transitioning to online" "$TIMESTAMP_START") -if [[ -n "$ONLINE_LOG" ]]; then - pass "Test 1: Syslog shows online transition" -else - fail "Test 1: No online transition logged in syslog" -fi - -# =========================================================================== -# Test 2: Config reload during DPU reboot (deferred EMPTY→OFFLINE→ONLINE) -# =========================================================================== -log_step "Test 2: Config reload during DPU reboot (deferred cause check)" - -log_info "Ensuring $DPU is online..." -wait_for_dpu_online "$DPU" 600 || { - fail "Test 2: $DPU not online, cannot proceed" -} - -TIMESTAMP_T2=$(date '+%Y-%m-%d %H:%M:%S') - -CAUSE_BEFORE_T2=$(get_previous_reboot_cause_json "$DPU") -HISTORY_BEFORE_T2=$(get_reboot_history_count "$DPU") - -log_info "Issuing 'reboot -d $DPU'..." -reboot -d "$DPU" & - -log_info "Waiting ${CONFIG_RELOAD_DELAY}s before config reload..." -sleep "$CONFIG_RELOAD_DELAY" - -log_info "Issuing 'config reload -y' (STATE_DB will be flushed, chassisd restarts)..." -config reload -y &>/dev/null & - -log_info "Waiting for system to recover..." -sleep 30 -wait_for_system_ready 300 || true - -# After config reload, chassisd restarts. It will see EMPTY→OFFLINE for the -# DPU (STATE_DB was flushed). With the deferred check, it should NOT call -# get_reboot_cause now — it should wait until the DPU comes online. - -log_info "Waiting for $DPU to come back online..." -wait_for_dpu_online "$DPU" 600 || { - fail "Test 2: $DPU did not come back online after config reload + reboot" -} - -# Give chassisd poll cycles to detect the online transition and run deferred check -sleep 30 - -snapshot_reboot_state "Test 2 — AFTER" - -CAUSE_AFTER_T2=$(get_previous_reboot_cause_json "$DPU") -HISTORY_AFTER_T2=$(get_reboot_history_count "$DPU") - -# The deferred check should have detected the cause changed and persisted it -if [[ "$CAUSE_AFTER_T2" != "$CAUSE_BEFORE_T2" ]]; then - pass "Test 2: Reboot cause updated after deferred check (config reload during reboot)" -else - # The cause might be the same type ("Power Loss") but the timestamp should differ. - # Check history count instead (capped at MAX_HISTORY_FILES). - if (( HISTORY_BEFORE_T2 < MAX_HISTORY_FILES && HISTORY_AFTER_T2 > HISTORY_BEFORE_T2 )); then - pass "Test 2: New history entry created after deferred check" - elif (( HISTORY_BEFORE_T2 >= MAX_HISTORY_FILES && HISTORY_AFTER_T2 == MAX_HISTORY_FILES )); then - pass "Test 2: History at max ($MAX_HISTORY_FILES), oldest rotated (deferred check)" - else - fail "Test 2: Reboot cause NOT updated after config reload during reboot" - fi -fi - -# Check for syslog messages. -# The deferred=='reboot' path (Online→Offline→Online after config reload) -# will persist the cause unless it was already persisted after the reboot time. -# - "Reboot cause changed while chassisd was down" only appears for deferred=='restart'. -# - "Reboot cause already persisted" appears when the cause was already recorded. -# Neither is guaranteed in Test 2, so treat as informational. -DEFERRED_LOG=$(check_syslog_for "Reboot cause changed while chassisd was down" "$TIMESTAMP_T2") -ALREADY_PERSISTED_LOG=$(check_syslog_for "Reboot cause already persisted after reboot" "$TIMESTAMP_T2") -if [[ -n "$DEFERRED_LOG" ]]; then - pass "Test 2: Syslog shows deferred reboot cause detection (restart path)" - log_info " $DEFERRED_LOG" -elif [[ -n "$ALREADY_PERSISTED_LOG" ]]; then - pass "Test 2: Syslog shows cause already persisted (same cause, recent — expected)" - log_info " $ALREADY_PERSISTED_LOG" -else - log_info "Test 2: No deferred-check syslog (expected for normal deferred reboot path)" -fi - -# Check that the online transition was logged after config reload. -ONLINE_LOG_T2=$(check_syslog_for "operational status transitioning to online" "$TIMESTAMP_T2") -ONLINE_COUNT=$(echo "$ONLINE_LOG_T2" | grep -c . 2>/dev/null || echo 0) -log_info "Online transition log count since test start: $ONLINE_COUNT" - -# =========================================================================== -# Test 3: Config reload with DPU already stable offline (same cause) -# =========================================================================== -log_step "Test 3: Config reload with DPU already stable offline (no duplicate)" - -log_info "Ensuring $DPU is online..." -wait_for_dpu_online "$DPU" 600 || { - skip "Test 3: $DPU not online, skipping" -} - -# Reboot the DPU and let it fully come back to create a known reboot cause -log_info "Rebooting $DPU and waiting for full cycle..." -reboot -d "$DPU" & -wait_for_dpu_offline "$DPU" 300 || true -wait_for_dpu_online "$DPU" 600 || { - fail "Test 3: $DPU did not recover" -} -sleep 30 - -# Now power off DPU so it stays offline -log_info "Powering off $DPU (it will stay offline)..." -config chassis modules shutdown "$DPU" 2>/dev/null || \ - sonic-db-cli CONFIG_DB HSET "CHASSIS_MODULE|$DPU" "admin_status" "down" 2>/dev/null || true -sleep 30 - -TIMESTAMP_T3=$(date '+%Y-%m-%d %H:%M:%S') -HISTORY_BEFORE_T3=$(get_reboot_history_count "$DPU") -CAUSE_BEFORE_T3=$(get_previous_reboot_cause_json "$DPU") - -# Config reload — STATE_DB flushed, chassisd will see EMPTY→OFFLINE for DPU -# But DPU stays offline this time (admin down), so the deferred check should -# remain pending and no reboot cause should be persisted. -log_info "Issuing 'config reload -y'..." -config reload -y &>/dev/null & -sleep 30 -wait_for_system_ready 300 || true - -# DPU should still be offline (admin down) -sleep 30 -DPU_STATUS_T3=$(get_dpu_status "$DPU") -log_info "$DPU status after config reload: $DPU_STATUS_T3" - -HISTORY_AFTER_T3=$(get_reboot_history_count "$DPU") -CAUSE_AFTER_T3=$(get_previous_reboot_cause_json "$DPU") - -# Note: admin-shutdown is a RUNTIME command — not persisted to config_db. -# After config reload, DPU comes back online (config says admin_status=up). -# The deferred check fires immediately and detects the admin-shutdown power cycle -# via prev_reboot_time.txt timestamp > stored cause timestamp. -# So either: -# - DPU is still offline: no new entry yet (deferred hasn't fired) -# - DPU is already online: new entry persisted (deferred already fired) -if [[ "$DPU_STATUS_T3" == *"Online"* ]]; then - # DPU already came back online — deferred check already fired and persisted - if (( HISTORY_BEFORE_T3 >= MAX_HISTORY_FILES )); then - if (( HISTORY_AFTER_T3 == MAX_HISTORY_FILES )); then - pass "Test 3: History at max after deferred persist (rotation handled)" - else - fail "Test 3: Unexpected history count ($HISTORY_BEFORE_T3 -> $HISTORY_AFTER_T3)" - fi - else - if (( HISTORY_AFTER_T3 == HISTORY_BEFORE_T3 + 1 )); then - pass "Test 3: New entry persisted (DPU came online after config reload)" - else - fail "Test 3: Expected history+1 ($HISTORY_BEFORE_T3 -> $HISTORY_AFTER_T3)" - fi - fi - if [[ "$CAUSE_AFTER_T3" != "$CAUSE_BEFORE_T3" ]]; then - pass "Test 3: previous-reboot-cause.json updated (admin-shutdown is real power cycle)" - else - fail "Test 3: previous-reboot-cause.json should have changed (DPU already online)" - fi -else - # DPU shows offline at check time. However, admin_status=up was restored - # by config reload (runtime shutdown is not persisted), so DPU is being - # brought back up. It may have briefly come online (firing the deferred - # check and persisting) before going to Offline again, or chassisd may - # not have seen the transition yet. Accept BOTH outcomes. - if (( HISTORY_AFTER_T3 == HISTORY_BEFORE_T3 )) || (( HISTORY_BEFORE_T3 >= MAX_HISTORY_FILES && HISTORY_AFTER_T3 == MAX_HISTORY_FILES )); then - pass "Test 3: History count stable while DPU shows offline ($HISTORY_AFTER_T3)" - elif (( HISTORY_AFTER_T3 == HISTORY_BEFORE_T3 + 1 )); then - pass "Test 3: Deferred already fired (DPU briefly came online)" - else - fail "Test 3: Unexpected history count while DPU offline ($HISTORY_BEFORE_T3 -> $HISTORY_AFTER_T3)" - fi - if [[ "$CAUSE_AFTER_T3" == "$CAUSE_BEFORE_T3" ]]; then - pass "Test 3: previous-reboot-cause.json unchanged (deferred still pending)" - else - # Deferred already fired — DPU briefly came online during the wait - pass "Test 3: previous-reboot-cause.json updated (deferred fired early)" - fi -fi - -# Bring DPU back up for next test. -# Note: the deferred flag is set from the config reload above (deferred='restart'). -# When DPU comes online, the deferred check will fire and detect that -# prev_reboot_time.txt (written during admin-shutdown) is newer than the stored -# cause timestamp. Since admin-shutdown + startup is a real power cycle, -# a new reboot cause entry SHOULD be persisted. -log_info "Bringing $DPU back online..." -config chassis modules startup "$DPU" 2>/dev/null || \ - sonic-db-cli CONFIG_DB HSET "CHASSIS_MODULE|$DPU" "admin_status" "up" 2>/dev/null || true -wait_for_dpu_online "$DPU" 600 || log_warn "$DPU did not come back online" -sleep 30 - -# The deferred check should persist a new entry (admin-shutdown is a real power cycle) -HISTORY_AFTER_T3_ONLINE=$(get_reboot_history_count "$DPU") -if (( HISTORY_BEFORE_T3 < MAX_HISTORY_FILES )); then - if (( HISTORY_AFTER_T3_ONLINE == HISTORY_BEFORE_T3 + 1 )); then - pass "Test 3: New entry persisted when DPU came back (admin-shutdown is a real power cycle)" - elif (( HISTORY_AFTER_T3_ONLINE == HISTORY_BEFORE_T3 )); then - fail "Test 3: No new entry — deferred check should detect power cycle via timestamp" - else - fail "Test 3: Unexpected history count ($HISTORY_BEFORE_T3 -> $HISTORY_AFTER_T3_ONLINE)" - fi -else - if (( HISTORY_AFTER_T3_ONLINE == MAX_HISTORY_FILES )); then - pass "Test 3: History at max ($MAX_HISTORY_FILES), rotation handled" - else - fail "Test 3: Unexpected history count at max ($HISTORY_AFTER_T3_ONLINE)" - fi -fi - -# =========================================================================== -# Test 4: Reboot cause history preserved across config reload -# =========================================================================== -log_step "Test 4: Reboot cause history preserved across config reload" - -HISTORY_BEFORE_T4=$(get_reboot_history_count "$DPU") -DB_KEYS_BEFORE_T4=$(get_db_reboot_cause_keys "$DPU" | wc -l) - -log_info "History files before: $HISTORY_BEFORE_T4, DB keys before: $DB_KEYS_BEFORE_T4" - -log_info "Issuing 'config reload -y'..." -config reload -y &>/dev/null & -sleep 30 -wait_for_system_ready 300 || true -wait_for_dpu_online "$DPU" 600 || true -sleep 30 - -HISTORY_AFTER_T4=$(get_reboot_history_count "$DPU") -DB_KEYS_AFTER_T4=$(get_db_reboot_cause_keys "$DPU" | wc -l) - -log_info "History files after: $HISTORY_AFTER_T4, DB keys after: $DB_KEYS_AFTER_T4" - -# History files on disk should be preserved (they live under /host, not in STATE_DB) -# When at max, count stays the same; otherwise it should not decrease. -if (( HISTORY_AFTER_T4 >= HISTORY_BEFORE_T4 )) || (( HISTORY_BEFORE_T4 >= MAX_HISTORY_FILES && HISTORY_AFTER_T4 == MAX_HISTORY_FILES )); then - pass "Test 4: History files preserved across config reload" -else - fail "Test 4: History files lost after config reload ($HISTORY_BEFORE_T4 -> $HISTORY_AFTER_T4)" -fi - -# DB entries should be repopulated by update_dpu_reboot_cause_to_db -if (( DB_KEYS_AFTER_T4 > 0 )); then - pass "Test 4: CHASSIS_STATE_DB reboot cause entries present after config reload" -else - fail "Test 4: No CHASSIS_STATE_DB entries after config reload" -fi - -# =========================================================================== -# Test 5: Deferred EMPTY→OFFLINE→ONLINE with same cause (no duplicate) -# =========================================================================== -log_step "Test 5: Deferred check with same reboot cause (no duplicate entry)" - -log_info "Ensuring $DPU is online..." -wait_for_dpu_online "$DPU" 600 || { - skip "Test 5: $DPU not online, skipping" -} - -# Reboot DPU, let it fully cycle to establish a known reboot cause -log_info "Rebooting $DPU for a clean reboot cause baseline..." -reboot -d "$DPU" & -wait_for_dpu_offline "$DPU" 300 || true -wait_for_dpu_online "$DPU" 600 || { - fail "Test 5: $DPU did not recover from baseline reboot" -} -sleep 30 - -TIMESTAMP_T5=$(date '+%Y-%m-%d %H:%M:%S') -HISTORY_BEFORE_T5=$(get_reboot_history_count "$DPU") -CAUSE_BEFORE_T5=$(get_previous_reboot_cause_json "$DPU") -log_info "Baseline cause: $CAUSE_BEFORE_T5" -log_info "Baseline history count: $HISTORY_BEFORE_T5" - -# Config reload — no DPU reboot this time, DPU just stays online then -# becomes EMPTY→OFFLINE→ONLINE as pmon/chassisd restarts. -# The deferred path fires, but the cause is the SAME as stored (no new reboot -# happened), so nothing should be persisted. -log_info "Issuing 'config reload -y' (no DPU reboot — same cause expected)..." -config reload -y &>/dev/null & -sleep 30 -wait_for_system_ready 300 || true -wait_for_dpu_online "$DPU" 600 || { - fail "Test 5: $DPU did not come back online after config reload" -} -sleep 30 - -HISTORY_AFTER_T5=$(get_reboot_history_count "$DPU") -CAUSE_AFTER_T5=$(get_previous_reboot_cause_json "$DPU") - -if (( HISTORY_AFTER_T5 == HISTORY_BEFORE_T5 )) || (( HISTORY_BEFORE_T5 >= MAX_HISTORY_FILES && HISTORY_AFTER_T5 == MAX_HISTORY_FILES )); then - pass "Test 5: No duplicate history entry (same cause after deferred check)" -else - fail "Test 5: Unexpected history entry created ($HISTORY_BEFORE_T5 -> $HISTORY_AFTER_T5)" -fi - -# Compare cause/comment fields (ignoring timestamp which may differ on re-persist) -CAUSE_FIELDS_BEFORE_T5=$(extract_cause_fields "$CAUSE_BEFORE_T5") -CAUSE_FIELDS_AFTER_T5=$(extract_cause_fields "$CAUSE_AFTER_T5") - -if [[ "$CAUSE_AFTER_T5" == "$CAUSE_BEFORE_T5" ]]; then - pass "Test 5: previous-reboot-cause.json unchanged (same cause, no re-persist)" -elif [[ "$CAUSE_FIELDS_AFTER_T5" == "$CAUSE_FIELDS_BEFORE_T5" ]]; then - # Same cause/comment but different timestamp — chassisd re-persisted unnecessarily - # This is acceptable behavior (not a correctness bug) but worth noting - log_warn "Test 5: previous-reboot-cause.json timestamp changed (same cause re-persisted)" - log_info " Before: $CAUSE_BEFORE_T5" - log_info " After: $CAUSE_AFTER_T5" - pass "Test 5: Reboot cause type unchanged (same cause after deferred check)" -else - fail "Test 5: Reboot cause type changed unexpectedly" - log_info " Before: $CAUSE_BEFORE_T5" - log_info " After: $CAUSE_AFTER_T5" -fi - -# Check syslog — should NOT show "Reboot cause changed" or "New reboot detected" -DEFERRED_LOG_T5=$(check_syslog_for "Reboot cause changed while chassisd was down" "$TIMESTAMP_T5") -if [[ -z "$DEFERRED_LOG_T5" ]]; then - pass "Test 5: No deferred-cause-changed syslog (expected — cause is the same)" -else - fail "Test 5: Unexpected 'Reboot cause changed' in syslog: $DEFERRED_LOG_T5" -fi - -NEW_REBOOT_LOG_T5=$(check_syslog_for "New reboot detected" "$TIMESTAMP_T5") -if [[ -z "$NEW_REBOOT_LOG_T5" ]]; then - pass "Test 5: No 'New reboot detected' syslog (expected — no prev_reboot_time.txt)" -else - fail "Test 5: Unexpected 'New reboot detected' in syslog: $NEW_REBOOT_LOG_T5" -fi - -# =========================================================================== -# Test 6: Back-to-back reboot skips duplicate persist -# =========================================================================== -log_step "Test 6: Back-to-back rapid reboot skips duplicate persist" - -log_info "Ensuring $DPU is online..." -wait_for_dpu_online "$DPU" 600 || { - skip "Test 6: $DPU not online, skipping" -} - -# Reboot the DPU and let it fully cycle (creates a reboot cause entry) -log_info "First reboot: establishing reboot cause..." -reboot -d "$DPU" & -wait_for_dpu_offline "$DPU" 300 || true -wait_for_dpu_online "$DPU" 600 || { - fail "Test 6: $DPU did not recover from first reboot" -} -sleep 30 - -TIMESTAMP_T6=$(date '+%Y-%m-%d %H:%M:%S') -HISTORY_BEFORE_T6=$(get_reboot_history_count "$DPU") -log_info "History count after first reboot: $HISTORY_BEFORE_T6" - -# Immediately reboot again — chassisd should detect the cause was already -# persisted after this reboot time and skip the duplicate. -log_info "Second reboot: rapid back-to-back (duplicate skip expected)..." -reboot -d "$DPU" & -wait_for_dpu_offline "$DPU" 300 || true -wait_for_dpu_online "$DPU" 600 || { - fail "Test 6: $DPU did not recover from second reboot" -} -sleep 30 - -HISTORY_AFTER_T6=$(get_reboot_history_count "$DPU") -log_info "History count after second reboot: $HISTORY_AFTER_T6" - -# The second reboot has a distinct reboot_time (T2 > T1 from first reboot), -# so the deferred='reboot' path correctly detects it as a new event and -# persists it. History should increase by exactly 1 from the second reboot. -if (( HISTORY_BEFORE_T6 < MAX_HISTORY_FILES )); then - EXPECTED_T6=$(( HISTORY_BEFORE_T6 + 1 )) - if (( HISTORY_AFTER_T6 == EXPECTED_T6 )); then - pass "Test 6: History increased by exactly 1 (second reboot correctly persisted)" - elif (( HISTORY_AFTER_T6 == HISTORY_BEFORE_T6 )); then - fail "Test 6: No new history entry — second reboot should be persisted" - else - fail "Test 6: Expected $EXPECTED_T6 history entries, got $HISTORY_AFTER_T6" - fi -else - if (( HISTORY_AFTER_T6 == MAX_HISTORY_FILES )); then - pass "Test 6: History at max ($MAX_HISTORY_FILES), rotation handled correctly" - else - fail "Test 6: Expected $MAX_HISTORY_FILES history entries at max, got $HISTORY_AFTER_T6" - fi -fi - -# Check syslog for duplicate-skip detection (informational). -# In the back-to-back case, each reboot has a distinct reboot_time so both -# legitimately persist a new cause. The "already persisted" message only -# fires when the same reboot's cause was somehow recorded twice (e.g. race). -IS_REBOOT_LOG_T6=$(check_syslog_for "Reboot cause already persisted after reboot" "$TIMESTAMP_T6") -if [[ -n "$IS_REBOOT_LOG_T6" ]]; then - pass "Test 6: Syslog confirms duplicate reboot cause skipped" - log_info " $IS_REBOOT_LOG_T6" -else - log_info "Test 6: No 'already persisted' syslog (expected — each reboot has unique time)" -fi - -# =========================================================================== -# Summary -# =========================================================================== -log_step "Test Summary" - -TOTAL=$(( PASSED + FAILED + SKIPPED )) -echo "" -echo -e " Total: $TOTAL" -echo -e " ${GREEN}Passed: $PASSED${NC}" -echo -e " ${RED}Failed: $FAILED${NC}" -echo -e " ${YELLOW}Skipped: $SKIPPED${NC}" -echo "" - -if (( FAILED > 0 )); then - log_error "Some tests FAILED. Check syslog for chassisd messages:" - log_error " grep -i 'chassisd\\|reboot.cause\\|transitioning' /var/log/syslog | tail -30" -fi - -# --------------------------------------------------------------------------- -# Cleanup prompt -# --------------------------------------------------------------------------- -echo "" -read -p "Restore original chassisd? [y/N] " -r -if [[ "$REPLY" =~ ^[Yy]$ ]]; then - docker cp "$BACKUP_CHASSISD" "$PMON_CONTAINER:$CHASSISD_PATH" - restart_chassisd - log_info "Original chassisd restored" -else - log_info "Patched chassisd left in place at $PMON_CONTAINER:$CHASSISD_PATH" - log_info "To restore later: docker cp $BACKUP_CHASSISD $PMON_CONTAINER:$CHASSISD_PATH && systemctl restart pmon" -fi - -log_step "Done" - -if (( FAILED > 0 )); then - exit 1 -fi -exit 0