diff --git a/sonic-chassisd/scripts/chassisd b/sonic-chassisd/scripts/chassisd index e1f47dedd..7338d868d 100755 --- a/sonic-chassisd/scripts/chassisd +++ b/sonic-chassisd/scripts/chassisd @@ -78,6 +78,11 @@ CHASSIS_MODULE_INFO_HOSTNAME_FIELD = 'hostname' CHASSIS_MODULE_REBOOT_INFO_TABLE = 'CHASSIS_MODULE_REBOOT_INFO_TABLE' CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD = 'timestamp' CHASSIS_MODULE_REBOOT_REBOOT_FIELD = 'reboot' + +# Deferred reboot-cause check reasons stored in _pending_reboot_check +DEFERRED_DPU_REBOOT = 'dpu_reboot' # chassisd witnessed Online→Offline +DEFERRED_CHASSISD_RESTART = 'chassisd_restart' # chassisd found DPU already offline after restart + DEFAULT_LINECARD_REBOOT_TIMEOUT = 180 DEFAULT_DPU_REBOOT_TIMEOUT = 360 MAX_DPU_REBOOT_DURATION = 800 @@ -287,14 +292,14 @@ class ModuleUpdater(logger.Logger): self.chassis_state_db = daemon_base.db_connect("CHASSIS_STATE_DB") if self._is_supervisor(): - self.asic_table = swsscommon.Table(self.chassis_state_db, + self.asic_table = swsscommon.Table(self.chassis_state_db, CHASSIS_FABRIC_ASIC_INFO_TABLE) else: - self.asic_table = swsscommon.Table(self.chassis_state_db, + self.asic_table = swsscommon.Table(self.chassis_state_db, CHASSIS_ASIC_INFO_TABLE) self.hostname_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_HOSTNAME_TABLE) - self.module_reboot_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_REBOOT_INFO_TABLE) + self.module_reboot_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_REBOOT_INFO_TABLE) self.down_modules = {} self.chassis_app_db_clean_sha = None @@ -305,7 +310,7 @@ class ModuleUpdater(logger.Logger): field = line.split('=')[0].strip() if field == "linecard_reboot_timeout": self.linecard_reboot_timeout = int(line.split('=')[1].strip()) - + self.midplane_initialized = try_get(chassis.init_midplane_switch, default=False) if not self.midplane_initialized: self.log_error("Chassisd midplane intialization failed") @@ -407,7 +412,7 @@ class ModuleUpdater(logger.Logger): else: if self.phy_entity_table.get(key) is not None: self.phy_entity_table._del(key) - + # Construct key for down_modules dict. Example down_modules key format: LINE-CARD0| fvs = self.hostname_table.get(key) if isinstance(fvs, list) and fvs[0] is True: @@ -421,7 +426,7 @@ class ModuleUpdater(logger.Logger): if prev_status == ModuleBase.MODULE_STATUS_ONLINE: notOnlineModules.append(key) # Record the time when the module down was detected to track the - # module down time. Used for chassis db cleanup for all asics of the module if the module is down for a + # module down time. Used for chassis db cleanup for all asics of the module if the module is down for a # long time like 30 mins. # All down modules including supervisor are added to the down modules dictionary. This is to help # identifying module operational status change. But the clean up will not be attempted for supervisor @@ -457,12 +462,12 @@ class ModuleUpdater(logger.Logger): self.asic_table.set(asic_key, asic_fvs) # In line card push the hostname of the module and num_asics to the chassis state db. - # The hostname is used as key to access chassis app db entries + # The hostname is used as key to access chassis app db entries if not self._is_supervisor(): module_info_dict = self._get_module_info(my_index) hostname_key = "{}{}".format(ModuleBase.MODULE_TYPE_LINE, int(self.my_slot) - 1) hostname = try_get(device_info.get_hostname, default="None") - hostname_fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_INFO_SLOT_FIELD, str(self.my_slot)), + hostname_fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_INFO_SLOT_FIELD, str(self.my_slot)), (CHASSIS_MODULE_INFO_HOSTNAME_FIELD, hostname), (CHASSIS_MODULE_INFO_NUM_ASICS_FIELD, str(len(module_info_dict[CHASSIS_MODULE_INFO_ASICS])))]) self.hostname_table.set(hostname_key, hostname_fvs) @@ -520,12 +525,12 @@ class ModuleUpdater(logger.Logger): if fvs[CHASSIS_MODULE_REBOOT_REBOOT_FIELD] == "expected": return True return False - + def module_reboot_set_time(self, key): time_now = time.time() fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD, str(time_now))]) self.module_reboot_table.set(key,fvs) - + def is_module_reboot_system_up_expired(self, key): fvs = self.module_reboot_table.get(key) if isinstance(fvs, list) and fvs[0] is True: @@ -537,7 +542,7 @@ class ModuleUpdater(logger.Logger): self.module_reboot_table._del(key) return True return False - + def check_midplane_reachability(self): if not self.midplane_initialized: return @@ -584,7 +589,7 @@ class ModuleUpdater(logger.Logger): elif midplane_access is False and current_midplane_state == 'False': if self.is_module_reboot_system_up_expired(module_key): self.log_warning("Unexpected: Module {} (Slot {}) midplane connectivity is not restored in {} seconds".format(module_key, module.get_slot(), self.linecard_reboot_timeout)) - + # Update db with midplane information fvs = swsscommon.FieldValuePairs([(CHASSIS_MIDPLANE_INFO_IP_FIELD, midplane_ip), (CHASSIS_MIDPLANE_INFO_ACCESS_FIELD, str(midplane_access))]) @@ -718,6 +723,7 @@ class SmartSwitchModuleUpdater(ModuleUpdater): if not self.midplane_initialized: self.log_error("Chassisd midplane intialization failed") + self._pending_reboot_check = {} self.dpu_reboot_timeout = DEFAULT_DPU_REBOOT_TIMEOUT if os.path.isfile(PLATFORM_JSON_FILE): try: @@ -774,69 +780,168 @@ class SmartSwitchModuleUpdater(ModuleUpdater): self.log_error(f"{module}: Failed to read previous-reboot-cause.json: {e}") return None, None - def module_db_update(self): - for module_index in range(0, self.num_modules): - module_info_dict = self._get_module_info(module_index) - if module_info_dict is not None: - key = module_info_dict[CHASSIS_MODULE_INFO_NAME_FIELD] + def _extract_cause(self, reboot_cause): + """Extract the scalar cause string from a reboot_cause that may be a tuple, list, or string.""" + if isinstance(reboot_cause, (tuple, list)): + return reboot_cause[0] + return reboot_cause + + def _is_known_to_offline(self, prev_status, current_status): + """Online (or any non-EMPTY non-OFFLINE) → OFFLINE — chassisd witnessed it.""" + return (prev_status != ModuleBase.MODULE_STATUS_EMPTY + and prev_status != str(ModuleBase.MODULE_STATUS_OFFLINE) + and current_status == str(ModuleBase.MODULE_STATUS_OFFLINE)) + + def _is_empty_to_offline(self, prev_status, current_status): + """EMPTY → OFFLINE — chassisd just (re)started, found DPU already offline.""" + return (prev_status == ModuleBase.MODULE_STATUS_EMPTY + and current_status == str(ModuleBase.MODULE_STATUS_OFFLINE)) + + def _is_back_to_online(self, prev_status, current_status): + """EMPTY/OFFLINE → ONLINE — DPU usable again, safe to call get_reboot_cause().""" + return (prev_status in (ModuleBase.MODULE_STATUS_EMPTY, + str(ModuleBase.MODULE_STATUS_OFFLINE)) + and current_status != str(ModuleBase.MODULE_STATUS_OFFLINE)) + + def persist_dpu_reboot_time_if_needed(self, module): + """Persist the current reboot time to prev_reboot_time.txt if module_base.py + has not already created it for this reboot cycle. + + Detection: compare the time in prev_reboot_time.txt with the stored reboot + cause timestamp (from previous-reboot-cause.json). If the file does not + exist, or its time is <= the stored cause time (stale from a prior reboot), + write the current time. + """ + file_time = self.retrieve_dpu_reboot_time(module) + _, stored_time_str = self.retrieve_dpu_reboot_info(module) - if not key.startswith(ModuleBase.MODULE_TYPE_DPU): - self.log_error("Incorrect module-name {}. Should start with {} ".format(key, - ModuleBase.MODULE_TYPE_DPU)) - continue + if file_time is not None and (stored_time_str is None or file_time > stored_time_str): + # File already has a time newer than the last stored cause — + # module_base.py already created it for this reboot cycle. + return - fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_INFO_DESC_FIELD, module_info_dict[CHASSIS_MODULE_INFO_DESC_FIELD]), - (CHASSIS_MODULE_INFO_SLOT_FIELD, module_info_dict[CHASSIS_MODULE_INFO_SLOT_FIELD]), - (CHASSIS_MODULE_INFO_OPERSTATUS_FIELD, module_info_dict[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD]), - (CHASSIS_MODULE_INFO_SERIAL_FIELD, module_info_dict[CHASSIS_MODULE_INFO_SERIAL_FIELD])]) + # Write current time + time_str = self._get_current_time_str() + path = os.path.join(MODULE_REBOOT_CAUSE_DIR, module.lower(), "prev_reboot_time.txt") + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, 'w') as f: + f.write(time_str) - # Get a copy of the previous operational status of the module - prev_status = self.get_module_current_status(key) - self.module_table.set(key, fvs) + def _handle_module_status_change(self, key, module_index, prev_status, current_status): + """Handle DPU operational-status transitions and reboot-cause tracking.""" + prev_status_empty = prev_status == ModuleBase.MODULE_STATUS_EMPTY + + # Transition to offline from a known non-offline state. + # Defer get_reboot_cause until the DPU is back online — + # querying a powered-off DPU is unreliable. + if self._is_known_to_offline(prev_status, current_status): + self.log_notice(f"{key} operational status transitioning to offline") + self.persist_dpu_reboot_time_if_needed(key) + self._pending_reboot_check[key] = DEFERRED_DPU_REBOOT + return + + # DPU discovered offline after chassisd restart (e.g. config reload + # during an ongoing DPU reboot). prev_status is EMPTY because + # STATE_DB was flushed. Defer reboot-cause check until DPU is online. + if self._is_empty_to_offline(prev_status, current_status): + self._pending_reboot_check[key] = DEFERRED_CHASSISD_RESTART + return - # Get a copy of the current operational status of the module - current_status = module_info_dict[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD] + # Transition to online from EMPTY or OFFLINE + if self._is_back_to_online(prev_status, current_status): + self.log_notice(f"{key} operational status transitioning to online") - # Operational status transitioning to offline - if prev_status != ModuleBase.MODULE_STATUS_EMPTY and prev_status != str(ModuleBase.MODULE_STATUS_OFFLINE) and current_status == str(ModuleBase.MODULE_STATUS_OFFLINE): - self.log_notice("{} operational status transitioning to offline".format(key)) + deferred = self._pending_reboot_check.pop(key, None) - # Persist dpu down time - self.persist_dpu_reboot_time(key) - # persist reboot cause + # EMPTY→ONLINE with no deferred: chassisd restarted while the + # DPU was already online — no reboot happened during this cycle. + # On first boot persist the initial cause; otherwise just + # repopulate CHASSIS_STATE_DB from on-disk data. + if deferred is None and prev_status_empty: + if self._is_first_boot(key): reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause) self.persist_dpu_reboot_cause(reboot_cause, key) - # publish reboot cause to db + self.update_dpu_reboot_cause_to_db(key) + return + + reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause) + current_cause = self._extract_cause(reboot_cause) + stored_cause, stored_time_str = self.retrieve_dpu_reboot_info(key) + + # Deferred from Online→Offline: the DPU went down and is now back. + # Reboot time was already persisted. Skip persisting if the + # cause was already recorded after this reboot's execution time. + if deferred == DEFERRED_DPU_REBOOT: + reboot_time = self.retrieve_dpu_reboot_time(key) + if stored_time_str and reboot_time and stored_time_str >= reboot_time: + self.log_notice(f"{key}: Reboot cause already persisted after reboot at {reboot_time}") self.update_dpu_reboot_cause_to_db(key) + return + self.persist_dpu_reboot_cause(reboot_cause, key) + self.update_dpu_reboot_cause_to_db(key) + return + + # Deferred from EMPTY→Offline: chassisd restarted while DPU was + # offline, now it is online so we can read the real reboot cause. + # Persist if: (a) a reboot execution time exists that is newer than + # the stored cause timestamp (same logic as DEFERRED_DPU_REBOOT), or + # (b) the cause string itself changed. This handles back-to-back + # reboots that produce the same cause type. + if deferred == DEFERRED_CHASSISD_RESTART: + reboot_time = self.retrieve_dpu_reboot_time(key) + should_persist = False + if reboot_time and stored_time_str and stored_time_str < reboot_time: + self.log_notice( + f"{key}: New reboot detected (stored time {stored_time_str} " + f"< reboot time {reboot_time})") + should_persist = True + elif stored_cause is not None and current_cause and current_cause != stored_cause: + self.log_notice( + f"{key}: Reboot cause changed while chassisd was down " + f"(stored: {stored_cause}, current: {current_cause})") + should_persist = True + elif stored_cause is None and current_cause: + # First reboot ever — no stored cause yet + should_persist = True + + if should_persist: + self.persist_dpu_reboot_cause(reboot_cause, key) + self.update_dpu_reboot_cause_to_db(key) + return - elif (prev_status == ModuleBase.MODULE_STATUS_EMPTY or prev_status == str(ModuleBase.MODULE_STATUS_OFFLINE)) and current_status != str(ModuleBase.MODULE_STATUS_OFFLINE): - self.log_notice(f"{key} operational status transitioning to online") + # OFFLINE→ONLINE with no deferred (edge case). + already_persisted = False + reboot_time = self.retrieve_dpu_reboot_time(key) + if stored_time_str and reboot_time and stored_time_str >= reboot_time: + self.log_notice(f"{key}: Reboot cause already persisted after reboot at {reboot_time}") + already_persisted = True - reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause) - if isinstance(reboot_cause, (tuple, list)): - current_cause = reboot_cause[0] - else: - current_cause = reboot_cause - - stored_cause, stored_time_str = self.retrieve_dpu_reboot_info(key) - - is_reboot = False - if current_cause and stored_cause and stored_time_str: - try: - stored_dt = datetime.strptime(stored_time_str, "%Y_%m_%d_%H_%M_%S").replace(tzinfo=timezone.utc) - now = datetime.now(timezone.utc) - delta_sec = (now - stored_dt).total_seconds() - - if current_cause == stored_cause and delta_sec < MAX_DPU_REBOOT_DURATION: - self.log_info(f"{key}: is_reboot=True — same reboot cause within {int(delta_sec)}s") - is_reboot = True - except Exception as e: - self.log_error(f"{key}: Reboot cause/time comparison failed: {e}") - - if not is_reboot and (stored_time_str is not None or self._is_first_boot(key)): - # persist reboot cause and publish to db - self.persist_dpu_reboot_cause(reboot_cause, key) - self.update_dpu_reboot_cause_to_db(key) + if not already_persisted and (stored_time_str is not None or self._is_first_boot(key)): + self.persist_dpu_reboot_cause(reboot_cause, key) + self.update_dpu_reboot_cause_to_db(key) + + def module_db_update(self): + for module_index in range(0, self.num_modules): + module_info_dict = self._get_module_info(module_index) + if module_info_dict is None: + continue + + key = module_info_dict[CHASSIS_MODULE_INFO_NAME_FIELD] + if not key.startswith(ModuleBase.MODULE_TYPE_DPU): + self.log_error("Incorrect module-name {}. Should start with {} ".format(key, + ModuleBase.MODULE_TYPE_DPU)) + continue + + fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_INFO_DESC_FIELD, module_info_dict[CHASSIS_MODULE_INFO_DESC_FIELD]), + (CHASSIS_MODULE_INFO_SLOT_FIELD, module_info_dict[CHASSIS_MODULE_INFO_SLOT_FIELD]), + (CHASSIS_MODULE_INFO_OPERSTATUS_FIELD, module_info_dict[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD]), + (CHASSIS_MODULE_INFO_SERIAL_FIELD, module_info_dict[CHASSIS_MODULE_INFO_SERIAL_FIELD])]) + + prev_status = self.get_module_current_status(key) + self.module_table.set(key, fvs) + current_status = module_info_dict[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD] + + self._handle_module_status_change(key, module_index, prev_status, current_status) def _get_module_info(self, module_index): """ @@ -933,15 +1038,6 @@ class SmartSwitchModuleUpdater(ModuleUpdater): except FileNotFoundError: return False - def persist_dpu_reboot_time(self, module): - """Persist the current reboot time to a file.""" - time_str = self._get_current_time_str() - path = os.path.join(MODULE_REBOOT_CAUSE_DIR, module.lower(), "prev_reboot_time.txt") - - os.makedirs(os.path.dirname(path), exist_ok=True) - with open(path, 'w') as f: - f.write(time_str) - def retrieve_dpu_reboot_time(self, module): """Retrieve the persisted reboot time from a file.""" path = os.path.join(MODULE_REBOOT_CAUSE_DIR, module.lower(), "prev_reboot_time.txt") diff --git a/sonic-chassisd/tests/test_chassisd.py b/sonic-chassisd/tests/test_chassisd.py index 81c1555b1..c76e9d048 100644 --- a/sonic-chassisd/tests/test_chassisd.py +++ b/sonic-chassisd/tests/test_chassisd.py @@ -175,7 +175,7 @@ def test_moduleupdater_check_phyentity_entry_after_fabric_removal(): module_updater.module_db_update() fvs = module_updater.phy_entity_table.get(name) assert fvs == None - + def test_smartswitch_moduleupdater_check_valid_fields(): chassis = MockSmartSwitchChassis() index = 0 @@ -221,8 +221,12 @@ def test_smartswitch_moduleupdater_status_transitions(): # Create the updater module_updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + # Initial poll to populate prev_status as ONLINE + module_updater.module_db_update() + # Mock dependent methods with patch.object(module_updater, 'retrieve_dpu_reboot_info', return_value=("Switch rebooted DPU", "2023_01_01_00_00_00")) as mock_reboot_info, \ + patch.object(module_updater, 'retrieve_dpu_reboot_time', return_value="2025_01_01_00_00_00"), \ patch.object(module_updater, '_is_first_boot', return_value=False) as mock_is_first_boot, \ patch.object(module_updater, 'persist_dpu_reboot_cause') as mock_persist_reboot_cause, \ patch.object(module_updater, 'update_dpu_reboot_cause_to_db') as mock_update_reboot_db, \ @@ -230,11 +234,15 @@ def test_smartswitch_moduleupdater_status_transitions(): patch("builtins.open", mock_open()) as mock_file, \ patch.object(module_updater, '_get_history_path', return_value="/tmp/prev_reboot_time.txt") as mock_get_history_path: - # Transition from ONLINE to OFFLINE + # Transition from ONLINE to OFFLINE — should defer reboot cause + # query until DPU is back online (DPU is powered off). Issue #26254. offline_status = ModuleBase.MODULE_STATUS_OFFLINE module.set_oper_status(offline_status) module_updater.module_db_update() assert module.get_oper_status() == offline_status + mock_persist_reboot_cause.assert_not_called() + mock_update_reboot_db.assert_not_called() + assert name in module_updater._pending_reboot_check # Reset mocks for next transition mock_file.reset_mock() @@ -242,44 +250,423 @@ def test_smartswitch_moduleupdater_status_transitions(): mock_persist_reboot_cause.reset_mock() mock_update_reboot_db.reset_mock() - # Ensure ONLINE transition is handled correctly + # Ensure ONLINE transition persists the deferred reboot cause online_status = ModuleBase.MODULE_STATUS_ONLINE module.set_oper_status(online_status) module_updater.module_db_update() assert module.get_oper_status() == online_status + assert name not in module_updater._pending_reboot_check # Validate mock calls for ONLINE transition mock_persist_reboot_cause.assert_called_once() mock_update_reboot_db.assert_called_once() +def test_admin_shutdown_does_not_query_reboot_cause(): + """ + When a DPU is powered off via 'config chassis modules shutdown', + chassisd must NOT call get_reboot_cause while the DPU is offline. + Regression test for issue #26254. + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + + # First poll: DPU is online + updater.module_db_update() + + with patch.object(module, 'get_reboot_cause') as mock_get_cause, \ + patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ + patch.object(updater, 'persist_dpu_reboot_time_if_needed') as mock_persist_time, \ + patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: + + # Admin shutdown: DPU goes offline + module.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE) + updater.module_db_update() + + # get_reboot_cause must NOT be called while DPU is off + mock_get_cause.assert_not_called() + # Cause should be deferred + mock_persist_cause.assert_not_called() + mock_update_db.assert_not_called() + assert name in updater._pending_reboot_check + +def test_persist_dpu_reboot_time_if_needed_creates_file_when_missing(): + """ + When prev_reboot_time.txt does not exist (module_base.py hasn't created it), + persist_dpu_reboot_time_if_needed should write the current time. + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + + with patch.object(updater, 'retrieve_dpu_reboot_time', return_value=None), \ + patch.object(updater, 'retrieve_dpu_reboot_info', return_value=(None, None)), \ + patch("os.makedirs") as mock_makedirs, \ + patch("builtins.open", mock_open()) as mock_file: + + updater.persist_dpu_reboot_time_if_needed(name) + + mock_makedirs.assert_called_once() + mock_file.assert_called_once() + +def test_persist_dpu_reboot_time_if_needed_skips_when_file_newer(): + """ + When prev_reboot_time.txt already has a time newer than the stored + reboot cause timestamp, module_base.py already created it — do not overwrite. + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + + # File time is newer than stored cause time — module_base.py already handled it + with patch.object(updater, 'retrieve_dpu_reboot_time', return_value="2025_06_01_12_00_00"), \ + patch.object(updater, 'retrieve_dpu_reboot_info', return_value=("Power Loss", "2025_06_01_10_00_00")), \ + patch("os.makedirs") as mock_makedirs, \ + patch("builtins.open", mock_open()) as mock_file: + + updater.persist_dpu_reboot_time_if_needed(name) + + # Should NOT write — file is already up to date + mock_file.assert_not_called() + +def test_persist_dpu_reboot_time_if_needed_writes_when_file_stale(): + """ + When prev_reboot_time.txt has a time <= the stored reboot cause timestamp, + the file is stale from a prior reboot — write a new time. + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + + # File time is older than stored cause time — stale, needs update + with patch.object(updater, 'retrieve_dpu_reboot_time', return_value="2025_05_01_08_00_00"), \ + patch.object(updater, 'retrieve_dpu_reboot_info', return_value=("Power Loss", "2025_06_01_10_00_00")), \ + patch("os.makedirs") as mock_makedirs, \ + patch("builtins.open", mock_open()) as mock_file: + + updater.persist_dpu_reboot_time_if_needed(name) + + mock_makedirs.assert_called_once() + mock_file.assert_called_once() + def test_online_transition_skips_reboot_update(): + """ + EMPTY→ONLINE with no deferred and no first-boot marker: chassisd + restarted while the DPU was already online. No reboot happened, + so persist_dpu_reboot_cause must NOT be called. + update_dpu_reboot_cause_to_db SHOULD be called to repopulate + CHASSIS_STATE_DB from on-disk history files. + """ chassis = MockSmartSwitchChassis() index = 0 name = "DPU0" module = MockModule(index, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN123") - module.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE) + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) chassis.module_list.append(module) updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) - # Mock the module going ONLINE + with patch.object(updater, '_is_first_boot', return_value=False), \ + patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist, \ + patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update: + + updater.module_db_update() + + # No persist — no reboot happened + mock_persist.assert_not_called() + # DB should be repopulated from on-disk history + mock_update.assert_called_once_with(name) + +def test_empty_to_online_first_boot_persists(): + """ + On first boot, EMPTY→ONLINE with no deferred should persist the + initial reboot cause (the _is_first_boot marker file is present). + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + + with patch.object(module, 'get_reboot_cause', return_value=("Power Loss", "N/A")), \ + patch.object(updater, '_is_first_boot', return_value=True), \ + patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ + patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: + + updater.module_db_update() - with patch.object(updater, 'retrieve_dpu_reboot_info', - return_value=("Switch rebooted DPU", datetime.now(timezone.utc).strftime("%Y_%m_%d_%H_%M_%S"))), \ - patch.object(module, 'get_reboot_cause', return_value="Switch rebooted DPU"), \ + mock_persist_cause.assert_called_once_with(("Power Loss", "N/A"), name) + mock_update_db.assert_called_once_with(name) + +def test_empty_to_online_no_spurious_persist(): + """ + After config reload (no DPU reboot), DPU is immediately online. + EMPTY→ONLINE with no deferred must NOT persist a new reboot cause. + Regression test for chartsai-nvidia's bug report on PR #771. + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + + with patch.object(module, 'get_reboot_cause', return_value="Non-Hardware"), \ + patch.object(updater, 'retrieve_dpu_reboot_info', + return_value=("Non-Hardware", "2025_01_01_00_00_00")), \ patch.object(updater, '_is_first_boot', return_value=False), \ - patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist, \ - patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update, \ - patch("builtins.open", mock_open()), \ - patch("os.makedirs"), \ - patch.object(updater, '_get_history_path', return_value="/tmp/fake.json"): + patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ + patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: updater.module_db_update() - # Ensure no reboot update due to is_reboot = True - mock_persist.assert_not_called() - mock_update.assert_not_called() + # No persist — DPU didn't reboot, chassisd just restarted + mock_persist_cause.assert_not_called() + # DB should still be repopulated + mock_update_db.assert_called_once_with(name) + +def test_deferred_reboot_skips_already_persisted(): + """ + Deferred='reboot' path: if the stored reboot cause timestamp is at + or after the reboot execution time (from prev_reboot_time.txt), the + cause was already persisted for this reboot — skip duplicate persist. + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + + # First poll: establish ONLINE prev_status + updater.module_db_update() + + # Stored cause timestamp is AFTER the reboot time → already persisted + reboot_time = "2025_06_01_10_00_00" + stored_time = "2025_06_01_10_05_00" + + with patch.object(module, 'get_reboot_cause', return_value="Switch rebooted DPU"), \ + patch.object(updater, 'retrieve_dpu_reboot_info', + return_value=("Switch rebooted DPU", stored_time)), \ + patch.object(updater, 'retrieve_dpu_reboot_time', return_value=reboot_time), \ + patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ + patch.object(updater, 'persist_dpu_reboot_time_if_needed') as mock_persist_time, \ + patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: + + # ONLINE→OFFLINE: deferred=DEFERRED_DPU_REBOOT + module.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE) + updater.module_db_update() + assert updater._pending_reboot_check[name] == DEFERRED_DPU_REBOOT + + # OFFLINE→ONLINE: stored_time >= reboot_time → already persisted, skip + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + updater.module_db_update() + assert name not in updater._pending_reboot_check + mock_persist_cause.assert_not_called() + # DB should still be updated to keep it in sync + mock_update_db.assert_called_once_with(name) + +def test_deferred_reboot_persists_new_reboot(): + """ + Deferred='reboot' path: if the stored reboot cause timestamp is + BEFORE the reboot execution time, this is a new reboot whose cause + hasn't been persisted yet — persist it. + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + + # First poll: establish ONLINE prev_status + updater.module_db_update() + + # Stored cause timestamp is BEFORE the reboot time → new reboot + reboot_time = "2025_06_01_10_00_00" + stored_time = "2025_05_01_08_00_00" + + with patch.object(module, 'get_reboot_cause', return_value="Switch rebooted DPU"), \ + patch.object(updater, 'retrieve_dpu_reboot_info', + return_value=("Switch rebooted DPU", stored_time)), \ + patch.object(updater, 'retrieve_dpu_reboot_time', return_value=reboot_time), \ + patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ + patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: + + # ONLINE→OFFLINE: deferred=DEFERRED_DPU_REBOOT + module.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE) + updater.module_db_update() + assert updater._pending_reboot_check[name] == DEFERRED_DPU_REBOOT + + # OFFLINE→ONLINE: stored_time < reboot_time → persist + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + updater.module_db_update() + assert name not in updater._pending_reboot_check + mock_persist_cause.assert_called_once() + mock_update_db.assert_called_once_with(name) + +def test_empty_to_offline_persists_changed_reboot_cause(): + """ + If chassisd restarts (e.g. config reload) while a DPU is offline, + prev_status will be EMPTY (STATE_DB flushed) and current status OFFLINE. + The EMPTY→OFFLINE transition defers get_reboot_cause until the DPU + comes online. When it does come online and the cause differs from + the stored one, the new cause must be recorded. + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") + # DPU is currently offline + module.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + + with patch.object(module, 'get_reboot_cause', return_value=("Power Loss", "power auxiliary outage or reload")), \ + patch.object(updater, 'retrieve_dpu_reboot_info', + return_value=("Reboot", "2025_10_10_07_36_12")), \ + patch.object(updater, 'retrieve_dpu_reboot_time', return_value=None), \ + patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ + patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: + + # EMPTY→OFFLINE: should defer, not persist yet + updater.module_db_update() + assert name in updater._pending_reboot_check + mock_persist_cause.assert_not_called() + mock_update_db.assert_not_called() + + # DPU comes online — deferred check fires, cause changed + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + updater.module_db_update() + assert name not in updater._pending_reboot_check + mock_persist_cause.assert_called_once_with(("Power Loss", "power auxiliary outage or reload"), name) + mock_update_db.assert_called_once_with(name) + +def test_empty_to_offline_persists_same_cause_new_reboot(): + """ + After chassisd restart, if the DPU is offline the EMPTY→OFFLINE + transition defers the check. When the DPU comes online with the + same reboot cause but a reboot_time newer than stored_time, we MUST + persist — this is a new reboot that produced the same cause type. + Fixes sonic-net/sonic-buildimage#24275. + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") + module.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + + # Stored cause is from a prior reboot, reboot_time is from the current reboot + stored_time = "2025_10_10_07_36_12" + reboot_time = "2025_10_10_07_40_34" + + with patch.object(module, 'get_reboot_cause', return_value=("Reboot", "Reset from Main board")), \ + patch.object(updater, 'retrieve_dpu_reboot_info', + return_value=("Reboot", stored_time)), \ + patch.object(updater, 'retrieve_dpu_reboot_time', return_value=reboot_time), \ + patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ + patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: + + # EMPTY→OFFLINE: deferred + updater.module_db_update() + assert name in updater._pending_reboot_check + + # DPU comes online — same cause but reboot_time > stored_time → persist + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + updater.module_db_update() + assert name not in updater._pending_reboot_check + mock_persist_cause.assert_called_once() + mock_update_db.assert_called_once_with(name) + + +def test_empty_to_offline_skips_same_cause_no_new_reboot(): + """ + After chassisd restart, if the DPU is offline the EMPTY→OFFLINE + transition defers the check. When the DPU comes online with the + same reboot cause and NO reboot_time (or stored_time >= reboot_time), + we must NOT create a duplicate entry — no new reboot happened. + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") + module.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + + with patch.object(module, 'get_reboot_cause', return_value=("Reboot", "Reset from Main board")), \ + patch.object(updater, 'retrieve_dpu_reboot_info', + return_value=("Reboot", "2025_10_10_07_36_12")), \ + patch.object(updater, 'retrieve_dpu_reboot_time', return_value=None), \ + patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ + patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: + + # EMPTY→OFFLINE: deferred + updater.module_db_update() + assert name in updater._pending_reboot_check + + # DPU comes online — same cause, no reboot_time → skip + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + updater.module_db_update() + assert name not in updater._pending_reboot_check + mock_persist_cause.assert_not_called() + # DB should still be repopulated (STATE_DB was flushed) + mock_update_db.assert_called_once_with(name) + +def test_empty_to_offline_persists_first_boot(): + """ + On first boot there is no previous-reboot-cause.json so stored_cause + is None. The EMPTY→OFFLINE transition defers, and when the DPU + comes online the deferred path persists the initial cause. + """ + chassis = MockSmartSwitchChassis() + name = "DPU0" + module = MockModule(0, name, "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN0") + module.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE) + chassis.module_list.append(module) + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) + + with patch.object(module, 'get_reboot_cause', return_value=("Power Loss", "N/A")), \ + patch.object(updater, 'retrieve_dpu_reboot_info', + return_value=(None, None)), \ + patch.object(updater, 'retrieve_dpu_reboot_time', return_value=None), \ + patch.object(updater, 'persist_dpu_reboot_cause') as mock_persist_cause, \ + patch.object(updater, 'update_dpu_reboot_cause_to_db') as mock_update_db: + + # EMPTY→OFFLINE: deferred + updater.module_db_update() + assert name in updater._pending_reboot_check + + # DPU comes online — first boot, persist the initial cause + module.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + updater.module_db_update() + assert name not in updater._pending_reboot_check + mock_persist_cause.assert_called_once() + mock_update_db.assert_called_once_with(name) def test_retrieve_dpu_reboot_info_success(): class DummyChassis: @@ -305,6 +692,17 @@ def init_midplane_switch(self): return False # required for SmartSwitchModuleUp assert cause is None assert time_str is None +def test_extract_cause_tuple(): + class DummyChassis: + def get_num_modules(self): return 0 + def init_midplane_switch(self): return False + + updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, DummyChassis()) + assert updater._extract_cause(("Power Loss", "auxiliary outage")) == "Power Loss" + assert updater._extract_cause(["Reboot", "reset"]) == "Reboot" + assert updater._extract_cause("Watchdog") == "Watchdog" + assert updater._extract_cause(None) is None + def test_smartswitch_moduleupdater_check_invalid_name(): chassis = MockSmartSwitchChassis() index = 0 @@ -709,7 +1107,6 @@ def test_smartswitch_module_db_update(): # Call the function to test module_updater.persist_dpu_reboot_cause(reboot_cause, key) module_updater._is_first_boot(name) - module_updater.persist_dpu_reboot_time(name) module_updater.update_dpu_reboot_cause_to_db(name) @@ -830,7 +1227,7 @@ def test_moduleupdater_check_string_slot(): midplane_table = module_updater.midplane_table #Check only one entry in database assert 1 == midplane_table.size() - + def test_midplane_presence_modules(): chassis = MockChassis() @@ -1036,7 +1433,7 @@ def lc_mock_open(*args, **kwargs): @patch('os.path.isfile', MagicMock(return_value=True)) def test_midplane_presence_modules_linecard_reboot(): chassis = MockChassis() - + #Supervisor index = 0 name = "SUPERVISOR0" @@ -1101,7 +1498,7 @@ def test_midplane_presence_modules_linecard_reboot(): assert module.get_midplane_ip() == fvs[CHASSIS_MIDPLANE_INFO_IP_FIELD] assert str(module.is_midplane_reachable()) == fvs[CHASSIS_MIDPLANE_INFO_ACCESS_FIELD] - + #Set access of line-card to Down (to mock midplane connectivity state change) module.set_midplane_reachable(False) # set expected reboot of linecard @@ -1141,9 +1538,9 @@ def test_midplane_presence_modules_linecard_reboot(): if isinstance(fvs, list): fvs = dict(fvs[-1]) assert module.get_midplane_ip() == fvs[CHASSIS_MIDPLANE_INFO_IP_FIELD] - assert str(module.is_midplane_reachable()) == fvs[CHASSIS_MIDPLANE_INFO_ACCESS_FIELD] - assert module_updater.linecard_reboot_timeout == 240 - + assert str(module.is_midplane_reachable()) == fvs[CHASSIS_MIDPLANE_INFO_ACCESS_FIELD] + assert module_updater.linecard_reboot_timeout == 240 + def test_midplane_presence_supervisor(): chassis = MockChassis() @@ -1451,7 +1848,7 @@ def test_daemon_run_smartswitch(): def test_set_initial_dpu_admin_state_up(): """Test set_initial_dpu_admin_state when admin state is up""" chassis = MockSmartSwitchChassis() - + # DPU0 details index = 0 name = "DPU0" @@ -1466,12 +1863,12 @@ def test_set_initial_dpu_admin_state_up(): status = ModuleBase.MODULE_STATUS_ONLINE module.set_oper_status(status) chassis.module_list.append(module) - + # Supervisor ModuleUpdater module_updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) module_updater.module_db_update() module_updater.modules_num_update() - + # ChassisdDaemon setup daemon_chassisd = ChassisdDaemon(SYSLOG_IDENTIFIER, chassis) daemon_chassisd.module_updater = module_updater @@ -1502,7 +1899,7 @@ def test_set_initial_dpu_admin_state_up(): def test_set_initial_dpu_admin_state_empty_offline(): """Test set_initial_dpu_admin_state when admin state is empty and operational state is offline""" chassis = MockSmartSwitchChassis() - + # DPU0 details index = 0 name = "DPU0" @@ -1517,12 +1914,12 @@ def test_set_initial_dpu_admin_state_empty_offline(): status = ModuleBase.MODULE_STATUS_OFFLINE module.set_oper_status(status) chassis.module_list.append(module) - + # Supervisor ModuleUpdater module_updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis) module_updater.module_db_update() module_updater.modules_num_update() - + # ChassisdDaemon setup daemon_chassisd = ChassisdDaemon(SYSLOG_IDENTIFIER, chassis) daemon_chassisd.module_updater = module_updater @@ -1604,7 +2001,7 @@ def test_set_initial_dpu_admin_state_empty_not_offline(): def test_set_initial_dpu_admin_state_exception(): """Test set_initial_dpu_admin_state handles exceptions gracefully""" chassis = MockSmartSwitchChassis() - + # DPU0 details index = 0 name = "DPU0" @@ -1813,7 +2210,7 @@ def test_chassis_db_cleanup(): # Mock hostname table update for the line card LINE-CARD0 hostname = "lc1-host-00" num_asics = 1 - hostname_fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_INFO_SLOT_FIELD, str(lc_slot)), + hostname_fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_INFO_SLOT_FIELD, str(lc_slot)), (CHASSIS_MODULE_INFO_HOSTNAME_FIELD, hostname), (CHASSIS_MODULE_INFO_NUM_ASICS_FIELD, str(num_asics))]) sup_module_updater.hostname_table.set(lc_name, hostname_fvs) @@ -1846,7 +2243,7 @@ def test_chassis_db_cleanup(): # Mock >= CHASSIS_DB_CLEANUP_MODULE_DOWN_PERIOD module down period for LINE-CARD1 down_module_key = lc2_name+"|" assert down_module_key not in sup_module_updater.down_modules.keys() - + sup_module_updater.module_down_chassis_db_cleanup() def test_chassis_db_bootup_with_empty_slot(): @@ -1892,7 +2289,7 @@ def test_chassis_db_bootup_with_empty_slot(): # Supervisor ModuleUpdater sup_module_updater = ModuleUpdater(SYSLOG_IDENTIFIER, chassis, sup_slot, sup_slot) sup_module_updater.modules_num_update() - + sup_module_updater.module_db_update() # check LC1 STATUS ONLINE in module table @@ -1901,14 +2298,14 @@ def test_chassis_db_bootup_with_empty_slot(): fvs = dict(fvs[-1]) assert ModuleBase.MODULE_STATUS_ONLINE == fvs[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD] - # check LC2 STATUS EMPTY in module table + # check LC2 STATUS EMPTY in module table fvs = sup_module_updater.module_table.get(lc2_name) if isinstance(fvs, list): fvs = dict(fvs[-1]) assert ModuleBase.MODULE_STATUS_EMPTY == fvs[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD] # Both should no tbe in down_module keys. - + down_module_lc1_key = lc_name+"|" assert down_module_lc1_key not in sup_module_updater.down_modules.keys() down_module_lc2_key = lc_name+"|"