From 22d09372d9b8d7224244f1ecb2f876b56bb2a0bc Mon Sep 17 00:00:00 2001 From: Dawei Huang Date: Sat, 21 Mar 2026 03:44:32 +0000 Subject: [PATCH 01/12] Add design doc: replace docker exec gnoi_client with native gRPC calls Proposes replacing subprocess-based gnoi_client invocations in gnoi_shutdown_daemon with direct Python gRPC calls. Documents the gnoi_client output format, a parsing bug in RebootStatus polling, and the difficulty of diagnosing RPC failures through Go panic stack traces. Ref: sonic-net/sonic-host-services#360 Signed-off-by: Dawei Huang --- doc/gnoi-native-grpc-design.md | 306 +++++++++++++++++++++++++++++++++ 1 file changed, 306 insertions(+) create mode 100644 doc/gnoi-native-grpc-design.md diff --git a/doc/gnoi-native-grpc-design.md b/doc/gnoi-native-grpc-design.md new file mode 100644 index 00000000..8e7fbb5f --- /dev/null +++ b/doc/gnoi-native-grpc-design.md @@ -0,0 +1,306 @@ +# Design: Replace `docker exec gnoi_client` with Native gRPC Calls + +## 1. Background + +The `gnoi_shutdown_daemon` on SmartSwitch NPU orchestrates graceful DPU shutdown by issuing gNOI `System.Reboot(HALT)` and polling `System.RebootStatus`. Today it does this by shelling out: + +``` +docker exec gnmi gnoi_client -target=<dpu_ip>:<port> -notls -module System -rpc Reboot ... 
+docker exec gnmi gnoi_client -target=<dpu_ip>:<port> -notls -module System -rpc RebootStatus +``` + +This has several problems: + +| Problem | Impact | +|---------|--------| +| Requires the `gnmi` container to be running and healthy | If gnmi container is restarting or unhealthy, DPU shutdown fails silently | +| Subprocess overhead per RPC call | Extra process creation, Docker CLI round-trip, stdout parsing | +| Fragile output parsing | `"reboot complete" in out_s.lower()` breaks on any output format change | +| No structured error handling | gRPC status codes are lost; only `rc != 0` is checked | +| Error output is a Go panic stack trace | Extremely painful to diagnose failures (see §1.1) | +| Security surface | Shell-out through Docker CLI is a wider attack surface than a direct socket | + +### 1.1 gnoi_client Output Format Analysis + +The `gnoi_client` binary in sonic-gnmi is a Go CLI tool. Understanding its output format reveals why the current approach is fragile: + +**Reboot RPC (`-rpc Reboot`):** +- On **success**: prints `"System Reboot\n"` to stdout, exits 0. No structured output. +- On **failure**: calls `panic(err.Error())`, which dumps a **Go panic stack trace** to stderr and exits with a non-zero code. The daemon only checks `rc != 0` — the actual gRPC error code, message, and details are buried in a multi-line panic dump that is not parsed. + +**RebootStatus RPC (`-rpc RebootStatus`):** +- On **success**: prints `"System RebootStatus\n"` header followed by JSON-marshaled `RebootStatusResponse`, e.g.: + ```json + System RebootStatus + {"active":false,"status":{"status":"STATUS_SUCCESS","message":"..."}} + ``` +- On **failure**: same `panic(err.Error())` — Go stack trace, non-zero exit. 
+ +**The parsing bug:** The daemon currently checks: +```python +if rc_s == 0 and out_s and ("reboot complete" in out_s.lower()): + return True +``` +But the actual protobuf `RebootStatusResponse` serialized to JSON contains fields like `"active":false` and `"status":"STATUS_SUCCESS"` — the string `"reboot complete"` never appears in the output. This means the poll loop **always times out** regardless of whether the DPU successfully halted, and the daemon proceeds purely on the timeout path. + +**Why this matters for error diagnosis:** When a gNOI RPC fails (DPU unreachable, TLS mismatch, auth failure, server-side error), the only signal is a Go panic: +``` +panic: rpc error: code = Unavailable desc = connection error: ... + +goroutine 1 [running]: +main.main() + /sonic/gnoi_client/gnoi_client.go:42 +... +``` +The daemon captures this in `err` (stderr) but never logs or inspects it — it just logs `"Reboot command failed"` with no context. Diagnosing production failures requires SSHing into the switch, manually running the docker exec command, and reading Go stack traces. + +## 2. Goal + +Replace the subprocess-based `gnoi_client` invocations with direct Python gRPC calls using generated protobuf stubs for the [OpenConfig gNOI System service](https://github.com/openconfig/gnoi/blob/main/system/system.proto). + +## 3. 
Scope + +### In Scope +- Generate or vendor Python gRPC stubs for `gnoi.system.System` (Reboot, RebootStatus RPCs) +- Create a lightweight `GnoiClient` wrapper class +- Refactor `GnoiRebootHandler._send_reboot_command()` and `_poll_reboot_status()` to use native gRPC +- Remove `execute_command()` helper (becomes unused) +- Update unit tests to mock at the gRPC stub level +- Add `grpcio` and `protobuf` to package dependencies + +### Out of Scope +- TLS/mTLS on the midplane channel (future work; midplane is trusted today) +- Refactoring the daemon's main loop or config DB subscription logic +- Other gNOI services beyond `System` +- Changes to how DPU IP/port are discovered from CONFIG_DB + +## 4. Design + +### 4.1 Phase 1 — Proto Stubs + +Vendor pre-generated Python stubs from the gNOI `system.proto` definition. + +**Files to add:** +``` +host_modules/gnoi/ +├── __init__.py +├── system_pb2.py # generated message classes +└── system_pb2_grpc.py # generated service stubs +``` + +The stubs are generated from: +- https://github.com/openconfig/gnoi/blob/main/system/system.proto +- https://github.com/openconfig/gnoi/blob/main/types/types.proto (dependency) + +Generation command (for reference / CI reproducibility): +```bash +python -m grpc_tools.protoc \ + -I./proto \ + --python_out=host_modules/gnoi \ + --grpc_python_out=host_modules/gnoi \ + system/system.proto types/types.proto +``` + +**Why vendor instead of build-time generation?** +- sonic-host-services has no existing proto compilation infrastructure +- The gNOI System proto is stable (no changes in years) +- Keeps the build simple; can migrate to build-time generation later if more protos are needed + +### 4.2 Phase 2 — GnoiClient Wrapper + +A thin wrapper providing the two RPCs we need: + +```python +# host_modules/gnoi/client.py + +import grpc +from . 
import system_pb2, system_pb2_grpc + +class GnoiClient: + """Lightweight gNOI System service client for DPU communication.""" + + def __init__(self, target: str, timeout: int = 30): + """ + Args: + target: gRPC target in "host:port" format + timeout: Default RPC timeout in seconds + """ + self._channel = grpc.insecure_channel(target) + self._stub = system_pb2_grpc.SystemStub(self._channel) + self._timeout = timeout + + def reboot(self, method: int = 3, message: str = "") -> None: + """ + Send System.Reboot RPC. + + Args: + method: RebootMethod enum value (3 = HALT) + message: Human-readable reason string + + Raises: + grpc.RpcError: on any gRPC failure + """ + request = system_pb2.RebootRequest( + method=method, + message=message, + ) + self._stub.Reboot(request, timeout=self._timeout) + + def reboot_status(self) -> system_pb2.RebootStatusResponse: + """ + Poll System.RebootStatus RPC. + + Returns: + RebootStatusResponse with .active and .wait fields + + Raises: + grpc.RpcError: on any gRPC failure + """ + request = system_pb2.RebootStatusRequest() + return self._stub.RebootStatus(request, timeout=self._timeout) + + def close(self): + """Close the underlying gRPC channel.""" + if self._channel: + self._channel.close() + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() +``` + +### 4.3 Phase 3 — Refactor gnoi_shutdown_daemon + +Replace the two subprocess call sites in `GnoiRebootHandler`: + +#### `_send_reboot_command` (before) +```python +def _send_reboot_command(self, dpu_name, dpu_ip, port): + reboot_cmd = ["docker", "exec", "gnmi", "gnoi_client", ...] + rc, out, err = execute_command(reboot_cmd, ...) 
+ return rc == 0 +``` + +#### `_send_reboot_command` (after) +```python +def _send_reboot_command(self, dpu_name, dpu_ip, port): + try: + with GnoiClient(f"{dpu_ip}:{port}", timeout=REBOOT_RPC_TIMEOUT_SEC) as client: + client.reboot( + method=REBOOT_METHOD_HALT, + message="Triggered by SmartSwitch graceful shutdown" + ) + return True + except grpc.RpcError as e: + logger.log_error(f"{dpu_name}: gNOI Reboot failed: {e.code()} {e.details()}") + return False +``` + +#### `_poll_reboot_status` (before) +```python +def _poll_reboot_status(self, dpu_name, dpu_ip, port): + status_cmd = ["docker", "exec", "gnmi", "gnoi_client", ...] + while time.monotonic() < deadline: + rc_s, out_s, _ = execute_command(status_cmd, ...) + if rc_s == 0 and "reboot complete" in out_s.lower(): + return True +``` + +#### `_poll_reboot_status` (after) +```python +def _poll_reboot_status(self, dpu_name, dpu_ip, port): + deadline = time.monotonic() + _get_halt_timeout() + with GnoiClient(f"{dpu_ip}:{port}", timeout=STATUS_RPC_TIMEOUT_SEC) as client: + while time.monotonic() < deadline: + try: + resp = client.reboot_status() + if not resp.active: + status_str = system_pb2.RebootStatus.Status.Name(resp.status.status) + logger.log_notice(f"{dpu_name}: RebootStatus complete: {status_str} - {resp.status.message}") + return resp.status.status == system_pb2.RebootStatus.Status.STATUS_SUCCESS + except grpc.RpcError as e: + logger.log_warning( + f"{dpu_name}: RebootStatus poll error: code={e.code()} details={e.details()}" + ) + time.sleep(STATUS_POLL_INTERVAL_SEC) + return False +``` + +**Key improvements over the subprocess approach:** +- **Fixes the parsing bug**: checks `resp.active == False` directly instead of the broken `"reboot complete" in stdout` match that never triggers +- **Distinguishes success from failure**: inspects `resp.status.status` enum (`STATUS_SUCCESS` vs `STATUS_FAILURE` vs `STATUS_RETRIABLE_FAILURE`) +- **Actionable error logs**: gRPC errors include status code and details (e.g., 
`code=UNAVAILABLE details=connection refused`) instead of opaque "command failed" + +#### Removals +- `execute_command()` function — no longer needed +- `import subprocess` — no longer needed + +#### Additions +- `import grpc` +- `from host_modules.gnoi.client import GnoiClient` + +### 4.4 Phase 4 — Update Tests + +Current tests mock `execute_command` and check return codes. New tests mock at the gRPC level: + +```python +@mock.patch('gnoi_shutdown_daemon.GnoiClient') +def test_send_reboot_command_success(self, MockClient): + mock_client = MockClient.return_value.__enter__.return_value + # reboot() returns None on success + mock_client.reboot.return_value = None + + result = handler._send_reboot_command("DPU0", "10.0.0.1", "8080") + assert result is True + mock_client.reboot.assert_called_once() + +@mock.patch('gnoi_shutdown_daemon.GnoiClient') +def test_send_reboot_command_failure(self, MockClient): + mock_client = MockClient.return_value.__enter__.return_value + mock_client.reboot.side_effect = grpc.RpcError() + + result = handler._send_reboot_command("DPU0", "10.0.0.1", "8080") + assert result is False +``` + +### 4.5 Dependencies + +| Package | Version | Notes | +|---------|---------|-------| +| `grpcio` | >=1.51.0 | Already in SONiC build environment | +| `protobuf` | >=4.21.0 | Already in SONiC build environment | + +Verify these are available in the sonic-host-services build context. If not, add to `setup.py` `install_requires`. + +## 5. Implementation Plan + +| Phase | Description | PR | +|-------|-------------|----| +| 1 | Vendor gNOI System proto stubs | PR #1 | +| 2 | Add `GnoiClient` wrapper + unit tests | PR #1 (same) | +| 3 | Refactor `gnoi_shutdown_daemon` to use `GnoiClient` | PR #1 (same) | +| 4 | Update existing daemon tests | PR #1 (same) | + +All phases can ship as a single PR since they form one atomic change — the old subprocess path is fully replaced. + +## 6. 
Testing + +- **Unit tests**: Mock gRPC stubs, verify correct protobuf messages are sent, verify error handling for various `grpc.StatusCode` values +- **Integration test**: On a SmartSwitch testbed, trigger `config chassis modules shutdown DPU0` and verify gNOI HALT is sent and RebootStatus is polled successfully via syslog +- **Regression**: Existing CI pipeline covers the daemon; updated mocks ensure no regressions + +## 7. Risks & Mitigations + +| Risk | Mitigation | +|------|------------| +| gRPC/protobuf not available in host environment | Verify during build; these are already used by other SONiC components | +| Proto stub drift from upstream gnoi | Pin to a specific gnoi commit; stubs are stable | +| Insecure channel on midplane | Same trust model as today's `gnoi_client -notls`; TLS is future work | + +## 8. Future Work + +- **TLS support**: Add optional mTLS when midplane security is hardened +- **Build-time proto generation**: If more gNOI/gNMI services are needed, add a proto compilation step +- **Connection pooling**: Reuse gRPC channels across polls instead of creating per-call (minor optimization) From 90df6107e0f154dbe43695d85d5a1d0a150f2807 Mon Sep 17 00:00:00 2001 From: Dawei Huang Date: Sat, 21 Mar 2026 04:05:12 +0000 Subject: [PATCH 02/12] design doc: add reference code for gnoi_client and openconfig system proto Signed-off-by: Dawei Huang --- doc/gnoi-native-grpc-design.md | 226 ++++++++++++++++++++++++++++++++- 1 file changed, 225 insertions(+), 1 deletion(-) diff --git a/doc/gnoi-native-grpc-design.md b/doc/gnoi-native-grpc-design.md index 8e7fbb5f..d65ce94c 100644 --- a/doc/gnoi-native-grpc-design.md +++ b/doc/gnoi-native-grpc-design.md @@ -299,7 +299,231 @@ All phases can ship as a single PR since they form one atomic change — the old | Proto stub drift from upstream gnoi | Pin to a specific gnoi commit; stubs are stable | | Insecure channel on midplane | Same trust model as today's `gnoi_client -notls`; TLS is future work | -## 8. 
Future Work +## 8. Reference Code + +This section provides the upstream source code that this design replaces and builds upon, so the document is self-contained. + +### 8.1 Current `gnoi_client` Entry Point + +**Source:** [`sonic-gnmi/gnoi_client/gnoi_client.go`](https://github.com/sonic-net/sonic-gnmi/blob/master/gnoi_client/gnoi_client.go) + +```go +package main + +import ( + "context" + "os" + "os/signal" + + "github.com/google/gnxi/utils/credentials" + "github.com/sonic-net/sonic-gnmi/gnoi_client/config" + "github.com/sonic-net/sonic-gnmi/gnoi_client/system" + "google.golang.org/grpc" +) + +func main() { + config.ParseFlag() + opts := credentials.ClientCredentials(*config.TargetName) + + ctx, cancel := context.WithCancel(context.Background()) + go func() { + c := make(chan os.Signal, 1) + signal.Notify(c, os.Interrupt) + <-c + cancel() + }() + conn, err := grpc.Dial(*config.Target, opts...) + if err != nil { + panic(err.Error()) + } + + switch *config.Module { + case "System": + switch *config.Rpc { + case "Reboot": + system.Reboot(conn, ctx) + case "RebootStatus": + system.RebootStatus(conn, ctx) + case "SetPackage": + system.SetPackage(conn, ctx) + // ... other RPCs omitted for brevity + } + // ... 
other modules omitted + } +} +``` + +### 8.2 `gnoi_client/system/reboot.go` — Reboot & RebootStatus Implementation + +**Source:** [`sonic-gnmi/gnoi_client/system/reboot.go`](https://github.com/sonic-net/sonic-gnmi/blob/master/gnoi_client/system/reboot.go) + +```go +package system + +import ( + "context" + "encoding/json" + "fmt" + pb "github.com/openconfig/gnoi/system" + "github.com/sonic-net/sonic-gnmi/gnoi_client/config" + "github.com/sonic-net/sonic-gnmi/gnoi_client/utils" + "google.golang.org/grpc" +) + +func Reboot(conn *grpc.ClientConn, ctx context.Context) { + fmt.Println("System Reboot") + ctx = utils.SetUserCreds(ctx) + sc := pb.NewSystemClient(conn) + req := &pb.RebootRequest{} + json.Unmarshal([]byte(*config.Args), req) + _, err := sc.Reboot(ctx, req) + if err != nil { + panic(err.Error()) // ← Error is lost in a Go panic stack trace + } +} + +func RebootStatus(conn *grpc.ClientConn, ctx context.Context) { + fmt.Println("System RebootStatus") + ctx = utils.SetUserCreds(ctx) + sc := pb.NewSystemClient(conn) + req := &pb.RebootStatusRequest{} + resp, err := sc.RebootStatus(ctx, req) + if err != nil { + panic(err.Error()) + } + respstr, err := json.Marshal(resp) + if err != nil { + panic(err.Error()) + } + fmt.Println(string(respstr)) // ← Output that daemon tries to parse +} +``` + +**Key observations:** +- `Reboot()` prints `"System Reboot\n"` on success, panics on failure — no structured output +- `RebootStatus()` prints JSON-serialized `RebootStatusResponse` — the daemon searches for `"reboot complete"` which never appears in this JSON +- Errors use `panic()` which produces Go stack traces instead of parseable error output + +### 8.3 `gnoi_client/system/set_package.go` — SetPackage Implementation + +**Source:** [`sonic-gnmi/gnoi_client/system/set_package.go`](https://github.com/sonic-net/sonic-gnmi/blob/master/gnoi_client/system/set_package.go) + +```go +func SetPackage(conn *grpc.ClientConn, ctx context.Context) { + ctx = utils.SetUserCreds(ctx) + sc 
:= newSystemClient(conn) + + download := &common.RemoteDownload{Path: *url} + pkg := &system.Package{ + Filename: *filename, + Version: *version, + Activate: *activate, + RemoteDownload: download, + } + + req := &system.SetPackageRequest{ + Request: &system.SetPackageRequest_Package{Package: pkg}, + } + + stream, err := sc.SetPackage(ctx) + if err != nil { + return fmt.Errorf("error creating stream: %v", err) + } + stream.Send(req) + stream.CloseSend() + resp, err := stream.CloseAndRecv() + // ... +} +``` + +This is the RPC path that triggered the `too_many_pings` issue fixed in PR #620 — the streaming `SetPackage` call is long-lived and sensitive to keepalive misconfiguration. + +### 8.4 OpenConfig gNOI System Proto Definition + +**Source:** [`openconfig/gnoi/system/system.proto`](https://github.com/openconfig/gnoi/blob/main/system/system.proto) (vendored at `sonic-gnmi/vendor/github.com/openconfig/gnoi/system/system.proto`) + +```protobuf +syntax = "proto3"; +package gnoi.system; + +service System { + rpc Reboot(RebootRequest) returns (RebootResponse) {} + rpc RebootStatus(RebootStatusRequest) returns (RebootStatusResponse) {} + rpc CancelReboot(CancelRebootRequest) returns (CancelRebootResponse) {} + rpc SetPackage(stream SetPackageRequest) returns (SetPackageResponse) {} + rpc KillProcess(KillProcessRequest) returns (KillProcessResponse) {} + rpc Time(TimeRequest) returns (TimeResponse) {} + // ... 
Ping, Traceroute, SwitchControlProcessor omitted +} + +message RebootRequest { + RebootMethod method = 1; + uint64 delay = 2; // Delay in nanoseconds + string message = 3; // Informational reason + repeated types.Path subcomponents = 4; + bool force = 5; +} + +message RebootResponse {} + +enum RebootMethod { + UNKNOWN = 0; + COLD = 1; // Shutdown and restart OS and all hardware + POWERDOWN = 2; // Halt and power down + HALT = 3; // Halt (used for DPU shutdown) + WARM = 4; // Reload configuration only + NSF = 5; // Non-stop-forwarding reboot + POWERUP = 7; // Apply power +} + +message RebootStatusRequest { + repeated types.Path subcomponents = 1; +} + +message RebootStatusResponse { + bool active = 1; // If reboot is active + uint64 wait = 2; // Time left until reboot (ns) + uint64 when = 3; // Reboot time (ns since epoch) + string reason = 4; + uint32 count = 5; + RebootMethod method = 6; + RebootStatus status = 7; // Only meaningful when active = false +} + +message RebootStatus { + enum Status { + STATUS_UNKNOWN = 0; + STATUS_SUCCESS = 1; + STATUS_RETRIABLE_FAILURE = 2; + STATUS_FAILURE = 3; + } + Status status = 1; + string message = 2; +} + +// SetPackage — streaming RPC for software packages +message SetPackageRequest { + oneof request { + Package package = 1; + bytes contents = 2; + types.HashType hash = 3; + } +} + +message Package { + string filename = 1; + string version = 4; + bool activate = 5; + common.RemoteDownload remote_download = 6; +} +``` + +**Key proto details for the Python wrapper:** +- `RebootMethod.HALT = 3` — the method used for DPU graceful shutdown +- `RebootStatusResponse.active == false` with `status.status == STATUS_SUCCESS` indicates successful halt completion +- `SetPackage` is a client-streaming RPC — the only streaming call in our scope + +## 9. 
Future Work - **TLS support**: Add optional mTLS when midplane security is hardened - **Build-time proto generation**: If more gNOI/gNMI services are needed, add a proto compilation step From 8bbf237a5f12b4e9836df5892d37910b12b54042 Mon Sep 17 00:00:00 2001 From: Dawei Huang Date: Sat, 21 Mar 2026 04:06:42 +0000 Subject: [PATCH 03/12] design doc: remove unrelated SetPackage references from reference section Signed-off-by: Dawei Huang --- doc/gnoi-native-grpc-design.md | 57 ++-------------------------------- 1 file changed, 2 insertions(+), 55 deletions(-) diff --git a/doc/gnoi-native-grpc-design.md b/doc/gnoi-native-grpc-design.md index d65ce94c..1c95ba7d 100644 --- a/doc/gnoi-native-grpc-design.md +++ b/doc/gnoi-native-grpc-design.md @@ -344,8 +344,6 @@ func main() { system.Reboot(conn, ctx) case "RebootStatus": system.RebootStatus(conn, ctx) - case "SetPackage": - system.SetPackage(conn, ctx) // ... other RPCs omitted for brevity } // ... other modules omitted @@ -404,41 +402,7 @@ func RebootStatus(conn *grpc.ClientConn, ctx context.Context) { - `RebootStatus()` prints JSON-serialized `RebootStatusResponse` — the daemon searches for `"reboot complete"` which never appears in this JSON - Errors use `panic()` which produces Go stack traces instead of parseable error output -### 8.3 `gnoi_client/system/set_package.go` — SetPackage Implementation - -**Source:** [`sonic-gnmi/gnoi_client/system/set_package.go`](https://github.com/sonic-net/sonic-gnmi/blob/master/gnoi_client/system/set_package.go) - -```go -func SetPackage(conn *grpc.ClientConn, ctx context.Context) { - ctx = utils.SetUserCreds(ctx) - sc := newSystemClient(conn) - - download := &common.RemoteDownload{Path: *url} - pkg := &system.Package{ - Filename: *filename, - Version: *version, - Activate: *activate, - RemoteDownload: download, - } - - req := &system.SetPackageRequest{ - Request: &system.SetPackageRequest_Package{Package: pkg}, - } - - stream, err := sc.SetPackage(ctx) - if err != nil { - return 
fmt.Errorf("error creating stream: %v", err) - } - stream.Send(req) - stream.CloseSend() - resp, err := stream.CloseAndRecv() - // ... -} -``` - -This is the RPC path that triggered the `too_many_pings` issue fixed in PR #620 — the streaming `SetPackage` call is long-lived and sensitive to keepalive misconfiguration. - -### 8.4 OpenConfig gNOI System Proto Definition +### 8.3 OpenConfig gNOI System Proto Definition **Source:** [`openconfig/gnoi/system/system.proto`](https://github.com/openconfig/gnoi/blob/main/system/system.proto) (vendored at `sonic-gnmi/vendor/github.com/openconfig/gnoi/system/system.proto`) @@ -450,10 +414,9 @@ service System { rpc Reboot(RebootRequest) returns (RebootResponse) {} rpc RebootStatus(RebootStatusRequest) returns (RebootStatusResponse) {} rpc CancelReboot(CancelRebootRequest) returns (CancelRebootResponse) {} - rpc SetPackage(stream SetPackageRequest) returns (SetPackageResponse) {} rpc KillProcess(KillProcessRequest) returns (KillProcessResponse) {} rpc Time(TimeRequest) returns (TimeResponse) {} - // ... Ping, Traceroute, SwitchControlProcessor omitted + // ... Ping, Traceroute, SwitchControlProcessor, SetPackage omitted } message RebootRequest { @@ -501,27 +464,11 @@ message RebootStatus { string message = 2; } -// SetPackage — streaming RPC for software packages -message SetPackageRequest { - oneof request { - Package package = 1; - bytes contents = 2; - types.HashType hash = 3; - } -} - -message Package { - string filename = 1; - string version = 4; - bool activate = 5; - common.RemoteDownload remote_download = 6; -} ``` **Key proto details for the Python wrapper:** - `RebootMethod.HALT = 3` — the method used for DPU graceful shutdown - `RebootStatusResponse.active == false` with `status.status == STATUS_SUCCESS` indicates successful halt completion -- `SetPackage` is a client-streaming RPC — the only streaming call in our scope ## 9. 
Future Work From 0bf9f219b81a6b504e371b24d8791dad5ee04d1a Mon Sep 17 00:00:00 2001 From: Dawei Huang Date: Sat, 21 Mar 2026 04:09:19 +0000 Subject: [PATCH 04/12] design doc: address Copilot review comments - Add types_pb2.py to vendored stubs file list (types.proto dependency) - Fix RebootStatus enum access pattern for protobuf Python codegen - Fix test mock to implement grpc.Call interface (code()/details()) Signed-off-by: Dawei Huang --- doc/gnoi-native-grpc-design.md | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/doc/gnoi-native-grpc-design.md b/doc/gnoi-native-grpc-design.md index 1c95ba7d..1622cba0 100644 --- a/doc/gnoi-native-grpc-design.md +++ b/doc/gnoi-native-grpc-design.md @@ -85,7 +85,9 @@ Vendor pre-generated Python stubs from the gNOI `system.proto` definition. host_modules/gnoi/ ├── __init__.py ├── system_pb2.py # generated message classes -└── system_pb2_grpc.py # generated service stubs +├── system_pb2_grpc.py # generated service stubs +├── types_pb2.py # generated types (dependency of system_pb2) +└── types_pb2_grpc.py # generated (empty, no services in types.proto) ``` The stubs are generated from: @@ -217,9 +219,10 @@ def _poll_reboot_status(self, dpu_name, dpu_ip, port): try: resp = client.reboot_status() if not resp.active: - status_str = system_pb2.RebootStatus.Status.Name(resp.status.status) + status_enum = resp.status.status + status_str = system_pb2.RebootStatus.Status.Name(status_enum) logger.log_notice(f"{dpu_name}: RebootStatus complete: {status_str} - {resp.status.message}") - return resp.status.status == system_pb2.RebootStatus.Status.STATUS_SUCCESS + return status_enum == system_pb2.RebootStatus.STATUS_SUCCESS except grpc.RpcError as e: logger.log_warning( f"{dpu_name}: RebootStatus poll error: code={e.code()} details={e.details()}" @@ -259,7 +262,10 @@ def test_send_reboot_command_success(self, MockClient): @mock.patch('gnoi_shutdown_daemon.GnoiClient') def test_send_reboot_command_failure(self, 
MockClient): mock_client = MockClient.return_value.__enter__.return_value - mock_client.reboot.side_effect = grpc.RpcError() + error = mock.create_autospec(grpc.RpcError) + error.code.return_value = grpc.StatusCode.UNAVAILABLE + error.details.return_value = "connection refused" + mock_client.reboot.side_effect = error result = handler._send_reboot_command("DPU0", "10.0.0.1", "8080") assert result is False From bf4c41cb2389ee01e60b10bd38fdf06c123c1c65 Mon Sep 17 00:00:00 2001 From: Dawei Huang Date: Sat, 21 Mar 2026 04:58:58 +0000 Subject: [PATCH 05/12] Address Copilot review: fix stderr description, add packaging note - Correct stderr statement: _send_reboot_command uses suppress_stderr=True, so panic output is suppressed, not captured - Add packaging note: setup.py must include host_modules.gnoi in packages list for vendored stubs to be installed Signed-off-by: Dawei Huang --- doc/gnoi-native-grpc-design.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/gnoi-native-grpc-design.md b/doc/gnoi-native-grpc-design.md index 1622cba0..1c2b2f69 100644 --- a/doc/gnoi-native-grpc-design.md +++ b/doc/gnoi-native-grpc-design.md @@ -52,7 +52,7 @@ main.main() /sonic/gnoi_client/gnoi_client.go:42 ... ``` -The daemon captures this in `err` (stderr) but never logs or inspects it — it just logs `"Reboot command failed"` with no context. Diagnosing production failures requires SSHing into the switch, manually running the docker exec command, and reading Go stack traces. +For the Reboot call, `_send_reboot_command()` invokes `execute_command(..., suppress_stderr=True)`, so this panic output on stderr is suppressed rather than logged or inspected — the daemon just logs `"Reboot command failed"` with no actionable context. Diagnosing production failures requires SSHing into the switch, manually running the docker exec command, and reading Go stack traces. ## 2. 
Goal @@ -103,6 +103,8 @@ python -m grpc_tools.protoc \ system/system.proto types/types.proto ``` +**Packaging note:** `setup.py` currently lists packages explicitly (`['host_modules', 'utils']`). The implementation must add `'host_modules.gnoi'` to the `packages` list and corresponding `package_dir` entry, otherwise the vendored stubs won't be installed and imports will fail in deployed environments. + **Why vendor instead of build-time generation?** - sonic-host-services has no existing proto compilation infrastructure - The gNOI System proto is stable (no changes in years) From 72eb52a4888d284396dbf2637ea74368ba5206b3 Mon Sep 17 00:00:00 2001 From: Dawei Huang Date: Sat, 21 Mar 2026 05:12:39 +0000 Subject: [PATCH 06/12] Replace inline reference code with links to source files Section 8 now links to gnoi_client, reboot.go, and system.proto instead of inlining full source code. Signed-off-by: Dawei Huang --- doc/gnoi-native-grpc-design.md | 172 +-------------------------------- 1 file changed, 4 insertions(+), 168 deletions(-) diff --git a/doc/gnoi-native-grpc-design.md b/doc/gnoi-native-grpc-design.md index 1c2b2f69..b5919de6 100644 --- a/doc/gnoi-native-grpc-design.md +++ b/doc/gnoi-native-grpc-design.md @@ -309,174 +309,10 @@ All phases can ship as a single PR since they form one atomic change — the old ## 8. Reference Code -This section provides the upstream source code that this design replaces and builds upon, so the document is self-contained. 
- -### 8.1 Current `gnoi_client` Entry Point - -**Source:** [`sonic-gnmi/gnoi_client/gnoi_client.go`](https://github.com/sonic-net/sonic-gnmi/blob/master/gnoi_client/gnoi_client.go) - -```go -package main - -import ( - "context" - "os" - "os/signal" - - "github.com/google/gnxi/utils/credentials" - "github.com/sonic-net/sonic-gnmi/gnoi_client/config" - "github.com/sonic-net/sonic-gnmi/gnoi_client/system" - "google.golang.org/grpc" -) - -func main() { - config.ParseFlag() - opts := credentials.ClientCredentials(*config.TargetName) - - ctx, cancel := context.WithCancel(context.Background()) - go func() { - c := make(chan os.Signal, 1) - signal.Notify(c, os.Interrupt) - <-c - cancel() - }() - conn, err := grpc.Dial(*config.Target, opts...) - if err != nil { - panic(err.Error()) - } - - switch *config.Module { - case "System": - switch *config.Rpc { - case "Reboot": - system.Reboot(conn, ctx) - case "RebootStatus": - system.RebootStatus(conn, ctx) - // ... other RPCs omitted for brevity - } - // ... 
other modules omitted - } -} -``` - -### 8.2 `gnoi_client/system/reboot.go` — Reboot & RebootStatus Implementation - -**Source:** [`sonic-gnmi/gnoi_client/system/reboot.go`](https://github.com/sonic-net/sonic-gnmi/blob/master/gnoi_client/system/reboot.go) - -```go -package system - -import ( - "context" - "encoding/json" - "fmt" - pb "github.com/openconfig/gnoi/system" - "github.com/sonic-net/sonic-gnmi/gnoi_client/config" - "github.com/sonic-net/sonic-gnmi/gnoi_client/utils" - "google.golang.org/grpc" -) - -func Reboot(conn *grpc.ClientConn, ctx context.Context) { - fmt.Println("System Reboot") - ctx = utils.SetUserCreds(ctx) - sc := pb.NewSystemClient(conn) - req := &pb.RebootRequest{} - json.Unmarshal([]byte(*config.Args), req) - _, err := sc.Reboot(ctx, req) - if err != nil { - panic(err.Error()) // ← Error is lost in a Go panic stack trace - } -} - -func RebootStatus(conn *grpc.ClientConn, ctx context.Context) { - fmt.Println("System RebootStatus") - ctx = utils.SetUserCreds(ctx) - sc := pb.NewSystemClient(conn) - req := &pb.RebootStatusRequest{} - resp, err := sc.RebootStatus(ctx, req) - if err != nil { - panic(err.Error()) - } - respstr, err := json.Marshal(resp) - if err != nil { - panic(err.Error()) - } - fmt.Println(string(respstr)) // ← Output that daemon tries to parse -} -``` - -**Key observations:** -- `Reboot()` prints `"System Reboot\n"` on success, panics on failure — no structured output -- `RebootStatus()` prints JSON-serialized `RebootStatusResponse` — the daemon searches for `"reboot complete"` which never appears in this JSON -- Errors use `panic()` which produces Go stack traces instead of parseable error output - -### 8.3 OpenConfig gNOI System Proto Definition - -**Source:** [`openconfig/gnoi/system/system.proto`](https://github.com/openconfig/gnoi/blob/main/system/system.proto) (vendored at `sonic-gnmi/vendor/github.com/openconfig/gnoi/system/system.proto`) - -```protobuf -syntax = "proto3"; -package gnoi.system; - -service System { - rpc 
Reboot(RebootRequest) returns (RebootResponse) {} - rpc RebootStatus(RebootStatusRequest) returns (RebootStatusResponse) {} - rpc CancelReboot(CancelRebootRequest) returns (CancelRebootResponse) {} - rpc KillProcess(KillProcessRequest) returns (KillProcessResponse) {} - rpc Time(TimeRequest) returns (TimeResponse) {} - // ... Ping, Traceroute, SwitchControlProcessor, SetPackage omitted -} - -message RebootRequest { - RebootMethod method = 1; - uint64 delay = 2; // Delay in nanoseconds - string message = 3; // Informational reason - repeated types.Path subcomponents = 4; - bool force = 5; -} - -message RebootResponse {} - -enum RebootMethod { - UNKNOWN = 0; - COLD = 1; // Shutdown and restart OS and all hardware - POWERDOWN = 2; // Halt and power down - HALT = 3; // Halt (used for DPU shutdown) - WARM = 4; // Reload configuration only - NSF = 5; // Non-stop-forwarding reboot - POWERUP = 7; // Apply power -} - -message RebootStatusRequest { - repeated types.Path subcomponents = 1; -} - -message RebootStatusResponse { - bool active = 1; // If reboot is active - uint64 wait = 2; // Time left until reboot (ns) - uint64 when = 3; // Reboot time (ns since epoch) - string reason = 4; - uint32 count = 5; - RebootMethod method = 6; - RebootStatus status = 7; // Only meaningful when active = false -} - -message RebootStatus { - enum Status { - STATUS_UNKNOWN = 0; - STATUS_SUCCESS = 1; - STATUS_RETRIABLE_FAILURE = 2; - STATUS_FAILURE = 3; - } - Status status = 1; - string message = 2; -} - -``` - -**Key proto details for the Python wrapper:** -- `RebootMethod.HALT = 3` — the method used for DPU graceful shutdown -- `RebootStatusResponse.active == false` with `status.status == STATUS_SUCCESS` indicates successful halt completion +- **`gnoi_client` entry point:** [`sonic-gnmi/gnoi_client/gnoi_client.go`](https://github.com/sonic-net/sonic-gnmi/blob/master/gnoi_client/gnoi_client.go) — dispatches to per-module handlers; errors use `panic()` producing Go stack traces +- 
**Reboot/RebootStatus implementation:** [`sonic-gnmi/gnoi_client/system/reboot.go`](https://github.com/sonic-net/sonic-gnmi/blob/master/gnoi_client/system/reboot.go) — `Reboot()` prints `"System Reboot\n"` on success; `RebootStatus()` prints JSON-serialized `RebootStatusResponse` (the output the daemon tries to parse with `"reboot complete"`) +- **gNOI System proto:** [`openconfig/gnoi/system/system.proto`](https://github.com/openconfig/gnoi/blob/main/system/system.proto) — defines `RebootMethod.HALT = 3`, `RebootStatusResponse.active`, and `RebootStatus.Status` enum (`STATUS_SUCCESS = 1`) +- **gNOI types proto (dependency):** [`openconfig/gnoi/types/types.proto`](https://github.com/openconfig/gnoi/blob/main/types/types.proto) ## 9. Future Work From f47c59609d47b9d97a409854b81f8a7a476ac8b5 Mon Sep 17 00:00:00 2001 From: Dawei Huang Date: Mon, 23 Mar 2026 16:23:10 +0000 Subject: [PATCH 07/12] =?UTF-8?q?design=20doc:=20trim=20for=20human=20read?= =?UTF-8?q?ability=20=E2=80=94=20lead=20with=20the=20bug,=20cut=20boilerpl?= =?UTF-8?q?ate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Dawei Huang --- doc/gnoi-native-grpc-design.md | 302 +++++---------------------------- 1 file changed, 41 insertions(+), 261 deletions(-) diff --git a/doc/gnoi-native-grpc-design.md b/doc/gnoi-native-grpc-design.md index b5919de6..0b132f91 100644 --- a/doc/gnoi-native-grpc-design.md +++ b/doc/gnoi-native-grpc-design.md @@ -1,218 +1,63 @@ # Design: Replace `docker exec gnoi_client` with Native gRPC Calls -## 1. Background +## TL;DR -The `gnoi_shutdown_daemon` on SmartSwitch NPU orchestrates graceful DPU shutdown by issuing gNOI `System.Reboot(HALT)` and polling `System.RebootStatus`. Today it does this by shelling out: +`gnoi_shutdown_daemon` polls DPU reboot status by checking for `"reboot complete"` in `gnoi_client` stdout — but that string never appears in the output. Every DPU shutdown poll **times out unconditionally**. 
The fix: replace the subprocess calls with direct Python gRPC, which also eliminates the gnmi container dependency and gives us real error messages. -``` -docker exec gnmi gnoi_client -target=: -notls -module System -rpc Reboot ... -docker exec gnmi gnoi_client -target=: -notls -module System -rpc RebootStatus -``` - -This has several problems: - -| Problem | Impact | -|---------|--------| -| Requires the `gnmi` container to be running and healthy | If gnmi container is restarting or unhealthy, DPU shutdown fails silently | -| Subprocess overhead per RPC call | Extra process creation, Docker CLI round-trip, stdout parsing | -| Fragile output parsing | `"reboot complete" in out_s.lower()` breaks on any output format change | -| No structured error handling | gRPC status codes are lost; only `rc != 0` is checked | -| Error output is a Go panic stack trace | Extremely painful to diagnose failures (see §1.1) | -| Security surface | Shell-out through Docker CLI is a wider attack surface than a direct socket | - -### 1.1 gnoi_client Output Format Analysis - -The `gnoi_client` binary in sonic-gnmi is a Go CLI tool. Understanding its output format reveals why the current approach is fragile: - -**Reboot RPC (`-rpc Reboot`):** -- On **success**: prints `"System Reboot\n"` to stdout, exits 0. No structured output. -- On **failure**: calls `panic(err.Error())`, which dumps a **Go panic stack trace** to stderr and exits with a non-zero code. The daemon only checks `rc != 0` — the actual gRPC error code, message, and details are buried in a multi-line panic dump that is not parsed. +## The Bug -**RebootStatus RPC (`-rpc RebootStatus`):** -- On **success**: prints `"System RebootStatus\n"` header followed by JSON-marshaled `RebootStatusResponse`, e.g.: - ```json - System RebootStatus - {"active":false,"status":{"status":"STATUS_SUCCESS","message":"..."}} - ``` -- On **failure**: same `panic(err.Error())` — Go stack trace, non-zero exit. 
+`_poll_reboot_status()` in `scripts/gnoi_shutdown_daemon.py`: -**The parsing bug:** The daemon currently checks: ```python if rc_s == 0 and out_s and ("reboot complete" in out_s.lower()): return True ``` -But the actual protobuf `RebootStatusResponse` serialized to JSON contains fields like `"active":false` and `"status":"STATUS_SUCCESS"` — the string `"reboot complete"` never appears in the output. This means the poll loop **always times out** regardless of whether the DPU successfully halted, and the daemon proceeds purely on the timeout path. -**Why this matters for error diagnosis:** When a gNOI RPC fails (DPU unreachable, TLS mismatch, auth failure, server-side error), the only signal is a Go panic: -``` -panic: rpc error: code = Unavailable desc = connection error: ... +Actual `gnoi_client -rpc RebootStatus` output: -goroutine 1 [running]: -main.main() - /sonic/gnoi_client/gnoi_client.go:42 -... ``` -For the Reboot call, `_send_reboot_command()` invokes `execute_command(..., suppress_stderr=True)`, so this panic output on stderr is suppressed rather than logged or inspected — the daemon just logs `"Reboot command failed"` with no actionable context. Diagnosing production failures requires SSHing into the switch, manually running the docker exec command, and reading Go stack traces. - -## 2. Goal - -Replace the subprocess-based `gnoi_client` invocations with direct Python gRPC calls using generated protobuf stubs for the [OpenConfig gNOI System service](https://github.com/openconfig/gnoi/blob/main/system/system.proto). - -## 3. 
Scope +System RebootStatus +{"active":false,"status":{"status":"STATUS_SUCCESS","message":"..."}} +``` -### In Scope -- Generate or vendor Python gRPC stubs for `gnoi.system.System` (Reboot, RebootStatus RPCs) -- Create a lightweight `GnoiClient` wrapper class -- Refactor `GnoiRebootHandler._send_reboot_command()` and `_poll_reboot_status()` to use native gRPC -- Remove `execute_command()` helper (becomes unused) -- Update unit tests to mock at the gRPC stub level -- Add `grpcio` and `protobuf` to package dependencies +The string `"reboot complete"` never appears. The poll always exhausts its timeout, then proceeds as if the DPU halted — whether it did or not. -### Out of Scope -- TLS/mTLS on the midplane channel (future work; midplane is trusted today) -- Refactoring the daemon's main loop or config DB subscription logic -- Other gNOI services beyond `System` -- Changes to how DPU IP/port are discovered from CONFIG_DB +A secondary problem: when the Reboot RPC fails, `gnoi_client` panics with a Go stack trace on stderr. The daemon calls `execute_command(..., suppress_stderr=True)`, so the error goes to `/dev/null`. The only log is `"Reboot command failed"` with zero context. -## 4. Design +## What Changes -### 4.1 Phase 1 — Proto Stubs +Replace `docker exec gnmi gnoi_client` subprocess calls with direct Python gRPC using vendored [gNOI System proto](https://github.com/openconfig/gnoi/blob/main/system/system.proto) stubs. -Vendor pre-generated Python stubs from the gNOI `system.proto` definition. 
+### New files -**Files to add:** ``` host_modules/gnoi/ ├── __init__.py -├── system_pb2.py # generated message classes -├── system_pb2_grpc.py # generated service stubs -├── types_pb2.py # generated types (dependency of system_pb2) -└── types_pb2_grpc.py # generated (empty, no services in types.proto) -``` - -The stubs are generated from: -- https://github.com/openconfig/gnoi/blob/main/system/system.proto -- https://github.com/openconfig/gnoi/blob/main/types/types.proto (dependency) - -Generation command (for reference / CI reproducibility): -```bash -python -m grpc_tools.protoc \ - -I./proto \ - --python_out=host_modules/gnoi \ - --grpc_python_out=host_modules/gnoi \ - system/system.proto types/types.proto +├── client.py # GnoiClient wrapper (reboot + reboot_status) +├── system_pb2.py # vendored proto stubs +├── system_pb2_grpc.py +├── types_pb2.py +└── types_pb2_grpc.py ``` -**Packaging note:** `setup.py` currently lists packages explicitly (`['host_modules', 'utils']`). The implementation must add `'host_modules.gnoi'` to the `packages` list and corresponding `package_dir` entry, otherwise the vendored stubs won't be installed and imports will fail in deployed environments. - -**Why vendor instead of build-time generation?** -- sonic-host-services has no existing proto compilation infrastructure -- The gNOI System proto is stable (no changes in years) -- Keeps the build simple; can migrate to build-time generation later if more protos are needed +### Modified files -### 4.2 Phase 2 — GnoiClient Wrapper - -A thin wrapper providing the two RPCs we need: +**`scripts/gnoi_shutdown_daemon.py`** — the two RPC call sites change: -```python -# host_modules/gnoi/client.py - -import grpc -from . 
import system_pb2, system_pb2_grpc - -class GnoiClient: - """Lightweight gNOI System service client for DPU communication.""" - - def __init__(self, target: str, timeout: int = 30): - """ - Args: - target: gRPC target in "host:port" format - timeout: Default RPC timeout in seconds - """ - self._channel = grpc.insecure_channel(target) - self._stub = system_pb2_grpc.SystemStub(self._channel) - self._timeout = timeout - - def reboot(self, method: int = 3, message: str = "") -> None: - """ - Send System.Reboot RPC. - - Args: - method: RebootMethod enum value (3 = HALT) - message: Human-readable reason string - - Raises: - grpc.RpcError: on any gRPC failure - """ - request = system_pb2.RebootRequest( - method=method, - message=message, - ) - self._stub.Reboot(request, timeout=self._timeout) - - def reboot_status(self) -> system_pb2.RebootStatusResponse: - """ - Poll System.RebootStatus RPC. - - Returns: - RebootStatusResponse with .active and .wait fields - - Raises: - grpc.RpcError: on any gRPC failure - """ - request = system_pb2.RebootStatusRequest() - return self._stub.RebootStatus(request, timeout=self._timeout) - - def close(self): - """Close the underlying gRPC channel.""" - if self._channel: - self._channel.close() - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() -``` - -### 4.3 Phase 3 — Refactor gnoi_shutdown_daemon - -Replace the two subprocess call sites in `GnoiRebootHandler`: - -#### `_send_reboot_command` (before) -```python -def _send_reboot_command(self, dpu_name, dpu_ip, port): - reboot_cmd = ["docker", "exec", "gnmi", "gnoi_client", ...] - rc, out, err = execute_command(reboot_cmd, ...) 
- return rc == 0 -``` - -#### `_send_reboot_command` (after) +`_send_reboot_command` becomes: ```python def _send_reboot_command(self, dpu_name, dpu_ip, port): try: with GnoiClient(f"{dpu_ip}:{port}", timeout=REBOOT_RPC_TIMEOUT_SEC) as client: - client.reboot( - method=REBOOT_METHOD_HALT, - message="Triggered by SmartSwitch graceful shutdown" - ) + client.reboot(method=REBOOT_METHOD_HALT, + message="Triggered by SmartSwitch graceful shutdown") return True except grpc.RpcError as e: logger.log_error(f"{dpu_name}: gNOI Reboot failed: {e.code()} {e.details()}") return False ``` -#### `_poll_reboot_status` (before) -```python -def _poll_reboot_status(self, dpu_name, dpu_ip, port): - status_cmd = ["docker", "exec", "gnmi", "gnoi_client", ...] - while time.monotonic() < deadline: - rc_s, out_s, _ = execute_command(status_cmd, ...) - if rc_s == 0 and "reboot complete" in out_s.lower(): - return True -``` - -#### `_poll_reboot_status` (after) +`_poll_reboot_status` becomes: ```python def _poll_reboot_status(self, dpu_name, dpu_ip, port): deadline = time.monotonic() + _get_halt_timeout() @@ -221,101 +66,36 @@ def _poll_reboot_status(self, dpu_name, dpu_ip, port): try: resp = client.reboot_status() if not resp.active: - status_enum = resp.status.status - status_str = system_pb2.RebootStatus.Status.Name(status_enum) - logger.log_notice(f"{dpu_name}: RebootStatus complete: {status_str} - {resp.status.message}") - return status_enum == system_pb2.RebootStatus.STATUS_SUCCESS + return resp.status.status == system_pb2.RebootStatus.STATUS_SUCCESS except grpc.RpcError as e: - logger.log_warning( - f"{dpu_name}: RebootStatus poll error: code={e.code()} details={e.details()}" - ) + logger.log_warning(f"{dpu_name}: RebootStatus poll error: {e.code()} {e.details()}") time.sleep(STATUS_POLL_INTERVAL_SEC) return False ``` -**Key improvements over the subprocess approach:** -- **Fixes the parsing bug**: checks `resp.active == False` directly instead of the broken `"reboot complete" in 
stdout` match that never triggers -- **Distinguishes success from failure**: inspects `resp.status.status` enum (`STATUS_SUCCESS` vs `STATUS_FAILURE` vs `STATUS_RETRIABLE_FAILURE`) -- **Actionable error logs**: gRPC errors include status code and details (e.g., `code=UNAVAILABLE details=connection refused`) instead of opaque "command failed" - -#### Removals -- `execute_command()` function — no longer needed -- `import subprocess` — no longer needed - -#### Additions -- `import grpc` -- `from host_modules.gnoi.client import GnoiClient` - -### 4.4 Phase 4 — Update Tests - -Current tests mock `execute_command` and check return codes. New tests mock at the gRPC level: - -```python -@mock.patch('gnoi_shutdown_daemon.GnoiClient') -def test_send_reboot_command_success(self, MockClient): - mock_client = MockClient.return_value.__enter__.return_value - # reboot() returns None on success - mock_client.reboot.return_value = None - - result = handler._send_reboot_command("DPU0", "10.0.0.1", "8080") - assert result is True - mock_client.reboot.assert_called_once() - -@mock.patch('gnoi_shutdown_daemon.GnoiClient') -def test_send_reboot_command_failure(self, MockClient): - mock_client = MockClient.return_value.__enter__.return_value - error = mock.create_autospec(grpc.RpcError) - error.code.return_value = grpc.StatusCode.UNAVAILABLE - error.details.return_value = "connection refused" - mock_client.reboot.side_effect = error - - result = handler._send_reboot_command("DPU0", "10.0.0.1", "8080") - assert result is False -``` - -### 4.5 Dependencies - -| Package | Version | Notes | -|---------|---------|-------| -| `grpcio` | >=1.51.0 | Already in SONiC build environment | -| `protobuf` | >=4.21.0 | Already in SONiC build environment | - -Verify these are available in the sonic-host-services build context. If not, add to `setup.py` `install_requires`. +`execute_command()` and `import subprocess` are removed. -## 5. 
Implementation Plan +**`setup.py`** — add `host_modules.gnoi` to packages list. -| Phase | Description | PR | -|-------|-------------|----| -| 1 | Vendor gNOI System proto stubs | PR #1 | -| 2 | Add `GnoiClient` wrapper + unit tests | PR #1 (same) | -| 3 | Refactor `gnoi_shutdown_daemon` to use `GnoiClient` | PR #1 (same) | -| 4 | Update existing daemon tests | PR #1 (same) | +**`tests/gnoi_shutdown_daemon_test.py`** — mocks move from `execute_command` to `GnoiClient`. -All phases can ship as a single PR since they form one atomic change — the old subprocess path is fully replaced. +### What stays the same -## 6. Testing +Main loop, CONFIG_DB subscription, DPU IP/port discovery, halt flag handling, threading model — all unchanged. -- **Unit tests**: Mock gRPC stubs, verify correct protobuf messages are sent, verify error handling for various `grpc.StatusCode` values -- **Integration test**: On a SmartSwitch testbed, trigger `config chassis modules shutdown DPU0` and verify gNOI HALT is sent and RebootStatus is polled successfully via syslog -- **Regression**: Existing CI pipeline covers the daemon; updated mocks ensure no regressions +## Why vendor stubs instead of build-time generation? -## 7. Risks & Mitigations +sonic-host-services has no proto compilation infra. The gNOI System proto hasn't changed in years. We can migrate to build-time generation later if more protos are needed. -| Risk | Mitigation | -|------|------------| -| gRPC/protobuf not available in host environment | Verify during build; these are already used by other SONiC components | -| Proto stub drift from upstream gnoi | Pin to a specific gnoi commit; stubs are stable | -| Insecure channel on midplane | Same trust model as today's `gnoi_client -notls`; TLS is future work | +## Risks -## 8. Reference Code +- **grpcio/protobuf availability**: both are already in the SONiC build environment. +- **Proto drift**: pin to a specific gnoi commit; the System service is stable. 
+- **Insecure channel**: same trust model as today's `-notls` flag on midplane. TLS is future work. -- **`gnoi_client` entry point:** [`sonic-gnmi/gnoi_client/gnoi_client.go`](https://github.com/sonic-net/sonic-gnmi/blob/master/gnoi_client/gnoi_client.go) — dispatches to per-module handlers; errors use `panic()` producing Go stack traces -- **Reboot/RebootStatus implementation:** [`sonic-gnmi/gnoi_client/system/reboot.go`](https://github.com/sonic-net/sonic-gnmi/blob/master/gnoi_client/system/reboot.go) — `Reboot()` prints `"System Reboot\n"` on success; `RebootStatus()` prints JSON-serialized `RebootStatusResponse` (the output the daemon tries to parse with `"reboot complete"`) -- **gNOI System proto:** [`openconfig/gnoi/system/system.proto`](https://github.com/openconfig/gnoi/blob/main/system/system.proto) — defines `RebootMethod.HALT = 3`, `RebootStatusResponse.active`, and `RebootStatus.Status` enum (`STATUS_SUCCESS = 1`) -- **gNOI types proto (dependency):** [`openconfig/gnoi/types/types.proto`](https://github.com/openconfig/gnoi/blob/main/types/types.proto) +## Appendix: gnoi_client output format -## 9. Future Work +For readers who want to verify the bug claim — here's what `gnoi_client` actually does ([source](https://github.com/sonic-net/sonic-gnmi/blob/master/gnoi_client/system/reboot.go)): -- **TLS support**: Add optional mTLS when midplane security is hardened -- **Build-time proto generation**: If more gNOI/gNMI services are needed, add a proto compilation step -- **Connection pooling**: Reuse gRPC channels across polls instead of creating per-call (minor optimization) +- **Reboot**: prints `"System Reboot\n"` on success, `panic(err.Error())` on failure (Go stack trace to stderr). +- **RebootStatus**: prints `"System RebootStatus\n"` + `json.Marshal(resp)` on success, same panic on failure. The JSON is protobuf-serialized `RebootStatusResponse` with fields `active`, `wait`, `status.status`, `status.message`. 
From 4940fab8f391b33ad07238a1565a4b503a726240 Mon Sep 17 00:00:00 2001 From: Dawei Huang Date: Mon, 23 Mar 2026 16:33:39 +0000 Subject: [PATCH 08/12] design doc: reframe as footgun pattern, trim implementation details Signed-off-by: Dawei Huang --- doc/gnoi-native-grpc-design.md | 117 +++++++++++++++------------------ 1 file changed, 53 insertions(+), 64 deletions(-) diff --git a/doc/gnoi-native-grpc-design.md b/doc/gnoi-native-grpc-design.md index 0b132f91..6af3e2f2 100644 --- a/doc/gnoi-native-grpc-design.md +++ b/doc/gnoi-native-grpc-design.md @@ -2,100 +2,89 @@ ## TL;DR -`gnoi_shutdown_daemon` polls DPU reboot status by checking for `"reboot complete"` in `gnoi_client` stdout — but that string never appears in the output. Every DPU shutdown poll **times out unconditionally**. The fix: replace the subprocess calls with direct Python gRPC, which also eliminates the gnmi container dependency and gives us real error messages. +`gnoi_shutdown_daemon` issues gNOI RPCs by shelling out to `docker exec gnmi gnoi_client`. This is fragile, opaque, and already causing silent failures. Replace with direct Python gRPC calls using vendored proto stubs. -## The Bug +## 1. 
Why This Pattern Is Bad -`_poll_reboot_status()` in `scripts/gnoi_shutdown_daemon.py`: +| Problem | Impact | +|---------|--------| +| Requires `gnmi` container running and healthy | DPU shutdown silently fails if gnmi is restarting | +| Subprocess + Docker CLI overhead per RPC | Extra process creation, Docker round-trip, stdout capture | +| Output is unstructured text with a header line | Any format change in `gnoi_client` breaks parsing | +| gRPC status codes are lost | Caller only sees `rc != 0` — no code, no details | +| Errors are Go `panic()` stack traces on stderr | Production diagnosis requires SSH + manual docker exec | +| `suppress_stderr=True` discards those panics | Error output goes to `/dev/null`, logs say only "command failed" | +| String matching for completion detection | `"reboot complete" in out_s.lower()` doesn't match actual output (see §2) | +| Tight coupling to CLI flag interface | `-module System -rpc Reboot -jsonin '{...}'` is a serialization layer we don't need | +| Security surface | Shell-out through Docker CLI is wider than a direct gRPC socket | + +## 2. Already Broken: RebootStatus Parsing + +The poll loop checks: ```python if rc_s == 0 and out_s and ("reboot complete" in out_s.lower()): return True ``` -Actual `gnoi_client -rpc RebootStatus` output: +Actual `gnoi_client` output ([source](https://github.com/sonic-net/sonic-gnmi/blob/master/gnoi_client/system/reboot.go)): ``` System RebootStatus {"active":false,"status":{"status":"STATUS_SUCCESS","message":"..."}} ``` -The string `"reboot complete"` never appears. The poll always exhausts its timeout, then proceeds as if the DPU halted — whether it did or not. - -A secondary problem: when the Reboot RPC fails, `gnoi_client` panics with a Go stack trace on stderr. The daemon calls `execute_command(..., suppress_stderr=True)`, so the error goes to `/dev/null`. The only log is `"Reboot command failed"` with zero context. 
+`"reboot complete"` never appears → poll **always times out** regardless of DPU state. -## What Changes +## 3. Proposed Change -Replace `docker exec gnmi gnoi_client` subprocess calls with direct Python gRPC using vendored [gNOI System proto](https://github.com/openconfig/gnoi/blob/main/system/system.proto) stubs. - -### New files +Replace subprocess calls with a thin Python gRPC client using vendored [gNOI System proto](https://github.com/openconfig/gnoi/blob/main/system/system.proto) stubs. +**Before** (subprocess): ``` -host_modules/gnoi/ -├── __init__.py -├── client.py # GnoiClient wrapper (reboot + reboot_status) -├── system_pb2.py # vendored proto stubs -├── system_pb2_grpc.py -├── types_pb2.py -└── types_pb2_grpc.py +docker exec gnmi gnoi_client -target=<dpu_ip>:<port> -notls -module System -rpc Reboot -jsonin '{"method":3}' ``` -### Modified files - -**`scripts/gnoi_shutdown_daemon.py`** — the two RPC call sites change: - -`_send_reboot_command` becomes: +**After** (direct gRPC): ```python -def _send_reboot_command(self, dpu_name, dpu_ip, port): - try: - with GnoiClient(f"{dpu_ip}:{port}", timeout=REBOOT_RPC_TIMEOUT_SEC) as client: - client.reboot(method=REBOOT_METHOD_HALT, - message="Triggered by SmartSwitch graceful shutdown") - return True - except grpc.RpcError as e: - logger.log_error(f"{dpu_name}: gNOI Reboot failed: {e.code()} {e.details()}") - return False +with GnoiClient(f"{dpu_ip}:{port}") as client: + client.reboot(method=REBOOT_METHOD_HALT, message="graceful shutdown") ``` -`_poll_reboot_status` becomes: +For RebootStatus, check the protobuf response directly instead of string matching: ```python -def _poll_reboot_status(self, dpu_name, dpu_ip, port): - deadline = time.monotonic() + _get_halt_timeout() - with GnoiClient(f"{dpu_ip}:{port}", timeout=STATUS_RPC_TIMEOUT_SEC) as client: - while time.monotonic() < deadline: - try: - resp = client.reboot_status() - if not resp.active: - return resp.status.status == system_pb2.RebootStatus.STATUS_SUCCESS - except 
grpc.RpcError as e: - logger.log_warning(f"{dpu_name}: RebootStatus poll error: {e.code()} {e.details()}") - time.sleep(STATUS_POLL_INTERVAL_SEC) - return False +resp = client.reboot_status() +if not resp.active and resp.status.status == STATUS_SUCCESS: + return True ``` -`execute_command()` and `import subprocess` are removed. - -**`setup.py`** — add `host_modules.gnoi` to packages list. - -**`tests/gnoi_shutdown_daemon_test.py`** — mocks move from `execute_command` to `GnoiClient`. - -### What stays the same - -Main loop, CONFIG_DB subscription, DPU IP/port discovery, halt flag handling, threading model — all unchanged. +### What this gives us +- **Structured errors**: `grpc.RpcError` with status code + details instead of opaque exit codes +- **Correct completion check**: inspect `resp.active` and `resp.status.status` directly +- **No container dependency**: gRPC goes straight to the DPU, gnmi container health is irrelevant +- **No parsing**: protobuf deserialization, not string matching on CLI output -## Why vendor stubs instead of build-time generation? +## 4. Scope -sonic-host-services has no proto compilation infra. The gNOI System proto hasn't changed in years. We can migrate to build-time generation later if more protos are needed. +### In scope +- Vendor Python gRPC stubs for `gnoi.system.System` (Reboot, RebootStatus) +- Lightweight `GnoiClient` wrapper +- Refactor the two RPC call sites in `GnoiRebootHandler` +- Update unit tests -## Risks +### Out of scope +- TLS/mTLS on midplane (future work; midplane is trusted today) +- Main loop, config DB subscription, halt flag handling — unchanged +- Other gNOI services beyond System -- **grpcio/protobuf availability**: both are already in the SONiC build environment. -- **Proto drift**: pin to a specific gnoi commit; the System service is stable. -- **Insecure channel**: same trust model as today's `-notls` flag on midplane. TLS is future work. +## 5. Why Vendor Stubs? 
-## Appendix: gnoi_client output format +sonic-host-services has no proto compilation infra. The gNOI System proto is stable (no changes in years). Vendoring keeps the build simple; can migrate to build-time generation later if more protos are needed. -For readers who want to verify the bug claim — here's what `gnoi_client` actually does ([source](https://github.com/sonic-net/sonic-gnmi/blob/master/gnoi_client/system/reboot.go)): +## 6. Risks -- **Reboot**: prints `"System Reboot\n"` on success, `panic(err.Error())` on failure (Go stack trace to stderr). -- **RebootStatus**: prints `"System RebootStatus\n"` + `json.Marshal(resp)` on success, same panic on failure. The JSON is protobuf-serialized `RebootStatusResponse` with fields `active`, `wait`, `status.status`, `status.message`. +| Risk | Mitigation | +|------|------------| +| grpcio/protobuf not in host environment | Already used by other SONiC components | +| Proto drift from upstream gnoi | Pin to a specific commit; System service is stable | +| Insecure channel on midplane | Same trust model as today's `-notls`; TLS is future work | From eb0cf402ca08dc81f5d240b30d4c70ccdc3c44e6 Mon Sep 17 00:00:00 2001 From: Dawei Huang Date: Mon, 23 Mar 2026 16:35:44 +0000 Subject: [PATCH 09/12] design doc: soften language, use neutral framing Signed-off-by: Dawei Huang --- doc/gnoi-native-grpc-design.md | 89 ++++++++++++++++++++++++++-------- 1 file changed, 70 insertions(+), 19 deletions(-) diff --git a/doc/gnoi-native-grpc-design.md b/doc/gnoi-native-grpc-design.md index 6af3e2f2..e534824a 100644 --- a/doc/gnoi-native-grpc-design.md +++ b/doc/gnoi-native-grpc-design.md @@ -2,23 +2,23 @@ ## TL;DR -`gnoi_shutdown_daemon` issues gNOI RPCs by shelling out to `docker exec gnmi gnoi_client`. This is fragile, opaque, and already causing silent failures. Replace with direct Python gRPC calls using vendored proto stubs. +`gnoi_shutdown_daemon` issues gNOI RPCs by shelling out to `docker exec gnmi gnoi_client`. 
This introduces several layers of indirection that make failures hard to diagnose and completion detection unreliable. This document proposes replacing the subprocess path with direct Python gRPC calls using vendored proto stubs. -## 1. Why This Pattern Is Bad +## 1. Limitations of the Current Approach -| Problem | Impact | -|---------|--------| -| Requires `gnmi` container running and healthy | DPU shutdown silently fails if gnmi is restarting | -| Subprocess + Docker CLI overhead per RPC | Extra process creation, Docker round-trip, stdout capture | -| Output is unstructured text with a header line | Any format change in `gnoi_client` breaks parsing | -| gRPC status codes are lost | Caller only sees `rc != 0` — no code, no details | -| Errors are Go `panic()` stack traces on stderr | Production diagnosis requires SSH + manual docker exec | -| `suppress_stderr=True` discards those panics | Error output goes to `/dev/null`, logs say only "command failed" | -| String matching for completion detection | `"reboot complete" in out_s.lower()` doesn't match actual output (see §2) | -| Tight coupling to CLI flag interface | `-module System -rpc Reboot -jsonin '{...}'` is a serialization layer we don't need | -| Security surface | Shell-out through Docker CLI is wider than a direct gRPC socket | +| Observation | Consequence | +|-------------|-------------| +| Requires NPU `gnmi` container running and healthy | DPU shutdown depends on an unrelated NPU container's availability, even though the RPC target is the DPU's own gnmi server | +| Subprocess + Docker CLI overhead per RPC | Extra process creation, Docker round-trip, stdout capture on each call | +| Output is unstructured text with a header line | Parsing is coupled to `gnoi_client`'s print format, which has no stability guarantee | +| gRPC status codes are not propagated | Caller only sees `rc != 0` — no status code, no error details | +| Errors surface as Go `panic()` stack traces on stderr | Diagnosing RPC failures 
requires SSH + manual docker exec | +| `suppress_stderr=True` on the Reboot call | Panic output is discarded; logs show only "command failed" | +| Completion check uses string matching | `"reboot complete" in out_s.lower()` does not match actual output format (see §2) | +| Tight coupling to CLI flag interface | `-module System -rpc Reboot -jsonin '{...}'` adds a serialization layer between caller and protobuf | +| Broader privilege surface | Shell-out through Docker CLI vs. a direct gRPC socket | -## 2. Already Broken: RebootStatus Parsing +## 2. Existing Issue: RebootStatus Completion Detection The poll loop checks: @@ -34,7 +34,7 @@ System RebootStatus {"active":false,"status":{"status":"STATUS_SUCCESS","message":"..."}} ``` -`"reboot complete"` never appears → poll **always times out** regardless of DPU state. +`"reboot complete"` does not appear in this output → the poll always exhausts its timeout regardless of DPU state. ## 3. Proposed Change @@ -59,10 +59,42 @@ if not resp.active and resp.status.status == STATUS_SUCCESS: ``` ### What this gives us -- **Structured errors**: `grpc.RpcError` with status code + details instead of opaque exit codes -- **Correct completion check**: inspect `resp.active` and `resp.status.status` directly -- **No container dependency**: gRPC goes straight to the DPU, gnmi container health is irrelevant -- **No parsing**: protobuf deserialization, not string matching on CLI output + +**Better error detection** — gRPC errors carry status codes and details natively: +```python +except grpc.RpcError as e: + logger.log_error(f"{dpu_name}: Reboot failed: {e.code()} {e.details()}") + # e.g. 
"UNAVAILABLE: connection refused" vs today's "command failed" +``` + +**Better testing** — mocks operate on typed protobuf objects instead of crafting subprocess stdout strings: +```python +# Today: mock must reproduce gnoi_client's exact text output +mock_execute.return_value = (0, "reboot complete", "") # this doesn't even match reality + +# After: mock returns a typed response +mock_client.reboot_status.return_value = RebootStatusResponse( + active=False, status=RebootStatus(status=STATUS_SUCCESS)) +``` + +**Correct completion check** — inspect `resp.active` and `resp.status.status` directly instead of string matching. + +**Removes unnecessary NPU gnmi container dependency** — the current approach shells into the NPU's `gnmi` container to run `gnoi_client`, but there's no reason the NPU daemon needs the NPU gnmi container as an intermediary. The DPU's own gnmi server is the actual RPC endpoint; direct gRPC connects to it without involving the NPU container. + +**Scales to future RPCs** — the same pattern extends to any gNOI or gNMI call without adding more subprocess wrappers: +```python +# Adding a new gNOI RPC is just another method on the client +class GnoiClient: + def reboot(self, ...): ... + def reboot_status(self, ...): ... + def cancel_reboot(self, ...): ... # future + def system_time(self, ...): ... # future + +# Or a gNMI client alongside it +with GnmiClient(f"{dpu_ip}:{port}") as client: + client.get(path="/system/state/...") +``` +Each new RPC is a typed method with protobuf request/response — no new shell commands, no new output formats to parse. ## 4. 
Scope @@ -72,6 +104,25 @@ if not resp.active and resp.status.status == STATUS_SUCCESS: - Refactor the two RPC call sites in `GnoiRebootHandler` - Update unit tests +New directory structure: +``` +host_modules/gnoi/ +├── __init__.py +├── client.py # GnoiClient: reboot(), reboot_status(), context manager +├── system_pb2.py # vendored from openconfig/gnoi system.proto +├── system_pb2_grpc.py +├── types_pb2.py # dependency of system.proto +└── types_pb2_grpc.py +``` + +The daemon change is essentially replacing `execute_command(["docker", "exec", ...])` with: +```python +with GnoiClient(f"{dpu_ip}:{port}") as client: + client.reboot(method=REBOOT_METHOD_HALT, ...) + # ... + resp = client.reboot_status() +``` + ### Out of scope - TLS/mTLS on midplane (future work; midplane is trusted today) - Main loop, config DB subscription, halt flag handling — unchanged From 110bf59327bdf6e288a34a82e173d78ba67cd63a Mon Sep 17 00:00:00 2001 From: Dawei Huang Date: Mon, 30 Mar 2026 16:17:08 +0000 Subject: [PATCH 10/12] Replace docker exec gnoi_client with native Python gRPC calls Refactor gnoi_shutdown_daemon to use direct gRPC instead of shelling out to 'docker exec gnmi gnoi_client'. This addresses three issues: 1. Broken RebootStatus completion detection - the old code checked for 'reboot complete' in stdout, but gnoi_client outputs JSON with 'STATUS_SUCCESS', so the poll always timed out regardless of DPU state. 2. Suppressed error context - _send_reboot_command used suppress_stderr=True, discarding Go panic traces. Now gRPC errors carry status codes and details natively (e.g. UNAVAILABLE: connection refused). 3. Unnecessary NPU gnmi container dependency - the DPU's own gnmi server is the actual RPC endpoint; direct gRPC connects without involving the NPU gnmi container as an intermediary. 
Changes: - Vendor gNOI System proto Python stubs (from openconfig/gnoi) into host_modules/gnoi/ (system_pb2, types_pb2, common_pb2) - Add GnoiClient wrapper (client.py) with reboot(), reboot_status(), cancel_reboot() methods and context manager support - Refactor _send_reboot_command to use GnoiClient.reboot() with proper grpc.RpcError handling - Refactor _poll_reboot_status to check resp.active and resp.status.status protobuf fields directly instead of string matching - Add grpcio and protobuf to setup.py install_requires - Add host_modules.gnoi to setup.py packages list - Update all unit tests to mock GnoiClient instead of subprocess calls - Add new tests: gRPC error handling, transient error recovery, failure status detection Signed-off-by: sigabrtv1-ui Signed-off-by: Dawei Huang --- host_modules/gnoi/__init__.py | 1 + host_modules/gnoi/client.py | 99 ++++++ host_modules/gnoi/common_pb2.py | 40 +++ host_modules/gnoi/common_pb2_grpc.py | 24 ++ host_modules/gnoi/system_pb2.py | 95 ++++++ host_modules/gnoi/system_pb2_grpc.py | 480 +++++++++++++++++++++++++++ host_modules/gnoi/types_pb2.py | 52 +++ host_modules/gnoi/types_pb2_grpc.py | 24 ++ scripts/gnoi_shutdown_daemon.py | 61 ++-- setup.py | 6 +- tests/gnoi_shutdown_daemon_test.py | 393 +++++++++++++--------- 11 files changed, 1084 insertions(+), 191 deletions(-) create mode 100644 host_modules/gnoi/__init__.py create mode 100644 host_modules/gnoi/client.py create mode 100644 host_modules/gnoi/common_pb2.py create mode 100644 host_modules/gnoi/common_pb2_grpc.py create mode 100644 host_modules/gnoi/system_pb2.py create mode 100644 host_modules/gnoi/system_pb2_grpc.py create mode 100644 host_modules/gnoi/types_pb2.py create mode 100644 host_modules/gnoi/types_pb2_grpc.py diff --git a/host_modules/gnoi/__init__.py b/host_modules/gnoi/__init__.py new file mode 100644 index 00000000..5990d743 --- /dev/null +++ b/host_modules/gnoi/__init__.py @@ -0,0 +1 @@ +# gNOI Python client - vendored proto stubs and client wrapper 
class GnoiClient:
    """gNOI System service client using direct gRPC (no Docker/subprocess).

    The gRPC channel is opened by ``__enter__`` and released by ``close()``
    (which ``__exit__`` calls), so the client is meant to be used as a
    context manager::

        with GnoiClient("10.0.0.1:8080") as client:
            client.reboot(method=system_pb2.HALT, message="shutdown")
            resp = client.reboot_status()

    The channel is created with ``grpc.insecure_channel`` (no TLS).
    """

    def __init__(self, target, timeout=60):
        """
        Args:
            target: gRPC target address, e.g. "10.0.0.1:8080"
            timeout: Default RPC timeout in seconds, used by every call that
                does not pass its own ``timeout``.
        """
        self._target = target
        self._timeout = timeout
        # Both stay None until __enter__ runs and are reset by close().
        self._channel = None
        self._stub = None

    def __enter__(self):
        """Open the channel to the target and create the System stub."""
        self._channel = grpc.insecure_channel(self._target)
        self._stub = system_pb2_grpc.SystemStub(self._channel)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the channel; never suppresses the in-flight exception."""
        self.close()
        return False

    def close(self):
        """Close the channel if one is open. Safe to call more than once."""
        if self._channel is not None:
            self._channel.close()
            self._channel = None
            self._stub = None

    def _require_stub(self):
        """Return the live stub, or fail clearly when the channel is not open."""
        if self._stub is None:
            # Without this guard the caller would get an opaque
            # "'NoneType' object has no attribute 'Reboot'" AttributeError.
            raise RuntimeError(
                "GnoiClient is not connected; use it as a context manager "
                "('with GnoiClient(target) as client:') before issuing RPCs"
            )
        return self._stub

    def _resolve_timeout(self, timeout):
        """Pick the per-call timeout, falling back to the default only for None.

        An explicit falsy value such as ``0`` is honoured rather than being
        silently replaced by the default.
        """
        return self._timeout if timeout is None else timeout

    def reboot(self, method, message="", timeout=None):
        """
        Send a gNOI System.Reboot RPC.

        Args:
            method: RebootMethod enum value (e.g. system_pb2.HALT = 3).
            message: Human-readable reason string.
            timeout: RPC timeout in seconds; ``None`` uses the client default.

        Returns:
            RebootResponse protobuf message.

        Raises:
            RuntimeError: If the client has not been entered (no open channel).
            grpc.RpcError: On any gRPC failure (with code() and details()).
        """
        stub = self._require_stub()
        request = system_pb2.RebootRequest(
            method=method,
            message=message,
        )
        return stub.Reboot(request, timeout=self._resolve_timeout(timeout))

    def reboot_status(self, timeout=None):
        """
        Poll gNOI System.RebootStatus RPC.

        Args:
            timeout: RPC timeout in seconds; ``None`` uses the client default.

        Returns:
            RebootStatusResponse protobuf message with fields:
              - active (bool): True if reboot is still in progress
              - wait (uint64): time left until the reboot
                (unit presumably nanoseconds -- confirm against system.proto)
              - when (uint64): reboot scheduled time
              - reason (str): reason for reboot
              - count (uint32): number of reboots since active
              - method (RebootMethod): type of reboot
              - status (RebootStatus): outcome of the reboot; carries
                ``status`` (STATUS_UNKNOWN / STATUS_SUCCESS /
                STATUS_RETRIABLE_FAILURE / STATUS_FAILURE) and ``message``.
                Completion is detected from ``active`` together with
                ``status.status``.

        Raises:
            RuntimeError: If the client has not been entered (no open channel).
            grpc.RpcError: On any gRPC failure.
        """
        stub = self._require_stub()
        request = system_pb2.RebootStatusRequest()
        return stub.RebootStatus(request, timeout=self._resolve_timeout(timeout))

    def cancel_reboot(self, message="", timeout=None):
        """
        Cancel a pending reboot via gNOI System.CancelReboot RPC.

        Args:
            message: Human-readable reason for cancellation.
            timeout: RPC timeout in seconds; ``None`` uses the client default.

        Returns:
            CancelRebootResponse protobuf message.

        Raises:
            RuntimeError: If the client has not been entered (no open channel).
            grpc.RpcError: On any gRPC failure.
        """
        stub = self._require_stub()
        request = system_pb2.CancelRebootRequest(message=message)
        return stub.CancelReboot(request, timeout=self._resolve_timeout(timeout))
+# NO CHECKED-IN PROTOBUF GENCODE +# source: github.com/openconfig/gnoi/common/common.proto +# Protobuf Python Version: 6.31.1 +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import runtime_version as _runtime_version +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +_runtime_version.ValidateProtobufRuntimeVersion( + _runtime_version.Domain.PUBLIC, + 6, + 31, + 1, + '', + 'github.com/openconfig/gnoi/common/common.proto' +) +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from host_modules.gnoi import types_pb2 as github_dot_com_dot_openconfig_dot_gnoi_dot_types_dot_types__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n.github.com/openconfig/gnoi/common/common.proto\x12\x0bgnoi.common\x1a,github.com/openconfig/gnoi/types/types.proto\"\xf1\x01\n\x0eRemoteDownload\x12\x0c\n\x04path\x18\x01 \x01(\t\x12\x36\n\x08protocol\x18\x02 \x01(\x0e\x32$.gnoi.common.RemoteDownload.Protocol\x12,\n\x0b\x63redentials\x18\x03 \x01(\x0b\x32\x17.gnoi.types.Credentials\x12\x16\n\x0esource_address\x18\x04 \x01(\t\x12\x12\n\nsource_vrf\x18\x05 \x01(\t\"?\n\x08Protocol\x12\x0b\n\x07UNKNOWN\x10\x00\x12\x08\n\x04SFTP\x10\x01\x12\x08\n\x04HTTP\x10\x02\x12\t\n\x05HTTPS\x10\x03\x12\x07\n\x03SCP\x10\x04\x42#Z!github.com/openconfig/gnoi/commonb\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'github.com.openconfig.gnoi.common.common_pb2', _globals) +if not _descriptor._USE_C_DESCRIPTORS: + _globals['DESCRIPTOR']._loaded_options = None + _globals['DESCRIPTOR']._serialized_options = b'Z!github.com/openconfig/gnoi/common' + _globals['_REMOTEDOWNLOAD']._serialized_start=110 + _globals['_REMOTEDOWNLOAD']._serialized_end=351 + 
_globals['_REMOTEDOWNLOAD_PROTOCOL']._serialized_start=288 + _globals['_REMOTEDOWNLOAD_PROTOCOL']._serialized_end=351 +# @@protoc_insertion_point(module_scope) diff --git a/host_modules/gnoi/common_pb2_grpc.py b/host_modules/gnoi/common_pb2_grpc.py new file mode 100644 index 00000000..b2660b45 --- /dev/null +++ b/host_modules/gnoi/common_pb2_grpc.py @@ -0,0 +1,24 @@ +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +"""Client and server classes corresponding to protobuf-defined services.""" +import grpc +import warnings + + +GRPC_GENERATED_VERSION = '1.80.0' +GRPC_VERSION = grpc.__version__ +_version_not_supported = False + +try: + from grpc._utilities import first_version_is_lower + _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION) +except ImportError: + _version_not_supported = True + +if _version_not_supported: + raise RuntimeError( + f'The grpc package installed is at version {GRPC_VERSION},' + + ' but the generated code in github.com/openconfig/gnoi/common/common_pb2_grpc.py depends on' + + f' grpcio>={GRPC_GENERATED_VERSION}.' + + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}' + + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.' + ) diff --git a/host_modules/gnoi/system_pb2.py b/host_modules/gnoi/system_pb2.py new file mode 100644 index 00000000..2be513bd --- /dev/null +++ b/host_modules/gnoi/system_pb2.py @@ -0,0 +1,95 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# NO CHECKED-IN PROTOBUF GENCODE +# source: github.com/openconfig/gnoi/system/system.proto +# Protobuf Python Version: 6.31.1 +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import runtime_version as _runtime_version +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +_runtime_version.ValidateProtobufRuntimeVersion( + _runtime_version.Domain.PUBLIC, + 6, + 31, + 1, + '', + 'github.com/openconfig/gnoi/system/system.proto' +) +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from host_modules.gnoi import common_pb2 as github_dot_com_dot_openconfig_dot_gnoi_dot_common_dot_common__pb2 +from host_modules.gnoi import types_pb2 as github_dot_com_dot_openconfig_dot_gnoi_dot_types_dot_types__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n.github.com/openconfig/gnoi/system/system.proto\x12\x0bgnoi.system\x1a.github.com/openconfig/gnoi/common/common.proto\x1a,github.com/openconfig/gnoi/types/types.proto\"L\n\x1dSwitchControlProcessorRequest\x12+\n\x11\x63ontrol_processor\x18\x01 \x01(\x0b\x32\x10.gnoi.types.Path\"n\n\x1eSwitchControlProcessorResponse\x12+\n\x11\x63ontrol_processor\x18\x01 \x01(\x0b\x32\x10.gnoi.types.Path\x12\x0f\n\x07version\x18\x02 \x01(\t\x12\x0e\n\x06uptime\x18\x03 \x01(\x03\"\x92\x01\n\rRebootRequest\x12)\n\x06method\x18\x01 \x01(\x0e\x32\x19.gnoi.system.RebootMethod\x12\r\n\x05\x64\x65lay\x18\x02 \x01(\x04\x12\x0f\n\x07message\x18\x03 \x01(\t\x12\'\n\rsubcomponents\x18\x04 \x03(\x0b\x32\x10.gnoi.types.Path\x12\r\n\x05\x66orce\x18\x05 \x01(\x08\"\x10\n\x0eRebootResponse\"O\n\x13\x43\x61ncelRebootRequest\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\'\n\rsubcomponents\x18\x02 
\x03(\x0b\x32\x10.gnoi.types.Path\"\x16\n\x14\x43\x61ncelRebootResponse\">\n\x13RebootStatusRequest\x12\'\n\rsubcomponents\x18\x01 \x03(\x0b\x32\x10.gnoi.types.Path\"\xb7\x01\n\x14RebootStatusResponse\x12\x0e\n\x06\x61\x63tive\x18\x01 \x01(\x08\x12\x0c\n\x04wait\x18\x02 \x01(\x04\x12\x0c\n\x04when\x18\x03 \x01(\x04\x12\x0e\n\x06reason\x18\x04 \x01(\t\x12\r\n\x05\x63ount\x18\x05 \x01(\r\x12)\n\x06method\x18\x06 \x01(\x0e\x32\x19.gnoi.system.RebootMethod\x12)\n\x06status\x18\x07 \x01(\x0b\x32\x19.gnoi.system.RebootStatus\"\xb5\x01\n\x0cRebootStatus\x12\x30\n\x06status\x18\x01 \x01(\x0e\x32 .gnoi.system.RebootStatus.Status\x12\x0f\n\x07message\x18\x02 \x01(\t\"b\n\x06Status\x12\x12\n\x0eSTATUS_UNKNOWN\x10\x00\x12\x12\n\x0eSTATUS_SUCCESS\x10\x01\x12\x1c\n\x18STATUS_RETRIABLE_FAILURE\x10\x02\x12\x12\n\x0eSTATUS_FAILURE\x10\x03\"\r\n\x0bTimeRequest\"\x1c\n\x0cTimeResponse\x12\x0c\n\x04time\x18\x01 \x01(\x04\"\xe6\x01\n\x0bPingRequest\x12\x13\n\x0b\x64\x65stination\x18\x01 \x01(\t\x12\x0e\n\x06source\x18\x02 \x01(\t\x12\r\n\x05\x63ount\x18\x03 \x01(\x05\x12\x10\n\x08interval\x18\x04 \x01(\x03\x12\x0c\n\x04wait\x18\x05 \x01(\x03\x12\x0c\n\x04size\x18\x06 \x01(\x05\x12\x17\n\x0f\x64o_not_fragment\x18\x07 \x01(\x08\x12\x16\n\x0e\x64o_not_resolve\x18\x08 \x01(\x08\x12*\n\nl3protocol\x18\t \x01(\x0e\x32\x16.gnoi.types.L3Protocol\x12\x18\n\x10network_instance\x18\n \x01(\t\"\xc1\x01\n\x0cPingResponse\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x0c\n\x04time\x18\x02 \x01(\x03\x12\x0c\n\x04sent\x18\x03 \x01(\x05\x12\x10\n\x08received\x18\x04 \x01(\x05\x12\x10\n\x08min_time\x18\x05 \x01(\x03\x12\x10\n\x08\x61vg_time\x18\x06 \x01(\x03\x12\x10\n\x08max_time\x18\x07 \x01(\x03\x12\x0f\n\x07std_dev\x18\x08 \x01(\x03\x12\r\n\x05\x62ytes\x18\x0b \x01(\x05\x12\x10\n\x08sequence\x18\x0c \x01(\x05\x12\x0b\n\x03ttl\x18\r \x01(\x05\"\xe7\x02\n\x11TracerouteRequest\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x13\n\x0b\x64\x65stination\x18\x02 \x01(\t\x12\x13\n\x0binitial_ttl\x18\x03 
\x01(\r\x12\x0f\n\x07max_ttl\x18\x04 \x01(\x05\x12\x0c\n\x04wait\x18\x05 \x01(\x03\x12\x17\n\x0f\x64o_not_fragment\x18\x06 \x01(\x08\x12\x16\n\x0e\x64o_not_resolve\x18\x07 \x01(\x08\x12*\n\nl3protocol\x18\x08 \x01(\x0e\x32\x16.gnoi.types.L3Protocol\x12=\n\nl4protocol\x18\t \x01(\x0e\x32).gnoi.system.TracerouteRequest.L4Protocol\x12\x19\n\x11\x64o_not_lookup_asn\x18\n \x01(\x08\x12\x18\n\x10network_instance\x18\x0b \x01(\t\"(\n\nL4Protocol\x12\x08\n\x04ICMP\x10\x00\x12\x07\n\x03TCP\x10\x01\x12\x07\n\x03UDP\x10\x02\"\xda\x05\n\x12TracerouteResponse\x12\x18\n\x10\x64\x65stination_name\x18\x01 \x01(\t\x12\x1b\n\x13\x64\x65stination_address\x18\x02 \x01(\t\x12\x0c\n\x04hops\x18\x03 \x01(\x05\x12\x13\n\x0bpacket_size\x18\x04 \x01(\x05\x12\x0b\n\x03hop\x18\x05 \x01(\x05\x12\x0f\n\x07\x61\x64\x64ress\x18\x06 \x01(\t\x12\x0c\n\x04name\x18\x07 \x01(\t\x12\x0b\n\x03rtt\x18\x08 \x01(\x03\x12\x34\n\x05state\x18\t \x01(\x0e\x32%.gnoi.system.TracerouteResponse.State\x12\x11\n\ticmp_code\x18\n \x01(\x05\x12\x37\n\x04mpls\x18\x0b \x03(\x0b\x32).gnoi.system.TracerouteResponse.MplsEntry\x12\x0f\n\x07\x61s_path\x18\x0c \x03(\x05\x12\x42\n\ricmp_ext_data\x18\r \x03(\x0b\x32+.gnoi.system.TracerouteResponse.IcmpExtData\x1a\x38\n\x0bIcmpExtData\x12\r\n\x05\x63lass\x18\x01 \x01(\r\x12\x0c\n\x04type\x18\x02 \x01(\r\x12\x0c\n\x04\x64\x61ta\x18\x03 \x03(\r\x1a+\n\tMplsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\xf2\x01\n\x05State\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\x08\n\x04NONE\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x08\n\x04ICMP\x10\x03\x12\x14\n\x10HOST_UNREACHABLE\x10\x04\x12\x17\n\x13NETWORK_UNREACHABLE\x10\x05\x12\x18\n\x14PROTOCOL_UNREACHABLE\x10\x06\x12\x17\n\x13SOURCE_ROUTE_FAILED\x10\x07\x12\x18\n\x14\x46RAGMENTATION_NEEDED\x10\x08\x12\x0e\n\nPROHIBITED\x10\t\x12\x18\n\x14PRECEDENCE_VIOLATION\x10\n\x12\x15\n\x11PRECEDENCE_CUTOFF\x10\x0b\"t\n\x07Package\x12\x10\n\x08\x66ilename\x18\x01 \x01(\t\x12\x0f\n\x07version\x18\x04 
\x01(\t\x12\x10\n\x08\x61\x63tivate\x18\x05 \x01(\x08\x12\x34\n\x0fremote_download\x18\x06 \x01(\x0b\x32\x1b.gnoi.common.RemoteDownload\"\x81\x01\n\x11SetPackageRequest\x12\'\n\x07package\x18\x01 \x01(\x0b\x32\x14.gnoi.system.PackageH\x00\x12\x12\n\x08\x63ontents\x18\x02 \x01(\x0cH\x00\x12$\n\x04hash\x18\x03 \x01(\x0b\x32\x14.gnoi.types.HashTypeH\x00\x42\t\n\x07request\"\x14\n\x12SetPackageResponse\"\xdd\x01\n\x12KillProcessRequest\x12\x0b\n\x03pid\x18\x01 \x01(\r\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x36\n\x06signal\x18\x03 \x01(\x0e\x32&.gnoi.system.KillProcessRequest.Signal\x12\x0f\n\x07restart\x18\x04 \x01(\x08\"c\n\x06Signal\x12\x16\n\x12SIGNAL_UNSPECIFIED\x10\x00\x12\x0f\n\x0bSIGNAL_TERM\x10\x01\x12\x0f\n\x0bSIGNAL_KILL\x10\x02\x12\x0e\n\nSIGNAL_HUP\x10\x03\x12\x0f\n\x0bSIGNAL_ABRT\x10\x04\"\x15\n\x13KillProcessResponse*d\n\x0cRebootMethod\x12\x0b\n\x07UNKNOWN\x10\x00\x12\x08\n\x04\x43OLD\x10\x01\x12\r\n\tPOWERDOWN\x10\x02\x12\x08\n\x04HALT\x10\x03\x12\x08\n\x04WARM\x10\x04\x12\x07\n\x03NSF\x10\x05\x12\x0b\n\x07POWERUP\x10\x07\"\x04\x08\x06\x10\x06\x32\xea\x05\n\x06System\x12?\n\x04Ping\x12\x18.gnoi.system.PingRequest\x1a\x19.gnoi.system.PingResponse\"\x00\x30\x01\x12Q\n\nTraceroute\x12\x1e.gnoi.system.TracerouteRequest\x1a\x1f.gnoi.system.TracerouteResponse\"\x00\x30\x01\x12=\n\x04Time\x12\x18.gnoi.system.TimeRequest\x1a\x19.gnoi.system.TimeResponse\"\x00\x12Q\n\nSetPackage\x12\x1e.gnoi.system.SetPackageRequest\x1a\x1f.gnoi.system.SetPackageResponse\"\x00(\x01\x12s\n\x16SwitchControlProcessor\x12*.gnoi.system.SwitchControlProcessorRequest\x1a+.gnoi.system.SwitchControlProcessorResponse\"\x00\x12\x43\n\x06Reboot\x12\x1a.gnoi.system.RebootRequest\x1a\x1b.gnoi.system.RebootResponse\"\x00\x12U\n\x0cRebootStatus\x12 .gnoi.system.RebootStatusRequest\x1a!.gnoi.system.RebootStatusResponse\"\x00\x12U\n\x0c\x43\x61ncelReboot\x12 .gnoi.system.CancelRebootRequest\x1a!.gnoi.system.CancelRebootResponse\"\x00\x12R\n\x0bKillProcess\x12\x1f.gnoi.system.KillProcessRequest\x1a 
.gnoi.system.KillProcessResponse\"\x00\x42+Z!github.com/openconfig/gnoi/system\xd2>\x05\x31.4.0b\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'github.com.openconfig.gnoi.system.system_pb2', _globals) +if not _descriptor._USE_C_DESCRIPTORS: + _globals['DESCRIPTOR']._loaded_options = None + _globals['DESCRIPTOR']._serialized_options = b'Z!github.com/openconfig/gnoi/system\322>\0051.4.0' + _globals['_TRACEROUTERESPONSE_MPLSENTRY']._loaded_options = None + _globals['_TRACEROUTERESPONSE_MPLSENTRY']._serialized_options = b'8\001' + _globals['_REBOOTMETHOD']._serialized_start=3141 + _globals['_REBOOTMETHOD']._serialized_end=3241 + _globals['_SWITCHCONTROLPROCESSORREQUEST']._serialized_start=157 + _globals['_SWITCHCONTROLPROCESSORREQUEST']._serialized_end=233 + _globals['_SWITCHCONTROLPROCESSORRESPONSE']._serialized_start=235 + _globals['_SWITCHCONTROLPROCESSORRESPONSE']._serialized_end=345 + _globals['_REBOOTREQUEST']._serialized_start=348 + _globals['_REBOOTREQUEST']._serialized_end=494 + _globals['_REBOOTRESPONSE']._serialized_start=496 + _globals['_REBOOTRESPONSE']._serialized_end=512 + _globals['_CANCELREBOOTREQUEST']._serialized_start=514 + _globals['_CANCELREBOOTREQUEST']._serialized_end=593 + _globals['_CANCELREBOOTRESPONSE']._serialized_start=595 + _globals['_CANCELREBOOTRESPONSE']._serialized_end=617 + _globals['_REBOOTSTATUSREQUEST']._serialized_start=619 + _globals['_REBOOTSTATUSREQUEST']._serialized_end=681 + _globals['_REBOOTSTATUSRESPONSE']._serialized_start=684 + _globals['_REBOOTSTATUSRESPONSE']._serialized_end=867 + _globals['_REBOOTSTATUS']._serialized_start=870 + _globals['_REBOOTSTATUS']._serialized_end=1051 + _globals['_REBOOTSTATUS_STATUS']._serialized_start=953 + _globals['_REBOOTSTATUS_STATUS']._serialized_end=1051 + _globals['_TIMEREQUEST']._serialized_start=1053 + _globals['_TIMEREQUEST']._serialized_end=1066 + 
_globals['_TIMERESPONSE']._serialized_start=1068 + _globals['_TIMERESPONSE']._serialized_end=1096 + _globals['_PINGREQUEST']._serialized_start=1099 + _globals['_PINGREQUEST']._serialized_end=1329 + _globals['_PINGRESPONSE']._serialized_start=1332 + _globals['_PINGRESPONSE']._serialized_end=1525 + _globals['_TRACEROUTEREQUEST']._serialized_start=1528 + _globals['_TRACEROUTEREQUEST']._serialized_end=1887 + _globals['_TRACEROUTEREQUEST_L4PROTOCOL']._serialized_start=1847 + _globals['_TRACEROUTEREQUEST_L4PROTOCOL']._serialized_end=1887 + _globals['_TRACEROUTERESPONSE']._serialized_start=1890 + _globals['_TRACEROUTERESPONSE']._serialized_end=2620 + _globals['_TRACEROUTERESPONSE_ICMPEXTDATA']._serialized_start=2274 + _globals['_TRACEROUTERESPONSE_ICMPEXTDATA']._serialized_end=2330 + _globals['_TRACEROUTERESPONSE_MPLSENTRY']._serialized_start=2332 + _globals['_TRACEROUTERESPONSE_MPLSENTRY']._serialized_end=2375 + _globals['_TRACEROUTERESPONSE_STATE']._serialized_start=2378 + _globals['_TRACEROUTERESPONSE_STATE']._serialized_end=2620 + _globals['_PACKAGE']._serialized_start=2622 + _globals['_PACKAGE']._serialized_end=2738 + _globals['_SETPACKAGEREQUEST']._serialized_start=2741 + _globals['_SETPACKAGEREQUEST']._serialized_end=2870 + _globals['_SETPACKAGERESPONSE']._serialized_start=2872 + _globals['_SETPACKAGERESPONSE']._serialized_end=2892 + _globals['_KILLPROCESSREQUEST']._serialized_start=2895 + _globals['_KILLPROCESSREQUEST']._serialized_end=3116 + _globals['_KILLPROCESSREQUEST_SIGNAL']._serialized_start=3017 + _globals['_KILLPROCESSREQUEST_SIGNAL']._serialized_end=3116 + _globals['_KILLPROCESSRESPONSE']._serialized_start=3118 + _globals['_KILLPROCESSRESPONSE']._serialized_end=3139 + _globals['_SYSTEM']._serialized_start=3244 + _globals['_SYSTEM']._serialized_end=3990 +# @@protoc_insertion_point(module_scope) diff --git a/host_modules/gnoi/system_pb2_grpc.py b/host_modules/gnoi/system_pb2_grpc.py new file mode 100644 index 00000000..f75f64fc --- /dev/null +++ 
b/host_modules/gnoi/system_pb2_grpc.py @@ -0,0 +1,480 @@ +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +"""Client and server classes corresponding to protobuf-defined services.""" +import grpc +import warnings + +from host_modules.gnoi import system_pb2 as github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2 + +GRPC_GENERATED_VERSION = '1.80.0' +GRPC_VERSION = grpc.__version__ +_version_not_supported = False + +try: + from grpc._utilities import first_version_is_lower + _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION) +except ImportError: + _version_not_supported = True + +if _version_not_supported: + raise RuntimeError( + f'The grpc package installed is at version {GRPC_VERSION},' + + ' but the generated code in github.com/openconfig/gnoi/system/system_pb2_grpc.py depends on' + + f' grpcio>={GRPC_GENERATED_VERSION}.' + + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}' + + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.' + ) + + +class SystemStub(object): + """The gNOI service is a collection of operational RPC's that allow for the + management of a target outside of the configuration and telemetry pipeline. + """ + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. 
+ """ + self.Ping = channel.unary_stream( + '/gnoi.system.System/Ping', + request_serializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.PingRequest.SerializeToString, + response_deserializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.PingResponse.FromString, + _registered_method=True) + self.Traceroute = channel.unary_stream( + '/gnoi.system.System/Traceroute', + request_serializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.TracerouteRequest.SerializeToString, + response_deserializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.TracerouteResponse.FromString, + _registered_method=True) + self.Time = channel.unary_unary( + '/gnoi.system.System/Time', + request_serializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.TimeRequest.SerializeToString, + response_deserializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.TimeResponse.FromString, + _registered_method=True) + self.SetPackage = channel.stream_unary( + '/gnoi.system.System/SetPackage', + request_serializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.SetPackageRequest.SerializeToString, + response_deserializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.SetPackageResponse.FromString, + _registered_method=True) + self.SwitchControlProcessor = channel.unary_unary( + '/gnoi.system.System/SwitchControlProcessor', + request_serializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.SwitchControlProcessorRequest.SerializeToString, + response_deserializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.SwitchControlProcessorResponse.FromString, + _registered_method=True) + self.Reboot = channel.unary_unary( + '/gnoi.system.System/Reboot', + request_serializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.RebootRequest.SerializeToString, + 
response_deserializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.RebootResponse.FromString, + _registered_method=True) + self.RebootStatus = channel.unary_unary( + '/gnoi.system.System/RebootStatus', + request_serializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.RebootStatusRequest.SerializeToString, + response_deserializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.RebootStatusResponse.FromString, + _registered_method=True) + self.CancelReboot = channel.unary_unary( + '/gnoi.system.System/CancelReboot', + request_serializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.CancelRebootRequest.SerializeToString, + response_deserializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.CancelRebootResponse.FromString, + _registered_method=True) + self.KillProcess = channel.unary_unary( + '/gnoi.system.System/KillProcess', + request_serializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.KillProcessRequest.SerializeToString, + response_deserializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.KillProcessResponse.FromString, + _registered_method=True) + + +class SystemServicer(object): + """The gNOI service is a collection of operational RPC's that allow for the + management of a target outside of the configuration and telemetry pipeline. + """ + + def Ping(self, request, context): + """Ping executes the ping command on the target and streams back + the results. Some targets may not stream any results until all + results are in. The stream should provide single ping packet responses + and must provide summary statistics. + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def Traceroute(self, request, context): + """Traceroute executes the traceroute command on the target and streams back + the results. 
Some targets may not stream any results until all + results are in. If a hop count is not explicitly provided, + 30 is used. + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def Time(self, request, context): + """Time returns the current time on the target. Time is typically used to + test if a target is actually responding. + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def SetPackage(self, request_iterator, context): + """SetPackage places a software package (possibly including bootable images) + on the target. The file is sent in sequential messages, each message + up to 64KB of data. A final message must be sent that includes the hash + of the data sent. An error is returned if the location does not exist or + there is an error writing the data. If no checksum is received, the target + must assume the operation is incomplete and remove the partially + transmitted file. The target should initially write the file to a temporary + location so a failure does not destroy the original file. + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def SwitchControlProcessor(self, request, context): + """SwitchControlProcessor will switch from the current route processor to the + provided route processor. If the current route processor is the same as the + one provided it is a NOOP. If the target does not exist an error is + returned. + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def Reboot(self, request, context): + """Reboot causes the target to reboot, possibly at some point in the future. 
+ If the method of reboot is not supported then the Reboot RPC will fail. + If the reboot is immediate the command will block until the subcomponents + have restarted. + If a reboot on the active control processor is pending the service must + reject all other reboot requests. + If a reboot request for active control processor is initiated with other + pending reboot requests it must be rejected. + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def RebootStatus(self, request, context): + """RebootStatus returns the status of reboot for the target. + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def CancelReboot(self, request, context): + """CancelReboot cancels any pending reboot request. + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def KillProcess(self, request, context): + """KillProcess kills an OS process and optionally restarts it. 
+ """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + +def add_SystemServicer_to_server(servicer, server): + rpc_method_handlers = { + 'Ping': grpc.unary_stream_rpc_method_handler( + servicer.Ping, + request_deserializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.PingRequest.FromString, + response_serializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.PingResponse.SerializeToString, + ), + 'Traceroute': grpc.unary_stream_rpc_method_handler( + servicer.Traceroute, + request_deserializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.TracerouteRequest.FromString, + response_serializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.TracerouteResponse.SerializeToString, + ), + 'Time': grpc.unary_unary_rpc_method_handler( + servicer.Time, + request_deserializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.TimeRequest.FromString, + response_serializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.TimeResponse.SerializeToString, + ), + 'SetPackage': grpc.stream_unary_rpc_method_handler( + servicer.SetPackage, + request_deserializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.SetPackageRequest.FromString, + response_serializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.SetPackageResponse.SerializeToString, + ), + 'SwitchControlProcessor': grpc.unary_unary_rpc_method_handler( + servicer.SwitchControlProcessor, + request_deserializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.SwitchControlProcessorRequest.FromString, + response_serializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.SwitchControlProcessorResponse.SerializeToString, + ), + 'Reboot': grpc.unary_unary_rpc_method_handler( + servicer.Reboot, + 
request_deserializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.RebootRequest.FromString, + response_serializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.RebootResponse.SerializeToString, + ), + 'RebootStatus': grpc.unary_unary_rpc_method_handler( + servicer.RebootStatus, + request_deserializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.RebootStatusRequest.FromString, + response_serializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.RebootStatusResponse.SerializeToString, + ), + 'CancelReboot': grpc.unary_unary_rpc_method_handler( + servicer.CancelReboot, + request_deserializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.CancelRebootRequest.FromString, + response_serializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.CancelRebootResponse.SerializeToString, + ), + 'KillProcess': grpc.unary_unary_rpc_method_handler( + servicer.KillProcess, + request_deserializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.KillProcessRequest.FromString, + response_serializer=github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.KillProcessResponse.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler( + 'gnoi.system.System', rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) + server.add_registered_method_handlers('gnoi.system.System', rpc_method_handlers) + + + # This class is part of an EXPERIMENTAL API. +class System(object): + """The gNOI service is a collection of operational RPC's that allow for the + management of a target outside of the configuration and telemetry pipeline. 
+ """ + + @staticmethod + def Ping(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_stream( + request, + target, + '/gnoi.system.System/Ping', + github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.PingRequest.SerializeToString, + github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.PingResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + _registered_method=True) + + @staticmethod + def Traceroute(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_stream( + request, + target, + '/gnoi.system.System/Traceroute', + github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.TracerouteRequest.SerializeToString, + github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.TracerouteResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + _registered_method=True) + + @staticmethod + def Time(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, + target, + '/gnoi.system.System/Time', + github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.TimeRequest.SerializeToString, + github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.TimeResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + _registered_method=True) + + @staticmethod + def 
SetPackage(request_iterator, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.stream_unary( + request_iterator, + target, + '/gnoi.system.System/SetPackage', + github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.SetPackageRequest.SerializeToString, + github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.SetPackageResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + _registered_method=True) + + @staticmethod + def SwitchControlProcessor(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, + target, + '/gnoi.system.System/SwitchControlProcessor', + github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.SwitchControlProcessorRequest.SerializeToString, + github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.SwitchControlProcessorResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + _registered_method=True) + + @staticmethod + def Reboot(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, + target, + '/gnoi.system.System/Reboot', + github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.RebootRequest.SerializeToString, + github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.RebootResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + 
_registered_method=True) + + @staticmethod + def RebootStatus(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, + target, + '/gnoi.system.System/RebootStatus', + github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.RebootStatusRequest.SerializeToString, + github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.RebootStatusResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + _registered_method=True) + + @staticmethod + def CancelReboot(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, + target, + '/gnoi.system.System/CancelReboot', + github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.CancelRebootRequest.SerializeToString, + github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.CancelRebootResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + _registered_method=True) + + @staticmethod + def KillProcess(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, + target, + '/gnoi.system.System/KillProcess', + github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.KillProcessRequest.SerializeToString, + github_dot_com_dot_openconfig_dot_gnoi_dot_system_dot_system__pb2.KillProcessResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + 
timeout, + metadata, + _registered_method=True) diff --git a/host_modules/gnoi/types_pb2.py b/host_modules/gnoi/types_pb2.py new file mode 100644 index 00000000..549fd9bc --- /dev/null +++ b/host_modules/gnoi/types_pb2.py @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# NO CHECKED-IN PROTOBUF GENCODE +# source: github.com/openconfig/gnoi/types/types.proto +# Protobuf Python Version: 6.31.1 +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import runtime_version as _runtime_version +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +_runtime_version.ValidateProtobufRuntimeVersion( + _runtime_version.Domain.PUBLIC, + 6, + 31, + 1, + '', + 'github.com/openconfig/gnoi/types/types.proto' +) +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from google.protobuf import descriptor_pb2 as google_dot_protobuf_dot_descriptor__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n,github.com/openconfig/gnoi/types/types.proto\x12\ngnoi.types\x1a google/protobuf/descriptor.proto\"\x89\x01\n\x08HashType\x12/\n\x06method\x18\x01 \x01(\x0e\x32\x1f.gnoi.types.HashType.HashMethod\x12\x0c\n\x04hash\x18\x02 \x01(\x0c\">\n\nHashMethod\x12\x0f\n\x0bUNSPECIFIED\x10\x00\x12\n\n\x06SHA256\x10\x01\x12\n\n\x06SHA512\x10\x02\x12\x07\n\x03MD5\x10\x03\":\n\x04Path\x12\x0e\n\x06origin\x18\x02 \x01(\t\x12\"\n\x04\x65lem\x18\x03 \x03(\x0b\x32\x14.gnoi.types.PathElem\"p\n\x08PathElem\x12\x0c\n\x04name\x18\x01 \x01(\t\x12*\n\x03key\x18\x02 \x03(\x0b\x32\x1d.gnoi.types.PathElem.KeyEntry\x1a*\n\x08KeyEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"h\n\x0b\x43redentials\x12\x10\n\x08username\x18\x01 \x01(\t\x12\x13\n\tcleartext\x18\x02 
\x01(\tH\x00\x12&\n\x06hashed\x18\x03 \x01(\x0b\x32\x14.gnoi.types.HashTypeH\x00\x42\n\n\x08password*1\n\nL3Protocol\x12\x0f\n\x0bUNSPECIFIED\x10\x00\x12\x08\n\x04IPV4\x10\x01\x12\x08\n\x04IPV6\x10\x02:3\n\x0cgnoi_version\x12\x1c.google.protobuf.FileOptions\x18\xea\x07 \x01(\tB\"Z github.com/openconfig/gnoi/typesb\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'github.com.openconfig.gnoi.types.types_pb2', _globals) +if not _descriptor._USE_C_DESCRIPTORS: + _globals['DESCRIPTOR']._loaded_options = None + _globals['DESCRIPTOR']._serialized_options = b'Z github.com/openconfig/gnoi/types' + _globals['_PATHELEM_KEYENTRY']._loaded_options = None + _globals['_PATHELEM_KEYENTRY']._serialized_options = b'8\001' + _globals['_L3PROTOCOL']._serialized_start=514 + _globals['_L3PROTOCOL']._serialized_end=563 + _globals['_HASHTYPE']._serialized_start=95 + _globals['_HASHTYPE']._serialized_end=232 + _globals['_HASHTYPE_HASHMETHOD']._serialized_start=170 + _globals['_HASHTYPE_HASHMETHOD']._serialized_end=232 + _globals['_PATH']._serialized_start=234 + _globals['_PATH']._serialized_end=292 + _globals['_PATHELEM']._serialized_start=294 + _globals['_PATHELEM']._serialized_end=406 + _globals['_PATHELEM_KEYENTRY']._serialized_start=364 + _globals['_PATHELEM_KEYENTRY']._serialized_end=406 + _globals['_CREDENTIALS']._serialized_start=408 + _globals['_CREDENTIALS']._serialized_end=512 +# @@protoc_insertion_point(module_scope) diff --git a/host_modules/gnoi/types_pb2_grpc.py b/host_modules/gnoi/types_pb2_grpc.py new file mode 100644 index 00000000..5dce0282 --- /dev/null +++ b/host_modules/gnoi/types_pb2_grpc.py @@ -0,0 +1,24 @@ +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! 
+"""Client and server classes corresponding to protobuf-defined services.""" +import grpc +import warnings + + +GRPC_GENERATED_VERSION = '1.80.0' +GRPC_VERSION = grpc.__version__ +_version_not_supported = False + +try: + from grpc._utilities import first_version_is_lower + _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION) +except ImportError: + _version_not_supported = True + +if _version_not_supported: + raise RuntimeError( + f'The grpc package installed is at version {GRPC_VERSION},' + + ' but the generated code in github.com/openconfig/gnoi/types/types_pb2_grpc.py depends on' + + f' grpcio>={GRPC_GENERATED_VERSION}.' + + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}' + + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.' + ) diff --git a/scripts/gnoi_shutdown_daemon.py b/scripts/gnoi_shutdown_daemon.py index ee5f565d..70b26875 100644 --- a/scripts/gnoi_shutdown_daemon.py +++ b/scripts/gnoi_shutdown_daemon.py @@ -13,9 +13,12 @@ import os import redis import threading +import grpc import sonic_py_common.daemon_base as daemon_base from sonic_py_common import syslogger from swsscommon import swsscommon +from host_modules.gnoi.client import GnoiClient +from host_modules.gnoi import system_pb2 REBOOT_RPC_TIMEOUT_SEC = 60 # gNOI System.Reboot call timeout STATUS_POLL_TIMEOUT_SEC = 60 # overall time - polling RebootStatus @@ -187,36 +190,42 @@ def _wait_for_gnoi_halt_in_progress(self, dpu_name: str) -> bool: return False def _send_reboot_command(self, dpu_name: str, dpu_ip: str, port: str) -> bool: - """Send gNOI Reboot HALT command to the DPU.""" - reboot_cmd = [ - "docker", "exec", "gnmi", "gnoi_client", - f"-target={dpu_ip}:{port}", - "-logtostderr", "-notls", - "-module", "System", - "-rpc", "Reboot", - "-jsonin", json.dumps({"method": REBOOT_METHOD_HALT, "message": "Triggered by SmartSwitch graceful shutdown"}) - ] - rc, out, err = execute_command(reboot_cmd, 
timeout_sec=REBOOT_RPC_TIMEOUT_SEC, suppress_stderr=True) - if rc != 0: - logger.log_error(f"{dpu_name}: Reboot command failed") + """Send gNOI Reboot HALT command to the DPU via direct gRPC.""" + try: + with GnoiClient(f"{dpu_ip}:{port}", timeout=REBOOT_RPC_TIMEOUT_SEC) as client: + client.reboot( + method=system_pb2.HALT, + message="Triggered by SmartSwitch graceful shutdown", + ) + return True + except grpc.RpcError as e: + logger.log_error(f"{dpu_name}: Reboot RPC failed: {e.code()} {e.details()}") + return False + except Exception as e: + logger.log_error(f"{dpu_name}: Reboot command failed: {e}") return False - return True def _poll_reboot_status(self, dpu_name: str, dpu_ip: str, port: str) -> bool: - """Poll RebootStatus until completion or timeout.""" + """Poll RebootStatus via direct gRPC until completion or timeout.""" deadline = time.monotonic() + _get_halt_timeout() - status_cmd = [ - "docker", "exec", "gnmi", "gnoi_client", - f"-target={dpu_ip}:{port}", - "-logtostderr", "-notls", - "-module", "System", - "-rpc", "RebootStatus" - ] - while time.monotonic() < deadline: - rc_s, out_s, err_s = execute_command(status_cmd, timeout_sec=STATUS_RPC_TIMEOUT_SEC) - if rc_s == 0 and out_s and ("reboot complete" in out_s.lower()): - return True - time.sleep(STATUS_POLL_INTERVAL_SEC) + try: + with GnoiClient(f"{dpu_ip}:{port}", timeout=STATUS_RPC_TIMEOUT_SEC) as client: + while time.monotonic() < deadline: + try: + resp = client.reboot_status() + if not resp.active and resp.status.status == system_pb2.RebootStatus.Status.STATUS_SUCCESS: + return True + if not resp.active and resp.status.status != system_pb2.RebootStatus.Status.STATUS_SUCCESS: + logger.log_error( + f"{dpu_name}: RebootStatus reports failure: " + f"status={resp.status.status}, message={resp.status.message}" + ) + return False + except grpc.RpcError as e: + logger.log_warning(f"{dpu_name}: RebootStatus poll RPC error: {e.code()} {e.details()}") + time.sleep(STATUS_POLL_INTERVAL_SEC) + except Exception 
as e: + logger.log_error(f"{dpu_name}: RebootStatus polling failed: {e}") logger.log_notice(f"{dpu_name}: Timeout waiting for RebootStatus completion, proceeding with halt flag clear") return False diff --git a/setup.py b/setup.py index 0b7252ed..b6155010 100644 --- a/setup.py +++ b/setup.py @@ -31,11 +31,13 @@ maintainer_email = 'jolevequ@microsoft.com', packages = [ 'host_modules', + 'host_modules.gnoi', 'utils' ], # Map packages to their actual dirs package_dir = { 'host_modules': 'host_modules', + 'host_modules.gnoi': 'host_modules/gnoi', 'utils': 'utils' }, scripts=[ @@ -58,7 +60,9 @@ 'Jinja2>=2.10', 'PyGObject', 'pycairo==1.26.1', - 'psutil' + 'psutil', + 'grpcio', + 'protobuf' ] + sonic_dependencies, setup_requires = [ 'pytest-runner', diff --git a/tests/gnoi_shutdown_daemon_test.py b/tests/gnoi_shutdown_daemon_test.py index 76ab7536..8464f3f0 100644 --- a/tests/gnoi_shutdown_daemon_test.py +++ b/tests/gnoi_shutdown_daemon_test.py @@ -1,17 +1,48 @@ import unittest -from unittest.mock import patch, MagicMock, call +from unittest.mock import patch, MagicMock, call, PropertyMock import subprocess import sys import os import json -# Mock redis module (available in SONiC runtime, not in test environment) +# Mock SONiC and system modules not available in test environment sys.modules['redis'] = MagicMock() +sys.modules['sonic_py_common'] = MagicMock() +sys.modules['sonic_py_common.daemon_base'] = MagicMock() +sys.modules['sonic_py_common.syslogger'] = MagicMock() +sys.modules['swsscommon'] = MagicMock() +sys.modules['swsscommon.swsscommon'] = MagicMock() + +# Mock grpc before importing daemon +mock_grpc = MagicMock() +mock_grpc.RpcError = type('RpcError', (Exception,), {}) +sys.modules['grpc'] = mock_grpc + +# Mock the gnoi proto stubs +mock_system_pb2 = MagicMock() +mock_system_pb2.HALT = 3 +mock_system_pb2.RebootRequest = MagicMock() +mock_system_pb2.RebootStatusRequest = MagicMock() +# RebootStatus.Status.STATUS_SUCCESS +mock_status_enum = MagicMock() 
+mock_status_enum.STATUS_SUCCESS = 1 +mock_system_pb2.RebootStatus.Status = mock_status_enum + +mock_system_pb2_grpc = MagicMock() +sys.modules['host_modules'] = MagicMock() +sys.modules['host_modules.gnoi'] = MagicMock() +sys.modules['host_modules.gnoi.client'] = MagicMock() +sys.modules['host_modules.gnoi.system_pb2'] = mock_system_pb2 +sys.modules['host_modules.gnoi.system_pb2_grpc'] = mock_system_pb2_grpc sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'scripts'))) import gnoi_shutdown_daemon +# Patch the module-level references after import +gnoi_shutdown_daemon.grpc = mock_grpc +gnoi_shutdown_daemon.system_pb2 = mock_system_pb2 + # Common fixtures mock_message = { "type": "pmessage", @@ -28,11 +59,10 @@ class TestGnoiShutdownDaemon(unittest.TestCase): def setUp(self): - # Ensure a clean state for each test gnoi_shutdown_daemon.main = gnoi_shutdown_daemon.__dict__["main"] def test_execute_command_success(self): - """Test successful execution of a gNOI command.""" + """Test successful execution of a command.""" with patch("gnoi_shutdown_daemon.subprocess.run") as mock_run: mock_run.return_value = MagicMock(returncode=0, stdout="success", stderr="") rc, stdout, stderr = gnoi_shutdown_daemon.execute_command(["dummy"]) @@ -41,7 +71,7 @@ def test_execute_command_success(self): self.assertEqual(stderr, "") def test_execute_command_timeout(self): - """Test gNOI command timeout.""" + """Test command timeout.""" with patch("gnoi_shutdown_daemon.subprocess.run", side_effect=subprocess.TimeoutExpired(cmd=["dummy"], timeout=60)): rc, stdout, stderr = gnoi_shutdown_daemon.execute_command(["dummy"]) self.assertEqual(rc, -1) @@ -49,7 +79,7 @@ def test_execute_command_timeout(self): self.assertIn("Command timed out", stderr) def test_execute_command_exception(self): - """Test gNOI command failure due to an exception.""" + """Test command failure due to an exception.""" with patch("gnoi_shutdown_daemon.subprocess.run", 
side_effect=Exception("Test error")): rc, stdout, stderr = gnoi_shutdown_daemon.execute_command(["dummy"]) self.assertEqual(rc, -2) @@ -97,7 +127,6 @@ def test_get_halt_timeout_default(self): def test_get_halt_timeout_exception(self): """Test _get_halt_timeout returns default on exception.""" - # Mock sonic_platform import to succeed, then mock file operation to raise exception mock_chassis = MagicMock() mock_chassis.get_name.return_value = "test-platform" mock_platform_class = MagicMock() @@ -114,15 +143,12 @@ def test_get_halt_timeout_exception(self): @patch('threading.Thread') def test_main_loop_flow(self, mock_thread, mock_config_db_connector_class, mock_gnoi_reboot_handler, mock_db_connect): """Test the main loop processing of a shutdown event.""" - # Mock DB connections mock_state_db = MagicMock() mock_config_db = MagicMock() mock_db_connect.side_effect = [mock_state_db, mock_config_db] - # Mock config_db.hget to return admin_status=down to trigger thread creation mock_config_db.hget.return_value = "down" - # Mock ConfigDBConnector for pubsub mock_config_db_connector = MagicMock() mock_config_db_connector.db_name = "CONFIG_DB" mock_pubsub = MagicMock() @@ -132,24 +158,19 @@ def test_main_loop_flow(self, mock_thread, mock_config_db_connector_class, mock_ mock_config_db_connector.get_redis_client.return_value = mock_redis_client mock_config_db_connector_class.return_value = mock_config_db_connector - # Mock chassis mock_chassis = MagicMock() mock_platform_instance = MagicMock() mock_platform_instance.get_chassis.return_value = mock_chassis - # Create mock for sonic_platform.platform module mock_platform_submodule = MagicMock() mock_platform_submodule.Platform.return_value = mock_platform_instance - # Create mock for sonic_platform parent module mock_sonic_platform = MagicMock() mock_sonic_platform.platform = mock_platform_submodule - # Mock the reboot handler's _handle_transition to avoid actual execution mock_handler_instance = MagicMock() 
mock_gnoi_reboot_handler.return_value = mock_handler_instance - # Temporarily add mocks to sys.modules for the duration of this test with patch.dict('sys.modules', { 'sonic_platform': mock_sonic_platform, 'sonic_platform.platform': mock_platform_submodule @@ -157,28 +178,24 @@ def test_main_loop_flow(self, mock_thread, mock_config_db_connector_class, mock_ with self.assertRaises(KeyboardInterrupt): gnoi_shutdown_daemon.main() - # Verify initialization mock_db_connect.assert_has_calls([call("STATE_DB"), call("CONFIG_DB")]) mock_gnoi_reboot_handler.assert_called_with(mock_state_db, mock_config_db, mock_chassis) - # Verify that a thread was created to handle the transition mock_thread.assert_called_once() - # Verify the thread was started mock_thread.return_value.start.assert_called_once() @patch('gnoi_shutdown_daemon._get_halt_timeout', return_value=60) @patch('gnoi_shutdown_daemon.get_dpu_ip') @patch('gnoi_shutdown_daemon.get_dpu_gnmi_port') - @patch('gnoi_shutdown_daemon.execute_command') + @patch('gnoi_shutdown_daemon.GnoiClient') @patch('gnoi_shutdown_daemon.time.sleep') @patch('gnoi_shutdown_daemon.time.monotonic') - def test_handle_transition_success(self, mock_monotonic, mock_sleep, mock_execute_command, mock_get_gnmi_port, mock_get_dpu_ip, mock_get_halt_timeout): - """Test the full successful transition handling.""" + def test_handle_transition_success(self, mock_monotonic, mock_sleep, mock_gnoi_client_class, mock_get_gnmi_port, mock_get_dpu_ip, mock_get_halt_timeout): + """Test the full successful transition handling with native gRPC.""" mock_db = MagicMock() mock_config_db = MagicMock() mock_chassis = MagicMock() - # Mock return values mock_get_dpu_ip.return_value = "10.0.0.1" mock_get_gnmi_port.return_value = "8080" @@ -186,17 +203,27 @@ def test_handle_transition_success(self, mock_monotonic, mock_sleep, mock_execut mock_table = MagicMock() mock_table.get.return_value = (True, [("gnoi_halt_in_progress", "True")]) - # Mock time for polling 
mock_monotonic.side_effect = [ 0, 1, # For _wait_for_gnoi_halt_in_progress 2, 3 # For _poll_reboot_status ] - # Reboot command success, RebootStatus success - mock_execute_command.side_effect = [ - (0, "reboot sent", ""), - (0, "reboot complete", "") - ] + # Mock gRPC client for reboot + mock_reboot_client = MagicMock() + mock_reboot_client.__enter__ = MagicMock(return_value=mock_reboot_client) + mock_reboot_client.__exit__ = MagicMock(return_value=False) + + # Mock gRPC client for status poll + mock_status_client = MagicMock() + mock_status_client.__enter__ = MagicMock(return_value=mock_status_client) + mock_status_client.__exit__ = MagicMock(return_value=False) + mock_status_resp = MagicMock() + mock_status_resp.active = False + mock_status_resp.status.status = mock_status_enum.STATUS_SUCCESS + mock_status_client.reboot_status.return_value = mock_status_resp + + # Two GnoiClient() calls: one for reboot, one for status polling + mock_gnoi_client_class.side_effect = [mock_reboot_client, mock_status_client] # Mock module for clear operation mock_module = MagicMock() @@ -208,18 +235,19 @@ def test_handle_transition_success(self, mock_monotonic, mock_sleep, mock_execut result = handler._handle_transition("DPU0", "shutdown") self.assertTrue(result) + mock_reboot_client.reboot.assert_called_once() + mock_status_client.reboot_status.assert_called_once() mock_chassis.get_module_index.assert_called_with("DPU0") mock_chassis.get_module.assert_called_with(0) mock_module.clear_module_gnoi_halt_in_progress.assert_called_once() - self.assertEqual(mock_execute_command.call_count, 2) @patch('gnoi_shutdown_daemon._get_halt_timeout', return_value=60) @patch('gnoi_shutdown_daemon.get_dpu_ip') @patch('gnoi_shutdown_daemon.get_dpu_gnmi_port') + @patch('gnoi_shutdown_daemon.GnoiClient') @patch('gnoi_shutdown_daemon.time.sleep') @patch('gnoi_shutdown_daemon.time.monotonic') - @patch('gnoi_shutdown_daemon.execute_command') - def test_handle_transition_gnoi_halt_timeout(self, 
mock_execute_command, mock_monotonic, mock_sleep, mock_get_gnmi_port, mock_get_dpu_ip, mock_get_halt_timeout): + def test_handle_transition_gnoi_halt_timeout(self, mock_monotonic, mock_sleep, mock_gnoi_client_class, mock_get_gnmi_port, mock_get_dpu_ip, mock_get_halt_timeout): """Test transition proceeds despite gnoi_halt_in_progress timeout.""" mock_db = MagicMock() mock_config_db = MagicMock() @@ -228,25 +256,29 @@ def test_handle_transition_gnoi_halt_timeout(self, mock_execute_command, mock_mo mock_get_dpu_ip.return_value = "10.0.0.1" mock_get_gnmi_port.return_value = "8080" - # Mock table.get() to never return True (simulates timeout in wait) mock_table = MagicMock() mock_table.get.return_value = (True, [("gnoi_halt_in_progress", "False")]) - # Simulate timeout in _wait_for_gnoi_halt_in_progress, then success in _poll_reboot_status mock_monotonic.side_effect = [ - # _wait_for_gnoi_halt_in_progress times out 0, 1, 2, gnoi_shutdown_daemon.STATUS_POLL_TIMEOUT_SEC + 1, - # _poll_reboot_status succeeds 0, 1 ] - # Reboot command and status succeed - mock_execute_command.side_effect = [ - (0, "reboot sent", ""), - (0, "reboot complete", "") - ] + # Mock gRPC clients + mock_reboot_client = MagicMock() + mock_reboot_client.__enter__ = MagicMock(return_value=mock_reboot_client) + mock_reboot_client.__exit__ = MagicMock(return_value=False) + + mock_status_client = MagicMock() + mock_status_client.__enter__ = MagicMock(return_value=mock_status_client) + mock_status_client.__exit__ = MagicMock(return_value=False) + mock_status_resp = MagicMock() + mock_status_resp.active = False + mock_status_resp.status.status = mock_status_enum.STATUS_SUCCESS + mock_status_client.reboot_status.return_value = mock_status_resp + + mock_gnoi_client_class.side_effect = [mock_reboot_client, mock_status_client] - # Mock module for clear operation mock_module = MagicMock() mock_chassis.get_module_index.return_value = 0 mock_chassis.get_module.return_value = mock_module @@ -255,7 +287,6 @@ def 
test_handle_transition_gnoi_halt_timeout(self, mock_execute_command, mock_mo handler = gnoi_shutdown_daemon.GnoiRebootHandler(mock_db, mock_config_db, mock_chassis) result = handler._handle_transition("DPU0", "shutdown") - # Should still succeed - code proceeds anyway after timeout warning self.assertTrue(result) mock_chassis.get_module_index.assert_called_with("DPU0") mock_chassis.get_module.assert_called_with(0) @@ -263,7 +294,6 @@ def test_handle_transition_gnoi_halt_timeout(self, mock_execute_command, mock_mo def test_get_dpu_ip_and_port(self): """Test DPU IP and gNMI port retrieval.""" - # Test IP retrieval mock_config = MagicMock() mock_config.hget.return_value = "10.0.0.1" @@ -271,14 +301,12 @@ def test_get_dpu_ip_and_port(self): self.assertEqual(ip, "10.0.0.1") mock_config.hget.assert_called_with("DHCP_SERVER_IPV4_PORT|bridge-midplane|dpu0", "ips@") - # Test port retrieval mock_config = MagicMock() mock_config.hget.return_value = "12345" port = gnoi_shutdown_daemon.get_dpu_gnmi_port(mock_config, "DPU0") self.assertEqual(port, "12345") - # Test port fallback mock_config = MagicMock() mock_config.hget.return_value = None @@ -294,40 +322,182 @@ def test_handle_transition_ip_failure(self, mock_get_gnmi_port, mock_get_dpu_ip, mock_config_db = MagicMock() mock_chassis = MagicMock() - # Mock module for clear operation mock_module = MagicMock() mock_chassis.get_module_index.return_value = 0 mock_chassis.get_module.return_value = mock_module handler = gnoi_shutdown_daemon.GnoiRebootHandler(mock_db, mock_config_db, mock_chassis) - - # Mock _wait_for_gnoi_halt_in_progress to return immediately to prevent hanging handler._wait_for_gnoi_halt_in_progress = MagicMock(return_value=True) - + result = handler._handle_transition("DPU0", "shutdown") self.assertFalse(result) - # Verify that clear_module_gnoi_halt_in_progress was called mock_chassis.get_module_index.assert_called_with("DPU0") mock_chassis.get_module.assert_called_with(0) 
mock_module.clear_module_gnoi_halt_in_progress.assert_called_once() - @patch('gnoi_shutdown_daemon.get_dpu_ip', return_value="10.0.0.1") - @patch('gnoi_shutdown_daemon.get_dpu_gnmi_port', return_value="8080") - @patch('gnoi_shutdown_daemon.execute_command', return_value=(-1, "", "error")) - def test_send_reboot_command_failure(self, mock_execute, mock_get_port, mock_get_ip): - """Test failure of _send_reboot_command.""" + @patch('gnoi_shutdown_daemon.GnoiClient') + def test_send_reboot_command_grpc_error(self, mock_gnoi_client_class): + """Test _send_reboot_command handles gRPC errors with code/details.""" + mock_client = MagicMock() + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + + # Create a proper RpcError with code() and details() + rpc_error = mock_grpc.RpcError() + rpc_error.code = MagicMock(return_value="UNAVAILABLE") + rpc_error.details = MagicMock(return_value="connection refused") + mock_client.reboot.side_effect = rpc_error + + mock_gnoi_client_class.return_value = mock_client + handler = gnoi_shutdown_daemon.GnoiRebootHandler(MagicMock(), MagicMock(), MagicMock()) result = handler._send_reboot_command("DPU0", "10.0.0.1", "8080") self.assertFalse(result) + @patch('gnoi_shutdown_daemon.GnoiClient') + def test_send_reboot_command_success(self, mock_gnoi_client_class): + """Test successful _send_reboot_command via gRPC.""" + mock_client = MagicMock() + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_gnoi_client_class.return_value = mock_client + + handler = gnoi_shutdown_daemon.GnoiRebootHandler(MagicMock(), MagicMock(), MagicMock()) + result = handler._send_reboot_command("DPU0", "10.0.0.1", "8080") + self.assertTrue(result) + mock_client.reboot.assert_called_once() + + @patch('gnoi_shutdown_daemon._get_halt_timeout', return_value=60) + @patch('gnoi_shutdown_daemon.GnoiClient') + 
@patch('gnoi_shutdown_daemon.time.sleep') + @patch('gnoi_shutdown_daemon.time.monotonic') + def test_poll_reboot_status_success(self, mock_monotonic, mock_sleep, mock_gnoi_client_class, mock_get_halt_timeout): + """Test _poll_reboot_status returns True on successful completion.""" + mock_monotonic.side_effect = [0, 1] + + mock_client = MagicMock() + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_resp = MagicMock() + mock_resp.active = False + mock_resp.status.status = mock_status_enum.STATUS_SUCCESS + mock_client.reboot_status.return_value = mock_resp + mock_gnoi_client_class.return_value = mock_client + + handler = gnoi_shutdown_daemon.GnoiRebootHandler(MagicMock(), MagicMock(), MagicMock()) + result = handler._poll_reboot_status("DPU0", "10.0.0.1", "8080") + self.assertTrue(result) + + @patch('gnoi_shutdown_daemon._get_halt_timeout', return_value=60) + @patch('gnoi_shutdown_daemon.GnoiClient') + @patch('gnoi_shutdown_daemon.time.sleep') + @patch('gnoi_shutdown_daemon.time.monotonic') + def test_poll_reboot_status_timeout(self, mock_monotonic, mock_sleep, mock_gnoi_client_class, mock_get_halt_timeout): + """Test _poll_reboot_status returns False on timeout.""" + mock_monotonic.side_effect = [0, 1, 61] + + mock_client = MagicMock() + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_resp = MagicMock() + mock_resp.active = True # still in progress + mock_client.reboot_status.return_value = mock_resp + mock_gnoi_client_class.return_value = mock_client + + handler = gnoi_shutdown_daemon.GnoiRebootHandler(MagicMock(), MagicMock(), MagicMock()) + result = handler._poll_reboot_status("DPU0", "10.0.0.1", "8080") + self.assertFalse(result) + + @patch('gnoi_shutdown_daemon._get_halt_timeout', return_value=60) + @patch('gnoi_shutdown_daemon.GnoiClient') + @patch('gnoi_shutdown_daemon.time.sleep') + 
@patch('gnoi_shutdown_daemon.time.monotonic') + def test_poll_reboot_status_failure_status(self, mock_monotonic, mock_sleep, mock_gnoi_client_class, mock_get_halt_timeout): + """Test _poll_reboot_status returns False when DPU reports failure.""" + mock_monotonic.side_effect = [0, 1] + + mock_client = MagicMock() + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_resp = MagicMock() + mock_resp.active = False + mock_resp.status.status = 2 # STATUS_FAILURE or similar + mock_resp.status.message = "internal error" + mock_client.reboot_status.return_value = mock_resp + mock_gnoi_client_class.return_value = mock_client + + handler = gnoi_shutdown_daemon.GnoiRebootHandler(MagicMock(), MagicMock(), MagicMock()) + result = handler._poll_reboot_status("DPU0", "10.0.0.1", "8080") + self.assertFalse(result) + + @patch('gnoi_shutdown_daemon._get_halt_timeout', return_value=60) + @patch('gnoi_shutdown_daemon.GnoiClient') + @patch('gnoi_shutdown_daemon.time.sleep') + @patch('gnoi_shutdown_daemon.time.monotonic') + def test_poll_reboot_status_rpc_error_recovery(self, mock_monotonic, mock_sleep, mock_gnoi_client_class, mock_get_halt_timeout): + """Test _poll_reboot_status recovers from transient RPC errors.""" + mock_monotonic.side_effect = [0, 1, 2] + + mock_client = MagicMock() + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + + # First call fails with RPC error, second succeeds + rpc_error = mock_grpc.RpcError() + rpc_error.code = MagicMock(return_value="UNAVAILABLE") + rpc_error.details = MagicMock(return_value="transient") + + mock_resp = MagicMock() + mock_resp.active = False + mock_resp.status.status = mock_status_enum.STATUS_SUCCESS + + mock_client.reboot_status.side_effect = [rpc_error, mock_resp] + mock_gnoi_client_class.return_value = mock_client + + handler = gnoi_shutdown_daemon.GnoiRebootHandler(MagicMock(), MagicMock(), 
MagicMock()) + result = handler._poll_reboot_status("DPU0", "10.0.0.1", "8080") + self.assertTrue(result) + + def test_get_dpu_ip_with_string_ips(self): + """Test get_dpu_ip when ips is a string instead of list.""" + mock_config = MagicMock() + mock_config.hget.return_value = "10.0.0.5" + + ip = gnoi_shutdown_daemon.get_dpu_ip(mock_config, "DPU1") + self.assertEqual(ip, "10.0.0.5") + + def test_get_dpu_ip_empty_entry(self): + """Test get_dpu_ip when entry is empty.""" + mock_config = MagicMock() + mock_config.hget.return_value = None + + ip = gnoi_shutdown_daemon.get_dpu_ip(mock_config, "DPU1") + self.assertIsNone(ip) + + def test_get_dpu_ip_exception(self): + """Test get_dpu_ip when exception occurs.""" + mock_config = MagicMock() + mock_config.hget.side_effect = AttributeError("Database error") + + ip = gnoi_shutdown_daemon.get_dpu_ip(mock_config, "DPU1") + self.assertIsNone(ip) + + def test_get_dpu_gnmi_port_exception(self): + """Test get_dpu_gnmi_port when exception occurs.""" + mock_config = MagicMock() + mock_config.hget.side_effect = AttributeError("Database error") + + port = gnoi_shutdown_daemon.get_dpu_gnmi_port(mock_config, "DPU1") + self.assertEqual(port, "8080") + def test_get_dpu_gnmi_port_variants(self): """Test DPU gNMI port retrieval with name variants.""" mock_config = MagicMock() mock_config.hget.side_effect = [ - None, # dpu0 fails - None, # DPU0 fails - "12345" # DPU0 succeeds + None, + None, + "12345" ] port = gnoi_shutdown_daemon.get_dpu_gnmi_port(mock_config, "DPU0") @@ -342,26 +512,21 @@ def test_main_loop_no_dpu_name(self, mock_config_db_connector_class, mock_db_con mock_platform_instance = MagicMock() mock_platform_instance.get_chassis.return_value = mock_chassis - # Create mock for sonic_platform.platform module mock_platform_submodule = MagicMock() mock_platform_submodule.Platform.return_value = mock_platform_instance - # Create mock for sonic_platform parent module mock_sonic_platform = MagicMock() mock_sonic_platform.platform = 
mock_platform_submodule mock_pubsub = MagicMock() - # Malformed message, then stop malformed_message = mock_message.copy() malformed_message["channel"] = f"__keyspace@{gnoi_shutdown_daemon.CONFIG_DB_INDEX}__:CHASSIS_MODULE|" mock_pubsub.get_message.side_effect = [malformed_message, KeyboardInterrupt] - # Mock DB connections mock_state_db = MagicMock() mock_config_db = MagicMock() mock_db_connect.side_effect = [mock_state_db, mock_config_db] - # Mock ConfigDBConnector for pubsub mock_config_db_connector = MagicMock() mock_config_db_connector.db_name = "CONFIG_DB" mock_redis_client = MagicMock() @@ -384,24 +549,20 @@ def test_main_loop_get_transition_exception(self, mock_config_db_connector_class mock_platform_instance = MagicMock() mock_platform_instance.get_chassis.return_value = mock_chassis - # Create mock for sonic_platform.platform module mock_platform_submodule = MagicMock() mock_platform_submodule.Platform.return_value = mock_platform_instance - # Create mock for sonic_platform parent module mock_sonic_platform = MagicMock() mock_sonic_platform.platform = mock_platform_submodule mock_pubsub = MagicMock() mock_pubsub.get_message.side_effect = [mock_message, KeyboardInterrupt] - # Mock config_db to raise exception on hget mock_config_db = MagicMock() mock_state_db = MagicMock() mock_db_connect.side_effect = [mock_state_db, mock_config_db] mock_config_db.hget.side_effect = AttributeError("DB error") - # Mock ConfigDBConnector for pubsub mock_config_db_connector = MagicMock() mock_config_db_connector.db_name = "CONFIG_DB" mock_redis_client = MagicMock() @@ -416,98 +577,6 @@ def test_main_loop_get_transition_exception(self, mock_config_db_connector_class with self.assertRaises(KeyboardInterrupt): gnoi_shutdown_daemon.main() - @patch('gnoi_shutdown_daemon._get_halt_timeout', return_value=60) - @patch('gnoi_shutdown_daemon.execute_command', return_value=(-1, "", "RPC error")) - def test_poll_reboot_status_failure(self, mock_execute_command, mock_get_halt_timeout): - 
"""Test _poll_reboot_status with a command failure.""" - handler = gnoi_shutdown_daemon.GnoiRebootHandler(MagicMock(), MagicMock(), MagicMock()) - with patch('gnoi_shutdown_daemon.time.monotonic', side_effect=[0, 1, 61]): - result = handler._poll_reboot_status("DPU0", "10.0.0.1", "8080") - self.assertFalse(result) - - def test_sonic_platform_import_mock(self): - """Simple test to verify sonic_platform import mocking works.""" - # Create mock chassis - mock_chassis = MagicMock() - mock_chassis.get_name.return_value = "test_chassis" - - # Create mock platform instance that returns our chassis - mock_platform_instance = MagicMock() - mock_platform_instance.get_chassis.return_value = mock_chassis - - # Create mock Platform class - mock_platform_class = MagicMock(return_value=mock_platform_instance) - - # Create mock for sonic_platform.platform module - mock_platform_submodule = MagicMock() - mock_platform_submodule.Platform = mock_platform_class - - # Create mock for sonic_platform parent module - mock_sonic_platform = MagicMock() - mock_sonic_platform.platform = mock_platform_submodule - - # Test that we can mock the import - with patch.dict('sys.modules', { - 'sonic_platform': mock_sonic_platform, - 'sonic_platform.platform': mock_platform_submodule - }): - # Simulate what the actual code does - from sonic_platform import platform - chassis = platform.Platform().get_chassis() - - # Verify it worked - self.assertEqual(chassis, mock_chassis) - self.assertEqual(chassis.get_name(), "test_chassis") - mock_platform_class.assert_called_once() - mock_platform_instance.get_chassis.assert_called_once() - - def test_get_dpu_ip_with_string_ips(self): - """Test get_dpu_ip when ips is a string instead of list.""" - mock_config = MagicMock() - mock_config.hget.return_value = "10.0.0.5" - - ip = gnoi_shutdown_daemon.get_dpu_ip(mock_config, "DPU1") - self.assertEqual(ip, "10.0.0.5") - - def test_get_dpu_ip_empty_entry(self): - """Test get_dpu_ip when entry is empty.""" - mock_config 
= MagicMock() - mock_config.hget.return_value = None - - ip = gnoi_shutdown_daemon.get_dpu_ip(mock_config, "DPU1") - self.assertIsNone(ip) - - def test_get_dpu_ip_no_ips_field(self): - """Test get_dpu_ip when hget returns None (field doesn't exist).""" - mock_config = MagicMock() - mock_config.hget.return_value = None - - ip = gnoi_shutdown_daemon.get_dpu_ip(mock_config, "DPU1") - self.assertIsNone(ip) - - def test_get_dpu_ip_exception(self): - """Test get_dpu_ip when exception occurs.""" - mock_config = MagicMock() - mock_config.hget.side_effect = AttributeError("Database error") - - ip = gnoi_shutdown_daemon.get_dpu_ip(mock_config, "DPU1") - self.assertIsNone(ip) - - def test_get_dpu_gnmi_port_exception(self): - """Test get_dpu_gnmi_port when exception occurs.""" - mock_config = MagicMock() - mock_config.hget.side_effect = AttributeError("Database error") - - port = gnoi_shutdown_daemon.get_dpu_gnmi_port(mock_config, "DPU1") - self.assertEqual(port, "8080") - - def test_send_reboot_command_success(self): - """Test successful _send_reboot_command.""" - with patch('gnoi_shutdown_daemon.execute_command', return_value=(0, "success", "")): - handler = gnoi_shutdown_daemon.GnoiRebootHandler(MagicMock(), MagicMock(), MagicMock()) - result = handler._send_reboot_command("DPU0", "10.0.0.1", "8080") - self.assertTrue(result) - @patch('gnoi_shutdown_daemon._get_halt_timeout', return_value=60) @patch('gnoi_shutdown_daemon.get_dpu_ip', return_value="10.0.0.1") @patch('gnoi_shutdown_daemon.get_dpu_gnmi_port', side_effect=Exception("Port lookup failed")) @@ -517,20 +586,16 @@ def test_handle_transition_config_exception(self, mock_get_port, mock_get_ip, mo mock_config_db = MagicMock() mock_chassis = MagicMock() - # Mock module for clear operation mock_module = MagicMock() mock_chassis.get_module_index.return_value = 0 mock_chassis.get_module.return_value = mock_module handler = gnoi_shutdown_daemon.GnoiRebootHandler(mock_db, mock_config_db, mock_chassis) - - # Mock 
_wait_for_gnoi_halt_in_progress to return immediately to prevent hanging handler._wait_for_gnoi_halt_in_progress = MagicMock(return_value=True) - + result = handler._handle_transition("DPU0", "shutdown") self.assertFalse(result) - # Verify that clear_module_gnoi_halt_in_progress was called mock_chassis.get_module_index.assert_called_with("DPU0") mock_chassis.get_module.assert_called_with(0) mock_module.clear_module_gnoi_halt_in_progress.assert_called_once() From 5b12fba6453b0fa3571de5186169e1ec2bc8664f Mon Sep 17 00:00:00 2001 From: Dawei Huang Date: Mon, 30 Mar 2026 16:28:25 +0000 Subject: [PATCH 11/12] Refactor GnoiClient to be service-agnostic GnoiClient now manages the gRPC channel and exposes service stubs via properties (e.g. client.system). The System convenience methods (reboot, reboot_status, cancel_reboot) remain for backward compat, but future gNOI services (Healthz, Cert, etc.) can add their own stubs without modifying the client class. Also exposes client.channel for direct stub construction when needed. Signed-off-by: sigabrtv1-ui Signed-off-by: Dawei Huang --- host_modules/gnoi/client.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/host_modules/gnoi/client.py b/host_modules/gnoi/client.py index a480da89..f1443ce7 100644 --- a/host_modules/gnoi/client.py +++ b/host_modules/gnoi/client.py @@ -1,8 +1,8 @@ """ -Lightweight Python gRPC client for gNOI System service. +Lightweight Python gRPC client for gNOI services. -Wraps the vendored proto stubs to provide reboot() and reboot_status() -with proper error handling and structured responses. +Manages the gRPC channel and provides access to gNOI service stubs +(System, etc.) with proper lifecycle management. 
""" import grpc @@ -11,7 +11,7 @@ class GnoiClient: - """gNOI System service client using direct gRPC (no Docker/subprocess).""" + """gNOI client managing a gRPC channel with access to multiple service stubs.""" def __init__(self, target, timeout=60): """ @@ -22,11 +22,9 @@ def __init__(self, target, timeout=60): self._target = target self._timeout = timeout self._channel = None - self._stub = None def __enter__(self): self._channel = grpc.insecure_channel(self._target) - self._stub = system_pb2_grpc.SystemStub(self._channel) return self def __exit__(self, exc_type, exc_val, exc_tb): @@ -37,7 +35,18 @@ def close(self): if self._channel: self._channel.close() self._channel = None - self._stub = None + + @property + def channel(self): + """Access the underlying gRPC channel for custom stubs.""" + return self._channel + + @property + def system(self): + """gNOI System service stub.""" + return system_pb2_grpc.SystemStub(self._channel) + + # ---- System service convenience methods ---- def reboot(self, method, message="", timeout=None): """ @@ -58,7 +67,7 @@ def reboot(self, method, message="", timeout=None): method=method, message=message, ) - return self._stub.Reboot(request, timeout=timeout or self._timeout) + return self.system.Reboot(request, timeout=timeout or self._timeout) def reboot_status(self, timeout=None): """ @@ -79,7 +88,7 @@ def reboot_status(self, timeout=None): grpc.RpcError: On any gRPC failure. """ request = system_pb2.RebootStatusRequest() - return self._stub.RebootStatus(request, timeout=timeout or self._timeout) + return self.system.RebootStatus(request, timeout=timeout or self._timeout) def cancel_reboot(self, message="", timeout=None): """ @@ -96,4 +105,4 @@ def cancel_reboot(self, message="", timeout=None): grpc.RpcError: On any gRPC failure. 
""" request = system_pb2.CancelRebootRequest(message=message) - return self._stub.CancelReboot(request, timeout=timeout or self._timeout) + return self.system.CancelReboot(request, timeout=timeout or self._timeout) From 9fa800002547f2920f965167f1f442a41a386798 Mon Sep 17 00:00:00 2001 From: Dawei Huang Date: Mon, 30 Mar 2026 16:33:01 +0000 Subject: [PATCH 12/12] Route RPCs through service stubs: client.system.Reboot() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove convenience methods from GnoiClient. All RPCs now go through service stub properties: client.system.Reboot(), client.system.RebootStatus(). This keeps the client clean for future gNOI services — each service gets its own stub property, callers use the standard protobuf API. Signed-off-by: sigabrtv1-ui Signed-off-by: Dawei Huang --- host_modules/gnoi/client.py | 78 ++++-------------------------- scripts/gnoi_shutdown_daemon.py | 10 ++-- tests/gnoi_shutdown_daemon_test.py | 15 +++--- 3 files changed, 24 insertions(+), 79 deletions(-) diff --git a/host_modules/gnoi/client.py b/host_modules/gnoi/client.py index f1443ce7..fb787055 100644 --- a/host_modules/gnoi/client.py +++ b/host_modules/gnoi/client.py @@ -1,26 +1,27 @@ """ Lightweight Python gRPC client for gNOI services. -Manages the gRPC channel and provides access to gNOI service stubs -(System, etc.) with proper lifecycle management. +Manages the gRPC channel and provides access to gNOI service stubs. 
+All RPCs are accessed through service properties, e.g.: + + with GnoiClient("10.0.0.1:8080") as client: + client.system.Reboot(request, timeout=60) + client.system.RebootStatus(request, timeout=10) """ import grpc -from host_modules.gnoi import system_pb2 from host_modules.gnoi import system_pb2_grpc class GnoiClient: - """gNOI client managing a gRPC channel with access to multiple service stubs.""" + """gNOI client managing a gRPC channel with access to service stubs.""" - def __init__(self, target, timeout=60): + def __init__(self, target): """ Args: target: gRPC target address, e.g. "10.0.0.1:8080" - timeout: Default RPC timeout in seconds. """ self._target = target - self._timeout = timeout self._channel = None def __enter__(self): @@ -43,66 +44,5 @@ def channel(self): @property def system(self): - """gNOI System service stub.""" + """gNOI System service stub (gnoi.system.System).""" return system_pb2_grpc.SystemStub(self._channel) - - # ---- System service convenience methods ---- - - def reboot(self, method, message="", timeout=None): - """ - Send a gNOI System.Reboot RPC. - - Args: - method: RebootMethod enum value (e.g. system_pb2.HALT = 3). - message: Human-readable reason string. - timeout: RPC timeout in seconds (overrides default). - - Returns: - RebootResponse protobuf message. - - Raises: - grpc.RpcError: On any gRPC failure (with code() and details()). - """ - request = system_pb2.RebootRequest( - method=method, - message=message, - ) - return self.system.Reboot(request, timeout=timeout or self._timeout) - - def reboot_status(self, timeout=None): - """ - Poll gNOI System.RebootStatus RPC. - - Args: - timeout: RPC timeout in seconds (overrides default). 
- - Returns: - RebootStatusResponse protobuf message with fields: - - active (bool): True if reboot is still in progress - - wait (uint64): nanoseconds before next poll - - when (uint64): reboot scheduled time - - reason (str): reason for reboot - - count (uint32): number of reboots since active - - Raises: - grpc.RpcError: On any gRPC failure. - """ - request = system_pb2.RebootStatusRequest() - return self.system.RebootStatus(request, timeout=timeout or self._timeout) - - def cancel_reboot(self, message="", timeout=None): - """ - Cancel a pending reboot via gNOI System.CancelReboot RPC. - - Args: - message: Human-readable reason for cancellation. - timeout: RPC timeout in seconds (overrides default). - - Returns: - CancelRebootResponse protobuf message. - - Raises: - grpc.RpcError: On any gRPC failure. - """ - request = system_pb2.CancelRebootRequest(message=message) - return self.system.CancelReboot(request, timeout=timeout or self._timeout) diff --git a/scripts/gnoi_shutdown_daemon.py b/scripts/gnoi_shutdown_daemon.py index 381b16ef..b9bfdd97 100644 --- a/scripts/gnoi_shutdown_daemon.py +++ b/scripts/gnoi_shutdown_daemon.py @@ -240,11 +240,12 @@ def _wait_for_gnoi_halt_in_progress(self, dpu_name: str) -> bool: def _send_reboot_command(self, dpu_name: str, dpu_ip: str, port: str) -> bool: """Send gNOI Reboot HALT command to the DPU via direct gRPC.""" try: - with GnoiClient(f"{dpu_ip}:{port}", timeout=REBOOT_RPC_TIMEOUT_SEC) as client: - client.reboot( + with GnoiClient(f"{dpu_ip}:{port}") as client: + request = system_pb2.RebootRequest( method=system_pb2.HALT, message="Triggered by SmartSwitch graceful shutdown", ) + client.system.Reboot(request, timeout=REBOOT_RPC_TIMEOUT_SEC) return True except grpc.RpcError as e: logger.log_error(f"{dpu_name}: Reboot RPC failed: {e.code()} {e.details()}") @@ -257,10 +258,11 @@ def _poll_reboot_status(self, dpu_name: str, dpu_ip: str, port: str) -> bool: """Poll RebootStatus via direct gRPC until completion or timeout.""" 
deadline = time.monotonic() + _get_halt_timeout() try: - with GnoiClient(f"{dpu_ip}:{port}", timeout=STATUS_RPC_TIMEOUT_SEC) as client: + with GnoiClient(f"{dpu_ip}:{port}") as client: while time.monotonic() < deadline: try: - resp = client.reboot_status() + request = system_pb2.RebootStatusRequest() + resp = client.system.RebootStatus(request, timeout=STATUS_RPC_TIMEOUT_SEC) if not resp.active and resp.status.status == system_pb2.RebootStatus.Status.STATUS_SUCCESS: return True if not resp.active and resp.status.status != system_pb2.RebootStatus.Status.STATUS_SUCCESS: diff --git a/tests/gnoi_shutdown_daemon_test.py b/tests/gnoi_shutdown_daemon_test.py index 45fd1185..03a10b34 100644 --- a/tests/gnoi_shutdown_daemon_test.py +++ b/tests/gnoi_shutdown_daemon_test.py @@ -69,10 +69,13 @@ def _make_grpc_client_mock(reboot_status_resp=None, reboot_side_effect=None): client = MagicMock() client.__enter__ = MagicMock(return_value=client) client.__exit__ = MagicMock(return_value=False) + # All service RPCs go through client.system.* + system_stub = MagicMock() + client.system = system_stub if reboot_side_effect: - client.reboot.side_effect = reboot_side_effect + system_stub.Reboot.side_effect = reboot_side_effect if reboot_status_resp is not None: - client.reboot_status.return_value = reboot_status_resp + system_stub.RebootStatus.return_value = reboot_status_resp return client @@ -309,7 +312,7 @@ def test_send_reboot_command_success(self, mock_gnoi_client_class): handler = gnoi_shutdown_daemon.GnoiRebootHandler(MagicMock(), MagicMock(), MagicMock()) self.assertTrue(handler._send_reboot_command("DPU0", "10.0.0.1", "8080")) - mock_client.reboot.assert_called_once() + mock_client.system.Reboot.assert_called_once() @patch('gnoi_shutdown_daemon.GnoiClient') def test_send_reboot_command_grpc_error(self, mock_gnoi_client_class): @@ -389,7 +392,7 @@ def test_poll_reboot_status_rpc_error_recovery(self, mock_monotonic, mock_sleep, mock_resp.status.status = 
mock_status_enum.STATUS_SUCCESS mock_client = _make_grpc_client_mock() - mock_client.reboot_status.side_effect = [rpc_error, mock_resp] + mock_client.system.RebootStatus.side_effect = [rpc_error, mock_resp] mock_gnoi_client_class.return_value = mock_client handler = gnoi_shutdown_daemon.GnoiRebootHandler(MagicMock(), MagicMock(), MagicMock()) @@ -433,8 +436,8 @@ def test_handle_transition_success(self, mock_monotonic, mock_sleep, mock_gnoi_c result = handler._handle_transition("DPU0", "shutdown") self.assertTrue(result) - mock_reboot_client.reboot.assert_called_once() - mock_status_client.reboot_status.assert_called_once() + mock_reboot_client.system.Reboot.assert_called_once() + mock_status_client.system.RebootStatus.assert_called_once() mock_module.clear_module_gnoi_halt_in_progress.assert_called_once() @patch('gnoi_shutdown_daemon._get_halt_timeout', return_value=60)