Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions app/agents/tail.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ def _resolve_macos_target(pid: int) -> _ResolvedTarget:
return _ResolvedTarget(pid=pid, path=_check_regular_file(Path(fd_name), what="stdout"))


def _resolve_target(pid: int) -> _ResolvedTarget:
def resolve_target(pid: int) -> _ResolvedTarget:
if sys.platform == "win32":
raise AttachUnsupported("Windows is not supported")
# Guard non-positive ids before probing: ``psutil.pid_exists(0)`` can
Expand Down Expand Up @@ -427,7 +427,7 @@ def attach(
vanished, open failed). Caller is responsible for closing the
session — preferably via ``with attach(pid) as sess: …``.
"""
target = _resolve_target(pid)
target = resolve_target(pid)
return AttachSession(
target,
buffer_bytes=buffer_bytes,
Expand All @@ -446,4 +446,5 @@ def attach(
"AttachUnsupported",
"TailBuffer",
"attach",
"resolve_target",
]
101 changes: 101 additions & 0 deletions app/tools/LocalProcessIntrospectTool/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
"""Tool for introspecting a local process during incident response.

Returns a psutil snapshot, the last 50 stdout lines, and per-category error
signals derived from those lines for a given PID.
The investigation planner calls this to diagnose stuck or misbehaving
local agents from the OpenSRE interactive shell.
"""

from __future__ import annotations

import os
from datetime import UTC
from typing import Any

from app.agents.error_signals import ErrorSignals
from app.agents.probe import ProcessSnapshot, probe
Comment thread
greptile-apps[bot] marked this conversation as resolved.
from app.agents.tail import DEFAULT_MAX_BYTES, AttachUnsupported, resolve_target
from app.tools.tool_decorator import tool


def _snapshot_to_dict(snapshot: ProcessSnapshot) -> dict[str, Any]:
return {
"pid": snapshot.pid,
"cpu_percent": snapshot.cpu_percent,
"rss_mb": snapshot.rss_mb,
"num_fds": snapshot.num_fds,
"num_connections": snapshot.num_connections,
"status": snapshot.status,
"started_at": snapshot.started_at.astimezone(UTC).isoformat(),
}


def _read_stdout_tail(pid: int, max_lines: int = 50) -> str | None:
    """Read the last ``max_lines`` lines from the process's stdout.

    The stdout file is located via ``resolve_target`` (Linux: resolves
    ``/proc/<pid>/fd/1``; macOS: resolves fd 1 via ``lsof``). Only the final
    ``DEFAULT_MAX_BYTES`` bytes of that file are read, so very long files are
    never loaded in full.

    Args:
        pid: Process ID whose stdout should be read.
        max_lines: Maximum number of trailing lines to return. Values of zero
            or below yield an empty string.

    Returns:
        The trailing lines joined with ``"\\n"``, or ``None`` when the pid
        doesn't exist, stdout is a pipe/socket/tty, or we lack permission —
        the planner treats ``None`` as "unavailable".
    """
    try:
        target = resolve_target(pid)
    except (AttachUnsupported, OSError):
        return None
    try:
        with open(target.path, "rb") as f:
            offset = max(0, os.fstat(f.fileno()).st_size - DEFAULT_MAX_BYTES)
            if offset > 0:
                # Start one byte early: if that byte is a newline the window
                # begins exactly on a line boundary and nothing is lost below.
                f.seek(offset - 1)
            data = f.read()
    except OSError:
        # PermissionError and FileNotFoundError are OSError subclasses.
        return None
    if offset > 0:
        # The window usually starts mid-line (and possibly mid-character);
        # drop that leading fragment so only whole lines are reported. If the
        # fragment is all we have, keep it rather than return nothing.
        _, newline, remainder = data.partition(b"\n")
        data = remainder if newline and remainder else data[1:]
    if max_lines <= 0:
        # ``lines[-0:]`` would return every line, not none.
        return ""
    lines = data.decode("utf-8", errors="replace").splitlines()
    return "\n".join(lines[-max_lines:])

Comment thread
greptile-apps[bot] marked this conversation as resolved.

@tool(
    name="local_process_introspect",
    source="knowledge",
    description=(
        "Introspect a local process: return a psutil resource snapshot "
        "(CPU%, RSS MB, fd count, connection count, status, start time) "
        "and the last 50 lines of stdout. Use this when the planner needs "
        "to diagnose a stuck, high-cpu, or misbehaving local agent during "
        "incident response."
    ),
    input_schema={
        "type": "object",
        "properties": {
            "pid": {
                "type": "integer",
                "description": "Process ID to introspect.",
            },
        },
        "required": ["pid"],
    },
    use_cases=[
        "Diagnosing a stuck or high-cpu local agent during incident response",
        "Checking whether a process is alive and making forward progress",
        "Reading recent stdout output from a local process",
        "Verifying resource usage of a monitored agent",
    ],
    outputs={
        "snapshot": "psutil ProcessSnapshot dict, or null if the PID is inaccessible",
        "stdout_tail": "last 50 stdout lines as a string, or null if stdout cannot be read",
        "error_counts": "error/retry counts per category from recent stdout",
    },
    surfaces=("investigation",),
)
def local_process_introspect(pid: int) -> dict[str, Any]:
    """Collect a resource snapshot, stdout tail and error signals for ``pid``.

    Returns a dict with three keys:

    * ``snapshot`` — the probe result as a plain dict, or ``None`` when the
      probe returns a falsy value.
    * ``stdout_tail`` — whatever ``_read_stdout_tail`` returns (a string, or
      ``None`` when stdout cannot be read).
    * ``error_counts`` — the result of ``ErrorSignals.rate_per_minute()``
      after observing the tail; an empty dict when there is no tail text.
    """
    process_snapshot = probe(pid)
    tail_text = _read_stdout_tail(pid)

    if tail_text:
        # NOTE(review): the key is called ``error_counts`` but the values come
        # from ``rate_per_minute()`` — confirm which unit the planner expects.
        detector = ErrorSignals()
        detector.observe(tail_text)
        signal_values: dict[str, float] = detector.rate_per_minute()
    else:
        signal_values = {}

    snapshot_payload = None
    if process_snapshot:
        snapshot_payload = _snapshot_to_dict(process_snapshot)

    return {
        "snapshot": snapshot_payload,
        "stdout_tail": tail_text,
        "error_counts": signal_values,
    }
Comment thread
X1Vi marked this conversation as resolved.
37 changes: 37 additions & 0 deletions docs/agents.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,43 @@ is required so a stray keypress doesn't abort an in-flight response.
buggy or hostile agents can emit control sequences affecting the viewer.
Only trace processes you trust; there is no sandboxing step.

## `local_process_introspect` — point-in-time process health check

The `local_process_introspect` tool is used by the investigation planner to
diagnose local agent health during incident response. It returns a process
resource snapshot (CPU, memory, file descriptors, connections, status, start
time) and the last 50 lines of stdout for a given PID.

Internally it reuses the same machinery as `/agents trace`:
`resolve_target` for stdout resolution, `probe` for the psutil snapshot, and
`ErrorSignals` to classify error/retry patterns from recent output.

### When the planner calls it

- Diagnosing a stuck or high-CPU local agent during incident investigation
- Checking whether a process is alive and making forward progress
- Reading recent stdout output from a local process
- Verifying resource usage of a monitored agent

### Output fields

| Field | Type | Description |
|-------|------|-------------|
| `snapshot` | `dict` or `null` | psutil process snapshot: `pid`, `cpu_percent`, `rss_mb`, `num_fds`, `num_connections`, `status`, `started_at` |
| `stdout_tail` | `str` or `null` | Last 50 lines of stdout; `null` when unavailable (dead PID, pipe/socket/tty, or permission denied) |
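| `error_counts` | `dict[str, float]` | Per-category error/retry signal values (`traceback`, `rate_limit`, `http_5xx`, `tool_failure`) as reported by `ErrorSignals.rate_per_minute()` over the stdout tail; empty when `stdout_tail` is `null` or empty |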

### Example

```python
local_process_introspect(pid=1234)
# {
# "snapshot": {"pid": 1234, "cpu_percent": 4.2, "rss_mb": 256.0, ...},
# "stdout_tail": "line 50\nline 51\n...\nline 99",
# "error_counts": {"traceback": 2, "rate_limit": 1, "http_5xx": 0, "tool_failure": 0}
# }
```

## `/agents bus` — shared context channel

The bus is an opt-in, local-only pub/sub channel that carries findings between
Expand Down
16 changes: 8 additions & 8 deletions tests/agents/test_tail.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@
_parse_lsof_fd1,
_resolve_linux_target,
_resolve_macos_target,
_resolve_target,
_ResolvedTarget,
attach,
resolve_target,
)


Expand Down Expand Up @@ -341,20 +341,20 @@ class TestResolveTargetDispatch:
def test_windows_rejected(self, monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.setattr(tail_mod.sys, "platform", "win32")
with pytest.raises(AttachUnsupported, match="Windows"):
_resolve_target(1234)
resolve_target(1234)

def test_no_such_pid_rejected(self, monkeypatch: pytest.MonkeyPatch) -> None:
# platform is not "win32" so we hit the pid_exists check
monkeypatch.setattr(tail_mod.sys, "platform", "linux")
monkeypatch.setattr(tail_mod, "pid_exists", lambda _pid: False)
with pytest.raises(AttachUnsupported, match="no such pid"):
_resolve_target(99_999_999)
resolve_target(99_999_999)

def test_unknown_platform_rejected(self, monkeypatch: pytest.MonkeyPatch) -> None:
monkeypatch.setattr(tail_mod.sys, "platform", "freebsd13")
monkeypatch.setattr(tail_mod, "pid_exists", lambda _pid: True)
with pytest.raises(AttachUnsupported, match="freebsd13"):
_resolve_target(1234)
resolve_target(1234)

@pytest.mark.parametrize("invalid_pid", [0, -1, -99])
def test_non_positive_pid_rejected_before_pid_exists(
Expand All @@ -363,15 +363,15 @@ def test_non_positive_pid_rejected_before_pid_exists(
# Regression guard: ``psutil.pid_exists(0)`` can raise
# ``PermissionError`` on macOS. The slash handler only catches
# ``AttachUnsupported``, so an unguarded probe would crash the
# REPL. ``_resolve_target`` must reject non-positive ids before
# REPL. ``resolve_target`` must reject non-positive ids before
# touching ``pid_exists`` at all.
def _boom(_pid: int) -> bool:
raise PermissionError(1, "operation not permitted")

monkeypatch.setattr(tail_mod.sys, "platform", "darwin")
monkeypatch.setattr(tail_mod, "pid_exists", _boom)
with pytest.raises(AttachUnsupported, match="must be positive"):
_resolve_target(invalid_pid)
resolve_target(invalid_pid)


class TestAttachEagerValidation:
Expand All @@ -384,7 +384,7 @@ def test_attach_raises_immediately_on_unsupported(
def _fail(_pid: int) -> _ResolvedTarget:
raise AttachUnsupported("planned failure")

monkeypatch.setattr(tail_mod, "_resolve_target", _fail)
monkeypatch.setattr(tail_mod, "resolve_target", _fail)
with pytest.raises(AttachUnsupported, match="planned failure"):
attach(1234)

Expand Down Expand Up @@ -549,7 +549,7 @@ def test_attach_eagerly_opens_real_file(self, tmp_path: Path) -> None:
log.write_text("")
with patch.object(
tail_mod,
"_resolve_target",
"resolve_target",
return_value=_ResolvedTarget(pid=os.getpid(), path=log),
):
sess = attach(os.getpid(), poll_interval_s=0.01)
Expand Down
Loading