diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..a85752c --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,23 @@ +name: Lint + +on: + push: + branches: [ main, master ] + pull_request: + branches: [ main, master ] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Run pre-commit + uses: pre-commit/action@v3.0.1 + # This action will fail the build if any files would be modified by the hooks. + # It does NOT commit or push any changes back to the repo. diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..63d5541 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,21 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files + +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.14.10 + hooks: + - id: ruff + args: [ --fix ] + - id: ruff-format + +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v18.1.8 + hooks: + - id: clang-format + types_or: [c++, c] diff --git a/README.md b/README.md index 857fc19..2e55741 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # rostrace -An eBPF-based performance tool for pinpointing latency bottlenecks in +An eBPF-based performance tool for pinpointing latency bottlenecks in multi-process ROS systems. ## Quick Start @@ -18,8 +18,8 @@ rostrace trace --node /processing_delay_node --topic /slow_topic For detailed instructions on running native nodes on your host, advanced configuration, and troubleshooting, see the [**Usage Guide**](Usage.md). ## The Problem -You publish a topic with a desired frequency - let's say 20Hz. You know it has -to arrive every 50ms, but it keeps missing its schedule. +You publish a topic with a desired frequency - let's say 20Hz. You know it has +to arrive every 50ms, but it keeps missing its schedule. Now you want to know: why? Standard ROS tools (like `rostopic hz` or `rqt_graph`) treat the nodes as a black box. They can tell you a message arrived late, but they cannot tell you @@ -27,12 +27,12 @@ why. - Was your code inefficient? - Was the OS scheduler starving the thread? - Did a "silent" page fault pause the CPU? - - Was the message dropped inside the middleware queue before the callback + - Was the message dropped inside the middleware queue before the callback even started? ## The Solution -`rostrace` uses eBPF (via `bpftrace`) to peek inside the "black box" of both -ROS middleware and the Linux kernel simultaneously. It decomposes the +`rostrace` uses eBPF (via `bpftrace`) to peek inside the "black box" of both +ROS middleware and the Linux kernel simultaneously. It decomposes the lifecycle of a message into four distinct metrics: * **Network Latency** (Planned) @@ -44,83 +44,83 @@ lifecycle of a message into four distinct metrics: To debug a robot, you must see through the layers of the Operating System. -* **Multi-Process (Distributed) Debugging**: ROS splits a robot's brain into - many independent processes. Latency often hides in the communication - between these processes. `rostrace` tracks messages as they cross +* **Multi-Process (Distributed) Debugging**: ROS splits a robot's brain into + many independent processes. Latency often hides in the communication + between these processes. `rostrace` tracks messages as they cross process boundaries. -* **Zero-Instrumentation**: `rostrace` attaches to running processes - dynamically. You don't need to change a single line of your code or +* **Zero-Instrumentation**: `rostrace` attaches to running processes + dynamically. You don't need to change a single line of your code or recompile your nodes. -* **Eliminating Heisenbugs**: Standard tools run in *User-Space* and are - biased by the very latency they try to measure. `rostrace` runs inside - the *Linux Kernel*. It captures 100% of events with nanosecond precision, - seeing system-wide events—like the OS scheduler preempting your node—that +* **Eliminating Heisenbugs**: Standard tools run in *User-Space* and are + biased by the very latency they try to measure. `rostrace` runs inside + the *Linux Kernel*. It captures 100% of events with nanosecond precision, + seeing system-wide events—like the OS scheduler preempting your node—that are invisible to ROS. ### Tracing the Entire ROS Ecosystem -While ROS offers many communication abstractions (Topics, Services, Actions, TF), -they all ultimately rely on common middleware patterns: data serialization, +While ROS offers many communication abstractions (Topics, Services, Actions, TF), +they all ultimately rely on common middleware patterns: data serialization, internal queuing, and callback dispatch. -`rostrace` targets these fundamental building blocks inside `libroscpp.so`, -allowing it to provide latency insights across the entire ROS graph using a +`rostrace` targets these fundamental building blocks inside `libroscpp.so`, +allowing it to provide latency insights across the entire ROS graph using a unified analysis approach. ## Performance Bottleneck Analysis -`rostrace` breaks down the fault domain by measuring how long a message +`rostrace` breaks down the fault domain by measuring how long a message spends in each internal stage: -* **Network Latency (Planned)**: From kernel TCP stack entry until the +* **Network Latency (Planned)**: From kernel TCP stack entry until the middleware begins processing the raw buffer. -* **Serialization Latency**: Measured from the start to the end of the - message deserialization process. This identifies overhead from large or +* **Serialization Latency**: Measured from the start to the end of the + message deserialization process. This identifies overhead from large or complex message types. -* **Queuing Latency**: The time a message sits "ready and waiting" in the +* **Queuing Latency**: The time a message sits "ready and waiting" in the `ros::CallbackQueue` before your code is free to process it. * **Processing Latency**: The time your own callback code took to execute. ### How to Interpret the Results -* **High Serialization Latency**: Indicates the message format is a +* **High Serialization Latency**: Indicates the message format is a bottleneck. Consider optimizing message structures or using compression. -* **High Queuing Latency**: Indicates the node is overloaded or being +* **High Queuing Latency**: Indicates the node is overloaded or being starved of CPU time by the OS scheduler. -* **High Processing Latency**: Indicates an inefficiency inside your +* **High Processing Latency**: Indicates an inefficiency inside your callback function (e.g., heavy algorithms or blocking I/O). ## Technical Architecture -* **Event Tracing**: Unlike statistical profilers (like `parca`) that sample - the CPU at intervals, `rostrace` uses deterministic event tracing. It - captures 100% of occurrences, ensuring that even a single 100ms latency +* **Event Tracing**: Unlike statistical profilers (like `parca`) that sample + the CPU at intervals, `rostrace` uses deterministic event tracing. It + captures 100% of occurrences, ensuring that even a single 100ms latency spike is never missed. -* **Dynamic Symbol Resolution**: The tool automatically discovers mangled - C++ symbols in your specific version of `libroscpp.so`, ensuring robust +* **Dynamic Symbol Resolution**: The tool automatically discovers mangled + C++ symbols in your specific version of `libroscpp.so`, ensuring robust operation across different ROS distributions. -* **Container Compatibility**: Includes specific logic to resolve - namespaced PIDs, allowing you to trace production containers from the +* **Container Compatibility**: Includes specific logic to resolve + namespaced PIDs, allowing you to trace production containers from the host or within a privileged sidecar. ## Future Work ### Extended Metrics -* **Network Latency (Kernel-to-User)**: Tracing from `tcp_recvmsg` to +* **Network Latency (Kernel-to-User)**: Tracing from `tcp_recvmsg` to middleware entry using `kprobes`. -* **Action-Specific Latency**: Direct tracing of the `actionlib` API to +* **Action-Specific Latency**: Direct tracing of the `actionlib` API to measure **Goal-to-Accept** and **Feedback** delay. -* **TF Lookup Latency**: Measuring time spent inside blocking +* **TF Lookup Latency**: Measuring time spent inside blocking `lookupTransform()` calls. ### Root Cause Analysis (System Events) -* **Scheduler Starvation**: Correlating latency with `sched_switch` events +* **Scheduler Starvation**: Correlating latency with `sched_switch` events to detect when a node is blocked by other processes. -* **Context Switch Storms**: Detecting thrashing due to high-frequency +* **Context Switch Storms**: Detecting thrashing due to high-frequency preemption or priority inversion. ### Reliability & Failure Detection -* **Black-Box Drop Detection**: Identifying packet loss by tracking +* **Black-Box Drop Detection**: Identifying packet loss by tracking divergence between arrival rates and execution rates. -* **Real-Time Safety Auditing**: Flagging dynamic memory allocation - (`malloc`) or blocking I/O inside critical control loops. \ No newline at end of file +* **Real-Time Safety Auditing**: Flagging dynamic memory allocation + (`malloc`) or blocking I/O inside critical control loops. diff --git a/Usage.md b/Usage.md index 8bda7ff..60e9334 100644 --- a/Usage.md +++ b/Usage.md @@ -4,7 +4,7 @@ This guide explains how to use `rostrace` to identify and debug latency in your ## 1. Quick Start with Demos (Docker) -We provide pre-packaged demos using Docker Compose. These scenarios simulate common +We provide pre-packaged demos using Docker Compose. These scenarios simulate common latency issues without requiring any installation on your host machine. ### Prerequisites @@ -13,8 +13,8 @@ latency issues without requiring any installation on your host machine. ### 1.1 The Streamlined Way (Recommended) -To make tracing as seamless as possible, we provide a wrapper script -`./rostrace-docker` in the project root. This script automatically handles all +To make tracing as seamless as possible, we provide a wrapper script +`./rostrace-docker` in the project root. This script automatically handles all the complex Docker flags required for eBPF tracing. ```bash @@ -37,7 +37,7 @@ In this scenario, a node has a callback that takes ~200ms to complete. You will see an output like below: ``` 1. Serialization Latency (us): -@serialization_latency_us: +@serialization_latency_us: [2, 4) 1 |@ | [4, 8) 50 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [8, 16) 5 |@@@@@ | @@ -46,22 +46,22 @@ You will see an output like below: 2. Queuing Latency (us): -@queuing_latency_us: +@queuing_latency_us: [128K, 256K) 57 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| 3. Processing Latency (us): -@processing_latency_us: +@processing_latency_us: [128K, 256K) 56 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| ``` -You can see that due to **High Processing Latency** (~200ms), the node cannot -keep up with incoming messages. This causes them to back up, leading to a direct +You can see that due to **High Processing Latency** (~200ms), the node cannot +keep up with incoming messages. This causes them to back up, leading to a direct increase in **Queuing Latency** (also ~200ms). The serialization overhead remains negligible. ### Scenario B: Queuing Latency (Service Blocking) -In this scenario, a node is busy handling a heavy service call, causing incoming +In this scenario, a node is busy handling a heavy service call, causing incoming topic messages to wait in the queue. 1. **Start the environment**: @@ -76,7 +76,7 @@ topic messages to wait in the queue. You will see an output like below: ``` 1. Serialization Latency (us): -@serialization_latency_us: +@serialization_latency_us: [4, 8) 128 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [8, 16) 19 |@@@@@@@ | [16, 32) 33 |@@@@@@@@@@@@@ | @@ -85,7 +85,7 @@ You will see an output like below: 2. Queuing Latency (us): -@queuing_latency_us: +@queuing_latency_us: [8, 16) 9 |@@@@@@ | [16, 32) 31 |@@@@@@@@@@@@@@@@@@@@@@@@ | [32, 64) 67 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| @@ -106,7 +106,7 @@ You will see an output like below: 3. Processing Latency (us): -@processing_latency_us: +@processing_latency_us: [8, 16) 28 |@@@@@@@@@@@@@@ | [16, 32) 29 |@@@@@@@@@@@@@@@ | [32, 64) 99 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| @@ -126,12 +126,12 @@ You will see an output like below: ``` -This scenario reveals a **bimodal distribution** in Processing Latency. The -cluster in the microseconds range represents your fast topic callbacks. The -cluster at ~500ms represents the `heavy_service` handler blocking the main thread. +This scenario reveals a **bimodal distribution** in Processing Latency. The +cluster in the microseconds range represents your fast topic callbacks. The +cluster at ~500ms represents the `heavy_service` handler blocking the main thread. -Note how the **Queuing Latency** shows a wide spread: messages that arrive while -the service is running are forced to wait, while messages that arrive when the +Note how the **Queuing Latency** shows a wide spread: messages that arrive while +the service is running are forced to wait, while messages that arrive when the thread is free are processed instantly. This is a classic "Thread Starvation" pattern. @@ -161,7 +161,7 @@ rosrun rostrace rostrace trace --node /my_node_name --topic /my_topic_name ## 3. Advanced Configuration ### Explicit Library Path (`--lib`) -If `rostrace` cannot automatically find `libroscpp.so` (common in custom builds +If `rostrace` cannot automatically find `libroscpp.so` (common in custom builds or statically linked environments), you can provide the path explicitly: ```bash @@ -175,7 +175,7 @@ If you prefer to run the Python logic in a virtual environment: 3. Note: You still need `bpftrace` installed on your system. ### Troubleshooting -- **Missing DebugFS**: If you see "Kernel debug filesystem not found", ensure +- **Missing DebugFS**: If you see "Kernel debug filesystem not found", ensure `/sys/kernel/debug` is mounted. On host: `sudo mount -t debugfs none /sys/kernel/debug`. -- **Permissions**: `rostrace` requires root privileges to attach eBPF probes. +- **Permissions**: `rostrace` requires root privileges to attach eBPF probes. It will attempt to use `sudo` automatically if not run as root. diff --git a/demo/docker-compose.processing.yml b/demo/docker-compose.processing.yml index 2fd1035..5be7f6a 100644 --- a/demo/docker-compose.processing.yml +++ b/demo/docker-compose.processing.yml @@ -4,7 +4,7 @@ services: container_name: roscore_proc pid: host network_mode: host - command: + command: - "bash" - "-c" - "source /catkin_ws/devel/setup.bash && roscore" @@ -27,7 +27,7 @@ services: - /sys/kernel/debug:/sys/kernel/debug:rw - /lib/modules:/lib/modules:ro - /usr/src:/usr/src:ro - command: + command: - "bash" - "-c" - | @@ -45,7 +45,7 @@ services: - ROS_MASTER_URI=http://localhost:11311 depends_on: - scenario - command: + command: - "bash" - "-c" - | @@ -67,7 +67,7 @@ services: - /usr/src:/usr/src:ro stdin_open: true tty: true - command: + command: - "bash" - "-c" - "source /catkin_ws/devel/setup.bash && echo 'Run: rostrace trace --node /processing_delay_node --topic /slow_topic' && sleep infinity" diff --git a/demo/docker-compose.queuing.yml b/demo/docker-compose.queuing.yml index 54ae7ab..56b5626 100644 --- a/demo/docker-compose.queuing.yml +++ b/demo/docker-compose.queuing.yml @@ -4,7 +4,7 @@ services: container_name: roscore_queue pid: host network_mode: host - command: + command: - "bash" - "-c" - "source /catkin_ws/devel/setup.bash && roscore" @@ -27,7 +27,7 @@ services: - /sys/kernel/debug:/sys/kernel/debug:rw - /lib/modules:/lib/modules:ro - /usr/src:/usr/src:ro - command: + command: - "bash" - "-c" - | @@ -45,7 +45,7 @@ services: - ROS_MASTER_URI=http://localhost:11311 depends_on: - scenario - command: + command: - "bash" - "-c" - | @@ -67,7 +67,7 @@ services: - /usr/src:/usr/src:ro stdin_open: true tty: true - command: + command: - "bash" - "-c" - "source /catkin_ws/devel/setup.bash && echo 'Run: rostrace trace --node /service_blocker_node --topic /blocked_topic' && sleep infinity" diff --git a/example/queuing_latency/src/listener.cpp b/example/queuing_latency/src/listener.cpp index 75ac70a..84d90a5 100644 --- a/example/queuing_latency/src/listener.cpp +++ b/example/queuing_latency/src/listener.cpp @@ -1,14 +1,12 @@ #include "ros/ros.h" #include "std_msgs/String.h" -void topicCallback(const std_msgs::String::ConstPtr& msg) -{ +void topicCallback(const std_msgs::String::ConstPtr &msg) { // Let's imagine this callback does some work that takes a bit of time. ros::Duration(0.01).sleep(); // Simulate 10ms of work } -int main(int argc, char **argv) -{ +int main(int argc, char **argv) { ros::init(argc, argv, "listener"); ros::NodeHandle nh; ros::Subscriber sub = nh.subscribe("my_topic", 20, topicCallback); diff --git a/example/queuing_latency/src/talker.cpp b/example/queuing_latency/src/talker.cpp index 4ea3635..26dd158 100644 --- a/example/queuing_latency/src/talker.cpp +++ b/example/queuing_latency/src/talker.cpp @@ -2,15 +2,13 @@ #include "std_msgs/String.h" #include -int main(int argc, char **argv) -{ +int main(int argc, char **argv) { ros::init(argc, argv, "talker"); ros::NodeHandle nh; ros::Publisher pub = nh.advertise("my_topic", 20); ros::Rate loop_rate(20); // 20 Hz - while (ros::ok()) - { + while (ros::ok()) { std_msgs::String msg; std::stringstream ss; ss << "hello world " << ros::Time::now(); diff --git a/example/scenarios/scripts/load_driver.py b/example/scenarios/scripts/load_driver.py index 5e2c706..d3dbc67 100755 --- a/example/scenarios/scripts/load_driver.py +++ b/example/scenarios/scripts/load_driver.py @@ -1,14 +1,15 @@ #!/usr/bin/env python3 +import threading + import rospy from std_msgs.msg import String from std_srvs.srv import Empty -import threading -import time + def call_service_loop(): - rospy.wait_for_service('heavy_service') - heavy_service = rospy.ServiceProxy('heavy_service', Empty) - rate = rospy.Rate(1) # Call service every 1 second + rospy.wait_for_service("heavy_service") + heavy_service = rospy.ServiceProxy("heavy_service", Empty) + rate = rospy.Rate(1) # Call service every 1 second while not rospy.is_shutdown(): try: rospy.loginfo("Calling /heavy_service...") @@ -17,31 +18,33 @@ def call_service_loop(): rospy.logwarn(f"Service call failed: {e}") rate.sleep() + def main(): - rospy.init_node('load_driver') + rospy.init_node("load_driver") # Publishes to the slow processing node - pub_process = rospy.Publisher('slow_topic', String, queue_size=10) - + pub_process = rospy.Publisher("slow_topic", String, queue_size=10) + # Publishes to the service blocker node - pub_block = rospy.Publisher('blocked_topic', String, queue_size=10) + pub_block = rospy.Publisher("blocked_topic", String, queue_size=10) # Start a background thread to call the service continuously t = threading.Thread(target=call_service_loop) t.daemon = True t.start() - rate = rospy.Rate(10) # 10Hz + rate = rospy.Rate(10) # 10Hz while not rospy.is_shutdown(): msg = "data" - + # 1. Trigger Processing Latency pub_process.publish(msg) - + # 2. Trigger Queuing Latency (will be blocked by service calls) pub_block.publish(msg) - + rate.sleep() -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/example/scenarios/src/processing_delay_node.cpp b/example/scenarios/src/processing_delay_node.cpp index e94439a..1ffe738 100644 --- a/example/scenarios/src/processing_delay_node.cpp +++ b/example/scenarios/src/processing_delay_node.cpp @@ -1,24 +1,22 @@ +#include #include #include #include -#include // Scenario: High Processing Latency // Cause: The callback takes too long to execute. -void chatterCallback(const std_msgs::String::ConstPtr& msg) -{ +void chatterCallback(const std_msgs::String::ConstPtr &msg) { ROS_INFO("Processing message: [%s]", msg->data.c_str()); - + // Simulate heavy computation or blocking I/O // This will show up as "Processing Latency" in rostrace - std::this_thread::sleep_for(std::chrono::milliseconds(200)); - + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + ROS_INFO("Finished processing."); } -int main(int argc, char **argv) -{ +int main(int argc, char **argv) { ros::init(argc, argv, "processing_delay_node"); ros::NodeHandle n; diff --git a/example/scenarios/src/service_blocker_node.cpp b/example/scenarios/src/service_blocker_node.cpp index 3ed5d90..70c9027 100644 --- a/example/scenarios/src/service_blocker_node.cpp +++ b/example/scenarios/src/service_blocker_node.cpp @@ -1,41 +1,41 @@ +#include #include #include #include #include -#include // Scenario: High Queuing Latency // Cause: The main thread is busy handling a service call, so the topic callback // cannot start even though the message has arrived and is in the queue. -bool heavyService(std_srvs::Empty::Request &req, - std_srvs::Empty::Response &res) -{ +bool heavyService(std_srvs::Empty::Request &req, + std_srvs::Empty::Response &res) { ROS_INFO("Service called! Blocking thread for 500ms..."); - // Simulate a long-running service task (e.g., complex calculation, hardware sync) + // Simulate a long-running service task (e.g., complex calculation, hardware + // sync) std::this_thread::sleep_for(std::chrono::milliseconds(500)); ROS_INFO("Service finished."); return true; } -void topicCallback(const std_msgs::String::ConstPtr& msg) -{ +void topicCallback(const std_msgs::String::ConstPtr &msg) { // This callback is fast, but it will be delayed if the service is running. ROS_INFO("Topic received: [%s]", msg->data.c_str()); } -int main(int argc, char **argv) -{ +int main(int argc, char **argv) { ros::init(argc, argv, "service_blocker_node"); ros::NodeHandle n; // Single-threaded spinner (default) means we can only do one thing at a time. - - ros::ServiceServer service = n.advertiseService("heavy_service", heavyService); + + ros::ServiceServer service = + n.advertiseService("heavy_service", heavyService); ros::Subscriber sub = n.subscribe("blocked_topic", 1000, topicCallback); - ROS_INFO("Ready to demonstrate Queuing Latency. Call /heavy_service to block the thread."); - + ROS_INFO("Ready to demonstrate Queuing Latency. Call /heavy_service to block " + "the thread."); + ros::spin(); return 0; diff --git a/package.xml b/package.xml index 426be41..896f6d9 100644 --- a/package.xml +++ b/package.xml @@ -3,7 +3,7 @@ rostrace 0.1.0 - A zero-instrumentation, kernel-level observability suite for debugging + A zero-instrumentation, kernel-level observability suite for debugging non-deterministic latency in distributed robotic systems using eBPF. @@ -14,7 +14,7 @@ python3 bpftrace - diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b330b9d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,11 @@ +[tool.ruff] +line-length = 88 +target-version = "py38" + +[tool.ruff.lint] +select = ["E", "F", "I", "B"] +ignore = [] + +[tool.ruff.format] +quote-style = "double" +indent-style = "space" diff --git a/scripts/rostrace b/scripts/rostrace index 6c10853..5a670b2 100755 --- a/scripts/rostrace +++ b/scripts/rostrace @@ -2,4 +2,4 @@ from rostrace.main import main if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/setup.py b/setup.py index 8581f95..c442564 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,12 @@ from distutils.core import setup + from catkin_pkg.python_setup import generate_distutils_setup d = generate_distutils_setup( - packages=['rostrace'], - package_dir={'': 'src'}, - package_data={'rostrace': ['bpf/*.bt']}, - entry_points={'console_scripts': ['rostrace=rostrace.main:main']}) + packages=["rostrace"], + package_dir={"": "src"}, + package_data={"rostrace": ["bpf/*.bt"]}, + entry_points={"console_scripts": ["rostrace=rostrace.main:main"]}, +) setup(**d) diff --git a/src/rostrace/__init__.py b/src/rostrace/__init__.py index 7c10232..e4062f6 100644 --- a/src/rostrace/__init__.py +++ b/src/rostrace/__init__.py @@ -1 +1,3 @@ from .runner import Runner + +__all__ = ["Runner"] diff --git a/src/rostrace/bpf/topics.bt b/src/rostrace/bpf/topics.bt index f65c3ac..f037aa7 100644 --- a/src/rostrace/bpf/topics.bt +++ b/src/rostrace/bpf/topics.bt @@ -55,7 +55,7 @@ uprobe:{{LIBROSCPP_PATH}}:{{SYMBOL_CALL_ONE_CB}} { if (pid == (uint32)$1) { $now = nsecs; - + // Calculate Queuing Latency (matching by global FIFO index) $q_start = @q_timestamps[@pop_count]; if ($q_start) { diff --git a/src/rostrace/main.py b/src/rostrace/main.py index 1f9ed97..aa87f8e 100644 --- a/src/rostrace/main.py +++ b/src/rostrace/main.py @@ -1,7 +1,8 @@ -import sys import argparse import logging +import sys from typing import List, Optional + from rostrace.runner import Runner @@ -10,36 +11,34 @@ def main(argv: Optional[List[str]] = None) -> None: argv = sys.argv[1:] parser = argparse.ArgumentParser( - description="rostrace: eBPF-based ROS observability") - parser.add_argument("--verbose", - "-v", - action="store_true", - help="Enable verbose debug logging") + description="rostrace: eBPF-based ROS observability" + ) + parser.add_argument( + "--verbose", "-v", action="store_true", help="Enable verbose debug logging" + ) subparsers = parser.add_subparsers(dest="command", help="Command to run") # Command: trace - trace_parser = subparsers.add_parser("trace", - help="Trace ROS message latency") - trace_parser.add_argument("--pid", - type=int, - help="PID of the ROS node to trace") + trace_parser = subparsers.add_parser("trace", help="Trace ROS message latency") + trace_parser.add_argument("--pid", type=int, help="PID of the ROS node to trace") trace_parser.add_argument( "--node", type=str, - help="Name of the ROS node to trace (resolves PID automatically)") + help="Name of the ROS node to trace (resolves PID automatically)", + ) trace_parser.add_argument("--topic", type=str, help="Filter by topic name") trace_parser.add_argument( "--lib", type=str, - help="Manually specify path to libroscpp.so (overrides auto-detection)" + help="Manually specify path to libroscpp.so (overrides auto-detection)", ) args = parser.parse_args(argv) # Configure logging log_level = logging.DEBUG if args.verbose else logging.INFO - logging.basicConfig(level=log_level, format='%(levelname)s: %(message)s') + logging.basicConfig(level=log_level, format="%(levelname)s: %(message)s") if args.command == "trace": runner = Runner(args) diff --git a/src/rostrace/runner.py b/src/rostrace/runner.py index e289159..9d1b47d 100644 --- a/src/rostrace/runner.py +++ b/src/rostrace/runner.py @@ -1,25 +1,25 @@ +import logging import os import subprocess -import logging import tempfile -from typing import Optional, List, Any +from typing import Any, Optional # Set up logging logger = logging.getLogger("rostrace") class Runner: - def __init__(self, args: Any): self.args = args # Use relative path to find BPF scripts, working for both source and install - self.bpf_dir = os.path.join(os.path.dirname(__file__), 'bpf') + self.bpf_dir = os.path.join(os.path.dirname(__file__), "bpf") @staticmethod def check_root_privileges() -> bool: if os.geteuid() != 0: logger.warning( - "Not running as root. rostrace will attempt to use 'sudo' for kernel access." + "Not running as root. rostrace will attempt to use 'sudo' " + "for kernel access." ) return True @@ -27,10 +27,10 @@ def check_root_privileges() -> bool: def check_debug_fs_mount() -> bool: debugfs_path = "/sys/kernel/debug/tracing" if not os.path.exists(debugfs_path): - logger.error("Kernel debug filesystem not found at %s", - debugfs_path) + logger.error("Kernel debug filesystem not found at %s", debugfs_path) logger.info( - " If running in Docker, ensure you added: -v /sys/kernel/debug:/sys/kernel/debug:rw" + " If running in Docker, ensure you added: " + "-v /sys/kernel/debug:/sys/kernel/debug:rw" ) logger.info(" and use --privileged.") return False @@ -49,29 +49,28 @@ def resolve_pid(self, node_name: str) -> Optional[int]: try: logger.debug("Resolving PID for node: %s", node_name) output = subprocess.check_output( - ["rosnode", "info", node_name], - stderr=subprocess.STDOUT).decode() + ["rosnode", "info", node_name], stderr=subprocess.STDOUT + ).decode() - for line in output.split('\n'): + for line in output.split("\n"): if line.strip().startswith("Pid:"): - return int(line.split(':')[1].strip()) + return int(line.split(":")[1].strip()) except subprocess.CalledProcessError as e: logger.error( - "Failed to resolve node PID: 'rosnode info' failed for %s", - node_name) + "Failed to resolve node PID: 'rosnode info' failed for %s", node_name + ) logger.debug("rosnode output: %s", e.output.decode().strip()) return None except (ValueError, IndexError) as e: logger.error( - "Malformed output from 'rosnode info' while resolving PID: %s", - e) + "Malformed output from 'rosnode info' while resolving PID: %s", e + ) return None except Exception as e: logger.error("Unexpected error resolving node PID: %s", e) return None - logger.error("Node '%s' found but no PID reported by ROS Master.", - node_name) + logger.error("Node '%s' found but no PID reported by ROS Master.", node_name) return None def get_host_pid(self, container_pid: int) -> int: @@ -81,7 +80,7 @@ def get_host_pid(self, container_pid: int) -> int: """ status_path = f"/proc/{container_pid}/status" try: - with open(status_path, 'r') as f: + with open(status_path, "r") as f: for line in f: if line.startswith("NSpid:"): # Format: NSpid: 256123 170 @@ -90,8 +89,8 @@ def get_host_pid(self, container_pid: int) -> int: return int(parts[1]) except FileNotFoundError: logger.error( - "Process status not found: %s. Is the node still running?", - status_path) + "Process status not found: %s. Is the node still running?", status_path + ) except Exception as e: logger.warning("Could not read NSpid from %s: %s", status_path, e) @@ -104,7 +103,7 @@ def get_libroscpp_path(self, pid: int) -> Optional[str]: """ maps_path = f"/proc/{pid}/maps" try: - with open(maps_path, 'r') as f: + with open(maps_path, "r") as f: for line in f: if "libroscpp.so" in line: parts = line.split() @@ -118,8 +117,8 @@ def get_libroscpp_path(self, pid: int) -> Optional[str]: return path except PermissionError: logger.error( - "Permission denied reading %s. Try running with root/sudo.", - maps_path) + "Permission denied reading %s. Try running with root/sudo.", maps_path + ) except Exception as e: logger.error("Error scanning process maps at %s: %s", maps_path, e) @@ -136,10 +135,10 @@ def resolve_symbol(self, lib_path: str, pattern: str) -> Optional[str]: cmd = ["sudo"] + cmd try: - output = subprocess.check_output(cmd, - stderr=subprocess.PIPE).decode() + output = subprocess.check_output(cmd, stderr=subprocess.PIPE).decode() candidates = [ - line.strip().split(':')[-1] for line in output.split('\n') + line.strip().split(":")[-1] + for line in output.split("\n") if line.strip() ] except subprocess.CalledProcessError: @@ -148,7 +147,7 @@ def resolve_symbol(self, lib_path: str, pattern: str) -> Optional[str]: if not candidates: return None - # Heuristic 1: Prefer symbols ending in 'Ev' (void parameter, standard entry point) + # Heuristic 1: Prefer symbols ending in 'Ev' (standard void entry point) # KNOWN LIMITATION: This assumes Itanium C++ ABI. for c in candidates: if c.endswith("Ev"): @@ -174,8 +173,7 @@ def run(self) -> None: # Resolve the Host PID for BPF tracing host_pid = self.get_host_pid(target_pid) - logger.info("Tracing Node PID: %d (Host PID: %d)", target_pid, - host_pid) + logger.info("Tracing Node PID: %d (Host PID: %d)", target_pid, host_pid) # 1. Find libroscpp.so if self.args.lib: @@ -183,28 +181,28 @@ def run(self) -> None: if not os.path.isfile(lib_path): logger.error( "User-specified library path does not exist or is not a file: %s", - lib_path) + lib_path, + ) return logger.info("Using user-specified libroscpp.so at: %s", lib_path) else: lib_path = self.get_libroscpp_path(target_pid) if not lib_path: logger.warning( - "libroscpp.so not found in maps. Falling back to default.") + "libroscpp.so not found in maps. Falling back to default." + ) lib_path = "/opt/ros/noetic/lib/libroscpp.so" else: logger.info("Auto-detected libroscpp.so at: %s", lib_path) # 2. Resolve Symbols Dynamically logger.info("Resolving symbols in %s...", lib_path) - sym_handle = self.resolve_symbol(lib_path, - "*Subscription*handleMessage*") + sym_handle = self.resolve_symbol(lib_path, "*Subscription*handleMessage*") sym_add = self.resolve_symbol(lib_path, "*CallbackQueue*addCallback*") sym_call = self.resolve_symbol(lib_path, "*CallbackQueue*callOneCB*") if not (sym_handle and sym_add and sym_call): - logger.error( - "Could not resolve one or more symbols in libroscpp.so") + logger.error("Could not resolve one or more symbols in libroscpp.so") logger.info(" handleMessage: %s", sym_handle or "NOT FOUND") logger.info(" addCallback: %s", sym_add or "NOT FOUND") logger.info(" callOneCB: %s", sym_call or "NOT FOUND") @@ -222,21 +220,19 @@ def run(self) -> None: return try: - with open(script_path, 'r') as f: + with open(script_path, "r") as f: script_content = f.read() - script_content = script_content.replace("{{LIBROSCPP_PATH}}", - lib_path) + script_content = script_content.replace("{{LIBROSCPP_PATH}}", lib_path) script_content = script_content.replace( - "{{SYMBOL_HANDLE_MESSAGE}}", sym_handle) - script_content = script_content.replace("{{SYMBOL_ADD_CALLBACK}}", - sym_add) - script_content = script_content.replace("{{SYMBOL_CALL_ONE_CB}}", - sym_call) - - with tempfile.NamedTemporaryFile(mode='w', - suffix='.bt', - delete=False) as temp_script: + "{{SYMBOL_HANDLE_MESSAGE}}", sym_handle + ) + script_content = script_content.replace("{{SYMBOL_ADD_CALLBACK}}", sym_add) + script_content = script_content.replace("{{SYMBOL_CALL_ONE_CB}}", sym_call) + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".bt", delete=False + ) as temp_script: temp_script.write(script_content) temp_script_path = temp_script.name except Exception as e: @@ -248,7 +244,7 @@ def run(self) -> None: cmd = prefix + ["bpftrace", temp_script_path, str(host_pid)] logger.info("Starting bpftrace... (Ctrl-C to stop and show results)") - logger.debug("Command: %s", ' '.join(cmd)) + logger.debug("Command: %s", " ".join(cmd)) try: subprocess.call(cmd) diff --git a/tests/test_runner.py b/tests/test_runner.py index bb42b63..b3a332a 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -1,13 +1,11 @@ -import unittest -from unittest.mock import patch, MagicMock, mock_open -import sys -import os -import subprocess import logging +import os +import sys +import unittest +from unittest.mock import MagicMock, mock_open, patch # Add src to sys.path so we can import rostrace -sys.path.insert( - 0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../src'))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../src"))) from rostrace.runner import Runner @@ -17,7 +15,6 @@ class TestRunner(unittest.TestCase): - def setUp(self): # Mock args self.args = MagicMock() @@ -26,7 +23,7 @@ def setUp(self): self.args.lib = None self.runner = Runner(self.args) - @patch('os.geteuid') + @patch("os.geteuid") def test_check_root_privileges(self, mock_geteuid): """Test root privilege check.""" mock_geteuid.return_value = 0 @@ -35,7 +32,7 @@ def test_check_root_privileges(self, mock_geteuid): mock_geteuid.return_value = 1000 self.assertTrue(Runner.check_root_privileges()) - @patch('os.path.exists') + @patch("os.path.exists") def test_check_debug_fs_mount(self, mock_exists): """Test DebugFS mount check.""" # Case 1: Exists @@ -49,9 +46,11 @@ def test_check_debug_fs_mount(self, mock_exists): def test_check_preflight(self): """Test that check_preflight calls the individual checks.""" # Mock the static methods on the class - with patch.object(Runner, 'check_root_privileges', return_value=True) as mock_root, \ - patch.object(Runner, 'check_debug_fs_mount', return_value=True) as mock_debug: - + with patch.object( + Runner, "check_root_privileges", return_value=True + ) as mock_root, patch.object( + Runner, "check_debug_fs_mount", return_value=True + ) as mock_debug: self.assertTrue(self.runner.check_preflight()) # Verify calls @@ -59,32 +58,27 @@ def test_check_preflight(self): mock_debug.assert_called_once() # Test failure case - with patch.object(Runner, 'check_root_privileges', return_value=True), \ - patch.object(Runner, 'check_debug_fs_mount', return_value=False): - + with patch.object( + Runner, "check_root_privileges", return_value=True + ), patch.object(Runner, "check_debug_fs_mount", return_value=False): self.assertFalse(self.runner.check_preflight()) - @patch('builtins.open', - new_callable=mock_open, - read_data="NSpid:\t256123\t170\n") + @patch("builtins.open", new_callable=mock_open, read_data="NSpid:\t256123\t170\n") def test_get_host_pid(self, mock_file): """Test parsing NSpid from status file.""" host_pid = self.runner.get_host_pid(170) self.assertEqual(host_pid, 256123) - @patch('subprocess.check_output') + @patch("subprocess.check_output") def test_resolve_pid_success(self, mock_check_output): """Test that resolve_pid correctly extracts PID from rosnode info output.""" - mock_output = b""" -Node [/talker] -Pid: 12345 -""" + mock_output = b"\nNode [/talker]\nPid: 12345\n" mock_check_output.return_value = mock_output - pid = self.runner.resolve_pid('/talker') + pid = self.runner.resolve_pid("/talker") self.assertEqual(pid, 12345) - @patch('subprocess.check_output') - @patch('os.geteuid') + @patch("subprocess.check_output") + @patch("os.geteuid") def test_resolve_symbol(self, mock_geteuid, mock_check_output): """Test dynamic symbol resolution logic.""" mock_geteuid.return_value = 0 @@ -106,46 +100,55 @@ def test_resolve_symbol(self, mock_geteuid, mock_check_output): sym = self.runner.resolve_symbol("/lib/lib.so", "bar") self.assertEqual(sym, "_Z3barShort") - @patch('subprocess.check_output') - @patch('os.geteuid') - @patch('os.remove') - @patch('tempfile.NamedTemporaryFile') - @patch('subprocess.call') - @patch('builtins.open', new_callable=mock_open) - @patch('os.path.exists') - def test_run_logic_flow(self, mock_exists, mock_file, mock_call, mock_temp, - mock_remove, mock_geteuid, mock_check_output): - """Test the full run orchestration including pre-flight, host pid resolution, symbol resolution, and execution.""" + @patch("subprocess.check_output") + @patch("os.geteuid") + @patch("os.remove") + @patch("tempfile.NamedTemporaryFile") + @patch("subprocess.call") + @patch("builtins.open", new_callable=mock_open) + @patch("os.path.exists") + def test_run_logic_flow( + self, + mock_exists, + mock_file, + mock_call, + mock_temp, + mock_remove, + mock_geteuid, + mock_check_output, + ): + """Test full run orchestration.""" self.args.pid = 170 - # Mock not being root + # Mock not being root (so we expect sudo) mock_geteuid.return_value = 1000 # Mock pre-flight and file checks mock_exists.return_value = True # Mock file reading - def open_side_effect(filename, mode='r'): - if filename.endswith('status'): + def open_side_effect(filename, mode="r"): + if filename.endswith("status"): return mock_open( read_data="Name:\tprocess\nNSpid:\t256123\t170\n" ).return_value - elif filename.endswith('maps'): + elif filename.endswith("maps"): return mock_open( - read_data= - "7f8e... r-xp ... /opt/ros/noetic/lib/libroscpp.so\n" + read_data="7f8e... r-xp ... /opt/ros/noetic/lib/libroscpp.so\n" ).return_value - elif filename.endswith('topics.bt'): + elif filename.endswith("topics.bt"): return mock_open( - read_data= - "uprobe:{{LIBROSCPP_PATH}}:{{SYMBOL_CALL_ONE_CB}}:{{SYMBOL_HANDLE_MESSAGE}}:{{SYMBOL_ADD_CALLBACK}}" + read_data="uprobe:{{LIBROSCPP_PATH}}:{{SYMBOL_CALL_ONE_CB}}:" + "{{SYMBOL_HANDLE_MESSAGE}}:{{SYMBOL_ADD_CALLBACK}}" ).return_value return mock_open().return_value mock_file.side_effect = open_side_effect # Mock symbol resolution (check_output) - mock_check_output.return_value = b"uprobe:/lib/lib.so:_ZN3ros13CallbackQueue9callOneCBEv" + mock_check_output.return_value = ( + b"uprobe:/lib/lib.so:_ZN3ros13CallbackQueue9callOneCBEv" + ) # Mock temp file mock_temp_obj = MagicMock() @@ -156,14 +159,14 @@ def open_side_effect(filename, mode='r'): self.runner.run() # Verify host PID was resolved - mock_call.assert_called_with( - ["sudo", "bpftrace", "/tmp/test.bt", "256123"]) + mock_call.assert_called_with(["sudo", "bpftrace", "/tmp/test.bt", "256123"]) # Verify symbols were replaced handle = mock_temp_obj.write - expected_content = "uprobe:/opt/ros/noetic/lib/libroscpp.so:_ZN3ros13CallbackQueue9callOneCBEv:_ZN3ros13CallbackQueue9callOneCBEv:_ZN3ros13CallbackQueue9callOneCBEv" + sym = "_ZN3ros13CallbackQueue9callOneCBEv" + expected_content = f"uprobe:/opt/ros/noetic/lib/libroscpp.so:{sym}:{sym}:{sym}" handle.assert_called_with(expected_content) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main()