diff --git a/docs/user_guide/request_cancellation.md b/docs/user_guide/request_cancellation.md index 753d03968a..10a676ced0 100644 --- a/docs/user_guide/request_cancellation.md +++ b/docs/user_guide/request_cancellation.md @@ -28,14 +28,14 @@ # Request Cancellation -Starting from r23.10, Triton supports handling request cancellation received -from the gRPC client or a C API user. Long running inference requests such -as for auto generative large language models may run for an indeterminate -amount of time or indeterminate number of steps. Additionally clients may -enqueue a large number of requests as part of a sequence or request stream -and later determine the results are no longer needed. Continuing to process -requests whose results are no longer required can significantly impact server -resources. +Triton supports handling request cancellation received from the gRPC Python +client or a C API user (since r23.10), and from the gRPC C++ client (since r26.05). +Long-running inference requests, such as those for auto generative large language +models, may run for an indeterminate amount of time or indeterminate number of +steps. Additionally, clients may enqueue a large number of requests as part of +a sequence or request stream and later determine the results are no longer +needed. Continuing to process requests whose results are no longer required can +significantly impact server resources. ## Issuing Request Cancellation @@ -51,8 +51,7 @@ about the APIs in [tritonserver.h](https://github.com/triton-inference-server/co In addition, [gRPC endpoint](../customization_guide/inference_protocols.md#httprest-and-grpc-protocols) can now detect cancellation from the client and attempt to terminate request. -At present, only gRPC python client supports issuing request cancellation -to the server endpoint. 
See [request-cancellation](https://github.com/triton-inference-server/client#request-cancellation) +See [request-cancellation](https://github.com/triton-inference-server/client#request-cancellation) for more details on how to issue requests from the client-side. See gRPC guide on RPC [cancellation](https://grpc.io/docs/guides/cancellation/) for finer details. diff --git a/qa/L0_request_cancellation/test.sh b/qa/L0_request_cancellation/test.sh index d8cffb91e6..2b92c12027 100755 --- a/qa/L0_request_cancellation/test.sh +++ b/qa/L0_request_cancellation/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -42,8 +42,10 @@ export CUDA_VISIBLE_DEVICES=0 SERVER=/opt/tritonserver/bin/tritonserver source ../common/util.sh +CANCEL_LOG_LINE="Cancellation notification received for " RET=0 +rm -f *.log # # Unit tests @@ -66,7 +68,7 @@ if [ $? -ne 0 ]; then fi # -# gRPC cancellation tests +# Python gRPC cancellation tests # rm -rf models && mkdir models mkdir -p models/custom_identity_int32/1 && (cd models/custom_identity_int32 && \ @@ -121,7 +123,7 @@ for TEST_CASE in "test_grpc_async_infer" \ RET=1 fi - count=$(grep -o "Cancellation notification received for" $SERVER_LOG | wc -l) + count=$(grep -o "$CANCEL_LOG_LINE" $SERVER_LOG | wc -l) if [ $count == 0 ]; then echo -e "\n***\n*** Cancellation not received by server on $TEST_CASE\n***" cat $SERVER_LOG @@ -170,6 +172,80 @@ for TEST_CASE in "test_grpc_async_infer" \ fi done +# +# C++ gRPC cancellation tests +# +# allow_timeout_override disables queue prefetching, keeping requests queued +# long enough for the "Queued" cancellation tests to cancel them before +# forwarding to the rate limiter. This saves overall test time. 
+cat >> models/custom_identity_int32/config.pbtxt <<'EOF'
+dynamic_batching {
+  default_queue_policy {
+    allow_timeout_override: true
+  }
+}
+EOF
+
+GRPC_CANCELLATION_TEST_CPP=../clients/grpc_cancellation_test
+# Each ENTRY is "<gtest case name> <expected count of cancellation log lines>".
+for ENTRY in "TestGrpcAsyncInferCancelExecutingRequest 1" \
+    "TestGrpcAsyncInferCancelQueuedRequest 2" \
+    "TestGrpcAsyncInferCancelAfterCompletionIsNoOp 0" \
+    "TestGrpcAsyncInferWithoutContextStillCompletes 0" \
+    "TestGrpcAsyncInferMultiCancelExecutingRequests 2" \
+    "TestGrpcAsyncInferMultiCancelQueuedRequest 2" \
+    "TestGrpcStreamInferCancelExecutingRequest 1" \
+    "TestGrpcStreamInferCancelQueuedRequest 1" \
+    "TestGrpcStreamCancelWithoutInfer 1" \
+    "TestGrpcStreamCancelThenRestart 1"; do
+    read -r TEST_CASE EXPECTED_CANCEL_COUNT <<< "$ENTRY"
+
+    TEST_LOG="./grpc_cancellation_test_cpp.$TEST_CASE.log"
+    SERVER_LOG="./grpc_cancellation_test_cpp.$TEST_CASE.server.log"
+
+    # AsyncInferMulti fans out N concurrent requests; bump to 3 CPU instances
+    # so each can execute in parallel. Every other test uses the default
+    # single-instance config.
+    if [ "$TEST_CASE" == "TestGrpcAsyncInferMultiCancelExecutingRequests" ]; then
+        sed -i 's|instance_group .*|instance_group [{ count: 3, kind: KIND_CPU }]|' \
+            models/custom_identity_int32/config.pbtxt
+    fi
+
+    SERVER_ARGS="--model-repository=$(pwd)/models --log-verbose=2"
+    run_server
+    if [ "$SERVER_PID" == "0" ]; then
+        echo -e "\n***\n*** Failed to start $SERVER\n***"
+        cat "$SERVER_LOG"
+        exit 1
+    fi
+
+    set +e
+    LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH "$GRPC_CANCELLATION_TEST_CPP" \
+        --gtest_filter="GrpcCancellationTest.$TEST_CASE" > "$TEST_LOG" 2>&1
+    if [ $? -ne 0 ]; then
+        echo -e "\n***\n*** C++ gRPC Cancellation Tests Failed on $TEST_CASE\n***"
+        cat "$TEST_LOG"
+        RET=1
+    fi
+
+    # Compare as strings so an empty/non-numeric count is reported, not skipped.
+    cancel_count=$(grep -c -F -- "$CANCEL_LOG_LINE" "$SERVER_LOG" || true)
+    if [ "$cancel_count" != "$EXPECTED_CANCEL_COUNT" ]; then
+        echo -e "\n***\n*** Unexpected cancellation count on $TEST_CASE. Expected $EXPECTED_CANCEL_COUNT but received $cancel_count.\n***"
+        cat "$SERVER_LOG"
+        RET=1
+    fi
+    set -e
+
+    kill "$SERVER_PID"
+    wait "$SERVER_PID"
+
+    if [ "$TEST_CASE" == "TestGrpcAsyncInferMultiCancelExecutingRequests" ]; then
+        sed -i 's|instance_group .*|instance_group [{ kind: KIND_CPU }]|' \
+            models/custom_identity_int32/config.pbtxt
+    fi
+done
+
 #
 # End-to-end scheduler tests
 #