From 158ab0e31f22d5c6707b78cd75935015e7007dec Mon Sep 17 00:00:00 2001 From: Vladimir Sitnikov Date: Wed, 4 Mar 2026 09:12:38 +0300 Subject: [PATCH 1/2] feat: make DIAGNOSTIC_UPLOAD_TIMEOUT configurable Co-Authored-By: Claude Opus 4.6 --- diagtools/constants/helpers.go | 16 ++++++++++++++++ diagtools/constants/helpers_test.go | 25 +++++++++++++++++++++++++ diagtools/constants/names.go | 1 + diagtools/utils/sender.go | 3 ++- 4 files changed, 44 insertions(+), 1 deletion(-) diff --git a/diagtools/constants/helpers.go b/diagtools/constants/helpers.go index fca546d9..6763fc48 100644 --- a/diagtools/constants/helpers.go +++ b/diagtools/constants/helpers.go @@ -371,3 +371,19 @@ func parseDurationOrSeconds(interval string) (time.Duration, error) { return 0, err } + +// UploadTimeout returns the HTTP client timeout for sending dump files (e.g. to dumps-collector). +// Configured via DIAGNOSTIC_UPLOAD_TIMEOUT; supports duration strings (e.g. "30m", "1h") or seconds as integer. +// Default is 5 minutes. +func UploadTimeout(ctx context.Context) time.Duration { + env := os.Getenv(NcDiagUploadTimeout) + if env == "" { + return 5 * time.Minute + } + d, err := parseDurationOrSeconds(env) + if err != nil { + log.Error(ctx, err, "Parsing upload timeout failed. Will use default: 5m") + return 5 * time.Minute + } + return d +} diff --git a/diagtools/constants/helpers_test.go b/diagtools/constants/helpers_test.go index d76f728c..a9ce8e45 100644 --- a/diagtools/constants/helpers_test.go +++ b/diagtools/constants/helpers_test.go @@ -434,3 +434,28 @@ func TestGetLogLevelFromEnv(t *testing.T) { os.Setenv("LOG_LEVEL", "TRACE") assert.Equal(t, "info", GetLogLevelFromEnv()) } + +func TestUploadTimeout(t *testing.T) { + os.Setenv("DIAGNOSTIC_UPLOAD_TIMEOUT", "") + assert.Equal(t, 5*time.Minute, UploadTimeout(testCtx)) // default timeout + + os.Setenv("DIAGNOSTIC_UPLOAD_TIMEOUT", "600") + assert.Equal(t, 600*time.Second, UploadTimeout(testCtx)) // integer value means seconds + + os.Setenv("DIAGNOSTIC_UPLOAD_TIMEOUT", "10s") + assert.Equal(t, 10*time.Second, UploadTimeout(testCtx)) + + os.Setenv("DIAGNOSTIC_UPLOAD_TIMEOUT", "30m") + assert.Equal(t, 30*time.Minute, UploadTimeout(testCtx)) + + os.Setenv("DIAGNOSTIC_UPLOAD_TIMEOUT", "1h") + assert.Equal(t, 1*time.Hour, UploadTimeout(testCtx)) + + os.Setenv("DIAGNOSTIC_UPLOAD_TIMEOUT", "bad") + assert.Equal(t, 5*time.Minute, UploadTimeout(testCtx)) // default timeout for invalid values + + os.Setenv(NcDiagUploadTimeout, "15m") + assert.Equal(t, 15*time.Minute, UploadTimeout(testCtx)) + assert.Equal(t, "DIAGNOSTIC_UPLOAD_TIMEOUT", NcDiagUploadTimeout) + os.Setenv("DIAGNOSTIC_UPLOAD_TIMEOUT", "") +} diff --git a/diagtools/constants/names.go b/diagtools/constants/names.go index 803d936a..43d9863c 100644 --- a/diagtools/constants/names.go +++ b/diagtools/constants/names.go @@ -28,6 +28,7 @@ const ( NcDiagCenterDumpEnabled = "DIAGNOSTIC_CENTER_DUMPS_ENABLED" // heap dump collection after OOM // ENABLED by default NcDiagDumpInterval = "DIAGNOSTIC_DUMP_INTERVAL" NcDiagScanInterval = "DIAGNOSTIC_SCAN_INTERVAL" + NcDiagUploadTimeout = "DIAGNOSTIC_UPLOAD_TIMEOUT" // HTTP client timeout for sending dump files (default 5m) NcProfilerFolder = "PROFILER_FOLDER" ZipCompressionLevel = "NC_HEAP_DUMP_COMPRESSION_LEVEL" NcCloudNamespace = "CLOUD_NAMESPACE" diff --git a/diagtools/utils/sender.go b/diagtools/utils/sender.go index 62762bd5..a4b876bb 100644 --- a/diagtools/utils/sender.go +++ b/diagtools/utils/sender.go @@ -11,6 +11,7 @@ import ( "path/filepath" "time" + "github.com/Netcracker/qubership-profiler-agent/diagtools/constants" "github.com/Netcracker/qubership-profiler-agent/diagtools/log" ) @@ -276,6 +277,6 @@ func FileClient(ctx context.Context) (*http.Client, error) { log.Error(ctx, err, "err While creating http client with TLS configuration") return nil, err } - client.Timeout = time.Minute * 5 + client.Timeout = constants.UploadTimeout(ctx) return client, nil } From 9990d91c2d993f3de92747ff0393a4c6e8190a9a Mon Sep 17 00:00:00 2001 From: Nareshamr Date: Wed, 11 Mar 2026 09:50:10 +0530 Subject: [PATCH 2/2] added DIAGNOSTIC_UPLOAD_RETRY_COUNT to limit retry for failed cases --- diagtools/README.md | 29 ++++++++++++++-------- diagtools/actions/action.go | 34 +++++++++++++++---------- diagtools/constants/helpers.go | 15 ++++++++++++ diagtools/constants/names.go | 1 + diagtools/utils/sender.go | 45 ++++++++++++++++++++++++++-------- 5 files changed, 91 insertions(+), 33 deletions(-) diff --git a/diagtools/README.md b/diagtools/README.md index 06eb9723..29baf820 100644 --- a/diagtools/README.md +++ b/diagtools/README.md @@ -17,24 +17,27 @@ Supported features: see [golang:deflate.go](https://github.com/golang/go/blob/master/src/compress/flate/deflate.go#L15-L18) - upload parameter is optional. Uploads zipped heap dump to collector service diagnostic controller via REST `http://localhost:8081/diagnostic/{namespace}/*/*/*/*/*/*/{podName}/{dumpName}`. - Works together with zip flag only. + Works together with zip flag only. Failed uploads are retried up to a configurable limit (see + DIAGNOSTIC_UPLOAD_RETRY_COUNT). - `diagtools dump` - "dump" subcommand is responsible for collecting java thread dump (`jstack -l "{{.Pid}}"`) and - CPU usage for java pid (`top -Hb -p{{.Pid}} -oTIME+ -d60 -n1`) - NC_DIAGNOSTIC_THREADDUMP_ENABLED and NC_DIAGNOSTIC_TOP_ENABLED is responsible for turning on/off collecting - data. If environment variable is absent it is treated as on. + CPU usage for java pid (`top -Hb -p{{.Pid}} -oTIME+ -d60 -n1`). + NC_DIAGNOSTIC_THREADDUMP_ENABLED, NC_DIAGNOSTIC_TOP_ENABLED and NC_DIAGNOSTIC_GC_ENABLED control whether + thread dump, top and GC log collection are enabled. If an environment variable is absent it is treated as on. - `diagtools scan /tmp/diagnostic/*.hprof* ./core* ./hs_err*` - "scan" subcommand is responsible for finding files matching patterns, zipping (if necessary) ".hprof" and uploading ".hprof.zip" and other found files to collector service diagnostic controller via - REST `http://localhost:8081/diagnostic/{namespace}/*/*/*/*/*/*/{podName}/{dumpName}`. + REST `http://localhost:8081/diagnostic/{namespace}/*/*/*/*/*/*/{podName}/{dumpName}`. Failed uploads + are retried up to a configurable limit (see DIAGNOSTIC_UPLOAD_RETRY_COUNT). - `diagtools schedule` - - "schedule" subcommand is responsible for collecting dumps(like dump subcommand), - scanning(like scan subcommand) and cleaning logs located in NC_DIAGNOSTIC_LOGS_FOLDER by schedule. + - "schedule" subcommand is responsible for collecting dumps (like dump subcommand), GC logs, + scanning (like scan subcommand) and cleaning logs located in NC_DIAGNOSTIC_LOGS_FOLDER by schedule. Interval can be changed via DIAGNOSTIC_DUMP_INTERVAL(default 1m), DIAGNOSTIC_SCAN_INTERVAL(default 3m) and - KEEP_LOGS_INTERVAL(default 2 days) environment variables. + KEEP_LOGS_INTERVAL(default 2 days) environment variables. GC log collection (upload of gc.log and rotated + files from the gclogs folder) is controlled by NC_DIAGNOSTIC_GC_ENABLED (enabled by default). - `diagtools zkConfig "${NC_DIAGNOSTIC_FOLDER}/zkproperties" esc.config NC_DIAGNOSTIC_ESC_ENABLED ...` - "zkConfig" subcommand is responsible for changing nc-diagnostic-agent settings in case when ZOOKEEPER_ENABLED=true @@ -50,8 +53,10 @@ Environment variables used by tool: - KEEP_LOGS_INTERVAL - logs located in NC_DIAGNOSTIC_LOGS_FOLDER rotation interval in days. Default is 2. - LOG_TO_CONSOLE - indicates if send logs to the console. Default is false. - DIAGNOSTIC_CENTER_DUMPS_ENABLED - used to check if upload dumps to diagnostic center. -- NC_DIAGNOSTIC_THREADDUMP_ENABLED - used to check if thead dumps enabled. -- NC_DIAGNOSTIC_TOP_ENABLED - used to check if top dumps enabled. +- NC_DIAGNOSTIC_THREADDUMP_ENABLED - used to check if thread dumps enabled. Default is on (absent = on). +- NC_DIAGNOSTIC_TOP_ENABLED - used to check if top (CPU usage) collection enabled. Default is on (absent = on). +- NC_DIAGNOSTIC_GC_ENABLED - used to check if GC log collection by scheduler is enabled (upload of gc.log and + rotated GC log files to diagnostic center). Default is on (absent = on). - DIAGNOSTIC_DUMP_INTERVAL - dump interval used in case of schedule. Default is 1 minute. Support go Duration format. - DIAGNOSTIC_SCAN_INTERVAL - scan interval used in case of schedule. Default is 1 minutes. Support go Duration format. - NC_DIAGNOSTIC_AGENT_SERVICE - diagnostic agent service name. Default is `nc-diagnostic-agent`. @@ -61,5 +66,9 @@ Environment variables used by tool: - MICROSERVICE_NAME - contains actual microservice name. Can't be empty. - ZOOKEEPER_ADDRESS - zookeeper address for fetch settings from ZooKeeper. - NC_HEAP_DUMP_COMPRESSION_LEVEL - defines heap dump compression level. Default is `-1`. +- DIAGNOSTIC_UPLOAD_TIMEOUT - HTTP client timeout for sending files to the diagnostic center (e.g. dumps-collector). + Supports duration strings (e.g. `30m`, `1h`) or seconds as integer. Default is `5m`. +- DIAGNOSTIC_UPLOAD_RETRY_COUNT - maximum number of retries when sending files to the diagnostic center fails. + Total attempts = 1 + this value. Default is `3` (i.e. up to 4 attempts). - ESC_LOG_FORMAT - used to set custom log format for agent loggers using java logging service - LOGBACK_CLOUD_AGENT_LOG_FORMAT - Used to set custom log format for agent loggers using logback service. diff --git a/diagtools/actions/action.go b/diagtools/actions/action.go index 95dbcaa4..ed9b2165 100644 --- a/diagtools/actions/action.go +++ b/diagtools/actions/action.go @@ -162,21 +162,29 @@ func (action *Action) RunJCmdWithOutput(ctx context.Context) (output []byte, err func (action *Action) UploadOutputToDiagnosticCenter(ctx context.Context, output []byte) error { startTime := time.Now() + maxRetries := constants.UploadRetryCount(ctx) + var lastErr error + for attempt := 0; attempt <= maxRetries; attempt++ { + if attempt > 0 { + log.Infof(ctx, "retry sending to diagnostic center (attempt %d/%d)", attempt+1, maxRetries+1) + } + reader := bytes.NewReader(output) + request, err := http.NewRequestWithContext(ctx, http.MethodPut, action.TargetUrl, reader) + if err != nil { + log.Error(ctx, err, "failed to create http request") + return err + } + request.Header.Add("Content-Type", "application/octet-stream") - reader := bytes.NewReader(output) - request, err := http.NewRequestWithContext(ctx, http.MethodPut, action.TargetUrl, reader) - if err != nil { - log.Error(ctx, err, "failed to create http request") - return err - } - request.Header.Add("Content-Type", "application/octet-stream") - - err = utils.SendFileRequest(ctx, action.DumpPath, request) - if err == nil { - log.Info(ctx, fmt.Sprintf("uploaded %s", action.DumpPath), - "bytes", len(output), "duration", time.Since(startTime)) + lastErr = utils.SendFileRequest(ctx, action.DumpPath, request) + if lastErr == nil { + log.Info(ctx, fmt.Sprintf("uploaded %s", action.DumpPath), + "bytes", len(output), "duration", time.Since(startTime)) + return nil + } + log.Error(ctx, lastErr, "failed to send to diagnostic center", "attempt", attempt+1, "maxRetries", maxRetries) } - return err + return lastErr } func (action *Action) ZipDump(ctx context.Context) (err error) { diff --git a/diagtools/constants/helpers.go b/diagtools/constants/helpers.go index 6763fc48..0973b866 100644 --- a/diagtools/constants/helpers.go +++ b/diagtools/constants/helpers.go @@ -387,3 +387,18 @@ func UploadTimeout(ctx context.Context) time.Duration { } return d } + +// UploadRetryCount returns the maximum number of retries when sending files to the diagnostic center fails. +// Configured via DIAGNOSTIC_UPLOAD_RETRY_COUNT. Default is 3. +func UploadRetryCount(ctx context.Context) int { + env := os.Getenv(NcDiagUploadRetryCount) + if env == "" { + return 3 + } + n, err := strconv.Atoi(env) + if err != nil || n < 0 { + log.Error(ctx, err, "Parsing upload retry count failed. Will use default: 3") + return 3 + } + return n +} diff --git a/diagtools/constants/names.go b/diagtools/constants/names.go index 43d9863c..1526dcea 100644 --- a/diagtools/constants/names.go +++ b/diagtools/constants/names.go @@ -29,6 +29,7 @@ const ( NcDiagDumpInterval = "DIAGNOSTIC_DUMP_INTERVAL" NcDiagScanInterval = "DIAGNOSTIC_SCAN_INTERVAL" NcDiagUploadTimeout = "DIAGNOSTIC_UPLOAD_TIMEOUT" // HTTP client timeout for sending dump files (default 5m) + NcDiagUploadRetryCount = "DIAGNOSTIC_UPLOAD_RETRY_COUNT" // max retries on failed send to diagnostic center (default 3) NcProfilerFolder = "PROFILER_FOLDER" ZipCompressionLevel = "NC_HEAP_DUMP_COMPRESSION_LEVEL" NcCloudNamespace = "CLOUD_NAMESPACE" diff --git a/diagtools/utils/sender.go b/diagtools/utils/sender.go index a4b876bb..f8c7a0dd 100644 --- a/diagtools/utils/sender.go +++ b/diagtools/utils/sender.go @@ -16,6 +16,23 @@ import ( ) func SendMultiPart(ctx context.Context, endpoint string, files ...string) (err error) { + fName := fmt.Sprintf("%d files", len(files)) + maxRetries := constants.UploadRetryCount(ctx) + var lastErr error + for attempt := 0; attempt <= maxRetries; attempt++ { + if attempt > 0 { + log.Infof(ctx, "retry sending to diagnostic center (attempt %d/%d)", attempt+1, maxRetries+1) + } + lastErr = sendMultiPartOnce(ctx, endpoint, fName, files) + if lastErr == nil { + return nil + } + log.Error(ctx, lastErr, "failed to send files to diagnostic center", "attempt", attempt+1, "maxRetries", maxRetries) + } + return lastErr +} + +func sendMultiPartOnce(ctx context.Context, endpoint, fName string, files []string) (err error) { pipeReader, pipeWriter := io.Pipe() defer func(pipeReader *io.PipeReader) { errClose := pipeReader.Close() @@ -69,16 +86,28 @@ func SendMultiPart(ctx context.Context, endpoint string, files ...string) (err e } request.Header.Add("Content-Type", bodyWriter.FormDataContentType()) - fName := fmt.Sprintf("%d files", len(files)) - //fName := strings.Join(files, ";") - err = SendFileRequest(ctx, fName, request) - return - + return SendFileRequest(ctx, fName, request) } func SendSingleFile(ctx context.Context, targetUrl, fileName string) (err error) { startTime := time.Now() + maxRetries := constants.UploadRetryCount(ctx) + var lastErr error + for attempt := 0; attempt <= maxRetries; attempt++ { + if attempt > 0 { + log.Infof(ctx, "retry sending to diagnostic center (attempt %d/%d)", attempt+1, maxRetries+1) + } + lastErr = sendSingleFileOnce(ctx, targetUrl, fileName) + if lastErr == nil { + log.Info(ctx, fmt.Sprintf("uploaded %s", fileName), "duration", time.Since(startTime)) + return nil + } + log.Error(ctx, lastErr, "failed to send file to diagnostic center", "file", fileName, "attempt", attempt+1, "maxRetries", maxRetries) + } + return lastErr +} +func sendSingleFileOnce(ctx context.Context, targetUrl, fileName string) (err error) { pipeReader, pipeWriter := io.Pipe() defer func(pipeReader *io.PipeReader) { @@ -112,11 +141,7 @@ func SendSingleFile(ctx context.Context, targetUrl, fileName string) (err error) } request.Header.Add("Content-Type", "application/octet-stream") - err = SendFileRequest(ctx, fileName, request) - if err == nil { - log.Info(ctx, fmt.Sprintf("uploaded %s", fileName), "duration", time.Since(startTime)) - } - return err + return SendFileRequest(ctx, fileName, request) } func SendFileRequest(ctx context.Context, fileName string, request *http.Request) (err error) {