Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 19 additions & 10 deletions diagtools/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,24 +17,27 @@ Supported features:
see [golang:deflate.go](https://github.com/golang/go/blob/master/src/compress/flate/deflate.go#L15-L18)
- upload parameter is optional. Uploads zipped heap dump to collector service diagnostic controller
via REST `http://localhost:8081/diagnostic/{namespace}/*/*/*/*/*/*/{podName}/{dumpName}`.
Works together with zip flag only.
Works together with zip flag only. Failed uploads are retried up to a configurable limit (see
DIAGNOSTIC_UPLOAD_RETRY_COUNT).

- `diagtools dump`
- "dump" subcommand is responsible for collecting java thread dump (`jstack -l "{{.Pid}}"`) and
CPU usage for java pid (`top -Hb -p{{.Pid}} -oTIME+ -d60 -n1`)
NC_DIAGNOSTIC_THREADDUMP_ENABLED and NC_DIAGNOSTIC_TOP_ENABLED is responsible for turning on/off collecting
data. If environment variable is absent it is treated as on.
CPU usage for java pid (`top -Hb -p{{.Pid}} -oTIME+ -d60 -n1`).
NC_DIAGNOSTIC_THREADDUMP_ENABLED, NC_DIAGNOSTIC_TOP_ENABLED and NC_DIAGNOSTIC_GC_ENABLED control whether
thread dump, top and GC log collection are enabled. If an environment variable is absent it is treated as on.

- `diagtools scan /tmp/diagnostic/*.hprof* ./core* ./hs_err*`
- "scan" subcommand is responsible for finding files matching patterns, zipping (if necessary) ".hprof" and
uploading ".hprof.zip" and other found files to collector service diagnostic controller via
REST `http://localhost:8081/diagnostic/{namespace}/*/*/*/*/*/*/{podName}/{dumpName}`.
REST `http://localhost:8081/diagnostic/{namespace}/*/*/*/*/*/*/{podName}/{dumpName}`. Failed uploads
are retried up to a configurable limit (see DIAGNOSTIC_UPLOAD_RETRY_COUNT).

- `diagtools schedule`
- "schedule" subcommand is responsible for collecting dumps(like dump subcommand),
scanning(like scan subcommand) and cleaning logs located in NC_DIAGNOSTIC_LOGS_FOLDER by schedule.
- "schedule" subcommand is responsible for collecting dumps (like dump subcommand), GC logs,
scanning (like scan subcommand) and cleaning logs located in NC_DIAGNOSTIC_LOGS_FOLDER by schedule.
Interval can be changed via DIAGNOSTIC_DUMP_INTERVAL(default 1m), DIAGNOSTIC_SCAN_INTERVAL(default 3m) and
KEEP_LOGS_INTERVAL(default 2 days) environment variables.
KEEP_LOGS_INTERVAL(default 2 days) environment variables. GC log collection (upload of gc.log and rotated
files from the gclogs folder) is controlled by NC_DIAGNOSTIC_GC_ENABLED (enabled by default).

- `diagtools zkConfig "${NC_DIAGNOSTIC_FOLDER}/zkproperties" esc.config NC_DIAGNOSTIC_ESC_ENABLED ...`
- "zkConfig" subcommand is responsible for changing nc-diagnostic-agent settings in case when ZOOKEEPER_ENABLED=true
Expand All @@ -50,8 +53,10 @@ Environment variables used by tool:
- KEEP_LOGS_INTERVAL - logs located in NC_DIAGNOSTIC_LOGS_FOLDER rotation interval in days. Default is 2.
- LOG_TO_CONSOLE - indicates if send logs to the console. Default is false.
- DIAGNOSTIC_CENTER_DUMPS_ENABLED - used to check if upload dumps to diagnostic center.
- NC_DIAGNOSTIC_THREADDUMP_ENABLED - used to check if thead dumps enabled.
- NC_DIAGNOSTIC_TOP_ENABLED - used to check if top dumps enabled.
- NC_DIAGNOSTIC_THREADDUMP_ENABLED - used to check if thread dumps enabled. Default is on (absent = on).
- NC_DIAGNOSTIC_TOP_ENABLED - used to check if top (CPU usage) collection enabled. Default is on (absent = on).
- NC_DIAGNOSTIC_GC_ENABLED - used to check if GC log collection by scheduler is enabled (upload of gc.log and
rotated GC log files to diagnostic center). Default is on (absent = on).
- DIAGNOSTIC_DUMP_INTERVAL - dump interval used in case of schedule. Default is 1 minute. Support go Duration format.
- DIAGNOSTIC_SCAN_INTERVAL - scan interval used in case of schedule. Default is 1 minutes. Support go Duration format.
- NC_DIAGNOSTIC_AGENT_SERVICE - diagnostic agent service name. Default is `nc-diagnostic-agent`.
Expand All @@ -61,5 +66,9 @@ Environment variables used by tool:
- MICROSERVICE_NAME - contains actual microservice name. Can't be empty.
- ZOOKEEPER_ADDRESS - zookeeper address for fetch settings from ZooKeeper.
- NC_HEAP_DUMP_COMPRESSION_LEVEL - defines heap dump compression level. Default is `-1`.
- DIAGNOSTIC_UPLOAD_TIMEOUT - HTTP client timeout for sending files to the diagnostic center (e.g. dumps-collector).
Supports duration strings (e.g. `30m`, `1h`) or seconds as integer. Default is `5m`.
- DIAGNOSTIC_UPLOAD_RETRY_COUNT - maximum number of retries when sending files to the diagnostic center fails.
Total attempts = 1 + this value. Default is `3` (i.e. up to 4 attempts).
- ESC_LOG_FORMAT - used to set custom log format for agent loggers using java logging service
- LOGBACK_CLOUD_AGENT_LOG_FORMAT - Used to set custom log format for agent loggers using logback service.
34 changes: 21 additions & 13 deletions diagtools/actions/action.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,21 +162,29 @@ func (action *Action) RunJCmdWithOutput(ctx context.Context) (output []byte, err

func (action *Action) UploadOutputToDiagnosticCenter(ctx context.Context, output []byte) error {
startTime := time.Now()
maxRetries := constants.UploadRetryCount(ctx)
var lastErr error
for attempt := 0; attempt <= maxRetries; attempt++ {
if attempt > 0 {
log.Infof(ctx, "retry sending to diagnostic center (attempt %d/%d)", attempt+1, maxRetries+1)
}
reader := bytes.NewReader(output)
request, err := http.NewRequestWithContext(ctx, http.MethodPut, action.TargetUrl, reader)
if err != nil {
log.Error(ctx, err, "failed to create http request")
return err
}
request.Header.Add("Content-Type", "application/octet-stream")

reader := bytes.NewReader(output)
request, err := http.NewRequestWithContext(ctx, http.MethodPut, action.TargetUrl, reader)
if err != nil {
log.Error(ctx, err, "failed to create http request")
return err
}
request.Header.Add("Content-Type", "application/octet-stream")

err = utils.SendFileRequest(ctx, action.DumpPath, request)
if err == nil {
log.Info(ctx, fmt.Sprintf("uploaded %s", action.DumpPath),
"bytes", len(output), "duration", time.Since(startTime))
lastErr = utils.SendFileRequest(ctx, action.DumpPath, request)
if lastErr == nil {
log.Info(ctx, fmt.Sprintf("uploaded %s", action.DumpPath),
"bytes", len(output), "duration", time.Since(startTime))
return nil
}
log.Error(ctx, lastErr, "failed to send to diagnostic center", "attempt", attempt+1, "maxRetries", maxRetries)
}
return err
return lastErr
}

func (action *Action) ZipDump(ctx context.Context) (err error) {
Expand Down
15 changes: 15 additions & 0 deletions diagtools/constants/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -387,3 +387,18 @@ func UploadTimeout(ctx context.Context) time.Duration {
}
return d
}

// UploadRetryCount returns the maximum number of retries when sending files to the diagnostic center fails.
// Configured via DIAGNOSTIC_UPLOAD_RETRY_COUNT. Default is 3.
func UploadRetryCount(ctx context.Context) int {
env := os.Getenv(NcDiagUploadRetryCount)
if env == "" {
return 3
}
n, err := strconv.Atoi(env)
if err != nil || n < 0 {
log.Error(ctx, err, "Parsing upload retry count failed. Will use default: 3")
return 3
}
return n
}
1 change: 1 addition & 0 deletions diagtools/constants/names.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ const (
NcDiagDumpInterval = "DIAGNOSTIC_DUMP_INTERVAL"
NcDiagScanInterval = "DIAGNOSTIC_SCAN_INTERVAL"
NcDiagUploadTimeout = "DIAGNOSTIC_UPLOAD_TIMEOUT" // HTTP client timeout for sending dump files (default 5m)
NcDiagUploadRetryCount = "DIAGNOSTIC_UPLOAD_RETRY_COUNT" // max retries on failed send to diagnostic center (default 3)
NcProfilerFolder = "PROFILER_FOLDER"
ZipCompressionLevel = "NC_HEAP_DUMP_COMPRESSION_LEVEL"
NcCloudNamespace = "CLOUD_NAMESPACE"
Expand Down
45 changes: 35 additions & 10 deletions diagtools/utils/sender.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,23 @@ import (
)

func SendMultiPart(ctx context.Context, endpoint string, files ...string) (err error) {
fName := fmt.Sprintf("%d files", len(files))
maxRetries := constants.UploadRetryCount(ctx)
var lastErr error
for attempt := 0; attempt <= maxRetries; attempt++ {
if attempt > 0 {
log.Infof(ctx, "retry sending to diagnostic center (attempt %d/%d)", attempt+1, maxRetries+1)
}
lastErr = sendMultiPartOnce(ctx, endpoint, fName, files)
if lastErr == nil {
return nil
}
log.Error(ctx, lastErr, "failed to send files to diagnostic center", "attempt", attempt+1, "maxRetries", maxRetries)
}
return lastErr
}

func sendMultiPartOnce(ctx context.Context, endpoint, fName string, files []string) (err error) {
pipeReader, pipeWriter := io.Pipe()
defer func(pipeReader *io.PipeReader) {
errClose := pipeReader.Close()
Expand Down Expand Up @@ -69,16 +86,28 @@ func SendMultiPart(ctx context.Context, endpoint string, files ...string) (err e
}
request.Header.Add("Content-Type", bodyWriter.FormDataContentType())

fName := fmt.Sprintf("%d files", len(files))
//fName := strings.Join(files, ";")
err = SendFileRequest(ctx, fName, request)
return

return SendFileRequest(ctx, fName, request)
}

func SendSingleFile(ctx context.Context, targetUrl, fileName string) (err error) {
startTime := time.Now()
maxRetries := constants.UploadRetryCount(ctx)
var lastErr error
for attempt := 0; attempt <= maxRetries; attempt++ {
if attempt > 0 {
log.Infof(ctx, "retry sending to diagnostic center (attempt %d/%d)", attempt+1, maxRetries+1)
}
lastErr = sendSingleFileOnce(ctx, targetUrl, fileName)
if lastErr == nil {
log.Info(ctx, fmt.Sprintf("uploaded %s", fileName), "duration", time.Since(startTime))
return nil
}
log.Error(ctx, lastErr, "failed to send file to diagnostic center", "file", fileName, "attempt", attempt+1, "maxRetries", maxRetries)
}
return lastErr
}

func sendSingleFileOnce(ctx context.Context, targetUrl, fileName string) (err error) {
pipeReader, pipeWriter := io.Pipe()

defer func(pipeReader *io.PipeReader) {
Expand Down Expand Up @@ -112,11 +141,7 @@ func SendSingleFile(ctx context.Context, targetUrl, fileName string) (err error)
}
request.Header.Add("Content-Type", "application/octet-stream")

err = SendFileRequest(ctx, fileName, request)
if err == nil {
log.Info(ctx, fmt.Sprintf("uploaded %s", fileName), "duration", time.Since(startTime))
}
return err
return SendFileRequest(ctx, fileName, request)
}

func SendFileRequest(ctx context.Context, fileName string, request *http.Request) (err error) {
Expand Down
Loading