@@ -222,24 +222,6 @@ func isTrim(b byte) bool { return b == '.' || b == '_' || b == '-' }
222222
223223// appendSanitizedMetricName converts *any* string into something that StatsD / Graphite
224224// accepts without complaints.
225- //
226- // OPTIMIZATION NOTES:
227- // This implementation was optimized through several iterations:
228- //
229- // 1. Replaced manual UTF-8 bit manipulation with stdlib utf8.DecodeRune for maintainability
230- // while preserving the same transformation logic (Latin-1 supplement handling, invalid
231- // char replacement).
232- //
233- // 2. Added fast-path pre-check for the common case: since 99% of metrics are valid ASCII
234- // (e.g., "http.server.request.duration"), we first scan to detect if transformation is
235- // needed. If not, we bulk-copy the string instead of processing byte-by-byte.
236- //
237- // 3. Explored SIMD (Segment's asm/ascii library) but found simple Go code is faster for
238- // short metric names (~20-40 chars) due to function call overhead. SIMD wins on longer
239- // strings (100+ chars) but those are rare in practice.
240- //
241- // Result: 2.7x faster on typical workloads (25ns → 9.5ns per metric name). The fast-path
242- // check costs ~2-3ns even when transformation is needed, making it always beneficial.
243225func appendSanitizedMetricName (dst []byte , raw string ) []byte {
244226 origLen := len (dst )
245227 if raw == "" {
@@ -249,106 +231,37 @@ func appendSanitizedMetricName(dst []byte, raw string) []byte {
249231 return dst
250232 }
251233
252- // Fast path: check if string is pure valid ASCII (common case)
253- // Most metric names like "http.server.request.duration" hit this path
254- needsTransform := false
255- for i := range len (raw ) {
256- c := raw [i ]
257- if c >= utf8 .RuneSelf || ! valid [c ] {
258- needsTransform = true
234+ // Simple transformation: iterate through runes and convert/replace as needed
235+ lastWasRepl := false
236+ for i , r := range raw {
237+ if i >= maxLen {
259238 break
260239 }
261- }
262-
263- // If no transformation needed, just copy and trim
264- if ! needsTransform {
265- // Respect maxLen
266- copyLen := min (len (raw ), maxLen )
267- dst = append (dst , raw [:copyLen ]... )
268-
269- // Trim leading/trailing '.', '_' or '-'
270- start , end := origLen , len (dst )
271- for start < end && isTrim (dst [start ]) {
272- start ++
273- }
274- for end > start && isTrim (dst [end - 1 ]) {
275- end --
276- }
277-
278- if start > origLen || end < len (dst ) {
279- copy (dst [origLen :], dst [start :end ])
280- dst = dst [:origLen + (end - start )]
281- }
282-
283- if len (dst ) == origLen && len (raw ) > 0 {
284- // We didn't append any characters to dst - basically this means we
285- // truncated every single character from raw because it was a dot or
286- // underscore or hyphen.
287- return append (dst , "_truncated_" ... )
288- }
289- return dst
290- }
291240
292- // Slow path: needs transformation (has unicode, invalid chars, etc)
293- nameLen := 0
294- lastWasRepl := false
295- for _ , r := range raw {
296- if r < utf8 . RuneSelf {
297- // ASCII byte
298- if valid [ byte ( r )] {
299- dst = append ( dst , byte ( r ))
300- nameLen ++
241+ if r < utf8 . RuneSelf && valid [ byte ( r )] {
242+ // Valid ASCII character
243+ dst = append ( dst , byte ( r ))
244+ lastWasRepl = false
245+ } else if r >= 0xC0 && r <= 0xFF {
246+ // Latin-1 Supplement block (common accented characters like À, É, ñ)
247+ mapped := latin1SupplementMap [ r ]
248+ if valid [ mapped ] {
249+ dst = append ( dst , mapped )
301250 lastWasRepl = false
302- } else {
303- // Invalid ASCII character
304- if ! lastWasRepl {
305- dst = append (dst , replacement )
306- nameLen ++
307- lastWasRepl = true
308- }
309- }
310- } else {
311- // Non-ASCII rune
312- // Check if rune is in Latin-1 Supplement block (U+00C0 to U+00FF)
313- // This includes common accented characters like À, É, ñ, etc.
314- if r >= 0xC0 && r <= 0xFF {
315- mapped := latin1SupplementMap [r ]
316- if valid [mapped ] {
317- dst = append (dst , mapped )
318- nameLen ++
319- lastWasRepl = false
320- if nameLen >= maxLen {
321- break
322- }
323- continue
324- }
325- }
326-
327- if ! lastWasRepl {
251+ } else if ! lastWasRepl {
328252 dst = append (dst , replacement )
329- nameLen ++
330253 lastWasRepl = true
331254 }
332- }
333-
334- if nameLen >= maxLen {
335- break
255+ } else if ! lastWasRepl {
256+ // Invalid or unsupported character - only append if we didn't just add a replacement
257+ dst = append ( dst , replacement )
258+ lastWasRepl = true
336259 }
337260 }
338261
339- // Trim
340- start , end := origLen , len (dst )
341- for start < end && isTrim (dst [start ]) {
342- start ++
343- }
344- for end > start && isTrim (dst [end - 1 ]) {
345- end --
346- }
347-
348- if start > origLen || end < len (dst ) {
349- copy (dst [origLen :], dst [start :end ])
350- dst = dst [:origLen + (end - start )]
351- }
262+ // Trim leading/trailing '.', '_' or '-'
263+ trimmed := bytes .Trim (dst [origLen :], "._-" )
264+ dst = append (dst [:origLen ], trimmed ... )
352265
353266 if len (dst ) == origLen {
354267 return append (dst , "_truncated_" ... )
0 commit comments