Skip to content

Commit da2b107

Browse files
simplify implementation
1 parent de940f8 commit da2b107

File tree

1 file changed

+21
-108
lines changed

1 file changed

+21
-108
lines changed

datadog/serializer.go

Lines changed: 21 additions & 108 deletions
Original file line number | Diff line number | Diff line change
@@ -222,24 +222,6 @@ func isTrim(b byte) bool { return b == '.' || b == '_' || b == '-' }
222222

223223
// appendSanitizedMetricName converts *any* string into something that StatsD / Graphite
224224
// accepts without complaints.
225-
//
226-
// OPTIMIZATION NOTES:
227-
// This implementation was optimized through several iterations:
228-
//
229-
// 1. Replaced manual UTF-8 bit manipulation with stdlib utf8.DecodeRune for maintainability
230-
// while preserving the same transformation logic (Latin-1 supplement handling, invalid
231-
// char replacement).
232-
//
233-
// 2. Added fast-path pre-check for the common case: since 99% of metrics are valid ASCII
234-
// (e.g., "http.server.request.duration"), we first scan to detect if transformation is
235-
// needed. If not, we bulk-copy the string instead of processing byte-by-byte.
236-
//
237-
// 3. Explored SIMD (Segment's asm/ascii library) but found simple Go code is faster for
238-
// short metric names (~20-40 chars) due to function call overhead. SIMD wins on longer
239-
// strings (100+ chars) but those are rare in practice.
240-
//
241-
// Result: 2.7x faster on typical workloads (25ns → 9.5ns per metric name). The fast-path
242-
// check costs ~2-3ns even when transformation is needed, making it always beneficial.
243225
func appendSanitizedMetricName(dst []byte, raw string) []byte {
244226
origLen := len(dst)
245227
if raw == "" {
@@ -249,106 +231,37 @@ func appendSanitizedMetricName(dst []byte, raw string) []byte {
249231
return dst
250232
}
251233

252-
// Fast path: check if string is pure valid ASCII (common case)
253-
// Most metric names like "http.server.request.duration" hit this path
254-
needsTransform := false
255-
for i := range len(raw) {
256-
c := raw[i]
257-
if c >= utf8.RuneSelf || !valid[c] {
258-
needsTransform = true
234+
// Simple transformation: iterate through runes and convert/replace as needed
235+
lastWasRepl := false
236+
for i, r := range raw {
237+
if i >= maxLen {
259238
break
260239
}
261-
}
262-
263-
// If no transformation needed, just copy and trim
264-
if !needsTransform {
265-
// Respect maxLen
266-
copyLen := min(len(raw), maxLen)
267-
dst = append(dst, raw[:copyLen]...)
268-
269-
// Trim leading/trailing '.', '_' or '-'
270-
start, end := origLen, len(dst)
271-
for start < end && isTrim(dst[start]) {
272-
start++
273-
}
274-
for end > start && isTrim(dst[end-1]) {
275-
end--
276-
}
277-
278-
if start > origLen || end < len(dst) {
279-
copy(dst[origLen:], dst[start:end])
280-
dst = dst[:origLen+(end-start)]
281-
}
282-
283-
if len(dst) == origLen && len(raw) > 0 {
284-
// We didn't append any characters to dst - basically this means we
285-
// truncated every single character from raw because it was a dot or
286-
// underscore or hyphen.
287-
return append(dst, "_truncated_"...)
288-
}
289-
return dst
290-
}
291240

292-
// Slow path: needs transformation (has unicode, invalid chars, etc)
293-
nameLen := 0
294-
lastWasRepl := false
295-
for _, r := range raw {
296-
if r < utf8.RuneSelf {
297-
// ASCII byte
298-
if valid[byte(r)] {
299-
dst = append(dst, byte(r))
300-
nameLen++
241+
if r < utf8.RuneSelf && valid[byte(r)] {
242+
// Valid ASCII character
243+
dst = append(dst, byte(r))
244+
lastWasRepl = false
245+
} else if r >= 0xC0 && r <= 0xFF {
246+
// Latin-1 Supplement block (common accented characters like À, É, ñ)
247+
mapped := latin1SupplementMap[r]
248+
if valid[mapped] {
249+
dst = append(dst, mapped)
301250
lastWasRepl = false
302-
} else {
303-
// Invalid ASCII character
304-
if !lastWasRepl {
305-
dst = append(dst, replacement)
306-
nameLen++
307-
lastWasRepl = true
308-
}
309-
}
310-
} else {
311-
// Non-ASCII rune
312-
// Check if rune is in Latin-1 Supplement block (U+00C0 to U+00FF)
313-
// This includes common accented characters like À, É, ñ, etc.
314-
if r >= 0xC0 && r <= 0xFF {
315-
mapped := latin1SupplementMap[r]
316-
if valid[mapped] {
317-
dst = append(dst, mapped)
318-
nameLen++
319-
lastWasRepl = false
320-
if nameLen >= maxLen {
321-
break
322-
}
323-
continue
324-
}
325-
}
326-
327-
if !lastWasRepl {
251+
} else if !lastWasRepl {
328252
dst = append(dst, replacement)
329-
nameLen++
330253
lastWasRepl = true
331254
}
332-
}
333-
334-
if nameLen >= maxLen {
335-
break
255+
} else if !lastWasRepl {
256+
// Invalid or unsupported character - only append if we didn't just add a replacement
257+
dst = append(dst, replacement)
258+
lastWasRepl = true
336259
}
337260
}
338261

339-
// Trim
340-
start, end := origLen, len(dst)
341-
for start < end && isTrim(dst[start]) {
342-
start++
343-
}
344-
for end > start && isTrim(dst[end-1]) {
345-
end--
346-
}
347-
348-
if start > origLen || end < len(dst) {
349-
copy(dst[origLen:], dst[start:end])
350-
dst = dst[:origLen+(end-start)]
351-
}
262+
// Trim leading/trailing '.', '_' or '-'
263+
trimmed := bytes.Trim(dst[origLen:], "._-")
264+
dst = append(dst[:origLen], trimmed...)
352265

353266
if len(dst) == origLen {
354267
return append(dst, "_truncated_"...)

0 commit comments

Comments (0)