Skip to content

Commit 929d2b2

Browse files
committed
feat: support hex representation for non-unicode strings
1 parent d0773f0 commit 929d2b2

3 files changed

Lines changed: 234 additions & 153 deletions

File tree

thorlog/v3/matchstrings.go

Lines changed: 103 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package thorlog
33
import (
44
"bytes"
55
"encoding/hex"
6-
"encoding/json"
76
"fmt"
87
"regexp"
98
"strings"
@@ -13,57 +12,108 @@ import (
1312
"github.com/NextronSystems/jsonlog"
1413
)
1514

16-
type MatchData struct {
17-
Data []byte
18-
FullHex bool
15+
// StringWithEncoding is a string paired with the encoding that was used to store it.
type StringWithEncoding struct {
16+
// Data holds the content, written according to Encoding.
Data string `json:"data"`
17+
// Encoding states how Data has to be interpreted.
Encoding StringEncoding `json:"encoding"`
1918
}
2019

21-
func (f MatchData) MarshalJSON() ([]byte, error) {
22-
matchingString := f.String()
23-
return InvalidUnicodeString(matchingString).MarshalJSON()
20+
// StringEncoding identifies how the Data of a StringWithEncoding is represented.
type StringEncoding string
21+
22+
const (
23+
// Plain means Data is the text itself.
Plain StringEncoding = "plain"
24+
// Hex means Data is the hexadecimal form of the underlying bytes.
Hex StringEncoding = "hex"
25+
)
26+
27+
// Encode encodes the given data into a StringWithEncoding,
28+
// choosing the most appropriate encoding based on its content.
29+
func Encode(s []byte) StringWithEncoding {
30+
if utf8.Valid(s) {
31+
return StringWithEncoding{
32+
Data: string(s),
33+
Encoding: Plain,
34+
}
35+
} else {
36+
return StringWithEncoding{
37+
Data: hex.EncodeToString(s),
38+
Encoding: Hex,
39+
}
40+
}
2441
}
2542

26-
func (f *MatchData) UnmarshalJSON(data []byte) error {
27-
var matchingString string
28-
err := json.Unmarshal(data, &matchingString)
29-
if err != nil {
30-
return err
43+
// EncodeString encodes the given data into a StringWithEncoding,
44+
// choosing the most appropriate encoding based on its content.
45+
func EncodeString(s string) StringWithEncoding {
46+
if utf8.ValidString(s) {
47+
return StringWithEncoding{
48+
Data: s,
49+
Encoding: Plain,
50+
}
51+
} else {
52+
return StringWithEncoding{
53+
Data: hex.EncodeToString([]byte(s)),
54+
Encoding: Hex,
55+
}
3156
}
32-
f.Data = []byte(matchingString)
33-
return nil
3457
}
3558

36-
func (f MatchData) JSONSchemaAlias() any {
37-
return ""
59+
// Plaintext returns the raw byte sequence represented by the StringWithEncoding.
60+
func (s StringWithEncoding) Plaintext() []byte {
61+
switch s.Encoding {
62+
case Plain:
63+
return []byte(s.Data)
64+
case Hex:
65+
data, err := hex.DecodeString(s.Data)
66+
if err != nil {
67+
return []byte("<invalid hex data: " + err.Error() + ">")
68+
}
69+
return data
70+
default:
71+
return []byte(fmt.Sprintf("<unknown encoding %s>", s.Encoding))
72+
}
3873
}
3974

4075
var notOnlyASCII = regexp.MustCompile(`[^\x20-\x7E\x0d\x0a\x09]+`) // printable chars + \r,\n,\t
4176

42-
func (f MatchData) String() string {
43-
if f.FullHex {
44-
return hex.EncodeToString(f.Data)
77+
// String returns a human-readable representation of the encoded string.
78+
// The representation is guaranteed to be valid UTF-8.
79+
func (s StringWithEncoding) String() string {
80+
data := s.decode()
81+
if needsQuoting.MatchString(data) {
82+
return quote(data)
4583
}
46-
data := f.Data
47-
matchingString := string(data) // Try to directly convert
84+
return data
85+
}
4886

49-
if !f.FullHex && notOnlyASCII.MatchString(matchingString) { // Check if any non-printable chars occur
50-
var utf16Data = data
51-
// Try UTF16 encoding
52-
if len(utf16Data) > 1 && utf16Data[0] == 0xFF && utf16Data[1] == 0xFE {
53-
// Remove byte order mark
54-
utf16Data = utf16Data[2:]
55-
}
56-
if len(utf16Data) > 0 && utf16Data[0] == 0 {
57-
// Might be UTF16 shifted by one byte
58-
utf16Data = utf16Data[1:]
59-
}
60-
matchingString, _ = decodeUTF16(utf16Data)
61-
if notOnlyASCII.MatchString(matchingString) || len(matchingString) == 0 {
62-
// Can't cleanly be rendered as UTF-16
63-
matchingString = string(data)
64-
}
87+
// decode returns the plain text, after decoding it from UTF-16, if applicable.
88+
func (s StringWithEncoding) decode() string {
89+
plaintext := s.Plaintext()
90+
91+
if decoded, ok := attemptDecodeUTF16(plaintext); ok {
92+
return decoded
6593
}
66-
return matchingString
94+
95+
return string(plaintext)
96+
}
97+
98+
// attemptDecodeUTF16 tries to decode the given byte slice as UTF-16 and checks
99+
// whether the decoded string contains non-ASCII characters.
100+
// It returns the decoded string and a boolean indicating whether the decoding was successful.
101+
func attemptDecodeUTF16(b []byte) (string, bool) {
102+
// Try UTF16 encoding
103+
if len(b) > 1 && b[0] == 0xFF && b[1] == 0xFE {
104+
// Remove byte order mark
105+
b = b[2:]
106+
}
107+
if len(b) > 0 && b[0] == 0 {
108+
// Might be UTF16 shifted by one byte
109+
b = b[1:]
110+
}
111+
decodedUtf16, _ := decodeUTF16(b)
112+
if !notOnlyASCII.MatchString(decodedUtf16) && len(decodedUtf16) >= 0 {
113+
// Can cleanly be rendered as UTF-16
114+
return decodedUtf16, true
115+
}
116+
return "", false
67117
}
68118

69119
// https://gist.github.com/bradleypeabody/185b1d7ed6c0c2ab6cec
@@ -84,31 +134,30 @@ func decodeUTF16(b []byte) (string, error) {
84134
return ret.String(), nil
85135
}
86136

87-
func (f MatchData) QuotedString() string {
88-
matchingString := f.String()
89-
matchingString = escaper.Replace(matchingString)
90-
var replacedString bytes.Buffer
91-
for _, char := range []byte(matchingString) {
137+
func quote(s string) string {
138+
s = escaper.Replace(s)
139+
var quotedString bytes.Buffer
140+
quotedString.WriteString(`"`)
141+
for _, char := range []byte(s) {
92142
if char < 0x20 || char > 0x7E { // non ASCII
93-
replacedString.WriteString("\\x")
94-
replacedString.WriteString(hex.EncodeToString([]byte{char}))
143+
quotedString.WriteString("\\x")
144+
quotedString.WriteString(hex.EncodeToString([]byte{char}))
95145
} else {
96-
replacedString.WriteByte(char)
146+
quotedString.WriteByte(char)
97147
}
98148
}
99-
matchingString = replacedString.String()
100-
matchingString = fmt.Sprintf("\"%s\"", matchingString)
101-
return matchingString
149+
quotedString.WriteString(`"`)
150+
return quotedString.String()
102151
}
103152

104153
// MatchString describes a sequence of bytes in an object
105154
// that was matched on by a signature.
106155
type MatchString struct {
107156
// Match contains the bytes that were matched.
108-
Match MatchData `json:"data"`
157+
Match StringWithEncoding `json:"data"`
109158
// Context contains the bytes surrounding the matched bytes.
110159
// This may be missing if no context is available.
111-
Context *MatchData `json:"context,omitempty"`
160+
Context *StringWithEncoding `json:"context,omitempty"`
112161
// Offset contains the Match's offset within the Field
113162
// where the data was matched.
114163
Offset *uint64 `json:"offset,omitempty"`
@@ -120,26 +169,16 @@ type MatchString struct {
120169
var needsQuoting = regexp.MustCompile(`[^\x21\x23-\x7E]`)
121170

122171
func (f MatchString) String() string {
123-
var matchString string
124-
if needsQuoting.MatchString(f.Match.String()) && !f.Match.FullHex {
125-
matchString += f.Match.QuotedString()
126-
} else {
127-
matchString += f.Match.String()
128-
}
172+
matchString := f.Match.String()
129173
if f.Context != nil {
130-
matchString += " in "
131-
if needsQuoting.MatchString(f.Context.String()) && !f.Context.FullHex {
132-
matchString += f.Context.QuotedString()
133-
} else {
134-
matchString += f.Context.String()
135-
}
174+
matchString += " in " + f.Context.String()
136175
}
137176
if f.Offset != nil {
138177
// Only show the offset if this match does not encompass the full field and it's not explicitly hidden
139178
var showOffset = !f.HideOffset
140179
if f.Field != nil && *f.Offset == 0 {
141180
if targetString, isString := f.Field.Value().(string); isString {
142-
if targetString == string(f.Match.Data) {
181+
if targetString == f.Match.Data {
143182
showOffset = false
144183
}
145184
}

0 commit comments

Comments
 (0)