// TruncateInvalidUTF8 returns the longest prefix of s that is valid UTF-8.
//
// The string is cut at the first byte that does not begin a well-formed UTF-8
// sequence; everything from that byte onward is discarded, even if valid text
// follows it (for example "abc\xffdef" becomes "abc"). If s is entirely valid
// UTF-8, s itself is returned unchanged.
//
// A correctly encoded U+FFFD replacement character is valid input and is kept.
func TruncateInvalidUTF8(s string) string {
	for i, r := range s {
		if r != utf8.RuneError {
			continue
		}
		// Ranging over a string yields RuneError both for a genuine U+FFFD
		// (three bytes wide) and for a malformed sequence (reported as one
		// byte wide). Only the decoded width tells the two cases apart.
		if _, size := utf8.DecodeRuneInString(s[i:]); size == 1 {
			return s[:i]
		}
	}
	return s
}