From 31086b4b0c65e3c4054981801441797e524449d9 Mon Sep 17 00:00:00 2001 From: Lunny Xiao Date: Fri, 12 Dec 2025 23:29:18 -0800 Subject: [PATCH 1/6] Fix bug when viewing the commit diff page with non-ANSI files --- services/gitdiff/gitdiff.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/services/gitdiff/gitdiff.go b/services/gitdiff/gitdiff.go index 6e15f7160956c..44dd3efc6feaa 100644 --- a/services/gitdiff/gitdiff.go +++ b/services/gitdiff/gitdiff.go @@ -1325,10 +1325,10 @@ func GetDiffForRender(ctx context.Context, repoLink string, gitRepo *git.Reposit shouldFullFileHighlight := !setting.Git.DisableDiffHighlight && attrDiff.Value() == "" if shouldFullFileHighlight { if limitedContent.LeftContent != nil && limitedContent.LeftContent.buf.Len() < MaxDiffHighlightEntireFileSize { - diffFile.highlightedLeftLines = highlightCodeLines(diffFile, true /* left */, limitedContent.LeftContent.buf.String()) + diffFile.highlightedLeftLines = highlightCodeLines(diffFile, true /* left */, limitedContent.LeftContent.buf.Bytes()) } if limitedContent.RightContent != nil && limitedContent.RightContent.buf.Len() < MaxDiffHighlightEntireFileSize { - diffFile.highlightedRightLines = highlightCodeLines(diffFile, false /* right */, limitedContent.RightContent.buf.String()) + diffFile.highlightedRightLines = highlightCodeLines(diffFile, false /* right */, limitedContent.RightContent.buf.Bytes()) } } } @@ -1336,7 +1336,8 @@ func GetDiffForRender(ctx context.Context, repoLink string, gitRepo *git.Reposit return diff, nil } -func highlightCodeLines(diffFile *DiffFile, isLeft bool, content string) map[int]template.HTML { +func highlightCodeLines(diffFile *DiffFile, isLeft bool, rawContent []byte) map[int]template.HTML { + content, _ := charset.ToUTF8(rawContent, charset.ConvertOpts{KeepBOM: false}) highlightedNewContent, _ := highlight.Code(diffFile.Name, diffFile.Language, content) splitLines := strings.Split(string(highlightedNewContent), "\n") lines := 
make(map[int]template.HTML, len(splitLines)) From 12a737ea00ba7f25747178051227c0d0895c5c5b Mon Sep 17 00:00:00 2001 From: wxiaoguang Date: Sat, 13 Dec 2025 16:40:54 +0800 Subject: [PATCH 2/6] fix --- modules/charset/charset.go | 144 +++++------- modules/charset/charset_test.go | 218 ++++-------------- modules/httplib/serve.go | 7 +- modules/indexer/code/bleve/bleve.go | 2 +- .../code/elasticsearch/elasticsearch.go | 2 +- routers/web/repo/editor.go | 6 +- services/gitdiff/gitdiff.go | 12 +- services/gitdiff/gitdiff_test.go | 15 ++ .../migration-test/migration_test.go | 6 +- 9 files changed, 135 insertions(+), 277 deletions(-) diff --git a/modules/charset/charset.go b/modules/charset/charset.go index 597ce5120c611..c224281d33765 100644 --- a/modules/charset/charset.go +++ b/modules/charset/charset.go @@ -5,12 +5,10 @@ package charset import ( "bytes" - "fmt" "io" "strings" "unicode/utf8" - "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/util" @@ -23,135 +21,121 @@ import ( var UTF8BOM = []byte{'\xef', '\xbb', '\xbf'} type ConvertOpts struct { - KeepBOM bool + KeepBOM bool + ErrorReplacement []byte + ErrorReturnOrigin bool } +var ToUTF8WithFallbackReaderPrefetchSize = 16 * 1024 + // ToUTF8WithFallbackReader detects the encoding of content and converts to UTF-8 reader if possible func ToUTF8WithFallbackReader(rd io.Reader, opts ConvertOpts) io.Reader { - buf := make([]byte, 2048) + buf := make([]byte, ToUTF8WithFallbackReaderPrefetchSize) n, err := util.ReadAtMost(rd, buf) if err != nil { - return io.MultiReader(bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)), rd) + // read error occurs, don't do any processing + return io.MultiReader(bytes.NewReader(buf[:n]), rd) } - charsetLabel, err := DetectEncoding(buf[:n]) - if err != nil || charsetLabel == "UTF-8" { - return io.MultiReader(bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)), rd) + charsetLabel, _ := DetectEncoding(buf[:n]) + if charsetLabel == "UTF-8" { + // is 
utf-8, try to remove BOM and read it as-is + return io.MultiReader(bytes.NewReader(maybeRemoveBOM(buf[:n], opts)), rd) } encoding, _ := charset.Lookup(charsetLabel) if encoding == nil { + // unknown charset, don't do any processing return io.MultiReader(bytes.NewReader(buf[:n]), rd) } + // convert from charset to utf-8 return transform.NewReader( - io.MultiReader( - bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)), - rd, - ), + io.MultiReader(bytes.NewReader(buf[:n]), rd), encoding.NewDecoder(), ) } -// ToUTF8 converts content to UTF8 encoding -func ToUTF8(content []byte, opts ConvertOpts) (string, error) { - charsetLabel, err := DetectEncoding(content) - if err != nil { - return "", err - } else if charsetLabel == "UTF-8" { - return string(MaybeRemoveBOM(content, opts)), nil - } - - encoding, _ := charset.Lookup(charsetLabel) - if encoding == nil { - return string(content), fmt.Errorf("Unknown encoding: %s", charsetLabel) - } - - // If there is an error, we concatenate the nicely decoded part and the - // original left over. This way we won't lose much data. - result, n, err := transform.Bytes(encoding.NewDecoder(), content) - if err != nil { - result = append(result, content[n:]...) 
- } - - result = MaybeRemoveBOM(result, opts) - - return string(result), err -} - // ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible func ToUTF8WithFallback(content []byte, opts ConvertOpts) []byte { bs, _ := io.ReadAll(ToUTF8WithFallbackReader(bytes.NewReader(content), opts)) return bs } -// ToUTF8DropErrors makes sure the return string is valid utf-8; attempts conversion if possible -func ToUTF8DropErrors(content []byte, opts ConvertOpts) []byte { - charsetLabel, err := DetectEncoding(content) - if err != nil || charsetLabel == "UTF-8" { - return MaybeRemoveBOM(content, opts) +func ToUTF8DropErrors(content []byte) []byte { + return ToUTF8(content, ConvertOpts{ErrorReplacement: []byte{' '}}) +} + +func ToUTF8(content []byte, opts ConvertOpts) []byte { + charsetLabel, _ := DetectEncoding(content) + if charsetLabel == "UTF-8" { + return maybeRemoveBOM(content, opts) } encoding, _ := charset.Lookup(charsetLabel) if encoding == nil { + setting.PanicInDevOrTesting("unknown detected charset %q, it shouldn't happen", charsetLabel) return content } - // We ignore any non-decodable parts from the file. - // Some parts might be lost var decoded []byte decoder := encoding.NewDecoder() idx := 0 - for { + for idx < len(content) { result, n, err := transform.Bytes(decoder, content[idx:]) decoded = append(decoded, result...) if err == nil { break } - decoded = append(decoded, ' ') - idx = idx + n + 1 - if idx >= len(content) { - break + if opts.ErrorReturnOrigin { + return content + } + if opts.ErrorReplacement == nil { + decoded = append(decoded, content[idx+n]) + } else { + decoded = append(decoded, opts.ErrorReplacement...) 
} + idx += n + 1 } - - return MaybeRemoveBOM(decoded, opts) + return maybeRemoveBOM(decoded, opts) } -// MaybeRemoveBOM removes a UTF-8 BOM from a []byte when opts.KeepBOM is false -func MaybeRemoveBOM(content []byte, opts ConvertOpts) []byte { +// maybeRemoveBOM removes a UTF-8 BOM from a []byte when opts.KeepBOM is false +func maybeRemoveBOM(content []byte, opts ConvertOpts) []byte { if opts.KeepBOM { return content } - if len(content) > 2 && bytes.Equal(content[0:3], UTF8BOM) { - return content[3:] - } - return content + return bytes.TrimPrefix(content, UTF8BOM) } // DetectEncoding detect the encoding of content -func DetectEncoding(content []byte) (string, error) { +// it always returns a detected or guessed "encoding" string, no matter error happens or not +func DetectEncoding(content []byte) (encoding string, _ error) { // First we check if the content represents valid utf8 content excepting a truncated character at the end. // Now we could decode all the runes in turn but this is not necessarily the cheapest thing to do - // instead we walk backwards from the end to trim off a the incomplete character + // instead we walk backwards from the end to trim off the incomplete character toValidate := content end := len(toValidate) - 1 - if end < 0 { - // no-op - } else if toValidate[end]>>5 == 0b110 { - // Incomplete 1 byte extension e.g. © <c2><a9> which has been truncated to <c2> - toValidate = toValidate[:end] - } else if end > 0 && toValidate[end]>>6 == 0b10 && toValidate[end-1]>>4 == 0b1110 { - // Incomplete 2 byte extension e.g. ⛔ <e2><9b><94> which has been truncated to <e2><9b> - toValidate = toValidate[:end-1] - } else if end > 1 && toValidate[end]>>6 == 0b10 && toValidate[end-1]>>6 == 0b10 && toValidate[end-2]>>3 == 0b11110 { - // Incomplete 3 byte extension e.g. 
💩 <f0><9f><92><a9> which has been truncated to <f0><9f><92> - toValidate = toValidate[:end-2] + // U+0000 U+007F 0yyyzzzz + // U+0080 U+07FF 110xxxyy 10yyzzzz + // U+0800 U+FFFF 1110wwww 10xxxxyy 10yyzzzz + // U+010000 U+10FFFF 11110uvv 10vvwwww 10xxxxyy 10yyzzzz + cnt := 0 + for end >= 0 && cnt < 4 { + c := toValidate[end] + if c>>6 == 0b10 { + end-- + } + if c>>5 == 0b110 || c>>4 == 0b1110 || c>>3 == 0b11110 { + toValidate = toValidate[:end] + break + } + cnt++ } + if utf8.Valid(toValidate) { - log.Debug("Detected encoding: utf-8 (fast)") return "UTF-8", nil } @@ -160,7 +144,7 @@ func DetectEncoding(content []byte) (string, error) { if len(content) < 1024 { // Check if original content is valid if _, err := textDetector.DetectBest(content); err != nil { - return "", err + return util.IfZero(setting.Repository.AnsiCharset, "UTF-8"), err } times := 1024 / len(content) detectContent = make([]byte, 0, times*len(content)) @@ -171,14 +155,10 @@ func DetectEncoding(content []byte) (string, error) { detectContent = content } - // Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break + // Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie-break results, err := textDetector.DetectAll(detectContent) if err != nil { - if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 { - log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) - return setting.Repository.AnsiCharset, nil - } - return "", err + return util.IfZero(setting.Repository.AnsiCharset, "UTF-8"), err } topConfidence := results[0].Confidence @@ -201,11 +181,9 @@ func DetectEncoding(content []byte) (string, error) { } // FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument - if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 { - log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) + if 
topResult.Charset != "UTF-8" && setting.Repository.AnsiCharset != "" { return setting.Repository.AnsiCharset, err } - log.Debug("Detected encoding: %s", topResult.Charset) - return topResult.Charset, err + return topResult.Charset, nil } diff --git a/modules/charset/charset_test.go b/modules/charset/charset_test.go index cd2e3b9aaa46e..fb35655d37e88 100644 --- a/modules/charset/charset_test.go +++ b/modules/charset/charset_test.go @@ -10,6 +10,7 @@ import ( "testing" "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/test" "github.com/stretchr/testify/assert" ) @@ -31,10 +32,10 @@ func resetDefaultCharsetsOrder() { } func TestMaybeRemoveBOM(t *testing.T) { - res := MaybeRemoveBOM([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) + res := maybeRemoveBOM([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res) - res = MaybeRemoveBOM([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) + res = maybeRemoveBOM([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res) } @@ -45,63 +46,54 @@ func TestToUTF8(t *testing.T) { // locale, so some conversions might behave differently. For that reason, we don't // depend on particular conversions but in expected behaviors. 
- res, err := ToUTF8([]byte{0x41, 0x42, 0x43}, ConvertOpts{}) - assert.NoError(t, err) - assert.Equal(t, "ABC", res) + res := ToUTF8([]byte{0x41, 0x42, 0x43}, ConvertOpts{}) + assert.Equal(t, "ABC", string(res)) // "áéíóú" - res, err = ToUTF8([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) - assert.NoError(t, err) - assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res)) + res = ToUTF8([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) + assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res) // "áéíóú" - res, err = ToUTF8([]byte{ + res = ToUTF8([]byte{ 0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba, }, ConvertOpts{}) - assert.NoError(t, err) - assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res)) + assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res) - res, err = ToUTF8([]byte{ + res = ToUTF8([]byte{ 0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e, }, ConvertOpts{}) - assert.NoError(t, err) stringMustStartWith(t, "Hola,", res) stringMustEndWith(t, "AAA.", res) - res, err = ToUTF8([]byte{ + res = ToUTF8([]byte{ 0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e, }, ConvertOpts{}) - assert.NoError(t, err) stringMustStartWith(t, "Hola,", res) stringMustEndWith(t, "AAA.", res) - res, err = ToUTF8([]byte{ + res = ToUTF8([]byte{ 0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e, }, ConvertOpts{}) - assert.NoError(t, err) stringMustStartWith(t, "Hola,", res) stringMustEndWith(t, "AAA.", res) // Japanese (Shift-JIS) // 日属秘ぞしちゅ。 - res, err = ToUTF8([]byte{ + res = 
ToUTF8([]byte{ 0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42, }, ConvertOpts{}) - assert.NoError(t, err) assert.Equal(t, []byte{ 0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3, 0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82, - }, - []byte(res)) + }, res) - res, err = ToUTF8([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{}) - assert.NoError(t, err) - assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, []byte(res)) + res = ToUTF8([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{}) + assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res) } func TestToUTF8WithFallback(t *testing.T) { @@ -153,43 +145,44 @@ func TestToUTF8WithFallback(t *testing.T) { func TestToUTF8DropErrors(t *testing.T) { resetDefaultCharsetsOrder() + // "ABC" - res := ToUTF8DropErrors([]byte{0x41, 0x42, 0x43}, ConvertOpts{}) + res := ToUTF8DropErrors([]byte{0x41, 0x42, 0x43}) assert.Equal(t, []byte{0x41, 0x42, 0x43}, res) // "áéíóú" - res = ToUTF8DropErrors([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) + res = ToUTF8DropErrors([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}) assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res) // UTF8 BOM + "áéíóú" - res = ToUTF8DropErrors([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) + res = ToUTF8DropErrors([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}) assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res) // "Hola, así cómo ños" - res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73}, ConvertOpts{}) + res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73}) assert.Equal(t, []byte{0x48, 0x6F, 0x6C, 0x61, 
0x2C, 0x20, 0x61, 0x73}, res[:8]) assert.Equal(t, []byte{0x73}, res[len(res)-1:]) // "Hola, así cómo " minmatch := []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20} - res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73}, ConvertOpts{}) + res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73}) // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those assert.Equal(t, minmatch, res[0:len(minmatch)]) - res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73}, ConvertOpts{}) + res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73}) // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those assert.Equal(t, minmatch, res[0:len(minmatch)]) // Japanese (Shift-JIS) // "日属秘ぞしちゅ。" - res = ToUTF8DropErrors([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42}, ConvertOpts{}) + res = ToUTF8DropErrors([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42}) assert.Equal(t, []byte{ 0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3, 0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82, }, res) - res = ToUTF8DropErrors([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{}) + res = ToUTF8DropErrors([]byte{0x00, 0x00, 0x00, 0x00}) assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res) } @@ -231,152 +224,33 @@ func TestDetectEncoding(t *testing.T) { assert.Error(t, err) } -func stringMustStartWith(t *testing.T, expected, value string) { - 
assert.Equal(t, expected, value[:len(expected)]) +func stringMustStartWith(t *testing.T, expected string, value []byte) { + assert.Equal(t, expected, string(value[:len(expected)])) } -func stringMustEndWith(t *testing.T, expected, value string) { - assert.Equal(t, expected, value[len(value)-len(expected):]) +func stringMustEndWith(t *testing.T, expected string, value []byte) { + assert.Equal(t, expected, string(value[len(value)-len(expected):])) } func TestToUTF8WithFallbackReader(t *testing.T) { resetDefaultCharsetsOrder() - - for testLen := range 2048 { - pattern := " test { () }\n" - input := "" - for len(input) < testLen { - input += pattern - } - input = input[:testLen] - input += "// Выключаем" - rd := ToUTF8WithFallbackReader(bytes.NewReader([]byte(input)), ConvertOpts{}) + test.MockVariableValue(&ToUTF8WithFallbackReaderPrefetchSize) + + block := "aá啊🤔" + runes := []rune(block) + assert.Len(t, string(runes[0]), 1) + assert.Len(t, string(runes[1]), 2) + assert.Len(t, string(runes[2]), 3) + assert.Len(t, string(runes[3]), 4) + + content := strings.Repeat(block, 10) + for i := 1; i < len(content); i++ { + encoding, _ := DetectEncoding([]byte(content[:i])) + assert.Equal(t, "UTF-8", encoding) + + ToUTF8WithFallbackReaderPrefetchSize = i + rd := ToUTF8WithFallbackReader(bytes.NewReader([]byte(content)), ConvertOpts{}) r, _ := io.ReadAll(rd) - assert.Equalf(t, input, string(r), "testing string len=%d", testLen) + assert.Equal(t, content, string(r)) } - - truncatedOneByteExtension := failFastBytes - encoding, _ := DetectEncoding(truncatedOneByteExtension) - assert.Equal(t, "UTF-8", encoding) - - truncatedTwoByteExtension := failFastBytes - truncatedTwoByteExtension[len(failFastBytes)-1] = 0x9b - truncatedTwoByteExtension[len(failFastBytes)-2] = 0xe2 - - encoding, _ = DetectEncoding(truncatedTwoByteExtension) - assert.Equal(t, "UTF-8", encoding) - - truncatedThreeByteExtension := failFastBytes - truncatedThreeByteExtension[len(failFastBytes)-1] = 0x92 - 
truncatedThreeByteExtension[len(failFastBytes)-2] = 0x9f - truncatedThreeByteExtension[len(failFastBytes)-3] = 0xf0 - - encoding, _ = DetectEncoding(truncatedThreeByteExtension) - assert.Equal(t, "UTF-8", encoding) -} - -var failFastBytes = []byte{ - 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x6f, 0x72, 0x67, 0x2e, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2e, 0x74, 0x6f, - 0x6f, 0x6c, 0x73, 0x2e, 0x61, 0x6e, 0x74, 0x2e, 0x74, 0x61, 0x73, 0x6b, 0x64, 0x65, 0x66, 0x73, 0x2e, 0x63, 0x6f, 0x6e, - 0x64, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x4f, 0x73, 0x0a, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x6f, 0x72, 0x67, - 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x62, 0x6f, 0x6f, - 0x74, 0x2e, 0x67, 0x72, 0x61, 0x64, 0x6c, 0x65, 0x2e, 0x74, 0x61, 0x73, 0x6b, 0x73, 0x2e, 0x72, 0x75, 0x6e, 0x2e, 0x42, - 0x6f, 0x6f, 0x74, 0x52, 0x75, 0x6e, 0x0a, 0x0a, 0x70, 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x73, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x64, 0x28, 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, - 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x62, 0x6f, 0x6f, 0x74, 0x22, 0x29, 0x0a, 0x7d, 0x0a, 0x0a, 0x64, 0x65, 0x70, 0x65, - 0x6e, 0x64, 0x65, 0x6e, 0x63, 0x69, 0x65, 0x73, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, - 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a, - 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, 0x61, 0x70, 0x69, 0x22, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, - 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, - 0x28, 0x22, 0x3a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, 0x61, 0x70, 0x69, 0x2d, 0x64, 0x6f, 0x63, 0x73, 0x22, 0x29, - 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, - 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 
0x63, 0x74, 0x28, 0x22, 0x3a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, 0x64, 0x62, - 0x22, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, - 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, - 0x73, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x22, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, - 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a, - 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, 0x69, 0x6e, 0x74, 0x65, 0x67, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x2d, 0x66, - 0x73, 0x22, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, - 0x69, 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, - 0x3a, 0x69, 0x6e, 0x74, 0x65, 0x67, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x2d, 0x6d, 0x71, 0x22, 0x29, 0x29, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, - 0x6a, 0x66, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x70, 0x65, 0x3a, 0x70, 0x65, 0x2d, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, - 0x2d, 0x61, 0x75, 0x74, 0x68, 0x2d, 0x72, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x2d, 0x73, 0x74, 0x61, 0x72, 0x74, - 0x65, 0x72, 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, - 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6a, 0x66, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x70, 0x65, 0x3a, 0x70, 0x65, 0x2d, 0x63, - 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2d, 0x68, 0x61, 0x6c, 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, - 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6a, 0x66, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x2e, - 0x70, 0x65, 0x3a, 0x70, 0x65, 0x2d, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2d, 0x63, 
0x6f, 0x72, 0x65, 0x22, 0x29, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, - 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, - 0x2e, 0x62, 0x6f, 0x6f, 0x74, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x62, 0x6f, 0x6f, 0x74, 0x2d, 0x73, 0x74, - 0x61, 0x72, 0x74, 0x65, 0x72, 0x2d, 0x77, 0x65, 0x62, 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, - 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, - 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x62, 0x6f, 0x6f, 0x74, 0x3a, 0x73, 0x70, 0x72, - 0x69, 0x6e, 0x67, 0x2d, 0x62, 0x6f, 0x6f, 0x74, 0x2d, 0x73, 0x74, 0x61, 0x72, 0x74, 0x65, 0x72, 0x2d, 0x61, 0x6f, 0x70, - 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, - 0x6e, 0x28, 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, - 0x72, 0x6b, 0x2e, 0x62, 0x6f, 0x6f, 0x74, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x62, 0x6f, 0x6f, 0x74, 0x2d, - 0x73, 0x74, 0x61, 0x72, 0x74, 0x65, 0x72, 0x2d, 0x61, 0x63, 0x74, 0x75, 0x61, 0x74, 0x6f, 0x72, 0x22, 0x29, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, - 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x63, - 0x6c, 0x6f, 0x75, 0x64, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6c, 0x6f, 0x75, 0x64, 0x2d, 0x73, 0x74, - 0x61, 0x72, 0x74, 0x65, 0x72, 0x2d, 0x62, 0x6f, 0x6f, 0x74, 0x73, 0x74, 0x72, 0x61, 0x70, 0x22, 0x29, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, 0x72, - 0x67, 
0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x63, 0x6c, - 0x6f, 0x75, 0x64, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6c, 0x6f, 0x75, 0x64, 0x2d, 0x73, 0x74, 0x61, - 0x72, 0x74, 0x65, 0x72, 0x2d, 0x63, 0x6f, 0x6e, 0x73, 0x75, 0x6c, 0x2d, 0x61, 0x6c, 0x6c, 0x22, 0x29, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, 0x72, - 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x63, 0x6c, - 0x6f, 0x75, 0x64, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6c, 0x6f, 0x75, 0x64, 0x2d, 0x73, 0x74, 0x61, - 0x72, 0x74, 0x65, 0x72, 0x2d, 0x73, 0x6c, 0x65, 0x75, 0x74, 0x68, 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, - 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, - 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x72, 0x65, 0x74, 0x72, 0x79, 0x3a, - 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x72, 0x65, 0x74, 0x72, 0x79, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x63, 0x68, 0x2e, 0x71, - 0x6f, 0x73, 0x2e, 0x6c, 0x6f, 0x67, 0x62, 0x61, 0x63, 0x6b, 0x3a, 0x6c, 0x6f, 0x67, 0x62, 0x61, 0x63, 0x6b, 0x2d, 0x63, - 0x6c, 0x61, 0x73, 0x73, 0x69, 0x63, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, - 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x69, 0x6f, 0x2e, 0x6d, 0x69, 0x63, 0x72, 0x6f, 0x6d, 0x65, - 0x74, 0x65, 0x72, 0x3a, 0x6d, 0x69, 0x63, 0x72, 0x6f, 0x6d, 0x65, 0x74, 0x65, 0x72, 0x2d, 0x72, 0x65, 0x67, 0x69, 0x73, - 0x74, 0x72, 0x79, 0x2d, 0x70, 0x72, 0x6f, 0x6d, 0x65, 0x74, 0x68, 0x65, 0x75, 0x73, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 
0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x6b, 0x6f, 0x74, - 0x6c, 0x69, 0x6e, 0x28, 0x22, 0x73, 0x74, 0x64, 0x6c, 0x69, 0x62, 0x22, 0x29, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, - 0x2f, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x65, 0x73, 0x74, 0x20, 0x64, 0x65, 0x70, 0x65, 0x6e, 0x64, - 0x65, 0x6e, 0x63, 0x69, 0x65, 0x73, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, - 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, - 0x65, 0x73, 0x74, 0x49, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6a, - 0x66, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x70, 0x65, 0x3a, 0x70, 0x65, 0x2d, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2d, - 0x74, 0x65, 0x73, 0x74, 0x22, 0x29, 0x0a, 0x7d, 0x0a, 0x0a, 0x76, 0x61, 0x6c, 0x20, 0x70, 0x61, 0x74, 0x63, 0x68, 0x4a, - 0x61, 0x72, 0x20, 0x62, 0x79, 0x20, 0x74, 0x61, 0x73, 0x6b, 0x73, 0x2e, 0x72, 0x65, 0x67, 0x69, 0x73, 0x74, 0x65, 0x72, - 0x69, 0x6e, 0x67, 0x28, 0x4a, 0x61, 0x72, 0x3a, 0x3a, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x61, 0x72, 0x63, 0x68, 0x69, 0x76, 0x65, 0x43, 0x6c, 0x61, 0x73, 0x73, 0x69, 0x66, 0x69, 0x65, 0x72, 0x2e, - 0x73, 0x65, 0x74, 0x28, 0x22, 0x70, 0x61, 0x74, 0x63, 0x68, 0x65, 0x64, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x76, 0x61, 0x6c, 0x20, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x43, 0x6c, 0x61, 0x73, 0x73, 0x70, 0x61, 0x74, 0x68, - 0x20, 0x62, 0x79, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x75, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x2e, 0x67, - 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x6e, 0x69, 0x66, 0x65, 0x73, 0x74, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 
0x75, 0x74, 0x65, - 0x73, 0x28, 0x22, 0x43, 0x6c, 0x61, 0x73, 0x73, 0x2d, 0x50, 0x61, 0x74, 0x68, 0x22, 0x20, 0x74, 0x6f, 0x20, 0x6f, 0x62, - 0x6a, 0x65, 0x63, 0x74, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, - 0x72, 0x69, 0x76, 0x61, 0x74, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x20, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x3d, - 0x20, 0x22, 0x66, 0x69, 0x6c, 0x65, 0x3a, 0x2f, 0x2b, 0x22, 0x2e, 0x74, 0x6f, 0x52, 0x65, 0x67, 0x65, 0x78, 0x28, 0x29, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x76, 0x65, 0x72, 0x72, 0x69, 0x64, - 0x65, 0x20, 0x66, 0x75, 0x6e, 0x20, 0x74, 0x6f, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x29, 0x3a, 0x20, 0x53, 0x74, - 0x72, 0x69, 0x6e, 0x67, 0x20, 0x3d, 0x20, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x43, 0x6c, 0x61, 0x73, 0x73, 0x70, - 0x61, 0x74, 0x68, 0x2e, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x54, 0x6f, 0x53, 0x74, 0x72, 0x69, - 0x6e, 0x67, 0x28, 0x22, 0x20, 0x22, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x74, 0x2e, 0x74, 0x6f, 0x55, 0x52, 0x49, 0x28, 0x29, 0x2e, 0x74, 0x6f, 0x55, - 0x52, 0x4c, 0x28, 0x29, 0x2e, 0x74, 0x6f, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x29, 0x2e, 0x72, 0x65, 0x70, 0x6c, - 0x61, 0x63, 0x65, 0x46, 0x69, 0x72, 0x73, 0x74, 0x28, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x2c, 0x20, 0x22, 0x2f, - 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x74, 0x61, 0x73, - 0x6b, 0x73, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x64, 0x3c, 0x42, 0x6f, 0x6f, 0x74, 0x52, 0x75, 0x6e, 0x3e, 0x28, 0x22, 0x62, - 0x6f, 0x6f, 0x74, 0x52, 0x75, 0x6e, 0x22, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x4f, - 0x73, 0x2e, 0x69, 0x73, 
0x46, 0x61, 0x6d, 0x69, 0x6c, 0x79, 0x28, 0x4f, 0x73, 0x2e, 0x46, 0x41, 0x4d, 0x49, 0x4c, 0x59, - 0x5f, 0x57, 0x49, 0x4e, 0x44, 0x4f, 0x57, 0x53, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x70, 0x61, 0x74, 0x68, 0x20, 0x3d, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x28, 0x73, - 0x6f, 0x75, 0x72, 0x63, 0x65, 0x53, 0x65, 0x74, 0x73, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x64, 0x28, 0x22, 0x6d, 0x61, 0x69, - 0x6e, 0x22, 0x29, 0x2e, 0x6d, 0x61, 0x70, 0x20, 0x7b, 0x20, 0x69, 0x74, 0x2e, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x20, - 0x7d, 0x2c, 0x20, 0x70, 0x61, 0x74, 0x63, 0x68, 0x4a, 0x61, 0x72, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0xd0, } diff --git a/modules/httplib/serve.go b/modules/httplib/serve.go index b4c5e7fe1ebdf..2d66a86a8b01f 100644 --- a/modules/httplib/serve.go +++ b/modules/httplib/serve.go @@ -19,7 +19,6 @@ import ( charsetModule "code.gitea.io/gitea/modules/charset" "code.gitea.io/gitea/modules/container" "code.gitea.io/gitea/modules/httpcache" - "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/typesniffer" "code.gitea.io/gitea/modules/util" @@ -109,11 +108,7 @@ func setServeHeadersByFile(r *http.Request, w http.ResponseWriter, mineBuf []byt } if isPlain { - charset, err := charsetModule.DetectEncoding(mineBuf) - if err != nil { - log.Error("Detect raw file %s charset failed: %v, using by default utf-8", opts.Filename, err) - charset = "utf-8" - } + charset, _ := charsetModule.DetectEncoding(mineBuf) opts.ContentTypeCharset = strings.ToLower(charset) } diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go index bdb477ce6e7fb..5f6a7f6082796 100644 --- a/modules/indexer/code/bleve/bleve.go +++ b/modules/indexer/code/bleve/bleve.go @@ -203,7 +203,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro RepoID: repo.ID, CommitID: 
commitSha, Filename: update.Filename, - Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})), + Content: string(charset.ToUTF8DropErrors(fileContents)), Language: analyze.GetCodeLanguage(update.Filename, fileContents), UpdatedAt: time.Now().UTC(), }) diff --git a/modules/indexer/code/elasticsearch/elasticsearch.go b/modules/indexer/code/elasticsearch/elasticsearch.go index b2eb301a5dc8b..a7027051d2652 100644 --- a/modules/indexer/code/elasticsearch/elasticsearch.go +++ b/modules/indexer/code/elasticsearch/elasticsearch.go @@ -191,7 +191,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro Doc(map[string]any{ "repo_id": repo.ID, "filename": update.Filename, - "content": string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})), + "content": string(charset.ToUTF8DropErrors(fileContents)), "commit_id": sha, "language": analyze.GetCodeLanguage(update.Filename, fileContents), "updated_at": timeutil.TimeStampNow(), diff --git a/routers/web/repo/editor.go b/routers/web/repo/editor.go index 983249a6d247e..048c9f3d4a4d8 100644 --- a/routers/web/repo/editor.go +++ b/routers/web/repo/editor.go @@ -317,11 +317,7 @@ func EditFile(ctx *context.Context) { ctx.ServerError("ReadAll", err) return } - if content, err := charset.ToUTF8(buf, charset.ConvertOpts{KeepBOM: true}); err != nil { - ctx.Data["FileContent"] = string(buf) - } else { - ctx.Data["FileContent"] = content - } + ctx.Data["FileContent"] = string(charset.ToUTF8(buf, charset.ConvertOpts{KeepBOM: true, ErrorReturnOrigin: true})) } } diff --git a/services/gitdiff/gitdiff.go b/services/gitdiff/gitdiff.go index 44dd3efc6feaa..06e8ceff8baad 100644 --- a/services/gitdiff/gitdiff.go +++ b/services/gitdiff/gitdiff.go @@ -835,11 +835,11 @@ parsingLoop: if buffer.Len() == 0 { continue } - charsetLabel, err := charset.DetectEncoding(buffer.Bytes()) - if charsetLabel != "UTF-8" && err == nil { - encoding, _ := stdcharset.Lookup(charsetLabel) - if encoding != 
nil { - diffLineTypeDecoders[lineType] = encoding.NewDecoder() + charsetLabel, _ := charset.DetectEncoding(buffer.Bytes()) + if charsetLabel != "UTF-8" { + charsetEncoding, _ := stdcharset.Lookup(charsetLabel) + if charsetEncoding != nil { + diffLineTypeDecoders[lineType] = charsetEncoding.NewDecoder() } } } @@ -1337,7 +1337,7 @@ func GetDiffForRender(ctx context.Context, repoLink string, gitRepo *git.Reposit } func highlightCodeLines(diffFile *DiffFile, isLeft bool, rawContent []byte) map[int]template.HTML { - content, _ := charset.ToUTF8(rawContent, charset.ConvertOpts{KeepBOM: false}) + content := util.UnsafeBytesToString(charset.ToUTF8(rawContent, charset.ConvertOpts{})) highlightedNewContent, _ := highlight.Code(diffFile.Name, diffFile.Language, content) splitLines := strings.Split(string(highlightedNewContent), "\n") lines := make(map[int]template.HTML, len(splitLines)) diff --git a/services/gitdiff/gitdiff_test.go b/services/gitdiff/gitdiff_test.go index 721ae0dfc7520..b20b3fb6b8aa3 100644 --- a/services/gitdiff/gitdiff_test.go +++ b/services/gitdiff/gitdiff_test.go @@ -1106,3 +1106,18 @@ func TestDiffLine_GetExpandDirection(t *testing.T) { assert.Equal(t, c.direction, c.diffLine.GetExpandDirection(), "case %s expected direction: %s", c.name, c.direction) } } + +func TestHighlightCodeLines(t *testing.T) { + diffFile := &DiffFile{ + Name: "a.c", + Language: "C", + Sections: []*DiffSection{ + { + Lines: []*DiffLine{{LeftIdx: 1}}, + }, + }, + } + ret := highlightCodeLines(diffFile, true, []byte("// abc\xcc def\xcd")) // ISO-8859-1 bytes + // FIXME: tag is not correctly closed + assert.Equal(t, `// abcÌ defÍ`, string(ret[0])) +} diff --git a/tests/integration/migration-test/migration_test.go b/tests/integration/migration-test/migration_test.go index 5fa7cbbfb7f23..2659c5c53dbad 100644 --- a/tests/integration/migration-test/migration_test.go +++ b/tests/integration/migration-test/migration_test.go @@ -4,6 +4,7 @@ package migrations import ( + "bytes" 
"compress/gzip" "context" "database/sql" @@ -21,7 +22,6 @@ import ( "code.gitea.io/gitea/models/migrations" migrate_base "code.gitea.io/gitea/models/migrations/base" "code.gitea.io/gitea/models/unittest" - "code.gitea.io/gitea/modules/charset" "code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/setting" @@ -108,11 +108,11 @@ func readSQLFromFile(version string) (string, error) { } defer gr.Close() - bytes, err := io.ReadAll(gr) + buf, err := io.ReadAll(gr) if err != nil { return "", err } - return string(charset.MaybeRemoveBOM(bytes, charset.ConvertOpts{})), nil + return string(bytes.TrimPrefix(buf, []byte{'\xef', '\xbb', '\xbf'})), nil } func restoreOldDB(t *testing.T, version string) { From 5a520d5ff39bf580005c8df1aa4657ff399c9438 Mon Sep 17 00:00:00 2001 From: wxiaoguang Date: Sat, 13 Dec 2025 19:00:17 +0800 Subject: [PATCH 3/6] fix --- modules/charset/charset_test.go | 2 +- services/gitdiff/gitdiff.go | 26 ++++++++++++++- services/gitdiff/gitdiff_test.go | 46 ++++++++++++++++++++------ services/gitdiff/highlightdiff_test.go | 8 ++--- 4 files changed, 65 insertions(+), 17 deletions(-) diff --git a/modules/charset/charset_test.go b/modules/charset/charset_test.go index fb35655d37e88..02a28620b171a 100644 --- a/modules/charset/charset_test.go +++ b/modules/charset/charset_test.go @@ -243,7 +243,7 @@ func TestToUTF8WithFallbackReader(t *testing.T) { assert.Len(t, string(runes[2]), 3) assert.Len(t, string(runes[3]), 4) - content := strings.Repeat(block, 10) + content := strings.Repeat(block, 2) for i := 1; i < len(content); i++ { encoding, _ := DetectEncoding([]byte(content[:i])) assert.Equal(t, "UTF-8", encoding) diff --git a/services/gitdiff/gitdiff.go b/services/gitdiff/gitdiff.go index 06e8ceff8baad..f8fde6ab2972c 100644 --- a/services/gitdiff/gitdiff.go +++ b/services/gitdiff/gitdiff.go @@ -1336,10 +1336,34 @@ func GetDiffForRender(ctx context.Context, repoLink string, gitRepo *git.Reposit return diff, nil } +func 
splitHighlightLines(buf []byte) (ret [][]byte) { + lineCount := bytes.Count(buf, []byte("\n")) + 1 + ret = make([][]byte, 0, lineCount) + nlTagClose := []byte("\n" right after \n, sometimes before. + // * "text\n" + // * "text\n" + if bytes.HasPrefix(buf[pos:], nlTagClose) { + pos1 := bytes.IndexByte(buf[pos:], '>') + if pos1 != -1 { + pos += pos1 + } + } + ret = append(ret, buf[:pos+1]) + buf = buf[pos+1:] + } +} + func highlightCodeLines(diffFile *DiffFile, isLeft bool, rawContent []byte) map[int]template.HTML { content := util.UnsafeBytesToString(charset.ToUTF8(rawContent, charset.ConvertOpts{})) highlightedNewContent, _ := highlight.Code(diffFile.Name, diffFile.Language, content) - splitLines := strings.Split(string(highlightedNewContent), "\n") + splitLines := splitHighlightLines([]byte(highlightedNewContent)) lines := make(map[int]template.HTML, len(splitLines)) // only save the highlighted lines we need, but not the whole file, to save memory for _, sec := range diffFile.Sections { diff --git a/services/gitdiff/gitdiff_test.go b/services/gitdiff/gitdiff_test.go index b20b3fb6b8aa3..a94dad8b63e03 100644 --- a/services/gitdiff/gitdiff_test.go +++ b/services/gitdiff/gitdiff_test.go @@ -5,6 +5,7 @@ package gitdiff import ( + "html/template" "strconv" "strings" "testing" @@ -1108,16 +1109,39 @@ func TestDiffLine_GetExpandDirection(t *testing.T) { } func TestHighlightCodeLines(t *testing.T) { - diffFile := &DiffFile{ - Name: "a.c", - Language: "C", - Sections: []*DiffSection{ - { - Lines: []*DiffLine{{LeftIdx: 1}}, + t.Run("CharsetDetecting", func(t *testing.T) { + diffFile := &DiffFile{ + Name: "a.c", + Language: "c", + Sections: []*DiffSection{ + { + Lines: []*DiffLine{{LeftIdx: 1}}, + }, }, - }, - } - ret := highlightCodeLines(diffFile, true, []byte("// abc\xcc def\xcd")) // ISO-8859-1 bytes - // FIXME: tag is not correctly closed - assert.Equal(t, `// abcÌ defÍ`, string(ret[0])) + } + ret := highlightCodeLines(diffFile, true, []byte("// abc\xcc def\xcd")) // 
ISO-8859-1 bytes + assert.Equal(t, "// abcÌ defÍ\n", string(ret[0])) + }) + + t.Run("LeftLines", func(t *testing.T) { + diffFile := &DiffFile{ + Name: "a.c", + Language: "c", + Sections: []*DiffSection{ + { + Lines: []*DiffLine{ + {LeftIdx: 1}, + {LeftIdx: 2}, + {LeftIdx: 3}, + }, + }, + }, + } + const nl = "\n" + ret := highlightCodeLines(diffFile, true, []byte("a\nb\n")) + assert.Equal(t, map[int]template.HTML{ + 0: `a` + nl, + 1: `b`, + }, ret) + }) } diff --git a/services/gitdiff/highlightdiff_test.go b/services/gitdiff/highlightdiff_test.go index aebe38ae7ca85..0df2e29d13e66 100644 --- a/services/gitdiff/highlightdiff_test.go +++ b/services/gitdiff/highlightdiff_test.go @@ -25,12 +25,12 @@ func TestDiffWithHighlight(t *testing.T) { t.Run("CleanUp", func(t *testing.T) { hcd := newHighlightCodeDiff() - codeA := template.HTML(`this is updated comment`) + codeA := template.HTML(`this is a comment`) + codeB := template.HTML(`this is updated comment`) outDel := hcd.diffLineWithHighlight(DiffLineDel, codeA, codeB) - assert.Equal(t, `a comment`, string(outDel)) + assert.Equal(t, `this is a comment`, string(outDel)) outAdd := hcd.diffLineWithHighlight(DiffLineAdd, codeA, codeB) - assert.Equal(t, `updated comment`, string(outAdd)) + assert.Equal(t, `this is updated comment`, string(outAdd)) }) t.Run("OpenCloseTags", func(t *testing.T) { From b294923e5f21896f6240bf90a124480c4f20a7c3 Mon Sep 17 00:00:00 2001 From: wxiaoguang Date: Sat, 13 Dec 2025 19:45:31 +0800 Subject: [PATCH 4/6] add error log for unsupported charset --- modules/charset/charset.go | 2 +- modules/setting/setting.go | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/charset/charset.go b/modules/charset/charset.go index c224281d33765..94c2f2175caf6 100644 --- a/modules/charset/charset.go +++ b/modules/charset/charset.go @@ -74,7 +74,7 @@ func ToUTF8(content []byte, opts ConvertOpts) []byte { encoding, _ := charset.Lookup(charsetLabel) if encoding == nil { - 
setting.PanicInDevOrTesting("unknown detected charset %q, it shouldn't happen", charsetLabel) + setting.PanicInDevOrTesting("unsupported detected charset %q, it shouldn't happen", charsetLabel) return content } diff --git a/modules/setting/setting.go b/modules/setting/setting.go index e14997801fed4..dc60d99bd6e8d 100644 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -240,4 +240,5 @@ func PanicInDevOrTesting(msg string, a ...any) { if !IsProd || IsInTesting { panic(fmt.Sprintf(msg, a...)) } + log.Error(msg, a...) } From 87d1fe534599b2bdfaceaa94c625f8f5572c12c7 Mon Sep 17 00:00:00 2001 From: wxiaoguang Date: Sat, 13 Dec 2025 20:10:09 +0800 Subject: [PATCH 5/6] fix test --- modules/charset/charset_test.go | 45 ++++++++++----------------------- 1 file changed, 14 insertions(+), 31 deletions(-) diff --git a/modules/charset/charset_test.go b/modules/charset/charset_test.go index 02a28620b171a..97d9ba00c4d57 100644 --- a/modules/charset/charset_test.go +++ b/modules/charset/charset_test.go @@ -6,6 +6,7 @@ package charset import ( "bytes" "io" + "os" "strings" "testing" @@ -15,20 +16,12 @@ import ( "github.com/stretchr/testify/assert" ) -func resetDefaultCharsetsOrder() { - defaultDetectedCharsetsOrder := make([]string, 0, len(setting.Repository.DetectedCharsetsOrder)) - for _, charset := range setting.Repository.DetectedCharsetsOrder { - defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder, strings.ToLower(strings.TrimSpace(charset))) - } +func TestMain(m *testing.M) { setting.Repository.DetectedCharsetScore = map[string]int{} - i := 0 - for _, charset := range defaultDetectedCharsetsOrder { - canonicalCharset := strings.ToLower(strings.TrimSpace(charset)) - if _, has := setting.Repository.DetectedCharsetScore[canonicalCharset]; !has { - setting.Repository.DetectedCharsetScore[canonicalCharset] = i - i++ - } + for i, charset := range setting.Repository.DetectedCharsetsOrder { + 
setting.Repository.DetectedCharsetScore[strings.ToLower(charset)] = i } + os.Exit(m.Run()) } func TestMaybeRemoveBOM(t *testing.T) { @@ -40,8 +33,6 @@ func TestMaybeRemoveBOM(t *testing.T) { } func TestToUTF8(t *testing.T) { - resetDefaultCharsetsOrder() - // Note: golang compiler seems so behave differently depending on the current // locale, so some conversions might behave differently. For that reason, we don't // depend on particular conversions but in expected behaviors. @@ -97,7 +88,6 @@ func TestToUTF8(t *testing.T) { } func TestToUTF8WithFallback(t *testing.T) { - resetDefaultCharsetsOrder() // "ABC" res := ToUTF8WithFallback([]byte{0x41, 0x42, 0x43}, ConvertOpts{}) assert.Equal(t, []byte{0x41, 0x42, 0x43}, res) @@ -144,8 +134,6 @@ func TestToUTF8WithFallback(t *testing.T) { } func TestToUTF8DropErrors(t *testing.T) { - resetDefaultCharsetsOrder() - // "ABC" res := ToUTF8DropErrors([]byte{0x41, 0x42, 0x43}) assert.Equal(t, []byte{0x41, 0x42, 0x43}, res) @@ -187,12 +175,17 @@ func TestToUTF8DropErrors(t *testing.T) { } func TestDetectEncoding(t *testing.T) { - resetDefaultCharsetsOrder() testSuccess := func(b []byte, expected string) { encoding, err := DetectEncoding(b) assert.NoError(t, err) assert.Equal(t, expected, encoding) } + + // invalid bytes + encoding, err := DetectEncoding([]byte{0xfa}) + assert.Error(t, err) + assert.Equal(t, "UTF-8", encoding) + // utf-8 b := []byte("just some ascii") testSuccess(b, "UTF-8") @@ -207,21 +200,12 @@ func TestDetectEncoding(t *testing.T) { // iso-8859-1: dcor b = []byte{0x44, 0xe9, 0x63, 0x6f, 0x72, 0x0a} - encoding, err := DetectEncoding(b) + encoding, err = DetectEncoding(b) assert.NoError(t, err) assert.Contains(t, encoding, "ISO-8859-1") - old := setting.Repository.AnsiCharset - setting.Repository.AnsiCharset = "placeholder" - defer func() { - setting.Repository.AnsiCharset = old - }() - testSuccess(b, "placeholder") - - // invalid bytes - b = []byte{0xfa} - _, err = DetectEncoding(b) - assert.Error(t, err) + 
defer test.MockVariableValue(&setting.Repository.AnsiCharset, "MyEncoding")() + testSuccess(b, "MyEncoding") } func stringMustStartWith(t *testing.T, expected string, value []byte) { @@ -233,7 +217,6 @@ func stringMustEndWith(t *testing.T, expected string, value []byte) { } func TestToUTF8WithFallbackReader(t *testing.T) { - resetDefaultCharsetsOrder() test.MockVariableValue(&ToUTF8WithFallbackReaderPrefetchSize) block := "aá啊🤔" From eb2b6fc16c366e46167784cb3a0ab57c48091708 Mon Sep 17 00:00:00 2001 From: wxiaoguang Date: Sat, 13 Dec 2025 20:57:06 +0800 Subject: [PATCH 6/6] improve test --- modules/charset/charset.go | 10 +++++++--- modules/charset/charset_test.go | 14 +++++++++++--- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/modules/charset/charset.go b/modules/charset/charset.go index 94c2f2175caf6..b15665497339b 100644 --- a/modules/charset/charset.go +++ b/modules/charset/charset.go @@ -125,12 +125,16 @@ func DetectEncoding(content []byte) (encoding string, _ error) { cnt := 0 for end >= 0 && cnt < 4 { c := toValidate[end] - if c>>6 == 0b10 { - end-- - } if c>>5 == 0b110 || c>>4 == 0b1110 || c>>3 == 0b11110 { + // a leading byte toValidate = toValidate[:end] break + } else if c>>6 == 0b10 { + // a continuation byte + end-- + } else { + // not an utf-8 byte + break } cnt++ } diff --git a/modules/charset/charset_test.go b/modules/charset/charset_test.go index 97d9ba00c4d57..0314abc347bc1 100644 --- a/modules/charset/charset_test.go +++ b/modules/charset/charset_test.go @@ -4,7 +4,6 @@ package charset import ( - "bytes" "io" "os" "strings" @@ -228,12 +227,21 @@ func TestToUTF8WithFallbackReader(t *testing.T) { content := strings.Repeat(block, 2) for i := 1; i < len(content); i++ { - encoding, _ := DetectEncoding([]byte(content[:i])) + encoding, err := DetectEncoding([]byte(content[:i])) + assert.NoError(t, err) assert.Equal(t, "UTF-8", encoding) ToUTF8WithFallbackReaderPrefetchSize = i - rd := 
ToUTF8WithFallbackReader(bytes.NewReader([]byte(content)), ConvertOpts{}) + rd := ToUTF8WithFallbackReader(strings.NewReader(content), ConvertOpts{}) r, _ := io.ReadAll(rd) assert.Equal(t, content, string(r)) } + for _, r := range runes { + content = "abc abc " + string(r) + string(r) + string(r) + for i := 0; i < len(content); i++ { + encoding, err := DetectEncoding([]byte(content[:i])) + assert.NoError(t, err) + assert.Equal(t, "UTF-8", encoding) + } + } }