From a1ad188326f9af633d2be0920a140275a4972bfe Mon Sep 17 00:00:00 2001 From: zeripath Date: Tue, 2 Jun 2020 23:20:19 +0100 Subject: [PATCH] Fix chardet test and add ordering option (#11621) * Fix chardet test and add ordering option Signed-off-by: Andrew Thornton * minor fixes Signed-off-by: Andrew Thornton * remove log Signed-off-by: Andrew Thornton * remove log2 Signed-off-by: Andrew Thornton * only iterate through top results Signed-off-by: Andrew Thornton * Update docs/content/doc/advanced/config-cheat-sheet.en-us.md * slight restructure of for loop Signed-off-by: Andrew Thornton Co-authored-by: techknowlogick --- custom/conf/app.ini.sample | 7 +- .../doc/advanced/config-cheat-sheet.en-us.md | 3 +- modules/charset/charset.go | 35 ++++++++- modules/charset/charset_test.go | 4 + modules/setting/repository.go | 74 +++++++++++++++++++ 5 files changed, 117 insertions(+), 6 deletions(-) diff --git a/custom/conf/app.ini.sample b/custom/conf/app.ini.sample index 5e150172d..4f5529edf 100644 --- a/custom/conf/app.ini.sample +++ b/custom/conf/app.ini.sample @@ -14,7 +14,12 @@ RUN_MODE = dev [repository] ROOT = SCRIPT_TYPE = bash -; Default ANSI charset +; DETECTED_CHARSETS_ORDER tie-break order for detected charsets. +; If the charsets have equal confidence, tie-breaking will be done by order in this list +; with charsets earlier in the list chosen in preference to those later. +; Adding "defaults" will place the unused charsets at that position. 
+DETECTED_CHARSETS_ORDER=UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE, ISO-8859-1, windows-1252, ISO-8859-2, windows-1250, ISO-8859-5, ISO-8859-6, ISO-8859-7, windows-1253, ISO-8859-8-I, windows-1255, ISO-8859-8, windows-1251, windows-1256, KOI8-R, ISO-8859-9, windows-1254, Shift_JIS, GB18030, EUC-JP, EUC-KR, Big5, ISO-2022-JP, ISO-2022-KR, ISO-2022-CN, IBM424_rtl, IBM424_ltr, IBM420_rtl, IBM420_ltr +; Default ANSI charset to override non-UTF-8 charsets to ANSI_CHARSET = ; Force every new repository to be private FORCE_PRIVATE = false diff --git a/docs/content/doc/advanced/config-cheat-sheet.en-us.md b/docs/content/doc/advanced/config-cheat-sheet.en-us.md index f0908c22a..c29151f64 100644 --- a/docs/content/doc/advanced/config-cheat-sheet.en-us.md +++ b/docs/content/doc/advanced/config-cheat-sheet.en-us.md @@ -46,7 +46,8 @@ Values containing `#` or `;` must be quoted using `` ` `` or `"""`. an absolute path. - `SCRIPT_TYPE`: **bash**: The script type this server supports. Usually this is `bash`, but some users report that only `sh` is available. -- `ANSI_CHARSET`: **\<empty\>**: The default charset for an unrecognized charset. +- `DETECTED_CHARSETS_ORDER`: **UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE, ISO-8859-1, windows-1252, ISO-8859-2, windows-1250, ISO-8859-5, ISO-8859-6, ISO-8859-7, windows-1253, ISO-8859-8-I, windows-1255, ISO-8859-8, windows-1251, windows-1256, KOI8-R, ISO-8859-9, windows-1254, Shift_JIS, GB18030, EUC-JP, EUC-KR, Big5, ISO-2022-JP, ISO-2022-KR, ISO-2022-CN, IBM424_rtl, IBM424_ltr, IBM420_rtl, IBM420_ltr**: Tie-break order of detected charsets - if the detected charsets have equal confidence, charsets earlier in the list will be chosen in preference to those later. Adding `defaults` will place the unnamed charsets at that point. +- `ANSI_CHARSET`: **\<empty\>**: Default ANSI charset to override non-UTF-8 charsets to. - `FORCE_PRIVATE`: **false**: Force every new repository to be private. - `DEFAULT_PRIVATE`: **last**: Default private when creating a new repository. 
\[last, private, public\] diff --git a/modules/charset/charset.go b/modules/charset/charset.go index 3d3d1664f..a7e427db9 100644 --- a/modules/charset/charset.go +++ b/modules/charset/charset.go @@ -7,6 +7,7 @@ package charset import ( "bytes" "fmt" + "strings" "unicode/utf8" "code.gitea.io/gitea/modules/log" @@ -137,16 +138,42 @@ func DetectEncoding(content []byte) (string, error) { } else { detectContent = content } - result, err := textDetector.DetectBest(detectContent) + + // Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break + results, err := textDetector.DetectAll(detectContent) if err != nil { + if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 { + log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) + return setting.Repository.AnsiCharset, nil + } return "", err } + + topConfidence := results[0].Confidence + topResult := results[0] + priority, has := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(topResult.Charset))] + for _, result := range results { + // As results are sorted in confidence order - if we have a different confidence + // we know it's less than the current confidence and can break out of the loop early + if result.Confidence != topConfidence { + break + } + + // Otherwise check if this result is earlier in the DetectedCharsetsOrder than our current top guess + resultPriority, resultHas := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(result.Charset))] + if resultHas && (!has || resultPriority < priority) { + topResult = result + priority = resultPriority + has = true + } + } + // FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument - if result.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 { + if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 { log.Debug("Using default AnsiCharset: %s", 
setting.Repository.AnsiCharset) return setting.Repository.AnsiCharset, err } - log.Debug("Detected encoding: %s", result.Charset) - return result.Charset, err + log.Debug("Detected encoding: %s", topResult.Charset) + return topResult.Charset, err } diff --git a/modules/charset/charset_test.go b/modules/charset/charset_test.go index a81a6e03e..394a42c71 100644 --- a/modules/charset/charset_test.go +++ b/modules/charset/charset_test.go @@ -230,7 +230,11 @@ func TestDetectEncoding(t *testing.T) { // we accept either. assert.Contains(t, encoding, "ISO-8859") + old := setting.Repository.AnsiCharset setting.Repository.AnsiCharset = "placeholder" + defer func() { + setting.Repository.AnsiCharset = old + }() testSuccess(b, "placeholder") // invalid bytes diff --git a/modules/setting/repository.go b/modules/setting/repository.go index 8af3eaaf4..1796a8d6b 100644 --- a/modules/setting/repository.go +++ b/modules/setting/repository.go @@ -24,6 +24,8 @@ const ( // Repository settings var ( Repository = struct { + DetectedCharsetsOrder []string + DetectedCharsetScore map[string]int `ini:"-"` AnsiCharset string ForcePrivate bool DefaultPrivate string @@ -88,6 +90,42 @@ var ( Wiki []string } `ini:"repository.signing"` }{ + DetectedCharsetsOrder: []string{ + "UTF-8", + "UTF-16BE", + "UTF-16LE", + "UTF-32BE", + "UTF-32LE", + "ISO-8859-1", + "windows-1252", + "ISO-8859-2", + "windows-1250", + "ISO-8859-5", + "ISO-8859-6", + "ISO-8859-7", + "windows-1253", + "ISO-8859-8-I", + "windows-1255", + "ISO-8859-8", + "windows-1251", + "windows-1256", + "KOI8-R", + "ISO-8859-9", + "windows-1254", + "Shift_JIS", + "GB18030", + "EUC-JP", + "EUC-KR", + "Big5", + "ISO-2022-JP", + "ISO-2022-KR", + "ISO-2022-CN", + "IBM424_rtl", + "IBM424_ltr", + "IBM420_rtl", + "IBM420_ltr", + }, + DetectedCharsetScore: map[string]int{}, AnsiCharset: "", ForcePrivate: false, DefaultPrivate: RepoCreatingLastUserVisibility, @@ -208,6 +246,10 @@ func newRepository() { } else { RepoRootPath = 
filepath.Clean(RepoRootPath) } + defaultDetectedCharsetsOrder := make([]string, 0, len(Repository.DetectedCharsetsOrder)) + for _, charset := range Repository.DetectedCharsetsOrder { + defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder, strings.ToLower(strings.TrimSpace(charset))) + } ScriptType = sec.Key("SCRIPT_TYPE").MustString("bash") if err = Cfg.Section("repository").MapTo(&Repository); err != nil { @@ -222,6 +264,38 @@ func newRepository() { log.Fatal("Failed to map Repository.PullRequest settings: %v", err) } + preferred := make([]string, 0, len(Repository.DetectedCharsetsOrder)) + for _, charset := range Repository.DetectedCharsetsOrder { + canonicalCharset := strings.ToLower(strings.TrimSpace(charset)) + preferred = append(preferred, canonicalCharset) + // remove it from the defaults + for i, charset := range defaultDetectedCharsetsOrder { + if charset == canonicalCharset { + defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder[:i], defaultDetectedCharsetsOrder[i+1:]...) + break + } + } + } + + i := 0 + for _, charset := range preferred { + // Add the defaults + if charset == "defaults" { + for _, charset := range defaultDetectedCharsetsOrder { + canonicalCharset := strings.ToLower(strings.TrimSpace(charset)) + if _, has := Repository.DetectedCharsetScore[canonicalCharset]; !has { + Repository.DetectedCharsetScore[canonicalCharset] = i + i++ + } + } + continue + } + if _, has := Repository.DetectedCharsetScore[charset]; !has { + Repository.DetectedCharsetScore[charset] = i + i++ + } + } + if !filepath.IsAbs(Repository.Upload.TempPath) { Repository.Upload.TempPath = path.Join(AppWorkPath, Repository.Upload.TempPath) }