Fix chardet test and add ordering option (#11621)
* Fix chardet test and add ordering option Signed-off-by: Andrew Thornton <art27@cantab.net> * minor fixes Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log2 Signed-off-by: Andrew Thornton <art27@cantab.net> * only iterate through top results Signed-off-by: Andrew Thornton <art27@cantab.net> * Update docs/content/doc/advanced/config-cheat-sheet.en-us.md * slight restructure of for loop Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: techknowlogick <techknowlogick@gitea.io>
This commit is contained in:
		
							parent
							
								
									fe2cacf5ea
								
							
						
					
					
						commit
						a1ad188326
					
				
					 5 changed files with 117 additions and 6 deletions
				
			
		|  | @ -14,7 +14,12 @@ RUN_MODE = dev | |||
| [repository] | ||||
| ROOT = | ||||
| SCRIPT_TYPE = bash | ||||
| ; Default ANSI charset | ||||
| ; DETECTED_CHARSETS_ORDER tie-break order for detected charsets. | ||||
| ; If the charsets have equal confidence, tie-breaking will be done by order in this list | ||||
| ; with charsets earlier in the list chosen in preference to those later. | ||||
| ; Adding "defaults" will place the unused charsets at that position.  | ||||
| DETECTED_CHARSETS_ORDER=UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE, ISO-8859-1, windows-1252, ISO-8859-2, windows-1250, ISO-8859-5, ISO-8859-6, ISO-8859-7, windows-1253, ISO-8859-8-I, windows-1255, ISO-8859-8, windows-1251, windows-1256, KOI8-R, ISO-8859-9, windows-1254, Shift_JIS, GB18030, EUC-JP, EUC-KR, Big5, ISO-2022-JP, ISO-2022-KR, ISO-2022-CN, IBM424_rtl, IBM424_ltr, IBM420_rtl, IBM420_ltr | ||||
| ; Default ANSI charset to override non-UTF-8 charsets to | ||||
| ANSI_CHARSET = | ||||
| ; Force every new repository to be private | ||||
| FORCE_PRIVATE = false | ||||
|  |  | |||
|  | @ -46,7 +46,8 @@ Values containing `#` or `;` must be quoted using `` ` `` or `"""`. | |||
|    an absolute path. | ||||
| - `SCRIPT_TYPE`: **bash**: The script type this server supports. Usually this is `bash`, | ||||
|    but some users report that only `sh` is available. | ||||
| - `ANSI_CHARSET`: **\<empty\>**: The default charset for an unrecognized charset. | ||||
| - `DETECTED_CHARSETS_ORDER`: **UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE, ISO-8859-1, windows-1252, ISO-8859-2, windows-1250, ISO-8859-5, ISO-8859-6, ISO-8859-7, windows-1253, ISO-8859-8-I, windows-1255, ISO-8859-8, windows-1251, windows-1256, KOI8-R, ISO-8859-9, windows-1254, Shift_JIS, GB18030, EUC-JP, EUC-KR, Big5, ISO-2022-JP, ISO-2022-KR, ISO-2022-CN, IBM424_rtl, IBM424_ltr, IBM420_rtl, IBM420_ltr**: Tie-break order of detected charsets - if the detected charsets have equal confidence, charsets earlier in the list will be chosen in preference to those later. Adding `defaults` will place the unnamed charsets at that point. | ||||
| - `ANSI_CHARSET`: **\<empty\>**: Default ANSI charset to override non-UTF-8 charsets to. | ||||
| - `FORCE_PRIVATE`: **false**: Force every new repository to be private. | ||||
| - `DEFAULT_PRIVATE`: **last**: Default private when creating a new repository. | ||||
|    \[last, private, public\] | ||||
|  |  | |||
|  | @ -7,6 +7,7 @@ package charset | |||
| import ( | ||||
| 	"bytes" | ||||
| 	"fmt" | ||||
| 	"strings" | ||||
| 	"unicode/utf8" | ||||
| 
 | ||||
| 	"code.gitea.io/gitea/modules/log" | ||||
|  | @ -137,16 +138,42 @@ func DetectEncoding(content []byte) (string, error) { | |||
| 	} else { | ||||
| 		detectContent = content | ||||
| 	} | ||||
| 	result, err := textDetector.DetectBest(detectContent) | ||||
| 
 | ||||
| 	// Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break
 | ||||
| 	results, err := textDetector.DetectAll(detectContent) | ||||
| 	if err != nil { | ||||
| 		if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 { | ||||
| 			log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) | ||||
| 			return setting.Repository.AnsiCharset, nil | ||||
| 		} | ||||
| 		return "", err | ||||
| 	} | ||||
| 
 | ||||
| 	topConfidence := results[0].Confidence | ||||
| 	topResult := results[0] | ||||
| 	priority, has := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(topResult.Charset))] | ||||
| 	for _, result := range results { | ||||
| 		// As results are sorted in confidence order - if we have a different confidence
 | ||||
| 		// we know it's less than the current confidence and can break out of the loop early
 | ||||
| 		if result.Confidence != topConfidence { | ||||
| 			break | ||||
| 		} | ||||
| 
 | ||||
| 		// Otherwise check if this result is earlier in the DetectedCharsetsOrder than our current top guess
 | ||||
| 		resultPriority, resultHas := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(result.Charset))] | ||||
| 		if resultHas && (!has || resultPriority < priority) { | ||||
| 			topResult = result | ||||
| 			priority = resultPriority | ||||
| 			has = true | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument
 | ||||
| 	if result.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 { | ||||
| 	if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 { | ||||
| 		log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) | ||||
| 		return setting.Repository.AnsiCharset, err | ||||
| 	} | ||||
| 
 | ||||
| 	log.Debug("Detected encoding: %s", result.Charset) | ||||
| 	return result.Charset, err | ||||
| 	log.Debug("Detected encoding: %s", topResult.Charset) | ||||
| 	return topResult.Charset, err | ||||
| } | ||||
|  |  | |||
|  | @ -230,7 +230,11 @@ func TestDetectEncoding(t *testing.T) { | |||
| 	// we accept either.
 | ||||
| 	assert.Contains(t, encoding, "ISO-8859") | ||||
| 
 | ||||
| 	old := setting.Repository.AnsiCharset | ||||
| 	setting.Repository.AnsiCharset = "placeholder" | ||||
| 	defer func() { | ||||
| 		setting.Repository.AnsiCharset = old | ||||
| 	}() | ||||
| 	testSuccess(b, "placeholder") | ||||
| 
 | ||||
| 	// invalid bytes
 | ||||
|  |  | |||
|  | @ -24,6 +24,8 @@ const ( | |||
| // Repository settings
 | ||||
| var ( | ||||
| 	Repository = struct { | ||||
| 		DetectedCharsetsOrder                   []string | ||||
| 		DetectedCharsetScore                    map[string]int `ini:"-"` | ||||
| 		AnsiCharset                             string | ||||
| 		ForcePrivate                            bool | ||||
| 		DefaultPrivate                          string | ||||
|  | @ -88,6 +90,42 @@ var ( | |||
| 			Wiki          []string | ||||
| 		} `ini:"repository.signing"` | ||||
| 	}{ | ||||
| 		DetectedCharsetsOrder: []string{ | ||||
| 			"UTF-8", | ||||
| 			"UTF-16BE", | ||||
| 			"UTF-16LE", | ||||
| 			"UTF-32BE", | ||||
| 			"UTF-32LE", | ||||
| 			"ISO-8859-1", | ||||
| 			"windows-1252", | ||||
| 			"ISO-8859-2", | ||||
| 			"windows-1250", | ||||
| 			"ISO-8859-5", | ||||
| 			"ISO-8859-6", | ||||
| 			"ISO-8859-7", | ||||
| 			"windows-1253", | ||||
| 			"ISO-8859-8-I", | ||||
| 			"windows-1255", | ||||
| 			"ISO-8859-8", | ||||
| 			"windows-1251", | ||||
| 			"windows-1256", | ||||
| 			"KOI8-R", | ||||
| 			"ISO-8859-9", | ||||
| 			"windows-1254", | ||||
| 			"Shift_JIS", | ||||
| 			"GB18030", | ||||
| 			"EUC-JP", | ||||
| 			"EUC-KR", | ||||
| 			"Big5", | ||||
| 			"ISO-2022-JP", | ||||
| 			"ISO-2022-KR", | ||||
| 			"ISO-2022-CN", | ||||
| 			"IBM424_rtl", | ||||
| 			"IBM424_ltr", | ||||
| 			"IBM420_rtl", | ||||
| 			"IBM420_ltr", | ||||
| 		}, | ||||
| 		DetectedCharsetScore:                    map[string]int{}, | ||||
| 		AnsiCharset:                             "", | ||||
| 		ForcePrivate:                            false, | ||||
| 		DefaultPrivate:                          RepoCreatingLastUserVisibility, | ||||
|  | @ -208,6 +246,10 @@ func newRepository() { | |||
| 	} else { | ||||
| 		RepoRootPath = filepath.Clean(RepoRootPath) | ||||
| 	} | ||||
| 	defaultDetectedCharsetsOrder := make([]string, 0, len(Repository.DetectedCharsetsOrder)) | ||||
| 	for _, charset := range Repository.DetectedCharsetsOrder { | ||||
| 		defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder, strings.ToLower(strings.TrimSpace(charset))) | ||||
| 	} | ||||
| 	ScriptType = sec.Key("SCRIPT_TYPE").MustString("bash") | ||||
| 
 | ||||
| 	if err = Cfg.Section("repository").MapTo(&Repository); err != nil { | ||||
|  | @ -222,6 +264,38 @@ func newRepository() { | |||
| 		log.Fatal("Failed to map Repository.PullRequest settings: %v", err) | ||||
| 	} | ||||
| 
 | ||||
| 	preferred := make([]string, 0, len(Repository.DetectedCharsetsOrder)) | ||||
| 	for _, charset := range Repository.DetectedCharsetsOrder { | ||||
| 		canonicalCharset := strings.ToLower(strings.TrimSpace(charset)) | ||||
| 		preferred = append(preferred, canonicalCharset) | ||||
| 		// remove it from the defaults
 | ||||
| 		for i, charset := range defaultDetectedCharsetsOrder { | ||||
| 			if charset == canonicalCharset { | ||||
| 				defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder[:i], defaultDetectedCharsetsOrder[i+1:]...) | ||||
| 				break | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	i := 0 | ||||
| 	for _, charset := range preferred { | ||||
| 		// Add the defaults
 | ||||
| 		if charset == "defaults" { | ||||
| 			for _, charset := range defaultDetectedCharsetsOrder { | ||||
| 				canonicalCharset := strings.ToLower(strings.TrimSpace(charset)) | ||||
| 				if _, has := Repository.DetectedCharsetScore[canonicalCharset]; !has { | ||||
| 					Repository.DetectedCharsetScore[canonicalCharset] = i | ||||
| 					i++ | ||||
| 				} | ||||
| 			} | ||||
| 			continue | ||||
| 		} | ||||
| 		if _, has := Repository.DetectedCharsetScore[charset]; !has { | ||||
| 			Repository.DetectedCharsetScore[charset] = i | ||||
| 			i++ | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	if !filepath.IsAbs(Repository.Upload.TempPath) { | ||||
| 		Repository.Upload.TempPath = path.Join(AppWorkPath, Repository.Upload.TempPath) | ||||
| 	} | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue