mirror of
				https://gitee.com/gitea/gitea
				synced 2025-11-04 08:30:25 +08:00 
			
		
		
		
	Fix chardet test and add ordering option (#11621)
* Fix chardet test and add ordering option Signed-off-by: Andrew Thornton <art27@cantab.net> * minor fixes Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log2 Signed-off-by: Andrew Thornton <art27@cantab.net> * only iterate through top results Signed-off-by: Andrew Thornton <art27@cantab.net> * Update docs/content/doc/advanced/config-cheat-sheet.en-us.md * slight restructure of for loop Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: techknowlogick <techknowlogick@gitea.io>
This commit is contained in:
		@@ -14,7 +14,12 @@ RUN_MODE = dev
 | 
			
		||||
[repository]
 | 
			
		||||
ROOT =
 | 
			
		||||
SCRIPT_TYPE = bash
 | 
			
		||||
; Default ANSI charset
 | 
			
		||||
; DETECTED_CHARSETS_ORDER tie-break order for detected charsets.
 | 
			
		||||
; If the charsets have equal confidence, tie-breaking will be done by order in this list
 | 
			
		||||
; with charsets earlier in the list chosen in preference to those later.
 | 
			
		||||
; Adding "defaults" will place the unused charsets at that position. 
 | 
			
		||||
DETECTED_CHARSETS_ORDER=UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE, ISO-8859-1, windows-1252, ISO-8859-2, windows-1250, ISO-8859-5, ISO-8859-6, ISO-8859-7, windows-1253, ISO-8859-8-I, windows-1255, ISO-8859-8, windows-1251, windows-1256, KOI8-R, ISO-8859-9, windows-1254, Shift_JIS, GB18030, EUC-JP, EUC-KR, Big5, ISO-2022-JP, ISO-2022-KR, ISO-2022-CN, IBM424_rtl, IBM424_ltr, IBM420_rtl, IBM420_ltr
 | 
			
		||||
; Default ANSI charset to override non-UTF-8 charsets to
 | 
			
		||||
ANSI_CHARSET =
 | 
			
		||||
; Force every new repository to be private
 | 
			
		||||
FORCE_PRIVATE = false
 | 
			
		||||
 
 | 
			
		||||
@@ -46,7 +46,8 @@ Values containing `#` or `;` must be quoted using `` ` `` or `"""`.
 | 
			
		||||
   an absolute path.
 | 
			
		||||
- `SCRIPT_TYPE`: **bash**: The script type this server supports. Usually this is `bash`,
 | 
			
		||||
   but some users report that only `sh` is available.
 | 
			
		||||
- `ANSI_CHARSET`: **\<empty\>**: The default charset for an unrecognized charset.
 | 
			
		||||
- `DETECTED_CHARSETS_ORDER`: **UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE, ISO-8859-1, windows-1252, ISO-8859-2, windows-1250, ISO-8859-5, ISO-8859-6, ISO-8859-7, windows-1253, ISO-8859-8-I, windows-1255, ISO-8859-8, windows-1251, windows-1256, KOI8-R, ISO-8859-9, windows-1254, Shift_JIS, GB18030, EUC-JP, EUC-KR, Big5, ISO-2022-JP, ISO-2022-KR, ISO-2022-CN, IBM424_rtl, IBM424_ltr, IBM420_rtl, IBM420_ltr**: Tie-break order of detected charsets - if the detected charsets have equal confidence, charsets earlier in the list will be chosen in preference to those later. Adding `defaults` will place the unnamed charsets at that point.
 | 
			
		||||
- `ANSI_CHARSET`: **\<empty\>**: Default ANSI charset to override non-UTF-8 charsets to.
 | 
			
		||||
- `FORCE_PRIVATE`: **false**: Force every new repository to be private.
 | 
			
		||||
- `DEFAULT_PRIVATE`: **last**: Default private when creating a new repository.
 | 
			
		||||
   \[last, private, public\]
 | 
			
		||||
 
 | 
			
		||||
@@ -7,6 +7,7 @@ package charset
 | 
			
		||||
import (
 | 
			
		||||
	"bytes"
 | 
			
		||||
	"fmt"
 | 
			
		||||
	"strings"
 | 
			
		||||
	"unicode/utf8"
 | 
			
		||||
 | 
			
		||||
	"code.gitea.io/gitea/modules/log"
 | 
			
		||||
@@ -137,16 +138,42 @@ func DetectEncoding(content []byte) (string, error) {
 | 
			
		||||
	} else {
 | 
			
		||||
		detectContent = content
 | 
			
		||||
	}
 | 
			
		||||
	result, err := textDetector.DetectBest(detectContent)
 | 
			
		||||
 | 
			
		||||
	// Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break
 | 
			
		||||
	results, err := textDetector.DetectAll(detectContent)
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 {
 | 
			
		||||
			log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
 | 
			
		||||
			return setting.Repository.AnsiCharset, nil
 | 
			
		||||
		}
 | 
			
		||||
		return "", err
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	topConfidence := results[0].Confidence
 | 
			
		||||
	topResult := results[0]
 | 
			
		||||
	priority, has := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(topResult.Charset))]
 | 
			
		||||
	for _, result := range results {
 | 
			
		||||
		// As results are sorted in confidence order - if we have a different confidence
 | 
			
		||||
		// we know it's less than the current confidence and can break out of the loop early
 | 
			
		||||
		if result.Confidence != topConfidence {
 | 
			
		||||
			break
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		// Otherwise check if this result is earlier in the DetectedCharsetsOrder than our current top guess
 | 
			
		||||
		resultPriority, resultHas := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(result.Charset))]
 | 
			
		||||
		if resultHas && (!has || resultPriority < priority) {
 | 
			
		||||
			topResult = result
 | 
			
		||||
			priority = resultPriority
 | 
			
		||||
			has = true
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument
 | 
			
		||||
	if result.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 {
 | 
			
		||||
	if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 {
 | 
			
		||||
		log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
 | 
			
		||||
		return setting.Repository.AnsiCharset, err
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	log.Debug("Detected encoding: %s", result.Charset)
 | 
			
		||||
	return result.Charset, err
 | 
			
		||||
	log.Debug("Detected encoding: %s", topResult.Charset)
 | 
			
		||||
	return topResult.Charset, err
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -230,7 +230,11 @@ func TestDetectEncoding(t *testing.T) {
 | 
			
		||||
	// we accept either.
 | 
			
		||||
	assert.Contains(t, encoding, "ISO-8859")
 | 
			
		||||
 | 
			
		||||
	old := setting.Repository.AnsiCharset
 | 
			
		||||
	setting.Repository.AnsiCharset = "placeholder"
 | 
			
		||||
	defer func() {
 | 
			
		||||
		setting.Repository.AnsiCharset = old
 | 
			
		||||
	}()
 | 
			
		||||
	testSuccess(b, "placeholder")
 | 
			
		||||
 | 
			
		||||
	// invalid bytes
 | 
			
		||||
 
 | 
			
		||||
@@ -24,6 +24,8 @@ const (
 | 
			
		||||
// Repository settings
 | 
			
		||||
var (
 | 
			
		||||
	Repository = struct {
 | 
			
		||||
		DetectedCharsetsOrder                   []string
 | 
			
		||||
		DetectedCharsetScore                    map[string]int `ini:"-"`
 | 
			
		||||
		AnsiCharset                             string
 | 
			
		||||
		ForcePrivate                            bool
 | 
			
		||||
		DefaultPrivate                          string
 | 
			
		||||
@@ -88,6 +90,42 @@ var (
 | 
			
		||||
			Wiki          []string
 | 
			
		||||
		} `ini:"repository.signing"`
 | 
			
		||||
	}{
 | 
			
		||||
		DetectedCharsetsOrder: []string{
 | 
			
		||||
			"UTF-8",
 | 
			
		||||
			"UTF-16BE",
 | 
			
		||||
			"UTF-16LE",
 | 
			
		||||
			"UTF-32BE",
 | 
			
		||||
			"UTF-32LE",
 | 
			
		||||
			"ISO-8859-1",
 | 
			
		||||
			"windows-1252",
 | 
			
		||||
			"ISO-8859-2",
 | 
			
		||||
			"windows-1250",
 | 
			
		||||
			"ISO-8859-5",
 | 
			
		||||
			"ISO-8859-6",
 | 
			
		||||
			"ISO-8859-7",
 | 
			
		||||
			"windows-1253",
 | 
			
		||||
			"ISO-8859-8-I",
 | 
			
		||||
			"windows-1255",
 | 
			
		||||
			"ISO-8859-8",
 | 
			
		||||
			"windows-1251",
 | 
			
		||||
			"windows-1256",
 | 
			
		||||
			"KOI8-R",
 | 
			
		||||
			"ISO-8859-9",
 | 
			
		||||
			"windows-1254",
 | 
			
		||||
			"Shift_JIS",
 | 
			
		||||
			"GB18030",
 | 
			
		||||
			"EUC-JP",
 | 
			
		||||
			"EUC-KR",
 | 
			
		||||
			"Big5",
 | 
			
		||||
			"ISO-2022-JP",
 | 
			
		||||
			"ISO-2022-KR",
 | 
			
		||||
			"ISO-2022-CN",
 | 
			
		||||
			"IBM424_rtl",
 | 
			
		||||
			"IBM424_ltr",
 | 
			
		||||
			"IBM420_rtl",
 | 
			
		||||
			"IBM420_ltr",
 | 
			
		||||
		},
 | 
			
		||||
		DetectedCharsetScore:                    map[string]int{},
 | 
			
		||||
		AnsiCharset:                             "",
 | 
			
		||||
		ForcePrivate:                            false,
 | 
			
		||||
		DefaultPrivate:                          RepoCreatingLastUserVisibility,
 | 
			
		||||
@@ -208,6 +246,10 @@ func newRepository() {
 | 
			
		||||
	} else {
 | 
			
		||||
		RepoRootPath = filepath.Clean(RepoRootPath)
 | 
			
		||||
	}
 | 
			
		||||
	defaultDetectedCharsetsOrder := make([]string, 0, len(Repository.DetectedCharsetsOrder))
 | 
			
		||||
	for _, charset := range Repository.DetectedCharsetsOrder {
 | 
			
		||||
		defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder, strings.ToLower(strings.TrimSpace(charset)))
 | 
			
		||||
	}
 | 
			
		||||
	ScriptType = sec.Key("SCRIPT_TYPE").MustString("bash")
 | 
			
		||||
 | 
			
		||||
	if err = Cfg.Section("repository").MapTo(&Repository); err != nil {
 | 
			
		||||
@@ -222,6 +264,38 @@ func newRepository() {
 | 
			
		||||
		log.Fatal("Failed to map Repository.PullRequest settings: %v", err)
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	preferred := make([]string, 0, len(Repository.DetectedCharsetsOrder))
 | 
			
		||||
	for _, charset := range Repository.DetectedCharsetsOrder {
 | 
			
		||||
		canonicalCharset := strings.ToLower(strings.TrimSpace(charset))
 | 
			
		||||
		preferred = append(preferred, canonicalCharset)
 | 
			
		||||
		// remove it from the defaults
 | 
			
		||||
		for i, charset := range defaultDetectedCharsetsOrder {
 | 
			
		||||
			if charset == canonicalCharset {
 | 
			
		||||
				defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder[:i], defaultDetectedCharsetsOrder[i+1:]...)
 | 
			
		||||
				break
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	i := 0
 | 
			
		||||
	for _, charset := range preferred {
 | 
			
		||||
		// Add the defaults
 | 
			
		||||
		if charset == "defaults" {
 | 
			
		||||
			for _, charset := range defaultDetectedCharsetsOrder {
 | 
			
		||||
				canonicalCharset := strings.ToLower(strings.TrimSpace(charset))
 | 
			
		||||
				if _, has := Repository.DetectedCharsetScore[canonicalCharset]; !has {
 | 
			
		||||
					Repository.DetectedCharsetScore[canonicalCharset] = i
 | 
			
		||||
					i++
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
			continue
 | 
			
		||||
		}
 | 
			
		||||
		if _, has := Repository.DetectedCharsetScore[charset]; !has {
 | 
			
		||||
			Repository.DetectedCharsetScore[charset] = i
 | 
			
		||||
			i++
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	if !filepath.IsAbs(Repository.Upload.TempPath) {
 | 
			
		||||
		Repository.Upload.TempPath = path.Join(AppWorkPath, Repository.Upload.TempPath)
 | 
			
		||||
	}
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user