Mirror of https://gitee.com/gitea/gitea (synced 2025-11-04 08:30:25 +08:00)
			
		
		
		
	Detect encoding changes while parsing diff (#16330)
* Detect encoding changes while parsing diff
This commit is contained in:
		@@ -32,6 +32,7 @@ import (
 | 
			
		||||
 | 
			
		||||
	"github.com/sergi/go-diff/diffmatchpatch"
 | 
			
		||||
	stdcharset "golang.org/x/net/html/charset"
 | 
			
		||||
	"golang.org/x/text/encoding"
 | 
			
		||||
	"golang.org/x/text/transform"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
@@ -883,35 +884,46 @@ parsingLoop:
 | 
			
		||||
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	// FIXME: There are numerous issues with this:
 | 
			
		||||
	// TODO: There are numerous issues with this:
 | 
			
		||||
	// - we might want to consider detecting encoding while parsing but...
 | 
			
		||||
	// - we're likely to fail to get the correct encoding here anyway as we won't have enough information
 | 
			
		||||
	// - and this doesn't really account for changes in encoding
 | 
			
		||||
	var buf bytes.Buffer
 | 
			
		||||
	var diffLineTypeBuffers = make(map[DiffLineType]*bytes.Buffer, 3)
 | 
			
		||||
	var diffLineTypeDecoders = make(map[DiffLineType]*encoding.Decoder, 3)
 | 
			
		||||
	diffLineTypeBuffers[DiffLinePlain] = new(bytes.Buffer)
 | 
			
		||||
	diffLineTypeBuffers[DiffLineAdd] = new(bytes.Buffer)
 | 
			
		||||
	diffLineTypeBuffers[DiffLineDel] = new(bytes.Buffer)
 | 
			
		||||
	for _, f := range diff.Files {
 | 
			
		||||
		buf.Reset()
 | 
			
		||||
		for _, buffer := range diffLineTypeBuffers {
 | 
			
		||||
			buffer.Reset()
 | 
			
		||||
		}
 | 
			
		||||
		for _, sec := range f.Sections {
 | 
			
		||||
			for _, l := range sec.Lines {
 | 
			
		||||
				if l.Type == DiffLineSection {
 | 
			
		||||
					continue
 | 
			
		||||
				}
 | 
			
		||||
				buf.WriteString(l.Content[1:])
 | 
			
		||||
				buf.WriteString("\n")
 | 
			
		||||
				diffLineTypeBuffers[l.Type].WriteString(l.Content[1:])
 | 
			
		||||
				diffLineTypeBuffers[l.Type].WriteString("\n")
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		charsetLabel, err := charset.DetectEncoding(buf.Bytes())
 | 
			
		||||
		if charsetLabel != "UTF-8" && err == nil {
 | 
			
		||||
			encoding, _ := stdcharset.Lookup(charsetLabel)
 | 
			
		||||
			if encoding != nil {
 | 
			
		||||
				d := encoding.NewDecoder()
 | 
			
		||||
				for _, sec := range f.Sections {
 | 
			
		||||
					for _, l := range sec.Lines {
 | 
			
		||||
						if l.Type == DiffLineSection {
 | 
			
		||||
							continue
 | 
			
		||||
						}
 | 
			
		||||
						if c, _, err := transform.String(d, l.Content[1:]); err == nil {
 | 
			
		||||
							l.Content = l.Content[0:1] + c
 | 
			
		||||
						}
 | 
			
		||||
		for lineType, buffer := range diffLineTypeBuffers {
 | 
			
		||||
			diffLineTypeDecoders[lineType] = nil
 | 
			
		||||
			if buffer.Len() == 0 {
 | 
			
		||||
				continue
 | 
			
		||||
			}
 | 
			
		||||
			charsetLabel, err := charset.DetectEncoding(buffer.Bytes())
 | 
			
		||||
			if charsetLabel != "UTF-8" && err == nil {
 | 
			
		||||
				encoding, _ := stdcharset.Lookup(charsetLabel)
 | 
			
		||||
				if encoding != nil {
 | 
			
		||||
					diffLineTypeDecoders[lineType] = encoding.NewDecoder()
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		for _, sec := range f.Sections {
 | 
			
		||||
			for _, l := range sec.Lines {
 | 
			
		||||
				decoder := diffLineTypeDecoders[l.Type]
 | 
			
		||||
				if decoder != nil {
 | 
			
		||||
					if c, _, err := transform.String(decoder, l.Content[1:]); err == nil {
 | 
			
		||||
						l.Content = l.Content[0:1] + c
 | 
			
		||||
					}
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user