mirror of
				https://gitee.com/gitea/gitea
				synced 2025-11-04 08:30:25 +08:00 
			
		
		
		
	UI: Detect and restore encoding and BOM in content (#6727)
* detect and remove a decoded BOM
  Signed-off-by: Andrew Thornton <art27@cantab.net>
* Restore the previous encoding and BOM
* On error keep as UTF-8
  Signed-off-by: Andrew Thornton <art27@cantab.net>
* create remove BOM function
* Deal with LFSed content
* Update modules/repofiles/update.go
* Fix final LFS bug
* Keep LFS sections referring to opts.Content
This commit is contained in:
		@@ -5,6 +5,7 @@
 | 
				
			|||||||
package base
 | 
					package base
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import (
 | 
					import (
 | 
				
			||||||
 | 
						"bytes"
 | 
				
			||||||
	"crypto/md5"
 | 
						"crypto/md5"
 | 
				
			||||||
	"crypto/rand"
 | 
						"crypto/rand"
 | 
				
			||||||
	"crypto/sha1"
 | 
						"crypto/sha1"
 | 
				
			||||||
@@ -36,6 +37,9 @@ import (
 | 
				
			|||||||
	"github.com/gogits/chardet"
 | 
						"github.com/gogits/chardet"
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// UTF8BOM is the utf-8 byte-order marker
 | 
				
			||||||
 | 
					var UTF8BOM = []byte{0xEF, 0xBB, 0xBF} // bytes EF BB BF
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// EncodeMD5 encodes string to md5 hex value.
 | 
					// EncodeMD5 encodes string to md5 hex value.
 | 
				
			||||||
func EncodeMD5(str string) string {
 | 
					func EncodeMD5(str string) string {
 | 
				
			||||||
	m := md5.New()
 | 
						m := md5.New()
 | 
				
			||||||
@@ -91,6 +95,14 @@ func DetectEncoding(content []byte) (string, error) {
 | 
				
			|||||||
	return result.Charset, err
 | 
						return result.Charset, err
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// RemoveBOMIfPresent removes a UTF-8 BOM from a []byte
 | 
				
			||||||
 | 
					func RemoveBOMIfPresent(content []byte) []byte {
 | 
				
			||||||
 | 
						if len(content) > 2 && bytes.Equal(content[0:3], UTF8BOM) {
 | 
				
			||||||
 | 
							return content[3:]
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						return content
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// BasicAuthDecode decode basic auth string
 | 
					// BasicAuthDecode decode basic auth string
 | 
				
			||||||
func BasicAuthDecode(encoded string) (string, string, error) {
 | 
					func BasicAuthDecode(encoded string) (string, string, error) {
 | 
				
			||||||
	s, err := base64.StdEncoding.DecodeString(encoded)
 | 
						s, err := base64.StdEncoding.DecodeString(encoded)
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -5,13 +5,19 @@
 | 
				
			|||||||
package repofiles
 | 
					package repofiles
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import (
 | 
					import (
 | 
				
			||||||
 | 
						"bytes"
 | 
				
			||||||
	"fmt"
 | 
						"fmt"
 | 
				
			||||||
	"path"
 | 
						"path"
 | 
				
			||||||
	"strings"
 | 
						"strings"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						"golang.org/x/net/html/charset"
 | 
				
			||||||
 | 
						"golang.org/x/text/transform"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	"code.gitea.io/gitea/models"
 | 
						"code.gitea.io/gitea/models"
 | 
				
			||||||
 | 
						"code.gitea.io/gitea/modules/base"
 | 
				
			||||||
	"code.gitea.io/gitea/modules/git"
 | 
						"code.gitea.io/gitea/modules/git"
 | 
				
			||||||
	"code.gitea.io/gitea/modules/lfs"
 | 
						"code.gitea.io/gitea/modules/lfs"
 | 
				
			||||||
 | 
						"code.gitea.io/gitea/modules/log"
 | 
				
			||||||
	"code.gitea.io/gitea/modules/setting"
 | 
						"code.gitea.io/gitea/modules/setting"
 | 
				
			||||||
	"code.gitea.io/sdk/gitea"
 | 
						"code.gitea.io/sdk/gitea"
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
@@ -37,6 +43,70 @@ type UpdateRepoFileOptions struct {
 | 
				
			|||||||
	Committer    *IdentityOptions
 | 
						Committer    *IdentityOptions
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					func detectEncodingAndBOM(entry *git.TreeEntry, repo *models.Repository) (string, bool) {
 | 
				
			||||||
 | 
						reader, err := entry.Blob().DataAsync()
 | 
				
			||||||
 | 
						if err != nil {
 | 
				
			||||||
 | 
							// return default
 | 
				
			||||||
 | 
							return "UTF-8", false
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						defer reader.Close()
 | 
				
			||||||
 | 
						buf := make([]byte, 1024)
 | 
				
			||||||
 | 
						n, err := reader.Read(buf)
 | 
				
			||||||
 | 
						if err != nil {
 | 
				
			||||||
 | 
							// return default
 | 
				
			||||||
 | 
							return "UTF-8", false
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						buf = buf[:n]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if setting.LFS.StartServer {
 | 
				
			||||||
 | 
							meta := lfs.IsPointerFile(&buf)
 | 
				
			||||||
 | 
							if meta != nil {
 | 
				
			||||||
 | 
								meta, err = repo.GetLFSMetaObjectByOid(meta.Oid)
 | 
				
			||||||
 | 
								if err != nil && err != models.ErrLFSObjectNotExist {
 | 
				
			||||||
 | 
									// return default
 | 
				
			||||||
 | 
									return "UTF-8", false
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
							if meta != nil {
 | 
				
			||||||
 | 
								dataRc, err := lfs.ReadMetaObject(meta)
 | 
				
			||||||
 | 
								if err != nil {
 | 
				
			||||||
 | 
									// return default
 | 
				
			||||||
 | 
									return "UTF-8", false
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
								defer dataRc.Close()
 | 
				
			||||||
 | 
								buf = make([]byte, 1024)
 | 
				
			||||||
 | 
								n, err = dataRc.Read(buf)
 | 
				
			||||||
 | 
								if err != nil {
 | 
				
			||||||
 | 
									// return default
 | 
				
			||||||
 | 
									return "UTF-8", false
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
								buf = buf[:n]
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						encoding, err := base.DetectEncoding(buf)
 | 
				
			||||||
 | 
						if err != nil {
 | 
				
			||||||
 | 
							// just default to utf-8 and no bom
 | 
				
			||||||
 | 
							return "UTF-8", false
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						if encoding == "UTF-8" {
 | 
				
			||||||
 | 
							return encoding, bytes.Equal(buf[0:3], base.UTF8BOM)
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						charsetEncoding, _ := charset.Lookup(encoding)
 | 
				
			||||||
 | 
						if charsetEncoding == nil {
 | 
				
			||||||
 | 
							return "UTF-8", false
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						result, n, err := transform.String(charsetEncoding.NewDecoder(), string(buf))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if n > 2 {
 | 
				
			||||||
 | 
							return encoding, bytes.Equal([]byte(result)[0:3], base.UTF8BOM)
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return encoding, false
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// CreateOrUpdateRepoFile adds or updates a file in the given repository
 | 
					// CreateOrUpdateRepoFile adds or updates a file in the given repository
 | 
				
			||||||
func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *UpdateRepoFileOptions) (*gitea.FileResponse, error) {
 | 
					func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *UpdateRepoFileOptions) (*gitea.FileResponse, error) {
 | 
				
			||||||
	// If no branch name is set, assume master
 | 
						// If no branch name is set, assume master
 | 
				
			||||||
@@ -118,6 +188,9 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up
 | 
				
			|||||||
		opts.LastCommitID = commit.ID.String()
 | 
							opts.LastCommitID = commit.ID.String()
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						encoding := "UTF-8"
 | 
				
			||||||
 | 
						bom := false
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if !opts.IsNewFile {
 | 
						if !opts.IsNewFile {
 | 
				
			||||||
		fromEntry, err := commit.GetTreeEntryByPath(fromTreePath)
 | 
							fromEntry, err := commit.GetTreeEntryByPath(fromTreePath)
 | 
				
			||||||
		if err != nil {
 | 
							if err != nil {
 | 
				
			||||||
@@ -151,6 +224,7 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up
 | 
				
			|||||||
			// haven't been made. We throw an error if one wasn't provided.
 | 
								// haven't been made. We throw an error if one wasn't provided.
 | 
				
			||||||
			return nil, models.ErrSHAOrCommitIDNotProvided{}
 | 
								return nil, models.ErrSHAOrCommitIDNotProvided{}
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
 | 
							encoding, bom = detectEncodingAndBOM(fromEntry, repo)
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	// For the path where this file will be created/updated, we need to make
 | 
						// For the path where this file will be created/updated, we need to make
 | 
				
			||||||
@@ -235,9 +309,28 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up
 | 
				
			|||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	content := opts.Content
 | 
						content := opts.Content
 | 
				
			||||||
 | 
						if bom {
 | 
				
			||||||
 | 
							content = string(base.UTF8BOM) + content
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						if encoding != "UTF-8" {
 | 
				
			||||||
 | 
							charsetEncoding, _ := charset.Lookup(encoding)
 | 
				
			||||||
 | 
							if charsetEncoding != nil {
 | 
				
			||||||
 | 
								result, _, err := transform.String(charsetEncoding.NewEncoder(), string(content))
 | 
				
			||||||
 | 
								if err != nil {
 | 
				
			||||||
 | 
									// Look if we can't encode back in to the original we should just stick with utf-8
 | 
				
			||||||
 | 
									log.Error("Error re-encoding %s (%s) as %s - will stay as UTF-8: %v", opts.TreePath, opts.FromTreePath, encoding, err)
 | 
				
			||||||
 | 
									result = content
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
								content = result
 | 
				
			||||||
 | 
							} else {
 | 
				
			||||||
 | 
								log.Error("Unknown encoding: %s", encoding)
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						// Reset the opts.Content to our adjusted content to ensure that LFS gets the correct content
 | 
				
			||||||
 | 
						opts.Content = content
 | 
				
			||||||
	var lfsMetaObject *models.LFSMetaObject
 | 
						var lfsMetaObject *models.LFSMetaObject
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if filename2attribute2info[treePath] != nil && filename2attribute2info[treePath]["filter"] == "lfs" {
 | 
						if setting.LFS.StartServer && filename2attribute2info[treePath] != nil && filename2attribute2info[treePath]["filter"] == "lfs" {
 | 
				
			||||||
		// OK so we are supposed to LFS this data!
 | 
							// OK so we are supposed to LFS this data!
 | 
				
			||||||
		oid, err := models.GenerateLFSOid(strings.NewReader(opts.Content))
 | 
							oid, err := models.GenerateLFSOid(strings.NewReader(opts.Content))
 | 
				
			||||||
		if err != nil {
 | 
							if err != nil {
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -267,7 +267,7 @@ func ToUTF8WithErr(content []byte) (string, error) {
 | 
				
			|||||||
	if err != nil {
 | 
						if err != nil {
 | 
				
			||||||
		return "", err
 | 
							return "", err
 | 
				
			||||||
	} else if charsetLabel == "UTF-8" {
 | 
						} else if charsetLabel == "UTF-8" {
 | 
				
			||||||
		return string(content), nil
 | 
							return string(base.RemoveBOMIfPresent(content)), nil
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	encoding, _ := charset.Lookup(charsetLabel)
 | 
						encoding, _ := charset.Lookup(charsetLabel)
 | 
				
			||||||
@@ -277,19 +277,21 @@ func ToUTF8WithErr(content []byte) (string, error) {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
	// If there is an error, we concatenate the nicely decoded part and the
 | 
						// If there is an error, we concatenate the nicely decoded part and the
 | 
				
			||||||
	// original left over. This way we won't lose data.
 | 
						// original left over. This way we won't lose data.
 | 
				
			||||||
	result, n, err := transform.String(encoding.NewDecoder(), string(content))
 | 
						result, n, err := transform.Bytes(encoding.NewDecoder(), content)
 | 
				
			||||||
	if err != nil {
 | 
						if err != nil {
 | 
				
			||||||
		result = result + string(content[n:])
 | 
							result = append(result, content[n:]...)
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	return result, err
 | 
						result = base.RemoveBOMIfPresent(result)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return string(result), err
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible
 | 
					// ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible
 | 
				
			||||||
func ToUTF8WithFallback(content []byte) []byte {
 | 
					func ToUTF8WithFallback(content []byte) []byte {
 | 
				
			||||||
	charsetLabel, err := base.DetectEncoding(content)
 | 
						charsetLabel, err := base.DetectEncoding(content)
 | 
				
			||||||
	if err != nil || charsetLabel == "UTF-8" {
 | 
						if err != nil || charsetLabel == "UTF-8" {
 | 
				
			||||||
		return content
 | 
							return base.RemoveBOMIfPresent(content)
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	encoding, _ := charset.Lookup(charsetLabel)
 | 
						encoding, _ := charset.Lookup(charsetLabel)
 | 
				
			||||||
@@ -304,7 +306,7 @@ func ToUTF8WithFallback(content []byte) []byte {
 | 
				
			|||||||
		return append(result, content[n:]...)
 | 
							return append(result, content[n:]...)
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	return result
 | 
						return base.RemoveBOMIfPresent(result)
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// ToUTF8 converts content to UTF8 encoding and ignore error
 | 
					// ToUTF8 converts content to UTF8 encoding and ignore error
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user