UI: Detect and restore encoding and BOM in content (#6727)
* detect and remove a decoded BOM
  Signed-off-by: Andrew Thornton <art27@cantab.net>
* Restore the previous encoding and BOM
* On error keep as UTF-8
  Signed-off-by: Andrew Thornton <art27@cantab.net>
* create remove BOM function
* Deal with LFSed content
* Update modules/repofiles/update.go
* Fix final LFS bug
* Keep LFS sections referring to opts.Content
This commit is contained in:
		
							parent
							
								
									4c34bc111c
								
							
						
					
					
						commit
						f6eedd4dc8
					
				
					 3 changed files with 114 additions and 7 deletions
				
			
		|  | @ -5,6 +5,7 @@ | |||
| package base | ||||
| 
 | ||||
| import ( | ||||
| 	"bytes" | ||||
| 	"crypto/md5" | ||||
| 	"crypto/rand" | ||||
| 	"crypto/sha1" | ||||
|  | @ -36,6 +37,9 @@ import ( | |||
| 	"github.com/gogits/chardet" | ||||
| ) | ||||
| 
 | ||||
// UTF8BOM holds the three bytes (EF BB BF) that form the UTF-8 byte-order mark.
var UTF8BOM = []byte{0xEF, 0xBB, 0xBF}
| 
 | ||||
| // EncodeMD5 encodes string to md5 hex value.
 | ||||
| func EncodeMD5(str string) string { | ||||
| 	m := md5.New() | ||||
|  | @ -91,6 +95,14 @@ func DetectEncoding(content []byte) (string, error) { | |||
| 	return result.Charset, err | ||||
| } | ||||
| 
 | ||||
| // RemoveBOMIfPresent removes a UTF-8 BOM from a []byte
 | ||||
| func RemoveBOMIfPresent(content []byte) []byte { | ||||
| 	if len(content) > 2 && bytes.Equal(content[0:3], UTF8BOM) { | ||||
| 		return content[3:] | ||||
| 	} | ||||
| 	return content | ||||
| } | ||||
| 
 | ||||
| // BasicAuthDecode decode basic auth string
 | ||||
| func BasicAuthDecode(encoded string) (string, string, error) { | ||||
| 	s, err := base64.StdEncoding.DecodeString(encoded) | ||||
|  |  | |||
|  | @ -5,13 +5,19 @@ | |||
| package repofiles | ||||
| 
 | ||||
| import ( | ||||
| 	"bytes" | ||||
| 	"fmt" | ||||
| 	"path" | ||||
| 	"strings" | ||||
| 
 | ||||
| 	"golang.org/x/net/html/charset" | ||||
| 	"golang.org/x/text/transform" | ||||
| 
 | ||||
| 	"code.gitea.io/gitea/models" | ||||
| 	"code.gitea.io/gitea/modules/base" | ||||
| 	"code.gitea.io/gitea/modules/git" | ||||
| 	"code.gitea.io/gitea/modules/lfs" | ||||
| 	"code.gitea.io/gitea/modules/log" | ||||
| 	"code.gitea.io/gitea/modules/setting" | ||||
| 	"code.gitea.io/sdk/gitea" | ||||
| ) | ||||
|  | @ -37,6 +43,70 @@ type UpdateRepoFileOptions struct { | |||
| 	Committer    *IdentityOptions | ||||
| } | ||||
| 
 | ||||
| func detectEncodingAndBOM(entry *git.TreeEntry, repo *models.Repository) (string, bool) { | ||||
| 	reader, err := entry.Blob().DataAsync() | ||||
| 	if err != nil { | ||||
| 		// return default
 | ||||
| 		return "UTF-8", false | ||||
| 	} | ||||
| 	defer reader.Close() | ||||
| 	buf := make([]byte, 1024) | ||||
| 	n, err := reader.Read(buf) | ||||
| 	if err != nil { | ||||
| 		// return default
 | ||||
| 		return "UTF-8", false | ||||
| 	} | ||||
| 	buf = buf[:n] | ||||
| 
 | ||||
| 	if setting.LFS.StartServer { | ||||
| 		meta := lfs.IsPointerFile(&buf) | ||||
| 		if meta != nil { | ||||
| 			meta, err = repo.GetLFSMetaObjectByOid(meta.Oid) | ||||
| 			if err != nil && err != models.ErrLFSObjectNotExist { | ||||
| 				// return default
 | ||||
| 				return "UTF-8", false | ||||
| 			} | ||||
| 		} | ||||
| 		if meta != nil { | ||||
| 			dataRc, err := lfs.ReadMetaObject(meta) | ||||
| 			if err != nil { | ||||
| 				// return default
 | ||||
| 				return "UTF-8", false | ||||
| 			} | ||||
| 			defer dataRc.Close() | ||||
| 			buf = make([]byte, 1024) | ||||
| 			n, err = dataRc.Read(buf) | ||||
| 			if err != nil { | ||||
| 				// return default
 | ||||
| 				return "UTF-8", false | ||||
| 			} | ||||
| 			buf = buf[:n] | ||||
| 		} | ||||
| 
 | ||||
| 	} | ||||
| 
 | ||||
| 	encoding, err := base.DetectEncoding(buf) | ||||
| 	if err != nil { | ||||
| 		// just default to utf-8 and no bom
 | ||||
| 		return "UTF-8", false | ||||
| 	} | ||||
| 	if encoding == "UTF-8" { | ||||
| 		return encoding, bytes.Equal(buf[0:3], base.UTF8BOM) | ||||
| 	} | ||||
| 	charsetEncoding, _ := charset.Lookup(encoding) | ||||
| 	if charsetEncoding == nil { | ||||
| 		return "UTF-8", false | ||||
| 	} | ||||
| 
 | ||||
| 	result, n, err := transform.String(charsetEncoding.NewDecoder(), string(buf)) | ||||
| 
 | ||||
| 	if n > 2 { | ||||
| 		return encoding, bytes.Equal([]byte(result)[0:3], base.UTF8BOM) | ||||
| 	} | ||||
| 
 | ||||
| 	return encoding, false | ||||
| } | ||||
| 
 | ||||
| // CreateOrUpdateRepoFile adds or updates a file in the given repository
 | ||||
| func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *UpdateRepoFileOptions) (*gitea.FileResponse, error) { | ||||
| 	// If no branch name is set, assume master
 | ||||
|  | @ -118,6 +188,9 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up | |||
| 		opts.LastCommitID = commit.ID.String() | ||||
| 	} | ||||
| 
 | ||||
| 	encoding := "UTF-8" | ||||
| 	bom := false | ||||
| 
 | ||||
| 	if !opts.IsNewFile { | ||||
| 		fromEntry, err := commit.GetTreeEntryByPath(fromTreePath) | ||||
| 		if err != nil { | ||||
|  | @ -151,6 +224,7 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up | |||
| 			// haven't been made. We throw an error if one wasn't provided.
 | ||||
| 			return nil, models.ErrSHAOrCommitIDNotProvided{} | ||||
| 		} | ||||
| 		encoding, bom = detectEncodingAndBOM(fromEntry, repo) | ||||
| 	} | ||||
| 
 | ||||
| 	// For the path where this file will be created/updated, we need to make
 | ||||
|  | @ -235,9 +309,28 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up | |||
| 	} | ||||
| 
 | ||||
| 	content := opts.Content | ||||
| 	if bom { | ||||
| 		content = string(base.UTF8BOM) + content | ||||
| 	} | ||||
| 	if encoding != "UTF-8" { | ||||
| 		charsetEncoding, _ := charset.Lookup(encoding) | ||||
| 		if charsetEncoding != nil { | ||||
| 			result, _, err := transform.String(charsetEncoding.NewEncoder(), string(content)) | ||||
| 			if err != nil { | ||||
| 				// If we can't encode back into the original encoding we should just stick with UTF-8
 | ||||
| 				log.Error("Error re-encoding %s (%s) as %s - will stay as UTF-8: %v", opts.TreePath, opts.FromTreePath, encoding, err) | ||||
| 				result = content | ||||
| 			} | ||||
| 			content = result | ||||
| 		} else { | ||||
| 			log.Error("Unknown encoding: %s", encoding) | ||||
| 		} | ||||
| 	} | ||||
| 	// Reset the opts.Content to our adjusted content to ensure that LFS gets the correct content
 | ||||
| 	opts.Content = content | ||||
| 	var lfsMetaObject *models.LFSMetaObject | ||||
| 
 | ||||
| 	if filename2attribute2info[treePath] != nil && filename2attribute2info[treePath]["filter"] == "lfs" { | ||||
| 	if setting.LFS.StartServer && filename2attribute2info[treePath] != nil && filename2attribute2info[treePath]["filter"] == "lfs" { | ||||
| 		// OK so we are supposed to LFS this data!
 | ||||
| 		oid, err := models.GenerateLFSOid(strings.NewReader(opts.Content)) | ||||
| 		if err != nil { | ||||
|  |  | |||
|  | @ -267,7 +267,7 @@ func ToUTF8WithErr(content []byte) (string, error) { | |||
| 	if err != nil { | ||||
| 		return "", err | ||||
| 	} else if charsetLabel == "UTF-8" { | ||||
| 		return string(content), nil | ||||
| 		return string(base.RemoveBOMIfPresent(content)), nil | ||||
| 	} | ||||
| 
 | ||||
| 	encoding, _ := charset.Lookup(charsetLabel) | ||||
|  | @ -277,19 +277,21 @@ func ToUTF8WithErr(content []byte) (string, error) { | |||
| 
 | ||||
| 	// If there is an error, we concatenate the nicely decoded part and the
 | ||||
| 	// original left over. This way we won't lose data.
 | ||||
| 	result, n, err := transform.String(encoding.NewDecoder(), string(content)) | ||||
| 	result, n, err := transform.Bytes(encoding.NewDecoder(), content) | ||||
| 	if err != nil { | ||||
| 		result = result + string(content[n:]) | ||||
| 		result = append(result, content[n:]...) | ||||
| 	} | ||||
| 
 | ||||
| 	return result, err | ||||
| 	result = base.RemoveBOMIfPresent(result) | ||||
| 
 | ||||
| 	return string(result), err | ||||
| } | ||||
| 
 | ||||
| // ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible
 | ||||
| func ToUTF8WithFallback(content []byte) []byte { | ||||
| 	charsetLabel, err := base.DetectEncoding(content) | ||||
| 	if err != nil || charsetLabel == "UTF-8" { | ||||
| 		return content | ||||
| 		return base.RemoveBOMIfPresent(content) | ||||
| 	} | ||||
| 
 | ||||
| 	encoding, _ := charset.Lookup(charsetLabel) | ||||
|  | @ -304,7 +306,7 @@ func ToUTF8WithFallback(content []byte) []byte { | |||
| 		return append(result, content[n:]...) | ||||
| 	} | ||||
| 
 | ||||
| 	return result | ||||
| 	return base.RemoveBOMIfPresent(result) | ||||
| } | ||||
| 
 | ||||
| // ToUTF8 converts content to UTF8 encoding and ignore error
 | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue