Do not "guess" the file encoding/BOM when using API to upload files (#25828)
Related issue: #18368. It doesn't seem right to "guess" the file encoding/BOM when the API is used to upload files. The API should save the uploaded content as-is.
This commit is contained in:
parent
d1e066f5d6
commit
22eeede885
@ -4,7 +4,6 @@
|
|||||||
package files
|
package files
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"path"
|
"path"
|
||||||
@ -12,21 +11,15 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"code.gitea.io/gitea/models"
|
"code.gitea.io/gitea/models"
|
||||||
"code.gitea.io/gitea/models/db"
|
|
||||||
git_model "code.gitea.io/gitea/models/git"
|
git_model "code.gitea.io/gitea/models/git"
|
||||||
repo_model "code.gitea.io/gitea/models/repo"
|
repo_model "code.gitea.io/gitea/models/repo"
|
||||||
user_model "code.gitea.io/gitea/models/user"
|
user_model "code.gitea.io/gitea/models/user"
|
||||||
"code.gitea.io/gitea/modules/charset"
|
|
||||||
"code.gitea.io/gitea/modules/git"
|
"code.gitea.io/gitea/modules/git"
|
||||||
"code.gitea.io/gitea/modules/lfs"
|
"code.gitea.io/gitea/modules/lfs"
|
||||||
"code.gitea.io/gitea/modules/log"
|
"code.gitea.io/gitea/modules/log"
|
||||||
"code.gitea.io/gitea/modules/setting"
|
"code.gitea.io/gitea/modules/setting"
|
||||||
"code.gitea.io/gitea/modules/structs"
|
"code.gitea.io/gitea/modules/structs"
|
||||||
"code.gitea.io/gitea/modules/util"
|
|
||||||
asymkey_service "code.gitea.io/gitea/services/asymkey"
|
asymkey_service "code.gitea.io/gitea/services/asymkey"
|
||||||
|
|
||||||
stdcharset "golang.org/x/net/html/charset"
|
|
||||||
"golang.org/x/text/transform"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// IdentityOptions for a person's identity like an author or committer
|
// IdentityOptions for a person's identity like an author or committer
|
||||||
@ -66,78 +59,9 @@ type ChangeRepoFilesOptions struct {
|
|||||||
// RepoFileOptions holds the per-file options that ChangeRepoFiles attaches to
// each change as file.Options: the target and source tree paths, the
// executable flag, and (left/old column only) the encoding and BOM flag.
type RepoFileOptions struct {
|
type RepoFileOptions struct {
|
||||||
treePath string
|
treePath string
|
||||||
fromTreePath string
|
fromTreePath string
|
||||||
// encoding is initialised to "UTF-8"; any other value made the old
// CreateOrUpdateFile code re-encode the content with that charset.
encoding string
|
|
||||||
// bom, when true, made the old CreateOrUpdateFile code prepend
// charset.UTF8BOM to the content.
bom bool
|
|
||||||
executable bool
|
executable bool
|
||||||
}
|
}
|
||||||
|
|
||||||
// detectEncodingAndBOM samples the content behind entry and returns the
// detected charset name plus whether the sample starts with a UTF-8 BOM.
//
// Every failure path (blob read, LFS lookup, detection, charset lookup,
// decoding) returns the default pair ("UTF-8", false) instead of an error.
// repo is only used for its ID when looking up an LFS meta object.
func detectEncodingAndBOM(entry *git.TreeEntry, repo *repo_model.Repository) (string, bool) {
|
|
||||||
reader, err := entry.Blob().DataAsync()
|
|
||||||
if err != nil {
|
|
||||||
// return default
|
|
||||||
return "UTF-8", false
|
|
||||||
}
|
|
||||||
defer reader.Close()
|
|
||||||
// Detection works on a sample of at most 1024 bytes, not the whole blob.
buf := make([]byte, 1024)
|
|
||||||
n, err := util.ReadAtMost(reader, buf)
|
|
||||||
if err != nil {
|
|
||||||
// return default
|
|
||||||
return "UTF-8", false
|
|
||||||
}
|
|
||||||
buf = buf[:n]
|
|
||||||
|
|
||||||
// If the LFS server is enabled and the sample parses as a valid LFS pointer
// with a stored meta object, replace the sample with the first bytes of the
// LFS object so detection runs on the real content, not the pointer text.
if setting.LFS.StartServer {
|
|
||||||
pointer, _ := lfs.ReadPointerFromBuffer(buf)
|
|
||||||
if pointer.IsValid() {
|
|
||||||
meta, err := git_model.GetLFSMetaObjectByOid(db.DefaultContext, repo.ID, pointer.Oid)
|
|
||||||
if err != nil && err != git_model.ErrLFSObjectNotExist {
|
|
||||||
// return default
|
|
||||||
return "UTF-8", false
|
|
||||||
}
|
|
||||||
if meta != nil {
|
|
||||||
dataRc, err := lfs.ReadMetaObject(pointer)
|
|
||||||
if err != nil {
|
|
||||||
// return default
|
|
||||||
return "UTF-8", false
|
|
||||||
}
|
|
||||||
defer dataRc.Close()
|
|
||||||
buf = make([]byte, 1024)
|
|
||||||
n, err = util.ReadAtMost(dataRc, buf)
|
|
||||||
if err != nil {
|
|
||||||
// return default
|
|
||||||
return "UTF-8", false
|
|
||||||
}
|
|
||||||
buf = buf[:n]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
encoding, err := charset.DetectEncoding(buf)
|
|
||||||
if err != nil {
|
|
||||||
// just default to utf-8 and no bom
|
|
||||||
return "UTF-8", false
|
|
||||||
}
|
|
||||||
if encoding == "UTF-8" {
|
|
||||||
// NOTE(review): buf can hold fewer than 3 bytes here; buf[0:3] then reslices
// into the unused tail of the 1024-byte backing array rather than failing.
// bytes.HasPrefix(buf, charset.UTF8BOM) would state the intent directly.
return encoding, bytes.Equal(buf[0:3], charset.UTF8BOM)
|
|
||||||
}
|
|
||||||
// Non-UTF-8 result: resolve the detected name to an encoding; a name with no
// known encoding falls back to the default pair.
charsetEncoding, _ := stdcharset.Lookup(encoding)
|
|
||||||
if charsetEncoding == nil {
|
|
||||||
return "UTF-8", false
|
|
||||||
}
|
|
||||||
|
|
||||||
// Decode the sample so the BOM comparison below runs on the decoded text.
result, n, err := transform.String(charsetEncoding.NewDecoder(), string(buf))
|
|
||||||
if err != nil {
|
|
||||||
// return default
|
|
||||||
return "UTF-8", false
|
|
||||||
}
|
|
||||||
|
|
||||||
// NOTE(review): n is the second value returned by transform.String, not
// len(result); confirm that n > 2 guarantees result has at least 3 bytes
// before the [0:3] reslice.
if n > 2 {
|
|
||||||
return encoding, bytes.Equal([]byte(result)[0:3], charset.UTF8BOM)
|
|
||||||
}
|
|
||||||
|
|
||||||
return encoding, false
|
|
||||||
}
|
|
||||||
|
|
||||||
// ChangeRepoFiles adds, updates or removes multiple files in the given repository
|
// ChangeRepoFiles adds, updates or removes multiple files in the given repository
|
||||||
func ChangeRepoFiles(ctx context.Context, repo *repo_model.Repository, doer *user_model.User, opts *ChangeRepoFilesOptions) (*structs.FilesResponse, error) {
|
func ChangeRepoFiles(ctx context.Context, repo *repo_model.Repository, doer *user_model.User, opts *ChangeRepoFilesOptions) (*structs.FilesResponse, error) {
|
||||||
// If no branch name is set, assume default branch
|
// If no branch name is set, assume default branch
|
||||||
@ -184,8 +108,6 @@ func ChangeRepoFiles(ctx context.Context, repo *repo_model.Repository, doer *use
|
|||||||
file.Options = &RepoFileOptions{
|
file.Options = &RepoFileOptions{
|
||||||
treePath: treePath,
|
treePath: treePath,
|
||||||
fromTreePath: fromTreePath,
|
fromTreePath: fromTreePath,
|
||||||
encoding: "UTF-8",
|
|
||||||
bom: false,
|
|
||||||
executable: false,
|
executable: false,
|
||||||
}
|
}
|
||||||
treePaths = append(treePaths, treePath)
|
treePaths = append(treePaths, treePath)
|
||||||
@ -381,7 +303,6 @@ func handleCheckErrors(file *ChangeRepoFile, commit *git.Commit, opts *ChangeRep
|
|||||||
// haven't been made. We throw an error if one wasn't provided.
|
// haven't been made. We throw an error if one wasn't provided.
|
||||||
return models.ErrSHAOrCommitIDNotProvided{}
|
return models.ErrSHAOrCommitIDNotProvided{}
|
||||||
}
|
}
|
||||||
file.Options.encoding, file.Options.bom = detectEncodingAndBOM(fromEntry, repo)
|
|
||||||
file.Options.executable = fromEntry.IsExecutable()
|
file.Options.executable = fromEntry.IsExecutable()
|
||||||
}
|
}
|
||||||
if file.Operation == "create" || file.Operation == "update" {
|
if file.Operation == "create" || file.Operation == "update" {
|
||||||
@ -466,28 +387,8 @@ func CreateOrUpdateFile(ctx context.Context, t *TemporaryUploadRepository, file
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
content := file.Content
|
treeObjectContent := file.Content
|
||||||
if file.Options.bom {
|
|
||||||
content = string(charset.UTF8BOM) + content
|
|
||||||
}
|
|
||||||
if file.Options.encoding != "UTF-8" {
|
|
||||||
charsetEncoding, _ := stdcharset.Lookup(file.Options.encoding)
|
|
||||||
if charsetEncoding != nil {
|
|
||||||
result, _, err := transform.String(charsetEncoding.NewEncoder(), content)
|
|
||||||
if err != nil {
|
|
||||||
// Look if we can't encode back in to the original we should just stick with utf-8
|
|
||||||
log.Error("Error re-encoding %s (%s) as %s - will stay as UTF-8: %v", file.TreePath, file.FromTreePath, file.Options.encoding, err)
|
|
||||||
result = content
|
|
||||||
}
|
|
||||||
content = result
|
|
||||||
} else {
|
|
||||||
log.Error("Unknown encoding: %s", file.Options.encoding)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Reset the opts.Content to our adjusted content to ensure that LFS gets the correct content
|
|
||||||
file.Content = content
|
|
||||||
var lfsMetaObject *git_model.LFSMetaObject
|
var lfsMetaObject *git_model.LFSMetaObject
|
||||||
|
|
||||||
if setting.LFS.StartServer && hasOldBranch {
|
if setting.LFS.StartServer && hasOldBranch {
|
||||||
// Check there is no way this can return multiple infos
|
// Check there is no way this can return multiple infos
|
||||||
filename2attribute2info, err := t.gitRepo.CheckAttribute(git.CheckAttributeOpts{
|
filename2attribute2info, err := t.gitRepo.CheckAttribute(git.CheckAttributeOpts{
|
||||||
@ -506,12 +407,12 @@ func CreateOrUpdateFile(ctx context.Context, t *TemporaryUploadRepository, file
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
lfsMetaObject = &git_model.LFSMetaObject{Pointer: pointer, RepositoryID: repoID}
|
lfsMetaObject = &git_model.LFSMetaObject{Pointer: pointer, RepositoryID: repoID}
|
||||||
content = pointer.StringContent()
|
treeObjectContent = pointer.StringContent()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add the object to the database
|
// Add the object to the database
|
||||||
objectHash, err := t.HashObject(strings.NewReader(content))
|
objectHash, err := t.HashObject(strings.NewReader(treeObjectContent))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user