From fd8faeb0e6289f6b56699c3fceb102f449d3de53 Mon Sep 17 00:00:00 2001 From: nielash Date: Mon, 5 Feb 2024 02:58:11 -0500 Subject: [PATCH] vfs: fix unicode normalization on macOS - fixes #7072 Before this change, the VFS layer did not properly handle unicode normalization, which caused problems particularly for users of macOS. While attempts were made to handle it with various `-o modules=iconv` combinations, this was an imperfect solution, as no one combination allowed both NFC and NFD content to simultaneously be both visible and editable via Finder. After this change, the VFS supports `--no-unicode-normalization` (default `false`) via the existing `--vfs-case-insensitive` logic, which is extended to apply to both case insensitivity and unicode normalization form. This change also adds an additional flag, `--vfs-block-norm-dupes`, to address a probably rare but potentially possible scenario where a directory contains multiple duplicate filenames after applying case and unicode normalization settings. In such a scenario, this flag (disabled by default) hides the duplicates. This comes with a performance tradeoff, as rclone will have to scan the entire directory for duplicates when listing a directory. For this reason, it is recommended to leave this disabled if not needed. However, macOS users may wish to consider using it, as otherwise, if a remote directory contains both NFC and NFD versions of the same filename, an odd situation will occur: both versions of the file will be visible in the mount, and both will appear to be editable, however, editing either version will actually result in only the NFD version getting edited under the hood. `--vfs-block-norm-dupes` prevents this confusion by detecting this scenario, hiding the duplicates, and logging an error, similar to how this is handled in `rclone sync`. --- cmd/cmount/mount.go | 7 ------ cmd/mountlib/mount.md | 16 ++++++-------- fs/operations/check.go | 13 ++++++++++-- vfs/dir.go | 46 ++++++++++++++++++++++++++++++++++------ vfs/vfs.md | 22 +++++++++++++++++++ vfs/vfs_case_test.go | 35 ++++++++++++++++++++++++++++++ vfs/vfscommon/options.go | 1 + vfs/vfsflags/vfsflags.go | 1 + 8 files changed, 115 insertions(+), 26 deletions(-) diff --git a/cmd/cmount/mount.go b/cmd/cmount/mount.go index f6d86bdb5c5d9..d6d684d0496c8 100644 --- a/cmd/cmount/mount.go +++ b/cmd/cmount/mount.go @@ -116,13 +116,6 @@ func mountOptions(VFS *vfs.VFS, device string, mountpoint string, opt *mountlib. for _, option := range opt.ExtraFlags { options = append(options, option) } - if runtime.GOOS == "darwin" { - if !findOption("modules=iconv", options) { - iconv := "modules=iconv,from_code=UTF-8,to_code=UTF-8-MAC" - options = append(options, "-o", iconv) - fs.Debugf(nil, "Adding \"-o %s\" for macOS", iconv) - } - } return options } diff --git a/cmd/mountlib/mount.md b/cmd/mountlib/mount.md index 6b3ccc6209554..0e38323b379bf 100644 --- a/cmd/mountlib/mount.md +++ b/cmd/mountlib/mount.md @@ -257,7 +257,12 @@ Mounting on macOS can be done either via [built-in NFS server](/commands/rclone_ FUSE driver utilizing a macOS kernel extension (kext). FUSE-T is an alternative FUSE system which "mounts" via an NFSv4 local server. -## NFS mount +##### Unicode Normalization + +It is highly recommended to keep the default of `--no-unicode-normalization=false` +for all `mount` and `serve` commands on macOS. For details, see [vfs-case-sensitivity](https://rclone.org/commands/rclone_mount/#vfs-case-sensitivity). + +#### NFS mount This method spins up an NFS server using [serve nfs](/commands/rclone_serve_nfs/) command and mounts it to the specified mountpoint. If you run this in background mode using |--daemon|, you will need to @@ -294,15 +299,6 @@ As per the [FUSE-T wiki](https://github.com/macos-fuse-t/fuse-t/wiki#caveats): This means that viewing files with various tools, notably macOS Finder, will cause rlcone to update the modification time of the file. This may make rclone upload a full new copy of the file. - -##### Unicode Normalization - -Rclone includes flags for unicode normalization with macFUSE that should be updated -for FUSE-T. See [this forum post](https://forum.rclone.org/t/some-unicode-forms-break-mount-on-macos-with-fuse-t/36403) -and [FUSE-T issue #16](https://github.com/macos-fuse-t/fuse-t/issues/16). The following -flag should be added to the `rclone mount` command. - - -o modules=iconv,from_code=UTF-8,to_code=UTF-8 ##### Read Only mounts diff --git a/fs/operations/check.go b/fs/operations/check.go index 2279b3fe6d3b0..b9fe36aabe2d2 100644 --- a/fs/operations/check.go +++ b/fs/operations/check.go @@ -380,10 +380,19 @@ func CheckDownload(ctx context.Context, opt *CheckOpt) error { // so that it matches behavior of Check (where it's handled by March) func ApplyTransforms(ctx context.Context, s string) string { ci := fs.GetConfig(ctx) - if !ci.NoUnicodeNormalization { + return ToNormal(s, !ci.NoUnicodeNormalization, ci.IgnoreCaseSync) +} + +// ToNormal normalizes case and unicode form and returns the transformed string. +// It is similar to ApplyTransforms but does not use a context. +// If normUnicode == true, s will be transformed to NFC. +// If normCase == true, s will be transformed to lowercase. +// If both are true, both transformations will be performed. +func ToNormal(s string, normUnicode, normCase bool) string { + if normUnicode { s = norm.NFC.String(s) } - if ci.IgnoreCaseSync { + if normCase { s = strings.ToLower(s) } return s diff --git a/vfs/dir.go b/vfs/dir.go index 5044ee855d2fb..47ffd8aaa5ad3 100644 --- a/vfs/dir.go +++ b/vfs/dir.go @@ -18,6 +18,7 @@ import ( "github.com/rclone/rclone/fs/operations" "github.com/rclone/rclone/fs/walk" "github.com/rclone/rclone/vfs/vfscommon" + "golang.org/x/text/unicode/norm" ) // Dir represents a directory entry @@ -466,7 +467,6 @@ func (d *Dir) AddVirtual(leaf string, size int64, isDir bool) { node = f } d.addObject(node) - } // delObject removes an object from the directory @@ -514,6 +514,35 @@ func (d *Dir) _readDir() error { return err } + if d.vfs.Opt.BlockNormDupes { // do this only if requested, as it will have a performance hit + ci := fs.GetConfig(context.TODO()) + + // sort entries such that NFD comes before NFC of same name + sort.Slice(entries, func(i, j int) bool { + if entries[i] != entries[j] && fs.DirEntryType(entries[i]) == fs.DirEntryType(entries[j]) && norm.NFC.String(entries[i].Remote()) == norm.NFC.String(entries[j].Remote()) { + if norm.NFD.IsNormalString(entries[i].Remote()) && !norm.NFD.IsNormalString(entries[j].Remote()) { + return true + } + } + return entries.Less(i, j) + }) + + // detect dupes, remove them from the list and log an error + normalizedNames := make(map[string]struct{}, entries.Len()) + filteredEntries := make(fs.DirEntries, 0) + for _, e := range entries { + normName := fmt.Sprintf("%s-%T", operations.ToNormal(e.Remote(), !ci.NoUnicodeNormalization, (ci.IgnoreCaseSync || d.vfs.Opt.CaseInsensitive)), e) // include type to track objects and dirs separately + _, found := normalizedNames[normName] + if found { + fs.Errorf(e.Remote(), "duplicate normalized names detected - skipping") + continue + } + normalizedNames[normName] = struct{}{} + filteredEntries = append(filteredEntries, e) + } + entries = filteredEntries + } + err = d._readDirFromEntries(entries, nil, time.Time{}) if err != nil { return err @@ -767,15 +796,18 @@ func (d *Dir) stat(leaf string) (Node, error) { } item, ok := d.items[leaf] - if !ok && d.vfs.Opt.CaseInsensitive { - leafLower := strings.ToLower(leaf) + ci := fs.GetConfig(context.TODO()) + normUnicode := !ci.NoUnicodeNormalization + normCase := ci.IgnoreCaseSync || d.vfs.Opt.CaseInsensitive + if !ok && (normUnicode || normCase) { + leafNormalized := operations.ToNormal(leaf, normUnicode, normCase) // this handles both case and unicode normalization for name, node := range d.items { - if strings.ToLower(name) == leafLower { + if operations.ToNormal(name, normUnicode, normCase) == leafNormalized { if ok { - // duplicate case insensitive match is an error - return nil, fmt.Errorf("duplicate filename %q detected with --vfs-case-insensitive set", leaf) + // duplicate normalized match is an error + return nil, fmt.Errorf("duplicate filename %q detected with case/unicode normalization settings", leaf) } - // found a case insensitive match + // found a normalized match ok = true item = node } diff --git a/vfs/vfs.md b/vfs/vfs.md index dba7afa0a0d40..8d53507646d70 100644 --- a/vfs/vfs.md +++ b/vfs/vfs.md @@ -309,6 +309,28 @@ If the flag is not provided on the command line, then its default value depends on the operating system where rclone runs: "true" on Windows and macOS, "false" otherwise. If the flag is provided without a value, then it is "true". +The `--no-unicode-normalization` flag controls whether a similar "fixup" is +performed for filenames that differ but are [canonically +equivalent](https://en.wikipedia.org/wiki/Unicode_equivalence) with respect to +unicode. Unicode normalization can be particularly helpful for users of macOS, +which prefers form NFD instead of the NFC used by most other platforms. It is +therefore highly recommended to keep the default of `false` on macOS, to avoid +encoding compatibility issues. + +In the (probably unlikely) event that a directory has multiple duplicate +filenames after applying case and unicode normalization, the `--vfs-block-norm-dupes` +flag allows hiding these duplicates. This comes with a performance tradeoff, as +rclone will have to scan the entire directory for duplicates when listing a +directory. For this reason, it is recommended to leave this disabled if not +needed. However, macOS users may wish to consider using it, as otherwise, if a +remote directory contains both NFC and NFD versions of the same filename, an odd +situation will occur: both versions of the file will be visible in the mount, +and both will appear to be editable, however, editing either version will +actually result in only the NFD version getting edited under the hood. `--vfs-block- +norm-dupes` prevents this confusion by detecting this scenario, hiding the +duplicates, and logging an error, similar to how this is handled in `rclone +sync`. + ### VFS Disk Options This flag allows you to manually set the statistics about the filing system. diff --git a/vfs/vfs_case_test.go b/vfs/vfs_case_test.go index 5ba81a169b7c7..7641cf89fac6a 100644 --- a/vfs/vfs_case_test.go +++ b/vfs/vfs_case_test.go @@ -5,10 +5,12 @@ import ( "os" "testing" + "github.com/rclone/rclone/fs" "github.com/rclone/rclone/fstest" "github.com/rclone/rclone/vfs/vfscommon" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "golang.org/x/text/unicode/norm" ) func TestCaseSensitivity(t *testing.T) { @@ -160,3 +162,36 @@ func assertFileAbsentVFS(t *testing.T, vfs *VFS, name string) { assert.Error(t, err) assert.Equal(t, err, ENOENT) } + +func TestUnicodeNormalization(t *testing.T) { + r := fstest.NewRun(t) + + var ( + nfc = norm.NFC.String(norm.NFD.String("測試_Русский___ě_áñ")) + nfd = norm.NFD.String(nfc) + both = "normal name with no special characters.txt" + ) + + // Create test files + ctx := context.Background() + file1 := r.WriteObject(ctx, both, "data1", t1) + file2 := r.WriteObject(ctx, nfc, "data2", t2) + r.CheckRemoteItems(t, file1, file2) + + // Create VFS + opt := vfscommon.DefaultOpt + vfs := New(r.Fremote, &opt) + defer cleanupVFS(t, vfs) + + // assert that both files are found under NFD-normalized names + assertFileDataVFS(t, vfs, norm.NFD.String(both), "data1") + assertFileDataVFS(t, vfs, nfd, "data2") + + // change ci.NoUnicodeNormalization to true and verify that only file1 is found + ci := fs.GetConfig(ctx) // need to set the global config here as the *Dir methods don't take a ctx param + oldVal := ci.NoUnicodeNormalization + defer func() { fs.GetConfig(ctx).NoUnicodeNormalization = oldVal }() // restore the prior value after the test + ci.NoUnicodeNormalization = true + assertFileDataVFS(t, vfs, norm.NFD.String(both), "data1") + assertFileAbsentVFS(t, vfs, nfd) +} diff --git a/vfs/vfscommon/options.go b/vfs/vfscommon/options.go index bb5f3313a4aec..f1e4368474ff8 100644 --- a/vfs/vfscommon/options.go +++ b/vfs/vfscommon/options.go @@ -30,6 +30,7 @@ type Options struct { CacheMinFreeSpace fs.SizeSuffix CachePollInterval time.Duration CaseInsensitive bool + BlockNormDupes bool WriteWait time.Duration // time to wait for in-sequence write ReadWait time.Duration // time to wait for in-sequence read WriteBack time.Duration // time to wait before writing back dirty files diff --git a/vfs/vfsflags/vfsflags.go b/vfs/vfsflags/vfsflags.go index 83b593d295f5a..1dedd891818b6 100644 --- a/vfs/vfsflags/vfsflags.go +++ b/vfs/vfsflags/vfsflags.go @@ -35,6 +35,7 @@ func AddFlags(flagSet *pflag.FlagSet) { flags.FVarP(flagSet, DirPerms, "dir-perms", "", "Directory permissions", "VFS") flags.FVarP(flagSet, FilePerms, "file-perms", "", "File permissions", "VFS") flags.BoolVarP(flagSet, &Opt.CaseInsensitive, "vfs-case-insensitive", "", Opt.CaseInsensitive, "If a file name not found, find a case insensitive match", "VFS") + flags.BoolVarP(flagSet, &Opt.BlockNormDupes, "vfs-block-norm-dupes", "", Opt.BlockNormDupes, "If duplicate filenames exist in the same directory (after normalization), log an error and hide the duplicates (may have a performance cost)", "VFS") flags.DurationVarP(flagSet, &Opt.WriteWait, "vfs-write-wait", "", Opt.WriteWait, "Time to wait for in-sequence write before giving error", "VFS") flags.DurationVarP(flagSet, &Opt.ReadWait, "vfs-read-wait", "", Opt.ReadWait, "Time to wait for in-sequence read before seeking", "VFS") flags.DurationVarP(flagSet, &Opt.WriteBack, "vfs-write-back", "", Opt.WriteBack, "Time to writeback files after last use when using cache", "VFS")