Pkg: Add fs/README.md to document performance & security improvements

Signed-off-by: Michael Mayer <michael@photoprism.app>
Michael Mayer
2025-11-25 13:09:48 +01:00
parent 897dfe7264
commit 1631aecea6
7 changed files with 294 additions and 28 deletions

pkg/fs/README.md Normal file

@@ -0,0 +1,60 @@
## PhotoPrism — pkg/fs
**Last Updated:** November 25, 2025
### Overview
`pkg/fs` provides safe, cross-platform filesystem helpers used across PhotoPrism. It supplies permission constants, copy/move utilities with force-aware semantics, safe path joins, archive extraction with size limits, MIME and extension lookups, hashing, canonical path casing, and fast directory walking with ignore lists.
#### Goals
- Offer reusable, side-effect-safe filesystem helpers that other packages can call without importing `internal/*`.
- Enforce shared permission defaults (`ModeDir`, `ModeFile`, `ModeConfigFile`, `ModeSecretFile`, `ModeBackupFile`); a short sketch follows this list.
- Protect against common filesystem attacks (path traversal, overwrite of non-empty files without `force`, unsafe zip extraction).
- Provide consistent file-type detection (extensions/MIME), hashing, and fast walkers with skip logic for caches and `.ppstorage` markers.
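For example, a minimal sketch of the intended call pattern, assuming the package is imported from `github.com/photoprism/photoprism/pkg/fs` and that the mode constants are standard `os.FileMode` values (paths are illustrative):

```go
package main

import (
	"os"

	"github.com/photoprism/photoprism/pkg/fs" // import path assumed for illustration
)

func main() {
	// Use the shared permission defaults instead of hard-coded mode bits.
	if err := os.MkdirAll("/tmp/demo/config", fs.ModeDir); err != nil {
		panic(err)
	}

	// Config files use the shared config-file default rather than a literal like 0o644.
	if err := os.WriteFile("/tmp/demo/config/settings.yml", []byte("theme: default\n"), fs.ModeConfigFile); err != nil {
		panic(err)
	}
}
```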
#### Non-Goals
- Database migrations or metadata parsing (handled elsewhere).
- Edition-specific behavior; all helpers are edition-agnostic.
### Package Layout (Code Map)
- Permissions & paths: `mode.go`, `filepath.go`, `canonical.go`, `case.go`.
- Copy/Move & write helpers: `copy_move.go`, `write.go`, `cache.go`, `purge.go`.
- Archive extraction: `zip.go` (size limits, safe join), tests in `zip_test.go`.
- File info & types: `file_type*.go`, `mime.go`, `file_ext*.go`, `name.go`.
- Hashing & IDs: `hash.go`, `id.go`.
- Walkers & ignore rules: `walk.go`, `ignore.go`, `done.go`.
- Utilities: `bytes.go`, `resolve.go`, `symlink.go`, `modtime.go`, `readlines.go`.
### Usage & Test Guidelines
- Overwrite semantics: pass `force=true` only when the caller explicitly confirmed replacement; empty files may be replaced without `force`.
- Permissions: use provided mode constants; do not mix with stdlib `io/fs` bits.
- Zip extraction: always set `fileSizeLimit` / `totalSizeLimit` in `Unzip` for untrusted inputs; ensure tests cover path traversal and size caps (see `zip_test.go` and the usage sketch after this list).
- Focused tests: `go test ./pkg/fs -run 'Copy|Move|Unzip|Write' -count=1` keeps feedback quick; full package: `go test ./pkg/fs -count=1`.
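A minimal usage sketch of the overwrite and extraction semantics above; the import path, file names, and limits are illustrative, while the `Copy` and `Unzip` signatures match this package:

```go
package main

import (
	"log"

	"github.com/photoprism/photoprism/pkg/fs" // import path assumed for illustration
)

func main() {
	// Copy refuses to overwrite a non-empty destination unless force is true.
	if err := fs.Copy("/photos/import/img.jpg", "/photos/originals/img.jpg", false); err != nil {
		log.Fatal(err)
	}

	// For untrusted archives, cap both per-entry and total uncompressed sizes.
	files, skipped, err := fs.Unzip("/tmp/upload.zip", "/tmp/extracted",
		100<<20, // fileSizeLimit: 100 MiB per entry
		1<<30,   // totalSizeLimit: 1 GiB overall (0 or a negative value means unlimited)
	)
	if err != nil {
		log.Fatal(err)
	}

	log.Printf("extracted %d files, skipped %d entries", len(files), len(skipped))
}
```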
### Recent Changes & Improvements
- Hardened `safeJoin`: normalize `\` and `/` separators, reject absolute paths and Windows volume names, and use `filepath.Rel` to catch anything that escapes `baseDir`.
- Added an optional entry-count guard (`MaxUnzipEntries`) to `Unzip`, and changed `totalSizeLimit=0` to mean "no limit", with `-1` documented as unlimited for backward compatibility.
- Added pooled copy buffers (256 KiB) used via `io.CopyBuffer` in `Copy`, `Hash`, `Checksum`, and `WriteFileFromReader` to cut allocations and GC pressure.
#### Pool Copy Buffers
- Read/write iterations per 4GiB file:
- Before: ~131,072 iterations (4GiB / 32KiB).
- After: 16,384 iterations (4GiB / 256KiB).
- ~8× fewer syscalls and loop bookkeeping.
- Latency saved (order-of-magnitude):
- If each read+write pair costs ~2µs of syscall/loop overhead, skipping ~115k iterations saves ≈0.23s on a 4GiB stream.
- On SSD/NVMe where disk I/O dominates, expect a ~5-10% throughput gain; on spinning disks or network mounts with higher syscall cost, closer to 10-20% is realistic.
- CPU-bound hashing (SHA-1) sees mostly overhead reduction; the hash itself stays the dominant cost, but buffer boundary checks and syscalls still drop by ~8×, so a few percent improvement is typical.
- Allocation/GC savings:
- Before: each call allocated a fresh 32KiB buffer; hashing and copy both did this per invocation.
- After: pooled 256KiB buffer reused; effectively zero steady-state allocations for these paths, which is most noticeable when hashing or copying many files in a batch (less GC pressure, fewer pauses).
- Net effect on large video files (several GB):
- Wall-clock improvement: modest but measurable (subsecond on SSDs; up to a couple of seconds on slower media per 4GiB).
- CPU usage: a few percentage points lower due to fewer syscalls and eliminated buffer allocations.
- GC: reduced minor-GC churn during bulk imports/hashes.

pkg/fs/buffer_pool.go Normal file

@@ -0,0 +1,24 @@
package fs

import "sync"

// copyBufferSize defines the shared buffer length used for large file copies/hashes.
const copyBufferSize = 256 * 1024

// copyBufferPool reuses byte slices to reduce allocations during file I/O.
var copyBufferPool = sync.Pool{ //nolint:gochecknoglobals // shared pool for I/O buffers
	New: func() any {
		buf := make([]byte, copyBufferSize)
		return &buf
	},
}

// getCopyBuffer returns a pooled buffer for copy operations.
func getCopyBuffer() []byte {
	return *(copyBufferPool.Get().(*[]byte))
}

// putCopyBuffer returns a buffer to the pool.
func putCopyBuffer(buf []byte) {
	copyBufferPool.Put(&buf)
}
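A micro-benchmark sketch along these lines (hypothetical, not part of this commit) can verify the steady-state allocation savings; the wrapper types hide the optional `WriterTo`/`ReaderFrom` fast paths so that `io.CopyBuffer` actually exercises the supplied buffer:

```go
package fs

import (
	"bytes"
	"io"
	"testing"
)

// plainReader and plainWriter hide the fast-path interfaces that would let
// io.CopyBuffer bypass the buffer entirely.
type plainReader struct{ r io.Reader }

func (p plainReader) Read(b []byte) (int, error) { return p.r.Read(b) }

type plainWriter struct{ w io.Writer }

func (p plainWriter) Write(b []byte) (int, error) { return p.w.Write(b) }

// BenchmarkCopyBuffers contrasts a fresh 32 KiB buffer per call (old behavior)
// with the pooled 256 KiB buffer (new behavior).
func BenchmarkCopyBuffers(b *testing.B) {
	payload := bytes.Repeat([]byte("x"), 8<<20) // 8 MiB copied per iteration

	b.Run("Fresh32KiB", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			buf := make([]byte, 32*1024)
			_, _ = io.CopyBuffer(plainWriter{io.Discard}, plainReader{bytes.NewReader(payload)}, buf)
		}
	})

	b.Run("Pooled256KiB", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			buf := getCopyBuffer()
			_, _ = io.CopyBuffer(plainWriter{io.Discard}, plainReader{bytes.NewReader(payload)}, buf)
			putCopyBuffer(buf)
		}
	})
}
```

Storing `*[]byte` rather than `[]byte` in the pool follows the usual `sync.Pool` idiom (see staticcheck SA6002): it avoids an extra allocation each time a slice header would otherwise be boxed into the pool's `any` values.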

pkg/fs/copy_move.go

@@ -72,7 +72,10 @@ func Copy(src, dest string, force bool) (err error) {
defer destFile.Close()
_, err = io.Copy(destFile, thisFile)
buf := getCopyBuffer()
defer putCopyBuffer(buf)
_, err = io.CopyBuffer(destFile, thisFile, buf)
if err != nil {
return err

pkg/fs/hash.go

@@ -24,7 +24,10 @@ func Hash(fileName string) string {
hash := sha1.New() //nolint:gosec // legacy SHA1 hashes retained for compatibility
if _, err := io.Copy(hash, file); err != nil {
buf := getCopyBuffer()
defer putCopyBuffer(buf)
if _, err = io.CopyBuffer(hash, file, buf); err != nil {
return ""
}
@@ -45,7 +48,10 @@ func Checksum(fileName string) string {
hash := crc32.New(checksum.Crc32Castagnoli)
if _, err := io.Copy(hash, file); err != nil {
buf := getCopyBuffer()
defer putCopyBuffer(buf)
if _, err = io.CopyBuffer(hash, file, buf); err != nil {
return ""
}

pkg/fs/write.go

@@ -72,7 +72,10 @@ func WriteFileFromReader(fileName string, reader io.Reader) (err error) {
return err
}
_, err = io.Copy(file, reader)
buf := getCopyBuffer()
defer putCopyBuffer(buf)
_, err = io.CopyBuffer(file, reader, buf)
if closeErr := file.Close(); closeErr != nil && err == nil {
err = closeErr

pkg/fs/zip.go

@@ -11,6 +11,9 @@ import (
"strings"
)
// MaxUnzipEntries caps the number of entries extracted by Unzip. It may be tuned via config/env later.
var MaxUnzipEntries = 100000
// Zip compresses one or many files into a single zip archive file.
func Zip(zipName string, files []string, compress bool) (err error) {
// Create zip file directory if it does not yet exist.
@@ -94,6 +97,7 @@ func ZipFile(zipWriter *zip.Writer, fileName, fileAlias string, compress bool) (
}
// Unzip extracts the contents of a zip file to the target directory.
// totalSizeLimit: 0 means unlimited; -1 also means unlimited (reserved for backward compatibility).
func Unzip(zipName, dir string, fileSizeLimit, totalSizeLimit int64) (files []string, skipped []string, err error) {
zipReader, err := zip.OpenReader(zipName)
@@ -103,10 +107,21 @@ func Unzip(zipName, dir string, fileSizeLimit, totalSizeLimit int64) (files []st
defer zipReader.Close()
for _, zipFile := range zipReader.File {
// Treat 0 as no limit; negative also unlimited.
if totalSizeLimit == 0 {
totalSizeLimit = -1
}
entryLimit := MaxUnzipEntries
for i, zipFile := range zipReader.File {
if entryLimit > 0 && i >= entryLimit {
return files, skipped, fmt.Errorf("zip entry limit exceeded (%d)", entryLimit)
}
// Skip directories like __OSX and potentially malicious file names containing "..".
if strings.HasPrefix(zipFile.Name, "__") || strings.Contains(zipFile.Name, "..") ||
totalSizeLimit == 0 || fileSizeLimit > 0 && zipFile.UncompressedSize64 > uint64(fileSizeLimit) {
fileSizeLimit > 0 && zipFile.UncompressedSize64 > uint64(fileSizeLimit) {
skipped = append(skipped, zipFile.Name)
continue
}
@@ -218,15 +233,31 @@ func safeJoin(baseDir, name string) (string, error) {
if name == "" {
return "", fmt.Errorf("invalid zip path")
}
// Normalize separators so mixed '/' and '\\' are handled consistently.
name = strings.ReplaceAll(name, "\\", "/")
// Reject Windows-style volume names even on non-Windows platforms.
if len(name) >= 2 && name[1] == ':' && ((name[0] >= 'A' && name[0] <= 'Z') || (name[0] >= 'a' && name[0] <= 'z')) {
return "", fmt.Errorf("invalid zip path: absolute or volume path not allowed")
}
if filepath.IsAbs(name) || filepath.VolumeName(name) != "" {
return "", fmt.Errorf("invalid zip path: absolute or volume path not allowed")
}
cleaned := filepath.Clean(name)
// Prevent paths that resolve outside the base dir.
dest := filepath.Join(baseDir, cleaned)
base := filepath.Clean(baseDir)
if dest != base && !strings.HasPrefix(dest, base+string(os.PathSeparator)) {
dest := filepath.Join(base, cleaned)
rel, err := filepath.Rel(base, dest)
if err != nil {
return "", fmt.Errorf("invalid zip path: %w", err)
}
if rel == ".." || strings.HasPrefix(rel, ".."+string(os.PathSeparator)) {
return "", fmt.Errorf("invalid zip path: outside target directory")
}
return dest, nil
}
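For illustration, a test-style sketch of the intended accept/reject behavior (hypothetical: `safeJoin` is unexported, so this would live inside package `fs`, and the expected path assumes Unix-style separators):

```go
package fs

import "testing"

// TestSafeJoinExamples sketches the behavior hardened above; the exhaustive
// table-driven coverage lives in TestSafeJoin below.
func TestSafeJoinExamples(t *testing.T) {
	// A relative name stays inside the base directory.
	if dest, err := safeJoin("/tmp/base", "photos/2025/img.jpg"); err != nil || dest != "/tmp/base/photos/2025/img.jpg" {
		t.Fatalf("dest=%q err=%v", dest, err)
	}

	// Traversal out of the base directory is rejected via filepath.Rel.
	if _, err := safeJoin("/tmp/base", "../secret.txt"); err == nil {
		t.Fatal("expected traversal to be rejected")
	}

	// Windows volume names are rejected on every platform.
	if _, err := safeJoin("/tmp/base", `C:\Windows\evil.txt`); err == nil {
		t.Fatal("expected volume path to be rejected")
	}
}
```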

pkg/fs/zip_test.go

@@ -3,6 +3,7 @@ package fs
import (
"archive/zip"
"encoding/binary"
"fmt"
"math"
"os"
"path/filepath"
@@ -47,15 +48,18 @@ func TestUnzip_SkipRulesAndLimits(t *testing.T) {
}
writeZip(t, zipPath, entries)
// totalSizeLimit == 0 → skip everything
t.Run("UnlimitedTotalSize", func(t *testing.T) {
files, skipped, err := Unzip(zipPath, filepath.Join(dir, "a"), 0, 0)
assert.NoError(t, err)
assert.Empty(t, files)
assert.GreaterOrEqual(t, len(skipped), 1)
// Apply per-file and total limits
assert.ElementsMatch(t, []string{
filepath.Join(dir, "a", "ok1.txt"),
filepath.Join(dir, "a", "ok2.txt"),
}, files)
assert.GreaterOrEqual(t, len(skipped), 2) // __MACOSX and evil path skipped
})
t.Run("WithEntryAndTotalLimits", func(t *testing.T) {
outDir := filepath.Join(dir, "b")
files, skipped, err = Unzip(zipPath, outDir, 2, 3) // file limit=2 bytes; total limit=3 bytes
files, skipped, err := Unzip(zipPath, outDir, 2, 3) // file limit=2 bytes; total limit=3 bytes
assert.NoError(t, err)
// ok1 (3 bytes) skipped by file limit; evil skipped by '..'; __MACOSX skipped by prefix
@@ -67,6 +71,7 @@ func TestUnzip_SkipRulesAndLimits(t *testing.T) {
assert.Equal(t, []byte("x"), b)
// Skipped contains at least the three excluded entries
assert.GreaterOrEqual(t, len(skipped), 3)
})
}
func TestUnzip_AbsolutePathRejected(t *testing.T) {
@@ -162,6 +167,25 @@ func TestUnzip_SkipsVeryLargeEntry(t *testing.T) {
assert.Contains(t, skipped, "huge.bin")
}
func TestUnzip_EntryLimit(t *testing.T) {
dir := t.TempDir()
zipPath := filepath.Join(dir, "limit.zip")
entries := map[string][]byte{}
for i := 0; i < 5; i++ {
entries[fmt.Sprintf("f%d.txt", i)] = []byte("x")
}
writeZip(t, zipPath, entries)
orig := MaxUnzipEntries
MaxUnzipEntries = 3
defer func() { MaxUnzipEntries = orig }()
_, _, err := Unzip(zipPath, filepath.Join(dir, "out"), 0, 0)
assert.Error(t, err)
assert.Contains(t, err.Error(), "entry limit")
}
// writeZip64Stub writes a minimal ZIP64 archive with one stored entry and custom size values.
func writeZip64Stub(t *testing.T, path, name string, size uint64) {
t.Helper()
@@ -362,3 +386,118 @@ func TestZip(t *testing.T) {
}
})
}
func TestSafeJoin(t *testing.T) {
base := filepath.Clean("/tmp/base")
if runtime.GOOS == "windows" {
base = filepath.Clean(`C:\tmp\base`)
}
type testCase struct {
name string
baseDir string
input string
wantErr bool
wantPath string
}
tests := []testCase{
{
name: "SimpleFile",
baseDir: base,
input: "a.txt",
wantPath: filepath.Join(base, "a.txt"),
},
{
name: "NestedRelative",
baseDir: base,
input: "nested/dir/file.jpg",
wantPath: filepath.Join(base, "nested", "dir", "file.jpg"),
},
{
name: "EmptyName",
baseDir: base,
input: "",
wantErr: true,
},
{
name: "DotDotTraversal",
baseDir: base,
input: "../secret.txt",
wantErr: true,
},
{
name: "MixedSeparatorsTraversal",
baseDir: base,
input: `dir\..\evil.txt`,
wantPath: filepath.Join(base, "evil.txt"),
},
{
name: "ContainsParentInMiddle",
baseDir: base,
input: "dir/../evil.txt",
wantPath: filepath.Join(base, "evil.txt"),
},
{
name: "AbsoluteUnix",
baseDir: base,
input: "/etc/passwd",
wantErr: true,
},
{
name: "WindowsVolumeForwardSlash",
baseDir: base,
input: "C:/Windows/System32/evil.txt",
wantErr: true,
},
{
name: "WindowsVolumeBackslash",
baseDir: base,
input: `D:\Data\evil.txt`,
wantErr: true,
},
{
name: "CleansInsideBase",
baseDir: base,
input: "sub/../ok.txt",
wantPath: filepath.Join(base, "ok.txt"),
},
{
name: "RepeatedSeparators",
baseDir: base,
input: "dir//file.txt",
wantPath: filepath.Join(base, "dir", "file.txt"),
},
{
name: "VolumeNameOnly",
baseDir: base,
input: "C:",
wantErr: true,
},
{
name: "RootedBackslashUnix",
baseDir: base,
input: `\evil.txt`,
wantErr: true,
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
got, err := safeJoin(tc.baseDir, tc.input)
if tc.wantErr {
if err == nil {
t.Fatalf("expected error, got none (path=%q)", got)
}
return
}
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if got != tc.wantPath {
t.Fatalf("unexpected path: got %q want %q", got, tc.wantPath)
}
})
}
}