Pkg: Add fs/README.md to document performance & security improvements

Signed-off-by: Michael Mayer <michael@photoprism.app>
Michael Mayer
2025-11-25 13:09:48 +01:00
parent 897dfe7264
commit 1631aecea6
7 changed files with 294 additions and 28 deletions

pkg/fs/README.md Normal file

@@ -0,0 +1,60 @@
## PhotoPrism — pkg/fs
**Last Updated:** November 25, 2025
### Overview
`pkg/fs` provides safe, cross-platform filesystem helpers used across PhotoPrism. It supplies permission constants, copy/move utilities with force-aware semantics, safe path joins, archive extraction with size limits, MIME and extension lookups, hashing, canonical path casing, and fast directory walking with ignore lists.
#### Goals
- Offer reusable, side-effect-safe filesystem helpers that other packages can call without importing `internal/*`.
- Enforce shared permission defaults (`ModeDir`, `ModeFile`, `ModeConfigFile`, `ModeSecretFile`, `ModeBackupFile`); a short sketch follows this list.
- Protect against common filesystem attacks (path traversal, overwrite of non-empty files without `force`, unsafe zip extraction).
- Provide consistent file-type detection (extensions/MIME), hashing, and fast walkers with skip logic for caches and `.ppstorage` markers.
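For example, a minimal sketch of the intended call pattern, assuming the package is imported from `github.com/photoprism/photoprism/pkg/fs` and that the mode constants are standard `os.FileMode` values (paths are illustrative):

```go
package main

import (
	"os"

	"github.com/photoprism/photoprism/pkg/fs" // import path assumed for illustration
)

func main() {
	// Use the shared permission defaults instead of hard-coded mode bits.
	if err := os.MkdirAll("/tmp/demo/config", fs.ModeDir); err != nil {
		panic(err)
	}

	// Config files use the shared config-file default rather than a literal like 0o644.
	if err := os.WriteFile("/tmp/demo/config/settings.yml", []byte("theme: default\n"), fs.ModeConfigFile); err != nil {
		panic(err)
	}
}
```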
#### Non-Goals
- Database migrations or metadata parsing (handled elsewhere).
- Edition-specific behavior; all helpers are edition-agnostic.
### Package Layout (Code Map)
- Permissions & paths: `mode.go`, `filepath.go`, `canonical.go`, `case.go`.
- Copy/Move & write helpers: `copy_move.go`, `write.go`, `cache.go`, `purge.go`.
- Archive extraction: `zip.go` (size limits, safe join), tests in `zip_test.go`.
- File info & types: `file_type*.go`, `mime.go`, `file_ext*.go`, `name.go`.
- Hashing & IDs: `hash.go`, `id.go`.
- Walkers & ignore rules: `walk.go`, `ignore.go`, `done.go`.
- Utilities: `bytes.go`, `resolve.go`, `symlink.go`, `modtime.go`, `readlines.go`.
### Usage & Test Guidelines
- Overwrite semantics: pass `force=true` only when the caller explicitly confirmed replacement; empty files may be replaced without `force`.
- Permissions: use provided mode constants; do not mix with stdlib `io/fs` bits.
- Zip extraction: always set `fileSizeLimit` / `totalSizeLimit` in `Unzip` for untrusted inputs; ensure tests cover path traversal and size caps (see `zip_test.go` and the usage sketch after this list).
- Focused tests: `go test ./pkg/fs -run 'Copy|Move|Unzip|Write' -count=1` keeps feedback quick; full package: `go test ./pkg/fs -count=1`.
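A minimal usage sketch of the overwrite and extraction semantics above; the import path, file names, and limits are illustrative, while the `Copy` and `Unzip` signatures match this package:

```go
package main

import (
	"log"

	"github.com/photoprism/photoprism/pkg/fs" // import path assumed for illustration
)

func main() {
	// Copy refuses to overwrite a non-empty destination unless force is true.
	if err := fs.Copy("/photos/import/img.jpg", "/photos/originals/img.jpg", false); err != nil {
		log.Fatal(err)
	}

	// For untrusted archives, cap both per-entry and total uncompressed sizes.
	files, skipped, err := fs.Unzip("/tmp/upload.zip", "/tmp/extracted",
		100<<20, // fileSizeLimit: 100 MiB per entry
		1<<30,   // totalSizeLimit: 1 GiB overall (0 or a negative value means unlimited)
	)
	if err != nil {
		log.Fatal(err)
	}

	log.Printf("extracted %d files, skipped %d entries", len(files), len(skipped))
}
```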
### Recent Changes & Improvements
- Hardened `safeJoin`: normalize `\` and `/` separators, reject absolute paths and Windows volume names, and use `filepath.Rel` to catch anything that escapes `baseDir`.
- Added an optional entry-count guard (`MaxUnzipEntries`) to `Unzip`, and changed `totalSizeLimit=0` to mean "no limit", with `-1` documented as unlimited for backward compatibility.
- Added pooled copy buffers (256 KiB) used via `io.CopyBuffer` in `Copy`, `Hash`, `Checksum`, and `WriteFileFromReader` to cut allocations and GC pressure.
#### Pool Copy Buffers
- Read/write iterations per 4GiB file:
- Before: ~131,072 iterations (4GiB / 32KiB).
- After: 16,384 iterations (4GiB / 256KiB).
- ~8× fewer syscalls and loop bookkeeping.
- Latency saved (order-of-magnitude):
- If each read+write pair costs ~2µs of syscall/loop overhead, skipping ~115k iterations saves ≈0.23s on a 4GiB stream.
- On SSD/NVMe where disk I/O dominates, expect a ~5-10% throughput gain; on spinning disks or network mounts with higher syscall cost, closer to 10-20% is realistic.
- CPU-bound hashing (SHA-1) sees mostly overhead reduction; the hash itself stays the dominant cost, but buffer boundary checks and syscalls still drop by ~8×, so a few percent improvement is typical.
- Allocation/GC savings:
- Before: each call allocated a fresh 32KiB buffer; hashing and copy both did this per invocation.
- After: pooled 256KiB buffer reused; effectively zero steady-state allocations for these paths, which is most noticeable when hashing or copying many files in a batch (less GC pressure, fewer pauses).
- Net effect on large video files (several GB):
- Wall-clock improvement: modest but measurable (subsecond on SSDs; up to a couple of seconds on slower media per 4GiB).
- CPU usage: a few percentage points lower due to fewer syscalls and eliminated buffer allocations.
- GC: reduced minor-GC churn during bulk imports/hashes.

pkg/fs/buffer_pool.go Normal file

@@ -0,0 +1,24 @@
package fs

import "sync"

// copyBufferSize defines the shared buffer length used for large file copies/hashes.
const copyBufferSize = 256 * 1024

// copyBufferPool reuses byte slices to reduce allocations during file I/O.
var copyBufferPool = sync.Pool{ //nolint:gochecknoglobals // shared pool for I/O buffers
	New: func() any {
		buf := make([]byte, copyBufferSize)
		return &buf
	},
}

// getCopyBuffer returns a pooled buffer for copy operations.
func getCopyBuffer() []byte {
	return *(copyBufferPool.Get().(*[]byte))
}

// putCopyBuffer returns a buffer to the pool.
func putCopyBuffer(buf []byte) {
	copyBufferPool.Put(&buf)
}
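A micro-benchmark sketch along these lines (hypothetical, not part of this commit) can verify the steady-state allocation savings; the wrapper types hide the optional `WriterTo`/`ReaderFrom` fast paths so that `io.CopyBuffer` actually exercises the supplied buffer:

```go
package fs

import (
	"bytes"
	"io"
	"testing"
)

// plainReader and plainWriter hide the fast-path interfaces that would let
// io.CopyBuffer bypass the buffer entirely.
type plainReader struct{ r io.Reader }

func (p plainReader) Read(b []byte) (int, error) { return p.r.Read(b) }

type plainWriter struct{ w io.Writer }

func (p plainWriter) Write(b []byte) (int, error) { return p.w.Write(b) }

// BenchmarkCopyBuffers contrasts a fresh 32 KiB buffer per call (old behavior)
// with the pooled 256 KiB buffer (new behavior).
func BenchmarkCopyBuffers(b *testing.B) {
	payload := bytes.Repeat([]byte("x"), 8<<20) // 8 MiB copied per iteration

	b.Run("Fresh32KiB", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			buf := make([]byte, 32*1024)
			_, _ = io.CopyBuffer(plainWriter{io.Discard}, plainReader{bytes.NewReader(payload)}, buf)
		}
	})

	b.Run("Pooled256KiB", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			buf := getCopyBuffer()
			_, _ = io.CopyBuffer(plainWriter{io.Discard}, plainReader{bytes.NewReader(payload)}, buf)
			putCopyBuffer(buf)
		}
	})
}
```

Storing `*[]byte` rather than `[]byte` in the pool follows the usual `sync.Pool` idiom (see staticcheck SA6002): it avoids an extra allocation each time a slice header would otherwise be boxed into the pool's `any` values.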

pkg/fs/copy_move.go

@@ -72,7 +72,10 @@ func Copy(src, dest string, force bool) (err error) {
defer destFile.Close()
_, err = io.Copy(destFile, thisFile)
buf := getCopyBuffer()
defer putCopyBuffer(buf)
_, err = io.CopyBuffer(destFile, thisFile, buf)
if err != nil {
return err

pkg/fs/hash.go

@@ -24,7 +24,10 @@ func Hash(fileName string) string {
hash := sha1.New() //nolint:gosec // legacy SHA1 hashes retained for compatibility
if _, err := io.Copy(hash, file); err != nil {
buf := getCopyBuffer()
defer putCopyBuffer(buf)
if _, err = io.CopyBuffer(hash, file, buf); err != nil {
return ""
}
@@ -45,7 +48,10 @@ func Checksum(fileName string) string {
hash := crc32.New(checksum.Crc32Castagnoli)
if _, err := io.Copy(hash, file); err != nil {
buf := getCopyBuffer()
defer putCopyBuffer(buf)
if _, err = io.CopyBuffer(hash, file, buf); err != nil {
return ""
}

pkg/fs/write.go

@@ -72,7 +72,10 @@ func WriteFileFromReader(fileName string, reader io.Reader) (err error) {
return err
}
_, err = io.Copy(file, reader)
buf := getCopyBuffer()
defer putCopyBuffer(buf)
_, err = io.CopyBuffer(file, reader, buf)
if closeErr := file.Close(); closeErr != nil && err == nil {
err = closeErr

pkg/fs/zip.go

@@ -11,6 +11,9 @@ import (
"strings"
)
// MaxUnzipEntries caps the number of entries extracted by Unzip. It may be tuned via config/env later.
var MaxUnzipEntries = 100000
// Zip compresses one or many files into a single zip archive file.
func Zip(zipName string, files []string, compress bool) (err error) {
// Create zip file directory if it does not yet exist.
@@ -94,6 +97,7 @@ func ZipFile(zipWriter *zip.Writer, fileName, fileAlias string, compress bool) (
}
// Unzip extracts the contents of a zip file to the target directory.
// totalSizeLimit: 0 means unlimited; -1 also means unlimited (reserved for backward compatibility).
func Unzip(zipName, dir string, fileSizeLimit, totalSizeLimit int64) (files []string, skipped []string, err error) {
zipReader, err := zip.OpenReader(zipName)
@@ -103,10 +107,21 @@ func Unzip(zipName, dir string, fileSizeLimit, totalSizeLimit int64) (files []st
defer zipReader.Close()
for _, zipFile := range zipReader.File {
// Treat 0 as no limit; negative also unlimited.
if totalSizeLimit == 0 {
totalSizeLimit = -1
}
entryLimit := MaxUnzipEntries
for i, zipFile := range zipReader.File {
if entryLimit > 0 && i >= entryLimit {
return files, skipped, fmt.Errorf("zip entry limit exceeded (%d)", entryLimit)
}
// Skip directories like __OSX and potentially malicious file names containing "..".
if strings.HasPrefix(zipFile.Name, "__") || strings.Contains(zipFile.Name, "..") ||
totalSizeLimit == 0 || fileSizeLimit > 0 && zipFile.UncompressedSize64 > uint64(fileSizeLimit) {
fileSizeLimit > 0 && zipFile.UncompressedSize64 > uint64(fileSizeLimit) {
skipped = append(skipped, zipFile.Name)
continue
}
@@ -218,15 +233,31 @@ func safeJoin(baseDir, name string) (string, error) {
if name == "" {
return "", fmt.Errorf("invalid zip path")
}
// Normalize separators so mixed '/' and '\\' are handled consistently.
name = strings.ReplaceAll(name, "\\", "/")
// Reject Windows-style volume names even on non-Windows platforms.
if len(name) >= 2 && name[1] == ':' && ((name[0] >= 'A' && name[0] <= 'Z') || (name[0] >= 'a' && name[0] <= 'z')) {
return "", fmt.Errorf("invalid zip path: absolute or volume path not allowed")
}
if filepath.IsAbs(name) || filepath.VolumeName(name) != "" {
return "", fmt.Errorf("invalid zip path: absolute or volume path not allowed")
}
cleaned := filepath.Clean(name)
// Prevent paths that resolve outside the base dir.
dest := filepath.Join(baseDir, cleaned)
base := filepath.Clean(baseDir)
if dest != base && !strings.HasPrefix(dest, base+string(os.PathSeparator)) {
dest := filepath.Join(base, cleaned)
rel, err := filepath.Rel(base, dest)
if err != nil {
return "", fmt.Errorf("invalid zip path: %w", err)
}
if rel == ".." || strings.HasPrefix(rel, ".."+string(os.PathSeparator)) {
return "", fmt.Errorf("invalid zip path: outside target directory")
}
return dest, nil
}
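For illustration, a test-style sketch of the intended accept/reject behavior (hypothetical: `safeJoin` is unexported, so this would live inside package `fs`, and the expected path assumes Unix-style separators):

```go
package fs

import "testing"

// TestSafeJoinExamples sketches the behavior hardened above; the exhaustive
// table-driven coverage lives in TestSafeJoin below.
func TestSafeJoinExamples(t *testing.T) {
	// A relative name stays inside the base directory.
	if dest, err := safeJoin("/tmp/base", "photos/2025/img.jpg"); err != nil || dest != "/tmp/base/photos/2025/img.jpg" {
		t.Fatalf("dest=%q err=%v", dest, err)
	}

	// Traversal out of the base directory is rejected via filepath.Rel.
	if _, err := safeJoin("/tmp/base", "../secret.txt"); err == nil {
		t.Fatal("expected traversal to be rejected")
	}

	// Windows volume names are rejected on every platform.
	if _, err := safeJoin("/tmp/base", `C:\Windows\evil.txt`); err == nil {
		t.Fatal("expected volume path to be rejected")
	}
}
```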

pkg/fs/zip_test.go

@@ -3,6 +3,7 @@ package fs
import (
"archive/zip"
"encoding/binary"
"fmt"
"math"
"os"
"path/filepath"
@@ -47,15 +48,18 @@ func TestUnzip_SkipRulesAndLimits(t *testing.T) {
}
writeZip(t, zipPath, entries)
// totalSizeLimit == 0 → skip everything
t.Run("UnlimitedTotalSize", func(t *testing.T) {
files, skipped, err := Unzip(zipPath, filepath.Join(dir, "a"), 0, 0)
assert.NoError(t, err)
assert.Empty(t, files)
assert.GreaterOrEqual(t, len(skipped), 1)
// Apply per-file and total limits
assert.ElementsMatch(t, []string{
filepath.Join(dir, "a", "ok1.txt"),
filepath.Join(dir, "a", "ok2.txt"),
}, files)
assert.GreaterOrEqual(t, len(skipped), 2) // __MACOSX and evil path skipped
})
t.Run("WithEntryAndTotalLimits", func(t *testing.T) {
outDir := filepath.Join(dir, "b")
files, skipped, err = Unzip(zipPath, outDir, 2, 3) // file limit=2 bytes; total limit=3 bytes
files, skipped, err := Unzip(zipPath, outDir, 2, 3) // file limit=2 bytes; total limit=3 bytes
assert.NoError(t, err)
// ok1 (3 bytes) skipped by file limit; evil skipped by '..'; __MACOSX skipped by prefix
@@ -67,6 +71,7 @@ func TestUnzip_SkipRulesAndLimits(t *testing.T) {
assert.Equal(t, []byte("x"), b)
// Skipped contains at least the three excluded entries
assert.GreaterOrEqual(t, len(skipped), 3)
})
}
func TestUnzip_AbsolutePathRejected(t *testing.T) {
@@ -162,6 +167,25 @@ func TestUnzip_SkipsVeryLargeEntry(t *testing.T) {
assert.Contains(t, skipped, "huge.bin")
}
func TestUnzip_EntryLimit(t *testing.T) {
dir := t.TempDir()
zipPath := filepath.Join(dir, "limit.zip")
entries := map[string][]byte{}
for i := 0; i < 5; i++ {
entries[fmt.Sprintf("f%d.txt", i)] = []byte("x")
}
writeZip(t, zipPath, entries)
orig := MaxUnzipEntries
MaxUnzipEntries = 3
defer func() { MaxUnzipEntries = orig }()
_, _, err := Unzip(zipPath, filepath.Join(dir, "out"), 0, 0)
assert.Error(t, err)
assert.Contains(t, err.Error(), "entry limit")
}
// writeZip64Stub writes a minimal ZIP64 archive with one stored entry and custom size values.
func writeZip64Stub(t *testing.T, path, name string, size uint64) {
t.Helper()
@@ -362,3 +386,118 @@ func TestZip(t *testing.T) {
}
})
}
func TestSafeJoin(t *testing.T) {
base := filepath.Clean("/tmp/base")
if runtime.GOOS == "windows" {
base = filepath.Clean(`C:\tmp\base`)
}
type testCase struct {
name string
baseDir string
input string
wantErr bool
wantPath string
}
tests := []testCase{
{
name: "SimpleFile",
baseDir: base,
input: "a.txt",
wantPath: filepath.Join(base, "a.txt"),
},
{
name: "NestedRelative",
baseDir: base,
input: "nested/dir/file.jpg",
wantPath: filepath.Join(base, "nested", "dir", "file.jpg"),
},
{
name: "EmptyName",
baseDir: base,
input: "",
wantErr: true,
},
{
name: "DotDotTraversal",
baseDir: base,
input: "../secret.txt",
wantErr: true,
},
{
name: "MixedSeparatorsTraversal",
baseDir: base,
input: `dir\..\evil.txt`,
wantPath: filepath.Join(base, "evil.txt"),
},
{
name: "ContainsParentInMiddle",
baseDir: base,
input: "dir/../evil.txt",
wantPath: filepath.Join(base, "evil.txt"),
},
{
name: "AbsoluteUnix",
baseDir: base,
input: "/etc/passwd",
wantErr: true,
},
{
name: "WindowsVolumeForwardSlash",
baseDir: base,
input: "C:/Windows/System32/evil.txt",
wantErr: true,
},
{
name: "WindowsVolumeBackslash",
baseDir: base,
input: `D:\Data\evil.txt`,
wantErr: true,
},
{
name: "CleansInsideBase",
baseDir: base,
input: "sub/../ok.txt",
wantPath: filepath.Join(base, "ok.txt"),
},
{
name: "RepeatedSeparators",
baseDir: base,
input: "dir//file.txt",
wantPath: filepath.Join(base, "dir", "file.txt"),
},
{
name: "VolumeNameOnly",
baseDir: base,
input: "C:",
wantErr: true,
},
{
name: "RootedBackslashUnix",
baseDir: base,
input: `\evil.txt`,
wantErr: true,
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
got, err := safeJoin(tc.baseDir, tc.input)
if tc.wantErr {
if err == nil {
t.Fatalf("expected error, got none (path=%q)", got)
}
return
}
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if got != tc.wantPath {
t.Fatalf("unexpected path: got %q want %q", got, tc.wantPath)
}
})
}
}