Setup: Update Ollama service examples in compose.yaml files #5123

Signed-off-by: Michael Mayer <michael@photoprism.app>
2025-12-12 00:34:13 +01:00 · 2025-09-01 16:03:27 +02:00
parent c3537b10e5
commit a0b44b2ca2
5 changed files with 5 additions and 5 deletions
--- a/compose.nvidia.yaml
+++ b/compose.nvidia.yaml
@@ -179,7 +179,7 @@ services:
      OLLAMA_NOPRUNE: "false"          # disables pruning of model blobs at startup
      OLLAMA_NOHISTORY: "true"         # disables readline history
      OLLAMA_FLASH_ATTENTION: "false"  # enables the experimental flash attention feature
-      OLLAMA_KV_CACHE_TYPE: "f16"      # see https://mitjamartini.com/blog/kv-cache-quantization-in-ollama/
+      OLLAMA_KV_CACHE_TYPE: "f16"      # K/V cache precision: f16, q8_0, or q4_0 (q8_0/q4_0 need flash attention)
      OLLAMA_SCHED_SPREAD: "false"     # allows scheduling models across all GPUs.
      OLLAMA_NEW_ENGINE: "true"        # enables the new Ollama engine
      # OLLAMA_DEBUG: "true"             # shows additional debug information
--- a/compose.yaml
+++ b/compose.yaml
@@ -259,7 +259,7 @@ services:
      OLLAMA_NOPRUNE: "false"          # disables pruning of model blobs at startup
      OLLAMA_NOHISTORY: "true"         # disables readline history
      OLLAMA_FLASH_ATTENTION: "false"  # enables the experimental flash attention feature
-      OLLAMA_KV_CACHE_TYPE: "f16"      # see https://mitjamartini.com/blog/kv-cache-quantization-in-ollama/
+      OLLAMA_KV_CACHE_TYPE: "f16"      # K/V cache precision: f16, q8_0, or q4_0 (q8_0/q4_0 need flash attention)
      OLLAMA_SCHED_SPREAD: "false"     # allows scheduling models across all GPUs.
      OLLAMA_NEW_ENGINE: "true"        # enables the new Ollama engine
      # OLLAMA_DEBUG: "true"             # shows additional debug information
--- a/setup/docker/arm64/compose.yaml
+++ b/setup/docker/arm64/compose.yaml
@@ -181,7 +181,7 @@ services:
      OLLAMA_NOPRUNE: "false"          # disables pruning of model blobs at startup
      OLLAMA_NOHISTORY: "true"         # disables readline history
      OLLAMA_FLASH_ATTENTION: "false"  # enables the experimental flash attention feature
-      OLLAMA_KV_CACHE_TYPE: "f16"      # see https://mitjamartini.com/blog/kv-cache-quantization-in-ollama/
+      OLLAMA_KV_CACHE_TYPE: "f16"      # K/V cache precision: f16, q8_0, or q4_0 (q8_0/q4_0 need flash attention)
      OLLAMA_SCHED_SPREAD: "false"     # allows scheduling models across all GPUs.
      OLLAMA_NEW_ENGINE: "true"        # enables the new Ollama engine
      # OLLAMA_DEBUG: "true"             # shows additional debug information
--- a/setup/docker/compose.yaml
+++ b/setup/docker/compose.yaml
@@ -186,7 +186,7 @@ services:
      OLLAMA_NOPRUNE: "false"          # disables pruning of model blobs at startup
      OLLAMA_NOHISTORY: "true"         # disables readline history
      OLLAMA_FLASH_ATTENTION: "false"  # enables the experimental flash attention feature
-      OLLAMA_KV_CACHE_TYPE: "f16"      # see https://mitjamartini.com/blog/kv-cache-quantization-in-ollama/
+      OLLAMA_KV_CACHE_TYPE: "f16"      # K/V cache precision: f16, q8_0, or q4_0 (q8_0/q4_0 need flash attention)
      OLLAMA_SCHED_SPREAD: "false"     # allows scheduling models across all GPUs.
      OLLAMA_NEW_ENGINE: "true"        # enables the new Ollama engine
      # OLLAMA_DEBUG: "true"             # shows additional debug information
--- a/setup/docker/nvidia/compose.yaml
+++ b/setup/docker/nvidia/compose.yaml
@@ -186,7 +186,7 @@ services:
      OLLAMA_NOPRUNE: "false"          # disables pruning of model blobs at startup
      OLLAMA_NOHISTORY: "true"         # disables readline history
      OLLAMA_FLASH_ATTENTION: "false"  # enables the experimental flash attention feature
-      OLLAMA_KV_CACHE_TYPE: "f16"      # see https://mitjamartini.com/blog/kv-cache-quantization-in-ollama/
+      OLLAMA_KV_CACHE_TYPE: "f16"      # K/V cache precision: f16, q8_0, or q4_0 (q8_0/q4_0 need flash attention)
      OLLAMA_SCHED_SPREAD: "false"     # allows scheduling models across all GPUs.
      OLLAMA_NEW_ENGINE: "true"        # enables the new Ollama engine
      # OLLAMA_DEBUG: "true"             # shows additional debug information