AI: Set default Model & URI depending on OLLAMA_API_KEY env var #5361

Signed-off-by: Michael Mayer <michael@photoprism.app>
Michael Mayer
2025-12-04 16:10:29 +01:00
parent 52ac4a91e0
commit f295a4bac3
11 changed files with 193 additions and 73 deletions

View File

@@ -75,6 +75,7 @@ type EngineInfo struct {
RequestFormat ApiFormat
ResponseFormat ApiFormat
FileScheme string
DefaultModel string
DefaultResolution int
DefaultKey string // Optional placeholder key (e.g., ${OPENAI_API_KEY}); applied only when Service.Key is empty.
}
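The DefaultKey placeholder is only written into the service config; it still has to be expanded before a request is sent. A minimal sketch of that expansion step, assuming resolution via os.ExpandEnv (the actual lookup code is not part of this diff, and the helper name is hypothetical):

package vision

import (
	"os"
	"strings"
)

// resolveServiceKey is a hypothetical helper: it expands a ${OPENAI_API_KEY}
// or ${OLLAMA_API_KEY} style placeholder from the environment and returns an
// empty string when the variable is unset.
func resolveServiceKey(key string) string {
	key = strings.TrimSpace(key)
	if key == "" {
		return ""
	}
	return strings.TrimSpace(os.ExpandEnv(key))
}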

View File

@@ -2,6 +2,7 @@ package vision
import (
"context"
"os"
"strings"
"github.com/photoprism/photoprism/internal/ai/vision/ollama"
@@ -23,17 +24,38 @@ func init() {
Defaults: ollamaDefaults{},
})
registerOllamaEngineDefaults()
}
// registerOllamaEngineDefaults selects the default Ollama endpoint and model based
// on the available credentials and registers the engine alias accordingly. When an
// API key is configured, we default to the hosted cloud endpoint; otherwise we
// assume a self-hosted instance reachable via the docker-compose default. This
// keeps the zero-config path fast for local development while automatically using
// the cloud service when credentials are present.
func registerOllamaEngineDefaults() {
defaultModel := ollama.DefaultModel
defaultUri := ollama.DefaultUri
// Detect Ollama cloud API key.
if key := os.Getenv(ollama.APIKeyEnv); len(key) > 50 && strings.Contains(key, ".") {
defaultModel = ollama.CloudModel
defaultUri = ollama.CloudUri
}
// Register the human-friendly engine name so configuration can simply use
// `Engine: "ollama"` and inherit adapter defaults.
RegisterEngineAlias(ollama.EngineName, EngineInfo{
Uri: defaultUri,
RequestFormat: ApiFormatOllama,
ResponseFormat: ApiFormatOllama,
FileScheme: scheme.Base64,
DefaultModel: defaultModel,
DefaultResolution: ollama.DefaultResolution,
DefaultKey: ollama.APIKeyPlaceholder,
})
CaptionModel.Engine = ollama.EngineName
// Keep the default caption model config aligned with the registered engine defaults.
CaptionModel.ApplyEngineDefaults()
}
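For illustration (not part of this commit), the key check above can be read as a standalone predicate; the helper name below is hypothetical, but the length threshold and dot check match the code in this diff:

package vision

import (
	"os"
	"strings"
)

// looksLikeOllamaCloudKey mirrors the heuristic used above: hosted Ollama API
// keys are long and contain a dot-separated signature, while local setups
// typically leave OLLAMA_API_KEY unset or short.
func looksLikeOllamaCloudKey(key string) bool {
	return len(key) > 50 && strings.Contains(key, ".")
}

// Example: with a cloud-style key exported, the registered defaults point at
// https://ollama.com/api/generate and the cloud model; otherwise they point at
// the docker-compose service URI and gemma3:latest.
var usesCloud = looksLikeOllamaCloudKey(os.Getenv("OLLAMA_API_KEY"))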

View File

@@ -3,11 +3,116 @@ package vision
import (
"context"
"encoding/json"
"os"
"testing"
"github.com/photoprism/photoprism/internal/ai/vision/ollama"
"github.com/photoprism/photoprism/pkg/http/scheme"
)
func TestRegisterOllamaEngineDefaults(t *testing.T) {
original := os.Getenv(ollama.APIKeyEnv)
originalCaptionModel := CaptionModel.Clone()
testCaptionModel := CaptionModel.Clone()
testCaptionModel.Model = ""
testCaptionModel.Service.Uri = ""
cloudToken := "moo9yaiS4ShoKiojiathie2vuejiec2X.Mahl7ewaej4ebi7afq8f_vwe" //nolint:gosec
t.Cleanup(func() {
if original == "" {
_ = os.Unsetenv(ollama.APIKeyEnv)
} else {
_ = os.Setenv(ollama.APIKeyEnv, original)
}
CaptionModel = originalCaptionModel
registerOllamaEngineDefaults()
})
t.Run("SelfHosted", func(t *testing.T) {
CaptionModel = testCaptionModel.Clone()
_ = os.Unsetenv(ollama.APIKeyEnv)
registerOllamaEngineDefaults()
info, ok := EngineInfoFor(ollama.EngineName)
if !ok {
t.Fatalf("expected engine info for %s", ollama.EngineName)
}
if info.Uri != ollama.DefaultUri {
t.Fatalf("expected default uri %s, got %s", ollama.DefaultUri, info.Uri)
}
if info.DefaultModel != ollama.DefaultModel {
t.Fatalf("expected default model %s, got %s", ollama.DefaultModel, info.DefaultModel)
}
if CaptionModel.Model != ollama.DefaultModel {
t.Fatalf("expected caption model %s, got %s", ollama.DefaultModel, CaptionModel.Model)
}
if CaptionModel.Service.Uri != ollama.DefaultUri {
t.Fatalf("expected caption model uri %s, got %s", ollama.DefaultUri, CaptionModel.Service.Uri)
}
})
t.Run("Cloud", func(t *testing.T) {
CaptionModel = testCaptionModel.Clone()
t.Setenv(ollama.APIKeyEnv, cloudToken)
registerOllamaEngineDefaults()
info, ok := EngineInfoFor(ollama.EngineName)
if !ok {
t.Fatalf("expected engine info for %s", ollama.EngineName)
}
if info.Uri != ollama.CloudUri {
t.Fatalf("expected cloud uri %s, got %s", ollama.CloudUri, info.Uri)
}
if info.DefaultModel != ollama.CloudModel {
t.Fatalf("expected cloud model %s, got %s", ollama.CloudModel, info.DefaultModel)
}
if CaptionModel.Model != ollama.CloudModel {
t.Fatalf("expected caption model %s, got %s", ollama.CloudModel, CaptionModel.Model)
}
if CaptionModel.Service.Uri != ollama.CloudUri {
t.Fatalf("expected caption model uri %s, got %s", ollama.CloudUri, CaptionModel.Service.Uri)
}
})
t.Run("NewModels", func(t *testing.T) {
CaptionModel = testCaptionModel.Clone()
t.Setenv(ollama.APIKeyEnv, cloudToken)
registerOllamaEngineDefaults()
model := &Model{Type: ModelTypeCaption, Engine: ollama.EngineName}
model.ApplyEngineDefaults()
if model.Model != ollama.CloudModel {
t.Fatalf("expected model %s, got %s", ollama.CloudModel, model.Model)
}
if model.Service.Uri != ollama.CloudUri {
t.Fatalf("expected service uri %s, got %s", ollama.CloudUri, model.Service.Uri)
}
if model.Service.RequestFormat != ApiFormatOllama || model.Service.ResponseFormat != ApiFormatOllama {
t.Fatalf("expected request/response format %s, got %s/%s", ApiFormatOllama, model.Service.RequestFormat, model.Service.ResponseFormat)
}
if model.Service.FileScheme != scheme.Base64 {
t.Fatalf("expected file scheme %s, got %s", scheme.Base64, model.Service.FileScheme)
}
if model.Resolution != ollama.DefaultResolution {
t.Fatalf("expected resolution %d, got %d", ollama.DefaultResolution, model.Resolution)
}
})
}
func TestOllamaDefaultConfidenceApplied(t *testing.T) {
req := &ApiRequest{Format: FormatJSON}
payload := ollama.Response{

View File

@@ -34,6 +34,7 @@ func init() {
RequestFormat: ApiFormatOpenAI,
ResponseFormat: ApiFormatOpenAI,
FileScheme: scheme.Data,
DefaultModel: openai.DefaultModel,
DefaultResolution: openai.DefaultResolution,
DefaultKey: openai.APIKeyPlaceholder,
})

View File

@@ -133,8 +133,6 @@ func (m *Model) IsDefault() bool {
return m.Name == NsfwModel.Name
case ModelTypeFace:
return m.Name == FacenetModel.Name
- case ModelTypeCaption:
- return m.Name == CaptionModel.Name
}
return false
@@ -467,32 +465,37 @@ func (m *Model) ApplyEngineDefaults() {
}
engine := strings.TrimSpace(strings.ToLower(m.Engine))
if engine == "" {
return
}
if info, ok := EngineInfoFor(engine); ok {
- if m.Service.Uri == "" {
+ if strings.TrimSpace(m.Model) == "" && strings.TrimSpace(m.Name) == "" {
+ m.Model = info.DefaultModel
+ }
+ if strings.TrimSpace(m.Service.Uri) == "" {
m.Service.Uri = info.Uri
}
- if m.Service.RequestFormat == "" {
+ if strings.TrimSpace(m.Service.RequestFormat) == "" {
m.Service.RequestFormat = info.RequestFormat
}
- if m.Service.ResponseFormat == "" {
+ if strings.TrimSpace(m.Service.ResponseFormat) == "" {
m.Service.ResponseFormat = info.ResponseFormat
}
- if info.FileScheme != "" && m.Service.FileScheme == "" {
+ if strings.TrimSpace(m.Service.FileScheme) == "" && info.FileScheme != "" {
m.Service.FileScheme = info.FileScheme
}
- if info.DefaultResolution > 0 && m.Resolution <= 0 {
+ if m.Resolution <= 0 && info.DefaultResolution > 0 {
m.Resolution = info.DefaultResolution
}
- if strings.TrimSpace(m.Service.Key) == "" && strings.TrimSpace(info.DefaultKey) != "" {
+ if strings.TrimSpace(m.Service.Key) == "" && info.DefaultKey != "" {
m.Service.Key = info.DefaultKey
}
}
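The net effect of ApplyEngineDefaults is fill-in-if-empty: explicitly configured values always win, and only blank fields inherit the registered engine defaults. A short usage sketch inside the vision package, using only the API shown in this diff:

// Only Type and Engine are set, so the model name, URI, request/response
// formats, file scheme, resolution, and ${OLLAMA_API_KEY} placeholder are all
// inherited from the registered "ollama" alias.
m := &Model{Type: ModelTypeCaption, Engine: "ollama"}
m.ApplyEngineDefaults()

// An explicitly configured value is never overwritten:
custom := &Model{Type: ModelTypeCaption, Engine: "ollama"}
custom.Service.Uri = "http://localhost:11434/api/generate"
custom.ApplyEngineDefaults() // keeps the localhost URI, fills in the rest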

View File

@@ -88,14 +88,8 @@ var (
},
}
CaptionModel = &Model{
- Type: ModelTypeCaption,
- Model: ollama.CaptionModel,
- Version: VersionLatest,
- Engine: ollama.EngineName,
- Resolution: 720, // Original aspect ratio, with a max size of 720 x 720 pixels.
- Service: Service{
- Uri: "http://ollama:11434/api/generate",
- },
+ Type: ModelTypeCaption,
+ Engine: ollama.EngineName,
}
DefaultModels = Models{
NasnetModel,
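With the literal values removed above, the effective self-hosted caption configuration is now assembled at init time by registerOllamaEngineDefaults and ApplyEngineDefaults. Assuming no OLLAMA_API_KEY is set, the result is roughly equivalent to the following sketch (values taken from the constants added in this commit):

CaptionModel = &Model{
	Type:       ModelTypeCaption,
	Engine:     ollama.EngineName,
	Model:      ollama.DefaultModel,      // "gemma3:latest"
	Resolution: ollama.DefaultResolution, // 720
	Service: Service{
		Uri:            ollama.DefaultUri, // "http://ollama:11434/api/generate"
		Key:            ollama.APIKeyPlaceholder,
		RequestFormat:  ApiFormatOllama,
		ResponseFormat: ApiFormatOllama,
		FileScheme:     scheme.Base64,
	},
}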

View File

@@ -11,4 +11,28 @@ const (
APIKeyFileEnv = "OLLAMA_API_KEY_FILE" //nolint:gosec // environment variable name, not a secret
// APIKeyPlaceholder is the `${VAR}` form injected when no explicit key is provided.
APIKeyPlaceholder = "${" + APIKeyEnv + "}"
// DefaultUri is the default service URI for self-hosted Ollama instances.
DefaultUri = "http://ollama:11434/api/generate"
// CloudUri is the Ollama cloud service URI.
CloudUri = "https://ollama.com/api/generate"
// DefaultModel names the default caption model bundled with our adapter defaults.
DefaultModel = "gemma3:latest"
// CloudModel names the default caption model for the Ollama cloud service, see https://ollama.com/cloud.
CloudModel = "qwen3-vl:235b-instruct"
// CaptionPrompt instructs Ollama caption models to emit a single, active-voice sentence.
CaptionPrompt = "Create a caption with exactly one sentence in the active voice that describes the main visual content. Begin with the main subject and clear action. Avoid text formatting, meta-language, and filler words."
// LabelConfidenceDefault is used when the model omits the confidence field.
LabelConfidenceDefault = 0.5
// LabelSystem defines the system prompt shared by Ollama label models. It aims to ensure that single-word nouns are returned.
LabelSystem = "You are a PhotoPrism vision model. Output concise JSON that matches the schema. Each label name MUST be a single-word noun in its canonical singular form. Avoid spaces, punctuation, emoji, or descriptive phrases."
// LabelSystemSimple defines a simple system prompt for Ollama label models that does not strictly require names to be single-word nouns.
LabelSystemSimple = "You are a PhotoPrism vision model. Output concise JSON that matches the schema."
// LabelPromptDefault defines a simple user prompt for Ollama label models.
LabelPromptDefault = "Analyze the image and return label objects with name, confidence (0-1), and topicality (0-1)."
// LabelPromptStrict asks the model to return scored labels for the provided image. It aims to ensure that single-word nouns are returned.
LabelPromptStrict = "Analyze the image and return label objects with name (single-word noun), confidence (0-1), and topicality (0-1). Respond with JSON exactly like {\"labels\":[{\"name\":\"sunset\",\"confidence\":0.72,\"topicality\":0.64}]} and adjust the values for this image."
// LabelPromptNSFW asks the model to return scored labels for the provided image that includes a NSFW flag and score. It aims to ensure that single-word nouns are returned.
LabelPromptNSFW = "Analyze the image and return label objects with name (single-word noun), confidence (0-1), topicality (0-1), nsfw (true when the label describes sensitive or adult content), and nsfw_confidence (0-1). Respond with JSON exactly like {\"labels\":[{\"name\":\"sunset\",\"confidence\":0.72,\"topicality\":0.64,\"nsfw\":false,\"nsfw_confidence\":0.02}]} and adjust the values for this image."
// DefaultResolution is the default thumbnail size submitted to Ollama models.
DefaultResolution = 720
)
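LabelConfidenceDefault backs the fallback exercised by TestOllamaDefaultConfidenceApplied earlier in this diff: when a label in the model's JSON output omits its confidence score, the adapter substitutes 0.5 instead of dropping the label. A hedged sketch of that fallback; the label struct and pointer field below are illustrative, not the adapter's actual response types:

package ollama

// sketchLabel is a hypothetical decoded label; a nil Confidence means the
// model omitted the field in its JSON output.
type sketchLabel struct {
	Name       string   `json:"name"`
	Confidence *float64 `json:"confidence,omitempty"`
}

// confidenceOrDefault shows where LabelConfidenceDefault (0.5) applies.
func confidenceOrDefault(l sketchLabel) float64 {
	if l.Confidence == nil {
		return LabelConfidenceDefault
	}
	return *l.Confidence
}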

View File

@@ -1,22 +0,0 @@
package ollama
const (
// CaptionPrompt instructs Ollama caption models to emit a single, active-voice sentence.
CaptionPrompt = "Create a caption with exactly one sentence in the active voice that describes the main visual content. Begin with the main subject and clear action. Avoid text formatting, meta-language, and filler words."
// CaptionModel names the default caption model bundled with our adapter defaults.
CaptionModel = "gemma3:latest"
// LabelConfidenceDefault is used when the model omits the confidence field.
LabelConfidenceDefault = 0.5
// LabelSystem defines the system prompt shared by Ollama label models. It aims to ensure that single-word nouns are returned.
LabelSystem = "You are a PhotoPrism vision model. Output concise JSON that matches the schema. Each label name MUST be a single-word noun in its canonical singular form. Avoid spaces, punctuation, emoji, or descriptive phrases."
// LabelSystemSimple defines a simple system prompt for Ollama label models that does not strictly require names to be single-word nouns.
LabelSystemSimple = "You are a PhotoPrism vision model. Output concise JSON that matches the schema."
// LabelPromptDefault defines a simple user prompt for Ollama label models.
LabelPromptDefault = "Analyze the image and return label objects with name, confidence (0-1), and topicality (0-1)."
// LabelPromptStrict asks the model to return scored labels for the provided image. It aims to ensure that single-word nouns are returned.
LabelPromptStrict = "Analyze the image and return label objects with name (single-word noun), confidence (0-1), and topicality (0-1). Respond with JSON exactly like {\"labels\":[{\"name\":\"sunset\",\"confidence\":0.72,\"topicality\":0.64}]} and adjust the values for this image."
// LabelPromptNSFW asks the model to return scored labels for the provided image that includes a NSFW flag and score. It aims to ensure that single-word nouns are returned.
LabelPromptNSFW = "Analyze the image and return label objects with name (single-word noun), confidence (0-1), topicality (0-1), nsfw (true when the label describes sensitive or adult content), and nsfw_confidence (0-1). Respond with JSON exactly like {\"labels\":[{\"name\":\"sunset\",\"confidence\":0.72,\"topicality\":0.64,\"nsfw\":false,\"nsfw_confidence\":0.02}]} and adjust the values for this image."
// DefaultResolution is the default thumbnail size submitted to Ollama models.
DefaultResolution = 720
)

View File

@@ -11,4 +11,30 @@ const (
APIKeyFileEnv = "OPENAI_API_KEY_FILE" //nolint:gosec // environment variable name, not a secret
// APIKeyPlaceholder is the `${VAR}` form injected when no explicit key is provided.
APIKeyPlaceholder = "${" + APIKeyEnv + "}"
// DefaultModel is the model used by default when accessing the OpenAI API.
DefaultModel = "gpt-5-mini"
// DefaultResolution is the default thumbnail size submitted to the OpenAI API.
DefaultResolution = 720
// CaptionSystem defines the default system prompt for caption models.
CaptionSystem = "You are a PhotoPrism vision model. Return concise, user-friendly captions that describe the main subjects accurately."
// CaptionPrompt instructs caption models to respond with a single sentence.
CaptionPrompt = "Provide exactly one sentence describing the key subject and action in the image. Avoid filler words and technical jargon."
// LabelSystem defines the system prompt for label generation.
LabelSystem = "You are a PhotoPrism vision model. Emit JSON that matches the provided schema and keep label names short, singular nouns."
// LabelPromptDefault requests general-purpose labels.
LabelPromptDefault = "Analyze the image and return label objects with name, confidence (0-1), and topicality (0-1)."
// LabelPromptNSFW requests labels including NSFW metadata when required.
LabelPromptNSFW = "Analyze the image and return label objects with name, confidence (0-1), topicality (0-1), nsfw (true when sensitive), and nsfw_confidence (0-1)."
// DefaultDetail specifies the preferred thumbnail detail level for Requests API calls.
DefaultDetail = "low"
// CaptionMaxTokens suggests the output budget for caption responses.
CaptionMaxTokens = 512
// LabelsMaxTokens suggests the output budget for label responses.
LabelsMaxTokens = 1024
// DefaultTemperature configures deterministic replies.
DefaultTemperature = 0.1
// DefaultTopP limits nucleus sampling.
DefaultTopP = 0.9
// DefaultSchemaVersion is used when callers do not specify an explicit schema version.
DefaultSchemaVersion = "v1"
)
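These tuning constants are meant to be dropped into outgoing requests. The payload below is a hypothetical illustration of where each one would land; the struct and its JSON field names follow common OpenAI-style request shapes and are assumptions, not this adapter's real request type:

package openai

// sketchCaptionRequest is an illustrative payload, not the adapter's type.
type sketchCaptionRequest struct {
	Model           string  `json:"model"`
	Temperature     float64 `json:"temperature"`
	TopP            float64 `json:"top_p"`
	MaxOutputTokens int     `json:"max_output_tokens"`
	Detail          string  `json:"detail"`
}

func newSketchCaptionRequest() sketchCaptionRequest {
	return sketchCaptionRequest{
		Model:           DefaultModel,       // "gpt-5-mini"
		Temperature:     DefaultTemperature, // 0.1 keeps replies near-deterministic
		TopP:            DefaultTopP,        // 0.9 nucleus sampling cap
		MaxOutputTokens: CaptionMaxTokens,   // 512-token caption budget
		Detail:          DefaultDetail,      // "low" thumbnail detail
	}
}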

View File

@@ -1,33 +0,0 @@
package openai
const (
// CaptionSystem defines the default system prompt for caption models.
CaptionSystem = "You are a PhotoPrism vision model. Return concise, user-friendly captions that describe the main subjects accurately."
// CaptionPrompt instructs caption models to respond with a single sentence.
CaptionPrompt = "Provide exactly one sentence describing the key subject and action in the image. Avoid filler words and technical jargon."
// LabelSystem defines the system prompt for label generation.
LabelSystem = "You are a PhotoPrism vision model. Emit JSON that matches the provided schema and keep label names short, singular nouns."
// LabelPromptDefault requests general-purpose labels.
LabelPromptDefault = "Analyze the image and return label objects with name, confidence (0-1), and topicality (0-1)."
// LabelPromptNSFW requests labels including NSFW metadata when required.
LabelPromptNSFW = "Analyze the image and return label objects with name, confidence (0-1), topicality (0-1), nsfw (true when sensitive), and nsfw_confidence (0-1)."
// DefaultDetail specifies the preferred thumbnail detail level for Requests API calls.
DefaultDetail = "low"
// CaptionMaxTokens suggests the output budget for caption responses.
CaptionMaxTokens = 512
// LabelsMaxTokens suggests the output budget for label responses.
LabelsMaxTokens = 1024
// DefaultTemperature configures deterministic replies.
DefaultTemperature = 0.1
// DefaultTopP limits nucleus sampling.
DefaultTopP = 0.9
// DefaultSchemaVersion is used when callers do not specify an explicit schema version.
DefaultSchemaVersion = "v1"
)
var (
// DefaultModel is the model used by default when accessing the OpenAI API.
DefaultModel = "gpt-5-mini"
// DefaultResolution is the default thumbnail size submitted to the OpenAI.
DefaultResolution = 720
)

View File

@@ -66,7 +66,6 @@ Models:
Outputs: 512
- Type: caption
Model: gemma3:latest
Version: latest
Engine: ollama
Resolution: 720
Service: