diff --git a/internal/ai/vision/engine.go b/internal/ai/vision/engine.go index 6a3839043..b2f852bca 100644 --- a/internal/ai/vision/engine.go +++ b/internal/ai/vision/engine.go @@ -75,6 +75,7 @@ type EngineInfo struct { RequestFormat ApiFormat ResponseFormat ApiFormat FileScheme string + DefaultModel string DefaultResolution int DefaultKey string // Optional placeholder key (e.g., ${OPENAI_API_KEY}); applied only when Service.Key is empty. } diff --git a/internal/ai/vision/engine_ollama.go b/internal/ai/vision/engine_ollama.go index f2e8d0d95..49772e04f 100644 --- a/internal/ai/vision/engine_ollama.go +++ b/internal/ai/vision/engine_ollama.go @@ -2,6 +2,7 @@ package vision import ( "context" + "os" "strings" "github.com/photoprism/photoprism/internal/ai/vision/ollama" @@ -23,17 +24,38 @@ func init() { Defaults: ollamaDefaults{}, }) + registerOllamaEngineDefaults() +} + +// registerOllamaEngineDefaults selects the default Ollama endpoint based on the +// available credentials and registers the engine alias accordingly. When an +// API key is configured, we default to the hosted Cloud endpoint; otherwise we +// assume a self-hosted instance reachable via the docker-compose default. +// This keeps the zero-config path fast for local dev while automatically using +// the cloud service when credentials are present. +func registerOllamaEngineDefaults() { + defaultModel := ollama.DefaultModel + defaultUri := ollama.DefaultUri + + // Detect Ollama cloud API key. + if key := os.Getenv(ollama.APIKeyEnv); len(key) > 50 && strings.Contains(key, ".") { + defaultModel = ollama.CloudModel + defaultUri = ollama.CloudUri + } + // Register the human-friendly engine name so configuration can simply use // `Engine: "ollama"` and inherit adapter defaults. 
RegisterEngineAlias(ollama.EngineName, EngineInfo{ + Uri: defaultUri, RequestFormat: ApiFormatOllama, ResponseFormat: ApiFormatOllama, FileScheme: scheme.Base64, + DefaultModel: defaultModel, DefaultResolution: ollama.DefaultResolution, DefaultKey: ollama.APIKeyPlaceholder, }) - CaptionModel.Engine = ollama.EngineName + // Keep the default caption model config aligned with the defaults. CaptionModel.ApplyEngineDefaults() } diff --git a/internal/ai/vision/engine_ollama_test.go b/internal/ai/vision/engine_ollama_test.go index 00ef6efa2..abe82a63e 100644 --- a/internal/ai/vision/engine_ollama_test.go +++ b/internal/ai/vision/engine_ollama_test.go @@ -3,11 +3,116 @@ package vision import ( "context" "encoding/json" + "os" "testing" "github.com/photoprism/photoprism/internal/ai/vision/ollama" + "github.com/photoprism/photoprism/pkg/http/scheme" ) +func TestRegisterOllamaEngineDefaults(t *testing.T) { + original := os.Getenv(ollama.APIKeyEnv) + originalCaptionModel := CaptionModel.Clone() + testCaptionModel := CaptionModel.Clone() + testCaptionModel.Model = "" + testCaptionModel.Service.Uri = "" + cloudToken := "moo9yaiS4ShoKiojiathie2vuejiec2X.Mahl7ewaej4ebi7afq8f_vwe" //nolint:gosec + + t.Cleanup(func() { + if original == "" { + _ = os.Unsetenv(ollama.APIKeyEnv) + } else { + _ = os.Setenv(ollama.APIKeyEnv, original) + } + CaptionModel = originalCaptionModel + registerOllamaEngineDefaults() + }) + + t.Run("SelfHosted", func(t *testing.T) { + CaptionModel = testCaptionModel.Clone() + _ = os.Unsetenv(ollama.APIKeyEnv) + + registerOllamaEngineDefaults() + + info, ok := EngineInfoFor(ollama.EngineName) + if !ok { + t.Fatalf("expected engine info for %s", ollama.EngineName) + } + + if info.Uri != ollama.DefaultUri { + t.Fatalf("expected default uri %s, got %s", ollama.DefaultUri, info.Uri) + } + + if info.DefaultModel != ollama.DefaultModel { + t.Fatalf("expected default model %s, got %s", ollama.DefaultModel, info.DefaultModel) + } + + if CaptionModel.Model != 
ollama.DefaultModel { + t.Fatalf("expected caption model %s, got %s", ollama.DefaultModel, CaptionModel.Model) + } + + if CaptionModel.Service.Uri != ollama.DefaultUri { + t.Fatalf("expected caption model uri %s, got %s", ollama.DefaultUri, CaptionModel.Service.Uri) + } + }) + t.Run("Cloud", func(t *testing.T) { + CaptionModel = testCaptionModel.Clone() + t.Setenv(ollama.APIKeyEnv, cloudToken) + + registerOllamaEngineDefaults() + + info, ok := EngineInfoFor(ollama.EngineName) + if !ok { + t.Fatalf("expected engine info for %s", ollama.EngineName) + } + + if info.Uri != ollama.CloudUri { + t.Fatalf("expected cloud uri %s, got %s", ollama.CloudUri, info.Uri) + } + + if info.DefaultModel != ollama.CloudModel { + t.Fatalf("expected cloud model %s, got %s", ollama.CloudModel, info.DefaultModel) + } + + if CaptionModel.Model != ollama.CloudModel { + t.Fatalf("expected caption model %s, got %s", ollama.CloudModel, CaptionModel.Model) + } + + if CaptionModel.Service.Uri != ollama.CloudUri { + t.Fatalf("expected caption model uri %s, got %s", ollama.CloudUri, CaptionModel.Service.Uri) + } + }) + t.Run("NewModels", func(t *testing.T) { + CaptionModel = testCaptionModel.Clone() + + t.Setenv(ollama.APIKeyEnv, cloudToken) + registerOllamaEngineDefaults() + + model := &Model{Type: ModelTypeCaption, Engine: ollama.EngineName} + model.ApplyEngineDefaults() + + if model.Model != ollama.CloudModel { + t.Fatalf("expected model %s, got %s", ollama.CloudModel, model.Model) + } + + if model.Service.Uri != ollama.CloudUri { + t.Fatalf("expected service uri %s, got %s", ollama.CloudUri, model.Service.Uri) + } + + if model.Service.RequestFormat != ApiFormatOllama || model.Service.ResponseFormat != ApiFormatOllama { + t.Fatalf("expected request/response format %s, got %s/%s", ApiFormatOllama, model.Service.RequestFormat, model.Service.ResponseFormat) + } + + if model.Service.FileScheme != scheme.Base64 { + t.Fatalf("expected file scheme %s, got %s", scheme.Base64, model.Service.FileScheme) 
+ } + + if model.Resolution != ollama.DefaultResolution { + t.Fatalf("expected resolution %d, got %d", ollama.DefaultResolution, model.Resolution) + } + }) +} + func TestOllamaDefaultConfidenceApplied(t *testing.T) { req := &ApiRequest{Format: FormatJSON} payload := ollama.Response{ diff --git a/internal/ai/vision/engine_openai.go b/internal/ai/vision/engine_openai.go index 8fab62417..f807f7dad 100644 --- a/internal/ai/vision/engine_openai.go +++ b/internal/ai/vision/engine_openai.go @@ -34,6 +34,7 @@ func init() { RequestFormat: ApiFormatOpenAI, ResponseFormat: ApiFormatOpenAI, FileScheme: scheme.Data, + DefaultModel: openai.DefaultModel, DefaultResolution: openai.DefaultResolution, DefaultKey: openai.APIKeyPlaceholder, }) diff --git a/internal/ai/vision/model.go b/internal/ai/vision/model.go index f815f4434..a4b74fc69 100644 --- a/internal/ai/vision/model.go +++ b/internal/ai/vision/model.go @@ -133,8 +133,6 @@ func (m *Model) IsDefault() bool { return m.Name == NsfwModel.Name case ModelTypeFace: return m.Name == FacenetModel.Name - case ModelTypeCaption: - return m.Name == CaptionModel.Name } return false @@ -467,32 +465,37 @@ func (m *Model) ApplyEngineDefaults() { } engine := strings.TrimSpace(strings.ToLower(m.Engine)) + if engine == "" { return } if info, ok := EngineInfoFor(engine); ok { - if m.Service.Uri == "" { + if strings.TrimSpace(m.Model) == "" && strings.TrimSpace(m.Name) == "" { + m.Model = info.DefaultModel + } + + if strings.TrimSpace(m.Service.Uri) == "" { m.Service.Uri = info.Uri } - if m.Service.RequestFormat == "" { + if strings.TrimSpace(m.Service.RequestFormat) == "" { m.Service.RequestFormat = info.RequestFormat } - if m.Service.ResponseFormat == "" { + if strings.TrimSpace(m.Service.ResponseFormat) == "" { m.Service.ResponseFormat = info.ResponseFormat } - if info.FileScheme != "" && m.Service.FileScheme == "" { + if strings.TrimSpace(m.Service.FileScheme) == "" && info.FileScheme != "" { m.Service.FileScheme = info.FileScheme } - if 
info.DefaultResolution > 0 && m.Resolution <= 0 { + if m.Resolution <= 0 && info.DefaultResolution > 0 { m.Resolution = info.DefaultResolution } - if strings.TrimSpace(m.Service.Key) == "" && strings.TrimSpace(info.DefaultKey) != "" { + if strings.TrimSpace(m.Service.Key) == "" && info.DefaultKey != "" { m.Service.Key = info.DefaultKey } } } diff --git a/internal/ai/vision/models.go b/internal/ai/vision/models.go index c88bef528..c60d5ab52 100644 --- a/internal/ai/vision/models.go +++ b/internal/ai/vision/models.go @@ -88,14 +88,8 @@ var ( }, } CaptionModel = &Model{ - Type: ModelTypeCaption, - Model: ollama.CaptionModel, - Version: VersionLatest, - Engine: ollama.EngineName, - Resolution: 720, // Original aspect ratio, with a max size of 720 x 720 pixels. - Service: Service{ - Uri: "http://ollama:11434/api/generate", - }, + Type: ModelTypeCaption, + Engine: ollama.EngineName, } DefaultModels = Models{ NasnetModel, diff --git a/internal/ai/vision/ollama/const.go b/internal/ai/vision/ollama/const.go index 2bb12bf82..c6093414e 100644 --- a/internal/ai/vision/ollama/const.go +++ b/internal/ai/vision/ollama/const.go @@ -11,4 +11,28 @@ const ( APIKeyFileEnv = "OLLAMA_API_KEY_FILE" //nolint:gosec // environment variable name, not a secret // APIKeyPlaceholder is the `${VAR}` form injected when no explicit key is provided. APIKeyPlaceholder = "${" + APIKeyEnv + "}" + // DefaultUri is the default service URI for self-hosted Ollama instances. + DefaultUri = "http://ollama:11434/api/generate" + // CloudUri is the Ollama cloud service URI. + CloudUri = "https://ollama.com/api/generate" + // DefaultModel names the default caption model bundled with our adapter defaults. + DefaultModel = "gemma3:latest" + // CloudModel names the default caption model for the Ollama cloud service, see https://ollama.com/cloud. + CloudModel = "qwen3-vl:235b-instruct" + // CaptionPrompt instructs Ollama caption models to emit a single, active-voice sentence. 
+ CaptionPrompt = "Create a caption with exactly one sentence in the active voice that describes the main visual content. Begin with the main subject and clear action. Avoid text formatting, meta-language, and filler words." + // LabelConfidenceDefault is used when the model omits the confidence field. + LabelConfidenceDefault = 0.5 + // LabelSystem defines the system prompt shared by Ollama label models. It aims to ensure that single-word nouns are returned. + LabelSystem = "You are a PhotoPrism vision model. Output concise JSON that matches the schema. Each label name MUST be a single-word noun in its canonical singular form. Avoid spaces, punctuation, emoji, or descriptive phrases." + // LabelSystemSimple defines a simple system prompt for Ollama label models that does not strictly require names to be single-word nouns. + LabelSystemSimple = "You are a PhotoPrism vision model. Output concise JSON that matches the schema." + // LabelPromptDefault defines a simple user prompt for Ollama label models. + LabelPromptDefault = "Analyze the image and return label objects with name, confidence (0-1), and topicality (0-1)." + // LabelPromptStrict asks the model to return scored labels for the provided image. It aims to ensure that single-word nouns are returned. + LabelPromptStrict = "Analyze the image and return label objects with name (single-word noun), confidence (0-1), and topicality (0-1). Respond with JSON exactly like {\"labels\":[{\"name\":\"sunset\",\"confidence\":0.72,\"topicality\":0.64}]} and adjust the values for this image." + // LabelPromptNSFW asks the model to return scored labels for the provided image that includes a NSFW flag and score. It aims to ensure that single-word nouns are returned. + LabelPromptNSFW = "Analyze the image and return label objects with name (single-word noun), confidence (0-1), topicality (0-1), nsfw (true when the label describes sensitive or adult content), and nsfw_confidence (0-1). 
Respond with JSON exactly like {\"labels\":[{\"name\":\"sunset\",\"confidence\":0.72,\"topicality\":0.64,\"nsfw\":false,\"nsfw_confidence\":0.02}]} and adjust the values for this image." + // DefaultResolution is the default thumbnail size submitted to Ollama models. + DefaultResolution = 720 ) diff --git a/internal/ai/vision/ollama/defaults.go b/internal/ai/vision/ollama/defaults.go deleted file mode 100644 index ad570ff7c..000000000 --- a/internal/ai/vision/ollama/defaults.go +++ /dev/null @@ -1,22 +0,0 @@ -package ollama - -const ( - // CaptionPrompt instructs Ollama caption models to emit a single, active-voice sentence. - CaptionPrompt = "Create a caption with exactly one sentence in the active voice that describes the main visual content. Begin with the main subject and clear action. Avoid text formatting, meta-language, and filler words." - // CaptionModel names the default caption model bundled with our adapter defaults. - CaptionModel = "gemma3:latest" - // LabelConfidenceDefault is used when the model omits the confidence field. - LabelConfidenceDefault = 0.5 - // LabelSystem defines the system prompt shared by Ollama label models. It aims to ensure that single-word nouns are returned. - LabelSystem = "You are a PhotoPrism vision model. Output concise JSON that matches the schema. Each label name MUST be a single-word noun in its canonical singular form. Avoid spaces, punctuation, emoji, or descriptive phrases." - // LabelSystemSimple defines a simple system prompt for Ollama label models that does not strictly require names to be single-word nouns. - LabelSystemSimple = "You are a PhotoPrism vision model. Output concise JSON that matches the schema." - // LabelPromptDefault defines a simple user prompt for Ollama label models. - LabelPromptDefault = "Analyze the image and return label objects with name, confidence (0-1), and topicality (0-1)." - // LabelPromptStrict asks the model to return scored labels for the provided image. 
It aims to ensure that single-word nouns are returned. - LabelPromptStrict = "Analyze the image and return label objects with name (single-word noun), confidence (0-1), and topicality (0-1). Respond with JSON exactly like {\"labels\":[{\"name\":\"sunset\",\"confidence\":0.72,\"topicality\":0.64}]} and adjust the values for this image." - // LabelPromptNSFW asks the model to return scored labels for the provided image that includes a NSFW flag and score. It aims to ensure that single-word nouns are returned. - LabelPromptNSFW = "Analyze the image and return label objects with name (single-word noun), confidence (0-1), topicality (0-1), nsfw (true when the label describes sensitive or adult content), and nsfw_confidence (0-1). Respond with JSON exactly like {\"labels\":[{\"name\":\"sunset\",\"confidence\":0.72,\"topicality\":0.64,\"nsfw\":false,\"nsfw_confidence\":0.02}]} and adjust the values for this image." - // DefaultResolution is the default thumbnail size submitted to Ollama models. - DefaultResolution = 720 -) diff --git a/internal/ai/vision/openai/const.go b/internal/ai/vision/openai/const.go index 1a6af1254..99e76f15c 100644 --- a/internal/ai/vision/openai/const.go +++ b/internal/ai/vision/openai/const.go @@ -11,4 +11,30 @@ const ( APIKeyFileEnv = "OPENAI_API_KEY_FILE" //nolint:gosec // environment variable name, not a secret // APIKeyPlaceholder is the `${VAR}` form injected when no explicit key is provided. APIKeyPlaceholder = "${" + APIKeyEnv + "}" + // DefaultModel is the model used by default when accessing the OpenAI API. + DefaultModel = "gpt-5-mini" + // DefaultResolution is the default thumbnail size submitted to the OpenAI API. + DefaultResolution = 720 + // CaptionSystem defines the default system prompt for caption models. + CaptionSystem = "You are a PhotoPrism vision model. Return concise, user-friendly captions that describe the main subjects accurately." + // CaptionPrompt instructs caption models to respond with a single sentence. 
+ CaptionPrompt = "Provide exactly one sentence describing the key subject and action in the image. Avoid filler words and technical jargon." + // LabelSystem defines the system prompt for label generation. + LabelSystem = "You are a PhotoPrism vision model. Emit JSON that matches the provided schema and keep label names short, singular nouns." + // LabelPromptDefault requests general-purpose labels. + LabelPromptDefault = "Analyze the image and return label objects with name, confidence (0-1), and topicality (0-1)." + // LabelPromptNSFW requests labels including NSFW metadata when required. + LabelPromptNSFW = "Analyze the image and return label objects with name, confidence (0-1), topicality (0-1), nsfw (true when sensitive), and nsfw_confidence (0-1)." + // DefaultDetail specifies the preferred thumbnail detail level for Requests API calls. + DefaultDetail = "low" + // CaptionMaxTokens suggests the output budget for caption responses. + CaptionMaxTokens = 512 + // LabelsMaxTokens suggests the output budget for label responses. + LabelsMaxTokens = 1024 + // DefaultTemperature configures deterministic replies. + DefaultTemperature = 0.1 + // DefaultTopP limits nucleus sampling. + DefaultTopP = 0.9 + // DefaultSchemaVersion is used when callers do not specify an explicit schema version. + DefaultSchemaVersion = "v1" ) diff --git a/internal/ai/vision/openai/defaults.go b/internal/ai/vision/openai/defaults.go deleted file mode 100644 index 36f9977dd..000000000 --- a/internal/ai/vision/openai/defaults.go +++ /dev/null @@ -1,33 +0,0 @@ -package openai - -const ( - // CaptionSystem defines the default system prompt for caption models. - CaptionSystem = "You are a PhotoPrism vision model. Return concise, user-friendly captions that describe the main subjects accurately." - // CaptionPrompt instructs caption models to respond with a single sentence. - CaptionPrompt = "Provide exactly one sentence describing the key subject and action in the image. 
Avoid filler words and technical jargon." - // LabelSystem defines the system prompt for label generation. - LabelSystem = "You are a PhotoPrism vision model. Emit JSON that matches the provided schema and keep label names short, singular nouns." - // LabelPromptDefault requests general-purpose labels. - LabelPromptDefault = "Analyze the image and return label objects with name, confidence (0-1), and topicality (0-1)." - // LabelPromptNSFW requests labels including NSFW metadata when required. - LabelPromptNSFW = "Analyze the image and return label objects with name, confidence (0-1), topicality (0-1), nsfw (true when sensitive), and nsfw_confidence (0-1)." - // DefaultDetail specifies the preferred thumbnail detail level for Requests API calls. - DefaultDetail = "low" - // CaptionMaxTokens suggests the output budget for caption responses. - CaptionMaxTokens = 512 - // LabelsMaxTokens suggests the output budget for label responses. - LabelsMaxTokens = 1024 - // DefaultTemperature configures deterministic replies. - DefaultTemperature = 0.1 - // DefaultTopP limits nucleus sampling. - DefaultTopP = 0.9 - // DefaultSchemaVersion is used when callers do not specify an explicit schema version. - DefaultSchemaVersion = "v1" -) - -var ( - // DefaultModel is the model used by default when accessing the OpenAI API. - DefaultModel = "gpt-5-mini" - // DefaultResolution is the default thumbnail size submitted to the OpenAI. - DefaultResolution = 720 -) diff --git a/internal/ai/vision/testdata/vision.yml b/internal/ai/vision/testdata/vision.yml index 930d7eb5b..45bb24d43 100644 --- a/internal/ai/vision/testdata/vision.yml +++ b/internal/ai/vision/testdata/vision.yml @@ -66,7 +66,6 @@ Models: Outputs: 512 - Type: caption Model: gemma3:latest - Version: latest Engine: ollama Resolution: 720 Service: