AI: Set default Model & URI depending on OLLAMA_API_KEY env var #5361

Signed-off-by: Michael Mayer <michael@photoprism.app>
Michael Mayer
2025-12-04 16:10:29 +01:00
parent 52ac4a91e0
commit f295a4bac3
11 changed files with 193 additions and 73 deletions

View File

@@ -75,6 +75,7 @@ type EngineInfo struct {
RequestFormat ApiFormat
ResponseFormat ApiFormat
FileScheme string
DefaultModel string
DefaultResolution int
DefaultKey string // Optional placeholder key (e.g., ${OPENAI_API_KEY}); applied only when Service.Key is empty.
}
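The DefaultKey placeholder is only written into the service config; it still has to be expanded before a request is sent. A minimal sketch of that expansion step, assuming resolution via os.ExpandEnv (the actual lookup code is not part of this diff, and the helper name is hypothetical):

package vision

import (
	"os"
	"strings"
)

// resolveServiceKey is a hypothetical helper: it expands a ${OPENAI_API_KEY}
// or ${OLLAMA_API_KEY} style placeholder from the environment and returns an
// empty string when the variable is unset.
func resolveServiceKey(key string) string {
	key = strings.TrimSpace(key)
	if key == "" {
		return ""
	}
	return strings.TrimSpace(os.ExpandEnv(key))
}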

View File

@@ -2,6 +2,7 @@ package vision
import (
"context"
"os"
"strings"
"github.com/photoprism/photoprism/internal/ai/vision/ollama"
@@ -23,17 +24,38 @@ func init() {
Defaults: ollamaDefaults{},
})
registerOllamaEngineDefaults()
}
// registerOllamaEngineDefaults selects the default Ollama endpoint and model based
// on the available credentials and registers the engine alias accordingly. When an
// API key is configured, we default to the hosted cloud endpoint; otherwise we
// assume a self-hosted instance reachable via the docker-compose default. This
// keeps the zero-config path fast for local development while automatically using
// the cloud service when credentials are present.
func registerOllamaEngineDefaults() {
defaultModel := ollama.DefaultModel
defaultUri := ollama.DefaultUri
// Detect Ollama cloud API key.
if key := os.Getenv(ollama.APIKeyEnv); len(key) > 50 && strings.Contains(key, ".") {
defaultModel = ollama.CloudModel
defaultUri = ollama.CloudUri
}
// Register the human-friendly engine name so configuration can simply use
// `Engine: "ollama"` and inherit adapter defaults.
RegisterEngineAlias(ollama.EngineName, EngineInfo{
Uri: defaultUri,
RequestFormat: ApiFormatOllama,
ResponseFormat: ApiFormatOllama,
FileScheme: scheme.Base64,
DefaultModel: defaultModel,
DefaultResolution: ollama.DefaultResolution,
DefaultKey: ollama.APIKeyPlaceholder,
})
CaptionModel.Engine = ollama.EngineName
// Keep the default caption model config aligned with the registered engine defaults.
CaptionModel.ApplyEngineDefaults()
}
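For illustration (not part of this commit), the key check above can be read as a standalone predicate; the helper name below is hypothetical, but the length threshold and dot check match the code in this diff:

package vision

import (
	"os"
	"strings"
)

// looksLikeOllamaCloudKey mirrors the heuristic used above: hosted Ollama API
// keys are long and contain a dot-separated signature, while local setups
// typically leave OLLAMA_API_KEY unset or short.
func looksLikeOllamaCloudKey(key string) bool {
	return len(key) > 50 && strings.Contains(key, ".")
}

// Example: with a cloud-style key exported, the registered defaults point at
// https://ollama.com/api/generate and the cloud model; otherwise they point at
// the docker-compose service URI and gemma3:latest.
var usesCloud = looksLikeOllamaCloudKey(os.Getenv("OLLAMA_API_KEY"))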

View File

@@ -3,11 +3,116 @@ package vision
import (
"context"
"encoding/json"
"os"
"testing"
"github.com/photoprism/photoprism/internal/ai/vision/ollama"
"github.com/photoprism/photoprism/pkg/http/scheme"
)
func TestRegisterOllamaEngineDefaults(t *testing.T) {
original := os.Getenv(ollama.APIKeyEnv)
originalCaptionModel := CaptionModel.Clone()
testCaptionModel := CaptionModel.Clone()
testCaptionModel.Model = ""
testCaptionModel.Service.Uri = ""
cloudToken := "moo9yaiS4ShoKiojiathie2vuejiec2X.Mahl7ewaej4ebi7afq8f_vwe" //nolint:gosec
t.Cleanup(func() {
if original == "" {
_ = os.Unsetenv(ollama.APIKeyEnv)
} else {
_ = os.Setenv(ollama.APIKeyEnv, original)
}
CaptionModel = originalCaptionModel
registerOllamaEngineDefaults()
})
t.Run("SelfHosted", func(t *testing.T) {
CaptionModel = testCaptionModel.Clone()
_ = os.Unsetenv(ollama.APIKeyEnv)
registerOllamaEngineDefaults()
info, ok := EngineInfoFor(ollama.EngineName)
if !ok {
t.Fatalf("expected engine info for %s", ollama.EngineName)
}
if info.Uri != ollama.DefaultUri {
t.Fatalf("expected default uri %s, got %s", ollama.DefaultUri, info.Uri)
}
if info.DefaultModel != ollama.DefaultModel {
t.Fatalf("expected default model %s, got %s", ollama.DefaultModel, info.DefaultModel)
}
if CaptionModel.Model != ollama.DefaultModel {
t.Fatalf("expected caption model %s, got %s", ollama.DefaultModel, CaptionModel.Model)
}
if CaptionModel.Service.Uri != ollama.DefaultUri {
t.Fatalf("expected caption model uri %s, got %s", ollama.DefaultUri, CaptionModel.Service.Uri)
}
})
t.Run("Cloud", func(t *testing.T) {
CaptionModel = testCaptionModel.Clone()
t.Setenv(ollama.APIKeyEnv, cloudToken)
registerOllamaEngineDefaults()
info, ok := EngineInfoFor(ollama.EngineName)
if !ok {
t.Fatalf("expected engine info for %s", ollama.EngineName)
}
if info.Uri != ollama.CloudUri {
t.Fatalf("expected cloud uri %s, got %s", ollama.CloudUri, info.Uri)
}
if info.DefaultModel != ollama.CloudModel {
t.Fatalf("expected cloud model %s, got %s", ollama.CloudModel, info.DefaultModel)
}
if CaptionModel.Model != ollama.CloudModel {
t.Fatalf("expected caption model %s, got %s", ollama.CloudModel, CaptionModel.Model)
}
if CaptionModel.Service.Uri != ollama.CloudUri {
t.Fatalf("expected caption model uri %s, got %s", ollama.CloudUri, CaptionModel.Service.Uri)
}
})
t.Run("NewModels", func(t *testing.T) {
CaptionModel = testCaptionModel.Clone()
t.Setenv(ollama.APIKeyEnv, cloudToken)
registerOllamaEngineDefaults()
model := &Model{Type: ModelTypeCaption, Engine: ollama.EngineName}
model.ApplyEngineDefaults()
if model.Model != ollama.CloudModel {
t.Fatalf("expected model %s, got %s", ollama.CloudModel, model.Model)
}
if model.Service.Uri != ollama.CloudUri {
t.Fatalf("expected service uri %s, got %s", ollama.CloudUri, model.Service.Uri)
}
if model.Service.RequestFormat != ApiFormatOllama || model.Service.ResponseFormat != ApiFormatOllama {
t.Fatalf("expected request/response format %s, got %s/%s", ApiFormatOllama, model.Service.RequestFormat, model.Service.ResponseFormat)
}
if model.Service.FileScheme != scheme.Base64 {
t.Fatalf("expected file scheme %s, got %s", scheme.Base64, model.Service.FileScheme)
}
if model.Resolution != ollama.DefaultResolution {
t.Fatalf("expected resolution %d, got %d", ollama.DefaultResolution, model.Resolution)
}
})
}
func TestOllamaDefaultConfidenceApplied(t *testing.T) {
req := &ApiRequest{Format: FormatJSON}
payload := ollama.Response{

View File

@@ -34,6 +34,7 @@ func init() {
RequestFormat: ApiFormatOpenAI,
ResponseFormat: ApiFormatOpenAI,
FileScheme: scheme.Data,
DefaultModel: openai.DefaultModel,
DefaultResolution: openai.DefaultResolution,
DefaultKey: openai.APIKeyPlaceholder,
})

View File

@@ -133,8 +133,6 @@ func (m *Model) IsDefault() bool {
return m.Name == NsfwModel.Name
case ModelTypeFace:
return m.Name == FacenetModel.Name
- case ModelTypeCaption:
- return m.Name == CaptionModel.Name
}
return false
@@ -467,32 +465,37 @@ func (m *Model) ApplyEngineDefaults() {
}
engine := strings.TrimSpace(strings.ToLower(m.Engine))
if engine == "" {
return
}
if info, ok := EngineInfoFor(engine); ok {
- if m.Service.Uri == "" {
+ if strings.TrimSpace(m.Model) == "" && strings.TrimSpace(m.Name) == "" {
+ m.Model = info.DefaultModel
+ }
+ if strings.TrimSpace(m.Service.Uri) == "" {
m.Service.Uri = info.Uri
}
- if m.Service.RequestFormat == "" {
+ if strings.TrimSpace(m.Service.RequestFormat) == "" {
m.Service.RequestFormat = info.RequestFormat
}
- if m.Service.ResponseFormat == "" {
+ if strings.TrimSpace(m.Service.ResponseFormat) == "" {
m.Service.ResponseFormat = info.ResponseFormat
}
- if info.FileScheme != "" && m.Service.FileScheme == "" {
+ if strings.TrimSpace(m.Service.FileScheme) == "" && info.FileScheme != "" {
m.Service.FileScheme = info.FileScheme
}
- if info.DefaultResolution > 0 && m.Resolution <= 0 {
+ if m.Resolution <= 0 && info.DefaultResolution > 0 {
m.Resolution = info.DefaultResolution
}
- if strings.TrimSpace(m.Service.Key) == "" && strings.TrimSpace(info.DefaultKey) != "" {
+ if strings.TrimSpace(m.Service.Key) == "" && info.DefaultKey != "" {
m.Service.Key = info.DefaultKey
}
}
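The net effect of ApplyEngineDefaults is fill-in-if-empty: explicitly configured values always win, and only blank fields inherit the registered engine defaults. A short usage sketch inside the vision package, using only the API shown in this diff:

// Only Type and Engine are set, so the model name, URI, request/response
// formats, file scheme, resolution, and ${OLLAMA_API_KEY} placeholder are all
// inherited from the registered "ollama" alias.
m := &Model{Type: ModelTypeCaption, Engine: "ollama"}
m.ApplyEngineDefaults()

// An explicitly configured value is never overwritten:
custom := &Model{Type: ModelTypeCaption, Engine: "ollama"}
custom.Service.Uri = "http://localhost:11434/api/generate"
custom.ApplyEngineDefaults() // keeps the localhost URI, fills in the rest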

View File

@@ -88,14 +88,8 @@ var (
},
}
CaptionModel = &Model{
- Type: ModelTypeCaption,
- Model: ollama.CaptionModel,
- Version: VersionLatest,
- Engine: ollama.EngineName,
- Resolution: 720, // Original aspect ratio, with a max size of 720 x 720 pixels.
- Service: Service{
- Uri: "http://ollama:11434/api/generate",
- },
+ Type: ModelTypeCaption,
+ Engine: ollama.EngineName,
}
DefaultModels = Models{
NasnetModel,
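With the literal values removed above, the effective self-hosted caption configuration is now assembled at init time by registerOllamaEngineDefaults and ApplyEngineDefaults. Assuming no OLLAMA_API_KEY is set, the result is roughly equivalent to the following sketch (values taken from the constants added in this commit):

CaptionModel = &Model{
	Type:       ModelTypeCaption,
	Engine:     ollama.EngineName,
	Model:      ollama.DefaultModel,      // "gemma3:latest"
	Resolution: ollama.DefaultResolution, // 720
	Service: Service{
		Uri:            ollama.DefaultUri, // "http://ollama:11434/api/generate"
		Key:            ollama.APIKeyPlaceholder,
		RequestFormat:  ApiFormatOllama,
		ResponseFormat: ApiFormatOllama,
		FileScheme:     scheme.Base64,
	},
}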

View File

@@ -11,4 +11,28 @@ const (
APIKeyFileEnv = "OLLAMA_API_KEY_FILE" //nolint:gosec // environment variable name, not a secret
// APIKeyPlaceholder is the `${VAR}` form injected when no explicit key is provided.
APIKeyPlaceholder = "${" + APIKeyEnv + "}"
// DefaultUri is the default service URI for self-hosted Ollama instances.
DefaultUri = "http://ollama:11434/api/generate"
// CloudUri is the Ollama cloud service URI.
CloudUri = "https://ollama.com/api/generate"
// DefaultModel names the default caption model bundled with our adapter defaults.
DefaultModel = "gemma3:latest"
// CloudModel names the default caption model for the Ollama cloud service, see https://ollama.com/cloud.
CloudModel = "qwen3-vl:235b-instruct"
// CaptionPrompt instructs Ollama caption models to emit a single, active-voice sentence.
CaptionPrompt = "Create a caption with exactly one sentence in the active voice that describes the main visual content. Begin with the main subject and clear action. Avoid text formatting, meta-language, and filler words."
// LabelConfidenceDefault is used when the model omits the confidence field.
LabelConfidenceDefault = 0.5
// LabelSystem defines the system prompt shared by Ollama label models. It aims to ensure that single-word nouns are returned.
LabelSystem = "You are a PhotoPrism vision model. Output concise JSON that matches the schema. Each label name MUST be a single-word noun in its canonical singular form. Avoid spaces, punctuation, emoji, or descriptive phrases."
// LabelSystemSimple defines a simple system prompt for Ollama label models that does not strictly require names to be single-word nouns.
LabelSystemSimple = "You are a PhotoPrism vision model. Output concise JSON that matches the schema."
// LabelPromptDefault defines a simple user prompt for Ollama label models.
LabelPromptDefault = "Analyze the image and return label objects with name, confidence (0-1), and topicality (0-1)."
// LabelPromptStrict asks the model to return scored labels for the provided image. It aims to ensure that single-word nouns are returned.
LabelPromptStrict = "Analyze the image and return label objects with name (single-word noun), confidence (0-1), and topicality (0-1). Respond with JSON exactly like {\"labels\":[{\"name\":\"sunset\",\"confidence\":0.72,\"topicality\":0.64}]} and adjust the values for this image."
// LabelPromptNSFW asks the model to return scored labels for the provided image that includes a NSFW flag and score. It aims to ensure that single-word nouns are returned.
LabelPromptNSFW = "Analyze the image and return label objects with name (single-word noun), confidence (0-1), topicality (0-1), nsfw (true when the label describes sensitive or adult content), and nsfw_confidence (0-1). Respond with JSON exactly like {\"labels\":[{\"name\":\"sunset\",\"confidence\":0.72,\"topicality\":0.64,\"nsfw\":false,\"nsfw_confidence\":0.02}]} and adjust the values for this image."
// DefaultResolution is the default thumbnail size submitted to Ollama models.
DefaultResolution = 720
)
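LabelConfidenceDefault backs the fallback exercised by TestOllamaDefaultConfidenceApplied earlier in this diff: when a label in the model's JSON output omits its confidence score, the adapter substitutes 0.5 instead of dropping the label. A hedged sketch of that fallback; the label struct and pointer field below are illustrative, not the adapter's actual response types:

package ollama

// sketchLabel is a hypothetical decoded label; a nil Confidence means the
// model omitted the field in its JSON output.
type sketchLabel struct {
	Name       string   `json:"name"`
	Confidence *float64 `json:"confidence,omitempty"`
}

// confidenceOrDefault shows where LabelConfidenceDefault (0.5) applies.
func confidenceOrDefault(l sketchLabel) float64 {
	if l.Confidence == nil {
		return LabelConfidenceDefault
	}
	return *l.Confidence
}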

View File

@@ -1,22 +0,0 @@
package ollama
const (
// CaptionPrompt instructs Ollama caption models to emit a single, active-voice sentence.
CaptionPrompt = "Create a caption with exactly one sentence in the active voice that describes the main visual content. Begin with the main subject and clear action. Avoid text formatting, meta-language, and filler words."
// CaptionModel names the default caption model bundled with our adapter defaults.
CaptionModel = "gemma3:latest"
// LabelConfidenceDefault is used when the model omits the confidence field.
LabelConfidenceDefault = 0.5
// LabelSystem defines the system prompt shared by Ollama label models. It aims to ensure that single-word nouns are returned.
LabelSystem = "You are a PhotoPrism vision model. Output concise JSON that matches the schema. Each label name MUST be a single-word noun in its canonical singular form. Avoid spaces, punctuation, emoji, or descriptive phrases."
// LabelSystemSimple defines a simple system prompt for Ollama label models that does not strictly require names to be single-word nouns.
LabelSystemSimple = "You are a PhotoPrism vision model. Output concise JSON that matches the schema."
// LabelPromptDefault defines a simple user prompt for Ollama label models.
LabelPromptDefault = "Analyze the image and return label objects with name, confidence (0-1), and topicality (0-1)."
// LabelPromptStrict asks the model to return scored labels for the provided image. It aims to ensure that single-word nouns are returned.
LabelPromptStrict = "Analyze the image and return label objects with name (single-word noun), confidence (0-1), and topicality (0-1). Respond with JSON exactly like {\"labels\":[{\"name\":\"sunset\",\"confidence\":0.72,\"topicality\":0.64}]} and adjust the values for this image."
// LabelPromptNSFW asks the model to return scored labels for the provided image that includes a NSFW flag and score. It aims to ensure that single-word nouns are returned.
LabelPromptNSFW = "Analyze the image and return label objects with name (single-word noun), confidence (0-1), topicality (0-1), nsfw (true when the label describes sensitive or adult content), and nsfw_confidence (0-1). Respond with JSON exactly like {\"labels\":[{\"name\":\"sunset\",\"confidence\":0.72,\"topicality\":0.64,\"nsfw\":false,\"nsfw_confidence\":0.02}]} and adjust the values for this image."
// DefaultResolution is the default thumbnail size submitted to Ollama models.
DefaultResolution = 720
)

View File

@@ -11,4 +11,30 @@ const (
APIKeyFileEnv = "OPENAI_API_KEY_FILE" //nolint:gosec // environment variable name, not a secret
// APIKeyPlaceholder is the `${VAR}` form injected when no explicit key is provided.
APIKeyPlaceholder = "${" + APIKeyEnv + "}"
// DefaultModel is the model used by default when accessing the OpenAI API.
DefaultModel = "gpt-5-mini"
// DefaultResolution is the default thumbnail size submitted to the OpenAI API.
DefaultResolution = 720
// CaptionSystem defines the default system prompt for caption models.
CaptionSystem = "You are a PhotoPrism vision model. Return concise, user-friendly captions that describe the main subjects accurately."
// CaptionPrompt instructs caption models to respond with a single sentence.
CaptionPrompt = "Provide exactly one sentence describing the key subject and action in the image. Avoid filler words and technical jargon."
// LabelSystem defines the system prompt for label generation.
LabelSystem = "You are a PhotoPrism vision model. Emit JSON that matches the provided schema and keep label names short, singular nouns."
// LabelPromptDefault requests general-purpose labels.
LabelPromptDefault = "Analyze the image and return label objects with name, confidence (0-1), and topicality (0-1)."
// LabelPromptNSFW requests labels including NSFW metadata when required.
LabelPromptNSFW = "Analyze the image and return label objects with name, confidence (0-1), topicality (0-1), nsfw (true when sensitive), and nsfw_confidence (0-1)."
// DefaultDetail specifies the preferred thumbnail detail level for Requests API calls.
DefaultDetail = "low"
// CaptionMaxTokens suggests the output budget for caption responses.
CaptionMaxTokens = 512
// LabelsMaxTokens suggests the output budget for label responses.
LabelsMaxTokens = 1024
// DefaultTemperature configures deterministic replies.
DefaultTemperature = 0.1
// DefaultTopP limits nucleus sampling.
DefaultTopP = 0.9
// DefaultSchemaVersion is used when callers do not specify an explicit schema version.
DefaultSchemaVersion = "v1"
)
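These tuning constants are meant to be dropped into outgoing requests. The payload below is a hypothetical illustration of where each one would land; the struct and its JSON field names follow common OpenAI-style request shapes and are assumptions, not this adapter's real request type:

package openai

// sketchCaptionRequest is an illustrative payload, not the adapter's type.
type sketchCaptionRequest struct {
	Model           string  `json:"model"`
	Temperature     float64 `json:"temperature"`
	TopP            float64 `json:"top_p"`
	MaxOutputTokens int     `json:"max_output_tokens"`
	Detail          string  `json:"detail"`
}

func newSketchCaptionRequest() sketchCaptionRequest {
	return sketchCaptionRequest{
		Model:           DefaultModel,       // "gpt-5-mini"
		Temperature:     DefaultTemperature, // 0.1 keeps replies near-deterministic
		TopP:            DefaultTopP,        // 0.9 nucleus sampling cap
		MaxOutputTokens: CaptionMaxTokens,   // 512-token caption budget
		Detail:          DefaultDetail,      // "low" thumbnail detail
	}
}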

View File

@@ -1,33 +0,0 @@
package openai
const (
// CaptionSystem defines the default system prompt for caption models.
CaptionSystem = "You are a PhotoPrism vision model. Return concise, user-friendly captions that describe the main subjects accurately."
// CaptionPrompt instructs caption models to respond with a single sentence.
CaptionPrompt = "Provide exactly one sentence describing the key subject and action in the image. Avoid filler words and technical jargon."
// LabelSystem defines the system prompt for label generation.
LabelSystem = "You are a PhotoPrism vision model. Emit JSON that matches the provided schema and keep label names short, singular nouns."
// LabelPromptDefault requests general-purpose labels.
LabelPromptDefault = "Analyze the image and return label objects with name, confidence (0-1), and topicality (0-1)."
// LabelPromptNSFW requests labels including NSFW metadata when required.
LabelPromptNSFW = "Analyze the image and return label objects with name, confidence (0-1), topicality (0-1), nsfw (true when sensitive), and nsfw_confidence (0-1)."
// DefaultDetail specifies the preferred thumbnail detail level for Requests API calls.
DefaultDetail = "low"
// CaptionMaxTokens suggests the output budget for caption responses.
CaptionMaxTokens = 512
// LabelsMaxTokens suggests the output budget for label responses.
LabelsMaxTokens = 1024
// DefaultTemperature configures deterministic replies.
DefaultTemperature = 0.1
// DefaultTopP limits nucleus sampling.
DefaultTopP = 0.9
// DefaultSchemaVersion is used when callers do not specify an explicit schema version.
DefaultSchemaVersion = "v1"
)
var (
// DefaultModel is the model used by default when accessing the OpenAI API.
DefaultModel = "gpt-5-mini"
// DefaultResolution is the default thumbnail size submitted to the OpenAI.
DefaultResolution = 720
)

View File

@@ -66,7 +66,6 @@ Models:
Outputs: 512
- Type: caption
Model: gemma3:latest
Version: latest
Engine: ollama
Resolution: 720
Service: