Mirror of https://github.com/photoprism/photoprism.git (synced 2025-12-11 16:24:11 +01:00)
AI: Set default Model & URI depending on OLLAMA_API_KEY env var #5361
Signed-off-by: Michael Mayer <michael@photoprism.app>
@@ -75,6 +75,7 @@ type EngineInfo struct {
    RequestFormat     ApiFormat
    ResponseFormat    ApiFormat
    FileScheme        string
    DefaultModel      string
    DefaultResolution int
    DefaultKey        string // Optional placeholder key (e.g., ${OPENAI_API_KEY}); applied only when Service.Key is empty.
}

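The new DefaultKey field carries a `${VAR}`-style placeholder that is only applied when no explicit Service.Key is configured. As a rough illustration of how such a placeholder can resolve against the environment (the helper below is hypothetical and not code from this commit):

```go
package main

import (
	"fmt"
	"os"
)

// resolveKey expands a "${VAR}" style placeholder such as "${OPENAI_API_KEY}".
// If the referenced variable is unset, the result is an empty string, i.e. the
// service ends up with no key rather than a literal "${...}" value.
// (Illustrative helper; the adapter's actual resolution logic is not shown in this diff.)
func resolveKey(key string) string {
	return os.ExpandEnv(key)
}

func main() {
	_ = os.Setenv("OPENAI_API_KEY", "sk-example")
	fmt.Println(resolveKey("${OPENAI_API_KEY}")) // sk-example

	_ = os.Unsetenv("OPENAI_API_KEY")
	fmt.Println(resolveKey("${OPENAI_API_KEY}")) // "" (unset variable expands to empty)
}
```
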
@@ -2,6 +2,7 @@ package vision

import (
    "context"
    "os"
    "strings"

    "github.com/photoprism/photoprism/internal/ai/vision/ollama"

@@ -23,17 +24,38 @@ func init() {
        Defaults: ollamaDefaults{},
    })

    registerOllamaEngineDefaults()
}

// registerOllamaEngineDefaults selects the default Ollama endpoint based on the
// available credentials and registers the engine alias accordingly. When an
// API key is configured, we default to the hosted Cloud endpoint; otherwise we
// assume a self-hosted instance reachable via the docker-compose default.
// This keeps the zero-config path fast for local dev while automatically using
// the cloud service when credentials are present.
func registerOllamaEngineDefaults() {
    defaultModel := ollama.DefaultModel
    defaultUri := ollama.DefaultUri

    // Detect Ollama cloud API key.
    if key := os.Getenv(ollama.APIKeyEnv); len(key) > 50 && strings.Contains(key, ".") {
        defaultModel = ollama.CloudModel
        defaultUri = ollama.CloudUri
    }

    // Register the human-friendly engine name so configuration can simply use
    // `Engine: "ollama"` and inherit adapter defaults.
    RegisterEngineAlias(ollama.EngineName, EngineInfo{
        Uri:               defaultUri,
        RequestFormat:     ApiFormatOllama,
        ResponseFormat:    ApiFormatOllama,
        FileScheme:        scheme.Base64,
        DefaultModel:      defaultModel,
        DefaultResolution: ollama.DefaultResolution,
        DefaultKey:        ollama.APIKeyPlaceholder,
    })

    CaptionModel.Engine = ollama.EngineName
    // Keep the default caption model config aligned with the defaults.
    CaptionModel.ApplyEngineDefaults()
}

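To make the selection logic easy to try in isolation, here is a self-contained sketch that mirrors the heuristic above; the constants are copied from the ollama package in this diff, but the wiring is simplified and is not the package's actual registration code:

```go
package main

import (
	"fmt"
	"os"
	"strings"
)

// Constants mirrored from internal/ai/vision/ollama for illustration only.
const (
	apiKeyEnv    = "OLLAMA_API_KEY"
	defaultUri   = "http://ollama:11434/api/generate"
	defaultModel = "gemma3:latest"
	cloudUri     = "https://ollama.com/api/generate"
	cloudModel   = "qwen3-vl:235b-instruct"
)

// ollamaDefaults applies the same heuristic as registerOllamaEngineDefaults:
// a key longer than 50 characters that contains a "." is treated as a cloud
// API token; everything else falls back to the self-hosted defaults.
func ollamaDefaults() (uri, model string) {
	uri, model = defaultUri, defaultModel
	if key := os.Getenv(apiKeyEnv); len(key) > 50 && strings.Contains(key, ".") {
		uri, model = cloudUri, cloudModel
	}
	return uri, model
}

func main() {
	_ = os.Unsetenv(apiKeyEnv)
	fmt.Println(ollamaDefaults()) // http://ollama:11434/api/generate gemma3:latest

	_ = os.Setenv(apiKeyEnv, strings.Repeat("x", 40)+"."+strings.Repeat("z", 20))
	fmt.Println(ollamaDefaults()) // https://ollama.com/api/generate qwen3-vl:235b-instruct
}
```
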
@@ -3,11 +3,116 @@ package vision
import (
    "context"
    "encoding/json"
    "os"
    "testing"

    "github.com/photoprism/photoprism/internal/ai/vision/ollama"
    "github.com/photoprism/photoprism/pkg/http/scheme"
)

func TestRegisterOllamaEngineDefaults(t *testing.T) {
    original := os.Getenv(ollama.APIKeyEnv)
    originalCaptionModel := CaptionModel.Clone()
    testCaptionModel := CaptionModel.Clone()
    testCaptionModel.Model = ""
    testCaptionModel.Service.Uri = ""
    cloudToken := "moo9yaiS4ShoKiojiathie2vuejiec2X.Mahl7ewaej4ebi7afq8f_vwe" //nolint:gosec

    t.Cleanup(func() {
        if original == "" {
            _ = os.Unsetenv(ollama.APIKeyEnv)
        } else {
            _ = os.Setenv(ollama.APIKeyEnv, original)
        }
        CaptionModel = originalCaptionModel
        registerOllamaEngineDefaults()
    })

    t.Run("SelfHosted", func(t *testing.T) {
        CaptionModel = testCaptionModel.Clone()
        _ = os.Unsetenv(ollama.APIKeyEnv)

        registerOllamaEngineDefaults()

        info, ok := EngineInfoFor(ollama.EngineName)
        if !ok {
            t.Fatalf("expected engine info for %s", ollama.EngineName)
        }

        if info.Uri != ollama.DefaultUri {
            t.Fatalf("expected default uri %s, got %s", ollama.DefaultUri, info.Uri)
        }

        if info.DefaultModel != ollama.DefaultModel {
            t.Fatalf("expected default model %s, got %s", ollama.DefaultModel, info.DefaultModel)
        }

        if CaptionModel.Model != ollama.DefaultModel {
            t.Fatalf("expected caption model %s, got %s", ollama.DefaultModel, CaptionModel.Model)
        }

        if CaptionModel.Service.Uri != ollama.DefaultUri {
            t.Fatalf("expected caption model uri %s, got %s", ollama.DefaultUri, CaptionModel.Service.Uri)
        }
    })
    t.Run("Cloud", func(t *testing.T) {
        CaptionModel = testCaptionModel.Clone()
        t.Setenv(ollama.APIKeyEnv, cloudToken)

        registerOllamaEngineDefaults()

        info, ok := EngineInfoFor(ollama.EngineName)
        if !ok {
            t.Fatalf("expected engine info for %s", ollama.EngineName)
        }

        if info.Uri != ollama.CloudUri {
            t.Fatalf("expected cloud uri %s, got %s", ollama.CloudUri, info.Uri)
        }

        if info.DefaultModel != ollama.CloudModel {
            t.Fatalf("expected cloud model %s, got %s", ollama.CloudModel, info.DefaultModel)
        }

        if CaptionModel.Model != ollama.CloudModel {
            t.Fatalf("expected caption model %s, got %s", ollama.CloudModel, CaptionModel.Model)
        }

        if CaptionModel.Service.Uri != ollama.CloudUri {
            t.Fatalf("expected caption model uri %s, got %s", ollama.CloudUri, CaptionModel.Service.Uri)
        }
    })
    t.Run("NewModels", func(t *testing.T) {
        CaptionModel = testCaptionModel.Clone()

        t.Setenv(ollama.APIKeyEnv, cloudToken)
        registerOllamaEngineDefaults()

        model := &Model{Type: ModelTypeCaption, Engine: ollama.EngineName}
        model.ApplyEngineDefaults()

        if model.Model != ollama.CloudModel {
            t.Fatalf("expected model %s, got %s", ollama.CloudModel, model.Model)
        }

        if model.Service.Uri != ollama.CloudUri {
            t.Fatalf("expected service uri %s, got %s", ollama.CloudUri, model.Service.Uri)
        }

        if model.Service.RequestFormat != ApiFormatOllama || model.Service.ResponseFormat != ApiFormatOllama {
            t.Fatalf("expected request/response format %s, got %s/%s", ApiFormatOllama, model.Service.RequestFormat, model.Service.ResponseFormat)
        }

        if model.Service.FileScheme != scheme.Base64 {
            t.Fatalf("expected file scheme %s, got %s", scheme.Base64, model.Service.FileScheme)
        }

        if model.Resolution != ollama.DefaultResolution {
            t.Fatalf("expected resolution %d, got %d", ollama.DefaultResolution, model.Resolution)
        }
    })
}

func TestOllamaDefaultConfidenceApplied(t *testing.T) {
    req := &ApiRequest{Format: FormatJSON}
    payload := ollama.Response{

@@ -34,6 +34,7 @@ func init() {
        RequestFormat:     ApiFormatOpenAI,
        ResponseFormat:    ApiFormatOpenAI,
        FileScheme:        scheme.Data,
        DefaultModel:      openai.DefaultModel,
        DefaultResolution: openai.DefaultResolution,
        DefaultKey:        openai.APIKeyPlaceholder,
    })

@@ -133,8 +133,6 @@ func (m *Model) IsDefault() bool {
        return m.Name == NsfwModel.Name
    case ModelTypeFace:
        return m.Name == FacenetModel.Name
    case ModelTypeCaption:
        return m.Name == CaptionModel.Name
    }

    return false

@@ -467,32 +465,37 @@ func (m *Model) ApplyEngineDefaults() {
    }

    engine := strings.TrimSpace(strings.ToLower(m.Engine))

    if engine == "" {
        return
    }

    if info, ok := EngineInfoFor(engine); ok {
        if m.Service.Uri == "" {
        if strings.TrimSpace(m.Model) == "" && strings.TrimSpace(m.Name) == "" {
            m.Model = info.DefaultModel
        }

        if strings.TrimSpace(m.Service.Uri) == "" {
            m.Service.Uri = info.Uri
        }

        if m.Service.RequestFormat == "" {
        if strings.TrimSpace(m.Service.RequestFormat) == "" {
            m.Service.RequestFormat = info.RequestFormat
        }

        if m.Service.ResponseFormat == "" {
        if strings.TrimSpace(m.Service.ResponseFormat) == "" {
            m.Service.ResponseFormat = info.ResponseFormat
        }

        if info.FileScheme != "" && m.Service.FileScheme == "" {
        if strings.TrimSpace(m.Service.FileScheme) == "" && info.FileScheme != "" {
            m.Service.FileScheme = info.FileScheme
        }

        if info.DefaultResolution > 0 && m.Resolution <= 0 {
        if m.Resolution <= 0 && info.DefaultResolution > 0 {
            m.Resolution = info.DefaultResolution
        }

        if strings.TrimSpace(m.Service.Key) == "" && strings.TrimSpace(info.DefaultKey) != "" {
        if strings.TrimSpace(m.Service.Key) == "" && info.DefaultKey != "" {
            m.Service.Key = info.DefaultKey
        }
    }

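A recurring change in this hunk is that plain `== ""` checks become `strings.TrimSpace(...) == ""` checks, so whitespace-only values no longer count as configured. A minimal sketch of that fill-if-empty pattern (the helper name is illustrative, not from the codebase):

```go
package main

import (
	"fmt"
	"strings"
)

// fillIfEmpty mirrors the pattern ApplyEngineDefaults now uses consistently:
// the engine default is only applied when the configured value is empty after
// trimming whitespace, so "   " no longer counts as an explicit setting.
func fillIfEmpty(current, fallback string) string {
	if strings.TrimSpace(current) == "" {
		return fallback
	}
	return current
}

func main() {
	fmt.Println(fillIfEmpty("", "http://ollama:11434/api/generate"))     // default applied
	fmt.Println(fillIfEmpty("   ", "gemma3:latest"))                      // whitespace treated as empty
	fmt.Println(fillIfEmpty("https://example.test/api", "fallback-uri"))  // explicit value kept
}
```
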
@@ -89,13 +89,7 @@ var (
    }
    CaptionModel = &Model{
        Type:       ModelTypeCaption,
        Model:      ollama.CaptionModel,
        Version:    VersionLatest,
        Engine:     ollama.EngineName,
        Resolution: 720, // Original aspect ratio, with a max size of 720 x 720 pixels.
        Service: Service{
            Uri: "http://ollama:11434/api/generate",
        },
    }
    DefaultModels = Models{
        NasnetModel,

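With the hard-coded Model, Resolution, and Service.Uri removed from the default CaptionModel, those values now come from the registered engine alias at startup. The following simplified, self-contained sketch shows the idea; the types are stand-ins for vision.EngineInfo and vision.Model, not the real ones:

```go
package main

import "fmt"

// Simplified stand-ins for vision.EngineInfo and vision.Model, for illustration only.
type engineInfo struct {
	Uri               string
	DefaultModel      string
	DefaultResolution int
}

type model struct {
	Type       string
	Engine     string
	Model      string
	Resolution int
	Uri        string
}

// Engine alias as registered by registerOllamaEngineDefaults (self-hosted case).
var engines = map[string]engineInfo{
	"ollama": {
		Uri:               "http://ollama:11434/api/generate",
		DefaultModel:      "gemma3:latest",
		DefaultResolution: 720,
	},
}

// applyEngineDefaults fills empty fields from the registered engine info, the
// way the trimmed-down CaptionModel default now gets its model, URI, and
// resolution at startup instead of hard-coding them.
func applyEngineDefaults(m *model) {
	info, ok := engines[m.Engine]
	if !ok {
		return
	}
	if m.Model == "" {
		m.Model = info.DefaultModel
	}
	if m.Uri == "" {
		m.Uri = info.Uri
	}
	if m.Resolution <= 0 {
		m.Resolution = info.DefaultResolution
	}
}

func main() {
	caption := model{Type: "caption", Engine: "ollama"}
	applyEngineDefaults(&caption)
	fmt.Printf("%+v\n", caption) // Model, Uri, and Resolution filled from the ollama alias
}
```
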
@@ -11,4 +11,28 @@ const (
    APIKeyFileEnv = "OLLAMA_API_KEY_FILE" //nolint:gosec // environment variable name, not a secret
    // APIKeyPlaceholder is the `${VAR}` form injected when no explicit key is provided.
    APIKeyPlaceholder = "${" + APIKeyEnv + "}"
    // DefaultUri is the default service URI for self-hosted Ollama instances.
    DefaultUri = "http://ollama:11434/api/generate"
    // CloudUri is the service URI of the Ollama cloud API.
    CloudUri = "https://ollama.com/api/generate"
    // DefaultModel names the default caption model bundled with our adapter defaults.
    DefaultModel = "gemma3:latest"
    // CloudModel names the default caption model for the Ollama cloud service, see https://ollama.com/cloud.
    CloudModel = "qwen3-vl:235b-instruct"
    // CaptionPrompt instructs Ollama caption models to emit a single, active-voice sentence.
    CaptionPrompt = "Create a caption with exactly one sentence in the active voice that describes the main visual content. Begin with the main subject and clear action. Avoid text formatting, meta-language, and filler words."
    // LabelConfidenceDefault is used when the model omits the confidence field.
    LabelConfidenceDefault = 0.5
    // LabelSystem defines the system prompt shared by Ollama label models. It aims to ensure that single-word nouns are returned.
    LabelSystem = "You are a PhotoPrism vision model. Output concise JSON that matches the schema. Each label name MUST be a single-word noun in its canonical singular form. Avoid spaces, punctuation, emoji, or descriptive phrases."
    // LabelSystemSimple defines a simple system prompt for Ollama label models that does not strictly require names to be single-word nouns.
    LabelSystemSimple = "You are a PhotoPrism vision model. Output concise JSON that matches the schema."
    // LabelPromptDefault defines a simple user prompt for Ollama label models.
    LabelPromptDefault = "Analyze the image and return label objects with name, confidence (0-1), and topicality (0-1)."
    // LabelPromptStrict asks the model to return scored labels for the provided image. It aims to ensure that single-word nouns are returned.
    LabelPromptStrict = "Analyze the image and return label objects with name (single-word noun), confidence (0-1), and topicality (0-1). Respond with JSON exactly like {\"labels\":[{\"name\":\"sunset\",\"confidence\":0.72,\"topicality\":0.64}]} and adjust the values for this image."
    // LabelPromptNSFW asks the model to return scored labels for the provided image, including an NSFW flag and score. It aims to ensure that single-word nouns are returned.
    LabelPromptNSFW = "Analyze the image and return label objects with name (single-word noun), confidence (0-1), topicality (0-1), nsfw (true when the label describes sensitive or adult content), and nsfw_confidence (0-1). Respond with JSON exactly like {\"labels\":[{\"name\":\"sunset\",\"confidence\":0.72,\"topicality\":0.64,\"nsfw\":false,\"nsfw_confidence\":0.02}]} and adjust the values for this image."
    // DefaultResolution is the default thumbnail size submitted to Ollama models.
    DefaultResolution = 720
)

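LabelConfidenceDefault kicks in when a label object omits its confidence field. A small, self-contained sketch of that fallback while decoding the JSON shape the prompts ask for (the struct and helper names are illustrative, not the adapter's actual types):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// label mirrors the JSON shape requested by LabelPromptStrict; the struct and
// field names here are illustrative, not the adapter's actual types.
type label struct {
	Name       string   `json:"name"`
	Confidence *float64 `json:"confidence,omitempty"`
	Topicality float64  `json:"topicality"`
}

const labelConfidenceDefault = 0.5 // same value as ollama.LabelConfidenceDefault

// confidence returns the reported confidence, falling back to the default
// when the model omitted the field entirely.
func (l label) confidence() float64 {
	if l.Confidence == nil {
		return labelConfidenceDefault
	}
	return *l.Confidence
}

func main() {
	data := []byte(`{"labels":[{"name":"sunset","confidence":0.72,"topicality":0.64},{"name":"beach","topicality":0.4}]}`)

	var payload struct {
		Labels []label `json:"labels"`
	}
	if err := json.Unmarshal(data, &payload); err != nil {
		panic(err)
	}

	for _, l := range payload.Labels {
		fmt.Printf("%s: %.2f\n", l.Name, l.confidence()) // sunset: 0.72, beach: 0.50
	}
}
```
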
@@ -1,22 +0,0 @@
package ollama

const (
    // CaptionPrompt instructs Ollama caption models to emit a single, active-voice sentence.
    CaptionPrompt = "Create a caption with exactly one sentence in the active voice that describes the main visual content. Begin with the main subject and clear action. Avoid text formatting, meta-language, and filler words."
    // CaptionModel names the default caption model bundled with our adapter defaults.
    CaptionModel = "gemma3:latest"
    // LabelConfidenceDefault is used when the model omits the confidence field.
    LabelConfidenceDefault = 0.5
    // LabelSystem defines the system prompt shared by Ollama label models. It aims to ensure that single-word nouns are returned.
    LabelSystem = "You are a PhotoPrism vision model. Output concise JSON that matches the schema. Each label name MUST be a single-word noun in its canonical singular form. Avoid spaces, punctuation, emoji, or descriptive phrases."
    // LabelSystemSimple defines a simple system prompt for Ollama label models that does not strictly require names to be single-word nouns.
    LabelSystemSimple = "You are a PhotoPrism vision model. Output concise JSON that matches the schema."
    // LabelPromptDefault defines a simple user prompt for Ollama label models.
    LabelPromptDefault = "Analyze the image and return label objects with name, confidence (0-1), and topicality (0-1)."
    // LabelPromptStrict asks the model to return scored labels for the provided image. It aims to ensure that single-word nouns are returned.
    LabelPromptStrict = "Analyze the image and return label objects with name (single-word noun), confidence (0-1), and topicality (0-1). Respond with JSON exactly like {\"labels\":[{\"name\":\"sunset\",\"confidence\":0.72,\"topicality\":0.64}]} and adjust the values for this image."
    // LabelPromptNSFW asks the model to return scored labels for the provided image that includes a NSFW flag and score. It aims to ensure that single-word nouns are returned.
    LabelPromptNSFW = "Analyze the image and return label objects with name (single-word noun), confidence (0-1), topicality (0-1), nsfw (true when the label describes sensitive or adult content), and nsfw_confidence (0-1). Respond with JSON exactly like {\"labels\":[{\"name\":\"sunset\",\"confidence\":0.72,\"topicality\":0.64,\"nsfw\":false,\"nsfw_confidence\":0.02}]} and adjust the values for this image."
    // DefaultResolution is the default thumbnail size submitted to Ollama models.
    DefaultResolution = 720
)

@@ -11,4 +11,30 @@ const (
    APIKeyFileEnv = "OPENAI_API_KEY_FILE" //nolint:gosec // environment variable name, not a secret
    // APIKeyPlaceholder is the `${VAR}` form injected when no explicit key is provided.
    APIKeyPlaceholder = "${" + APIKeyEnv + "}"
    // DefaultModel is the model used by default when accessing the OpenAI API.
    DefaultModel = "gpt-5-mini"
    // DefaultResolution is the default thumbnail size submitted to the OpenAI API.
    DefaultResolution = 720
    // CaptionSystem defines the default system prompt for caption models.
    CaptionSystem = "You are a PhotoPrism vision model. Return concise, user-friendly captions that describe the main subjects accurately."
    // CaptionPrompt instructs caption models to respond with a single sentence.
    CaptionPrompt = "Provide exactly one sentence describing the key subject and action in the image. Avoid filler words and technical jargon."
    // LabelSystem defines the system prompt for label generation.
    LabelSystem = "You are a PhotoPrism vision model. Emit JSON that matches the provided schema and keep label names short, singular nouns."
    // LabelPromptDefault requests general-purpose labels.
    LabelPromptDefault = "Analyze the image and return label objects with name, confidence (0-1), and topicality (0-1)."
    // LabelPromptNSFW requests labels including NSFW metadata when required.
    LabelPromptNSFW = "Analyze the image and return label objects with name, confidence (0-1), topicality (0-1), nsfw (true when sensitive), and nsfw_confidence (0-1)."
    // DefaultDetail specifies the preferred thumbnail detail level for Requests API calls.
    DefaultDetail = "low"
    // CaptionMaxTokens suggests the output budget for caption responses.
    CaptionMaxTokens = 512
    // LabelsMaxTokens suggests the output budget for label responses.
    LabelsMaxTokens = 1024
    // DefaultTemperature configures deterministic replies.
    DefaultTemperature = 0.1
    // DefaultTopP limits nucleus sampling.
    DefaultTopP = 0.9
    // DefaultSchemaVersion is used when callers do not specify an explicit schema version.
    DefaultSchemaVersion = "v1"
)

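These constants suggest request-level defaults (detail, token budgets, temperature, top_p). How they are assembled into an actual request is not part of this diff; the sketch below only illustrates one plausible fill-in-the-defaults approach, with assumed field names:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// requestOptions is an illustrative options struct; the real request builder in
// the openai adapter is not shown in this diff, so these field names are assumptions.
type requestOptions struct {
	Model       string  `json:"model"`
	Detail      string  `json:"detail"`
	MaxTokens   int     `json:"max_output_tokens"`
	Temperature float64 `json:"temperature"`
	TopP        float64 `json:"top_p"`
}

// Defaults copied from the constants above.
const (
	defaultModel       = "gpt-5-mini"
	defaultDetail      = "low"
	captionMaxTokens   = 512
	defaultTemperature = 0.1
	defaultTopP        = 0.9
)

// withDefaults fills unset fields so callers only need to override what they care about.
func withDefaults(o requestOptions) requestOptions {
	if o.Model == "" {
		o.Model = defaultModel
	}
	if o.Detail == "" {
		o.Detail = defaultDetail
	}
	if o.MaxTokens <= 0 {
		o.MaxTokens = captionMaxTokens
	}
	if o.Temperature == 0 {
		o.Temperature = defaultTemperature
	}
	if o.TopP == 0 {
		o.TopP = defaultTopP
	}
	return o
}

func main() {
	out, _ := json.MarshalIndent(withDefaults(requestOptions{Model: "gpt-5-mini"}), "", "  ")
	fmt.Println(string(out))
}
```
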
@@ -1,33 +0,0 @@
package openai

const (
    // CaptionSystem defines the default system prompt for caption models.
    CaptionSystem = "You are a PhotoPrism vision model. Return concise, user-friendly captions that describe the main subjects accurately."
    // CaptionPrompt instructs caption models to respond with a single sentence.
    CaptionPrompt = "Provide exactly one sentence describing the key subject and action in the image. Avoid filler words and technical jargon."
    // LabelSystem defines the system prompt for label generation.
    LabelSystem = "You are a PhotoPrism vision model. Emit JSON that matches the provided schema and keep label names short, singular nouns."
    // LabelPromptDefault requests general-purpose labels.
    LabelPromptDefault = "Analyze the image and return label objects with name, confidence (0-1), and topicality (0-1)."
    // LabelPromptNSFW requests labels including NSFW metadata when required.
    LabelPromptNSFW = "Analyze the image and return label objects with name, confidence (0-1), topicality (0-1), nsfw (true when sensitive), and nsfw_confidence (0-1)."
    // DefaultDetail specifies the preferred thumbnail detail level for Requests API calls.
    DefaultDetail = "low"
    // CaptionMaxTokens suggests the output budget for caption responses.
    CaptionMaxTokens = 512
    // LabelsMaxTokens suggests the output budget for label responses.
    LabelsMaxTokens = 1024
    // DefaultTemperature configures deterministic replies.
    DefaultTemperature = 0.1
    // DefaultTopP limits nucleus sampling.
    DefaultTopP = 0.9
    // DefaultSchemaVersion is used when callers do not specify an explicit schema version.
    DefaultSchemaVersion = "v1"
)

var (
    // DefaultModel is the model used by default when accessing the OpenAI API.
    DefaultModel = "gpt-5-mini"
    // DefaultResolution is the default thumbnail size submitted to the OpenAI.
    DefaultResolution = 720
)

internal/ai/vision/testdata/vision.yml
@@ -66,7 +66,6 @@ Models:
    Outputs: 512
  - Type: caption
    Model: gemma3:latest
    Version: latest
    Engine: ollama
    Resolution: 720
    Service: