mirror of
https://github.com/photoprism/photoprism.git
synced 2025-12-12 00:34:13 +01:00
AI: Generate Captions & Labels using the OpenAI Responses API #5322
Signed-off-by: Michael Mayer <michael@photoprism.app>
This commit is contained in:
@@ -9,6 +9,9 @@ import (
|
||||
"io"
|
||||
"net/http"
|
||||
|
||||
"github.com/sirupsen/logrus"
|
||||
|
||||
"github.com/photoprism/photoprism/internal/ai/vision/ollama"
|
||||
"github.com/photoprism/photoprism/pkg/clean"
|
||||
"github.com/photoprism/photoprism/pkg/http/header"
|
||||
)
|
||||
@@ -69,6 +72,10 @@ func PerformApiRequest(apiRequest *ApiRequest, uri, method, key string) (apiResp
|
||||
return nil, parseErr
|
||||
}
|
||||
|
||||
if log.IsLevelEnabled(logrus.TraceLevel) {
|
||||
log.Tracef("vision: response %s", string(body))
|
||||
}
|
||||
|
||||
return parsed, nil
|
||||
}
|
||||
|
||||
@@ -89,12 +96,12 @@ func PerformApiRequest(apiRequest *ApiRequest, uri, method, key string) (apiResp
|
||||
return apiResponse, nil
|
||||
}
|
||||
|
||||
func decodeOllamaResponse(data []byte) (*ApiResponseOllama, error) {
|
||||
resp := &ApiResponseOllama{}
|
||||
func decodeOllamaResponse(data []byte) (*ollama.Response, error) {
|
||||
resp := &ollama.Response{}
|
||||
dec := json.NewDecoder(bytes.NewReader(data))
|
||||
|
||||
for {
|
||||
var chunk ApiResponseOllama
|
||||
var chunk ollama.Response
|
||||
if err := dec.Decode(&chunk); err != nil {
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
|
||||
"github.com/photoprism/photoprism/internal/ai/vision/ollama"
|
||||
"github.com/photoprism/photoprism/pkg/http/scheme"
|
||||
)
|
||||
|
||||
@@ -49,7 +50,7 @@ func TestPerformApiRequestOllama(t *testing.T) {
|
||||
var req ApiRequest
|
||||
assert.NoError(t, json.NewDecoder(r.Body).Decode(&req))
|
||||
assert.Equal(t, FormatJSON, req.Format)
|
||||
assert.NoError(t, json.NewEncoder(w).Encode(ApiResponseOllama{
|
||||
assert.NoError(t, json.NewEncoder(w).Encode(ollama.Response{
|
||||
Model: "qwen2.5vl:latest",
|
||||
Response: `{"labels":[{"name":"test","confidence":0.9,"topicality":0.8}]}`,
|
||||
}))
|
||||
@@ -72,7 +73,7 @@ func TestPerformApiRequestOllama(t *testing.T) {
|
||||
})
|
||||
t.Run("LabelsWithCodeFence", func(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
assert.NoError(t, json.NewEncoder(w).Encode(ApiResponseOllama{
|
||||
assert.NoError(t, json.NewEncoder(w).Encode(ollama.Response{
|
||||
Model: "gemma3:latest",
|
||||
Response: "```json\n{\"labels\":[{\"name\":\"lingerie\",\"confidence\":0.81,\"topicality\":0.73}]}\n```\nThe model provided additional commentary.",
|
||||
}))
|
||||
@@ -95,7 +96,7 @@ func TestPerformApiRequestOllama(t *testing.T) {
|
||||
})
|
||||
t.Run("CaptionFallback", func(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
assert.NoError(t, json.NewEncoder(w).Encode(ApiResponseOllama{
|
||||
assert.NoError(t, json.NewEncoder(w).Encode(ollama.Response{
|
||||
Model: "qwen2.5vl:latest",
|
||||
Response: "plain text",
|
||||
}))
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
package vision
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/photoprism/photoprism/pkg/clean"
|
||||
"github.com/photoprism/photoprism/pkg/http/scheme"
|
||||
@@ -12,53 +10,6 @@ import (
|
||||
"github.com/photoprism/photoprism/pkg/rnd"
|
||||
)
|
||||
|
||||
// ApiResponseOllama represents a Ollama API service response.
|
||||
type ApiResponseOllama struct {
|
||||
Id string `yaml:"Id,omitempty" json:"id,omitempty"`
|
||||
Code int `yaml:"Code,omitempty" json:"code,omitempty"`
|
||||
Error string `yaml:"Error,omitempty" json:"error,omitempty"`
|
||||
Model string `yaml:"Model,omitempty" json:"model,omitempty"`
|
||||
CreatedAt time.Time `yaml:"CreatedAt,omitempty" json:"created_at,omitempty"`
|
||||
Response string `yaml:"Response,omitempty" json:"response,omitempty"`
|
||||
Done bool `yaml:"Done,omitempty" json:"done,omitempty"`
|
||||
Context []int `yaml:"Context,omitempty" json:"context,omitempty"`
|
||||
TotalDuration int64 `yaml:"TotalDuration,omitempty" json:"total_duration,omitempty"`
|
||||
LoadDuration int `yaml:"LoadDuration,omitempty" json:"load_duration,omitempty"`
|
||||
PromptEvalCount int `yaml:"PromptEvalCount,omitempty" json:"prompt_eval_count,omitempty"`
|
||||
PromptEvalDuration int `yaml:"PromptEvalDuration,omitempty" json:"prompt_eval_duration,omitempty"`
|
||||
EvalCount int `yaml:"EvalCount,omitempty" json:"eval_count,omitempty"`
|
||||
EvalDuration int64 `yaml:"EvalDuration,omitempty" json:"eval_duration,omitempty"`
|
||||
Result ApiResult `yaml:"Result,omitempty" json:"result,omitempty"`
|
||||
}
|
||||
|
||||
// Err returns an error if the request has failed.
|
||||
func (r *ApiResponseOllama) Err() error {
|
||||
if r == nil {
|
||||
return errors.New("response is nil")
|
||||
}
|
||||
|
||||
if r.Code >= 400 {
|
||||
if r.Error != "" {
|
||||
return errors.New(r.Error)
|
||||
}
|
||||
|
||||
return fmt.Errorf("error %d", r.Code)
|
||||
} else if r.Result.IsEmpty() {
|
||||
return errors.New("no result")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// HasResult checks if there is at least one result in the response data.
|
||||
func (r *ApiResponseOllama) HasResult() bool {
|
||||
if r == nil {
|
||||
return false
|
||||
}
|
||||
|
||||
return !r.Result.IsEmpty()
|
||||
}
|
||||
|
||||
// NewApiRequestOllama returns a new Ollama API request with the specified images as payload.
|
||||
func NewApiRequestOllama(images Files, fileScheme scheme.Type) (*ApiRequest, error) {
|
||||
imagesData := make(Files, len(images))
|
||||
|
||||
@@ -11,6 +11,8 @@ import (
|
||||
|
||||
"github.com/sirupsen/logrus"
|
||||
|
||||
"github.com/photoprism/photoprism/internal/ai/vision/openai"
|
||||
"github.com/photoprism/photoprism/internal/ai/vision/schema"
|
||||
"github.com/photoprism/photoprism/internal/api/download"
|
||||
"github.com/photoprism/photoprism/pkg/clean"
|
||||
"github.com/photoprism/photoprism/pkg/fs"
|
||||
@@ -58,6 +60,11 @@ type ApiRequestOptions struct {
|
||||
UseMmap bool `yaml:"UseMmap,omitempty" json:"use_mmap,omitempty"`
|
||||
UseMlock bool `yaml:"UseMlock,omitempty" json:"use_mlock,omitempty"`
|
||||
NumThread int `yaml:"NumThread,omitempty" json:"num_thread,omitempty"`
|
||||
MaxOutputTokens int `yaml:"MaxOutputTokens,omitempty" json:"max_output_tokens,omitempty"`
|
||||
Detail string `yaml:"Detail,omitempty" json:"detail,omitempty"`
|
||||
ForceJson bool `yaml:"ForceJson,omitempty" json:"force_json,omitempty"`
|
||||
SchemaVersion string `yaml:"SchemaVersion,omitempty" json:"schema_version,omitempty"`
|
||||
CombineOutputs string `yaml:"CombineOutputs,omitempty" json:"combine_outputs,omitempty"`
|
||||
}
|
||||
|
||||
// ApiRequestContext represents a context parameter returned from a previous request.
|
||||
@@ -77,6 +84,7 @@ type ApiRequest struct {
|
||||
Context *ApiRequestContext `form:"context" yaml:"Context,omitempty" json:"context,omitempty"`
|
||||
Stream bool `form:"stream" yaml:"Stream,omitempty" json:"stream"`
|
||||
Images Files `form:"images" yaml:"Images,omitempty" json:"images,omitempty"`
|
||||
Schema json.RawMessage `form:"schema" yaml:"Schema,omitempty" json:"schema,omitempty"`
|
||||
ResponseFormat ApiFormat `form:"-" yaml:"-" json:"-"`
|
||||
}
|
||||
|
||||
@@ -195,6 +203,14 @@ func (r *ApiRequest) GetResponseFormat() ApiFormat {
|
||||
|
||||
// JSON returns the request data as JSON-encoded bytes.
|
||||
func (r *ApiRequest) JSON() ([]byte, error) {
|
||||
if r == nil {
|
||||
return nil, errors.New("api request is nil")
|
||||
}
|
||||
|
||||
if r.ResponseFormat == ApiFormatOpenAI {
|
||||
return r.openAIJSON()
|
||||
}
|
||||
|
||||
return json.Marshal(*r)
|
||||
}
|
||||
|
||||
@@ -229,6 +245,8 @@ func (r *ApiRequest) sanitizedForLog() ApiRequest {
|
||||
|
||||
sanitized.Url = sanitizeLogPayload(r.Url)
|
||||
|
||||
sanitized.Schema = r.Schema
|
||||
|
||||
return sanitized
|
||||
}
|
||||
|
||||
@@ -287,3 +305,134 @@ func isLikelyBase64(value string) bool {
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// openAIJSON converts the request data into an OpenAI Responses API payload.
|
||||
func (r *ApiRequest) openAIJSON() ([]byte, error) {
|
||||
detail := openai.DefaultDetail
|
||||
|
||||
if opts := r.Options; opts != nil && strings.TrimSpace(opts.Detail) != "" {
|
||||
detail = strings.TrimSpace(opts.Detail)
|
||||
}
|
||||
|
||||
messages := make([]openai.InputMessage, 0, 2)
|
||||
|
||||
if system := strings.TrimSpace(r.System); system != "" {
|
||||
messages = append(messages, openai.InputMessage{
|
||||
Role: "system",
|
||||
Type: "message",
|
||||
Content: []openai.ContentItem{
|
||||
{
|
||||
Type: openai.ContentTypeText,
|
||||
Text: system,
|
||||
},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
userContent := make([]openai.ContentItem, 0, len(r.Images)+1)
|
||||
|
||||
if prompt := strings.TrimSpace(r.Prompt); prompt != "" {
|
||||
userContent = append(userContent, openai.ContentItem{
|
||||
Type: openai.ContentTypeText,
|
||||
Text: prompt,
|
||||
})
|
||||
}
|
||||
|
||||
for _, img := range r.Images {
|
||||
if img == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
userContent = append(userContent, openai.ContentItem{
|
||||
Type: openai.ContentTypeImage,
|
||||
ImageURL: img,
|
||||
Detail: detail,
|
||||
})
|
||||
}
|
||||
|
||||
if len(userContent) > 0 {
|
||||
messages = append(messages, openai.InputMessage{
|
||||
Role: "user",
|
||||
Type: "message",
|
||||
Content: userContent,
|
||||
})
|
||||
}
|
||||
|
||||
if len(messages) == 0 {
|
||||
return nil, errors.New("openai request requires at least one message")
|
||||
}
|
||||
|
||||
payload := openai.HTTPRequest{
|
||||
Model: strings.TrimSpace(r.Model),
|
||||
Input: messages,
|
||||
}
|
||||
|
||||
if payload.Model == "" {
|
||||
payload.Model = openai.DefaultModel
|
||||
}
|
||||
|
||||
if strings.HasPrefix(strings.ToLower(payload.Model), "gpt-5") {
|
||||
payload.Reasoning = &openai.Reasoning{Effort: "low"}
|
||||
}
|
||||
|
||||
if opts := r.Options; opts != nil {
|
||||
if opts.MaxOutputTokens > 0 {
|
||||
payload.MaxOutputTokens = opts.MaxOutputTokens
|
||||
}
|
||||
|
||||
if opts.Temperature > 0 {
|
||||
payload.Temperature = opts.Temperature
|
||||
}
|
||||
|
||||
if opts.TopP > 0 {
|
||||
payload.TopP = opts.TopP
|
||||
}
|
||||
|
||||
if opts.PresencePenalty != 0 {
|
||||
payload.PresencePenalty = opts.PresencePenalty
|
||||
}
|
||||
|
||||
if opts.FrequencyPenalty != 0 {
|
||||
payload.FrequencyPenalty = opts.FrequencyPenalty
|
||||
}
|
||||
}
|
||||
|
||||
if format := buildOpenAIResponseFormat(r); format != nil {
|
||||
payload.Text = &openai.TextOptions{
|
||||
Format: format,
|
||||
}
|
||||
}
|
||||
|
||||
return json.Marshal(payload)
|
||||
}
|
||||
|
||||
// buildOpenAIResponseFormat determines which response_format to send to OpenAI.
|
||||
func buildOpenAIResponseFormat(r *ApiRequest) *openai.ResponseFormat {
|
||||
if r == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
opts := r.Options
|
||||
hasSchema := len(r.Schema) > 0
|
||||
|
||||
if !hasSchema && (opts == nil || !opts.ForceJson) {
|
||||
return nil
|
||||
}
|
||||
|
||||
result := &openai.ResponseFormat{}
|
||||
|
||||
if hasSchema {
|
||||
result.Type = openai.ResponseFormatJSONSchema
|
||||
result.Schema = r.Schema
|
||||
|
||||
if opts != nil && strings.TrimSpace(opts.SchemaVersion) != "" {
|
||||
result.Name = strings.TrimSpace(opts.SchemaVersion)
|
||||
} else {
|
||||
result.Name = schema.JsonSchemaName(r.Schema, openai.DefaultSchemaVersion)
|
||||
}
|
||||
} else {
|
||||
result.Type = openai.ResponseFormatJSONObject
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
@@ -53,7 +53,11 @@ func captionInternal(images Files, mediaSrc media.Src) (result *CaptionResult, m
|
||||
|
||||
apiRequest.System = model.GetSystemPrompt()
|
||||
apiRequest.Prompt = model.GetPrompt()
|
||||
apiRequest.Options = model.GetOptions()
|
||||
|
||||
if apiRequest.Options == nil {
|
||||
apiRequest.Options = model.GetOptions()
|
||||
}
|
||||
|
||||
apiRequest.WriteLog()
|
||||
|
||||
if apiResponse, err = PerformApiRequest(apiRequest, uri, method, model.EndpointKey()); err != nil {
|
||||
|
||||
@@ -58,14 +58,15 @@ func init() {
|
||||
RegisterEngineAlias(EngineVision, EngineInfo{
|
||||
RequestFormat: ApiFormatVision,
|
||||
ResponseFormat: ApiFormatVision,
|
||||
FileScheme: string(scheme.Data),
|
||||
FileScheme: scheme.Data,
|
||||
DefaultResolution: DefaultResolution,
|
||||
})
|
||||
|
||||
RegisterEngineAlias(openai.EngineName, EngineInfo{
|
||||
Uri: "https://api.openai.com/v1/responses",
|
||||
RequestFormat: ApiFormatOpenAI,
|
||||
ResponseFormat: ApiFormatOpenAI,
|
||||
FileScheme: string(scheme.Data),
|
||||
FileScheme: scheme.Data,
|
||||
DefaultResolution: openai.DefaultResolution,
|
||||
})
|
||||
}
|
||||
@@ -79,6 +80,7 @@ func RegisterEngine(format ApiFormat, engine Engine) {
|
||||
|
||||
// EngineInfo describes metadata that can be associated with an engine alias.
|
||||
type EngineInfo struct {
|
||||
Uri string
|
||||
RequestFormat ApiFormat
|
||||
ResponseFormat ApiFormat
|
||||
FileScheme string
|
||||
|
||||
@@ -28,7 +28,7 @@ func init() {
|
||||
RegisterEngineAlias(ollama.EngineName, EngineInfo{
|
||||
RequestFormat: ApiFormatOllama,
|
||||
ResponseFormat: ApiFormatOllama,
|
||||
FileScheme: string(scheme.Base64),
|
||||
FileScheme: scheme.Base64,
|
||||
DefaultResolution: ollama.DefaultResolution,
|
||||
})
|
||||
|
||||
@@ -72,7 +72,7 @@ func (ollamaDefaults) SchemaTemplate(model *Model) string {
|
||||
|
||||
switch model.Type {
|
||||
case ModelTypeLabels:
|
||||
return ollama.LabelsSchema(model.PromptContains("nsfw"))
|
||||
return ollama.SchemaLabels(model.PromptContains("nsfw"))
|
||||
}
|
||||
|
||||
return ""
|
||||
@@ -134,64 +134,93 @@ func (ollamaParser) Parse(ctx context.Context, req *ApiRequest, raw []byte, stat
|
||||
return nil, err
|
||||
}
|
||||
|
||||
result := &ApiResponse{
|
||||
response := &ApiResponse{
|
||||
Id: req.GetId(),
|
||||
Code: status,
|
||||
Model: &Model{Name: ollamaResp.Model},
|
||||
Result: ApiResult{
|
||||
Labels: append([]LabelResult{}, ollamaResp.Result.Labels...),
|
||||
Caption: func() *CaptionResult {
|
||||
if ollamaResp.Result.Caption != nil {
|
||||
copyCaption := *ollamaResp.Result.Caption
|
||||
return &copyCaption
|
||||
}
|
||||
return nil
|
||||
}(),
|
||||
Labels: convertOllamaLabels(ollamaResp.Result.Labels),
|
||||
Caption: convertOllamaCaption(ollamaResp.Result.Caption),
|
||||
},
|
||||
}
|
||||
|
||||
parsedLabels := len(result.Result.Labels) > 0
|
||||
parsedLabels := len(response.Result.Labels) > 0
|
||||
|
||||
if !parsedLabels && strings.TrimSpace(ollamaResp.Response) != "" && req.Format == FormatJSON {
|
||||
if labels, parseErr := parseOllamaLabels(ollamaResp.Response); parseErr != nil {
|
||||
log.Debugf("vision: %s (parse ollama labels)", clean.Error(parseErr))
|
||||
} else if len(labels) > 0 {
|
||||
result.Result.Labels = append(result.Result.Labels, labels...)
|
||||
response.Result.Labels = append(response.Result.Labels, labels...)
|
||||
parsedLabels = true
|
||||
}
|
||||
}
|
||||
|
||||
if parsedLabels {
|
||||
filtered := result.Result.Labels[:0]
|
||||
for i := range result.Result.Labels {
|
||||
if result.Result.Labels[i].Confidence <= 0 {
|
||||
result.Result.Labels[i].Confidence = ollama.LabelConfidenceDefault
|
||||
filtered := response.Result.Labels[:0]
|
||||
for i := range response.Result.Labels {
|
||||
if response.Result.Labels[i].Confidence <= 0 {
|
||||
response.Result.Labels[i].Confidence = ollama.LabelConfidenceDefault
|
||||
}
|
||||
|
||||
if result.Result.Labels[i].Topicality <= 0 {
|
||||
result.Result.Labels[i].Topicality = result.Result.Labels[i].Confidence
|
||||
if response.Result.Labels[i].Topicality <= 0 {
|
||||
response.Result.Labels[i].Topicality = response.Result.Labels[i].Confidence
|
||||
}
|
||||
|
||||
// Apply thresholds and canonicalize the name.
|
||||
normalizeLabelResult(&result.Result.Labels[i])
|
||||
normalizeLabelResult(&response.Result.Labels[i])
|
||||
|
||||
if result.Result.Labels[i].Name == "" {
|
||||
if response.Result.Labels[i].Name == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
if result.Result.Labels[i].Source == "" {
|
||||
result.Result.Labels[i].Source = entity.SrcOllama
|
||||
if response.Result.Labels[i].Source == "" {
|
||||
response.Result.Labels[i].Source = entity.SrcOllama
|
||||
}
|
||||
|
||||
filtered = append(filtered, result.Result.Labels[i])
|
||||
filtered = append(filtered, response.Result.Labels[i])
|
||||
}
|
||||
result.Result.Labels = filtered
|
||||
response.Result.Labels = filtered
|
||||
} else if caption := strings.TrimSpace(ollamaResp.Response); caption != "" {
|
||||
result.Result.Caption = &CaptionResult{
|
||||
response.Result.Caption = &CaptionResult{
|
||||
Text: caption,
|
||||
Source: entity.SrcOllama,
|
||||
}
|
||||
}
|
||||
|
||||
return result, nil
|
||||
return response, nil
|
||||
}
|
||||
|
||||
func convertOllamaLabels(payload []ollama.LabelPayload) []LabelResult {
|
||||
if len(payload) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
labels := make([]LabelResult, len(payload))
|
||||
|
||||
for i := range payload {
|
||||
labels[i] = LabelResult{
|
||||
Name: payload[i].Name,
|
||||
Source: payload[i].Source,
|
||||
Priority: payload[i].Priority,
|
||||
Confidence: payload[i].Confidence,
|
||||
Topicality: payload[i].Topicality,
|
||||
Categories: payload[i].Categories,
|
||||
NSFW: payload[i].NSFW,
|
||||
NSFWConfidence: payload[i].NSFWConfidence,
|
||||
}
|
||||
}
|
||||
|
||||
return labels
|
||||
}
|
||||
|
||||
func convertOllamaCaption(payload *ollama.CaptionPayload) *CaptionResult {
|
||||
if payload == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
return &CaptionResult{
|
||||
Text: payload.Text,
|
||||
Source: payload.Source,
|
||||
Confidence: payload.Confidence,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,9 +10,9 @@ import (
|
||||
|
||||
func TestOllamaDefaultConfidenceApplied(t *testing.T) {
|
||||
req := &ApiRequest{Format: FormatJSON}
|
||||
payload := ApiResponseOllama{
|
||||
Result: ApiResult{
|
||||
Labels: []LabelResult{{Name: "forest path", Confidence: 0, Topicality: 0}},
|
||||
payload := ollama.Response{
|
||||
Result: ollama.ResultPayload{
|
||||
Labels: []ollama.LabelPayload{{Name: "forest path", Confidence: 0, Topicality: 0}},
|
||||
},
|
||||
}
|
||||
raw, err := json.Marshal(payload)
|
||||
|
||||
@@ -1,18 +1,342 @@
|
||||
package vision
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/photoprism/photoprism/internal/ai/vision/openai"
|
||||
"github.com/photoprism/photoprism/internal/entity"
|
||||
"github.com/photoprism/photoprism/pkg/clean"
|
||||
"github.com/photoprism/photoprism/pkg/http/scheme"
|
||||
)
|
||||
|
||||
// init registers the OpenAI engine alias so models can set Engine: "openai"
|
||||
// and inherit sensible defaults (request/response formats, file scheme, and
|
||||
// preferred thumbnail resolution).
|
||||
// Stateless types that together implement the OpenAI engine.
type (
	// openaiDefaults supplies default prompts, schema templates, and request
	// options for models that use the OpenAI engine.
	openaiDefaults struct{}

	// openaiBuilder creates ApiRequest values for OpenAI's Responses API.
	openaiBuilder struct{}

	// openaiParser turns Responses API payloads into ApiResponse values.
	openaiParser struct{}
)
|
||||
|
||||
func init() {
|
||||
RegisterEngineAlias(openai.EngineName, EngineInfo{
|
||||
RequestFormat: ApiFormatOpenAI,
|
||||
ResponseFormat: ApiFormatOpenAI,
|
||||
FileScheme: string(scheme.Base64),
|
||||
DefaultResolution: openai.DefaultResolution,
|
||||
RegisterEngine(ApiFormatOpenAI, Engine{
|
||||
Builder: openaiBuilder{},
|
||||
Parser: openaiParser{},
|
||||
Defaults: openaiDefaults{},
|
||||
})
|
||||
}
|
||||
|
||||
// SystemPrompt returns the default OpenAI system prompt for the specified model type.
|
||||
func (openaiDefaults) SystemPrompt(model *Model) string {
|
||||
if model == nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
switch model.Type {
|
||||
case ModelTypeCaption:
|
||||
return openai.CaptionSystem
|
||||
case ModelTypeLabels:
|
||||
return openai.LabelSystem
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
// UserPrompt returns the default OpenAI user prompt for the specified model type.
|
||||
func (openaiDefaults) UserPrompt(model *Model) string {
|
||||
if model == nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
switch model.Type {
|
||||
case ModelTypeCaption:
|
||||
return openai.CaptionPrompt
|
||||
case ModelTypeLabels:
|
||||
if DetectNSFWLabels {
|
||||
return openai.LabelPromptNSFW
|
||||
}
|
||||
return openai.LabelPromptDefault
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
// SchemaTemplate returns the JSON schema template for the model, if applicable.
|
||||
func (openaiDefaults) SchemaTemplate(model *Model) string {
|
||||
if model == nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
switch model.Type {
|
||||
case ModelTypeLabels:
|
||||
return string(openai.SchemaLabels(model.PromptContains("nsfw")))
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
// Options returns default OpenAI request options for the model.
|
||||
func (openaiDefaults) Options(model *Model) *ApiRequestOptions {
|
||||
if model == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
switch model.Type {
|
||||
case ModelTypeCaption:
|
||||
/*
|
||||
Options:
|
||||
Detail: low
|
||||
MaxOutputTokens: 512
|
||||
Temperature: 0.1
|
||||
TopP: 0.9
|
||||
(Sampling values are zeroed for GPT-5 models in openaiBuilder.Build.)
|
||||
*/
|
||||
return &ApiRequestOptions{
|
||||
Detail: openai.DefaultDetail,
|
||||
MaxOutputTokens: openai.CaptionMaxTokens,
|
||||
Temperature: openai.DefaultTemperature,
|
||||
TopP: openai.DefaultTopP,
|
||||
}
|
||||
case ModelTypeLabels:
|
||||
/*
|
||||
Options:
|
||||
Detail: low
|
||||
MaxOutputTokens: 1024
|
||||
Temperature: 0.1
|
||||
ForceJson: true
|
||||
SchemaVersion: "photoprism_vision_labels_v1"
|
||||
(Sampling values are zeroed for GPT-5 models in openaiBuilder.Build.)
|
||||
*/
|
||||
return &ApiRequestOptions{
|
||||
Detail: openai.DefaultDetail,
|
||||
MaxOutputTokens: openai.LabelsMaxTokens,
|
||||
Temperature: openai.DefaultTemperature,
|
||||
TopP: openai.DefaultTopP,
|
||||
ForceJson: true,
|
||||
}
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// Build constructs an OpenAI request payload using base64-encoded thumbnails.
|
||||
func (openaiBuilder) Build(ctx context.Context, model *Model, files Files) (*ApiRequest, error) {
|
||||
if model == nil {
|
||||
return nil, ErrInvalidModel
|
||||
}
|
||||
|
||||
dataReq, err := NewApiRequestImages(files, scheme.Data)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
req := &ApiRequest{
|
||||
Id: dataReq.Id,
|
||||
Images: append(Files(nil), dataReq.Images...),
|
||||
ResponseFormat: ApiFormatOpenAI,
|
||||
}
|
||||
|
||||
if opts := model.GetOptions(); opts != nil {
|
||||
req.Options = cloneOptions(opts)
|
||||
if model.Type == ModelTypeCaption {
|
||||
// Captions default to plain text responses; structured JSON is optional.
|
||||
req.Options.ForceJson = false
|
||||
if req.Options.MaxOutputTokens < openai.CaptionMaxTokens {
|
||||
req.Options.MaxOutputTokens = openai.CaptionMaxTokens
|
||||
}
|
||||
} else if model.Type == ModelTypeLabels {
|
||||
if req.Options.MaxOutputTokens < openai.LabelsMaxTokens {
|
||||
req.Options.MaxOutputTokens = openai.LabelsMaxTokens
|
||||
}
|
||||
}
|
||||
|
||||
if strings.HasPrefix(strings.ToLower(strings.TrimSpace(model.Name)), "gpt-5") {
|
||||
req.Options.Temperature = 0
|
||||
req.Options.TopP = 0
|
||||
}
|
||||
}
|
||||
|
||||
if schema := strings.TrimSpace(model.SchemaTemplate()); schema != "" {
|
||||
if raw, parseErr := parseOpenAISchema(schema); parseErr != nil {
|
||||
log.Warnf("vision: failed to parse OpenAI schema template (%s)", clean.Error(parseErr))
|
||||
} else {
|
||||
req.Schema = raw
|
||||
}
|
||||
}
|
||||
|
||||
return req, nil
|
||||
}
|
||||
|
||||
// Parse converts an OpenAI Responses API payload into the internal ApiResponse representation.
|
||||
func (openaiParser) Parse(ctx context.Context, req *ApiRequest, raw []byte, status int) (*ApiResponse, error) {
|
||||
if status >= 300 {
|
||||
if msg := openai.ParseErrorMessage(raw); msg != "" {
|
||||
return nil, fmt.Errorf("openai: %s", msg)
|
||||
}
|
||||
return nil, fmt.Errorf("openai: status %d", status)
|
||||
}
|
||||
|
||||
var resp openai.Response
|
||||
if err := json.Unmarshal(raw, &resp); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if resp.Error != nil && resp.Error.Message != "" {
|
||||
return nil, errors.New(resp.Error.Message)
|
||||
}
|
||||
|
||||
result := ApiResult{}
|
||||
if jsonPayload := resp.FirstJSON(); len(jsonPayload) > 0 {
|
||||
if err := populateOpenAIJSONResult(&result, jsonPayload); err != nil {
|
||||
log.Debugf("vision: %s (parse openai json payload)", clean.Error(err))
|
||||
}
|
||||
}
|
||||
|
||||
if result.Caption == nil {
|
||||
if text := resp.FirstText(); text != "" {
|
||||
trimmed := strings.TrimSpace(text)
|
||||
var parsedJSON bool
|
||||
|
||||
if len(trimmed) > 0 && (trimmed[0] == '{' || trimmed[0] == '[') {
|
||||
if err := populateOpenAIJSONResult(&result, json.RawMessage(trimmed)); err != nil {
|
||||
log.Debugf("vision: %s (parse openai json text payload)", clean.Error(err))
|
||||
} else {
|
||||
parsedJSON = true
|
||||
}
|
||||
}
|
||||
|
||||
if !parsedJSON && trimmed != "" {
|
||||
result.Caption = &CaptionResult{
|
||||
Text: trimmed,
|
||||
Source: entity.SrcOpenAI,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var responseID string
|
||||
if req != nil {
|
||||
responseID = req.GetId()
|
||||
}
|
||||
|
||||
modelName := strings.TrimSpace(resp.Model)
|
||||
if modelName == "" && req != nil {
|
||||
modelName = strings.TrimSpace(req.Model)
|
||||
}
|
||||
|
||||
return &ApiResponse{
|
||||
Id: responseID,
|
||||
Code: status,
|
||||
Model: &Model{Name: modelName},
|
||||
Result: result,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// parseOpenAISchema validates the provided JSON schema and returns it as a raw message.
|
||||
func parseOpenAISchema(schema string) (json.RawMessage, error) {
|
||||
var raw json.RawMessage
|
||||
if err := json.Unmarshal([]byte(schema), &raw); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return normalizeOpenAISchema(raw)
|
||||
}
|
||||
|
||||
// normalizeOpenAISchema upgrades legacy label schema definitions so they comply with
|
||||
// OpenAI's json_schema format requirements.
|
||||
func normalizeOpenAISchema(raw json.RawMessage) (json.RawMessage, error) {
|
||||
if len(raw) == 0 {
|
||||
return raw, nil
|
||||
}
|
||||
|
||||
var doc map[string]any
|
||||
if err := json.Unmarshal(raw, &doc); err != nil {
|
||||
// Fallback to the original payload if it isn't a JSON object.
|
||||
return raw, nil
|
||||
}
|
||||
|
||||
if t, ok := doc["type"]; ok {
|
||||
if typeStr, ok := t.(string); ok && strings.TrimSpace(typeStr) != "" {
|
||||
return raw, nil
|
||||
}
|
||||
}
|
||||
|
||||
if _, ok := doc["properties"]; ok {
|
||||
return raw, nil
|
||||
}
|
||||
|
||||
labels, ok := doc["labels"]
|
||||
if !ok {
|
||||
return raw, nil
|
||||
}
|
||||
|
||||
nsfw := false
|
||||
|
||||
if items, ok := labels.([]any); ok && len(items) > 0 {
|
||||
if first, ok := items[0].(map[string]any); ok {
|
||||
if _, hasNSFW := first["nsfw"]; hasNSFW {
|
||||
nsfw = true
|
||||
}
|
||||
if _, hasNSFWConfidence := first["nsfw_confidence"]; hasNSFWConfidence {
|
||||
nsfw = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return openai.SchemaLabels(nsfw), nil
|
||||
}
|
||||
|
||||
// populateOpenAIJSONResult unmarshals a structured OpenAI response into ApiResult fields.
|
||||
func populateOpenAIJSONResult(result *ApiResult, payload json.RawMessage) error {
|
||||
if result == nil || len(payload) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
var envelope struct {
|
||||
Caption *struct {
|
||||
Text string `json:"text"`
|
||||
Confidence float32 `json:"confidence"`
|
||||
} `json:"caption"`
|
||||
Labels []LabelResult `json:"labels"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(payload, &envelope); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if envelope.Caption != nil {
|
||||
text := strings.TrimSpace(envelope.Caption.Text)
|
||||
if text != "" {
|
||||
result.Caption = &CaptionResult{
|
||||
Text: text,
|
||||
Confidence: envelope.Caption.Confidence,
|
||||
Source: entity.SrcOpenAI,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(envelope.Labels) > 0 {
|
||||
filtered := envelope.Labels[:0]
|
||||
|
||||
for i := range envelope.Labels {
|
||||
if envelope.Labels[i].Source == "" {
|
||||
envelope.Labels[i].Source = entity.SrcOpenAI
|
||||
}
|
||||
|
||||
normalizeLabelResult(&envelope.Labels[i])
|
||||
|
||||
if envelope.Labels[i].Name == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
filtered = append(filtered, envelope.Labels[i])
|
||||
}
|
||||
|
||||
result.Labels = append(result.Labels, filtered...)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
337
internal/ai/vision/engine_openai_test.go
Normal file
337
internal/ai/vision/engine_openai_test.go
Normal file
@@ -0,0 +1,337 @@
|
||||
package vision
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/photoprism/photoprism/internal/ai/vision/openai"
|
||||
"github.com/photoprism/photoprism/internal/ai/vision/schema"
|
||||
"github.com/photoprism/photoprism/internal/entity"
|
||||
)
|
||||
|
||||
func TestOpenAIBuilderBuild(t *testing.T) {
|
||||
model := &Model{
|
||||
Type: ModelTypeLabels,
|
||||
Name: openai.DefaultModel,
|
||||
Engine: openai.EngineName,
|
||||
}
|
||||
model.ApplyEngineDefaults()
|
||||
|
||||
request, err := openaiBuilder{}.Build(context.Background(), model, Files{examplesPath + "/chameleon_lime.jpg"})
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, request)
|
||||
|
||||
assert.Equal(t, ApiFormatOpenAI, request.ResponseFormat)
|
||||
assert.NotEmpty(t, request.Images)
|
||||
assert.NotNil(t, request.Options)
|
||||
assert.Equal(t, openai.DefaultDetail, request.Options.Detail)
|
||||
assert.True(t, request.Options.ForceJson)
|
||||
assert.GreaterOrEqual(t, request.Options.MaxOutputTokens, openai.LabelsMaxTokens)
|
||||
}
|
||||
|
||||
func TestOpenAIBuilderBuildCaptionDisablesForceJSON(t *testing.T) {
|
||||
model := &Model{
|
||||
Type: ModelTypeCaption,
|
||||
Name: openai.DefaultModel,
|
||||
Engine: openai.EngineName,
|
||||
Options: &ApiRequestOptions{ForceJson: true},
|
||||
}
|
||||
model.ApplyEngineDefaults()
|
||||
|
||||
request, err := openaiBuilder{}.Build(context.Background(), model, Files{examplesPath + "/chameleon_lime.jpg"})
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, request)
|
||||
require.NotNil(t, request.Options)
|
||||
assert.False(t, request.Options.ForceJson)
|
||||
assert.GreaterOrEqual(t, request.Options.MaxOutputTokens, openai.CaptionMaxTokens)
|
||||
}
|
||||
|
||||
func TestApiRequestJSONForOpenAI(t *testing.T) {
|
||||
req := &ApiRequest{
|
||||
Model: "gpt-5-mini",
|
||||
System: "system",
|
||||
Prompt: "describe the scene",
|
||||
Images: []string{"data:image/jpeg;base64,AA=="},
|
||||
ResponseFormat: ApiFormatOpenAI,
|
||||
Options: &ApiRequestOptions{
|
||||
Detail: openai.DefaultDetail,
|
||||
MaxOutputTokens: 128,
|
||||
Temperature: 0.2,
|
||||
TopP: 0.8,
|
||||
ForceJson: true,
|
||||
},
|
||||
Schema: json.RawMessage(`{"type":"object","properties":{"caption":{"type":"object"}}}`),
|
||||
}
|
||||
|
||||
payload, err := req.JSON()
|
||||
require.NoError(t, err)
|
||||
|
||||
var decoded struct {
|
||||
Model string `json:"model"`
|
||||
Input []struct {
|
||||
Role string `json:"role"`
|
||||
Content []struct {
|
||||
Type string `json:"type"`
|
||||
} `json:"content"`
|
||||
} `json:"input"`
|
||||
Text struct {
|
||||
Format struct {
|
||||
Type string `json:"type"`
|
||||
Name string `json:"name"`
|
||||
Schema json.RawMessage `json:"schema"`
|
||||
Strict bool `json:"strict"`
|
||||
} `json:"format"`
|
||||
} `json:"text"`
|
||||
Reasoning struct {
|
||||
Effort string `json:"effort"`
|
||||
} `json:"reasoning"`
|
||||
MaxOutputTokens int `json:"max_output_tokens"`
|
||||
}
|
||||
|
||||
require.NoError(t, json.Unmarshal(payload, &decoded))
|
||||
assert.Equal(t, "gpt-5-mini", decoded.Model)
|
||||
require.Len(t, decoded.Input, 2)
|
||||
assert.Equal(t, "system", decoded.Input[0].Role)
|
||||
assert.Equal(t, openai.ResponseFormatJSONSchema, decoded.Text.Format.Type)
|
||||
assert.Equal(t, schema.JsonSchemaName(decoded.Text.Format.Schema, openai.DefaultSchemaVersion), decoded.Text.Format.Name)
|
||||
assert.False(t, decoded.Text.Format.Strict)
|
||||
assert.NotNil(t, decoded.Text.Format.Schema)
|
||||
assert.Equal(t, "low", decoded.Reasoning.Effort)
|
||||
assert.Equal(t, 128, decoded.MaxOutputTokens)
|
||||
}
|
||||
|
||||
func TestApiRequestJSONForOpenAIDefaultSchemaName(t *testing.T) {
|
||||
req := &ApiRequest{
|
||||
Model: "gpt-5-mini",
|
||||
Images: []string{"data:image/jpeg;base64,AA=="},
|
||||
ResponseFormat: ApiFormatOpenAI,
|
||||
Options: &ApiRequestOptions{
|
||||
Detail: openai.DefaultDetail,
|
||||
MaxOutputTokens: 64,
|
||||
ForceJson: true,
|
||||
},
|
||||
Schema: json.RawMessage(`{"type":"object"}`),
|
||||
}
|
||||
|
||||
payload, err := req.JSON()
|
||||
require.NoError(t, err)
|
||||
|
||||
var decoded struct {
|
||||
Text struct {
|
||||
Format struct {
|
||||
Name string `json:"name"`
|
||||
} `json:"format"`
|
||||
} `json:"text"`
|
||||
}
|
||||
|
||||
require.NoError(t, json.Unmarshal(payload, &decoded))
|
||||
assert.Equal(t, schema.JsonSchemaName(req.Schema, openai.DefaultSchemaVersion), decoded.Text.Format.Name)
|
||||
}
|
||||
|
||||
func TestOpenAIParserParsesJSONFromTextPayload(t *testing.T) {
|
||||
respPayload := `{
|
||||
"id": "resp_123",
|
||||
"model": "gpt-5-mini",
|
||||
"output": [{
|
||||
"role": "assistant",
|
||||
"content": [{
|
||||
"type": "output_text",
|
||||
"text": "{\"labels\":[{\"name\":\"deer\",\"confidence\":0.98,\"topicality\":0.99}]}"
|
||||
}]
|
||||
}]
|
||||
}`
|
||||
|
||||
req := &ApiRequest{
|
||||
Id: "test",
|
||||
Model: "gpt-5-mini",
|
||||
ResponseFormat: ApiFormatOpenAI,
|
||||
}
|
||||
|
||||
resp, err := openaiParser{}.Parse(context.Background(), req, []byte(respPayload), http.StatusOK)
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, resp)
|
||||
require.Len(t, resp.Result.Labels, 1)
|
||||
assert.Equal(t, "Deer", resp.Result.Labels[0].Name)
|
||||
assert.Nil(t, resp.Result.Caption)
|
||||
}
|
||||
|
||||
func TestParseOpenAISchemaLegacyUpgrade(t *testing.T) {
|
||||
legacy := `{
|
||||
"labels": [{
|
||||
"name": "",
|
||||
"confidence": 0,
|
||||
"topicality": 0
|
||||
}]
|
||||
}`
|
||||
|
||||
raw, err := parseOpenAISchema(legacy)
|
||||
require.NoError(t, err)
|
||||
|
||||
var decoded map[string]any
|
||||
require.NoError(t, json.Unmarshal(raw, &decoded))
|
||||
|
||||
assert.Equal(t, "object", decoded["type"])
|
||||
|
||||
props, ok := decoded["properties"].(map[string]any)
|
||||
require.True(t, ok)
|
||||
labels, ok := props["labels"].(map[string]any)
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, "array", labels["type"])
|
||||
}
|
||||
|
||||
func TestParseOpenAISchemaLegacyUpgradeNSFW(t *testing.T) {
|
||||
legacy := `{
|
||||
"labels": [{
|
||||
"name": "",
|
||||
"confidence": 0,
|
||||
"topicality": 0,
|
||||
"nsfw": false,
|
||||
"nsfw_confidence": 0
|
||||
}]
|
||||
}`
|
||||
|
||||
raw, err := parseOpenAISchema(legacy)
|
||||
require.NoError(t, err)
|
||||
|
||||
var decoded map[string]any
|
||||
require.NoError(t, json.Unmarshal(raw, &decoded))
|
||||
|
||||
props := decoded["properties"].(map[string]any)
|
||||
labels := props["labels"].(map[string]any)
|
||||
items := labels["items"].(map[string]any)
|
||||
_, hasNSFW := items["properties"].(map[string]any)["nsfw"]
|
||||
assert.True(t, hasNSFW)
|
||||
}
|
||||
|
||||
func TestPerformApiRequestOpenAISuccess(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
var reqPayload struct {
|
||||
Model string `json:"model"`
|
||||
}
|
||||
assert.NoError(t, json.NewDecoder(r.Body).Decode(&reqPayload))
|
||||
assert.Equal(t, "gpt-5-mini", reqPayload.Model)
|
||||
|
||||
response := map[string]any{
|
||||
"id": "resp_123",
|
||||
"model": "gpt-5-mini",
|
||||
"output": []any{
|
||||
map[string]any{
|
||||
"role": "assistant",
|
||||
"content": []any{
|
||||
map[string]any{
|
||||
"type": "output_json",
|
||||
"json": map[string]any{
|
||||
"caption": map[string]any{
|
||||
"text": "A cat rests on a windowsill.",
|
||||
"confidence": 0.91,
|
||||
},
|
||||
"labels": []map[string]any{
|
||||
{
|
||||
"name": "cat",
|
||||
"confidence": 0.92,
|
||||
"topicality": 0.88,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
assert.NoError(t, json.NewEncoder(w).Encode(response))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
req := &ApiRequest{
|
||||
Id: "test",
|
||||
Model: "gpt-5-mini",
|
||||
Images: []string{"data:image/jpeg;base64,AA=="},
|
||||
ResponseFormat: ApiFormatOpenAI,
|
||||
Options: &ApiRequestOptions{
|
||||
Detail: openai.DefaultDetail,
|
||||
},
|
||||
Schema: json.RawMessage(`{"type":"object"}`),
|
||||
}
|
||||
|
||||
resp, err := PerformApiRequest(req, server.URL, http.MethodPost, "secret")
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, resp)
|
||||
|
||||
require.NotNil(t, resp.Result.Caption)
|
||||
assert.Equal(t, entity.SrcOpenAI, resp.Result.Caption.Source)
|
||||
assert.Equal(t, "A cat rests on a windowsill.", resp.Result.Caption.Text)
|
||||
|
||||
require.Len(t, resp.Result.Labels, 1)
|
||||
assert.Equal(t, entity.SrcOpenAI, resp.Result.Labels[0].Source)
|
||||
assert.Equal(t, "Cat", resp.Result.Labels[0].Name)
|
||||
}
|
||||
|
||||
func TestPerformApiRequestOpenAITextFallback(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
response := map[string]any{
|
||||
"id": "resp_456",
|
||||
"model": "gpt-5-mini",
|
||||
"output": []any{
|
||||
map[string]any{
|
||||
"role": "assistant",
|
||||
"content": []any{
|
||||
map[string]any{
|
||||
"type": "output_text",
|
||||
"text": "Two hikers reach the summit at sunset.",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
assert.NoError(t, json.NewEncoder(w).Encode(response))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
req := &ApiRequest{
|
||||
Id: "fallback",
|
||||
Model: "gpt-5-mini",
|
||||
Images: []string{"data:image/jpeg;base64,AA=="},
|
||||
ResponseFormat: ApiFormatOpenAI,
|
||||
Options: &ApiRequestOptions{
|
||||
Detail: openai.DefaultDetail,
|
||||
},
|
||||
Schema: nil,
|
||||
}
|
||||
|
||||
resp, err := PerformApiRequest(req, server.URL, http.MethodPost, "")
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, resp.Result.Caption)
|
||||
assert.Equal(t, "Two hikers reach the summit at sunset.", resp.Result.Caption.Text)
|
||||
assert.Equal(t, entity.SrcOpenAI, resp.Result.Caption.Source)
|
||||
}
|
||||
|
||||
func TestPerformApiRequestOpenAIError(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusBadRequest)
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"error": map[string]any{
|
||||
"message": "Invalid image payload",
|
||||
},
|
||||
})
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
req := &ApiRequest{
|
||||
Id: "error",
|
||||
Model: "gpt-5-mini",
|
||||
ResponseFormat: ApiFormatOpenAI,
|
||||
Schema: nil,
|
||||
Images: []string{"data:image/jpeg;base64,AA=="},
|
||||
}
|
||||
|
||||
_, err := PerformApiRequest(req, server.URL, http.MethodPost, "")
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "Invalid image payload")
|
||||
}
|
||||
@@ -96,8 +96,10 @@ func labelsInternal(images Files, mediaSrc media.Src, labelSrc entity.Src) (resu
|
||||
apiRequest.Prompt = prompt
|
||||
}
|
||||
|
||||
if options := model.GetOptions(); options != nil {
|
||||
apiRequest.Options = options
|
||||
if apiRequest.Options == nil {
|
||||
if options := model.GetOptions(); options != nil {
|
||||
apiRequest.Options = options
|
||||
}
|
||||
}
|
||||
|
||||
apiRequest.WriteLog()
|
||||
|
||||
@@ -348,6 +348,26 @@ func mergeOptionDefaults(target, defaults *ApiRequestOptions) {
|
||||
if len(target.Stop) == 0 && len(defaults.Stop) > 0 {
|
||||
target.Stop = append([]string(nil), defaults.Stop...)
|
||||
}
|
||||
|
||||
if target.MaxOutputTokens <= 0 && defaults.MaxOutputTokens > 0 {
|
||||
target.MaxOutputTokens = defaults.MaxOutputTokens
|
||||
}
|
||||
|
||||
if strings.TrimSpace(target.Detail) == "" && strings.TrimSpace(defaults.Detail) != "" {
|
||||
target.Detail = strings.TrimSpace(defaults.Detail)
|
||||
}
|
||||
|
||||
if !target.ForceJson && defaults.ForceJson {
|
||||
target.ForceJson = true
|
||||
}
|
||||
|
||||
if target.SchemaVersion == "" && defaults.SchemaVersion != "" {
|
||||
target.SchemaVersion = defaults.SchemaVersion
|
||||
}
|
||||
|
||||
if target.CombineOutputs == "" && defaults.CombineOutputs != "" {
|
||||
target.CombineOutputs = defaults.CombineOutputs
|
||||
}
|
||||
}
|
||||
|
||||
func normalizeOptions(opts *ApiRequestOptions) {
|
||||
@@ -422,6 +442,10 @@ func (m *Model) ApplyEngineDefaults() {
|
||||
}
|
||||
|
||||
if info, ok := EngineInfoFor(engine); ok {
|
||||
if m.Service.Uri == "" {
|
||||
m.Service.Uri = info.Uri
|
||||
}
|
||||
|
||||
if m.Service.RequestFormat == "" {
|
||||
m.Service.RequestFormat = info.RequestFormat
|
||||
}
|
||||
@@ -490,7 +514,7 @@ func (m *Model) SchemaTemplate() string {
|
||||
}
|
||||
|
||||
if m.schema == "" {
|
||||
m.schema = visionschema.Labels(m.PromptContains("nsfw"))
|
||||
m.schema = visionschema.LabelsJson(m.PromptContains("nsfw"))
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
package ollama
|
||||
|
||||
import "github.com/photoprism/photoprism/internal/ai/vision/schema"
|
||||
|
||||
const (
|
||||
// CaptionPrompt instructs Ollama caption models to emit a single, active-voice sentence.
|
||||
CaptionPrompt = "Create a caption with exactly one sentence in the active voice that describes the main visual content. Begin with the main subject and clear action. Avoid text formatting, meta-language, and filler words."
|
||||
@@ -22,12 +20,3 @@ const (
|
||||
// DefaultResolution is the default thumbnail size submitted to Ollama models.
|
||||
DefaultResolution = 720
|
||||
)
|
||||
|
||||
// LabelsSchema returns the canonical label schema string consumed by Ollama models.
|
||||
func LabelsSchema(nsfw bool) string {
|
||||
if nsfw {
|
||||
return schema.LabelsNSFW
|
||||
} else {
|
||||
return schema.LabelsDefault
|
||||
}
|
||||
}
|
||||
|
||||
14
internal/ai/vision/ollama/schema.go
Normal file
14
internal/ai/vision/ollama/schema.go
Normal file
@@ -0,0 +1,14 @@
|
||||
package ollama
|
||||
|
||||
import (
|
||||
"github.com/photoprism/photoprism/internal/ai/vision/schema"
|
||||
)
|
||||
|
||||
// SchemaLabels returns the canonical label schema string consumed by Ollama models.
|
||||
//
|
||||
// Related documentation and references:
|
||||
// - https://www.alibabacloud.com/help/en/model-studio/json-mode
|
||||
// - https://www.json.org/json-en.html
|
||||
func SchemaLabels(nsfw bool) string {
|
||||
return schema.LabelsJson(nsfw)
|
||||
}
|
||||
79
internal/ai/vision/ollama/transport.go
Normal file
79
internal/ai/vision/ollama/transport.go
Normal file
@@ -0,0 +1,79 @@
|
||||
package ollama
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Response encapsulates the subset of the Ollama generate API response we care about.
//
// NOTE(review): LoadDuration and PromptEvalDuration are declared as int while
// TotalDuration and EvalDuration are int64. If all four carry the same unit,
// the narrower type may overflow on 32-bit builds; confirm before relying on them.
type Response struct {
	ID                 string        `yaml:"Id,omitempty" json:"id,omitempty"`
	Code               int           `yaml:"Code,omitempty" json:"code,omitempty"`
	Error              string        `yaml:"Error,omitempty" json:"error,omitempty"`
	Model              string        `yaml:"Model,omitempty" json:"model,omitempty"`
	CreatedAt          time.Time     `yaml:"CreatedAt,omitempty" json:"created_at,omitempty"`
	Response           string        `yaml:"Response,omitempty" json:"response,omitempty"`
	Done               bool          `yaml:"Done,omitempty" json:"done,omitempty"`
	Context            []int         `yaml:"Context,omitempty" json:"context,omitempty"`
	TotalDuration      int64         `yaml:"TotalDuration,omitempty" json:"total_duration,omitempty"`
	LoadDuration       int           `yaml:"LoadDuration,omitempty" json:"load_duration,omitempty"`
	PromptEvalCount    int           `yaml:"PromptEvalCount,omitempty" json:"prompt_eval_count,omitempty"`
	PromptEvalDuration int           `yaml:"PromptEvalDuration,omitempty" json:"prompt_eval_duration,omitempty"`
	EvalCount          int           `yaml:"EvalCount,omitempty" json:"eval_count,omitempty"`
	EvalDuration       int64         `yaml:"EvalDuration,omitempty" json:"eval_duration,omitempty"`
	Result             ResultPayload `yaml:"Result,omitempty" json:"result,omitempty"`
}

// Err returns an error if the request has failed.
//
// A nil receiver yields "response is nil". A Code of 400 or above yields the
// Error message when set, otherwise "error <code>"; the status check takes
// precedence over any result data. A response without labels and without a
// caption yields "no result". In all other cases nil is returned.
func (r *Response) Err() error {
	if r == nil {
		return errors.New("response is nil")
	}

	if r.Code >= 400 {
		if r.Error != "" {
			return errors.New(r.Error)
		}

		return fmt.Errorf("error %d", r.Code)
	}

	// Reuse HasResult so both methods always agree on what counts as a result.
	if !r.HasResult() {
		return errors.New("no result")
	}

	return nil
}

// HasResult checks if there is at least one result in the response data,
// i.e. at least one label or a non-nil caption. A nil receiver reports false.
func (r *Response) HasResult() bool {
	if r == nil {
		return false
	}

	return len(r.Result.Labels) > 0 || r.Result.Caption != nil
}

// ResultPayload mirrors the structure returned by Ollama for result data.
type ResultPayload struct {
	Labels  []LabelPayload  `json:"labels"`
	Caption *CaptionPayload `json:"caption,omitempty"`
}

// LabelPayload represents a single label object emitted by the Ollama adapter.
type LabelPayload struct {
	Name           string   `json:"name"`
	Source         string   `json:"source,omitempty"`
	Priority       int      `json:"priority,omitempty"`
	Confidence     float32  `json:"confidence,omitempty"`
	Topicality     float32  `json:"topicality,omitempty"`
	Categories     []string `json:"categories,omitempty"`
	NSFW           bool     `json:"nsfw,omitempty"`
	NSFWConfidence float32  `json:"nsfw_confidence,omitempty"`
}

// CaptionPayload represents the caption object emitted by the Ollama adapter.
type CaptionPayload struct {
	Text       string  `json:"text"`
	Source     string  `json:"source,omitempty"`
	Confidence float32 `json:"confidence,omitempty"`
}
|
||||
128
internal/ai/vision/openai/README.md
Normal file
128
internal/ai/vision/openai/README.md
Normal file
@@ -0,0 +1,128 @@
|
||||
## PhotoPrism — OpenAI API Integration
|
||||
|
||||
**Last Updated:** November 14, 2025
|
||||
|
||||
### Overview
|
||||
|
||||
This package contains PhotoPrism’s adapter for the OpenAI Responses API. It enables existing caption and label workflows (`GenerateCaption`, `GenerateLabels`, and the `photoprism vision run` CLI) to call OpenAI models alongside TensorFlow and Ollama without changing worker or API code. The implementation focuses on predictable results, structured outputs, and clear observability so operators can opt in gradually.
|
||||
|
||||
#### Context & Constraints
|
||||
|
||||
- OpenAI requests flow through the existing vision client (`internal/ai/vision/api_client.go`) and must honour PhotoPrism’s timeout, logging, and ACL rules.
|
||||
- Structured outputs are preferred but the adapter must gracefully handle free-form text; `output_text` responses are parsed both as JSON and as plain captions.
|
||||
- Costs should remain predictable: requests are limited to a single 720 px thumbnail (`detail=low`) and capped token budgets (512 caption, 1024 labels).
|
||||
- Secrets are supplied per model (`Service.Key`) with fallbacks to `OPENAI_API_KEY` / `_FILE`. Logs must redact sensitive data.
|
||||
|
||||
#### Goals
|
||||
|
||||
- Provide drop-in OpenAI support for captions and labels using `vision.yml`.
|
||||
- Keep configuration ergonomic by auto-populating prompts, schema names, token limits, and sampling defaults.
|
||||
- Expose enough logging and tests so operators can compare OpenAI output with existing engines before enabling it broadly.
|
||||
|
||||
#### Non-Goals
|
||||
|
||||
- Introducing a new `generate` model type or combined caption/label endpoint (reserved for a later phase).
|
||||
- Replacing the default TensorFlow models; they remain active as fallbacks.
|
||||
- Managing OpenAI billing or quota dashboards beyond surfacing token counts in logs and metrics.
|
||||
|
||||
### Prompt, Model, & Schema Guidance
|
||||
|
||||
- **Models:** The adapter targets GPT‑5 vision tiers (e.g. `gpt-5-nano`, `gpt-5-mini`). These models support image inputs, structured outputs, and deterministic settings. Set `Name` to the exact provider identifier so defaults are applied correctly. Caption models share the same configuration surface and run through the same adapter.
|
||||
- **Prompts:** Defaults live in `defaults.go`. Captions use a single-sentence instruction; labels use `LabelPromptDefault` (or `LabelPromptNSFW` when PhotoPrism requests NSFW metadata). Custom prompts should retain schema reminders so structured outputs stay valid.
|
||||
- **Schemas:** Labels use the JSON schema returned by `schema.LabelsJsonSchema(nsfw)`; the response format name is derived via `schema.JsonSchemaName` (e.g. `photoprism_vision_labels_v1`). Captions omit schemas unless operators explicitly request a structured format.
|
||||
- **When to keep defaults:** For most deployments, leaving `System`, `Prompt`, `Schema`, and `Options` unset yields stable output with minimal configuration. Override them only when domain-specific language or custom scoring is necessary, and add regression tests alongside.
|
||||
|
||||
Budget-conscious operators can experiment with lighter prompts or lower-resolution thumbnails, but should keep token limits and determinism settings intact to avoid unexpected bills and UI churn.
|
||||
|
||||
#### Performance & Cost Estimates
|
||||
|
||||
- **Token budgets:** Captions request up to 512 output tokens; labels request up to 1024. Input tokens are typically ≤700 for a single 720 px thumbnail plus prompts.
|
||||
- **Latency:** GPT‑5 nano/mini vision calls typically complete in 3–8 s, depending on OpenAI region. Including reasoning metadata (`reasoning.effort=low`) has negligible impact but improves traceability.
|
||||
- **Costs:** Consult OpenAI’s pricing for the selected model. Multiply input/output tokens by the published rate. PhotoPrism currently sends one image per request to keep costs linear with photo count.
|
||||
|
||||
#### Defaults
|
||||
|
||||
- File scheme: `data:` URLs (base64) for all OpenAI models.
|
||||
- Resolution: 720 px thumbnails (`vision.Thumb(ModelTypeCaption|Labels)`).
|
||||
- Options: `MaxOutputTokens` raised to 512 (caption) / 1024 (labels); `ForceJson=false` for captions, `true` for labels; `reasoning.effort="low"`.
|
||||
- Sampling: `Temperature` and `TopP` set to `0` for `gpt-5*` models; inherited values (0.1/0.9) remain for other engines. `openaiBuilder.Build` performs this override while preserving the struct defaults for non-OpenAI adapters.
|
||||
- Schema naming: Automatically derived via `schema.JsonSchemaName`, so operators may omit `SchemaVersion`.
|
||||
|
||||
### Configuration
|
||||
|
||||
#### Environment Variables
|
||||
|
||||
- `OPENAI_API_KEY` / `OPENAI_API_KEY_FILE` — fallback credentials when a model’s `Service.Key` is unset.
|
||||
- Existing `PHOTOPRISM_VISION_*` variables remain authoritative (see the [Developer Guide](https://docs.photoprism.app/developer-guide/vision/service/) for full lists).
|
||||
|
||||
#### `vision.yml` Examples
|
||||
|
||||
```yaml
|
||||
Models:
|
||||
- Type: caption
|
||||
Name: gpt-5-nano
|
||||
Engine: openai
|
||||
Disabled: false # opt in manually
|
||||
Resolution: 720 # optional; default is 720
|
||||
Options:
|
||||
Detail: low # optional; defaults to low
|
||||
MaxOutputTokens: 512
|
||||
Service:
|
||||
Uri: https://api.openai.com/v1/responses
|
||||
FileScheme: data
|
||||
Key: ${OPENAI_API_KEY}
|
||||
|
||||
- Type: labels
|
||||
Name: gpt-5-mini
|
||||
Engine: openai
|
||||
Disabled: false
|
||||
Resolution: 720
|
||||
Options:
|
||||
Detail: low
|
||||
MaxOutputTokens: 1024
|
||||
ForceJson: true # redundant but explicit
|
||||
Service:
|
||||
Uri: https://api.openai.com/v1/responses
|
||||
FileScheme: data
|
||||
Key: ${OPENAI_API_KEY}
|
||||
```
|
||||
|
||||
Keep TensorFlow entries in place so PhotoPrism falls back when the external service is unavailable.
|
||||
|
||||
### Documentation
|
||||
|
||||
- Label Generation: <https://docs.photoprism.app/developer-guide/vision/label-generation/>
|
||||
- Caption Generation: <https://docs.photoprism.app/developer-guide/vision/caption-generation/>
|
||||
- Vision CLI Commands: <https://docs.photoprism.app/developer-guide/vision/cli/>
|
||||
|
||||
### Implementation Details
|
||||
|
||||
#### Core Concepts
|
||||
|
||||
- **Structured outputs:** PhotoPrism leverages OpenAI’s structured output capability as documented at <https://platform.openai.com/docs/guides/structured-outputs>. When a JSON schema is supplied, the adapter emits `text.format` with `type: "json_schema"` and a schema name derived from the content. The parser then prefers `output_json`, but also attempts to decode `output_text` payloads that contain JSON objects.
|
||||
- **Deterministic sampling:** GPT‑5 models are run with `temperature=0` and `top_p=0` to minimise variance, while still allowing developers to override values in `vision.yml` if needed.
|
||||
- **Reasoning metadata:** Requests include `reasoning.effort="low"` so OpenAI returns structured reasoning usage counters, helping operators track token consumption.
|
||||
- **Worker summaries:** The vision worker now logs either “updated …” or “processed … (no metadata changes detected)”, making reruns easy to audit.
|
||||
|
||||
#### Rate Limiting
|
||||
|
||||
OpenAI calls respect the existing `limiter.Auth` configuration used by the vision service. Failed requests surface standard HTTP errors and are not automatically retried; operators should ensure they have adequate account limits and consider external rate limiting when sharing credentials.
|
||||
|
||||
#### Testing & Validation
|
||||
|
||||
1. Unit tests: `go test ./internal/ai/vision/openai ./internal/ai/vision -run OpenAI -count=1`. Fixtures under `internal/ai/vision/openai/testdata/` replay real Responses payloads (captions and labels).
|
||||
2. CLI smoke test: `photoprism vision run -m labels --count 1 --force --model=gpt-5-mini` with trace logging enabled to inspect sanitised Responses.
|
||||
3. Compare worker summaries and label sources (`openai`) in the UI or via `photoprism vision ls`.
|
||||
|
||||
#### Code Map
|
||||
|
||||
- **Adapter & defaults:** `internal/ai/vision/openai` (defaults, schema helpers, transport, tests).
|
||||
- **Request/response plumbing:** `internal/ai/vision/api_request.go`, `api_client.go`, `engine_openai.go`, `engine_openai_test.go`.
|
||||
- **Workers & CLI:** `internal/workers/vision.go`, `internal/commands/vision_run.go`.
|
||||
- **Shared utilities:** `internal/ai/vision/schema`, `pkg/clean`, `pkg/media`.
|
||||
|
||||
#### Next Steps
|
||||
|
||||
- [ ] Introduce the future `generate` model type that combines captions, labels, and optional markers.
|
||||
- [ ] Evaluate additional OpenAI models as pricing and capabilities evolve.
|
||||
- [ ] Expose token usage metrics (input/output/reasoning) via Prometheus once the schema stabilises.
|
||||
@@ -1,6 +1,29 @@
|
||||
package openai
|
||||
|
||||
import "github.com/photoprism/photoprism/internal/ai/vision/schema"
|
||||
const (
|
||||
// CaptionSystem defines the default system prompt for caption models.
|
||||
CaptionSystem = "You are a PhotoPrism vision model. Return concise, user-friendly captions that describe the main subjects accurately."
|
||||
// CaptionPrompt instructs caption models to respond with a single sentence.
|
||||
CaptionPrompt = "Provide exactly one sentence describing the key subject and action in the image. Avoid filler words and technical jargon."
|
||||
// LabelSystem defines the system prompt for label generation.
|
||||
LabelSystem = "You are a PhotoPrism vision model. Emit JSON that matches the provided schema and keep label names short, singular nouns."
|
||||
// LabelPromptDefault requests general-purpose labels.
|
||||
LabelPromptDefault = "Analyze the image and return label objects with name, confidence (0-1), and topicality (0-1)."
|
||||
// LabelPromptNSFW requests labels including NSFW metadata when required.
|
||||
LabelPromptNSFW = "Analyze the image and return label objects with name, confidence (0-1), topicality (0-1), nsfw (true when sensitive), and nsfw_confidence (0-1)."
|
||||
// DefaultDetail specifies the preferred thumbnail detail level for Responses API calls.
|
||||
DefaultDetail = "low"
|
||||
// CaptionMaxTokens suggests the output budget for caption responses.
|
||||
CaptionMaxTokens = 512
|
||||
// LabelsMaxTokens suggests the output budget for label responses.
|
||||
LabelsMaxTokens = 1024
|
||||
// DefaultTemperature keeps replies close to deterministic.
|
||||
DefaultTemperature = 0.1
|
||||
// DefaultTopP limits nucleus sampling.
|
||||
DefaultTopP = 0.9
|
||||
// DefaultSchemaVersion is used when callers do not specify an explicit schema version.
|
||||
DefaultSchemaVersion = "v1"
|
||||
)
|
||||
|
||||
var (
|
||||
// DefaultModel is the model used by default when accessing the OpenAI API.
|
||||
@@ -8,8 +31,3 @@ var (
|
||||
// DefaultResolution is the default thumbnail size submitted to OpenAI.
|
||||
DefaultResolution = 720
|
||||
)
|
||||
|
||||
// LabelsSchema returns the canonical label schema string consumed by OpenAI models.
|
||||
func LabelsSchema() string {
|
||||
return schema.LabelsDefault
|
||||
}
|
||||
|
||||
16
internal/ai/vision/openai/schema.go
Normal file
16
internal/ai/vision/openai/schema.go
Normal file
@@ -0,0 +1,16 @@
|
||||
package openai
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
|
||||
"github.com/photoprism/photoprism/internal/ai/vision/schema"
|
||||
)
|
||||
|
||||
// SchemaLabels returns the canonical labels JSON Schema consumed by OpenAI models.
|
||||
//
|
||||
// Related documentation and references:
|
||||
// - https://platform.openai.com/docs/guides/structured-outputs
|
||||
// - https://json-schema.org/learn/miscellaneous-examples
|
||||
func SchemaLabels(nsfw bool) json.RawMessage {
|
||||
return schema.LabelsJsonSchema(nsfw)
|
||||
}
|
||||
73
internal/ai/vision/openai/testdata/caption-response.json
vendored
Normal file
73
internal/ai/vision/openai/testdata/caption-response.json
vendored
Normal file
@@ -0,0 +1,73 @@
|
||||
{
|
||||
"id": "resp_0d356718505119f3006916e5d8730881a0b91de2aa700f6196",
|
||||
"object": "response",
|
||||
"created_at": 1763108312,
|
||||
"status": "completed",
|
||||
"background": false,
|
||||
"billing": {
|
||||
"payer": "developer"
|
||||
},
|
||||
"error": null,
|
||||
"incomplete_details": null,
|
||||
"instructions": null,
|
||||
"max_output_tokens": 512,
|
||||
"max_tool_calls": null,
|
||||
"model": "gpt-5-nano-2025-08-07",
|
||||
"output": [
|
||||
{
|
||||
"id": "rs_0d356718505119f3006916e5d8efd481a0a4f9cc1823cc6c83",
|
||||
"type": "reasoning",
|
||||
"summary": []
|
||||
},
|
||||
{
|
||||
"id": "msg_0d356718505119f3006916e5d9433881a0bc79197d2cfc2027",
|
||||
"type": "message",
|
||||
"status": "completed",
|
||||
"content": [
|
||||
{
|
||||
"type": "output_text",
|
||||
"annotations": [],
|
||||
"logprobs": [],
|
||||
"text": "A bee gathers nectar from the vibrant red poppy\u2019s center."
|
||||
}
|
||||
],
|
||||
"role": "assistant"
|
||||
}
|
||||
],
|
||||
"parallel_tool_calls": true,
|
||||
"previous_response_id": null,
|
||||
"prompt_cache_key": null,
|
||||
"prompt_cache_retention": null,
|
||||
"reasoning": {
|
||||
"effort": "low",
|
||||
"summary": null
|
||||
},
|
||||
"safety_identifier": null,
|
||||
"service_tier": "default",
|
||||
"store": true,
|
||||
"temperature": 1.0,
|
||||
"text": {
|
||||
"format": {
|
||||
"type": "text"
|
||||
},
|
||||
"verbosity": "medium"
|
||||
},
|
||||
"tool_choice": "auto",
|
||||
"tools": [],
|
||||
"top_logprobs": 0,
|
||||
"top_p": 1.0,
|
||||
"truncation": "disabled",
|
||||
"usage": {
|
||||
"input_tokens": 576,
|
||||
"input_tokens_details": {
|
||||
"cached_tokens": 0
|
||||
},
|
||||
"output_tokens": 19,
|
||||
"output_tokens_details": {
|
||||
"reasoning_tokens": 0
|
||||
},
|
||||
"total_tokens": 595
|
||||
},
|
||||
"user": null,
|
||||
"metadata": {}
|
||||
}
|
||||
114
internal/ai/vision/openai/testdata/labels-response.json
vendored
Normal file
114
internal/ai/vision/openai/testdata/labels-response.json
vendored
Normal file
@@ -0,0 +1,114 @@
|
||||
{
|
||||
"id": "resp_0fa91dfb69b7d644006916ea0b72ac819f84ff3152a38dfcdb",
|
||||
"object": "response",
|
||||
"created_at": 1763109387,
|
||||
"status": "completed",
|
||||
"background": false,
|
||||
"billing": {
|
||||
"payer": "developer"
|
||||
},
|
||||
"error": null,
|
||||
"incomplete_details": null,
|
||||
"instructions": null,
|
||||
"max_output_tokens": 1024,
|
||||
"max_tool_calls": null,
|
||||
"model": "gpt-5-mini-2025-08-07",
|
||||
"output": [
|
||||
{
|
||||
"id": "rs_0fa91dfb69b7d644006916ea0c3450819f8a13396bf377f474",
|
||||
"type": "reasoning",
|
||||
"summary": []
|
||||
},
|
||||
{
|
||||
"id": "msg_0fa91dfb69b7d644006916ea0d2dfc819faf52b11334fc10a4",
|
||||
"type": "message",
|
||||
"status": "completed",
|
||||
"content": [
|
||||
{
|
||||
"type": "output_text",
|
||||
"annotations": [],
|
||||
"logprobs": [],
|
||||
"text": "{\"labels\":[{\"name\":\"flower\",\"confidence\":0.99,\"topicality\":0.99},{\"name\":\"bee\",\"confidence\":0.95,\"topicality\":0.95},{\"name\":\"petal\",\"confidence\":0.92,\"topicality\":0.88},{\"name\":\"pollen\",\"confidence\":0.85,\"topicality\":0.8},{\"name\":\"insect\",\"confidence\":0.9,\"topicality\":0.85},{\"name\":\"red\",\"confidence\":0.88,\"topicality\":0.6},{\"name\":\"close-up\",\"confidence\":0.86,\"topicality\":0.7},{\"name\":\"nature\",\"confidence\":0.8,\"topicality\":0.5}]}"
|
||||
}
|
||||
],
|
||||
"role": "assistant"
|
||||
}
|
||||
],
|
||||
"parallel_tool_calls": true,
|
||||
"previous_response_id": null,
|
||||
"prompt_cache_key": null,
|
||||
"prompt_cache_retention": null,
|
||||
"reasoning": {
|
||||
"effort": "low",
|
||||
"summary": null
|
||||
},
|
||||
"safety_identifier": null,
|
||||
"service_tier": "default",
|
||||
"store": true,
|
||||
"temperature": 1.0,
|
||||
"text": {
|
||||
"format": {
|
||||
"type": "json_schema",
|
||||
"description": null,
|
||||
"name": "photoprism_vision_labels_v1",
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"labels": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
},
|
||||
"confidence": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"maximum": 1
|
||||
},
|
||||
"topicality": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"maximum": 1
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"name",
|
||||
"confidence",
|
||||
"topicality"
|
||||
],
|
||||
"additionalProperties": false
|
||||
},
|
||||
"default": []
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"labels"
|
||||
],
|
||||
"additionalProperties": false
|
||||
},
|
||||
"strict": true
|
||||
},
|
||||
"verbosity": "medium"
|
||||
},
|
||||
"tool_choice": "auto",
|
||||
"tools": [],
|
||||
"top_logprobs": 0,
|
||||
"top_p": 1.0,
|
||||
"truncation": "disabled",
|
||||
"usage": {
|
||||
"input_tokens": 724,
|
||||
"input_tokens_details": {
|
||||
"cached_tokens": 0
|
||||
},
|
||||
"output_tokens": 169,
|
||||
"output_tokens_details": {
|
||||
"reasoning_tokens": 0
|
||||
},
|
||||
"total_tokens": 893
|
||||
},
|
||||
"user": null,
|
||||
"metadata": {}
|
||||
}
|
||||
142
internal/ai/vision/openai/transport.go
Normal file
142
internal/ai/vision/openai/transport.go
Normal file
@@ -0,0 +1,142 @@
|
||||
package openai
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const (
|
||||
// ContentTypeText identifies text input segments for the Responses API.
|
||||
ContentTypeText = "input_text"
|
||||
// ContentTypeImage identifies image input segments for the Responses API.
|
||||
ContentTypeImage = "input_image"
|
||||
|
||||
// ResponseFormatJSONSchema requests JSON constrained by a schema.
|
||||
ResponseFormatJSONSchema = "json_schema"
|
||||
// ResponseFormatJSONObject requests a free-form JSON object.
|
||||
ResponseFormatJSONObject = "json_object"
|
||||
)
|
||||
|
||||
// HTTPRequest represents the payload expected by OpenAI's Responses API.
|
||||
type HTTPRequest struct {
|
||||
Model string `json:"model"`
|
||||
Input []InputMessage `json:"input"`
|
||||
Text *TextOptions `json:"text,omitempty"`
|
||||
Reasoning *Reasoning `json:"reasoning,omitempty"`
|
||||
MaxOutputTokens int `json:"max_output_tokens,omitempty"`
|
||||
Temperature float64 `json:"temperature,omitempty"`
|
||||
TopP float64 `json:"top_p,omitempty"`
|
||||
PresencePenalty float64 `json:"presence_penalty,omitempty"`
|
||||
FrequencyPenalty float64 `json:"frequency_penalty,omitempty"`
|
||||
}
|
||||
|
||||
// TextOptions carries formatting preferences for textual responses.
|
||||
type TextOptions struct {
|
||||
Format *ResponseFormat `json:"format,omitempty"`
|
||||
}
|
||||
|
||||
// Reasoning configures the effort level for reasoning models.
|
||||
type Reasoning struct {
|
||||
Effort string `json:"effort,omitempty"`
|
||||
}
|
||||
|
||||
// InputMessage captures a single system or user message in the request.
|
||||
type InputMessage struct {
|
||||
Role string `json:"role"`
|
||||
Type string `json:"type,omitempty"`
|
||||
Content []ContentItem `json:"content"`
|
||||
}
|
||||
|
||||
// ContentItem represents a text or image entry within a message.
|
||||
type ContentItem struct {
|
||||
Type string `json:"type"`
|
||||
Text string `json:"text,omitempty"`
|
||||
ImageURL string `json:"image_url,omitempty"`
|
||||
Detail string `json:"detail,omitempty"`
|
||||
}
|
||||
|
||||
// ResponseFormat describes how OpenAI should format its response.
|
||||
type ResponseFormat struct {
|
||||
Type string `json:"type"`
|
||||
Name string `json:"name,omitempty"`
|
||||
Schema json.RawMessage `json:"schema,omitempty"`
|
||||
Description string `json:"description,omitempty"`
|
||||
Strict bool `json:"strict,omitempty"`
|
||||
}
|
||||
|
||||
// Response mirrors the subset of the Responses API response we need.
|
||||
type Response struct {
|
||||
ID string `json:"id"`
|
||||
Model string `json:"model"`
|
||||
Output []ResponseOutput `json:"output"`
|
||||
Error *struct {
|
||||
Message string `json:"message"`
|
||||
Type string `json:"type"`
|
||||
} `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
// ResponseOutput captures assistant messages within the response.
|
||||
type ResponseOutput struct {
|
||||
Role string `json:"role"`
|
||||
Content []ResponseContent `json:"content"`
|
||||
}
|
||||
|
||||
// ResponseContent contains individual message parts (JSON or text).
|
||||
type ResponseContent struct {
|
||||
Type string `json:"type"`
|
||||
Text string `json:"text,omitempty"`
|
||||
JSON json.RawMessage `json:"json,omitempty"`
|
||||
}
|
||||
|
||||
// FirstJSON returns the first JSON payload contained in the response.
|
||||
func (r *Response) FirstJSON() json.RawMessage {
|
||||
if r == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
for i := range r.Output {
|
||||
for j := range r.Output[i].Content {
|
||||
if len(r.Output[i].Content[j].JSON) > 0 {
|
||||
return r.Output[i].Content[j].JSON
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// FirstText returns the first textual payload contained in the response.
|
||||
func (r *Response) FirstText() string {
|
||||
if r == nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
for i := range r.Output {
|
||||
for j := range r.Output[i].Content {
|
||||
if text := strings.TrimSpace(r.Output[i].Content[j].Text); text != "" {
|
||||
return text
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
// ParseErrorMessage decodes a Responses API payload and returns the
// whitespace-trimmed "error.message" field. It returns an empty string
// when the payload cannot be decoded as JSON or has no error object.
func ParseErrorMessage(raw []byte) string {
	type errorDetails struct {
		Message string `json:"message"`
	}

	var payload struct {
		Error *errorDetails `json:"error"`
	}

	// Undecodable payloads and payloads without an error object both
	// yield an empty message.
	if json.Unmarshal(raw, &payload) != nil || payload.Error == nil {
		return ""
	}

	return strings.TrimSpace(payload.Error.Message)
}
|
||||
120
internal/ai/vision/openai/transport_test.go
Normal file
120
internal/ai/vision/openai/transport_test.go
Normal file
@@ -0,0 +1,120 @@
|
||||
package openai
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func loadTestResponse(t *testing.T, name string) *Response {
|
||||
t.Helper()
|
||||
|
||||
filePath := filepath.Join("testdata", name)
|
||||
|
||||
data, err := os.ReadFile(filePath)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to read %s: %v", filePath, err)
|
||||
}
|
||||
|
||||
var resp Response
|
||||
if err := json.Unmarshal(data, &resp); err != nil {
|
||||
t.Fatalf("failed to unmarshal %s: %v", filePath, err)
|
||||
}
|
||||
|
||||
return &resp
|
||||
}
|
||||
|
||||
func TestParseErrorMessage(t *testing.T) {
|
||||
t.Run("returns message when present", func(t *testing.T) {
|
||||
raw := []byte(`{"error":{"message":"Invalid schema"}}`)
|
||||
msg := ParseErrorMessage(raw)
|
||||
if msg != "Invalid schema" {
|
||||
t.Fatalf("expected message, got %q", msg)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("returns empty string when error is missing", func(t *testing.T) {
|
||||
raw := []byte(`{"output":[]}`)
|
||||
if msg := ParseErrorMessage(raw); msg != "" {
|
||||
t.Fatalf("expected empty message, got %q", msg)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestResponseFirstTextCaption(t *testing.T) {
|
||||
resp := loadTestResponse(t, "caption-response.json")
|
||||
|
||||
if jsonPayload := resp.FirstJSON(); len(jsonPayload) != 0 {
|
||||
t.Fatalf("expected no JSON payload, got: %s", jsonPayload)
|
||||
}
|
||||
|
||||
text := resp.FirstText()
|
||||
expected := "A bee gathers nectar from the vibrant red poppy’s center."
|
||||
if text != expected {
|
||||
t.Fatalf("unexpected caption text: %q", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResponseFirstTextLabels(t *testing.T) {
|
||||
resp := loadTestResponse(t, "labels-response.json")
|
||||
|
||||
if jsonPayload := resp.FirstJSON(); len(jsonPayload) != 0 {
|
||||
t.Fatalf("expected no JSON payload, got: %s", jsonPayload)
|
||||
}
|
||||
|
||||
text := resp.FirstText()
|
||||
if len(text) == 0 {
|
||||
t.Fatal("expected structured JSON string in text payload")
|
||||
}
|
||||
if text[0] != '{' {
|
||||
t.Fatalf("expected JSON object in text payload, got %q", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResponseFirstJSONFromStructuredPayload(t *testing.T) {
|
||||
resp := &Response{
|
||||
ID: "resp_structured",
|
||||
Model: "gpt-5-mini",
|
||||
Output: []ResponseOutput{
|
||||
{
|
||||
Role: "assistant",
|
||||
Content: []ResponseContent{
|
||||
{
|
||||
Type: "output_json",
|
||||
JSON: json.RawMessage(`{"labels":[{"name":"sunset"}]}`),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
jsonPayload := resp.FirstJSON()
|
||||
if len(jsonPayload) == 0 {
|
||||
t.Fatal("expected JSON payload, got empty result")
|
||||
}
|
||||
|
||||
var decoded struct {
|
||||
Labels []map[string]string `json:"labels"`
|
||||
}
|
||||
if err := json.Unmarshal(jsonPayload, &decoded); err != nil {
|
||||
t.Fatalf("failed to decode JSON payload: %v", err)
|
||||
}
|
||||
|
||||
if len(decoded.Labels) != 1 || decoded.Labels[0]["name"] != "sunset" {
|
||||
t.Fatalf("unexpected JSON payload: %+v", decoded.Labels)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSchemaLabelsReturnsValidJSON(t *testing.T) {
|
||||
raw := SchemaLabels(false)
|
||||
|
||||
var decoded map[string]any
|
||||
if err := json.Unmarshal(raw, &decoded); err != nil {
|
||||
t.Fatalf("schema should be valid JSON: %v", err)
|
||||
}
|
||||
|
||||
if decoded["type"] != "object" {
|
||||
t.Fatalf("expected type object, got %v", decoded["type"])
|
||||
}
|
||||
}
|
||||
@@ -1,16 +1,115 @@
|
||||
package schema
|
||||
|
||||
// LabelsDefault provides the minimal JSON schema for label responses used across engines.
|
||||
const (
|
||||
LabelsDefault = "{\n \"labels\": [{\n \"name\": \"\",\n \"confidence\": 0,\n \"topicality\": 0 }]\n}"
|
||||
LabelsNSFW = "{\n \"labels\": [{\n \"name\": \"\",\n \"confidence\": 0,\n \"topicality\": 0,\n \"nsfw\": false,\n \"nsfw_confidence\": 0\n }]\n}"
|
||||
import (
|
||||
"encoding/json"
|
||||
)
|
||||
|
||||
// Labels returns the canonical label schema string.
|
||||
func Labels(nsfw bool) string {
|
||||
// LabelsJsonSchemaDefault provides the minimal JSON schema for label responses used across engines.
|
||||
const (
|
||||
LabelsJsonSchemaDefault = `{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"labels": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
},
|
||||
"confidence": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"maximum": 1
|
||||
},
|
||||
"topicality": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"maximum": 1
|
||||
}
|
||||
},
|
||||
"required": ["name", "confidence", "topicality"],
|
||||
"additionalProperties": false
|
||||
},
|
||||
"default": []
|
||||
}
|
||||
},
|
||||
"required": ["labels"],
|
||||
"additionalProperties": false
|
||||
}`
|
||||
LabelsJsonDefault = "{\n \"labels\": [{\n \"name\": \"\",\n \"confidence\": 0,\n \"topicality\": 0 }]\n}"
|
||||
LabelsJsonSchemaNSFW = `{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"labels": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
},
|
||||
"confidence": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"maximum": 1
|
||||
},
|
||||
"topicality": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"maximum": 1
|
||||
},
|
||||
"nsfw": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"nsfw_confidence": {
|
||||
"type": "number",
|
||||
"minimum": 0,
|
||||
"maximum": 1
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"name",
|
||||
"confidence",
|
||||
"topicality",
|
||||
"nsfw",
|
||||
"nsfw_confidence"
|
||||
],
|
||||
"additionalProperties": false
|
||||
},
|
||||
"default": []
|
||||
}
|
||||
},
|
||||
"required": ["labels"],
|
||||
"additionalProperties": false
|
||||
}`
|
||||
LabelsJsonNSFW = "{\n \"labels\": [{\n \"name\": \"\",\n \"confidence\": 0,\n \"topicality\": 0,\n \"nsfw\": false,\n \"nsfw_confidence\": 0\n }]\n}"
|
||||
)
|
||||
|
||||
// LabelsJsonSchema returns the canonical label JSON Schema string for OpenAI API endpoints.
|
||||
//
|
||||
// Related documentation and references:
|
||||
// - https://platform.openai.com/docs/guides/structured-outputs
|
||||
// - https://json-schema.org/learn/miscellaneous-examples
|
||||
func LabelsJsonSchema(nsfw bool) json.RawMessage {
|
||||
if nsfw {
|
||||
return LabelsNSFW
|
||||
return json.RawMessage(LabelsJsonSchemaNSFW)
|
||||
} else {
|
||||
return LabelsDefault
|
||||
return json.RawMessage(LabelsJsonSchemaDefault)
|
||||
}
|
||||
}
|
||||
|
||||
// LabelsJson returns the canonical label JSON string for Ollama vision models.
|
||||
//
|
||||
// Related documentation and references:
|
||||
// - https://www.alibabacloud.com/help/en/model-studio/json-mode
|
||||
// - https://www.json.org/json-en.html
|
||||
func LabelsJson(nsfw bool) string {
|
||||
if nsfw {
|
||||
return LabelsJsonNSFW
|
||||
} else {
|
||||
return LabelsJsonDefault
|
||||
}
|
||||
}
|
||||
|
||||
36
internal/ai/vision/schema/name.go
Normal file
36
internal/ai/vision/schema/name.go
Normal file
@@ -0,0 +1,36 @@
|
||||
package schema
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
|
||||
"github.com/photoprism/photoprism/pkg/clean"
|
||||
)
|
||||
|
||||
const (
|
||||
NamePrefix = "photoprism_vision"
|
||||
)
|
||||
|
||||
// JsonSchemaName returns the schema version string to be used for API requests.
|
||||
func JsonSchemaName(schema json.RawMessage, version string) string {
|
||||
var schemaName string
|
||||
|
||||
switch {
|
||||
case bytes.Contains(schema, []byte("labels")):
|
||||
schemaName = "labels"
|
||||
case bytes.Contains(schema, []byte("labels")):
|
||||
schemaName = "caption"
|
||||
default:
|
||||
schemaName = "schema"
|
||||
}
|
||||
|
||||
version = clean.TypeLowerUnderscore(version)
|
||||
|
||||
if version == "" {
|
||||
version = "v1"
|
||||
}
|
||||
|
||||
return fmt.Sprintf("%s_%s_%s", NamePrefix, schemaName, version)
|
||||
|
||||
}
|
||||
23
internal/ai/vision/schema/name_test.go
Normal file
23
internal/ai/vision/schema/name_test.go
Normal file
@@ -0,0 +1,23 @@
|
||||
package schema
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestJsonSchemaName(t *testing.T) {
|
||||
t.Run("Default", func(t *testing.T) {
|
||||
assert.Equal(t, "photoprism_vision_schema_v1", JsonSchemaName(nil, ""))
|
||||
})
|
||||
t.Run("Labels", func(t *testing.T) {
|
||||
assert.Equal(t, "photoprism_vision_labels_v1", JsonSchemaName(json.RawMessage(LabelsJsonSchemaDefault), ""))
|
||||
})
|
||||
t.Run("LabelsV1", func(t *testing.T) {
|
||||
assert.Equal(t, "photoprism_vision_labels_v2", JsonSchemaName([]byte("labels"), "v2"))
|
||||
})
|
||||
t.Run("LabelsJsonSchema", func(t *testing.T) {
|
||||
assert.Equal(t, "photoprism_vision_labels_v1", JsonSchemaName(LabelsJsonSchema(false), "v1"))
|
||||
})
|
||||
}
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Package schema defines canonical JSON schema templates shared by PhotoPrism's AI vision engines.
|
||||
Package schema defines canonical JSON and JSON Schema templates shared by PhotoPrism's AI vision engines.
|
||||
|
||||
Copyright (c) 2018 - 2025 PhotoPrism UG. All rights reserved.
|
||||
|
||||
|
||||
@@ -4,5 +4,5 @@ package feat
|
||||
var (
|
||||
VisionModelGenerate = false // controls exposure of the generate endpoint and CLI commands
|
||||
VisionModelMarkers = false // gates marker generation/return until downstream UI and reconciliation paths are ready
|
||||
VisionServiceOpenAI = false // controls whether users are able to configure OpenAI as a vision service engine
|
||||
VisionServiceOpenAI = true // controls whether users are able to configure OpenAI as a vision service engine
|
||||
)
|
||||
|
||||
@@ -135,6 +135,7 @@ func (w *Vision) Start(filter string, count int, models []string, customSrc stri
|
||||
done := make(map[string]bool)
|
||||
offset := 0
|
||||
updated := 0
|
||||
processed := 0
|
||||
|
||||
// Make sure count is within the supported range (1 to search.MaxResults).
|
||||
if count < 1 || count > search.MaxResults {
|
||||
@@ -197,6 +198,8 @@ func (w *Vision) Start(filter string, count int, models []string, customSrc stri
|
||||
continue
|
||||
}
|
||||
|
||||
processed++
|
||||
|
||||
fileName := photoprism.FileName(photo.FileRoot, photo.FileName)
|
||||
file, fileErr := photoprism.NewMediaFile(fileName)
|
||||
|
||||
@@ -279,7 +282,18 @@ func (w *Vision) Start(filter string, count int, models []string, customSrc stri
|
||||
}
|
||||
}
|
||||
|
||||
log.Infof("vision: updated %s [%s]", english.Plural(updated, "picture", "pictures"), time.Since(start))
|
||||
elapsed := time.Since(start)
|
||||
|
||||
switch {
|
||||
case processed == 0:
|
||||
log.Infof("vision: no pictures required processing [%s]", elapsed)
|
||||
case updated == processed:
|
||||
log.Infof("vision: updated %s [%s]", english.Plural(updated, "picture", "pictures"), elapsed)
|
||||
case updated == 0:
|
||||
log.Infof("vision: processed %s (no metadata changes detected) [%s]", english.Plural(processed, "picture", "pictures"), elapsed)
|
||||
default:
|
||||
log.Infof("vision: updated %s out of %s [%s]", english.Plural(updated, "picture", "pictures"), english.Plural(processed, "picture", "pictures"), elapsed)
|
||||
}
|
||||
|
||||
if updated > 0 {
|
||||
updateIndex = true
|
||||
|
||||
Reference in New Issue
Block a user