AI: Generate Captions & Labels using the OpenAI Responses API #5322

Signed-off-by: Michael Mayer <michael@photoprism.app>
This commit is contained in:
Michael Mayer
2025-11-14 11:10:34 +01:00
parent 46d5e33c8c
commit d76acdb69f
28 changed files with 1822 additions and 127 deletions

View File

@@ -9,6 +9,9 @@ import (
"io"
"net/http"
"github.com/sirupsen/logrus"
"github.com/photoprism/photoprism/internal/ai/vision/ollama"
"github.com/photoprism/photoprism/pkg/clean"
"github.com/photoprism/photoprism/pkg/http/header"
)
@@ -69,6 +72,10 @@ func PerformApiRequest(apiRequest *ApiRequest, uri, method, key string) (apiResp
return nil, parseErr
}
if log.IsLevelEnabled(logrus.TraceLevel) {
log.Tracef("vision: response %s", string(body))
}
return parsed, nil
}
@@ -89,12 +96,12 @@ func PerformApiRequest(apiRequest *ApiRequest, uri, method, key string) (apiResp
return apiResponse, nil
}
func decodeOllamaResponse(data []byte) (*ApiResponseOllama, error) {
resp := &ApiResponseOllama{}
func decodeOllamaResponse(data []byte) (*ollama.Response, error) {
resp := &ollama.Response{}
dec := json.NewDecoder(bytes.NewReader(data))
for {
var chunk ApiResponseOllama
var chunk ollama.Response
if err := dec.Decode(&chunk); err != nil {
if errors.Is(err, io.EOF) {
break

View File

@@ -8,6 +8,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/photoprism/photoprism/internal/ai/vision/ollama"
"github.com/photoprism/photoprism/pkg/http/scheme"
)
@@ -49,7 +50,7 @@ func TestPerformApiRequestOllama(t *testing.T) {
var req ApiRequest
assert.NoError(t, json.NewDecoder(r.Body).Decode(&req))
assert.Equal(t, FormatJSON, req.Format)
assert.NoError(t, json.NewEncoder(w).Encode(ApiResponseOllama{
assert.NoError(t, json.NewEncoder(w).Encode(ollama.Response{
Model: "qwen2.5vl:latest",
Response: `{"labels":[{"name":"test","confidence":0.9,"topicality":0.8}]}`,
}))
@@ -72,7 +73,7 @@ func TestPerformApiRequestOllama(t *testing.T) {
})
t.Run("LabelsWithCodeFence", func(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
assert.NoError(t, json.NewEncoder(w).Encode(ApiResponseOllama{
assert.NoError(t, json.NewEncoder(w).Encode(ollama.Response{
Model: "gemma3:latest",
Response: "```json\n{\"labels\":[{\"name\":\"lingerie\",\"confidence\":0.81,\"topicality\":0.73}]}\n```\nThe model provided additional commentary.",
}))
@@ -95,7 +96,7 @@ func TestPerformApiRequestOllama(t *testing.T) {
})
t.Run("CaptionFallback", func(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
assert.NoError(t, json.NewEncoder(w).Encode(ApiResponseOllama{
assert.NoError(t, json.NewEncoder(w).Encode(ollama.Response{
Model: "qwen2.5vl:latest",
Response: "plain text",
}))

View File

@@ -1,10 +1,8 @@
package vision
import (
"errors"
"fmt"
"os"
"time"
"github.com/photoprism/photoprism/pkg/clean"
"github.com/photoprism/photoprism/pkg/http/scheme"
@@ -12,53 +10,6 @@ import (
"github.com/photoprism/photoprism/pkg/rnd"
)
// ApiResponseOllama represents a Ollama API service response.
type ApiResponseOllama struct {
Id string `yaml:"Id,omitempty" json:"id,omitempty"`
Code int `yaml:"Code,omitempty" json:"code,omitempty"`
Error string `yaml:"Error,omitempty" json:"error,omitempty"`
Model string `yaml:"Model,omitempty" json:"model,omitempty"`
CreatedAt time.Time `yaml:"CreatedAt,omitempty" json:"created_at,omitempty"`
Response string `yaml:"Response,omitempty" json:"response,omitempty"`
Done bool `yaml:"Done,omitempty" json:"done,omitempty"`
Context []int `yaml:"Context,omitempty" json:"context,omitempty"`
TotalDuration int64 `yaml:"TotalDuration,omitempty" json:"total_duration,omitempty"`
LoadDuration int `yaml:"LoadDuration,omitempty" json:"load_duration,omitempty"`
PromptEvalCount int `yaml:"PromptEvalCount,omitempty" json:"prompt_eval_count,omitempty"`
PromptEvalDuration int `yaml:"PromptEvalDuration,omitempty" json:"prompt_eval_duration,omitempty"`
EvalCount int `yaml:"EvalCount,omitempty" json:"eval_count,omitempty"`
EvalDuration int64 `yaml:"EvalDuration,omitempty" json:"eval_duration,omitempty"`
Result ApiResult `yaml:"Result,omitempty" json:"result,omitempty"`
}
// Err returns an error if the request has failed.
func (r *ApiResponseOllama) Err() error {
if r == nil {
return errors.New("response is nil")
}
if r.Code >= 400 {
if r.Error != "" {
return errors.New(r.Error)
}
return fmt.Errorf("error %d", r.Code)
} else if r.Result.IsEmpty() {
return errors.New("no result")
}
return nil
}
// HasResult checks if there is at least one result in the response data.
func (r *ApiResponseOllama) HasResult() bool {
if r == nil {
return false
}
return !r.Result.IsEmpty()
}
// NewApiRequestOllama returns a new Ollama API request with the specified images as payload.
func NewApiRequestOllama(images Files, fileScheme scheme.Type) (*ApiRequest, error) {
imagesData := make(Files, len(images))

View File

@@ -11,6 +11,8 @@ import (
"github.com/sirupsen/logrus"
"github.com/photoprism/photoprism/internal/ai/vision/openai"
"github.com/photoprism/photoprism/internal/ai/vision/schema"
"github.com/photoprism/photoprism/internal/api/download"
"github.com/photoprism/photoprism/pkg/clean"
"github.com/photoprism/photoprism/pkg/fs"
@@ -58,6 +60,11 @@ type ApiRequestOptions struct {
UseMmap bool `yaml:"UseMmap,omitempty" json:"use_mmap,omitempty"`
UseMlock bool `yaml:"UseMlock,omitempty" json:"use_mlock,omitempty"`
NumThread int `yaml:"NumThread,omitempty" json:"num_thread,omitempty"`
MaxOutputTokens int `yaml:"MaxOutputTokens,omitempty" json:"max_output_tokens,omitempty"`
Detail string `yaml:"Detail,omitempty" json:"detail,omitempty"`
ForceJson bool `yaml:"ForceJson,omitempty" json:"force_json,omitempty"`
SchemaVersion string `yaml:"SchemaVersion,omitempty" json:"schema_version,omitempty"`
CombineOutputs string `yaml:"CombineOutputs,omitempty" json:"combine_outputs,omitempty"`
}
// ApiRequestContext represents a context parameter returned from a previous request.
@@ -77,6 +84,7 @@ type ApiRequest struct {
Context *ApiRequestContext `form:"context" yaml:"Context,omitempty" json:"context,omitempty"`
Stream bool `form:"stream" yaml:"Stream,omitempty" json:"stream"`
Images Files `form:"images" yaml:"Images,omitempty" json:"images,omitempty"`
Schema json.RawMessage `form:"schema" yaml:"Schema,omitempty" json:"schema,omitempty"`
ResponseFormat ApiFormat `form:"-" yaml:"-" json:"-"`
}
@@ -195,6 +203,14 @@ func (r *ApiRequest) GetResponseFormat() ApiFormat {
// JSON encodes the request as JSON bytes.
//
// A nil receiver yields an error. Requests whose ResponseFormat is
// ApiFormatOpenAI are serialized by openAIJSON; every other request is
// marshaled directly from its struct fields.
func (r *ApiRequest) JSON() ([]byte, error) {
	switch {
	case r == nil:
		return nil, errors.New("api request is nil")
	case r.ResponseFormat == ApiFormatOpenAI:
		return r.openAIJSON()
	default:
		return json.Marshal(*r)
	}
}
@@ -229,6 +245,8 @@ func (r *ApiRequest) sanitizedForLog() ApiRequest {
sanitized.Url = sanitizeLogPayload(r.Url)
sanitized.Schema = r.Schema
return sanitized
}
@@ -287,3 +305,134 @@ func isLikelyBase64(value string) bool {
return true
}
// openAIJSON serializes the request as an openai.HTTPRequest payload.
//
// The system prompt (if any) becomes a "system" message; the prompt and all
// non-empty images are combined into a single "user" message. An error is
// returned when neither message can be built. The model name falls back to
// openai.DefaultModel, and option values are only copied when they are set.
func (r *ApiRequest) openAIJSON() ([]byte, error) {
	options := r.Options

	// Resolve the image detail level, preferring a non-blank option value.
	imageDetail := openai.DefaultDetail

	if options != nil {
		if custom := strings.TrimSpace(options.Detail); custom != "" {
			imageDetail = custom
		}
	}

	// Collect the user content: the prompt text first, then every image.
	parts := make([]openai.ContentItem, 0, len(r.Images)+1)

	if text := strings.TrimSpace(r.Prompt); text != "" {
		parts = append(parts, openai.ContentItem{
			Type: openai.ContentTypeText,
			Text: text,
		})
	}

	for _, image := range r.Images {
		if image == "" {
			continue
		}

		parts = append(parts, openai.ContentItem{
			Type:     openai.ContentTypeImage,
			ImageURL: image,
			Detail:   imageDetail,
		})
	}

	// Assemble the message list: optional system message, then the user message.
	input := make([]openai.InputMessage, 0, 2)

	if sys := strings.TrimSpace(r.System); sys != "" {
		input = append(input, openai.InputMessage{
			Role: "system",
			Type: "message",
			Content: []openai.ContentItem{
				{
					Type: openai.ContentTypeText,
					Text: sys,
				},
			},
		})
	}

	if len(parts) != 0 {
		input = append(input, openai.InputMessage{
			Role:    "user",
			Type:    "message",
			Content: parts,
		})
	}

	if len(input) == 0 {
		return nil, errors.New("openai request requires at least one message")
	}

	modelName := strings.TrimSpace(r.Model)

	if modelName == "" {
		modelName = openai.DefaultModel
	}

	request := openai.HTTPRequest{
		Model: modelName,
		Input: input,
	}

	// Models whose name starts with "gpt-5" get a low reasoning effort.
	if strings.HasPrefix(strings.ToLower(request.Model), "gpt-5") {
		request.Reasoning = &openai.Reasoning{Effort: "low"}
	}

	// Copy only those options that carry an explicit value.
	if options != nil {
		if options.MaxOutputTokens > 0 {
			request.MaxOutputTokens = options.MaxOutputTokens
		}

		if options.Temperature > 0 {
			request.Temperature = options.Temperature
		}

		if options.TopP > 0 {
			request.TopP = options.TopP
		}

		if options.PresencePenalty != 0 {
			request.PresencePenalty = options.PresencePenalty
		}

		if options.FrequencyPenalty != 0 {
			request.FrequencyPenalty = options.FrequencyPenalty
		}
	}

	if outputFormat := buildOpenAIResponseFormat(r); outputFormat != nil {
		request.Text = &openai.TextOptions{
			Format: outputFormat,
		}
	}

	return json.Marshal(request)
}
// buildOpenAIResponseFormat selects the structured output format for a request.
//
// It returns nil for a nil request, and for a request that has no schema
// unless its options set ForceJson, in which case a plain JSON object format
// is returned. When a schema is present, a JSON schema format is returned
// whose name is the trimmed SchemaVersion option or, if that is blank, the
// name derived by schema.JsonSchemaName.
func buildOpenAIResponseFormat(r *ApiRequest) *openai.ResponseFormat {
	if r == nil {
		return nil
	}

	// Without a schema, JSON output is only requested when explicitly forced.
	if len(r.Schema) == 0 {
		if r.Options != nil && r.Options.ForceJson {
			return &openai.ResponseFormat{Type: openai.ResponseFormatJSONObject}
		}

		return nil
	}

	name := ""

	if r.Options != nil {
		name = strings.TrimSpace(r.Options.SchemaVersion)
	}

	if name == "" {
		name = schema.JsonSchemaName(r.Schema, openai.DefaultSchemaVersion)
	}

	return &openai.ResponseFormat{
		Type:   openai.ResponseFormatJSONSchema,
		Name:   name,
		Schema: r.Schema,
	}
}

View File

@@ -53,7 +53,11 @@ func captionInternal(images Files, mediaSrc media.Src) (result *CaptionResult, m
apiRequest.System = model.GetSystemPrompt()
apiRequest.Prompt = model.GetPrompt()
apiRequest.Options = model.GetOptions()
if apiRequest.Options == nil {
apiRequest.Options = model.GetOptions()
}
apiRequest.WriteLog()
if apiResponse, err = PerformApiRequest(apiRequest, uri, method, model.EndpointKey()); err != nil {

View File

@@ -58,14 +58,15 @@ func init() {
RegisterEngineAlias(EngineVision, EngineInfo{
RequestFormat: ApiFormatVision,
ResponseFormat: ApiFormatVision,
FileScheme: string(scheme.Data),
FileScheme: scheme.Data,
DefaultResolution: DefaultResolution,
})
RegisterEngineAlias(openai.EngineName, EngineInfo{
Uri: "https://api.openai.com/v1/responses",
RequestFormat: ApiFormatOpenAI,
ResponseFormat: ApiFormatOpenAI,
FileScheme: string(scheme.Data),
FileScheme: scheme.Data,
DefaultResolution: openai.DefaultResolution,
})
}
@@ -79,6 +80,7 @@ func RegisterEngine(format ApiFormat, engine Engine) {
// EngineInfo describes metadata that can be associated with an engine alias.
type EngineInfo struct {
Uri string
RequestFormat ApiFormat
ResponseFormat ApiFormat
FileScheme string

View File

@@ -28,7 +28,7 @@ func init() {
RegisterEngineAlias(ollama.EngineName, EngineInfo{
RequestFormat: ApiFormatOllama,
ResponseFormat: ApiFormatOllama,
FileScheme: string(scheme.Base64),
FileScheme: scheme.Base64,
DefaultResolution: ollama.DefaultResolution,
})
@@ -72,7 +72,7 @@ func (ollamaDefaults) SchemaTemplate(model *Model) string {
switch model.Type {
case ModelTypeLabels:
return ollama.LabelsSchema(model.PromptContains("nsfw"))
return ollama.SchemaLabels(model.PromptContains("nsfw"))
}
return ""
@@ -134,64 +134,93 @@ func (ollamaParser) Parse(ctx context.Context, req *ApiRequest, raw []byte, stat
return nil, err
}
result := &ApiResponse{
response := &ApiResponse{
Id: req.GetId(),
Code: status,
Model: &Model{Name: ollamaResp.Model},
Result: ApiResult{
Labels: append([]LabelResult{}, ollamaResp.Result.Labels...),
Caption: func() *CaptionResult {
if ollamaResp.Result.Caption != nil {
copyCaption := *ollamaResp.Result.Caption
return &copyCaption
}
return nil
}(),
Labels: convertOllamaLabels(ollamaResp.Result.Labels),
Caption: convertOllamaCaption(ollamaResp.Result.Caption),
},
}
parsedLabels := len(result.Result.Labels) > 0
parsedLabels := len(response.Result.Labels) > 0
if !parsedLabels && strings.TrimSpace(ollamaResp.Response) != "" && req.Format == FormatJSON {
if labels, parseErr := parseOllamaLabels(ollamaResp.Response); parseErr != nil {
log.Debugf("vision: %s (parse ollama labels)", clean.Error(parseErr))
} else if len(labels) > 0 {
result.Result.Labels = append(result.Result.Labels, labels...)
response.Result.Labels = append(response.Result.Labels, labels...)
parsedLabels = true
}
}
if parsedLabels {
filtered := result.Result.Labels[:0]
for i := range result.Result.Labels {
if result.Result.Labels[i].Confidence <= 0 {
result.Result.Labels[i].Confidence = ollama.LabelConfidenceDefault
filtered := response.Result.Labels[:0]
for i := range response.Result.Labels {
if response.Result.Labels[i].Confidence <= 0 {
response.Result.Labels[i].Confidence = ollama.LabelConfidenceDefault
}
if result.Result.Labels[i].Topicality <= 0 {
result.Result.Labels[i].Topicality = result.Result.Labels[i].Confidence
if response.Result.Labels[i].Topicality <= 0 {
response.Result.Labels[i].Topicality = response.Result.Labels[i].Confidence
}
// Apply thresholds and canonicalize the name.
normalizeLabelResult(&result.Result.Labels[i])
normalizeLabelResult(&response.Result.Labels[i])
if result.Result.Labels[i].Name == "" {
if response.Result.Labels[i].Name == "" {
continue
}
if result.Result.Labels[i].Source == "" {
result.Result.Labels[i].Source = entity.SrcOllama
if response.Result.Labels[i].Source == "" {
response.Result.Labels[i].Source = entity.SrcOllama
}
filtered = append(filtered, result.Result.Labels[i])
filtered = append(filtered, response.Result.Labels[i])
}
result.Result.Labels = filtered
response.Result.Labels = filtered
} else if caption := strings.TrimSpace(ollamaResp.Response); caption != "" {
result.Result.Caption = &CaptionResult{
response.Result.Caption = &CaptionResult{
Text: caption,
Source: entity.SrcOllama,
}
}
return result, nil
return response, nil
}
// convertOllamaLabels copies Ollama label payloads into LabelResult values,
// field by field. It returns nil when the payload is empty.
func convertOllamaLabels(payload []ollama.LabelPayload) []LabelResult {
	if len(payload) == 0 {
		return nil
	}

	converted := make([]LabelResult, 0, len(payload))

	for _, item := range payload {
		converted = append(converted, LabelResult{
			Name:           item.Name,
			Source:         item.Source,
			Priority:       item.Priority,
			Confidence:     item.Confidence,
			Topicality:     item.Topicality,
			Categories:     item.Categories,
			NSFW:           item.NSFW,
			NSFWConfidence: item.NSFWConfidence,
		})
	}

	return converted
}
// convertOllamaCaption copies an Ollama caption payload into a new
// CaptionResult. It returns nil when the payload is nil.
func convertOllamaCaption(payload *ollama.CaptionPayload) *CaptionResult {
	if payload != nil {
		caption := new(CaptionResult)
		caption.Text = payload.Text
		caption.Source = payload.Source
		caption.Confidence = payload.Confidence

		return caption
	}

	return nil
}

View File

@@ -10,9 +10,9 @@ import (
func TestOllamaDefaultConfidenceApplied(t *testing.T) {
req := &ApiRequest{Format: FormatJSON}
payload := ApiResponseOllama{
Result: ApiResult{
Labels: []LabelResult{{Name: "forest path", Confidence: 0, Topicality: 0}},
payload := ollama.Response{
Result: ollama.ResultPayload{
Labels: []ollama.LabelPayload{{Name: "forest path", Confidence: 0, Topicality: 0}},
},
}
raw, err := json.Marshal(payload)

View File

@@ -1,18 +1,342 @@
package vision
import (
"context"
"encoding/json"
"errors"
"fmt"
"strings"
"github.com/photoprism/photoprism/internal/ai/vision/openai"
"github.com/photoprism/photoprism/internal/entity"
"github.com/photoprism/photoprism/pkg/clean"
"github.com/photoprism/photoprism/pkg/http/scheme"
)
// init registers the OpenAI engine alias so models can set Engine: "openai"
// and inherit sensible defaults (request/response formats, file scheme, and
// preferred thumbnail resolution).
type (
	// openaiDefaults supplies the default prompts, schema template, and
	// request options used by OpenAI engines.
	openaiDefaults struct{}

	// openaiBuilder creates ApiRequest values targeting OpenAI's Responses API.
	openaiBuilder struct{}

	// openaiParser turns Responses API payloads into ApiResponse values.
	openaiParser struct{}
)
func init() {
RegisterEngineAlias(openai.EngineName, EngineInfo{
RequestFormat: ApiFormatOpenAI,
ResponseFormat: ApiFormatOpenAI,
FileScheme: string(scheme.Base64),
DefaultResolution: openai.DefaultResolution,
RegisterEngine(ApiFormatOpenAI, Engine{
Builder: openaiBuilder{},
Parser: openaiParser{},
Defaults: openaiDefaults{},
})
}
// SystemPrompt returns the default OpenAI system prompt for caption and label
// models, and an empty string for a nil model or any other model type.
func (openaiDefaults) SystemPrompt(model *Model) string {
	if model == nil {
		return ""
	}

	if model.Type == ModelTypeCaption {
		return openai.CaptionSystem
	}

	if model.Type == ModelTypeLabels {
		return openai.LabelSystem
	}

	return ""
}
// UserPrompt returns the default OpenAI user prompt for caption and label
// models, and an empty string for a nil model or any other model type. Label
// models get the NSFW prompt variant when DetectNSFWLabels is set.
func (openaiDefaults) UserPrompt(model *Model) string {
	if model == nil {
		return ""
	}

	if model.Type == ModelTypeCaption {
		return openai.CaptionPrompt
	}

	if model.Type != ModelTypeLabels {
		return ""
	}

	if DetectNSFWLabels {
		return openai.LabelPromptNSFW
	}

	return openai.LabelPromptDefault
}
// SchemaTemplate returns the labels JSON schema as a string for label models,
// and an empty string for a nil model or any other model type. The NSFW
// variant is requested when the model prompt contains "nsfw".
func (openaiDefaults) SchemaTemplate(model *Model) string {
	if model == nil || model.Type != ModelTypeLabels {
		return ""
	}

	withNSFW := model.PromptContains("nsfw")

	return string(openai.SchemaLabels(withNSFW))
}
// Options returns the default OpenAI request options for caption and label
// models, and nil for a nil model or any other model type.
//
// Both model types share the default detail level, temperature, and top-p
// value from the openai package; they differ in the output token limit, and
// only label models force JSON output. No schema version is set here.
// (Sampling values are zeroed for GPT-5 models in openaiBuilder.Build.)
func (openaiDefaults) Options(model *Model) *ApiRequestOptions {
	if model == nil {
		return nil
	}

	// Defaults common to captions and labels.
	defaults := ApiRequestOptions{
		Detail:      openai.DefaultDetail,
		Temperature: openai.DefaultTemperature,
		TopP:        openai.DefaultTopP,
	}

	switch model.Type {
	case ModelTypeCaption:
		defaults.MaxOutputTokens = openai.CaptionMaxTokens
	case ModelTypeLabels:
		defaults.MaxOutputTokens = openai.LabelsMaxTokens
		defaults.ForceJson = true
	default:
		return nil
	}

	return &defaults
}
// Build assembles the ApiRequest for an OpenAI model from the given files.
//
// Images are prepared by NewApiRequestImages with the scheme.Data file scheme.
// Model options are cloned and adjusted: caption requests never force JSON,
// the output token limit is raised to the per-type minimum, and sampling
// values are zeroed for models whose name starts with "gpt-5". A schema
// template that fails to parse is logged and otherwise ignored.
//
// It returns ErrInvalidModel for a nil model and passes through any error
// from NewApiRequestImages.
func (openaiBuilder) Build(ctx context.Context, model *Model, files Files) (*ApiRequest, error) {
	if model == nil {
		return nil, ErrInvalidModel
	}

	imageReq, buildErr := NewApiRequestImages(files, scheme.Data)

	if buildErr != nil {
		return nil, buildErr
	}

	request := &ApiRequest{
		Id:             imageReq.Id,
		Images:         append(Files(nil), imageReq.Images...),
		ResponseFormat: ApiFormatOpenAI,
	}

	if modelOpts := model.GetOptions(); modelOpts != nil {
		request.Options = cloneOptions(modelOpts)

		switch model.Type {
		case ModelTypeCaption:
			// Captions default to plain text responses; structured JSON is optional.
			request.Options.ForceJson = false

			if request.Options.MaxOutputTokens < openai.CaptionMaxTokens {
				request.Options.MaxOutputTokens = openai.CaptionMaxTokens
			}
		case ModelTypeLabels:
			if request.Options.MaxOutputTokens < openai.LabelsMaxTokens {
				request.Options.MaxOutputTokens = openai.LabelsMaxTokens
			}
		}

		// Zero the sampling options for GPT-5 models.
		if name := strings.ToLower(strings.TrimSpace(model.Name)); strings.HasPrefix(name, "gpt-5") {
			request.Options.Temperature = 0
			request.Options.TopP = 0
		}
	}

	template := strings.TrimSpace(model.SchemaTemplate())

	if template == "" {
		return request, nil
	}

	raw, parseErr := parseOpenAISchema(template)

	if parseErr != nil {
		log.Warnf("vision: failed to parse OpenAI schema template (%s)", clean.Error(parseErr))
		return request, nil
	}

	request.Schema = raw

	return request, nil
}
// Parse converts an OpenAI Responses API payload into the internal ApiResponse representation.
//
// A status of 300 or higher is returned as an error, using the message from
// openai.ParseErrorMessage when it is non-empty. An error is also returned when
// the payload cannot be decoded or carries a non-empty error message.
//
// The result is filled from the first JSON output; if that leaves the caption
// unset, the first text output is tried as JSON (when it starts with '{' or
// '[') and otherwise used verbatim as the caption. The response id comes from
// req, and the model name falls back to req.Model when the payload has none.
func (openaiParser) Parse(ctx context.Context, req *ApiRequest, raw []byte, status int) (*ApiResponse, error) {
	// Report failed requests before attempting to decode a regular response.
	if status >= 300 {
		if msg := openai.ParseErrorMessage(raw); msg != "" {
			return nil, fmt.Errorf("openai: %s", msg)
		}
		return nil, fmt.Errorf("openai: status %d", status)
	}
	var resp openai.Response
	if err := json.Unmarshal(raw, &resp); err != nil {
		return nil, err
	}
	// An error object may be present even though the status indicated success.
	if resp.Error != nil && resp.Error.Message != "" {
		return nil, errors.New(resp.Error.Message)
	}
	result := ApiResult{}
	// Structured JSON output is preferred; parse failures are only logged so the
	// text fallback below still gets a chance.
	if jsonPayload := resp.FirstJSON(); len(jsonPayload) > 0 {
		if err := populateOpenAIJSONResult(&result, jsonPayload); err != nil {
			log.Debugf("vision: %s (parse openai json payload)", clean.Error(err))
		}
	}
	// Fall back to the text output whenever no caption has been set so far.
	// NOTE(review): labels already taken from the JSON output are kept, so a text
	// output repeating the same JSON would presumably append them again — confirm.
	if result.Caption == nil {
		if text := resp.FirstText(); text != "" {
			trimmed := strings.TrimSpace(text)
			var parsedJSON bool
			// Text that looks like a JSON object or array is parsed as structured output.
			if len(trimmed) > 0 && (trimmed[0] == '{' || trimmed[0] == '[') {
				if err := populateOpenAIJSONResult(&result, json.RawMessage(trimmed)); err != nil {
					log.Debugf("vision: %s (parse openai json text payload)", clean.Error(err))
				} else {
					parsedJSON = true
				}
			}
			// Anything that was not parsed as JSON is used as the caption text.
			if !parsedJSON && trimmed != "" {
				result.Caption = &CaptionResult{
					Text:   trimmed,
					Source: entity.SrcOpenAI,
				}
			}
		}
	}
	var responseID string
	if req != nil {
		responseID = req.GetId()
	}
	// Prefer the model name reported by the API over the requested one.
	modelName := strings.TrimSpace(resp.Model)
	if modelName == "" && req != nil {
		modelName = strings.TrimSpace(req.Model)
	}
	return &ApiResponse{
		Id:     responseID,
		Code:   status,
		Model:  &Model{Name: modelName},
		Result: result,
	}, nil
}
// parseOpenAISchema checks that the schema string is valid JSON and returns
// the result of passing it through normalizeOpenAISchema. Invalid JSON yields
// a nil message and the decoding error.
func parseOpenAISchema(schema string) (json.RawMessage, error) {
	decoded := new(json.RawMessage)
	err := json.Unmarshal([]byte(schema), decoded)

	if err != nil {
		return nil, err
	}

	return normalizeOpenAISchema(*decoded)
}
// normalizeOpenAISchema replaces legacy label schema examples with the schema
// returned by openai.SchemaLabels.
//
// The input is returned unchanged when it is empty, is not a JSON object,
// already has a non-blank string "type" or a "properties" key, or has no
// "labels" key. Otherwise the NSFW schema variant is chosen when the first
// entry of the "labels" array contains "nsfw" or "nsfw_confidence".
func normalizeOpenAISchema(raw json.RawMessage) (json.RawMessage, error) {
	if len(raw) == 0 {
		return raw, nil
	}

	var fields map[string]any

	if err := json.Unmarshal(raw, &fields); err != nil {
		// Fallback to the original payload if it isn't a JSON object.
		return raw, nil
	}

	// A non-blank "type" or any "properties" key marks a real JSON schema.
	if kind, isString := fields["type"].(string); isString && strings.TrimSpace(kind) != "" {
		return raw, nil
	}

	if _, found := fields["properties"]; found {
		return raw, nil
	}

	legacy, found := fields["labels"]

	if !found {
		return raw, nil
	}

	// Detect NSFW fields on the first example label, if there is one.
	withNSFW := false

	if examples, isList := legacy.([]any); isList && len(examples) > 0 {
		if example, isObject := examples[0].(map[string]any); isObject {
			_, hasFlag := example["nsfw"]
			_, hasScore := example["nsfw_confidence"]
			withNSFW = hasFlag || hasScore
		}
	}

	return openai.SchemaLabels(withNSFW), nil
}
// populateOpenAIJSONResult decodes a structured OpenAI payload and stores its
// caption and labels in result.
//
// A nil result or empty payload is a no-op. A caption is only set when its
// trimmed text is non-empty. Labels without a source get entity.SrcOpenAI,
// are passed through normalizeLabelResult, and are appended to the existing
// labels unless their name is empty afterwards. Decoding errors are returned.
func populateOpenAIJSONResult(result *ApiResult, payload json.RawMessage) error {
	if result == nil || len(payload) == 0 {
		return nil
	}

	var decoded struct {
		Caption *struct {
			Text       string  `json:"text"`
			Confidence float32 `json:"confidence"`
		} `json:"caption"`
		Labels []LabelResult `json:"labels"`
	}

	if err := json.Unmarshal(payload, &decoded); err != nil {
		return err
	}

	if c := decoded.Caption; c != nil {
		if captionText := strings.TrimSpace(c.Text); captionText != "" {
			result.Caption = &CaptionResult{
				Text:       captionText,
				Confidence: c.Confidence,
				Source:     entity.SrcOpenAI,
			}
		}
	}

	for i := range decoded.Labels {
		label := &decoded.Labels[i]

		if label.Source == "" {
			label.Source = entity.SrcOpenAI
		}

		normalizeLabelResult(label)

		// Skip labels that have no name after normalization.
		if label.Name == "" {
			continue
		}

		result.Labels = append(result.Labels, *label)
	}

	return nil
}

View File

@@ -0,0 +1,337 @@
package vision
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/photoprism/photoprism/internal/ai/vision/openai"
"github.com/photoprism/photoprism/internal/ai/vision/schema"
"github.com/photoprism/photoprism/internal/entity"
)
// TestOpenAIBuilderBuild builds a labels request for the default OpenAI model and
// checks the response format, the image payload, and the label option defaults
// (detail level, forced JSON, and minimum output token limit).
func TestOpenAIBuilderBuild(t *testing.T) {
	model := &Model{
		Type:   ModelTypeLabels,
		Name:   openai.DefaultModel,
		Engine: openai.EngineName,
	}
	model.ApplyEngineDefaults()
	request, err := openaiBuilder{}.Build(context.Background(), model, Files{examplesPath + "/chameleon_lime.jpg"})
	require.NoError(t, err)
	require.NotNil(t, request)
	assert.Equal(t, ApiFormatOpenAI, request.ResponseFormat)
	assert.NotEmpty(t, request.Images)
	assert.NotNil(t, request.Options)
	assert.Equal(t, openai.DefaultDetail, request.Options.Detail)
	assert.True(t, request.Options.ForceJson)
	assert.GreaterOrEqual(t, request.Options.MaxOutputTokens, openai.LabelsMaxTokens)
}
// TestOpenAIBuilderBuildCaptionDisablesForceJSON checks that a caption request
// has ForceJson switched off even when the model options enable it, and that the
// output token limit is at least the caption minimum.
func TestOpenAIBuilderBuildCaptionDisablesForceJSON(t *testing.T) {
	model := &Model{
		Type:    ModelTypeCaption,
		Name:    openai.DefaultModel,
		Engine:  openai.EngineName,
		Options: &ApiRequestOptions{ForceJson: true},
	}
	model.ApplyEngineDefaults()
	request, err := openaiBuilder{}.Build(context.Background(), model, Files{examplesPath + "/chameleon_lime.jpg"})
	require.NoError(t, err)
	require.NotNil(t, request)
	require.NotNil(t, request.Options)
	assert.False(t, request.Options.ForceJson)
	assert.GreaterOrEqual(t, request.Options.MaxOutputTokens, openai.CaptionMaxTokens)
}
// TestApiRequestJSONForOpenAI serializes a request with system prompt, user
// prompt, image, options, and schema, then decodes the payload to check the
// model, the two input messages, the JSON schema text format, the reasoning
// effort, and the output token limit.
func TestApiRequestJSONForOpenAI(t *testing.T) {
	req := &ApiRequest{
		Model:          "gpt-5-mini",
		System:         "system",
		Prompt:         "describe the scene",
		Images:         []string{"data:image/jpeg;base64,AA=="},
		ResponseFormat: ApiFormatOpenAI,
		Options: &ApiRequestOptions{
			Detail:          openai.DefaultDetail,
			MaxOutputTokens: 128,
			Temperature:     0.2,
			TopP:            0.8,
			ForceJson:       true,
		},
		Schema: json.RawMessage(`{"type":"object","properties":{"caption":{"type":"object"}}}`),
	}
	payload, err := req.JSON()
	require.NoError(t, err)
	// Minimal mirror of the payload fields that are asserted below.
	var decoded struct {
		Model string `json:"model"`
		Input []struct {
			Role    string `json:"role"`
			Content []struct {
				Type string `json:"type"`
			} `json:"content"`
		} `json:"input"`
		Text struct {
			Format struct {
				Type   string          `json:"type"`
				Name   string          `json:"name"`
				Schema json.RawMessage `json:"schema"`
				Strict bool            `json:"strict"`
			} `json:"format"`
		} `json:"text"`
		Reasoning struct {
			Effort string `json:"effort"`
		} `json:"reasoning"`
		MaxOutputTokens int `json:"max_output_tokens"`
	}
	require.NoError(t, json.Unmarshal(payload, &decoded))
	assert.Equal(t, "gpt-5-mini", decoded.Model)
	// One system message followed by one user message.
	require.Len(t, decoded.Input, 2)
	assert.Equal(t, "system", decoded.Input[0].Role)
	assert.Equal(t, openai.ResponseFormatJSONSchema, decoded.Text.Format.Type)
	assert.Equal(t, schema.JsonSchemaName(decoded.Text.Format.Schema, openai.DefaultSchemaVersion), decoded.Text.Format.Name)
	assert.False(t, decoded.Text.Format.Strict)
	assert.NotNil(t, decoded.Text.Format.Schema)
	assert.Equal(t, "low", decoded.Reasoning.Effort)
	assert.Equal(t, 128, decoded.MaxOutputTokens)
}
// TestApiRequestJSONForOpenAIDefaultSchemaName checks that, without a
// SchemaVersion option, the text format name in the payload equals the name
// derived from the schema by schema.JsonSchemaName.
func TestApiRequestJSONForOpenAIDefaultSchemaName(t *testing.T) {
	req := &ApiRequest{
		Model:          "gpt-5-mini",
		Images:         []string{"data:image/jpeg;base64,AA=="},
		ResponseFormat: ApiFormatOpenAI,
		Options: &ApiRequestOptions{
			Detail:          openai.DefaultDetail,
			MaxOutputTokens: 64,
			ForceJson:       true,
		},
		Schema: json.RawMessage(`{"type":"object"}`),
	}
	payload, err := req.JSON()
	require.NoError(t, err)
	// Only the format name is of interest here.
	var decoded struct {
		Text struct {
			Format struct {
				Name string `json:"name"`
			} `json:"format"`
		} `json:"text"`
	}
	require.NoError(t, json.Unmarshal(payload, &decoded))
	assert.Equal(t, schema.JsonSchemaName(req.Schema, openai.DefaultSchemaVersion), decoded.Text.Format.Name)
}
// TestOpenAIParserParsesJSONFromTextPayload checks that JSON delivered inside an
// "output_text" content item is parsed into labels (with a normalized name)
// instead of being used as the caption text.
func TestOpenAIParserParsesJSONFromTextPayload(t *testing.T) {
	respPayload := `{
"id": "resp_123",
"model": "gpt-5-mini",
"output": [{
"role": "assistant",
"content": [{
"type": "output_text",
"text": "{\"labels\":[{\"name\":\"deer\",\"confidence\":0.98,\"topicality\":0.99}]}"
}]
}]
}`
	req := &ApiRequest{
		Id:             "test",
		Model:          "gpt-5-mini",
		ResponseFormat: ApiFormatOpenAI,
	}
	resp, err := openaiParser{}.Parse(context.Background(), req, []byte(respPayload), http.StatusOK)
	require.NoError(t, err)
	require.NotNil(t, resp)
	require.Len(t, resp.Result.Labels, 1)
	assert.Equal(t, "Deer", resp.Result.Labels[0].Name)
	assert.Nil(t, resp.Result.Caption)
}
// TestParseOpenAISchemaLegacyUpgrade checks that a legacy example-style labels
// document is replaced by an object schema whose "labels" property is an array.
func TestParseOpenAISchemaLegacyUpgrade(t *testing.T) {
	legacy := `{
"labels": [{
"name": "",
"confidence": 0,
"topicality": 0
}]
}`
	raw, err := parseOpenAISchema(legacy)
	require.NoError(t, err)
	var decoded map[string]any
	require.NoError(t, json.Unmarshal(raw, &decoded))
	assert.Equal(t, "object", decoded["type"])
	props, ok := decoded["properties"].(map[string]any)
	require.True(t, ok)
	labels, ok := props["labels"].(map[string]any)
	require.True(t, ok)
	assert.Equal(t, "array", labels["type"])
}
// TestParseOpenAISchemaLegacyUpgradeNSFW checks that a legacy labels document
// containing NSFW fields is upgraded to a schema whose label items define an
// "nsfw" property.
//
// Every type assertion is checked with require.True so that an unexpected
// schema shape fails the test with a clear message instead of panicking.
func TestParseOpenAISchemaLegacyUpgradeNSFW(t *testing.T) {
	legacy := `{
"labels": [{
"name": "",
"confidence": 0,
"topicality": 0,
"nsfw": false,
"nsfw_confidence": 0
}]
}`
	raw, err := parseOpenAISchema(legacy)
	require.NoError(t, err)
	var decoded map[string]any
	require.NoError(t, json.Unmarshal(raw, &decoded))
	props, ok := decoded["properties"].(map[string]any)
	require.True(t, ok)
	labels, ok := props["labels"].(map[string]any)
	require.True(t, ok)
	items, ok := labels["items"].(map[string]any)
	require.True(t, ok)
	itemProps, ok := items["properties"].(map[string]any)
	require.True(t, ok)
	_, hasNSFW := itemProps["nsfw"]
	assert.True(t, hasNSFW)
}
// TestPerformApiRequestOpenAISuccess runs a request against a stub server that
// answers with an "output_json" content item, and checks that the caption and
// the label are returned with the OpenAI source and a normalized label name.
func TestPerformApiRequestOpenAISuccess(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Verify that the request body carries the expected model name.
		var reqPayload struct {
			Model string `json:"model"`
		}
		assert.NoError(t, json.NewDecoder(r.Body).Decode(&reqPayload))
		assert.Equal(t, "gpt-5-mini", reqPayload.Model)
		response := map[string]any{
			"id":    "resp_123",
			"model": "gpt-5-mini",
			"output": []any{
				map[string]any{
					"role": "assistant",
					"content": []any{
						map[string]any{
							"type": "output_json",
							"json": map[string]any{
								"caption": map[string]any{
									"text":       "A cat rests on a windowsill.",
									"confidence": 0.91,
								},
								"labels": []map[string]any{
									{
										"name":       "cat",
										"confidence": 0.92,
										"topicality": 0.88,
									},
								},
							},
						},
					},
				},
			},
		}
		assert.NoError(t, json.NewEncoder(w).Encode(response))
	}))
	defer server.Close()
	req := &ApiRequest{
		Id:             "test",
		Model:          "gpt-5-mini",
		Images:         []string{"data:image/jpeg;base64,AA=="},
		ResponseFormat: ApiFormatOpenAI,
		Options: &ApiRequestOptions{
			Detail: openai.DefaultDetail,
		},
		Schema: json.RawMessage(`{"type":"object"}`),
	}
	resp, err := PerformApiRequest(req, server.URL, http.MethodPost, "secret")
	require.NoError(t, err)
	require.NotNil(t, resp)
	require.NotNil(t, resp.Result.Caption)
	assert.Equal(t, entity.SrcOpenAI, resp.Result.Caption.Source)
	assert.Equal(t, "A cat rests on a windowsill.", resp.Result.Caption.Text)
	require.Len(t, resp.Result.Labels, 1)
	assert.Equal(t, entity.SrcOpenAI, resp.Result.Labels[0].Source)
	assert.Equal(t, "Cat", resp.Result.Labels[0].Name)
}
// TestPerformApiRequestOpenAITextFallback checks that a plain "output_text"
// answer from the stub server is returned as the caption with the OpenAI source.
func TestPerformApiRequestOpenAITextFallback(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		response := map[string]any{
			"id":    "resp_456",
			"model": "gpt-5-mini",
			"output": []any{
				map[string]any{
					"role": "assistant",
					"content": []any{
						map[string]any{
							"type": "output_text",
							"text": "Two hikers reach the summit at sunset.",
						},
					},
				},
			},
		}
		assert.NoError(t, json.NewEncoder(w).Encode(response))
	}))
	defer server.Close()
	req := &ApiRequest{
		Id:             "fallback",
		Model:          "gpt-5-mini",
		Images:         []string{"data:image/jpeg;base64,AA=="},
		ResponseFormat: ApiFormatOpenAI,
		Options: &ApiRequestOptions{
			Detail: openai.DefaultDetail,
		},
		Schema: nil,
	}
	resp, err := PerformApiRequest(req, server.URL, http.MethodPost, "")
	require.NoError(t, err)
	require.NotNil(t, resp.Result.Caption)
	assert.Equal(t, "Two hikers reach the summit at sunset.", resp.Result.Caption.Text)
	assert.Equal(t, entity.SrcOpenAI, resp.Result.Caption.Source)
}
// TestPerformApiRequestOpenAIError checks that a 400 answer with an error object
// results in an error whose text contains the message sent by the stub server.
func TestPerformApiRequestOpenAIError(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusBadRequest)
		// The encode error is deliberately ignored in this stub handler.
		_ = json.NewEncoder(w).Encode(map[string]any{
			"error": map[string]any{
				"message": "Invalid image payload",
			},
		})
	}))
	defer server.Close()
	req := &ApiRequest{
		Id:             "error",
		Model:          "gpt-5-mini",
		ResponseFormat: ApiFormatOpenAI,
		Schema:         nil,
		Images:         []string{"data:image/jpeg;base64,AA=="},
	}
	_, err := PerformApiRequest(req, server.URL, http.MethodPost, "")
	require.Error(t, err)
	assert.Contains(t, err.Error(), "Invalid image payload")
}

View File

@@ -96,8 +96,10 @@ func labelsInternal(images Files, mediaSrc media.Src, labelSrc entity.Src) (resu
apiRequest.Prompt = prompt
}
if options := model.GetOptions(); options != nil {
apiRequest.Options = options
if apiRequest.Options == nil {
if options := model.GetOptions(); options != nil {
apiRequest.Options = options
}
}
apiRequest.WriteLog()

View File

@@ -348,6 +348,26 @@ func mergeOptionDefaults(target, defaults *ApiRequestOptions) {
if len(target.Stop) == 0 && len(defaults.Stop) > 0 {
target.Stop = append([]string(nil), defaults.Stop...)
}
if target.MaxOutputTokens <= 0 && defaults.MaxOutputTokens > 0 {
target.MaxOutputTokens = defaults.MaxOutputTokens
}
if strings.TrimSpace(target.Detail) == "" && strings.TrimSpace(defaults.Detail) != "" {
target.Detail = strings.TrimSpace(defaults.Detail)
}
if !target.ForceJson && defaults.ForceJson {
target.ForceJson = true
}
if target.SchemaVersion == "" && defaults.SchemaVersion != "" {
target.SchemaVersion = defaults.SchemaVersion
}
if target.CombineOutputs == "" && defaults.CombineOutputs != "" {
target.CombineOutputs = defaults.CombineOutputs
}
}
func normalizeOptions(opts *ApiRequestOptions) {
@@ -422,6 +442,10 @@ func (m *Model) ApplyEngineDefaults() {
}
if info, ok := EngineInfoFor(engine); ok {
if m.Service.Uri == "" {
m.Service.Uri = info.Uri
}
if m.Service.RequestFormat == "" {
m.Service.RequestFormat = info.RequestFormat
}
@@ -490,7 +514,7 @@ func (m *Model) SchemaTemplate() string {
}
if m.schema == "" {
m.schema = visionschema.Labels(m.PromptContains("nsfw"))
m.schema = visionschema.LabelsJson(m.PromptContains("nsfw"))
}
}
})

View File

@@ -1,7 +1,5 @@
package ollama
import "github.com/photoprism/photoprism/internal/ai/vision/schema"
const (
// CaptionPrompt instructs Ollama caption models to emit a single, active-voice sentence.
CaptionPrompt = "Create a caption with exactly one sentence in the active voice that describes the main visual content. Begin with the main subject and clear action. Avoid text formatting, meta-language, and filler words."
@@ -22,12 +20,3 @@ const (
// DefaultResolution is the default thumbnail size submitted to Ollama models.
DefaultResolution = 720
)
// LabelsSchema returns the canonical label schema string consumed by Ollama models.
func LabelsSchema(nsfw bool) string {
if nsfw {
return schema.LabelsNSFW
} else {
return schema.LabelsDefault
}
}

View File

@@ -0,0 +1,14 @@
package ollama
import (
"github.com/photoprism/photoprism/internal/ai/vision/schema"
)
// SchemaLabels returns the canonical label schema string consumed by Ollama models.
//
// It is a thin wrapper around schema.LabelsJson: the nsfw flag is passed through
// unchanged and the helper's result is returned as is.
//
// Related documentation and references:
// - https://www.alibabacloud.com/help/en/model-studio/json-mode
// - https://www.json.org/json-en.html
func SchemaLabels(nsfw bool) string {
return schema.LabelsJson(nsfw)
}

View File

@@ -0,0 +1,79 @@
package ollama
import (
"errors"
"fmt"
"time"
)
// Response encapsulates the subset of the Ollama generate API response we care about.
//
// Code and Error are evaluated by Err; Result carries the labels and caption that
// Err and HasResult inspect. Every field is tagged omitempty for both YAML and JSON.
// NOTE(review): the duration fields mix int64 (TotalDuration, EvalDuration) and int
// (LoadDuration, PromptEvalDuration) — confirm whether the narrower type is intended.
type Response struct {
ID string `yaml:"Id,omitempty" json:"id,omitempty"`
Code int `yaml:"Code,omitempty" json:"code,omitempty"`
Error string `yaml:"Error,omitempty" json:"error,omitempty"`
Model string `yaml:"Model,omitempty" json:"model,omitempty"`
CreatedAt time.Time `yaml:"CreatedAt,omitempty" json:"created_at,omitempty"`
Response string `yaml:"Response,omitempty" json:"response,omitempty"`
Done bool `yaml:"Done,omitempty" json:"done,omitempty"`
Context []int `yaml:"Context,omitempty" json:"context,omitempty"`
TotalDuration int64 `yaml:"TotalDuration,omitempty" json:"total_duration,omitempty"`
LoadDuration int `yaml:"LoadDuration,omitempty" json:"load_duration,omitempty"`
PromptEvalCount int `yaml:"PromptEvalCount,omitempty" json:"prompt_eval_count,omitempty"`
PromptEvalDuration int `yaml:"PromptEvalDuration,omitempty" json:"prompt_eval_duration,omitempty"`
EvalCount int `yaml:"EvalCount,omitempty" json:"eval_count,omitempty"`
EvalDuration int64 `yaml:"EvalDuration,omitempty" json:"eval_duration,omitempty"`
Result ResultPayload `yaml:"Result,omitempty" json:"result,omitempty"`
}
// Err reports why a response cannot be used, or nil when it is usable.
//
// A nil receiver yields "response is nil". Codes of 400 and above yield the
// Error text when set and "error <code>" otherwise. Any other response without
// labels and without a caption yields "no result".
func (r *Response) Err() error {
	switch {
	case r == nil:
		return errors.New("response is nil")
	case r.Code >= 400 && r.Error != "":
		return errors.New(r.Error)
	case r.Code >= 400:
		return fmt.Errorf("error %d", r.Code)
	case r.Result.Caption == nil && len(r.Result.Labels) == 0:
		return errors.New("no result")
	default:
		return nil
	}
}
// HasResult reports whether the response carries a caption or at least one label.
// It is safe to call on a nil receiver, for which it returns false.
func (r *Response) HasResult() bool {
	if r != nil {
		return r.Result.Caption != nil || len(r.Result.Labels) > 0
	}

	return false
}
// ResultPayload mirrors the structure returned by Ollama for result data.
//
// Labels has no omitempty tag, so a nil slice is encoded as JSON null; Caption is
// a pointer tagged omitempty and is left out of the encoded output when nil.
type ResultPayload struct {
Labels []LabelPayload `json:"labels"`
Caption *CaptionPayload `json:"caption,omitempty"`
}
// LabelPayload represents a single label object emitted by the Ollama adapter.
//
// Only Name is always encoded; every other field is tagged omitempty, so zero
// values (including a confidence of 0 and NSFW=false) are dropped when marshalling.
type LabelPayload struct {
Name string `json:"name"`
Source string `json:"source,omitempty"`
Priority int `json:"priority,omitempty"`
Confidence float32 `json:"confidence,omitempty"`
Topicality float32 `json:"topicality,omitempty"`
Categories []string `json:"categories,omitempty"`
NSFW bool `json:"nsfw,omitempty"`
NSFWConfidence float32 `json:"nsfw_confidence,omitempty"`
}
// CaptionPayload represents the caption object emitted by the Ollama adapter.
//
// Text is always encoded; Source and Confidence are omitted when they hold their
// zero value.
type CaptionPayload struct {
Text string `json:"text"`
Source string `json:"source,omitempty"`
Confidence float32 `json:"confidence,omitempty"`
}

View File

@@ -0,0 +1,128 @@
## PhotoPrism — OpenAI API Integration
**Last Updated:** November 14, 2025
### Overview
This package contains PhotoPrism's adapter for the OpenAI Responses API. It enables existing caption and label workflows (`GenerateCaption`, `GenerateLabels`, and the `photoprism vision run` CLI) to call OpenAI models alongside TensorFlow and Ollama without changing worker or API code. The implementation focuses on predictable results, structured outputs, and clear observability so operators can opt in gradually.
#### Context & Constraints
- OpenAI requests flow through the existing vision client (`internal/ai/vision/api_client.go`) and must honour PhotoPrism's timeout, logging, and ACL rules.
- Structured outputs are preferred but the adapter must gracefully handle free-form text; `output_text` responses are parsed both as JSON and as plain captions.
- Costs should remain predictable: requests are limited to a single 720px thumbnail (`detail=low`) and capped token budgets (512 caption, 1024 labels).
- Secrets are supplied per model (`Service.Key`) with fallbacks to `OPENAI_API_KEY` / `_FILE`. Logs must redact sensitive data.
#### Goals
- Provide drop-in OpenAI support for captions and labels using `vision.yml`.
- Keep configuration ergonomic by auto-populating prompts, schema names, token limits, and sampling defaults.
- Expose enough logging and tests so operators can compare OpenAI output with existing engines before enabling it broadly.
#### Non-Goals
- Introducing a new `generate` model type or combined caption/label endpoint (reserved for a later phase).
- Replacing the default TensorFlow models; they remain active as fallbacks.
- Managing OpenAI billing or quota dashboards beyond surfacing token counts in logs and metrics.
### Prompt, Model, & Schema Guidance
- **Models:** The adapter targets GPT-5 vision tiers (e.g. `gpt-5-nano`, `gpt-5-mini`). These models support image inputs, structured outputs, and deterministic settings. Set `Name` to the exact provider identifier so defaults are applied correctly. Caption models share the same configuration surface and run through the same adapter.
- **Prompts:** Defaults live in `defaults.go`. Captions use a single-sentence instruction; labels use `LabelPromptDefault` (or `LabelPromptNSFW` when PhotoPrism requests NSFW metadata). Custom prompts should retain schema reminders so structured outputs stay valid.
- **Schemas:** Labels use the JSON schema returned by `schema.LabelsJsonSchema(nsfw)`; the response format name is derived via `schema.JsonSchemaName` (e.g. `photoprism_vision_labels_v1`). Captions omit schemas unless operators explicitly request a structured format.
- **When to keep defaults:** For most deployments, leaving `System`, `Prompt`, `Schema`, and `Options` unset yields stable output with minimal configuration. Override them only when domain-specific language or custom scoring is necessary, and add regression tests alongside.
Budget-conscious operators can experiment with lighter prompts or lower-resolution thumbnails, but should keep token limits and determinism settings intact to avoid unexpected bills and UI churn.
#### Performance & Cost Estimates
- **Token budgets:** Captions request up to 512 output tokens; labels request up to 1024. Input tokens are typically ≤700 for a single 720px thumbnail plus prompts.
- **Latency:** GPT-5 nano/mini vision calls typically complete in 3–8 s, depending on OpenAI region. Including reasoning metadata (`reasoning.effort=low`) has negligible impact but improves traceability.
- **Costs:** Consult OpenAI's pricing for the selected model. Multiply input/output tokens by the published rate. PhotoPrism currently sends one image per request to keep costs linear with photo count.
#### Defaults
- File scheme: `data:` URLs (base64) for all OpenAI models.
- Resolution: 720px thumbnails (`vision.Thumb(ModelTypeCaption|Labels)`).
- Options: `MaxOutputTokens` raised to 512 (caption) / 1024 (labels); `ForceJson=false` for captions, `true` for labels; `reasoning.effort="low"`.
- Sampling: `Temperature` and `TopP` set to `0` for `gpt-5*` models; inherited values (0.1/0.9) remain for other engines. `openaiBuilder.Build` performs this override while preserving the struct defaults for non-OpenAI adapters.
- Schema naming: Automatically derived via `schema.JsonSchemaName`, so operators may omit `SchemaVersion`.
### Configuration
#### Environment Variables
- `OPENAI_API_KEY` / `OPENAI_API_KEY_FILE` — fallback credentials when a model's `Service.Key` is unset.
- Existing `PHOTOPRISM_VISION_*` variables remain authoritative (see the [Developer Guide](https://docs.photoprism.app/developer-guide/vision/service/) for full lists).
#### `vision.yml` Examples
```yaml
Models:
- Type: caption
Name: gpt-5-nano
Engine: openai
Disabled: false # opt in manually
Resolution: 720 # optional; default is 720
Options:
Detail: low # optional; defaults to low
MaxOutputTokens: 512
Service:
Uri: https://api.openai.com/v1/responses
FileScheme: data
Key: ${OPENAI_API_KEY}
- Type: labels
Name: gpt-5-mini
Engine: openai
Disabled: false
Resolution: 720
Options:
Detail: low
MaxOutputTokens: 1024
ForceJson: true # redundant but explicit
Service:
Uri: https://api.openai.com/v1/responses
FileScheme: data
Key: ${OPENAI_API_KEY}
```
Keep TensorFlow entries in place so PhotoPrism falls back when the external service is unavailable.
### Documentation
- Label Generation: <https://docs.photoprism.app/developer-guide/vision/label-generation/>
- Caption Generation: <https://docs.photoprism.app/developer-guide/vision/caption-generation/>
- Vision CLI Commands: <https://docs.photoprism.app/developer-guide/vision/cli/>
### Implementation Details
#### Core Concepts
- **Structured outputs:** PhotoPrism leverages OpenAI's structured output capability as documented at <https://platform.openai.com/docs/guides/structured-outputs>. When a JSON schema is supplied, the adapter emits `text.format` with `type: "json_schema"` and a schema name derived from the content. The parser then prefers `output_json`, but also attempts to decode `output_text` payloads that contain JSON objects.
- **Deterministic sampling:** GPT-5 models are run with `temperature=0` and `top_p=0` to minimise variance, while still allowing developers to override values in `vision.yml` if needed.
- **Reasoning metadata:** Requests include `reasoning.effort="low"` so OpenAI returns structured reasoning usage counters, helping operators track token consumption.
- **Worker summaries:** The vision worker now logs either “updated …” or “processed … (no metadata changes detected)”, making reruns easy to audit.
#### Rate Limiting
OpenAI calls respect the existing `limiter.Auth` configuration used by the vision service. Failed requests surface standard HTTP errors and are not automatically retried; operators should ensure they have adequate account limits and consider external rate limiting when sharing credentials.
#### Testing & Validation
1. Unit tests: `go test ./internal/ai/vision/openai ./internal/ai/vision -run OpenAI -count=1`. Fixtures under `internal/ai/vision/openai/testdata/` replay real Responses payloads (captions and labels).
2. CLI smoke test: `photoprism vision run -m labels --count 1 --force --model=gpt-5-mini` with trace logging enabled to inspect sanitised Responses.
3. Compare worker summaries and label sources (`openai`) in the UI or via `photoprism vision ls`.
#### Code Map
- **Adapter & defaults:** `internal/ai/vision/openai` (defaults, schema helpers, transport, tests).
- **Request/response plumbing:** `internal/ai/vision/api_request.go`, `api_client.go`, `engine_openai.go`, `engine_openai_test.go`.
- **Workers & CLI:** `internal/workers/vision.go`, `internal/commands/vision_run.go`.
- **Shared utilities:** `internal/ai/vision/schema`, `pkg/clean`, `pkg/media`.
#### Next Steps
- [ ] Introduce the future `generate` model type that combines captions, labels, and optional markers.
- [ ] Evaluate additional OpenAI models as pricing and capabilities evolve.
- [ ] Expose token usage metrics (input/output/reasoning) via Prometheus once the schema stabilises.

View File

@@ -1,6 +1,29 @@
package openai
import "github.com/photoprism/photoprism/internal/ai/vision/schema"
// Default prompts, token budgets, and sampling values for the OpenAI adapter.
const (
// CaptionSystem defines the default system prompt for caption models.
CaptionSystem = "You are a PhotoPrism vision model. Return concise, user-friendly captions that describe the main subjects accurately."
// CaptionPrompt instructs caption models to respond with a single sentence.
CaptionPrompt = "Provide exactly one sentence describing the key subject and action in the image. Avoid filler words and technical jargon."
// LabelSystem defines the system prompt for label generation.
LabelSystem = "You are a PhotoPrism vision model. Emit JSON that matches the provided schema and keep label names short, singular nouns."
// LabelPromptDefault requests general-purpose labels.
LabelPromptDefault = "Analyze the image and return label objects with name, confidence (0-1), and topicality (0-1)."
// LabelPromptNSFW requests labels including NSFW metadata when required.
LabelPromptNSFW = "Analyze the image and return label objects with name, confidence (0-1), topicality (0-1), nsfw (true when sensitive), and nsfw_confidence (0-1)."
// DefaultDetail specifies the preferred thumbnail detail level for Responses API calls.
DefaultDetail = "low"
// CaptionMaxTokens suggests the output budget for caption responses.
CaptionMaxTokens = 512
// LabelsMaxTokens suggests the output budget for label responses.
LabelsMaxTokens = 1024
// DefaultTemperature is the default sampling temperature.
DefaultTemperature = 0.1
// DefaultTopP limits nucleus sampling.
DefaultTopP = 0.9
// DefaultSchemaVersion is used when callers do not specify an explicit schema version.
DefaultSchemaVersion = "v1"
)
var (
// DefaultModel is the model used by default when accessing the OpenAI API.
@@ -8,8 +31,3 @@ var (
// DefaultResolution is the default thumbnail size submitted to the OpenAI.
DefaultResolution = 720
)
// LabelsSchema returns the canonical label schema string consumed by OpenAI models.
func LabelsSchema() string {
return schema.LabelsDefault
}

View File

@@ -0,0 +1,16 @@
package openai
import (
"encoding/json"
"github.com/photoprism/photoprism/internal/ai/vision/schema"
)
// SchemaLabels returns the canonical labels JSON Schema consumed by OpenAI models.
//
// It is a thin wrapper around schema.LabelsJsonSchema: the nsfw flag is passed
// through unchanged and the helper's raw JSON result is returned as is.
//
// Related documentation and references:
// - https://platform.openai.com/docs/guides/structured-outputs
// - https://json-schema.org/learn/miscellaneous-examples
func SchemaLabels(nsfw bool) json.RawMessage {
return schema.LabelsJsonSchema(nsfw)
}

View File

@@ -0,0 +1,73 @@
{
"id": "resp_0d356718505119f3006916e5d8730881a0b91de2aa700f6196",
"object": "response",
"created_at": 1763108312,
"status": "completed",
"background": false,
"billing": {
"payer": "developer"
},
"error": null,
"incomplete_details": null,
"instructions": null,
"max_output_tokens": 512,
"max_tool_calls": null,
"model": "gpt-5-nano-2025-08-07",
"output": [
{
"id": "rs_0d356718505119f3006916e5d8efd481a0a4f9cc1823cc6c83",
"type": "reasoning",
"summary": []
},
{
"id": "msg_0d356718505119f3006916e5d9433881a0bc79197d2cfc2027",
"type": "message",
"status": "completed",
"content": [
{
"type": "output_text",
"annotations": [],
"logprobs": [],
"text": "A bee gathers nectar from the vibrant red poppy\u2019s center."
}
],
"role": "assistant"
}
],
"parallel_tool_calls": true,
"previous_response_id": null,
"prompt_cache_key": null,
"prompt_cache_retention": null,
"reasoning": {
"effort": "low",
"summary": null
},
"safety_identifier": null,
"service_tier": "default",
"store": true,
"temperature": 1.0,
"text": {
"format": {
"type": "text"
},
"verbosity": "medium"
},
"tool_choice": "auto",
"tools": [],
"top_logprobs": 0,
"top_p": 1.0,
"truncation": "disabled",
"usage": {
"input_tokens": 576,
"input_tokens_details": {
"cached_tokens": 0
},
"output_tokens": 19,
"output_tokens_details": {
"reasoning_tokens": 0
},
"total_tokens": 595
},
"user": null,
"metadata": {}
}

View File

@@ -0,0 +1,114 @@
{
"id": "resp_0fa91dfb69b7d644006916ea0b72ac819f84ff3152a38dfcdb",
"object": "response",
"created_at": 1763109387,
"status": "completed",
"background": false,
"billing": {
"payer": "developer"
},
"error": null,
"incomplete_details": null,
"instructions": null,
"max_output_tokens": 1024,
"max_tool_calls": null,
"model": "gpt-5-mini-2025-08-07",
"output": [
{
"id": "rs_0fa91dfb69b7d644006916ea0c3450819f8a13396bf377f474",
"type": "reasoning",
"summary": []
},
{
"id": "msg_0fa91dfb69b7d644006916ea0d2dfc819faf52b11334fc10a4",
"type": "message",
"status": "completed",
"content": [
{
"type": "output_text",
"annotations": [],
"logprobs": [],
"text": "{\"labels\":[{\"name\":\"flower\",\"confidence\":0.99,\"topicality\":0.99},{\"name\":\"bee\",\"confidence\":0.95,\"topicality\":0.95},{\"name\":\"petal\",\"confidence\":0.92,\"topicality\":0.88},{\"name\":\"pollen\",\"confidence\":0.85,\"topicality\":0.8},{\"name\":\"insect\",\"confidence\":0.9,\"topicality\":0.85},{\"name\":\"red\",\"confidence\":0.88,\"topicality\":0.6},{\"name\":\"close-up\",\"confidence\":0.86,\"topicality\":0.7},{\"name\":\"nature\",\"confidence\":0.8,\"topicality\":0.5}]}"
}
],
"role": "assistant"
}
],
"parallel_tool_calls": true,
"previous_response_id": null,
"prompt_cache_key": null,
"prompt_cache_retention": null,
"reasoning": {
"effort": "low",
"summary": null
},
"safety_identifier": null,
"service_tier": "default",
"store": true,
"temperature": 1.0,
"text": {
"format": {
"type": "json_schema",
"description": null,
"name": "photoprism_vision_labels_v1",
"schema": {
"type": "object",
"properties": {
"labels": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string",
"minLength": 1
},
"confidence": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"topicality": {
"type": "number",
"minimum": 0,
"maximum": 1
}
},
"required": [
"name",
"confidence",
"topicality"
],
"additionalProperties": false
},
"default": []
}
},
"required": [
"labels"
],
"additionalProperties": false
},
"strict": true
},
"verbosity": "medium"
},
"tool_choice": "auto",
"tools": [],
"top_logprobs": 0,
"top_p": 1.0,
"truncation": "disabled",
"usage": {
"input_tokens": 724,
"input_tokens_details": {
"cached_tokens": 0
},
"output_tokens": 169,
"output_tokens_details": {
"reasoning_tokens": 0
},
"total_tokens": 893
},
"user": null,
"metadata": {}
}

View File

@@ -0,0 +1,142 @@
package openai
import (
"encoding/json"
"strings"
)
// String identifiers used when building Responses API payloads: the ContentType*
// values name content item types, the ResponseFormat* values name response formats.
const (
// ContentTypeText identifies text input segments for the Responses API.
ContentTypeText = "input_text"
// ContentTypeImage identifies image input segments for the Responses API.
ContentTypeImage = "input_image"
// ResponseFormatJSONSchema requests JSON constrained by a schema.
ResponseFormatJSONSchema = "json_schema"
// ResponseFormatJSONObject requests a free-form JSON object.
ResponseFormatJSONObject = "json_object"
)
// HTTPRequest represents the payload expected by OpenAI's Responses API.
//
// Model and Input are always encoded; all other fields are tagged omitempty.
// NOTE(review): because of omitempty, a Temperature or TopP of exactly 0 is left
// out of the encoded request rather than sent as an explicit 0 — confirm that the
// provider default is acceptable in that case.
type HTTPRequest struct {
Model string `json:"model"`
Input []InputMessage `json:"input"`
Text *TextOptions `json:"text,omitempty"`
Reasoning *Reasoning `json:"reasoning,omitempty"`
MaxOutputTokens int `json:"max_output_tokens,omitempty"`
Temperature float64 `json:"temperature,omitempty"`
TopP float64 `json:"top_p,omitempty"`
PresencePenalty float64 `json:"presence_penalty,omitempty"`
FrequencyPenalty float64 `json:"frequency_penalty,omitempty"`
}
// TextOptions carries formatting preferences for textual responses.
// It is encoded under the "text" key of HTTPRequest; Format is omitted when nil.
type TextOptions struct {
Format *ResponseFormat `json:"format,omitempty"`
}
// Reasoning configures the effort level for reasoning models.
// It is encoded under the "reasoning" key of HTTPRequest; Effort is omitted when empty.
type Reasoning struct {
Effort string `json:"effort,omitempty"`
}
// InputMessage captures a single system or user message in the request.
// Role and Content are always encoded; Type is omitted when empty.
type InputMessage struct {
Role string `json:"role"`
Type string `json:"type,omitempty"`
Content []ContentItem `json:"content"`
}
// ContentItem represents a text or image entry within a message.
// Type is always encoded; Text, ImageURL, and Detail are omitted when empty, so an
// item only carries the fields that were actually set.
type ContentItem struct {
Type string `json:"type"`
Text string `json:"text,omitempty"`
ImageURL string `json:"image_url,omitempty"`
Detail string `json:"detail,omitempty"`
}
// ResponseFormat describes how OpenAI should format its response.
// Schema is embedded verbatim as raw JSON. Every field except Type is tagged
// omitempty, so Strict is only present in the encoded request when it is true.
type ResponseFormat struct {
Type string `json:"type"`
Name string `json:"name,omitempty"`
Schema json.RawMessage `json:"schema,omitempty"`
Description string `json:"description,omitempty"`
Strict bool `json:"strict,omitempty"`
}
// Response mirrors the subset of the Responses API response we need.
// Error is a pointer, so a missing or null "error" value decodes to nil.
type Response struct {
ID string `json:"id"`
Model string `json:"model"`
Output []ResponseOutput `json:"output"`
Error *struct {
Message string `json:"message"`
Type string `json:"type"`
} `json:"error,omitempty"`
}
// ResponseOutput captures assistant messages within the response.
// Only "role" and "content" are decoded; output items that carry neither decode to
// a zero value with an empty Content slice.
type ResponseOutput struct {
Role string `json:"role"`
Content []ResponseContent `json:"content"`
}
// ResponseContent contains individual message parts (JSON or text).
// Text feeds Response.FirstText and JSON feeds Response.FirstJSON; JSON is kept as
// raw bytes so callers decide how to decode it.
type ResponseContent struct {
Type string `json:"type"`
Text string `json:"text,omitempty"`
JSON json.RawMessage `json:"json,omitempty"`
}
// FirstJSON scans the output items in order and returns the first non-empty raw
// JSON part. It returns nil for a nil receiver or when no part carries JSON.
func (r *Response) FirstJSON() json.RawMessage {
	if r == nil {
		return nil
	}

	for _, item := range r.Output {
		for _, part := range item.Content {
			if len(part.JSON) != 0 {
				return part.JSON
			}
		}
	}

	return nil
}
// FirstText scans the output items in order and returns the first text part that
// is non-empty after trimming whitespace. The trimmed text is returned; a nil
// receiver or a response without text yields "".
func (r *Response) FirstText() string {
	if r == nil {
		return ""
	}

	for _, item := range r.Output {
		for _, part := range item.Content {
			trimmed := strings.TrimSpace(part.Text)
			if trimmed == "" {
				continue
			}
			return trimmed
		}
	}

	return ""
}
// ParseErrorMessage decodes raw as a Responses API payload and returns the
// whitespace-trimmed "error.message" value. It returns "" when raw is not valid
// JSON for that shape or when no error object is present.
func ParseErrorMessage(raw []byte) string {
	type errorBody struct {
		Message string `json:"message"`
	}

	var payload struct {
		Error *errorBody `json:"error"`
	}

	if json.Unmarshal(raw, &payload) != nil || payload.Error == nil {
		return ""
	}

	return strings.TrimSpace(payload.Error.Message)
}

View File

@@ -0,0 +1,120 @@
package openai
import (
"encoding/json"
"os"
"path/filepath"
"testing"
)
// loadTestResponse reads the named fixture from the testdata directory and decodes
// it into a Response, failing the test immediately if reading or decoding fails.
func loadTestResponse(t *testing.T, name string) *Response {
	t.Helper()

	filePath := filepath.Join("testdata", name)

	data, readErr := os.ReadFile(filePath)
	if readErr != nil {
		t.Fatalf("failed to read %s: %v", filePath, readErr)
	}

	resp := new(Response)
	if decodeErr := json.Unmarshal(data, resp); decodeErr != nil {
		t.Fatalf("failed to unmarshal %s: %v", filePath, decodeErr)
	}

	return resp
}
// TestParseErrorMessage covers a payload with an error message and a payload
// without an error object.
func TestParseErrorMessage(t *testing.T) {
	parse := func(payload string) string {
		return ParseErrorMessage([]byte(payload))
	}

	t.Run("returns message when present", func(t *testing.T) {
		if got := parse(`{"error":{"message":"Invalid schema"}}`); got != "Invalid schema" {
			t.Fatalf("expected message, got %q", got)
		}
	})

	t.Run("returns empty string when error is missing", func(t *testing.T) {
		got := parse(`{"output":[]}`)
		if got != "" {
			t.Fatalf("expected empty message, got %q", got)
		}
	})
}
// TestResponseFirstTextCaption checks that the caption fixture exposes no JSON
// part and that FirstText returns the caption sentence from its output_text part.
func TestResponseFirstTextCaption(t *testing.T) {
	resp := loadTestResponse(t, "caption-response.json")
	if jsonPayload := resp.FirstJSON(); len(jsonPayload) != 0 {
		t.Fatalf("expected no JSON payload, got: %s", jsonPayload)
	}
	text := resp.FirstText()
	// The fixture encodes a typographic apostrophe (U+2019) in "poppy’s"; spell it
	// as an escape so the expectation survives editors and encodings.
	expected := "A bee gathers nectar from the vibrant red poppy\u2019s center."
	if text != expected {
		t.Fatalf("unexpected caption text: %q", text)
	}
}
// TestResponseFirstTextLabels checks that the labels fixture exposes no JSON part
// and that its text part holds a JSON object string.
func TestResponseFirstTextLabels(t *testing.T) {
	resp := loadTestResponse(t, "labels-response.json")

	jsonPayload := resp.FirstJSON()
	if len(jsonPayload) != 0 {
		t.Fatalf("expected no JSON payload, got: %s", jsonPayload)
	}

	switch text := resp.FirstText(); {
	case text == "":
		t.Fatal("expected structured JSON string in text payload")
	case text[0] != '{':
		t.Fatalf("expected JSON object in text payload, got %q", text)
	}
}
func TestResponseFirstJSONFromStructuredPayload(t *testing.T) {
resp := &Response{
ID: "resp_structured",
Model: "gpt-5-mini",
Output: []ResponseOutput{
{
Role: "assistant",
Content: []ResponseContent{
{
Type: "output_json",
JSON: json.RawMessage(`{"labels":[{"name":"sunset"}]}`),
},
},
},
},
}
jsonPayload := resp.FirstJSON()
if len(jsonPayload) == 0 {
t.Fatal("expected JSON payload, got empty result")
}
var decoded struct {
Labels []map[string]string `json:"labels"`
}
if err := json.Unmarshal(jsonPayload, &decoded); err != nil {
t.Fatalf("failed to decode JSON payload: %v", err)
}
if len(decoded.Labels) != 1 || decoded.Labels[0]["name"] != "sunset" {
t.Fatalf("unexpected JSON payload: %+v", decoded.Labels)
}
}
// TestSchemaLabelsReturnsValidJSON checks that the default labels schema decodes
// as JSON and declares an object at its top level.
func TestSchemaLabelsReturnsValidJSON(t *testing.T) {
	decoded := map[string]any{}

	err := json.Unmarshal(SchemaLabels(false), &decoded)
	if err != nil {
		t.Fatalf("schema should be valid JSON: %v", err)
	}

	if kind := decoded["type"]; kind != "object" {
		t.Fatalf("expected type object, got %v", kind)
	}
}

View File

@@ -1,16 +1,115 @@
package schema
// LabelsDefault provides the minimal JSON schema for label responses used across engines.
const (
LabelsDefault = "{\n \"labels\": [{\n \"name\": \"\",\n \"confidence\": 0,\n \"topicality\": 0 }]\n}"
LabelsNSFW = "{\n \"labels\": [{\n \"name\": \"\",\n \"confidence\": 0,\n \"topicality\": 0,\n \"nsfw\": false,\n \"nsfw_confidence\": 0\n }]\n}"
import (
"encoding/json"
)
// Labels returns the canonical label schema string.
func Labels(nsfw bool) string {
// LabelsJsonSchemaDefault provides the minimal JSON schema for label responses used across engines.
const (
// LabelsJsonSchemaDefault is a JSON Schema for an object with a required "labels"
// array whose items require name, confidence, and topicality and allow no other
// properties. It is returned by LabelsJsonSchema(false).
LabelsJsonSchemaDefault = `{
"type": "object",
"properties": {
"labels": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string",
"minLength": 1
},
"confidence": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"topicality": {
"type": "number",
"minimum": 0,
"maximum": 1
}
},
"required": ["name", "confidence", "topicality"],
"additionalProperties": false
},
"default": []
}
},
"required": ["labels"],
"additionalProperties": false
}`
// LabelsJsonDefault is an example JSON document (not a JSON Schema) showing the
// expected label fields. It is returned by LabelsJson(false).
LabelsJsonDefault = "{\n \"labels\": [{\n \"name\": \"\",\n \"confidence\": 0,\n \"topicality\": 0 }]\n}"
// LabelsJsonSchemaNSFW extends the default schema with the required nsfw and
// nsfw_confidence properties. It is returned by LabelsJsonSchema(true).
LabelsJsonSchemaNSFW = `{
"type": "object",
"properties": {
"labels": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string",
"minLength": 1
},
"confidence": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"topicality": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"nsfw": {
"type": "boolean"
},
"nsfw_confidence": {
"type": "number",
"minimum": 0,
"maximum": 1
}
},
"required": [
"name",
"confidence",
"topicality",
"nsfw",
"nsfw_confidence"
],
"additionalProperties": false
},
"default": []
}
},
"required": ["labels"],
"additionalProperties": false
}`
// LabelsJsonNSFW is the example JSON document including the nsfw and
// nsfw_confidence fields. It is returned by LabelsJson(true).
LabelsJsonNSFW = "{\n \"labels\": [{\n \"name\": \"\",\n \"confidence\": 0,\n \"topicality\": 0,\n \"nsfw\": false,\n \"nsfw_confidence\": 0\n }]\n}"
)
// LabelsJsonSchema returns the canonical label JSON Schema string for OpenAI API endpoints.
//
// Related documentation and references:
// - https://platform.openai.com/docs/guides/structured-outputs
// - https://json-schema.org/learn/miscellaneous-examples
func LabelsJsonSchema(nsfw bool) json.RawMessage {
if nsfw {
return LabelsNSFW
return json.RawMessage(LabelsJsonSchemaNSFW)
} else {
return LabelsDefault
return json.RawMessage(LabelsJsonSchemaDefault)
}
}
// LabelsJson returns the canonical label JSON string for Ollama vision models.
//
// Related documentation and references:
// - https://www.alibabacloud.com/help/en/model-studio/json-mode
// - https://www.json.org/json-en.html
func LabelsJson(nsfw bool) string {
if nsfw {
return LabelsJsonNSFW
} else {
return LabelsJsonDefault
}
}

View File

@@ -0,0 +1,36 @@
package schema
import (
"bytes"
"encoding/json"
"fmt"
"github.com/photoprism/photoprism/pkg/clean"
)
const (
// NamePrefix is the leading component of every schema name built by JsonSchemaName.
NamePrefix = "photoprism_vision"
)
// JsonSchemaName returns the schema name to be used for API requests.
//
// The name has the form "<NamePrefix>_<kind>_<version>". The kind is "labels" when
// the schema contains the word "labels", "caption" when it contains "caption"
// (labels takes precedence if both occur), and "schema" otherwise. The version is
// passed through clean.TypeLowerUnderscore and defaults to "v1" when that yields
// an empty string.
func JsonSchemaName(schema json.RawMessage, version string) string {
	var schemaName string

	switch {
	case bytes.Contains(schema, []byte("labels")):
		schemaName = "labels"
	case bytes.Contains(schema, []byte("caption")):
		// Previously this case repeated the "labels" check and could never match.
		schemaName = "caption"
	default:
		schemaName = "schema"
	}

	version = clean.TypeLowerUnderscore(version)

	if version == "" {
		version = "v1"
	}

	return fmt.Sprintf("%s_%s_%s", NamePrefix, schemaName, version)
}

View File

@@ -0,0 +1,23 @@
package schema
import (
"encoding/json"
"testing"
"github.com/stretchr/testify/assert"
)
// TestJsonSchemaName covers the default name, label schema detection, and
// explicit version handling of JsonSchemaName.
func TestJsonSchemaName(t *testing.T) {
	t.Run("Default", func(t *testing.T) {
		assert.Equal(t, "photoprism_vision_schema_v1", JsonSchemaName(nil, ""))
	})
	t.Run("Labels", func(t *testing.T) {
		assert.Equal(t, "photoprism_vision_labels_v1", JsonSchemaName(json.RawMessage(LabelsJsonSchemaDefault), ""))
	})
	// Named after the version it actually exercises ("v2").
	t.Run("LabelsV2", func(t *testing.T) {
		assert.Equal(t, "photoprism_vision_labels_v2", JsonSchemaName([]byte("labels"), "v2"))
	})
	t.Run("LabelsJsonSchema", func(t *testing.T) {
		assert.Equal(t, "photoprism_vision_labels_v1", JsonSchemaName(LabelsJsonSchema(false), "v1"))
	})
}

View File

@@ -1,5 +1,5 @@
/*
Package schema defines canonical JSON schema templates shared by PhotoPrism's AI vision engines.
Package schema defines canonical JSON and JSON Schema templates shared by PhotoPrism's AI vision engines.
Copyright (c) 2018 - 2025 PhotoPrism UG. All rights reserved.

View File

@@ -4,5 +4,5 @@ package feat
// Feature flags for vision functionality that is still being rolled out.
var (
	VisionModelGenerate = false // controls exposure of the generate endpoint and CLI commands
	VisionModelMarkers  = false // gates marker generation/return until downstream UI and reconciliation paths are ready
	VisionServiceOpenAI = true  // controls whether users are able to configure OpenAI as a vision service engine
)

View File

@@ -135,6 +135,7 @@ func (w *Vision) Start(filter string, count int, models []string, customSrc stri
done := make(map[string]bool)
offset := 0
updated := 0
processed := 0
// Make sure count is within
if count < 1 || count > search.MaxResults {
@@ -197,6 +198,8 @@ func (w *Vision) Start(filter string, count int, models []string, customSrc stri
continue
}
processed++
fileName := photoprism.FileName(photo.FileRoot, photo.FileName)
file, fileErr := photoprism.NewMediaFile(fileName)
@@ -279,7 +282,18 @@ func (w *Vision) Start(filter string, count int, models []string, customSrc stri
}
}
log.Infof("vision: updated %s [%s]", english.Plural(updated, "picture", "pictures"), time.Since(start))
elapsed := time.Since(start)
switch {
case processed == 0:
log.Infof("vision: no pictures required processing [%s]", elapsed)
case updated == processed:
log.Infof("vision: updated %s [%s]", english.Plural(updated, "picture", "pictures"), elapsed)
case updated == 0:
log.Infof("vision: processed %s (no metadata changes detected) [%s]", english.Plural(processed, "picture", "pictures"), elapsed)
default:
log.Infof("vision: updated %s out of %s [%s]", english.Plural(updated, "picture", "pictures"), english.Plural(processed, "picture", "pictures"), elapsed)
}
if updated > 0 {
updateIndex = true