AI: Generate Captions & Labels using the OpenAI Responses API #5322

Signed-off-by: Michael Mayer <michael@photoprism.app>
2025-12-12 00:34:13 +01:00 · 2025-11-14 11:10:34 +01:00
parent 46d5e33c8c
commit d76acdb69f
28 changed files with 1822 additions and 127 deletions
--- a/internal/ai/vision/api_client.go
+++ b/internal/ai/vision/api_client.go
@@ -9,6 +9,9 @@ import (
 	"io"
 	"net/http"

+	"github.com/sirupsen/logrus"
+
+	"github.com/photoprism/photoprism/internal/ai/vision/ollama"
 	"github.com/photoprism/photoprism/pkg/clean"
 	"github.com/photoprism/photoprism/pkg/http/header"
 )
@@ -69,6 +72,10 @@ func PerformApiRequest(apiRequest *ApiRequest, uri, method, key string) (apiResp
 			return nil, parseErr
 		}

+		if log.IsLevelEnabled(logrus.TraceLevel) {
+			log.Tracef("vision: response %s", string(body))
+		}
+
 		return parsed, nil
 	}

@@ -89,12 +96,12 @@ func PerformApiRequest(apiRequest *ApiRequest, uri, method, key string) (apiResp
 	return apiResponse, nil
 }

-func decodeOllamaResponse(data []byte) (*ApiResponseOllama, error) {
-	resp := &ApiResponseOllama{}
+func decodeOllamaResponse(data []byte) (*ollama.Response, error) {
+	resp := &ollama.Response{}
 	dec := json.NewDecoder(bytes.NewReader(data))

 	for {
-		var chunk ApiResponseOllama
+		var chunk ollama.Response
 		if err := dec.Decode(&chunk); err != nil {
 			if errors.Is(err, io.EOF) {
 				break
--- a/internal/ai/vision/api_client_test.go
+++ b/internal/ai/vision/api_client_test.go
@@ -8,6 +8,7 @@ import (

 	"github.com/stretchr/testify/assert"

+	"github.com/photoprism/photoprism/internal/ai/vision/ollama"
 	"github.com/photoprism/photoprism/pkg/http/scheme"
 )

@@ -49,7 +50,7 @@ func TestPerformApiRequestOllama(t *testing.T) {
 			var req ApiRequest
 			assert.NoError(t, json.NewDecoder(r.Body).Decode(&req))
 			assert.Equal(t, FormatJSON, req.Format)
-			assert.NoError(t, json.NewEncoder(w).Encode(ApiResponseOllama{
+			assert.NoError(t, json.NewEncoder(w).Encode(ollama.Response{
 				Model:    "qwen2.5vl:latest",
 				Response: `{"labels":[{"name":"test","confidence":0.9,"topicality":0.8}]}`,
 			}))
@@ -72,7 +73,7 @@ func TestPerformApiRequestOllama(t *testing.T) {
 	})
 	t.Run("LabelsWithCodeFence", func(t *testing.T) {
 		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			assert.NoError(t, json.NewEncoder(w).Encode(ApiResponseOllama{
+			assert.NoError(t, json.NewEncoder(w).Encode(ollama.Response{
 				Model:    "gemma3:latest",
 				Response: "```json\n{\"labels\":[{\"name\":\"lingerie\",\"confidence\":0.81,\"topicality\":0.73}]}\n```\nThe model provided additional commentary.",
 			}))
@@ -95,7 +96,7 @@ func TestPerformApiRequestOllama(t *testing.T) {
 	})
 	t.Run("CaptionFallback", func(t *testing.T) {
 		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			assert.NoError(t, json.NewEncoder(w).Encode(ApiResponseOllama{
+			assert.NoError(t, json.NewEncoder(w).Encode(ollama.Response{
 				Model:    "qwen2.5vl:latest",
 				Response: "plain text",
 			}))
--- a/internal/ai/vision/api_ollama.go
+++ b/internal/ai/vision/api_ollama.go
@@ -1,10 +1,8 @@
 package vision

 import (
-	"errors"
 	"fmt"
 	"os"
-	"time"

 	"github.com/photoprism/photoprism/pkg/clean"
 	"github.com/photoprism/photoprism/pkg/http/scheme"
@@ -12,53 +10,6 @@ import (
 	"github.com/photoprism/photoprism/pkg/rnd"
 )

-// ApiResponseOllama represents a Ollama API service response.
-type ApiResponseOllama struct {
-	Id                 string    `yaml:"Id,omitempty" json:"id,omitempty"`
-	Code               int       `yaml:"Code,omitempty" json:"code,omitempty"`
-	Error              string    `yaml:"Error,omitempty" json:"error,omitempty"`
-	Model              string    `yaml:"Model,omitempty" json:"model,omitempty"`
-	CreatedAt          time.Time `yaml:"CreatedAt,omitempty" json:"created_at,omitempty"`
-	Response           string    `yaml:"Response,omitempty" json:"response,omitempty"`
-	Done               bool      `yaml:"Done,omitempty" json:"done,omitempty"`
-	Context            []int     `yaml:"Context,omitempty" json:"context,omitempty"`
-	TotalDuration      int64     `yaml:"TotalDuration,omitempty" json:"total_duration,omitempty"`
-	LoadDuration       int       `yaml:"LoadDuration,omitempty" json:"load_duration,omitempty"`
-	PromptEvalCount    int       `yaml:"PromptEvalCount,omitempty" json:"prompt_eval_count,omitempty"`
-	PromptEvalDuration int       `yaml:"PromptEvalDuration,omitempty" json:"prompt_eval_duration,omitempty"`
-	EvalCount          int       `yaml:"EvalCount,omitempty" json:"eval_count,omitempty"`
-	EvalDuration       int64     `yaml:"EvalDuration,omitempty" json:"eval_duration,omitempty"`
-	Result             ApiResult `yaml:"Result,omitempty" json:"result,omitempty"`
-}
-
-// Err returns an error if the request has failed.
-func (r *ApiResponseOllama) Err() error {
-	if r == nil {
-		return errors.New("response is nil")
-	}
-
-	if r.Code >= 400 {
-		if r.Error != "" {
-			return errors.New(r.Error)
-		}
-
-		return fmt.Errorf("error %d", r.Code)
-	} else if r.Result.IsEmpty() {
-		return errors.New("no result")
-	}
-
-	return nil
-}
-
-// HasResult checks if there is at least one result in the response data.
-func (r *ApiResponseOllama) HasResult() bool {
-	if r == nil {
-		return false
-	}
-
-	return !r.Result.IsEmpty()
-}
-
 // NewApiRequestOllama returns a new Ollama API request with the specified images as payload.
 func NewApiRequestOllama(images Files, fileScheme scheme.Type) (*ApiRequest, error) {
 	imagesData := make(Files, len(images))
--- a/internal/ai/vision/api_request.go
+++ b/internal/ai/vision/api_request.go
@@ -11,6 +11,8 @@ import (

 	"github.com/sirupsen/logrus"

+	"github.com/photoprism/photoprism/internal/ai/vision/openai"
+	"github.com/photoprism/photoprism/internal/ai/vision/schema"
 	"github.com/photoprism/photoprism/internal/api/download"
 	"github.com/photoprism/photoprism/pkg/clean"
 	"github.com/photoprism/photoprism/pkg/fs"
@@ -58,6 +60,11 @@ type ApiRequestOptions struct {
 	UseMmap          bool     `yaml:"UseMmap,omitempty" json:"use_mmap,omitempty"`
 	UseMlock         bool     `yaml:"UseMlock,omitempty" json:"use_mlock,omitempty"`
 	NumThread        int      `yaml:"NumThread,omitempty" json:"num_thread,omitempty"`
+	MaxOutputTokens  int      `yaml:"MaxOutputTokens,omitempty" json:"max_output_tokens,omitempty"`
+	Detail           string   `yaml:"Detail,omitempty" json:"detail,omitempty"`
+	ForceJson        bool     `yaml:"ForceJson,omitempty" json:"force_json,omitempty"`
+	SchemaVersion    string   `yaml:"SchemaVersion,omitempty" json:"schema_version,omitempty"`
+	CombineOutputs   string   `yaml:"CombineOutputs,omitempty" json:"combine_outputs,omitempty"`
 }

 // ApiRequestContext represents a context parameter returned from a previous request.
@@ -77,6 +84,7 @@ type ApiRequest struct {
 	Context        *ApiRequestContext `form:"context" yaml:"Context,omitempty" json:"context,omitempty"`
 	Stream         bool               `form:"stream" yaml:"Stream,omitempty" json:"stream"`
 	Images         Files              `form:"images" yaml:"Images,omitempty" json:"images,omitempty"`
+	Schema         json.RawMessage    `form:"schema" yaml:"Schema,omitempty" json:"schema,omitempty"`
 	ResponseFormat ApiFormat          `form:"-" yaml:"-" json:"-"`
 }

@@ -195,6 +203,14 @@ func (r *ApiRequest) GetResponseFormat() ApiFormat {

 // JSON returns the request data as JSON-encoded bytes.
 func (r *ApiRequest) JSON() ([]byte, error) {
+	if r == nil { // Guard: json.Marshal(*r) below would otherwise dereference a nil pointer.
+		return nil, errors.New("api request is nil")
+	}
+
+	if r.ResponseFormat == ApiFormatOpenAI { // OpenAI requests are encoded by openAIJSON instead of marshaling the struct itself.
+		return r.openAIJSON()
+	}
+
 	return json.Marshal(*r)
 }

@@ -229,6 +245,8 @@ func (r *ApiRequest) sanitizedForLog() ApiRequest {

 	sanitized.Url = sanitizeLogPayload(r.Url)

+	sanitized.Schema = r.Schema
+
 	return sanitized
 }

@@ -287,3 +305,134 @@ func isLikelyBase64(value string) bool {

 	return true
 }
+
+// openAIJSON encodes the request as an OpenAI Responses API payload with an optional system message and one user message (prompt and images); it returns an error if both are empty.
+func (r *ApiRequest) openAIJSON() ([]byte, error) {
+	detail := openai.DefaultDetail // Image detail; a non-blank Options.Detail overrides the default below.
+
+	if opts := r.Options; opts != nil && strings.TrimSpace(opts.Detail) != "" {
+		detail = strings.TrimSpace(opts.Detail)
+	}
+
+	messages := make([]openai.InputMessage, 0, 2) // At most one system and one user message.
+
+	if system := strings.TrimSpace(r.System); system != "" {
+		messages = append(messages, openai.InputMessage{
+			Role: "system",
+			Type: "message",
+			Content: []openai.ContentItem{
+				{
+					Type: openai.ContentTypeText,
+					Text: system,
+				},
+			},
+		})
+	}
+
+	userContent := make([]openai.ContentItem, 0, len(r.Images)+1) // Room for the prompt plus every image.
+
+	if prompt := strings.TrimSpace(r.Prompt); prompt != "" {
+		userContent = append(userContent, openai.ContentItem{
+			Type: openai.ContentTypeText,
+			Text: prompt,
+		})
+	}
+
+	for _, img := range r.Images {
+		if img == "" { // Empty image entries are skipped.
+			continue
+		}
+
+		userContent = append(userContent, openai.ContentItem{
+			Type:     openai.ContentTypeImage,
+			ImageURL: img,
+			Detail:   detail,
+		})
+	}
+
+	if len(userContent) > 0 {
+		messages = append(messages, openai.InputMessage{
+			Role:    "user",
+			Type:    "message",
+			Content: userContent,
+		})
+	}
+
+	if len(messages) == 0 {
+		return nil, errors.New("openai request requires at least one message")
+	}
+
+	payload := openai.HTTPRequest{
+		Model: strings.TrimSpace(r.Model),
+		Input: messages,
+	}
+
+	if payload.Model == "" {
+		payload.Model = openai.DefaultModel
+	}
+
+	if strings.HasPrefix(strings.ToLower(payload.Model), "gpt-5") { // Model names starting with "gpt-5" (case-insensitive) get a low reasoning effort.
+		payload.Reasoning = &openai.Reasoning{Effort: "low"}
+	}
+
+	if opts := r.Options; opts != nil {
+		if opts.MaxOutputTokens > 0 {
+			payload.MaxOutputTokens = opts.MaxOutputTokens
+		}
+
+		if opts.Temperature > 0 { // Only positive values are copied; zero leaves the payload field at its zero value.
+			payload.Temperature = opts.Temperature
+		}
+
+		if opts.TopP > 0 {
+			payload.TopP = opts.TopP
+		}
+
+		if opts.PresencePenalty != 0 { // Unlike the sampling options above, negative penalties are passed through.
+			payload.PresencePenalty = opts.PresencePenalty
+		}
+
+		if opts.FrequencyPenalty != 0 {
+			payload.FrequencyPenalty = opts.FrequencyPenalty
+		}
+	}
+
+	if format := buildOpenAIResponseFormat(r); format != nil { // Nil means no output format is requested and payload.Text stays unset.
+		payload.Text = &openai.TextOptions{
+			Format: format,
+		}
+	}
+
+	return json.Marshal(payload)
+}
+
+// buildOpenAIResponseFormat returns the output format for an OpenAI request: a JSON schema format if the request has a schema, a JSON object format if Options.ForceJson is set, and nil otherwise.
+func buildOpenAIResponseFormat(r *ApiRequest) *openai.ResponseFormat {
+	if r == nil {
+		return nil
+	}
+
+	opts := r.Options
+	hasSchema := len(r.Schema) > 0
+
+	if !hasSchema && (opts == nil || !opts.ForceJson) { // Neither a schema nor ForceJson: no format is requested.
+		return nil
+	}
+
+	result := &openai.ResponseFormat{}
+
+	if hasSchema { // A schema takes precedence over ForceJson.
+		result.Type = openai.ResponseFormatJSONSchema
+		result.Schema = r.Schema
+
+		if opts != nil && strings.TrimSpace(opts.SchemaVersion) != "" { // An explicit Options.SchemaVersion is used as the schema name.
+			result.Name = strings.TrimSpace(opts.SchemaVersion)
+		} else { // Otherwise the name comes from schema.JsonSchemaName, which receives the default schema version as its second argument.
+			result.Name = schema.JsonSchemaName(r.Schema, openai.DefaultSchemaVersion)
+		}
+	} else {
+		result.Type = openai.ResponseFormatJSONObject
+	}
+
+	return result
+}
--- a/internal/ai/vision/caption.go
+++ b/internal/ai/vision/caption.go
@@ -53,7 +53,11 @@ func captionInternal(images Files, mediaSrc media.Src) (result *CaptionResult, m

 			apiRequest.System = model.GetSystemPrompt()
 			apiRequest.Prompt = model.GetPrompt()
-			apiRequest.Options = model.GetOptions()
+
+			if apiRequest.Options == nil {
+				apiRequest.Options = model.GetOptions()
+			}
+
 			apiRequest.WriteLog()

 			if apiResponse, err = PerformApiRequest(apiRequest, uri, method, model.EndpointKey()); err != nil {
--- a/internal/ai/vision/engine.go
+++ b/internal/ai/vision/engine.go
@@ -58,14 +58,15 @@ func init() {
 	RegisterEngineAlias(EngineVision, EngineInfo{
 		RequestFormat:     ApiFormatVision,
 		ResponseFormat:    ApiFormatVision,
-		FileScheme:        string(scheme.Data),
+		FileScheme:        scheme.Data,
 		DefaultResolution: DefaultResolution,
 	})

 	RegisterEngineAlias(openai.EngineName, EngineInfo{
+		Uri:               "https://api.openai.com/v1/responses",
 		RequestFormat:     ApiFormatOpenAI,
 		ResponseFormat:    ApiFormatOpenAI,
-		FileScheme:        string(scheme.Data),
+		FileScheme:        scheme.Data,
 		DefaultResolution: openai.DefaultResolution,
 	})
 }
@@ -79,6 +80,7 @@ func RegisterEngine(format ApiFormat, engine Engine) {

 // EngineInfo describes metadata that can be associated with an engine alias.
 type EngineInfo struct {
+	Uri               string
 	RequestFormat     ApiFormat
 	ResponseFormat    ApiFormat
 	FileScheme        string
--- a/internal/ai/vision/engine_ollama.go
+++ b/internal/ai/vision/engine_ollama.go
@@ -28,7 +28,7 @@ func init() {
 	RegisterEngineAlias(ollama.EngineName, EngineInfo{
 		RequestFormat:     ApiFormatOllama,
 		ResponseFormat:    ApiFormatOllama,
-		FileScheme:        string(scheme.Base64),
+		FileScheme:        scheme.Base64,
 		DefaultResolution: ollama.DefaultResolution,
 	})

@@ -72,7 +72,7 @@ func (ollamaDefaults) SchemaTemplate(model *Model) string {

 	switch model.Type {
 	case ModelTypeLabels:
-		return ollama.LabelsSchema(model.PromptContains("nsfw"))
+		return ollama.SchemaLabels(model.PromptContains("nsfw"))
 	}

 	return ""
@@ -134,64 +134,93 @@ func (ollamaParser) Parse(ctx context.Context, req *ApiRequest, raw []byte, stat
 		return nil, err
 	}

-	result := &ApiResponse{
+	response := &ApiResponse{
 		Id:    req.GetId(),
 		Code:  status,
 		Model: &Model{Name: ollamaResp.Model},
 		Result: ApiResult{
-			Labels: append([]LabelResult{}, ollamaResp.Result.Labels...),
-			Caption: func() *CaptionResult {
-				if ollamaResp.Result.Caption != nil {
-					copyCaption := *ollamaResp.Result.Caption
-					return &copyCaption
-				}
-				return nil
-			}(),
+			Labels:  convertOllamaLabels(ollamaResp.Result.Labels),
+			Caption: convertOllamaCaption(ollamaResp.Result.Caption),
 		},
 	}

-	parsedLabels := len(result.Result.Labels) > 0
+	parsedLabels := len(response.Result.Labels) > 0

 	if !parsedLabels && strings.TrimSpace(ollamaResp.Response) != "" && req.Format == FormatJSON {
 		if labels, parseErr := parseOllamaLabels(ollamaResp.Response); parseErr != nil {
 			log.Debugf("vision: %s (parse ollama labels)", clean.Error(parseErr))
 		} else if len(labels) > 0 {
-			result.Result.Labels = append(result.Result.Labels, labels...)
+			response.Result.Labels = append(response.Result.Labels, labels...)
 			parsedLabels = true
 		}
 	}

 	if parsedLabels {
-		filtered := result.Result.Labels[:0]
-		for i := range result.Result.Labels {
-			if result.Result.Labels[i].Confidence <= 0 {
-				result.Result.Labels[i].Confidence = ollama.LabelConfidenceDefault
+		filtered := response.Result.Labels[:0]
+		for i := range response.Result.Labels {
+			if response.Result.Labels[i].Confidence <= 0 {
+				response.Result.Labels[i].Confidence = ollama.LabelConfidenceDefault
 			}

-			if result.Result.Labels[i].Topicality <= 0 {
-				result.Result.Labels[i].Topicality = result.Result.Labels[i].Confidence
+			if response.Result.Labels[i].Topicality <= 0 {
+				response.Result.Labels[i].Topicality = response.Result.Labels[i].Confidence
 			}

 			// Apply thresholds and canonicalize the name.
-			normalizeLabelResult(&result.Result.Labels[i])
+			normalizeLabelResult(&response.Result.Labels[i])

-			if result.Result.Labels[i].Name == "" {
+			if response.Result.Labels[i].Name == "" {
 				continue
 			}

-			if result.Result.Labels[i].Source == "" {
-				result.Result.Labels[i].Source = entity.SrcOllama
+			if response.Result.Labels[i].Source == "" {
+				response.Result.Labels[i].Source = entity.SrcOllama
 			}

-			filtered = append(filtered, result.Result.Labels[i])
+			filtered = append(filtered, response.Result.Labels[i])
 		}
-		result.Result.Labels = filtered
+		response.Result.Labels = filtered
 	} else if caption := strings.TrimSpace(ollamaResp.Response); caption != "" {
-		result.Result.Caption = &CaptionResult{
+		response.Result.Caption = &CaptionResult{
 			Text:   caption,
 			Source: entity.SrcOllama,
 		}
 	}

-	return result, nil
+	return response, nil
+}
+
+func convertOllamaLabels(payload []ollama.LabelPayload) []LabelResult { // convertOllamaLabels copies Ollama label payloads into LabelResult values and returns nil for empty input.
+	if len(payload) == 0 {
+		return nil
+	}
+
+	labels := make([]LabelResult, len(payload))
+
+	for i := range payload {
+		labels[i] = LabelResult{
+			Name:           payload[i].Name,
+			Source:         payload[i].Source,
+			Priority:       payload[i].Priority,
+			Confidence:     payload[i].Confidence,
+			Topicality:     payload[i].Topicality,
+			Categories:     payload[i].Categories, // Assigned without copying; if this is a slice, it shares memory with the payload.
+			NSFW:           payload[i].NSFW,
+			NSFWConfidence: payload[i].NSFWConfidence,
+		}
+	}
+
+	return labels
+}
+
+func convertOllamaCaption(payload *ollama.CaptionPayload) *CaptionResult { // convertOllamaCaption copies an Ollama caption payload into a new CaptionResult and returns nil for a nil payload.
+	if payload == nil {
+		return nil
+	}
+
+	return &CaptionResult{
+		Text:       payload.Text,
+		Source:     payload.Source,
+		Confidence: payload.Confidence,
+	}
+}
--- a/internal/ai/vision/engine_ollama_test.go
+++ b/internal/ai/vision/engine_ollama_test.go
@@ -10,9 +10,9 @@ import (

 func TestOllamaDefaultConfidenceApplied(t *testing.T) {
 	req := &ApiRequest{Format: FormatJSON}
-	payload := ApiResponseOllama{
-		Result: ApiResult{
-			Labels: []LabelResult{{Name: "forest path", Confidence: 0, Topicality: 0}},
+	payload := ollama.Response{
+		Result: ollama.ResultPayload{
+			Labels: []ollama.LabelPayload{{Name: "forest path", Confidence: 0, Topicality: 0}},
 		},
 	}
 	raw, err := json.Marshal(payload)
--- a/internal/ai/vision/engine_openai.go
+++ b/internal/ai/vision/engine_openai.go
@@ -1,18 +1,342 @@
 package vision

 import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"strings"
+
 	"github.com/photoprism/photoprism/internal/ai/vision/openai"
+	"github.com/photoprism/photoprism/internal/entity"
+	"github.com/photoprism/photoprism/pkg/clean"
 	"github.com/photoprism/photoprism/pkg/http/scheme"
 )

-// init registers the OpenAI engine alias so models can set Engine: "openai"
-// and inherit sensible defaults (request/response formats, file scheme, and
-// preferred thumbnail resolution).
+// openaiDefaults provides canned prompts, schema templates, and options for OpenAI engines.
+type openaiDefaults struct{}
+
+// openaiBuilder prepares ApiRequest objects for OpenAI's Responses API.
+type openaiBuilder struct{}
+
+// openaiParser converts Responses API payloads into ApiResponse instances.
+type openaiParser struct{}
+
 func init() {
-	RegisterEngineAlias(openai.EngineName, EngineInfo{
-		RequestFormat:     ApiFormatOpenAI,
-		ResponseFormat:    ApiFormatOpenAI,
-		FileScheme:        string(scheme.Base64),
-		DefaultResolution: openai.DefaultResolution,
+	RegisterEngine(ApiFormatOpenAI, Engine{
+		Builder:  openaiBuilder{},
+		Parser:   openaiParser{},
+		Defaults: openaiDefaults{},
 	})
 }
+
+// SystemPrompt returns the default OpenAI system prompt for caption and labels models, or an empty string for a nil model or any other model type.
+func (openaiDefaults) SystemPrompt(model *Model) string {
+	if model == nil {
+		return ""
+	}
+
+	switch model.Type {
+	case ModelTypeCaption:
+		return openai.CaptionSystem
+	case ModelTypeLabels:
+		return openai.LabelSystem
+	default:
+		return ""
+	}
+}
+
+// UserPrompt returns the default OpenAI user prompt for caption and labels models, or an empty string for a nil model or any other model type.
+func (openaiDefaults) UserPrompt(model *Model) string {
+	if model == nil {
+		return ""
+	}
+
+	switch model.Type {
+	case ModelTypeCaption:
+		return openai.CaptionPrompt
+	case ModelTypeLabels:
+		if DetectNSFWLabels { // NOTE(review): keyed on the package-level DetectNSFWLabels, whereas SchemaTemplate checks model.PromptContains("nsfw") - confirm the two are meant to differ.
+			return openai.LabelPromptNSFW
+		}
+		return openai.LabelPromptDefault
+	default:
+		return ""
+	}
+}
+
+// SchemaTemplate returns the schema from openai.SchemaLabels for labels models, passing whether the model prompt contains "nsfw", and an empty string for a nil model or any other model type.
+func (openaiDefaults) SchemaTemplate(model *Model) string {
+	if model == nil {
+		return ""
+	}
+
+	switch model.Type {
+	case ModelTypeLabels:
+		return string(openai.SchemaLabels(model.PromptContains("nsfw")))
+	default:
+		return ""
+	}
+}
+
+// Options returns default OpenAI request options for caption and labels models, or nil for a nil model or any other model type.
+func (openaiDefaults) Options(model *Model) *ApiRequestOptions {
+	if model == nil {
+		return nil
+	}
+
+	switch model.Type {
+	case ModelTypeCaption:
+		/*
+		  Options:
+		    Detail: low
+		    MaxOutputTokens: 512
+		    Temperature: 0.1
+		    TopP: 0.9
+		  (Sampling values are zeroed for GPT-5 models in openaiBuilder.Build.)
+		*/
+		return &ApiRequestOptions{
+			Detail:          openai.DefaultDetail,
+			MaxOutputTokens: openai.CaptionMaxTokens,
+			Temperature:     openai.DefaultTemperature,
+			TopP:            openai.DefaultTopP,
+		}
+	case ModelTypeLabels:
+		/*
+		  Options:
+		    Detail: low
+		    MaxOutputTokens: 1024
+		    Temperature: 0.1
+		    TopP: 0.9
+		    ForceJson: true
+		  (Sampling values are zeroed for GPT-5 models in openaiBuilder.Build; SchemaVersion is not set here.)
+		*/
+		return &ApiRequestOptions{
+			Detail:          openai.DefaultDetail,
+			MaxOutputTokens: openai.LabelsMaxTokens,
+			Temperature:     openai.DefaultTemperature,
+			TopP:            openai.DefaultTopP,
+			ForceJson:       true,
+		}
+	default:
+		return nil
+	}
+}
+
+// Build constructs an OpenAI request with the files encoded via scheme.Data plus the adjusted model options and schema; Model, System, and Prompt are not set here and ctx is currently unused.
+func (openaiBuilder) Build(ctx context.Context, model *Model, files Files) (*ApiRequest, error) {
+	if model == nil {
+		return nil, ErrInvalidModel
+	}
+
+	dataReq, err := NewApiRequestImages(files, scheme.Data)
+	if err != nil {
+		return nil, err
+	}
+
+	req := &ApiRequest{
+		Id:             dataReq.Id,
+		Images:         append(Files(nil), dataReq.Images...), // Copied so the request does not share the slice with dataReq.
+		ResponseFormat: ApiFormatOpenAI,
+	}
+
+	if opts := model.GetOptions(); opts != nil {
+		req.Options = cloneOptions(opts) // Adjust a clone (assuming cloneOptions returns a copy) rather than the model's own options.
+		if model.Type == ModelTypeCaption {
+			// Captions default to plain text responses; structured JSON is optional.
+			req.Options.ForceJson = false
+			if req.Options.MaxOutputTokens < openai.CaptionMaxTokens { // The default acts as a lower bound, so smaller configured limits are raised.
+				req.Options.MaxOutputTokens = openai.CaptionMaxTokens
+			}
+		} else if model.Type == ModelTypeLabels {
+			if req.Options.MaxOutputTokens < openai.LabelsMaxTokens { // Same lower bound for labels.
+				req.Options.MaxOutputTokens = openai.LabelsMaxTokens
+			}
+		}
+
+		if strings.HasPrefix(strings.ToLower(strings.TrimSpace(model.Name)), "gpt-5") { // Sampling options are zeroed for GPT-5 model names; openAIJSON only copies positive values into the payload.
+			req.Options.Temperature = 0
+			req.Options.TopP = 0
+		}
+	}
+
+	if schema := strings.TrimSpace(model.SchemaTemplate()); schema != "" {
+		if raw, parseErr := parseOpenAISchema(schema); parseErr != nil { // A template that fails to parse is logged and the request is built without a schema.
+			log.Warnf("vision: failed to parse OpenAI schema template (%s)", clean.Error(parseErr))
+		} else {
+			req.Schema = raw
+		}
+	}
+
+	return req, nil
+}
+
+// Parse converts an OpenAI Responses API payload into an ApiResponse; it returns an error for a status of 300 or above, invalid JSON, or an error message in the response body.
+func (openaiParser) Parse(ctx context.Context, req *ApiRequest, raw []byte, status int) (*ApiResponse, error) {
+	if status >= 300 { // Prefer the message extracted from the body, otherwise report the status code.
+		if msg := openai.ParseErrorMessage(raw); msg != "" {
+			return nil, fmt.Errorf("openai: %s", msg)
+		}
+		return nil, fmt.Errorf("openai: status %d", status)
+	}
+
+	var resp openai.Response
+	if err := json.Unmarshal(raw, &resp); err != nil {
+		return nil, err
+	}
+
+	if resp.Error != nil && resp.Error.Message != "" { // NOTE(review): unlike the status errors above, this message has no "openai: " prefix.
+		return nil, errors.New(resp.Error.Message)
+	}
+
+	result := ApiResult{}
+	if jsonPayload := resp.FirstJSON(); len(jsonPayload) > 0 {
+		if err := populateOpenAIJSONResult(&result, jsonPayload); err != nil { // Parse failures are only logged so the text fallback below can still run.
+			log.Debugf("vision: %s (parse openai json payload)", clean.Error(err))
+		}
+	}
+
+	if result.Caption == nil { // NOTE(review): also true when only labels were parsed above; if FirstText returns the same JSON as FirstJSON, labels would be appended twice - confirm.
+		if text := resp.FirstText(); text != "" {
+			trimmed := strings.TrimSpace(text)
+			var parsedJSON bool
+
+			if len(trimmed) > 0 && (trimmed[0] == '{' || trimmed[0] == '[') { // Text that looks like a JSON object or array is tried as structured output first.
+				if err := populateOpenAIJSONResult(&result, json.RawMessage(trimmed)); err != nil {
+					log.Debugf("vision: %s (parse openai json text payload)", clean.Error(err))
+				} else {
+					parsedJSON = true
+				}
+			}
+
+			if !parsedJSON && trimmed != "" { // Plain text, or JSON-looking text that failed to parse, becomes the caption.
+				result.Caption = &CaptionResult{
+					Text:   trimmed,
+					Source: entity.SrcOpenAI,
+				}
+			}
+		}
+	}
+
+	var responseID string
+	if req != nil {
+		responseID = req.GetId()
+	}
+
+	modelName := strings.TrimSpace(resp.Model)
+	if modelName == "" && req != nil { // Fall back to the requested model name if the response does not report one.
+		modelName = strings.TrimSpace(req.Model)
+	}
+
+	return &ApiResponse{
+		Id:     responseID,
+		Code:   status,
+		Model:  &Model{Name: modelName},
+		Result: result,
+	}, nil
+}
+
+// parseOpenAISchema checks that the schema template is valid JSON and returns the result of passing it through normalizeOpenAISchema.
+func parseOpenAISchema(schema string) (json.RawMessage, error) {
+	var raw json.RawMessage
+	if err := json.Unmarshal([]byte(schema), &raw); err != nil { // Only JSON syntax is checked here, not the schema's structure.
+		return nil, err
+	}
+	return normalizeOpenAISchema(raw)
+}
+
+// normalizeOpenAISchema replaces a legacy label template (a JSON object with a "labels" key but without a non-empty
+// string "type" or a "properties" key) with openai.SchemaLabels; any other input is returned unchanged.
+func normalizeOpenAISchema(raw json.RawMessage) (json.RawMessage, error) {
+	if len(raw) == 0 {
+		return raw, nil
+	}
+
+	var doc map[string]any
+	if err := json.Unmarshal(raw, &doc); err != nil {
+		// Fallback to the original payload if it isn't a JSON object.
+		return raw, nil
+	}
+
+	if t, ok := doc["type"]; ok {
+		if typeStr, ok := t.(string); ok && strings.TrimSpace(typeStr) != "" { // A non-empty string "type" is taken to mean the document is already a schema.
+			return raw, nil
+		}
+	}
+
+	if _, ok := doc["properties"]; ok { // Likewise for a "properties" key.
+		return raw, nil
+	}
+
+	labels, ok := doc["labels"]
+	if !ok { // Without a "labels" key there is nothing to upgrade.
+		return raw, nil
+	}
+
+	nsfw := false
+
+	if items, ok := labels.([]any); ok && len(items) > 0 { // Only the first "labels" entry is inspected for NSFW fields.
+		if first, ok := items[0].(map[string]any); ok {
+			if _, hasNSFW := first["nsfw"]; hasNSFW {
+				nsfw = true
+			}
+			if _, hasNSFWConfidence := first["nsfw_confidence"]; hasNSFWConfidence {
+				nsfw = true
+			}
+		}
+	}
+
+	return openai.SchemaLabels(nsfw), nil
+}
+
+// populateOpenAIJSONResult unmarshals a structured OpenAI response into result, setting the caption if its text is non-blank and appending all labels that still have a name after normalization.
+func populateOpenAIJSONResult(result *ApiResult, payload json.RawMessage) error {
+	if result == nil || len(payload) == 0 { // Nothing to do without a target or payload.
+		return nil
+	}
+
+	var envelope struct {
+		Caption *struct {
+			Text       string  `json:"text"`
+			Confidence float32 `json:"confidence"`
+		} `json:"caption"`
+		Labels []LabelResult `json:"labels"`
+	}
+
+	if err := json.Unmarshal(payload, &envelope); err != nil {
+		return err
+	}
+
+	if envelope.Caption != nil {
+		text := strings.TrimSpace(envelope.Caption.Text)
+		if text != "" {
+			result.Caption = &CaptionResult{
+				Text:       text,
+				Confidence: envelope.Caption.Confidence,
+				Source:     entity.SrcOpenAI,
+			}
+		}
+	}
+
+	if len(envelope.Labels) > 0 {
+		filtered := envelope.Labels[:0] // Filter in place, reusing the backing array of envelope.Labels.
+
+		for i := range envelope.Labels { // NOTE(review): unlike ollamaParser.Parse, no default confidence is assigned when Confidence <= 0 - confirm this is intended.
+			if envelope.Labels[i].Source == "" {
+				envelope.Labels[i].Source = entity.SrcOpenAI
+			}
+
+			normalizeLabelResult(&envelope.Labels[i])
+
+			if envelope.Labels[i].Name == "" { // Labels left without a name after normalization are dropped.
+				continue
+			}
+
+			filtered = append(filtered, envelope.Labels[i])
+		}
+
+		result.Labels = append(result.Labels, filtered...) // Appends to any existing labels, so repeated calls accumulate.
+	}
+
+	return nil
+}
--- a/internal/ai/vision/engine_openai_test.go
+++ b/internal/ai/vision/engine_openai_test.go
@@ -0,0 +1,337 @@
+package vision
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"github.com/photoprism/photoprism/internal/ai/vision/openai"
+	"github.com/photoprism/photoprism/internal/ai/vision/schema"
+	"github.com/photoprism/photoprism/internal/entity"
+)
+
+// TestOpenAIBuilderBuild verifies the request openaiBuilder produces for a labels model
+// after engine defaults are applied: OpenAI response format, at least one image, the
+// default detail level, forced JSON output, and a token budget of at least
+// openai.LabelsMaxTokens.
+func TestOpenAIBuilderBuild(t *testing.T) {
+	model := &Model{
+		Type:   ModelTypeLabels,
+		Name:   openai.DefaultModel,
+		Engine: openai.EngineName,
+	}
+	model.ApplyEngineDefaults()
+
+	request, err := openaiBuilder{}.Build(context.Background(), model, Files{examplesPath + "/chameleon_lime.jpg"})
+	require.NoError(t, err)
+	require.NotNil(t, request)
+
+	assert.Equal(t, ApiFormatOpenAI, request.ResponseFormat)
+	assert.NotEmpty(t, request.Images)
+
+	// Options is dereferenced below, so a nil value must stop the test here rather
+	// than cause a nil pointer panic.
+	require.NotNil(t, request.Options)
+	assert.Equal(t, openai.DefaultDetail, request.Options.Detail)
+	assert.True(t, request.Options.ForceJson)
+	assert.GreaterOrEqual(t, request.Options.MaxOutputTokens, openai.LabelsMaxTokens)
+}
+
+// TestOpenAIBuilderBuildCaptionDisablesForceJSON verifies that building a request for a
+// caption model yields ForceJson=false even though the model options enable it, and
+// that the token budget is at least openai.CaptionMaxTokens.
+func TestOpenAIBuilderBuildCaptionDisablesForceJSON(t *testing.T) {
+	captionModel := &Model{
+		Type:    ModelTypeCaption,
+		Name:    openai.DefaultModel,
+		Engine:  openai.EngineName,
+		Options: &ApiRequestOptions{ForceJson: true},
+	}
+	captionModel.ApplyEngineDefaults()
+
+	images := Files{examplesPath + "/chameleon_lime.jpg"}
+
+	req, err := openaiBuilder{}.Build(context.Background(), captionModel, images)
+	require.NoError(t, err)
+	require.NotNil(t, req)
+	require.NotNil(t, req.Options)
+
+	opts := req.Options
+	assert.False(t, opts.ForceJson)
+	assert.GreaterOrEqual(t, opts.MaxOutputTokens, openai.CaptionMaxTokens)
+}
+
+// TestApiRequestJSONForOpenAI checks the payload ApiRequest.JSON emits for an OpenAI
+// request that carries a system prompt, a user prompt, one image, sampling options,
+// and a JSON schema.
+func TestApiRequestJSONForOpenAI(t *testing.T) {
+	opts := &ApiRequestOptions{
+		Detail:          openai.DefaultDetail,
+		MaxOutputTokens: 128,
+		Temperature:     0.2,
+		TopP:            0.8,
+		ForceJson:       true,
+	}
+
+	apiReq := &ApiRequest{
+		Model:          "gpt-5-mini",
+		System:         "system",
+		Prompt:         "describe the scene",
+		Images:         []string{"data:image/jpeg;base64,AA=="},
+		ResponseFormat: ApiFormatOpenAI,
+		Options:        opts,
+		Schema:         json.RawMessage(`{"type":"object","properties":{"caption":{"type":"object"}}}`),
+	}
+
+	body, err := apiReq.JSON()
+	require.NoError(t, err)
+
+	// Only the fields under test are declared; other payload members are ignored
+	// by the decoder.
+	type contentPart struct {
+		Type string `json:"type"`
+	}
+
+	type inputMessage struct {
+		Role    string        `json:"role"`
+		Content []contentPart `json:"content"`
+	}
+
+	type textFormat struct {
+		Type   string          `json:"type"`
+		Name   string          `json:"name"`
+		Schema json.RawMessage `json:"schema"`
+		Strict bool            `json:"strict"`
+	}
+
+	type textOptions struct {
+		Format textFormat `json:"format"`
+	}
+
+	type reasoning struct {
+		Effort string `json:"effort"`
+	}
+
+	var got struct {
+		Model           string         `json:"model"`
+		Input           []inputMessage `json:"input"`
+		Text            textOptions    `json:"text"`
+		Reasoning       reasoning      `json:"reasoning"`
+		MaxOutputTokens int            `json:"max_output_tokens"`
+	}
+
+	require.NoError(t, json.Unmarshal(body, &got))
+
+	format := got.Text.Format
+
+	assert.Equal(t, "gpt-5-mini", got.Model)
+	require.Len(t, got.Input, 2)
+	assert.Equal(t, "system", got.Input[0].Role)
+	assert.Equal(t, openai.ResponseFormatJSONSchema, format.Type)
+	assert.Equal(t, schema.JsonSchemaName(format.Schema, openai.DefaultSchemaVersion), format.Name)
+	assert.False(t, format.Strict)
+	assert.NotNil(t, format.Schema)
+	assert.Equal(t, "low", got.Reasoning.Effort)
+	assert.Equal(t, 128, got.MaxOutputTokens)
+}
+
+// TestApiRequestJSONForOpenAIDefaultSchemaName checks that the format name in the
+// encoded payload equals schema.JsonSchemaName for the request schema and
+// openai.DefaultSchemaVersion when no schema version is set in the options.
+func TestApiRequestJSONForOpenAIDefaultSchemaName(t *testing.T) {
+	apiReq := &ApiRequest{
+		Model:          "gpt-5-mini",
+		Images:         []string{"data:image/jpeg;base64,AA=="},
+		ResponseFormat: ApiFormatOpenAI,
+		Options: &ApiRequestOptions{
+			Detail:          openai.DefaultDetail,
+			MaxOutputTokens: 64,
+			ForceJson:       true,
+		},
+		Schema: json.RawMessage(`{"type":"object"}`),
+	}
+
+	body, err := apiReq.JSON()
+	require.NoError(t, err)
+
+	type formatName struct {
+		Name string `json:"name"`
+	}
+
+	type textSection struct {
+		Format formatName `json:"format"`
+	}
+
+	var got struct {
+		Text textSection `json:"text"`
+	}
+
+	require.NoError(t, json.Unmarshal(body, &got))
+
+	want := schema.JsonSchemaName(apiReq.Schema, openai.DefaultSchemaVersion)
+	assert.Equal(t, want, got.Text.Format.Name)
+}
+
+func TestOpenAIParserParsesJSONFromTextPayload(t *testing.T) {
+	respPayload := `{
+		"id": "resp_123",
+		"model": "gpt-5-mini",
+		"output": [{
+			"role": "assistant",
+			"content": [{
+				"type": "output_text",
+				"text": "{\"labels\":[{\"name\":\"deer\",\"confidence\":0.98,\"topicality\":0.99}]}"
+			}]
+		}]
+	}`
+
+	req := &ApiRequest{
+		Id:             "test",
+		Model:          "gpt-5-mini",
+		ResponseFormat: ApiFormatOpenAI,
+	}
+
+	resp, err := openaiParser{}.Parse(context.Background(), req, []byte(respPayload), http.StatusOK)
+	require.NoError(t, err)
+	require.NotNil(t, resp)
+	require.Len(t, resp.Result.Labels, 1)
+	assert.Equal(t, "Deer", resp.Result.Labels[0].Name)
+	assert.Nil(t, resp.Result.Caption)
+}
+
+// TestParseOpenAISchemaLegacyUpgrade passes a legacy example-style labels document to
+// parseOpenAISchema and expects an object schema whose "labels" property is an array.
+func TestParseOpenAISchemaLegacyUpgrade(t *testing.T) {
+	const legacy = `{
+		"labels": [{
+			"name": "",
+			"confidence": 0,
+			"topicality": 0
+		}]
+	}`
+
+	upgraded, err := parseOpenAISchema(legacy)
+	require.NoError(t, err)
+
+	var root map[string]any
+	require.NoError(t, json.Unmarshal(upgraded, &root))
+
+	assert.Equal(t, "object", root["type"])
+
+	properties, isMap := root["properties"].(map[string]any)
+	require.True(t, isMap)
+
+	labelsSchema, isMap := properties["labels"].(map[string]any)
+	require.True(t, isMap)
+
+	assert.Equal(t, "array", labelsSchema["type"])
+}
+
+// TestParseOpenAISchemaLegacyUpgradeNSFW passes a legacy labels document that includes
+// the nsfw fields to parseOpenAISchema and expects the resulting schema to declare an
+// "nsfw" property on the label items.
+func TestParseOpenAISchemaLegacyUpgradeNSFW(t *testing.T) {
+	legacy := `{
+		"labels": [{
+			"name": "",
+			"confidence": 0,
+			"topicality": 0,
+			"nsfw": false,
+			"nsfw_confidence": 0
+		}]
+	}`
+
+	raw, err := parseOpenAISchema(legacy)
+	require.NoError(t, err)
+
+	var decoded map[string]any
+	require.NoError(t, json.Unmarshal(raw, &decoded))
+
+	// Use checked type assertions so an unexpected schema shape fails the test with a
+	// clear message instead of panicking.
+	props, ok := decoded["properties"].(map[string]any)
+	require.True(t, ok, "schema must contain a properties object")
+
+	labels, ok := props["labels"].(map[string]any)
+	require.True(t, ok, "properties must contain a labels object")
+
+	items, ok := labels["items"].(map[string]any)
+	require.True(t, ok, "labels must contain an items object")
+
+	itemProps, ok := items["properties"].(map[string]any)
+	require.True(t, ok, "items must contain a properties object")
+
+	_, hasNSFW := itemProps["nsfw"]
+	assert.True(t, hasNSFW)
+}
+
+// TestPerformApiRequestOpenAISuccess runs PerformApiRequest against a stub server that
+// answers with an output_json content item, then checks that the caption and the label
+// come back attributed to the OpenAI source and that the label name is "Cat".
+func TestPerformApiRequestOpenAISuccess(t *testing.T) {
+	// Structured result the stub embeds in its output_json content item.
+	structured := map[string]any{
+		"caption": map[string]any{
+			"text":       "A cat rests on a windowsill.",
+			"confidence": 0.91,
+		},
+		"labels": []map[string]any{
+			{
+				"name":       "cat",
+				"confidence": 0.92,
+				"topicality": 0.88,
+			},
+		},
+	}
+
+	reply := map[string]any{
+		"id":    "resp_123",
+		"model": "gpt-5-mini",
+		"output": []any{
+			map[string]any{
+				"role": "assistant",
+				"content": []any{
+					map[string]any{
+						"type": "output_json",
+						"json": structured,
+					},
+				},
+			},
+		},
+	}
+
+	handler := func(w http.ResponseWriter, r *http.Request) {
+		var incoming struct {
+			Model string `json:"model"`
+		}
+
+		// The stub also checks that the request body names the expected model.
+		assert.NoError(t, json.NewDecoder(r.Body).Decode(&incoming))
+		assert.Equal(t, "gpt-5-mini", incoming.Model)
+
+		assert.NoError(t, json.NewEncoder(w).Encode(reply))
+	}
+
+	srv := httptest.NewServer(http.HandlerFunc(handler))
+	defer srv.Close()
+
+	apiReq := &ApiRequest{
+		Id:             "test",
+		Model:          "gpt-5-mini",
+		Images:         []string{"data:image/jpeg;base64,AA=="},
+		ResponseFormat: ApiFormatOpenAI,
+		Options: &ApiRequestOptions{
+			Detail: openai.DefaultDetail,
+		},
+		Schema: json.RawMessage(`{"type":"object"}`),
+	}
+
+	apiResp, err := PerformApiRequest(apiReq, srv.URL, http.MethodPost, "secret")
+	require.NoError(t, err)
+	require.NotNil(t, apiResp)
+
+	caption := apiResp.Result.Caption
+	require.NotNil(t, caption)
+	assert.Equal(t, entity.SrcOpenAI, caption.Source)
+	assert.Equal(t, "A cat rests on a windowsill.", caption.Text)
+
+	labels := apiResp.Result.Labels
+	require.Len(t, labels, 1)
+	assert.Equal(t, entity.SrcOpenAI, labels[0].Source)
+	assert.Equal(t, "Cat", labels[0].Name)
+}
+
+// TestPerformApiRequestOpenAITextFallback runs PerformApiRequest without a schema
+// against a stub server that answers with plain output_text, then expects that text
+// to be returned as a caption attributed to the OpenAI source.
+func TestPerformApiRequestOpenAITextFallback(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		response := map[string]any{
+			"id":    "resp_456",
+			"model": "gpt-5-mini",
+			"output": []any{
+				map[string]any{
+					"role": "assistant",
+					"content": []any{
+						map[string]any{
+							"type": "output_text",
+							"text": "Two hikers reach the summit at sunset.",
+						},
+					},
+				},
+			},
+		}
+		assert.NoError(t, json.NewEncoder(w).Encode(response))
+	}))
+	defer server.Close()
+
+	req := &ApiRequest{
+		Id:             "fallback",
+		Model:          "gpt-5-mini",
+		Images:         []string{"data:image/jpeg;base64,AA=="},
+		ResponseFormat: ApiFormatOpenAI,
+		Options: &ApiRequestOptions{
+			Detail: openai.DefaultDetail,
+		},
+		Schema: nil,
+	}
+
+	resp, err := PerformApiRequest(req, server.URL, http.MethodPost, "")
+	require.NoError(t, err)
+
+	// Guard the dereferences below: a nil response must fail the test, not panic.
+	require.NotNil(t, resp)
+	require.NotNil(t, resp.Result.Caption)
+	assert.Equal(t, "Two hikers reach the summit at sunset.", resp.Result.Caption.Text)
+	assert.Equal(t, entity.SrcOpenAI, resp.Result.Caption.Source)
+}
+
+// TestPerformApiRequestOpenAIError runs PerformApiRequest against a stub server that
+// replies with HTTP 400 and an error message, then expects the returned error to
+// contain that message.
+func TestPerformApiRequestOpenAIError(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusBadRequest)
+
+		// Report encoding problems instead of discarding them, so a broken stub
+		// cannot be mistaken for a client-side failure.
+		assert.NoError(t, json.NewEncoder(w).Encode(map[string]any{
+			"error": map[string]any{
+				"message": "Invalid image payload",
+			},
+		}))
+	}))
+	defer server.Close()
+
+	req := &ApiRequest{
+		Id:             "error",
+		Model:          "gpt-5-mini",
+		ResponseFormat: ApiFormatOpenAI,
+		Schema:         nil,
+		Images:         []string{"data:image/jpeg;base64,AA=="},
+	}
+
+	_, err := PerformApiRequest(req, server.URL, http.MethodPost, "")
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "Invalid image payload")
+}
--- a/internal/ai/vision/labels.go
+++ b/internal/ai/vision/labels.go
@@ -96,8 +96,10 @@ func labelsInternal(images Files, mediaSrc media.Src, labelSrc entity.Src) (resu
 				apiRequest.Prompt = prompt
 			}

-			if options := model.GetOptions(); options != nil {
-				apiRequest.Options = options
+			if apiRequest.Options == nil {
+				if options := model.GetOptions(); options != nil {
+					apiRequest.Options = options
+				}
 			}

 			apiRequest.WriteLog()
--- a/internal/ai/vision/model.go
+++ b/internal/ai/vision/model.go
@@ -348,6 +348,26 @@ func mergeOptionDefaults(target, defaults *ApiRequestOptions) {
 	if len(target.Stop) == 0 && len(defaults.Stop) > 0 {
 		target.Stop = append([]string(nil), defaults.Stop...)
 	}
+
+	if target.MaxOutputTokens <= 0 && defaults.MaxOutputTokens > 0 {
+		target.MaxOutputTokens = defaults.MaxOutputTokens
+	}
+
+	if strings.TrimSpace(target.Detail) == "" && strings.TrimSpace(defaults.Detail) != "" {
+		target.Detail = strings.TrimSpace(defaults.Detail)
+	}
+
+	if !target.ForceJson && defaults.ForceJson {
+		target.ForceJson = true
+	}
+
+	if target.SchemaVersion == "" && defaults.SchemaVersion != "" {
+		target.SchemaVersion = defaults.SchemaVersion
+	}
+
+	if target.CombineOutputs == "" && defaults.CombineOutputs != "" {
+		target.CombineOutputs = defaults.CombineOutputs
+	}
 }

 func normalizeOptions(opts *ApiRequestOptions) {
@@ -422,6 +442,10 @@ func (m *Model) ApplyEngineDefaults() {
 	}

 	if info, ok := EngineInfoFor(engine); ok {
+		if m.Service.Uri == "" {
+			m.Service.Uri = info.Uri
+		}
+
 		if m.Service.RequestFormat == "" {
 			m.Service.RequestFormat = info.RequestFormat
 		}
@@ -490,7 +514,7 @@ func (m *Model) SchemaTemplate() string {
 			}

 			if m.schema == "" {
-				m.schema = visionschema.Labels(m.PromptContains("nsfw"))
+				m.schema = visionschema.LabelsJson(m.PromptContains("nsfw"))
 			}
 		}
 	})
--- a/internal/ai/vision/ollama/defaults.go
+++ b/internal/ai/vision/ollama/defaults.go
@@ -1,7 +1,5 @@
 package ollama

-import "github.com/photoprism/photoprism/internal/ai/vision/schema"
-
 const (
 	// CaptionPrompt instructs Ollama caption models to emit a single, active-voice sentence.
 	CaptionPrompt = "Create a caption with exactly one sentence in the active voice that describes the main visual content. Begin with the main subject and clear action. Avoid text formatting, meta-language, and filler words."
@@ -22,12 +20,3 @@ const (
 	// DefaultResolution is the default thumbnail size submitted to Ollama models.
 	DefaultResolution = 720
 )
-
-// LabelsSchema returns the canonical label schema string consumed by Ollama models.
-func LabelsSchema(nsfw bool) string {
-	if nsfw {
-		return schema.LabelsNSFW
-	} else {
-		return schema.LabelsDefault
-	}
-}
--- a/internal/ai/vision/ollama/schema.go
+++ b/internal/ai/vision/ollama/schema.go
@@ -0,0 +1,14 @@
+package ollama
+
+import (
+	"github.com/photoprism/photoprism/internal/ai/vision/schema"
+)
+
+// SchemaLabels returns the canonical label schema string consumed by Ollama models.
+// It is a thin wrapper: the nsfw flag is passed unchanged to schema.LabelsJson, which
+// presumably selects the variant that includes the NSFW fields when true (verify
+// against the schema package).
+//
+// Related documentation and references:
+// - https://www.alibabacloud.com/help/en/model-studio/json-mode
+// - https://www.json.org/json-en.html
+func SchemaLabels(nsfw bool) string {
+	return schema.LabelsJson(nsfw)
+}
--- a/internal/ai/vision/ollama/transport.go
+++ b/internal/ai/vision/ollama/transport.go
@@ -0,0 +1,79 @@
+package ollama
+
+import (
+	"errors"
+	"fmt"
+	"time"
+)
+
+// Response encapsulates the subset of the Ollama generate API response we care about.
+//
+// Code and Error describe a failed request: Err treats a Code of 400 or above as a
+// failure and prefers Error as the message. Result holds the labels and the optional
+// caption that Err and HasResult inspect. All fields are tagged omitempty for both
+// YAML and JSON.
+//
+// NOTE(review): LoadDuration and PromptEvalDuration are declared as int, whereas
+// TotalDuration and EvalDuration are int64. If these carry nanosecond durations as in
+// the Ollama API (confirm), values above roughly 2.1 s would not fit an int on 32-bit
+// builds. Widening them changes the exported field types, so check callers first.
+type Response struct {
+	ID                 string        `yaml:"Id,omitempty" json:"id,omitempty"`
+	Code               int           `yaml:"Code,omitempty" json:"code,omitempty"`
+	Error              string        `yaml:"Error,omitempty" json:"error,omitempty"`
+	Model              string        `yaml:"Model,omitempty" json:"model,omitempty"`
+	CreatedAt          time.Time     `yaml:"CreatedAt,omitempty" json:"created_at,omitempty"`
+	Response           string        `yaml:"Response,omitempty" json:"response,omitempty"`
+	Done               bool          `yaml:"Done,omitempty" json:"done,omitempty"`
+	Context            []int         `yaml:"Context,omitempty" json:"context,omitempty"`
+	TotalDuration      int64         `yaml:"TotalDuration,omitempty" json:"total_duration,omitempty"`
+	LoadDuration       int           `yaml:"LoadDuration,omitempty" json:"load_duration,omitempty"`
+	PromptEvalCount    int           `yaml:"PromptEvalCount,omitempty" json:"prompt_eval_count,omitempty"`
+	PromptEvalDuration int           `yaml:"PromptEvalDuration,omitempty" json:"prompt_eval_duration,omitempty"`
+	EvalCount          int           `yaml:"EvalCount,omitempty" json:"eval_count,omitempty"`
+	EvalDuration       int64         `yaml:"EvalDuration,omitempty" json:"eval_duration,omitempty"`
+	Result             ResultPayload `yaml:"Result,omitempty" json:"result,omitempty"`
+}
+
+// Err reports why the response cannot be used. It returns an error for a nil
+// receiver, for a Code of 400 or above (using the Error text when present and
+// "error <code>" otherwise), and for a response that carries neither labels nor a
+// caption. In every other case it returns nil.
+func (r *Response) Err() error {
+	switch {
+	case r == nil:
+		return errors.New("response is nil")
+	case r.Code >= 400 && r.Error != "":
+		return errors.New(r.Error)
+	case r.Code >= 400:
+		return fmt.Errorf("error %d", r.Code)
+	case r.Result.Caption == nil && len(r.Result.Labels) == 0:
+		return errors.New("no result")
+	default:
+		return nil
+	}
+}
+
+// HasResult reports whether the response holds a caption or at least one label.
+// A nil receiver has no result.
+func (r *Response) HasResult() bool {
+	return r != nil && (r.Result.Caption != nil || len(r.Result.Labels) > 0)
+}
+
+// ResultPayload mirrors the structure returned by Ollama for result data.
+// Labels has no omitempty tag and is therefore always present in encoded JSON;
+// Caption is optional and omitted when nil.
+type ResultPayload struct {
+	Labels  []LabelPayload  `json:"labels"`
+	Caption *CaptionPayload `json:"caption,omitempty"`
+}
+
+// LabelPayload represents a single label object emitted by the Ollama adapter.
+// Only Name is always encoded; every other field is tagged omitempty and is left out
+// of the JSON when it holds its zero value.
+type LabelPayload struct {
+	Name           string   `json:"name"`
+	Source         string   `json:"source,omitempty"`
+	Priority       int      `json:"priority,omitempty"`
+	Confidence     float32  `json:"confidence,omitempty"`
+	Topicality     float32  `json:"topicality,omitempty"`
+	Categories     []string `json:"categories,omitempty"`
+	NSFW           bool     `json:"nsfw,omitempty"`
+	NSFWConfidence float32  `json:"nsfw_confidence,omitempty"`
+}
+
+// CaptionPayload represents the caption object emitted by the Ollama adapter.
+// Text is always encoded; Source and Confidence are omitted from the JSON when they
+// hold their zero values.
+type CaptionPayload struct {
+	Text       string  `json:"text"`
+	Source     string  `json:"source,omitempty"`
+	Confidence float32 `json:"confidence,omitempty"`
+}
--- a/internal/ai/vision/openai/README.md
+++ b/internal/ai/vision/openai/README.md
@@ -0,0 +1,128 @@
+## PhotoPrism — OpenAI API Integration
+
+**Last Updated:** November 14, 2025
+
+### Overview
+
+This package contains PhotoPrism’s adapter for the OpenAI Responses API. It enables existing caption and label workflows (`GenerateCaption`, `GenerateLabels`, and the `photoprism vision run` CLI) to call OpenAI models alongside TensorFlow and Ollama without changing worker or API code. The implementation focuses on predictable results, structured outputs, and clear observability so operators can opt in gradually.
+
+#### Context & Constraints
+
+- OpenAI requests flow through the existing vision client (`internal/ai/vision/api_client.go`) and must honour PhotoPrism’s timeout, logging, and ACL rules.
+- Structured outputs are preferred but the adapter must gracefully handle free-form text; `output_text` responses are parsed both as JSON and as plain captions.
+- Costs should remain predictable: requests are limited to a single 720 px thumbnail (`detail=low`) and capped token budgets (512 caption, 1024 labels).
+- Secrets are supplied per model (`Service.Key`) with fallbacks to `OPENAI_API_KEY` / `_FILE`. Logs must redact sensitive data.
+
+#### Goals
+
+- Provide drop-in OpenAI support for captions and labels using `vision.yml`.
+- Keep configuration ergonomic by auto-populating prompts, schema names, token limits, and sampling defaults.
+- Expose enough logging and tests so operators can compare OpenAI output with existing engines before enabling it broadly.
+
+#### Non-Goals
+
+- Introducing a new `generate` model type or combined caption/label endpoint (reserved for a later phase).
+- Replacing the default TensorFlow models; they remain active as fallbacks.
+- Managing OpenAI billing or quota dashboards beyond surfacing token counts in logs and metrics.
+
+### Prompt, Model, & Schema Guidance
+
+- **Models:** The adapter targets GPT‑5 vision tiers (e.g. `gpt-5-nano`, `gpt-5-mini`). These models support image inputs, structured outputs, and deterministic settings. Set `Name` to the exact provider identifier so defaults are applied correctly. Caption models share the same configuration surface and run through the same adapter.
+- **Prompts:** Defaults live in `defaults.go`. Captions use a single-sentence instruction; labels use `LabelPromptDefault` (or `LabelPromptNSFW` when PhotoPrism requests NSFW metadata). Custom prompts should retain schema reminders so structured outputs stay valid.
+- **Schemas:** Labels use the JSON schema returned by `schema.LabelsJsonSchema(nsfw)`; the response format name is derived via `schema.JsonSchemaName` (e.g. `photoprism_vision_labels_v1`). Captions omit schemas unless operators explicitly request a structured format.
+- **When to keep defaults:** For most deployments, leaving `System`, `Prompt`, `Schema`, and `Options` unset yields stable output with minimal configuration. Override them only when domain-specific language or custom scoring is necessary, and add regression tests alongside.
+
+Budget-conscious operators can experiment with lighter prompts or lower-resolution thumbnails, but should keep token limits and determinism settings intact to avoid unexpected bills and UI churn.
+
+#### Performance & Cost Estimates
+
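+- **Token budgets:** Captions request up to 512 output tokens; labels request up to 1024. Input tokens are typically around 600–750 for a single 720 px thumbnail plus prompts (the bundled test fixtures report 576 for a caption request and 724 for a labels request).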
+- **Latency:** GPT‑5 nano/mini vision calls typically complete in 3–8 s, depending on OpenAI region. Including reasoning metadata (`reasoning.effort=low`) has negligible impact but improves traceability.
+- **Costs:** Consult OpenAI’s pricing for the selected model. Multiply input/output tokens by the published rate. PhotoPrism currently sends one image per request to keep costs linear with photo count.
+
+#### Defaults
+
+- File scheme: `data:` URLs (base64) for all OpenAI models.
+- Resolution: 720 px thumbnails (`vision.Thumb(ModelTypeCaption|Labels)`).
+- Options: `MaxOutputTokens` raised to 512 (caption) / 1024 (labels); `ForceJson=false` for captions, `true` for labels; `reasoning.effort="low"`.
+- Sampling: `Temperature` and `TopP` set to `0` for `gpt-5*` models; inherited values (0.1/0.9) remain for other engines. `openaiBuilder.Build` performs this override while preserving the struct defaults for non-OpenAI adapters.
+- Schema naming: Automatically derived via `schema.JsonSchemaName`, so operators may omit `SchemaVersion`.
+
+### Configuration
+
+#### Environment Variables
+
+- `OPENAI_API_KEY` / `OPENAI_API_KEY_FILE` — fallback credentials when a model’s `Service.Key` is unset.
+- Existing `PHOTOPRISM_VISION_*` variables remain authoritative (see the [Developer Guide](https://docs.photoprism.app/developer-guide/vision/service/) for full lists).
+
+#### `vision.yml` Examples
+
+```yaml
+Models:
+  - Type: caption
+    Name: gpt-5-nano
+    Engine: openai
+    Disabled: false    # opt in manually
+    Resolution: 720    # optional; default is 720
+    Options:
+      Detail: low      # optional; defaults to low
+      MaxOutputTokens: 512
+    Service:
+      Uri: https://api.openai.com/v1/responses
+      FileScheme: data
+      Key: ${OPENAI_API_KEY}
+
+  - Type: labels
+    Name: gpt-5-mini
+    Engine: openai
+    Disabled: false
+    Resolution: 720
+    Options:
+      Detail: low
+      MaxOutputTokens: 1024
+      ForceJson: true  # redundant but explicit
+    Service:
+      Uri: https://api.openai.com/v1/responses
+      FileScheme: data
+      Key: ${OPENAI_API_KEY}
+```
+
+Keep TensorFlow entries in place so PhotoPrism falls back when the external service is unavailable.
+
+### Documentation
+
+- Label Generation: <https://docs.photoprism.app/developer-guide/vision/label-generation/>
+- Caption Generation: <https://docs.photoprism.app/developer-guide/vision/caption-generation/>
+- Vision CLI Commands: <https://docs.photoprism.app/developer-guide/vision/cli/>
+
+### Implementation Details
+
+#### Core Concepts
+
+- **Structured outputs:** PhotoPrism leverages OpenAI’s structured output capability as documented at <https://platform.openai.com/docs/guides/structured-outputs>. When a JSON schema is supplied, the adapter emits `text.format` with `type: "json_schema"` and a schema name derived from the content. The parser then prefers `output_json`, but also attempts to decode `output_text` payloads that contain JSON objects.
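+- **Deterministic sampling:** For GPT‑5 models the adapter sets `Temperature` and `TopP` to `0` to minimise variance, while still allowing developers to override values in `vision.yml` if needed. Note that `HTTPRequest` tags both fields with `omitempty`, so zero values are left out of the request body and OpenAI's own defaults apply (the bundled fixtures report `temperature: 1.0` and `top_p: 1.0`).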
+- **Reasoning metadata:** Requests include `reasoning.effort="low"` so OpenAI returns structured reasoning usage counters, helping operators track token consumption.
+- **Worker summaries:** The vision worker now logs either “updated …” or “processed … (no metadata changes detected)”, making reruns easy to audit.
+
+#### Rate Limiting
+
+OpenAI calls respect the existing `limiter.Auth` configuration used by the vision service. Failed requests surface standard HTTP errors and are not automatically retried; operators should ensure they have adequate account limits and consider external rate limiting when sharing credentials.
+
+#### Testing & Validation
+
+1. Unit tests: `go test ./internal/ai/vision/openai ./internal/ai/vision -run OpenAI -count=1`. Fixtures under `internal/ai/vision/openai/testdata/` replay real Responses payloads (captions and labels).
+2. CLI smoke test: `photoprism vision run -m labels --count 1 --force --model=gpt-5-mini` with trace logging enabled to inspect sanitised Responses.
+3. Compare worker summaries and label sources (`openai`) in the UI or via `photoprism vision ls`.
+
+#### Code Map
+
+- **Adapter & defaults:** `internal/ai/vision/openai` (defaults, schema helpers, transport, tests).
+- **Request/response plumbing:** `internal/ai/vision/api_request.go`, `api_client.go`, `engine_openai.go`, `engine_openai_test.go`.
+- **Workers & CLI:** `internal/workers/vision.go`, `internal/commands/vision_run.go`.
+- **Shared utilities:** `internal/ai/vision/schema`, `pkg/clean`, `pkg/media`.
+
+#### Next Steps
+
+- [ ] Introduce the future `generate` model type that combines captions, labels, and optional markers.
+- [ ] Evaluate additional OpenAI models as pricing and capabilities evolve.
+- [ ] Expose token usage metrics (input/output/reasoning) via Prometheus once the schema stabilises.
--- a/internal/ai/vision/openai/defaults.go
+++ b/internal/ai/vision/openai/defaults.go
@@ -1,6 +1,29 @@
 package openai

-import "github.com/photoprism/photoprism/internal/ai/vision/schema"
+const (
+	// CaptionSystem defines the default system prompt for caption models.
+	CaptionSystem = "You are a PhotoPrism vision model. Return concise, user-friendly captions that describe the main subjects accurately."
+	// CaptionPrompt instructs caption models to respond with a single sentence.
+	CaptionPrompt = "Provide exactly one sentence describing the key subject and action in the image. Avoid filler words and technical jargon."
+	// LabelSystem defines the system prompt for label generation.
+	LabelSystem = "You are a PhotoPrism vision model. Emit JSON that matches the provided schema and keep label names short, singular nouns."
+	// LabelPromptDefault requests general-purpose labels.
+	LabelPromptDefault = "Analyze the image and return label objects with name, confidence (0-1), and topicality (0-1)."
+	// LabelPromptNSFW requests labels including NSFW metadata when required.
+	LabelPromptNSFW = "Analyze the image and return label objects with name, confidence (0-1), topicality (0-1), nsfw (true when sensitive), and nsfw_confidence (0-1)."
+	// DefaultDetail specifies the preferred thumbnail detail level for Responses API calls.
+	DefaultDetail = "low"
+	// CaptionMaxTokens suggests the output budget for caption responses.
+	CaptionMaxTokens = 512
+	// LabelsMaxTokens suggests the output budget for label responses.
+	LabelsMaxTokens = 1024
+	// DefaultTemperature is a low sampling temperature that keeps replies close to
+	// deterministic; it is not zero, so some variance remains.
+	DefaultTemperature = 0.1
+	// DefaultTopP limits nucleus sampling.
+	DefaultTopP = 0.9
+	// DefaultSchemaVersion is used when callers do not specify an explicit schema version.
+	DefaultSchemaVersion = "v1"
+)

 var (
 	// DefaultModel is the model used by default when accessing the OpenAI API.
@@ -8,8 +31,3 @@ var (
 	// DefaultResolution is the default thumbnail size submitted to the OpenAI.
 	DefaultResolution = 720
 )
-
-// LabelsSchema returns the canonical label schema string consumed by OpenAI models.
-func LabelsSchema() string {
-	return schema.LabelsDefault
-}
--- a/internal/ai/vision/openai/schema.go
+++ b/internal/ai/vision/openai/schema.go
@@ -0,0 +1,16 @@
+package openai
+
+import (
+	"encoding/json"
+
+	"github.com/photoprism/photoprism/internal/ai/vision/schema"
+)
+
+// SchemaLabels returns the canonical labels JSON Schema, as raw JSON, consumed by OpenAI models.
+//
+// Related documentation and references:
+// - https://platform.openai.com/docs/guides/structured-outputs
+// - https://json-schema.org/learn/miscellaneous-examples
+func SchemaLabels(nsfw bool) json.RawMessage {
+	return schema.LabelsJsonSchema(nsfw)
+}
--- a/internal/ai/vision/openai/testdata/caption-response.json
+++ b/internal/ai/vision/openai/testdata/caption-response.json
@@ -0,0 +1,73 @@
+{
+  "id": "resp_0d356718505119f3006916e5d8730881a0b91de2aa700f6196",
+  "object": "response",
+  "created_at": 1763108312,
+  "status": "completed",
+  "background": false,
+  "billing": {
+    "payer": "developer"
+  },
+  "error": null,
+  "incomplete_details": null,
+  "instructions": null,
+  "max_output_tokens": 512,
+  "max_tool_calls": null,
+  "model": "gpt-5-nano-2025-08-07",
+  "output": [
+    {
+      "id": "rs_0d356718505119f3006916e5d8efd481a0a4f9cc1823cc6c83",
+      "type": "reasoning",
+      "summary": []
+    },
+    {
+      "id": "msg_0d356718505119f3006916e5d9433881a0bc79197d2cfc2027",
+      "type": "message",
+      "status": "completed",
+      "content": [
+        {
+          "type": "output_text",
+          "annotations": [],
+          "logprobs": [],
+          "text": "A bee gathers nectar from the vibrant red poppy\u2019s center."
+        }
+      ],
+      "role": "assistant"
+    }
+  ],
+  "parallel_tool_calls": true,
+  "previous_response_id": null,
+  "prompt_cache_key": null,
+  "prompt_cache_retention": null,
+  "reasoning": {
+    "effort": "low",
+    "summary": null
+  },
+  "safety_identifier": null,
+  "service_tier": "default",
+  "store": true,
+  "temperature": 1.0,
+  "text": {
+    "format": {
+      "type": "text"
+    },
+    "verbosity": "medium"
+  },
+  "tool_choice": "auto",
+  "tools": [],
+  "top_logprobs": 0,
+  "top_p": 1.0,
+  "truncation": "disabled",
+  "usage": {
+    "input_tokens": 576,
+    "input_tokens_details": {
+      "cached_tokens": 0
+    },
+    "output_tokens": 19,
+    "output_tokens_details": {
+      "reasoning_tokens": 0
+    },
+    "total_tokens": 595
+  },
+  "user": null,
+  "metadata": {}
+}
--- a/internal/ai/vision/openai/testdata/labels-response.json
+++ b/internal/ai/vision/openai/testdata/labels-response.json
@@ -0,0 +1,114 @@
+{
+  "id": "resp_0fa91dfb69b7d644006916ea0b72ac819f84ff3152a38dfcdb",
+  "object": "response",
+  "created_at": 1763109387,
+  "status": "completed",
+  "background": false,
+  "billing": {
+    "payer": "developer"
+  },
+  "error": null,
+  "incomplete_details": null,
+  "instructions": null,
+  "max_output_tokens": 1024,
+  "max_tool_calls": null,
+  "model": "gpt-5-mini-2025-08-07",
+  "output": [
+    {
+      "id": "rs_0fa91dfb69b7d644006916ea0c3450819f8a13396bf377f474",
+      "type": "reasoning",
+      "summary": []
+    },
+    {
+      "id": "msg_0fa91dfb69b7d644006916ea0d2dfc819faf52b11334fc10a4",
+      "type": "message",
+      "status": "completed",
+      "content": [
+        {
+          "type": "output_text",
+          "annotations": [],
+          "logprobs": [],
+          "text": "{\"labels\":[{\"name\":\"flower\",\"confidence\":0.99,\"topicality\":0.99},{\"name\":\"bee\",\"confidence\":0.95,\"topicality\":0.95},{\"name\":\"petal\",\"confidence\":0.92,\"topicality\":0.88},{\"name\":\"pollen\",\"confidence\":0.85,\"topicality\":0.8},{\"name\":\"insect\",\"confidence\":0.9,\"topicality\":0.85},{\"name\":\"red\",\"confidence\":0.88,\"topicality\":0.6},{\"name\":\"close-up\",\"confidence\":0.86,\"topicality\":0.7},{\"name\":\"nature\",\"confidence\":0.8,\"topicality\":0.5}]}"
+        }
+      ],
+      "role": "assistant"
+    }
+  ],
+  "parallel_tool_calls": true,
+  "previous_response_id": null,
+  "prompt_cache_key": null,
+  "prompt_cache_retention": null,
+  "reasoning": {
+    "effort": "low",
+    "summary": null
+  },
+  "safety_identifier": null,
+  "service_tier": "default",
+  "store": true,
+  "temperature": 1.0,
+  "text": {
+    "format": {
+      "type": "json_schema",
+      "description": null,
+      "name": "photoprism_vision_labels_v1",
+      "schema": {
+        "type": "object",
+        "properties": {
+          "labels": {
+            "type": "array",
+            "items": {
+              "type": "object",
+              "properties": {
+                "name": {
+                  "type": "string",
+                  "minLength": 1
+                },
+                "confidence": {
+                  "type": "number",
+                  "minimum": 0,
+                  "maximum": 1
+                },
+                "topicality": {
+                  "type": "number",
+                  "minimum": 0,
+                  "maximum": 1
+                }
+              },
+              "required": [
+                "name",
+                "confidence",
+                "topicality"
+              ],
+              "additionalProperties": false
+            },
+            "default": []
+          }
+        },
+        "required": [
+          "labels"
+        ],
+        "additionalProperties": false
+      },
+      "strict": true
+    },
+    "verbosity": "medium"
+  },
+  "tool_choice": "auto",
+  "tools": [],
+  "top_logprobs": 0,
+  "top_p": 1.0,
+  "truncation": "disabled",
+  "usage": {
+    "input_tokens": 724,
+    "input_tokens_details": {
+      "cached_tokens": 0
+    },
+    "output_tokens": 169,
+    "output_tokens_details": {
+      "reasoning_tokens": 0
+    },
+    "total_tokens": 893
+  },
+  "user": null,
+  "metadata": {}
+}
--- a/internal/ai/vision/openai/transport.go
+++ b/internal/ai/vision/openai/transport.go
@@ -0,0 +1,142 @@
+package openai
+
+import (
+	"encoding/json"
+	"strings"
+)
+
+// String identifiers for input content types and response formats.
+const (
+	// ContentTypeText identifies text input segments for the Responses API.
+	ContentTypeText = "input_text"
+	// ContentTypeImage identifies image input segments for the Responses API.
+	ContentTypeImage = "input_image"
+
+	// ResponseFormatJSONSchema requests JSON constrained by a schema.
+	ResponseFormatJSONSchema = "json_schema"
+	// ResponseFormatJSONObject requests a free-form JSON object.
+	ResponseFormatJSONObject = "json_object"
+)
+
+// HTTPRequest represents the payload expected by OpenAI's Responses API.
+//
+// Only Model and Input are always serialized. All other fields are tagged with
+// omitempty and are therefore dropped from the JSON body when they hold their
+// zero value (nil pointer or 0), which means an explicit zero such as a
+// temperature of 0 cannot be sent through this struct.
+type HTTPRequest struct {
+	Model            string         `json:"model"`
+	Input            []InputMessage `json:"input"`
+	Text             *TextOptions   `json:"text,omitempty"`
+	Reasoning        *Reasoning     `json:"reasoning,omitempty"`
+	MaxOutputTokens  int            `json:"max_output_tokens,omitempty"`
+	Temperature      float64        `json:"temperature,omitempty"`
+	TopP             float64        `json:"top_p,omitempty"`
+	PresencePenalty  float64        `json:"presence_penalty,omitempty"`
+	FrequencyPenalty float64        `json:"frequency_penalty,omitempty"`
+}
+
+// TextOptions carries formatting preferences for textual responses.
+// Format is omitted from the JSON payload when it is nil.
+type TextOptions struct {
+	Format *ResponseFormat `json:"format,omitempty"`
+}
+
+// Reasoning configures the effort level for reasoning models.
+// Effort is omitted from the JSON payload when it is empty.
+type Reasoning struct {
+	Effort string `json:"effort,omitempty"`
+}
+
+// InputMessage captures a single system or user message in the request.
+// Role and Content are always serialized; Type is omitted when empty.
+type InputMessage struct {
+	Role    string        `json:"role"`
+	Type    string        `json:"type,omitempty"`
+	Content []ContentItem `json:"content"`
+}
+
+// ContentItem represents a text or image entry within a message.
+// Type is always serialized; Text, ImageURL, and Detail are omitted from the
+// JSON payload when empty, so only the fields relevant to an entry need to be set.
+type ContentItem struct {
+	Type     string `json:"type"`
+	Text     string `json:"text,omitempty"`
+	ImageURL string `json:"image_url,omitempty"`
+	Detail   string `json:"detail,omitempty"`
+}
+
+// ResponseFormat describes how OpenAI should format its response.
+//
+// Schema is kept as raw JSON so that it is embedded in the request verbatim
+// instead of being decoded and re-encoded. All fields except Type are omitted
+// when they hold their zero value, which includes Strict when it is false.
+type ResponseFormat struct {
+	Type        string          `json:"type"`
+	Name        string          `json:"name,omitempty"`
+	Schema      json.RawMessage `json:"schema,omitempty"`
+	Description string          `json:"description,omitempty"`
+	Strict      bool            `json:"strict,omitempty"`
+}
+
+// Response mirrors the subset of the Responses API response we need.
+//
+// Error is a pointer to an anonymous struct, so it remains nil when the decoded
+// payload contains no error object.
+type Response struct {
+	ID     string           `json:"id"`
+	Model  string           `json:"model"`
+	Output []ResponseOutput `json:"output"`
+	Error  *struct {
+		Message string `json:"message"`
+		Type    string `json:"type"`
+	} `json:"error,omitempty"`
+}
+
+// ResponseOutput captures assistant messages within the response.
+// Content holds the individual parts of the message in the order they were decoded.
+type ResponseOutput struct {
+	Role    string            `json:"role"`
+	Content []ResponseContent `json:"content"`
+}
+
+// ResponseContent contains individual message parts (JSON or text).
+// JSON is kept as raw, undecoded bytes; see Response.FirstJSON and
+// Response.FirstText for helpers that pick the first populated part.
+type ResponseContent struct {
+	Type string          `json:"type"`
+	Text string          `json:"text,omitempty"`
+	JSON json.RawMessage `json:"json,omitempty"`
+}
+
+// FirstJSON walks the output messages and their content parts in order and
+// returns the first non-empty raw JSON payload. It returns nil if the receiver
+// is nil or if no content part carries a JSON payload.
+func (r *Response) FirstJSON() json.RawMessage {
+	if r == nil {
+		return nil
+	}
+
+	for _, output := range r.Output {
+		for _, part := range output.Content {
+			// Skip parts without a JSON payload, e.g. plain text parts.
+			if len(part.JSON) == 0 {
+				continue
+			}
+
+			return part.JSON
+		}
+	}
+
+	return nil
+}
+
+// FirstText walks the output messages and their content parts in order and
+// returns the first text payload that is non-empty after trimming surrounding
+// whitespace. It returns an empty string if the receiver is nil or if no
+// content part contains text.
+func (r *Response) FirstText() string {
+	if r == nil {
+		return ""
+	}
+
+	for _, output := range r.Output {
+		for _, part := range output.Content {
+			trimmed := strings.TrimSpace(part.Text)
+
+			// Whitespace-only parts count as empty and are skipped.
+			if trimmed == "" {
+				continue
+			}
+
+			return trimmed
+		}
+	}
+
+	return ""
+}
+
+// ParseErrorMessage decodes a Responses API payload and returns the trimmed
+// "error.message" value. It returns an empty string if the payload is not
+// valid JSON or does not contain an error object.
+func ParseErrorMessage(raw []byte) string {
+	type errorBody struct {
+		Message string `json:"message"`
+	}
+
+	var payload struct {
+		Error *errorBody `json:"error"`
+	}
+
+	// Malformed JSON and payloads without an error object both yield no message.
+	if json.Unmarshal(raw, &payload) != nil || payload.Error == nil {
+		return ""
+	}
+
+	return strings.TrimSpace(payload.Error.Message)
+}
--- a/internal/ai/vision/openai/transport_test.go
+++ b/internal/ai/vision/openai/transport_test.go
@@ -0,0 +1,120 @@
+package openai
+
+import (
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+func loadTestResponse(t *testing.T, name string) *Response {
+	t.Helper()
+
+	filePath := filepath.Join("testdata", name)
+
+	data, err := os.ReadFile(filePath)
+	if err != nil {
+		t.Fatalf("failed to read %s: %v", filePath, err)
+	}
+
+	var resp Response
+	if err := json.Unmarshal(data, &resp); err != nil {
+		t.Fatalf("failed to unmarshal %s: %v", filePath, err)
+	}
+
+	return &resp
+}
+
+// TestParseErrorMessage checks that ParseErrorMessage returns the error
+// message when the payload has one and an empty string otherwise.
+func TestParseErrorMessage(t *testing.T) {
+	cases := []struct {
+		name    string
+		payload string
+		want    string
+		failMsg string
+	}{
+		{
+			name:    "returns message when present",
+			payload: `{"error":{"message":"Invalid schema"}}`,
+			want:    "Invalid schema",
+			failMsg: "expected message, got %q",
+		},
+		{
+			name:    "returns empty string when error is missing",
+			payload: `{"output":[]}`,
+			want:    "",
+			failMsg: "expected empty message, got %q",
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := ParseErrorMessage([]byte(tc.payload)); got != tc.want {
+				t.Fatalf(tc.failMsg, got)
+			}
+		})
+	}
+}
+
+func TestResponseFirstTextCaption(t *testing.T) {
+	resp := loadTestResponse(t, "caption-response.json")
+
+	if jsonPayload := resp.FirstJSON(); len(jsonPayload) != 0 {
+		t.Fatalf("expected no JSON payload, got: %s", jsonPayload)
+	}
+
+	text := resp.FirstText()
+	expected := "A bee gathers nectar from the vibrant red poppy’s center."
+	if text != expected {
+		t.Fatalf("unexpected caption text: %q", text)
+	}
+}
+
+// TestResponseFirstTextLabels checks that the labels fixture yields no JSON
+// payload and that its first text part is a string starting with a JSON object.
+func TestResponseFirstTextLabels(t *testing.T) {
+	resp := loadTestResponse(t, "labels-response.json")
+
+	jsonPayload := resp.FirstJSON()
+	if len(jsonPayload) > 0 {
+		t.Fatalf("expected no JSON payload, got: %s", jsonPayload)
+	}
+
+	text := resp.FirstText()
+
+	switch {
+	case text == "":
+		t.Fatal("expected structured JSON string in text payload")
+	case text[0] != '{':
+		t.Fatalf("expected JSON object in text payload, got %q", text)
+	}
+}
+
+// TestResponseFirstJSONFromStructuredPayload checks that FirstJSON returns the
+// raw JSON of a content part and that the payload decodes as expected.
+func TestResponseFirstJSONFromStructuredPayload(t *testing.T) {
+	content := ResponseContent{
+		Type: "output_json",
+		JSON: json.RawMessage(`{"labels":[{"name":"sunset"}]}`),
+	}
+
+	message := ResponseOutput{
+		Role:    "assistant",
+		Content: []ResponseContent{content},
+	}
+
+	resp := &Response{
+		ID:     "resp_structured",
+		Model:  "gpt-5-mini",
+		Output: []ResponseOutput{message},
+	}
+
+	payload := resp.FirstJSON()
+	if len(payload) == 0 {
+		t.Fatal("expected JSON payload, got empty result")
+	}
+
+	var decoded struct {
+		Labels []map[string]string `json:"labels"`
+	}
+
+	if err := json.Unmarshal(payload, &decoded); err != nil {
+		t.Fatalf("failed to decode JSON payload: %v", err)
+	}
+
+	if labels := decoded.Labels; len(labels) != 1 || labels[0]["name"] != "sunset" {
+		t.Fatalf("unexpected JSON payload: %+v", labels)
+	}
+}
+
+// TestSchemaLabelsReturnsValidJSON checks that SchemaLabels(false) returns
+// valid JSON whose top-level "type" is "object".
+func TestSchemaLabelsReturnsValidJSON(t *testing.T) {
+	var decoded map[string]any
+
+	if err := json.Unmarshal(SchemaLabels(false), &decoded); err != nil {
+		t.Fatalf("schema should be valid JSON: %v", err)
+	}
+
+	if got := decoded["type"]; got != "object" {
+		t.Fatalf("expected type object, got %v", got)
+	}
+}
--- a/internal/ai/vision/schema/labels.go
+++ b/internal/ai/vision/schema/labels.go
@@ -1,16 +1,115 @@
 package schema

-// LabelsDefault provides the minimal JSON schema for label responses used across engines.
-const (
-	LabelsDefault = "{\n  \"labels\": [{\n    \"name\": \"\",\n    \"confidence\": 0,\n    \"topicality\": 0 }]\n}"
-	LabelsNSFW    = "{\n  \"labels\": [{\n    \"name\": \"\",\n    \"confidence\": 0,\n    \"topicality\": 0,\n    \"nsfw\": false,\n    \"nsfw_confidence\": 0\n  }]\n}"
+import (
+	"encoding/json"
 )

-// Labels returns the canonical label schema string.
-func Labels(nsfw bool) string {
+// Canonical label response templates: JSON Schema documents returned by
+// LabelsJsonSchema and plain JSON example documents returned by LabelsJson.
+const (
+	// LabelsJsonSchemaDefault is the JSON Schema for label responses: an object
+	// with a required "labels" array whose items require a non-empty "name" as
+	// well as "confidence" and "topicality" numbers between 0 and 1.
+	// Additional properties are not allowed.
+	LabelsJsonSchemaDefault = `{
+  "type": "object",
+  "properties": {
+    "labels": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "name": {
+            "type": "string",
+            "minLength": 1
+          },
+          "confidence": {
+            "type": "number",
+            "minimum": 0,
+            "maximum": 1
+          },
+          "topicality": {
+            "type": "number",
+            "minimum": 0,
+            "maximum": 1
+          }
+        },
+        "required": ["name", "confidence", "topicality"],
+        "additionalProperties": false
+      },
+      "default": []
+    }
+  },
+  "required": ["labels"],
+  "additionalProperties": false
+}`
+	// LabelsJsonDefault is a plain JSON example document (not a JSON Schema)
+	// showing the expected shape of a label response.
+	LabelsJsonDefault    = "{\n  \"labels\": [{\n    \"name\": \"\",\n    \"confidence\": 0,\n    \"topicality\": 0 }]\n}"
+	// LabelsJsonSchemaNSFW extends LabelsJsonSchemaDefault with the required
+	// item properties "nsfw" (boolean) and "nsfw_confidence" (number between 0 and 1).
+	LabelsJsonSchemaNSFW = `{
+  "type": "object",
+  "properties": {
+    "labels": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "name": {
+            "type": "string",
+            "minLength": 1
+          },
+          "confidence": {
+            "type": "number",
+            "minimum": 0,
+            "maximum": 1
+          },
+          "topicality": {
+            "type": "number",
+            "minimum": 0,
+            "maximum": 1
+          },
+          "nsfw": {
+            "type": "boolean"
+          },
+          "nsfw_confidence": {
+            "type": "number",
+            "minimum": 0,
+            "maximum": 1
+          }
+        },
+        "required": [
+          "name",
+          "confidence",
+          "topicality",
+          "nsfw",
+          "nsfw_confidence"
+        ],
+        "additionalProperties": false
+      },
+      "default": []
+    }
+  },
+  "required": ["labels"],
+  "additionalProperties": false
+}`
+	// LabelsJsonNSFW is the plain JSON example document for label responses
+	// that also include the "nsfw" and "nsfw_confidence" fields.
+	LabelsJsonNSFW = "{\n  \"labels\": [{\n    \"name\": \"\",\n    \"confidence\": 0,\n    \"topicality\": 0,\n    \"nsfw\": false,\n    \"nsfw_confidence\": 0\n  }]\n}"
+)
+
+// LabelsJsonSchema returns the canonical label JSON Schema string for OpenAI API endpoints.
+// It returns LabelsJsonSchemaNSFW if nsfw is true, which additionally requires the
+// "nsfw" and "nsfw_confidence" properties, and LabelsJsonSchemaDefault otherwise.
+//
+// Related documentation and references:
+// - https://platform.openai.com/docs/guides/structured-outputs
+// - https://json-schema.org/learn/miscellaneous-examples
+func LabelsJsonSchema(nsfw bool) json.RawMessage {
 	if nsfw {
-		return LabelsNSFW
+		return json.RawMessage(LabelsJsonSchemaNSFW)
 	} else {
-		return LabelsDefault
+		return json.RawMessage(LabelsJsonSchemaDefault)
+	}
+}
+
+// LabelsJson returns the canonical label JSON string for Ollama vision models.
+// The result is a plain JSON example document rather than a JSON Schema:
+// LabelsJsonNSFW if nsfw is true, and LabelsJsonDefault otherwise.
+//
+// Related documentation and references:
+// - https://www.alibabacloud.com/help/en/model-studio/json-mode
+// - https://www.json.org/json-en.html
+func LabelsJson(nsfw bool) string {
+	if nsfw {
+		return LabelsJsonNSFW
+	} else {
+		return LabelsJsonDefault
 	}
 }
--- a/internal/ai/vision/schema/name.go
+++ b/internal/ai/vision/schema/name.go
@@ -0,0 +1,36 @@
+package schema
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+
+	"github.com/photoprism/photoprism/pkg/clean"
+)
+
+const (
+	// NamePrefix is the common prefix of all schema names returned by JsonSchemaName.
+	NamePrefix = "photoprism_vision"
+)
+
+// JsonSchemaName returns the versioned schema name to be used for API requests,
+// formatted as "<NamePrefix>_<kind>_<version>".
+//
+// The kind is determined by a substring search of the raw schema: "labels" if it
+// contains "labels", otherwise "caption" if it contains "caption", and the generic
+// "schema" if neither matches. The version is normalized with
+// clean.TypeLowerUnderscore and defaults to "v1" if the result is empty.
+func JsonSchemaName(schema json.RawMessage, version string) string {
+	var schemaName string
+
+	// Cases are evaluated top to bottom, so "labels" takes precedence
+	// if a schema happens to mention both keywords.
+	switch {
+	case bytes.Contains(schema, []byte("labels")):
+		schemaName = "labels"
+	case bytes.Contains(schema, []byte("caption")):
+		schemaName = "caption"
+	default:
+		schemaName = "schema"
+	}
+
+	version = clean.TypeLowerUnderscore(version)
+
+	if version == "" {
+		version = "v1"
+	}
+
+	return fmt.Sprintf("%s_%s_%s", NamePrefix, schemaName, version)
+}
--- a/internal/ai/vision/schema/name_test.go
+++ b/internal/ai/vision/schema/name_test.go
@@ -0,0 +1,23 @@
+package schema
+
+import (
+	"encoding/json"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestJsonSchemaName verifies the names generated for the generic fallback,
+// for schemas that mention labels, and for an explicitly provided version.
+func TestJsonSchemaName(t *testing.T) {
+	t.Run("Default", func(t *testing.T) {
+		assert.Equal(t, "photoprism_vision_schema_v1", JsonSchemaName(nil, ""))
+	})
+	t.Run("Labels", func(t *testing.T) {
+		assert.Equal(t, "photoprism_vision_labels_v1", JsonSchemaName(json.RawMessage(LabelsJsonSchemaDefault), ""))
+	})
+	// This case passes version "v2", so the subtest is named accordingly.
+	t.Run("LabelsV2", func(t *testing.T) {
+		assert.Equal(t, "photoprism_vision_labels_v2", JsonSchemaName([]byte("labels"), "v2"))
+	})
+	t.Run("LabelsJsonSchema", func(t *testing.T) {
+		assert.Equal(t, "photoprism_vision_labels_v1", JsonSchemaName(LabelsJsonSchema(false), "v1"))
+	})
+}
--- a/internal/ai/vision/schema/schema.go
+++ b/internal/ai/vision/schema/schema.go
@@ -1,5 +1,5 @@
 /*
-Package schema defines canonical JSON schema templates shared by PhotoPrism's AI vision engines.
+Package schema defines canonical JSON and JSON Schema templates shared by PhotoPrism's AI vision engines.

 Copyright (c) 2018 - 2025 PhotoPrism UG. All rights reserved.

--- a/internal/config/feat/vision.go
+++ b/internal/config/feat/vision.go
@@ -4,5 +4,5 @@ package feat
 var (
 	VisionModelGenerate = false // controls exposure of the generate endpoint and CLI commands
 	VisionModelMarkers  = false // gates marker generation/return until downstream UI and reconciliation paths are ready
-	VisionServiceOpenAI = false // controls whether users are able to configure OpenAI as a vision service engine
+	VisionServiceOpenAI = true  // controls whether users are able to configure OpenAI as a vision service engine
 )
--- a/internal/workers/vision.go
+++ b/internal/workers/vision.go
@@ -135,6 +135,7 @@ func (w *Vision) Start(filter string, count int, models []string, customSrc stri
 	done := make(map[string]bool)
 	offset := 0
 	updated := 0
+	processed := 0

 	// Make sure count is within
 	if count < 1 || count > search.MaxResults {
@@ -197,6 +198,8 @@ func (w *Vision) Start(filter string, count int, models []string, customSrc stri
 			continue
 		}

+		processed++
+
 		fileName := photoprism.FileName(photo.FileRoot, photo.FileName)
 		file, fileErr := photoprism.NewMediaFile(fileName)

@@ -279,7 +282,18 @@ func (w *Vision) Start(filter string, count int, models []string, customSrc stri
 		}
 	}

-	log.Infof("vision: updated %s [%s]", english.Plural(updated, "picture", "pictures"), time.Since(start))
+	elapsed := time.Since(start)
+
+	switch {
+	case processed == 0:
+		log.Infof("vision: no pictures required processing [%s]", elapsed)
+	case updated == processed:
+		log.Infof("vision: updated %s [%s]", english.Plural(updated, "picture", "pictures"), elapsed)
+	case updated == 0:
+		log.Infof("vision: processed %s (no metadata changes detected) [%s]", english.Plural(processed, "picture", "pictures"), elapsed)
+	default:
+		log.Infof("vision: updated %s out of %s [%s]", english.Plural(updated, "picture", "pictures"), english.Plural(processed, "picture", "pictures"), elapsed)
+	}

 	if updated > 0 {
 		updateIndex = true