AI: Generate Captions & Labels using the OpenAI Responses API #5322

Signed-off-by: Michael Mayer <michael@photoprism.app>
This commit is contained in:
Michael Mayer
2025-11-14 11:10:34 +01:00
parent 46d5e33c8c
commit d76acdb69f
28 changed files with 1822 additions and 127 deletions

View File

@@ -11,6 +11,8 @@ import (
"github.com/sirupsen/logrus"
"github.com/photoprism/photoprism/internal/ai/vision/openai"
"github.com/photoprism/photoprism/internal/ai/vision/schema"
"github.com/photoprism/photoprism/internal/api/download"
"github.com/photoprism/photoprism/pkg/clean"
"github.com/photoprism/photoprism/pkg/fs"
@@ -58,6 +60,11 @@ type ApiRequestOptions struct {
UseMmap bool `yaml:"UseMmap,omitempty" json:"use_mmap,omitempty"`
UseMlock bool `yaml:"UseMlock,omitempty" json:"use_mlock,omitempty"`
NumThread int `yaml:"NumThread,omitempty" json:"num_thread,omitempty"`
MaxOutputTokens int `yaml:"MaxOutputTokens,omitempty" json:"max_output_tokens,omitempty"`
Detail string `yaml:"Detail,omitempty" json:"detail,omitempty"`
ForceJson bool `yaml:"ForceJson,omitempty" json:"force_json,omitempty"`
SchemaVersion string `yaml:"SchemaVersion,omitempty" json:"schema_version,omitempty"`
CombineOutputs string `yaml:"CombineOutputs,omitempty" json:"combine_outputs,omitempty"`
}
// ApiRequestContext represents a context parameter returned from a previous request.
@@ -77,6 +84,7 @@ type ApiRequest struct {
Context *ApiRequestContext `form:"context" yaml:"Context,omitempty" json:"context,omitempty"`
Stream bool `form:"stream" yaml:"Stream,omitempty" json:"stream"`
Images Files `form:"images" yaml:"Images,omitempty" json:"images,omitempty"`
Schema json.RawMessage `form:"schema" yaml:"Schema,omitempty" json:"schema,omitempty"`
ResponseFormat ApiFormat `form:"-" yaml:"-" json:"-"`
}
@@ -195,6 +203,14 @@ func (r *ApiRequest) GetResponseFormat() ApiFormat {
// JSON returns the request data as JSON-encoded bytes.
func (r *ApiRequest) JSON() ([]byte, error) {
if r == nil {
return nil, errors.New("api request is nil")
}
if r.ResponseFormat == ApiFormatOpenAI {
return r.openAIJSON()
}
return json.Marshal(*r)
}
@@ -229,6 +245,8 @@ func (r *ApiRequest) sanitizedForLog() ApiRequest {
sanitized.Url = sanitizeLogPayload(r.Url)
sanitized.Schema = r.Schema
return sanitized
}
@@ -287,3 +305,134 @@ func isLikelyBase64(value string) bool {
return true
}
// openAIJSON converts the request data into an OpenAI Responses API payload.
func (r *ApiRequest) openAIJSON() ([]byte, error) {
detail := openai.DefaultDetail
if opts := r.Options; opts != nil && strings.TrimSpace(opts.Detail) != "" {
detail = strings.TrimSpace(opts.Detail)
}
messages := make([]openai.InputMessage, 0, 2)
if system := strings.TrimSpace(r.System); system != "" {
messages = append(messages, openai.InputMessage{
Role: "system",
Type: "message",
Content: []openai.ContentItem{
{
Type: openai.ContentTypeText,
Text: system,
},
},
})
}
userContent := make([]openai.ContentItem, 0, len(r.Images)+1)
if prompt := strings.TrimSpace(r.Prompt); prompt != "" {
userContent = append(userContent, openai.ContentItem{
Type: openai.ContentTypeText,
Text: prompt,
})
}
for _, img := range r.Images {
if img == "" {
continue
}
userContent = append(userContent, openai.ContentItem{
Type: openai.ContentTypeImage,
ImageURL: img,
Detail: detail,
})
}
if len(userContent) > 0 {
messages = append(messages, openai.InputMessage{
Role: "user",
Type: "message",
Content: userContent,
})
}
if len(messages) == 0 {
return nil, errors.New("openai request requires at least one message")
}
payload := openai.HTTPRequest{
Model: strings.TrimSpace(r.Model),
Input: messages,
}
if payload.Model == "" {
payload.Model = openai.DefaultModel
}
if strings.HasPrefix(strings.ToLower(payload.Model), "gpt-5") {
payload.Reasoning = &openai.Reasoning{Effort: "low"}
}
if opts := r.Options; opts != nil {
if opts.MaxOutputTokens > 0 {
payload.MaxOutputTokens = opts.MaxOutputTokens
}
if opts.Temperature > 0 {
payload.Temperature = opts.Temperature
}
if opts.TopP > 0 {
payload.TopP = opts.TopP
}
if opts.PresencePenalty != 0 {
payload.PresencePenalty = opts.PresencePenalty
}
if opts.FrequencyPenalty != 0 {
payload.FrequencyPenalty = opts.FrequencyPenalty
}
}
if format := buildOpenAIResponseFormat(r); format != nil {
payload.Text = &openai.TextOptions{
Format: format,
}
}
return json.Marshal(payload)
}
// buildOpenAIResponseFormat determines which response_format to send to OpenAI.
func buildOpenAIResponseFormat(r *ApiRequest) *openai.ResponseFormat {
if r == nil {
return nil
}
opts := r.Options
hasSchema := len(r.Schema) > 0
if !hasSchema && (opts == nil || !opts.ForceJson) {
return nil
}
result := &openai.ResponseFormat{}
if hasSchema {
result.Type = openai.ResponseFormatJSONSchema
result.Schema = r.Schema
if opts != nil && strings.TrimSpace(opts.SchemaVersion) != "" {
result.Name = strings.TrimSpace(opts.SchemaVersion)
} else {
result.Name = schema.JsonSchemaName(r.Schema, openai.DefaultSchemaVersion)
}
} else {
result.Type = openai.ResponseFormatJSONObject
}
return result
}