photoprism/internal/ai/vision/engine_openai.go

package vision

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"strings"

	"github.com/photoprism/photoprism/internal/ai/vision/openai"
	"github.com/photoprism/photoprism/internal/entity"
	"github.com/photoprism/photoprism/pkg/clean"
	"github.com/photoprism/photoprism/pkg/http/scheme"
)

// openaiDefaults provides canned prompts, schema templates, and options for OpenAI engines.
type openaiDefaults struct{}

// openaiBuilder prepares ApiRequest objects for OpenAI's Responses API.
type openaiBuilder struct{}

// openaiParser converts Responses API payloads into ApiResponse instances.
type openaiParser struct{}

func init() {
	RegisterEngine(ApiFormatOpenAI, Engine{
		Builder:  openaiBuilder{},
		Parser:   openaiParser{},
		Defaults: openaiDefaults{},
	})
}

// SystemPrompt returns the default OpenAI system prompt for the specified model type.
func (openaiDefaults) SystemPrompt(model *Model) string {
	if model == nil {
		return ""
	}
	switch model.Type {
	case ModelTypeCaption:
		return openai.CaptionSystem
	case ModelTypeLabels:
		return openai.LabelSystem
	default:
		return ""
	}
}

// UserPrompt returns the default OpenAI user prompt for the specified model type.
func (openaiDefaults) UserPrompt(model *Model) string {
	if model == nil {
		return ""
	}
	switch model.Type {
	case ModelTypeCaption:
		return openai.CaptionPrompt
	case ModelTypeLabels:
		if DetectNSFWLabels {
			return openai.LabelPromptNSFW
		}
		return openai.LabelPromptDefault
	default:
		return ""
	}
}

// SchemaTemplate returns the JSON schema template for the model, if applicable.
func (openaiDefaults) SchemaTemplate(model *Model) string {
	if model == nil {
		return ""
	}
	switch model.Type {
	case ModelTypeLabels:
		return string(openai.SchemaLabels(model.PromptContains("nsfw")))
	default:
		return ""
	}
}

// Options returns default OpenAI request options for the model.
func (openaiDefaults) Options(model *Model) *ApiRequestOptions {
	if model == nil {
		return nil
	}
	switch model.Type {
	case ModelTypeCaption:
		/*
			Options:
				Detail:          low
				MaxOutputTokens: 512
				Temperature:     0.1
				TopP:            0.9

			(Sampling values are zeroed for GPT-5 models in openaiBuilder.Build.)
		*/
		return &ApiRequestOptions{
			Detail:          openai.DefaultDetail,
			MaxOutputTokens: openai.CaptionMaxTokens,
			Temperature:     openai.DefaultTemperature,
			TopP:            openai.DefaultTopP,
		}
	case ModelTypeLabels:
		/*
			Options:
				Detail:          low
				MaxOutputTokens: 1024
				Temperature:     0.1
				TopP:            0.9
				ForceJson:       true
				SchemaVersion:   "photoprism_vision_labels_v1"

			(Sampling values are zeroed for GPT-5 models in openaiBuilder.Build.)
		*/
		return &ApiRequestOptions{
			Detail:          openai.DefaultDetail,
			MaxOutputTokens: openai.LabelsMaxTokens,
			Temperature:     openai.DefaultTemperature,
			TopP:            openai.DefaultTopP,
			ForceJson:       true,
		}
	default:
		return nil
	}
}

// Build constructs an OpenAI request payload using base64-encoded thumbnails.
func (openaiBuilder) Build(ctx context.Context, model *Model, files Files) (*ApiRequest, error) {
	if model == nil {
		return nil, ErrInvalidModel
	}
	dataReq, err := NewApiRequestImages(files, scheme.Data)
	if err != nil {
		return nil, err
	}
	req := &ApiRequest{
		Id:             dataReq.Id,
		Images:         append(Files(nil), dataReq.Images...),
		ResponseFormat: ApiFormatOpenAI,
	}
	if opts := model.GetOptions(); opts != nil {
		req.Options = cloneOptions(opts)
		switch model.Type {
		case ModelTypeCaption:
			// Captions default to plain text responses; structured JSON is optional.
			req.Options.ForceJson = false
			if req.Options.MaxOutputTokens < openai.CaptionMaxTokens {
				req.Options.MaxOutputTokens = openai.CaptionMaxTokens
			}
		case ModelTypeLabels:
			if req.Options.MaxOutputTokens < openai.LabelsMaxTokens {
				req.Options.MaxOutputTokens = openai.LabelsMaxTokens
			}
		}
		// Zero the sampling values for GPT-5 models (see the option defaults above).
		if strings.HasPrefix(strings.ToLower(strings.TrimSpace(model.Name)), "gpt-5") {
			req.Options.Temperature = 0
			req.Options.TopP = 0
		}
	}
	if schema := strings.TrimSpace(model.SchemaTemplate()); schema != "" {
		if raw, parseErr := parseOpenAISchema(schema); parseErr != nil {
			log.Warnf("vision: failed to parse OpenAI schema template (%s)", clean.Error(parseErr))
		} else {
			req.Schema = raw
		}
	}
	return req, nil
}

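// Illustrative note (not part of the original source): the GPT-5 override above is a
// case-insensitive prefix match on the trimmed model name. The model names below are
// assumed examples:
//
//	strings.HasPrefix(strings.ToLower(strings.TrimSpace("GPT-5-Mini")), "gpt-5") // true: sampling values reset
//	strings.HasPrefix(strings.ToLower(strings.TrimSpace("gpt-4o")), "gpt-5")     // false: configured Temperature/TopP kept
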
// Parse converts an OpenAI Responses API payload into the internal ApiResponse representation.
func (openaiParser) Parse(ctx context.Context, req *ApiRequest, raw []byte, status int) (*ApiResponse, error) {
	if status >= 300 {
		if msg := openai.ParseErrorMessage(raw); msg != "" {
			return nil, fmt.Errorf("openai: %s", msg)
		}
		return nil, fmt.Errorf("openai: status %d", status)
	}
	var resp openai.Response
	if err := json.Unmarshal(raw, &resp); err != nil {
		return nil, err
	}
	if resp.Error != nil && resp.Error.Message != "" {
		return nil, errors.New(resp.Error.Message)
	}
	result := ApiResult{}
	if jsonPayload := resp.FirstJSON(); len(jsonPayload) > 0 {
		if err := populateOpenAIJSONResult(&result, jsonPayload); err != nil {
			log.Debugf("vision: %s (parse openai json payload)", clean.Error(err))
		}
	}
	// Fall back to the first text output when no structured caption was extracted.
	if result.Caption == nil {
		if text := resp.FirstText(); text != "" {
			trimmed := strings.TrimSpace(text)
			var parsedJSON bool
			if len(trimmed) > 0 && (trimmed[0] == '{' || trimmed[0] == '[') {
				if err := populateOpenAIJSONResult(&result, json.RawMessage(trimmed)); err != nil {
					log.Debugf("vision: %s (parse openai json text payload)", clean.Error(err))
				} else {
					parsedJSON = true
				}
			}
			if !parsedJSON && trimmed != "" {
				result.Caption = &CaptionResult{
					Text:   trimmed,
					Source: entity.SrcOpenAI,
				}
			}
		}
	}
	var responseID string
	if req != nil {
		responseID = req.GetId()
	}
	modelName := strings.TrimSpace(resp.Model)
	if modelName == "" && req != nil {
		modelName = strings.TrimSpace(req.Model)
	}
	return &ApiResponse{
		Id:     responseID,
		Code:   status,
		Model:  &Model{Name: modelName},
		Result: result,
	}, nil
}

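// Illustrative sketch (not part of the original source): Parse prefers structured JSON
// output, then text that itself looks like JSON, and finally treats the text as a plain
// caption. The payloads below are assumed examples, not recorded API responses:
//
//	{"caption": {"text": "A dog on a beach", "confidence": 0.9}}   // decoded via populateOpenAIJSONResult
//	A dog running along a sandy beach at sunset.                   // stored as a plain-text CaptionResult
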
// parseOpenAISchema validates the provided JSON schema and returns it as a raw message.
func parseOpenAISchema(schema string) (json.RawMessage, error) {
	var raw json.RawMessage
	if err := json.Unmarshal([]byte(schema), &raw); err != nil {
		return nil, err
	}
	return normalizeOpenAISchema(raw)
}

// normalizeOpenAISchema upgrades legacy label schema definitions so they comply with
// OpenAI's json_schema format requirements.
func normalizeOpenAISchema(raw json.RawMessage) (json.RawMessage, error) {
	if len(raw) == 0 {
		return raw, nil
	}
	var doc map[string]any
	if err := json.Unmarshal(raw, &doc); err != nil {
		// Fall back to the original payload if it isn't a JSON object.
		return raw, nil
	}
	if t, ok := doc["type"]; ok {
		if typeStr, ok := t.(string); ok && strings.TrimSpace(typeStr) != "" {
			return raw, nil
		}
	}
	if _, ok := doc["properties"]; ok {
		return raw, nil
	}
	labels, ok := doc["labels"]
	if !ok {
		return raw, nil
	}
	// Detect whether the legacy example payload includes NSFW fields.
	nsfw := false
	if items, ok := labels.([]any); ok && len(items) > 0 {
		if first, ok := items[0].(map[string]any); ok {
			if _, hasNSFW := first["nsfw"]; hasNSFW {
				nsfw = true
			}
			if _, hasNSFWConfidence := first["nsfw_confidence"]; hasNSFWConfidence {
				nsfw = true
			}
		}
	}
	return openai.SchemaLabels(nsfw), nil
}

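// Illustrative sketch (not part of the original source): a legacy schema template of the
// form below has no "type" or "properties" keys, only a "labels" example array, so
// normalizeOpenAISchema replaces it with openai.SchemaLabels(true) because the first
// entry carries an "nsfw" field. The payload is an assumed example:
//
//	{"labels": [{"name": "cat", "confidence": 0.9, "nsfw": false}]}
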
// populateOpenAIJSONResult unmarshals a structured OpenAI response into ApiResult fields.
func populateOpenAIJSONResult(result *ApiResult, payload json.RawMessage) error {
	if result == nil || len(payload) == 0 {
		return nil
	}
	var envelope struct {
		Caption *struct {
			Text       string  `json:"text"`
			Confidence float32 `json:"confidence"`
		} `json:"caption"`
		Labels []LabelResult `json:"labels"`
	}
	if err := json.Unmarshal(payload, &envelope); err != nil {
		return err
	}
	if envelope.Caption != nil {
		text := strings.TrimSpace(envelope.Caption.Text)
		if text != "" {
			result.Caption = &CaptionResult{
				Text:       text,
				Confidence: envelope.Caption.Confidence,
				Source:     entity.SrcOpenAI,
			}
		}
	}
	if len(envelope.Labels) > 0 {
		// Filter the labels in place, dropping entries without a usable name.
		filtered := envelope.Labels[:0]
		for i := range envelope.Labels {
			if envelope.Labels[i].Source == "" {
				envelope.Labels[i].Source = entity.SrcOpenAI
			}
			normalizeLabelResult(&envelope.Labels[i])
			if envelope.Labels[i].Name == "" {
				continue
			}
			filtered = append(filtered, envelope.Labels[i])
		}
		result.Labels = append(result.Labels, filtered...)
	}
	return nil
}
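
// Illustrative sketch (not part of the original source): the envelope shape that
// populateOpenAIJSONResult expects. The caption keys match the struct tags above; the
// label keys are assumed to follow LabelResult's JSON tags and are shown only as an
// example:
//
//	{
//		"caption": {"text": "A dog on a beach", "confidence": 0.92},
//		"labels":  [{"name": "dog", "confidence": 0.98}]
//	}
//
// Labels without a Source are attributed to entity.SrcOpenAI, normalized via
// normalizeLabelResult, and dropped when no usable Name remains.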