mirror of
https://github.com/photoprism/photoprism.git
synced 2025-12-12 00:34:13 +01:00
345 lines
8.3 KiB
Go
345 lines
8.3 KiB
Go
package vision
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"strings"
|
|
|
|
"github.com/photoprism/photoprism/internal/ai/vision/openai"
|
|
"github.com/photoprism/photoprism/internal/entity"
|
|
"github.com/photoprism/photoprism/pkg/clean"
|
|
"github.com/photoprism/photoprism/pkg/http/scheme"
|
|
)
|
|
|
|
// openaiDefaults provides canned prompts, schema templates, and options for OpenAI engines.
|
|
type openaiDefaults struct{}
|
|
|
|
// openaiBuilder prepares ApiRequest objects for OpenAI's Responses API.
|
|
type openaiBuilder struct{}
|
|
|
|
// openaiParser converts Responses API payloads into ApiResponse instances.
|
|
type openaiParser struct{}
|
|
|
|
func init() {
|
|
RegisterEngine(ApiFormatOpenAI, Engine{
|
|
Builder: openaiBuilder{},
|
|
Parser: openaiParser{},
|
|
Defaults: openaiDefaults{},
|
|
})
|
|
}
|
|
|
|
// SystemPrompt returns the default OpenAI system prompt for the specified model type.
|
|
func (openaiDefaults) SystemPrompt(model *Model) string {
|
|
if model == nil {
|
|
return ""
|
|
}
|
|
|
|
switch model.Type {
|
|
case ModelTypeCaption:
|
|
return openai.CaptionSystem
|
|
case ModelTypeLabels:
|
|
return openai.LabelSystem
|
|
default:
|
|
return ""
|
|
}
|
|
}
|
|
|
|
// UserPrompt returns the default OpenAI user prompt for the specified model type.
|
|
func (openaiDefaults) UserPrompt(model *Model) string {
|
|
if model == nil {
|
|
return ""
|
|
}
|
|
|
|
switch model.Type {
|
|
case ModelTypeCaption:
|
|
return openai.CaptionPrompt
|
|
case ModelTypeLabels:
|
|
if DetectNSFWLabels {
|
|
return openai.LabelPromptNSFW
|
|
}
|
|
return openai.LabelPromptDefault
|
|
default:
|
|
return ""
|
|
}
|
|
}
|
|
|
|
// SchemaTemplate returns the JSON schema template for the model, if applicable.
|
|
func (openaiDefaults) SchemaTemplate(model *Model) string {
|
|
if model == nil {
|
|
return ""
|
|
}
|
|
|
|
switch model.Type {
|
|
case ModelTypeLabels:
|
|
return string(openai.SchemaLabels(model.PromptContains("nsfw")))
|
|
default:
|
|
return ""
|
|
}
|
|
}
|
|
|
|
// Options returns default OpenAI request options for the model.
|
|
func (openaiDefaults) Options(model *Model) *ApiRequestOptions {
|
|
if model == nil {
|
|
return nil
|
|
}
|
|
|
|
switch model.Type {
|
|
case ModelTypeCaption:
|
|
/*
|
|
Options:
|
|
Detail: low
|
|
MaxOutputTokens: 512
|
|
Temperature: 0.1
|
|
TopP: 0.9
|
|
(Sampling values are zeroed for GPT-5 models in openaiBuilder.Build.)
|
|
*/
|
|
return &ApiRequestOptions{
|
|
Detail: openai.DefaultDetail,
|
|
MaxOutputTokens: openai.CaptionMaxTokens,
|
|
Temperature: openai.DefaultTemperature,
|
|
TopP: openai.DefaultTopP,
|
|
}
|
|
case ModelTypeLabels:
|
|
/*
|
|
Options:
|
|
Detail: low
|
|
MaxOutputTokens: 1024
|
|
Temperature: 0.1
|
|
ForceJson: true
|
|
SchemaVersion: "photoprism_vision_labels_v1"
|
|
(Sampling values are zeroed for GPT-5 models in openaiBuilder.Build.)
|
|
*/
|
|
return &ApiRequestOptions{
|
|
Detail: openai.DefaultDetail,
|
|
MaxOutputTokens: openai.LabelsMaxTokens,
|
|
Temperature: openai.DefaultTemperature,
|
|
TopP: openai.DefaultTopP,
|
|
ForceJson: true,
|
|
}
|
|
default:
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// Build constructs an OpenAI request payload using base64-encoded thumbnails.
|
|
func (openaiBuilder) Build(ctx context.Context, model *Model, files Files) (*ApiRequest, error) {
|
|
if model == nil {
|
|
return nil, ErrInvalidModel
|
|
}
|
|
|
|
dataReq, err := NewApiRequestImages(files, scheme.Data)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
req := &ApiRequest{
|
|
Id: dataReq.Id,
|
|
Images: append(Files(nil), dataReq.Images...),
|
|
ResponseFormat: ApiFormatOpenAI,
|
|
}
|
|
|
|
if opts := model.GetOptions(); opts != nil {
|
|
req.Options = cloneOptions(opts)
|
|
|
|
switch model.Type {
|
|
case ModelTypeCaption:
|
|
// Captions default to plain text responses; structured JSON is optional.
|
|
req.Options.ForceJson = false
|
|
if req.Options.MaxOutputTokens < openai.CaptionMaxTokens {
|
|
req.Options.MaxOutputTokens = openai.CaptionMaxTokens
|
|
}
|
|
case ModelTypeLabels:
|
|
if req.Options.MaxOutputTokens < openai.LabelsMaxTokens {
|
|
req.Options.MaxOutputTokens = openai.LabelsMaxTokens
|
|
}
|
|
}
|
|
|
|
if strings.HasPrefix(strings.ToLower(strings.TrimSpace(model.Name)), "gpt-5") {
|
|
req.Options.Temperature = 0
|
|
req.Options.TopP = 0
|
|
}
|
|
}
|
|
|
|
if schema := strings.TrimSpace(model.SchemaTemplate()); schema != "" {
|
|
if raw, parseErr := parseOpenAISchema(schema); parseErr != nil {
|
|
log.Warnf("vision: failed to parse OpenAI schema template (%s)", clean.Error(parseErr))
|
|
} else {
|
|
req.Schema = raw
|
|
}
|
|
}
|
|
|
|
return req, nil
|
|
}
|
|
|
|
// Parse converts an OpenAI Responses API payload into the internal ApiResponse representation.
|
|
func (openaiParser) Parse(ctx context.Context, req *ApiRequest, raw []byte, status int) (*ApiResponse, error) {
|
|
if status >= 300 {
|
|
if msg := openai.ParseErrorMessage(raw); msg != "" {
|
|
return nil, fmt.Errorf("openai: %s", msg)
|
|
}
|
|
return nil, fmt.Errorf("openai: status %d", status)
|
|
}
|
|
|
|
var resp openai.Response
|
|
if err := json.Unmarshal(raw, &resp); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if resp.Error != nil && resp.Error.Message != "" {
|
|
return nil, errors.New(resp.Error.Message)
|
|
}
|
|
|
|
result := ApiResult{}
|
|
if jsonPayload := resp.FirstJSON(); len(jsonPayload) > 0 {
|
|
if err := populateOpenAIJSONResult(&result, jsonPayload); err != nil {
|
|
log.Debugf("vision: %s (parse openai json payload)", clean.Error(err))
|
|
}
|
|
}
|
|
|
|
if result.Caption == nil {
|
|
if text := resp.FirstText(); text != "" {
|
|
trimmed := strings.TrimSpace(text)
|
|
var parsedJSON bool
|
|
|
|
if len(trimmed) > 0 && (trimmed[0] == '{' || trimmed[0] == '[') {
|
|
if err := populateOpenAIJSONResult(&result, json.RawMessage(trimmed)); err != nil {
|
|
log.Debugf("vision: %s (parse openai json text payload)", clean.Error(err))
|
|
} else {
|
|
parsedJSON = true
|
|
}
|
|
}
|
|
|
|
if !parsedJSON && trimmed != "" {
|
|
result.Caption = &CaptionResult{
|
|
Text: trimmed,
|
|
Source: entity.SrcOpenAI,
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
var responseID string
|
|
if req != nil {
|
|
responseID = req.GetId()
|
|
}
|
|
|
|
modelName := strings.TrimSpace(resp.Model)
|
|
if modelName == "" && req != nil {
|
|
modelName = strings.TrimSpace(req.Model)
|
|
}
|
|
|
|
return &ApiResponse{
|
|
Id: responseID,
|
|
Code: status,
|
|
Model: &Model{Name: modelName},
|
|
Result: result,
|
|
}, nil
|
|
}
|
|
|
|
// parseOpenAISchema validates the provided JSON schema and returns it as a raw message.
|
|
func parseOpenAISchema(schema string) (json.RawMessage, error) {
|
|
var raw json.RawMessage
|
|
if err := json.Unmarshal([]byte(schema), &raw); err != nil {
|
|
return nil, err
|
|
}
|
|
return normalizeOpenAISchema(raw)
|
|
}
|
|
|
|
// normalizeOpenAISchema upgrades legacy label schema definitions so they comply with
|
|
// OpenAI's json_schema format requirements.
|
|
func normalizeOpenAISchema(raw json.RawMessage) (json.RawMessage, error) {
|
|
if len(raw) == 0 {
|
|
return raw, nil
|
|
}
|
|
|
|
var doc map[string]any
|
|
if err := json.Unmarshal(raw, &doc); err != nil {
|
|
// Fallback to the original payload if it isn't a JSON object.
|
|
return raw, nil
|
|
}
|
|
|
|
if t, ok := doc["type"]; ok {
|
|
if typeStr, ok := t.(string); ok && strings.TrimSpace(typeStr) != "" {
|
|
return raw, nil
|
|
}
|
|
}
|
|
|
|
if _, ok := doc["properties"]; ok {
|
|
return raw, nil
|
|
}
|
|
|
|
labels, ok := doc["labels"]
|
|
if !ok {
|
|
return raw, nil
|
|
}
|
|
|
|
nsfw := false
|
|
|
|
if items, ok := labels.([]any); ok && len(items) > 0 {
|
|
if first, ok := items[0].(map[string]any); ok {
|
|
if _, hasNSFW := first["nsfw"]; hasNSFW {
|
|
nsfw = true
|
|
}
|
|
if _, hasNSFWConfidence := first["nsfw_confidence"]; hasNSFWConfidence {
|
|
nsfw = true
|
|
}
|
|
}
|
|
}
|
|
|
|
return openai.SchemaLabels(nsfw), nil
|
|
}
|
|
|
|
// populateOpenAIJSONResult unmarshals a structured OpenAI response into ApiResult fields.
|
|
func populateOpenAIJSONResult(result *ApiResult, payload json.RawMessage) error {
|
|
if result == nil || len(payload) == 0 {
|
|
return nil
|
|
}
|
|
|
|
var envelope struct {
|
|
Caption *struct {
|
|
Text string `json:"text"`
|
|
Confidence float32 `json:"confidence"`
|
|
} `json:"caption"`
|
|
Labels []LabelResult `json:"labels"`
|
|
}
|
|
|
|
if err := json.Unmarshal(payload, &envelope); err != nil {
|
|
return err
|
|
}
|
|
|
|
if envelope.Caption != nil {
|
|
text := strings.TrimSpace(envelope.Caption.Text)
|
|
if text != "" {
|
|
result.Caption = &CaptionResult{
|
|
Text: text,
|
|
Confidence: envelope.Caption.Confidence,
|
|
Source: entity.SrcOpenAI,
|
|
}
|
|
}
|
|
}
|
|
|
|
if len(envelope.Labels) > 0 {
|
|
filtered := envelope.Labels[:0]
|
|
|
|
for i := range envelope.Labels {
|
|
if envelope.Labels[i].Source == "" {
|
|
envelope.Labels[i].Source = entity.SrcOpenAI
|
|
}
|
|
|
|
normalizeLabelResult(&envelope.Labels[i])
|
|
|
|
if envelope.Labels[i].Name == "" {
|
|
continue
|
|
}
|
|
|
|
filtered = append(filtered, envelope.Labels[i])
|
|
}
|
|
|
|
result.Labels = append(result.Labels, filtered...)
|
|
}
|
|
|
|
return nil
|
|
}
|