AI: Update photoprism vision defaults for captioning #3438 #5011

Signed-off-by: Michael Mayer <michael@photoprism.app>
This commit is contained in:
Michael Mayer
2025-07-15 00:51:46 +02:00
parent e029a64632
commit ff229e1bd0
9 changed files with 81 additions and 28 deletions

View File

@@ -129,3 +129,13 @@ func NewLabelsResponse(id string, model *Model, results classify.Labels) ApiResp
Result: ApiResult{Labels: labels}, Result: ApiResult{Labels: labels},
} }
} }
// NewCaptionResponse generates a new Vision API image caption service response.
//
// The returned ApiResponse echoes the sanitized request id, reports HTTP 200,
// and embeds the model metadata (name, version, resolution) alongside the
// caption result.
func NewCaptionResponse(id string, model *Model, result *CaptionResult) ApiResponse {
	return ApiResponse{
		Id:   clean.Type(id),
		Code: http.StatusOK,
		// Bug fix: this is a caption response, so the model type must be
		// ModelTypeCaption, not ModelTypeLabels (copy-paste from NewLabelsResponse).
		Model:  &Model{Type: ModelTypeCaption, Name: model.Name, Version: model.Version, Resolution: model.Resolution},
		Result: ApiResult{Caption: result},
	}
}

View File

@@ -7,19 +7,29 @@ import (
"github.com/photoprism/photoprism/pkg/media" "github.com/photoprism/photoprism/pkg/media"
) )
// CaptionPromptDefault is the default prompt used to generate captions.
// It requests a short (up to three sentences), natural-sounding description
// and explicitly discourages meta-language such as "this picture" or
// "a picture of" so captions read well in a photo management application.
var CaptionPromptDefault = `Create a caption that sounds natural and briefly describes the main content of the image in up to` +
` three sentences for use in a photo management application. Begin with the type or number of subjects and` +
` action. Omit text formatting and avoid meta-language such as "this picture", "the picture", "the photo",` +
` "there are", "here is", or "a picture of". Use explicit language to describe the scene if it helps to` +
` properly understand the picture.`
// CaptionModelDefault is the default model used to generate captions.
// NOTE(review): this name should stay in sync with the caption model entry
// in the service configuration YAML ("qwen2.5vl") — confirm when changing.
var CaptionModelDefault = "qwen2.5vl"
// Caption returns generated captions for the specified images. // Caption returns generated captions for the specified images.
func Caption(imgName string, src media.Src) (result CaptionResult, err error) { func Caption(images Files, src media.Src) (result *CaptionResult, model *Model, err error) {
// Return if there is no configuration or no image classification models are configured. // Return if there is no configuration or no image classification models are configured.
if Config == nil { if Config == nil {
return result, errors.New("vision service is not configured") return result, model, errors.New("vision service is not configured")
} else if model := Config.Model(ModelTypeCaption); model != nil { } else if model = Config.Model(ModelTypeCaption); model != nil {
// Use remote service API if a server endpoint has been configured. // Use remote service API if a server endpoint has been configured.
if uri, method := model.Endpoint(); uri != "" && method != "" { if uri, method := model.Endpoint(); uri != "" && method != "" {
var apiRequest *ApiRequest var apiRequest *ApiRequest
var apiResponse *ApiResponse var apiResponse *ApiResponse
if apiRequest, err = NewApiRequest(model.EndpointRequestFormat(), Files{imgName}, model.EndpointFileScheme()); err != nil { if apiRequest, err = NewApiRequest(model.EndpointRequestFormat(), images, model.EndpointFileScheme()); err != nil {
return result, err return result, model, err
} }
if model.Name != "" { if model.Name != "" {
@@ -28,19 +38,23 @@ func Caption(imgName string, src media.Src) (result CaptionResult, err error) {
if model.Version != "" { if model.Version != "" {
apiRequest.Version = model.Version apiRequest.Version = model.Version
} else {
apiRequest.Version = "latest"
} }
if model.Prompt != "" { if model.Prompt != "" {
apiRequest.Prompt = model.Prompt apiRequest.Prompt = model.Prompt
} else {
apiRequest.Prompt = CaptionPromptDefault
} }
// Log JSON request data in trace mode. // Log JSON request data in trace mode.
apiRequest.WriteLog() apiRequest.WriteLog()
if apiResponse, err = PerformApiRequest(apiRequest, uri, method, model.EndpointKey()); err != nil { if apiResponse, err = PerformApiRequest(apiRequest, uri, method, model.EndpointKey()); err != nil {
return result, err return result, model, err
} else if apiResponse.Result.Caption == nil { } else if apiResponse.Result.Caption == nil {
return result, errors.New("invalid caption model response") return result, model, errors.New("invalid caption model response")
} }
// Set image as the default caption source. // Set image as the default caption source.
@@ -48,13 +62,13 @@ func Caption(imgName string, src media.Src) (result CaptionResult, err error) {
apiResponse.Result.Caption.Source = entity.SrcImage apiResponse.Result.Caption.Source = entity.SrcImage
} }
result = *apiResponse.Result.Caption result = apiResponse.Result.Caption
} else { } else {
return result, errors.New("invalid caption model configuration") return result, model, errors.New("invalid caption model configuration")
} }
} else { } else {
return result, errors.New("missing caption model") return result, model, errors.New("missing caption model")
} }
return result, nil return result, model, nil
} }

View File

@@ -20,9 +20,10 @@ func TestCaption(t *testing.T) {
t.Run("Success", func(t *testing.T) { t.Run("Success", func(t *testing.T) {
expectedText := "An image of sound waves" expectedText := "An image of sound waves"
result, err := Caption("https://dl.photoprism.app/img/artwork/colorwaves-400.jpg", media.SrcRemote) result, model, err := Caption(Files{"https://dl.photoprism.app/img/artwork/colorwaves-400.jpg"}, media.SrcRemote)
assert.NoError(t, err) assert.NoError(t, err)
assert.NotNil(t, model)
assert.IsType(t, CaptionResult{}, result) assert.IsType(t, CaptionResult{}, result)
assert.LessOrEqual(t, float32(0.0), result.Confidence) assert.LessOrEqual(t, float32(0.0), result.Confidence)
@@ -31,9 +32,10 @@ func TestCaption(t *testing.T) {
assert.Equal(t, expectedText, result.Text) assert.Equal(t, expectedText, result.Text)
}) })
t.Run("Invalid", func(t *testing.T) { t.Run("Invalid", func(t *testing.T) {
result, err := Caption("", media.SrcLocal) result, model, err := Caption(nil, media.SrcLocal)
assert.Error(t, err) assert.Error(t, err)
assert.Nil(t, model)
assert.IsType(t, CaptionResult{}, result) assert.IsType(t, CaptionResult{}, result)
assert.Equal(t, "", result.Text) assert.Equal(t, "", result.Text)
assert.Equal(t, float32(0.0), result.Confidence) assert.Equal(t, float32(0.0), result.Confidence)

View File

@@ -14,6 +14,9 @@ import (
var modelMutex = sync.Mutex{} var modelMutex = sync.Mutex{}
// ModelVersionDefault is the default model version ("latest"), used when a
// model configuration does not specify an explicit version.
var ModelVersionDefault = "latest"
// Model represents a computer vision model configuration. // Model represents a computer vision model configuration.
type Model struct { type Model struct {
Type ModelType `yaml:"Type,omitempty" json:"type,omitempty"` Type ModelType `yaml:"Type,omitempty" json:"type,omitempty"`

View File

@@ -32,8 +32,9 @@ var (
CaptionModel = &Model{ CaptionModel = &Model{
Type: ModelTypeCaption, Type: ModelTypeCaption,
Resolution: 224, Resolution: 224,
Name: "qwen2.5vl", Name: CaptionModelDefault,
Version: "latest", Version: ModelVersionDefault,
Prompt: CaptionPromptDefault,
Service: Service{ Service: Service{
Uri: "http://photoprism-vision:5000/api/v1/vision/caption", Uri: "http://photoprism-vision:5000/api/v1/vision/caption",
Method: http.MethodPost, Method: http.MethodPost,

View File

@@ -18,6 +18,12 @@ Models:
- Type: caption - Type: caption
Name: qwen2.5vl Name: qwen2.5vl
Version: latest Version: latest
Prompt: Create a caption that sounds natural and briefly describes the main content
of the image in up to three sentences for use in a photo management application.
Begin with the type or number of subjects and action. Omit text formatting and
avoid meta-language such as "this picture", "the picture", "the photo", "there
are", "here is", or "a picture of". Use explicit language to describe the scene
if it helps to properly understand the picture.
Resolution: 224 Resolution: 224
Service: Service:
Uri: http://photoprism-vision:5000/api/v1/vision/caption Uri: http://photoprism-vision:5000/api/v1/vision/caption

View File

@@ -8,6 +8,7 @@ import (
"github.com/photoprism/photoprism/internal/ai/vision" "github.com/photoprism/photoprism/internal/ai/vision"
"github.com/photoprism/photoprism/internal/auth/acl" "github.com/photoprism/photoprism/internal/auth/acl"
"github.com/photoprism/photoprism/internal/photoprism/get" "github.com/photoprism/photoprism/internal/photoprism/get"
"github.com/photoprism/photoprism/pkg/media"
"github.com/photoprism/photoprism/pkg/media/http/header" "github.com/photoprism/photoprism/pkg/media/http/header"
) )
@@ -51,18 +52,30 @@ func PostVisionCaption(router *gin.RouterGroup) {
return return
} }
// TODO: Return error code 501 until this service is implemented. // Run inference to generate a caption.
code := http.StatusNotImplemented result, model, err := vision.Caption(request.Images, media.SrcRemote)
// Generate Vision API service response. if err != nil {
response := vision.ApiResponse{ log.Errorf("vision: %s (caption)", err)
Id: request.GetId(), c.JSON(http.StatusBadRequest, vision.NewApiError(request.GetId(), http.StatusBadRequest))
Code: code, return
Error: http.StatusText(http.StatusNotImplemented), } else if model == nil {
Model: &vision.Model{Type: vision.ModelTypeCaption}, log.Errorf("vision: no model specified (caption)")
Result: vision.ApiResult{Caption: &vision.CaptionResult{Text: "This is a test.", Confidence: 0.14159265359}}, c.JSON(http.StatusInternalServerError, vision.NewApiError(request.GetId(), http.StatusInternalServerError))
return
} else if result == nil {
log.Errorf("vision: no result (caption)")
c.JSON(http.StatusInternalServerError, vision.NewApiError(request.GetId(), http.StatusInternalServerError))
return
} }
c.JSON(code, response) // Generate Vision API service response.
response := vision.NewCaptionResponse(
request.GetId(),
&vision.Model{Type: model.Type, Name: model.Name, Version: model.Version},
result,
)
c.JSON(http.StatusOK, response)
}) })
} }

View File

@@ -63,7 +63,11 @@ func PostVisionLabels(router *gin.RouterGroup) {
} }
// Generate Vision API service response. // Generate Vision API service response.
response := vision.NewLabelsResponse(request.GetId(), vision.NasnetModel, labels) response := vision.NewLabelsResponse(
request.GetId(),
&vision.Model{Type: vision.ModelTypeLabels},
labels,
)
c.JSON(http.StatusOK, response) c.JSON(http.StatusOK, response)
}) })

View File

@@ -9,7 +9,7 @@ import (
) )
// Caption returns generated caption for the specified media file. // Caption returns generated caption for the specified media file.
func (ind *Index) Caption(file *MediaFile) (caption vision.CaptionResult, err error) { func (ind *Index) Caption(file *MediaFile) (caption *vision.CaptionResult, err error) {
start := time.Now() start := time.Now()
size := vision.Thumb(vision.ModelTypeCaption) size := vision.Thumb(vision.ModelTypeCaption)
@@ -22,7 +22,7 @@ func (ind *Index) Caption(file *MediaFile) (caption vision.CaptionResult, err er
} }
// Get matching labels from computer vision model. // Get matching labels from computer vision model.
if caption, err = vision.Caption(fileName, media.SrcLocal); err != nil { if caption, _, err = vision.Caption(vision.Files{fileName}, media.SrcLocal); err != nil {
} else if caption.Text != "" { } else if caption.Text != "" {
log.Infof("vision: generated caption for %s [%s]", clean.Log(file.BaseName()), time.Since(start)) log.Infof("vision: generated caption for %s [%s]", clean.Log(file.BaseName()), time.Since(start))
} }