AI: Update photoprism vision defaults for captioning #3438 #5011

Signed-off-by: Michael Mayer <michael@photoprism.app>
This commit is contained in:
Michael Mayer
2025-07-15 00:51:46 +02:00
parent e029a64632
commit ff229e1bd0
9 changed files with 81 additions and 28 deletions

View File

@@ -129,3 +129,13 @@ func NewLabelsResponse(id string, model *Model, results classify.Labels) ApiResp
Result: ApiResult{Labels: labels}, Result: ApiResult{Labels: labels},
} }
} }
// NewCaptionResponse generates a new Vision API image caption service response.
//
// The returned ApiResponse echoes the sanitized request id, reports HTTP 200,
// and embeds the model metadata (name, version, resolution) alongside the
// caption result.
func NewCaptionResponse(id string, model *Model, result *CaptionResult) ApiResponse {
	return ApiResponse{
		Id:   clean.Type(id),
		Code: http.StatusOK,
		// Bug fix: this is a caption response, so the model type must be
		// ModelTypeCaption, not ModelTypeLabels (copy-paste from NewLabelsResponse).
		Model:  &Model{Type: ModelTypeCaption, Name: model.Name, Version: model.Version, Resolution: model.Resolution},
		Result: ApiResult{Caption: result},
	}
}

View File

@@ -7,19 +7,29 @@ import (
"github.com/photoprism/photoprism/pkg/media" "github.com/photoprism/photoprism/pkg/media"
) )
// CaptionPromptDefault is the default prompt used to generate captions.
// It requests a short (up to three sentences), natural-sounding description
// and explicitly discourages meta-language such as "this picture" or
// "a picture of" so captions read well in a photo management application.
var CaptionPromptDefault = `Create a caption that sounds natural and briefly describes the main content of the image in up to` +
` three sentences for use in a photo management application. Begin with the type or number of subjects and` +
` action. Omit text formatting and avoid meta-language such as "this picture", "the picture", "the photo",` +
` "there are", "here is", or "a picture of". Use explicit language to describe the scene if it helps to` +
` properly understand the picture.`
// CaptionModelDefault is the default model used to generate captions.
// NOTE(review): this name should stay in sync with the caption model entry
// in the service configuration YAML ("qwen2.5vl") — confirm when changing.
var CaptionModelDefault = "qwen2.5vl"
// Caption returns generated captions for the specified images. // Caption returns generated captions for the specified images.
func Caption(imgName string, src media.Src) (result CaptionResult, err error) { func Caption(images Files, src media.Src) (result *CaptionResult, model *Model, err error) {
// Return if there is no configuration or no image classification models are configured. // Return if there is no configuration or no image classification models are configured.
if Config == nil { if Config == nil {
return result, errors.New("vision service is not configured") return result, model, errors.New("vision service is not configured")
} else if model := Config.Model(ModelTypeCaption); model != nil { } else if model = Config.Model(ModelTypeCaption); model != nil {
// Use remote service API if a server endpoint has been configured. // Use remote service API if a server endpoint has been configured.
if uri, method := model.Endpoint(); uri != "" && method != "" { if uri, method := model.Endpoint(); uri != "" && method != "" {
var apiRequest *ApiRequest var apiRequest *ApiRequest
var apiResponse *ApiResponse var apiResponse *ApiResponse
if apiRequest, err = NewApiRequest(model.EndpointRequestFormat(), Files{imgName}, model.EndpointFileScheme()); err != nil { if apiRequest, err = NewApiRequest(model.EndpointRequestFormat(), images, model.EndpointFileScheme()); err != nil {
return result, err return result, model, err
} }
if model.Name != "" { if model.Name != "" {
@@ -28,19 +38,23 @@ func Caption(imgName string, src media.Src) (result CaptionResult, err error) {
if model.Version != "" { if model.Version != "" {
apiRequest.Version = model.Version apiRequest.Version = model.Version
} else {
apiRequest.Version = "latest"
} }
if model.Prompt != "" { if model.Prompt != "" {
apiRequest.Prompt = model.Prompt apiRequest.Prompt = model.Prompt
} else {
apiRequest.Prompt = CaptionPromptDefault
} }
// Log JSON request data in trace mode. // Log JSON request data in trace mode.
apiRequest.WriteLog() apiRequest.WriteLog()
if apiResponse, err = PerformApiRequest(apiRequest, uri, method, model.EndpointKey()); err != nil { if apiResponse, err = PerformApiRequest(apiRequest, uri, method, model.EndpointKey()); err != nil {
return result, err return result, model, err
} else if apiResponse.Result.Caption == nil { } else if apiResponse.Result.Caption == nil {
return result, errors.New("invalid caption model response") return result, model, errors.New("invalid caption model response")
} }
// Set image as the default caption source. // Set image as the default caption source.
@@ -48,13 +62,13 @@ func Caption(imgName string, src media.Src) (result CaptionResult, err error) {
apiResponse.Result.Caption.Source = entity.SrcImage apiResponse.Result.Caption.Source = entity.SrcImage
} }
result = *apiResponse.Result.Caption result = apiResponse.Result.Caption
} else { } else {
return result, errors.New("invalid caption model configuration") return result, model, errors.New("invalid caption model configuration")
} }
} else { } else {
return result, errors.New("missing caption model") return result, model, errors.New("missing caption model")
} }
return result, nil return result, model, nil
} }

View File

@@ -20,9 +20,10 @@ func TestCaption(t *testing.T) {
t.Run("Success", func(t *testing.T) { t.Run("Success", func(t *testing.T) {
expectedText := "An image of sound waves" expectedText := "An image of sound waves"
result, err := Caption("https://dl.photoprism.app/img/artwork/colorwaves-400.jpg", media.SrcRemote) result, model, err := Caption(Files{"https://dl.photoprism.app/img/artwork/colorwaves-400.jpg"}, media.SrcRemote)
assert.NoError(t, err) assert.NoError(t, err)
assert.NotNil(t, model)
assert.IsType(t, CaptionResult{}, result) assert.IsType(t, CaptionResult{}, result)
assert.LessOrEqual(t, float32(0.0), result.Confidence) assert.LessOrEqual(t, float32(0.0), result.Confidence)
@@ -31,9 +32,10 @@ func TestCaption(t *testing.T) {
assert.Equal(t, expectedText, result.Text) assert.Equal(t, expectedText, result.Text)
}) })
t.Run("Invalid", func(t *testing.T) { t.Run("Invalid", func(t *testing.T) {
result, err := Caption("", media.SrcLocal) result, model, err := Caption(nil, media.SrcLocal)
assert.Error(t, err) assert.Error(t, err)
assert.Nil(t, model)
assert.IsType(t, CaptionResult{}, result) assert.IsType(t, CaptionResult{}, result)
assert.Equal(t, "", result.Text) assert.Equal(t, "", result.Text)
assert.Equal(t, float32(0.0), result.Confidence) assert.Equal(t, float32(0.0), result.Confidence)

View File

@@ -14,6 +14,9 @@ import (
var modelMutex = sync.Mutex{} var modelMutex = sync.Mutex{}
// ModelVersionDefault is the default model version ("latest"), used when a
// model configuration does not specify an explicit version.
var ModelVersionDefault = "latest"
// Model represents a computer vision model configuration. // Model represents a computer vision model configuration.
type Model struct { type Model struct {
Type ModelType `yaml:"Type,omitempty" json:"type,omitempty"` Type ModelType `yaml:"Type,omitempty" json:"type,omitempty"`

View File

@@ -32,8 +32,9 @@ var (
CaptionModel = &Model{ CaptionModel = &Model{
Type: ModelTypeCaption, Type: ModelTypeCaption,
Resolution: 224, Resolution: 224,
Name: "qwen2.5vl", Name: CaptionModelDefault,
Version: "latest", Version: ModelVersionDefault,
Prompt: CaptionPromptDefault,
Service: Service{ Service: Service{
Uri: "http://photoprism-vision:5000/api/v1/vision/caption", Uri: "http://photoprism-vision:5000/api/v1/vision/caption",
Method: http.MethodPost, Method: http.MethodPost,

View File

@@ -18,6 +18,12 @@ Models:
- Type: caption - Type: caption
Name: qwen2.5vl Name: qwen2.5vl
Version: latest Version: latest
Prompt: Create a caption that sounds natural and briefly describes the main content
of the image in up to three sentences for use in a photo management application.
Begin with the type or number of subjects and action. Omit text formatting and
avoid meta-language such as "this picture", "the picture", "the photo", "there
are", "here is", or "a picture of". Use explicit language to describe the scene
if it helps to properly understand the picture.
Resolution: 224 Resolution: 224
Service: Service:
Uri: http://photoprism-vision:5000/api/v1/vision/caption Uri: http://photoprism-vision:5000/api/v1/vision/caption

View File

@@ -8,6 +8,7 @@ import (
"github.com/photoprism/photoprism/internal/ai/vision" "github.com/photoprism/photoprism/internal/ai/vision"
"github.com/photoprism/photoprism/internal/auth/acl" "github.com/photoprism/photoprism/internal/auth/acl"
"github.com/photoprism/photoprism/internal/photoprism/get" "github.com/photoprism/photoprism/internal/photoprism/get"
"github.com/photoprism/photoprism/pkg/media"
"github.com/photoprism/photoprism/pkg/media/http/header" "github.com/photoprism/photoprism/pkg/media/http/header"
) )
@@ -51,18 +52,30 @@ func PostVisionCaption(router *gin.RouterGroup) {
return return
} }
// TODO: Return error code 501 until this service is implemented. // Run inference to generate a caption.
code := http.StatusNotImplemented result, model, err := vision.Caption(request.Images, media.SrcRemote)
// Generate Vision API service response. if err != nil {
response := vision.ApiResponse{ log.Errorf("vision: %s (caption)", err)
Id: request.GetId(), c.JSON(http.StatusBadRequest, vision.NewApiError(request.GetId(), http.StatusBadRequest))
Code: code, return
Error: http.StatusText(http.StatusNotImplemented), } else if model == nil {
Model: &vision.Model{Type: vision.ModelTypeCaption}, log.Errorf("vision: no model specified (caption)")
Result: vision.ApiResult{Caption: &vision.CaptionResult{Text: "This is a test.", Confidence: 0.14159265359}}, c.JSON(http.StatusInternalServerError, vision.NewApiError(request.GetId(), http.StatusInternalServerError))
return
} else if result == nil {
log.Errorf("vision: no result (caption)")
c.JSON(http.StatusInternalServerError, vision.NewApiError(request.GetId(), http.StatusInternalServerError))
return
} }
c.JSON(code, response) // Generate Vision API service response.
response := vision.NewCaptionResponse(
request.GetId(),
&vision.Model{Type: model.Type, Name: model.Name, Version: model.Version},
result,
)
c.JSON(http.StatusOK, response)
}) })
} }

View File

@@ -63,7 +63,11 @@ func PostVisionLabels(router *gin.RouterGroup) {
} }
// Generate Vision API service response. // Generate Vision API service response.
response := vision.NewLabelsResponse(request.GetId(), vision.NasnetModel, labels) response := vision.NewLabelsResponse(
request.GetId(),
&vision.Model{Type: vision.ModelTypeLabels},
labels,
)
c.JSON(http.StatusOK, response) c.JSON(http.StatusOK, response)
}) })

View File

@@ -9,7 +9,7 @@ import (
) )
// Caption returns generated caption for the specified media file. // Caption returns generated caption for the specified media file.
func (ind *Index) Caption(file *MediaFile) (caption vision.CaptionResult, err error) { func (ind *Index) Caption(file *MediaFile) (caption *vision.CaptionResult, err error) {
start := time.Now() start := time.Now()
size := vision.Thumb(vision.ModelTypeCaption) size := vision.Thumb(vision.ModelTypeCaption)
@@ -22,7 +22,7 @@ func (ind *Index) Caption(file *MediaFile) (caption vision.CaptionResult, err er
} }
// Get matching labels from computer vision model. // Get matching labels from computer vision model.
if caption, err = vision.Caption(fileName, media.SrcLocal); err != nil { if caption, _, err = vision.Caption(vision.Files{fileName}, media.SrcLocal); err != nil {
} else if caption.Text != "" { } else if caption.Text != "" {
log.Infof("vision: generated caption for %s [%s]", clean.Log(file.BaseName()), time.Since(start)) log.Infof("vision: generated caption for %s [%s]", clean.Log(file.BaseName()), time.Since(start))
} }