Signed-off-by: Michael Mayer <michael@photoprism.app>
@@ -129,3 +129,13 @@ func NewLabelsResponse(id string, model *Model, results classify.Labels) ApiResponse {
		Result: ApiResult{Labels: labels},
	}
}

// NewCaptionResponse generates a new Vision API image caption service response.
func NewCaptionResponse(id string, model *Model, result *CaptionResult) ApiResponse {
	return ApiResponse{
		Id:     clean.Type(id),
		Code:   http.StatusOK,
		Model:  &Model{Type: ModelTypeCaption, Name: model.Name, Version: model.Version, Resolution: model.Resolution},
		Result: ApiResult{Caption: result},
	}
}

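For reviewers, here is a minimal usage sketch (not part of this commit) of how the new constructor could be exercised from within the vision package. The request id, model values, and caption text are placeholders, and the example assumes CaptionResult exposes the Text and Confidence fields used elsewhere in this diff.

package vision

import (
	"encoding/json"
	"fmt"
)

// ExampleNewCaptionResponse shows the shape of the response returned to API
// clients. The concrete values are placeholders.
func ExampleNewCaptionResponse() {
	model := &Model{Type: ModelTypeCaption, Name: "qwen2.5vl", Version: "latest", Resolution: 224}
	result := &CaptionResult{Text: "Two dogs playing in the snow.", Confidence: 0.87}

	resp := NewCaptionResponse("req-123", model, result)

	// Serialize the response the same way the HTTP handler would.
	b, _ := json.Marshal(resp)
	fmt.Println(string(b))
}
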
@@ -7,19 +7,29 @@ import (
	"github.com/photoprism/photoprism/pkg/media"
)

// CaptionPromptDefault is the default prompt used to generate captions.
var CaptionPromptDefault = `Create a caption that sounds natural and briefly describes the main content of the image in up to` +
	` three sentences for use in a photo management application. Begin with the type or number of subjects and` +
	` action. Omit text formatting and avoid meta-language such as "this picture", "the picture", "the photo",` +
	` "there are", "here is", or "a picture of". Use explicit language to describe the scene if it helps to` +
	` properly understand the picture.`

// CaptionModelDefault is the default model used to generate captions.
var CaptionModelDefault = "qwen2.5vl"

// Caption returns generated captions for the specified images.
func Caption(imgName string, src media.Src) (result CaptionResult, err error) {
func Caption(images Files, src media.Src) (result *CaptionResult, model *Model, err error) {
	// Return if there is no configuration or no caption model is configured.
	if Config == nil {
		return result, errors.New("vision service is not configured")
		return result, model, errors.New("vision service is not configured")
	} else if model := Config.Model(ModelTypeCaption); model != nil {
	} else if model = Config.Model(ModelTypeCaption); model != nil {
		// Use remote service API if a server endpoint has been configured.
		if uri, method := model.Endpoint(); uri != "" && method != "" {
			var apiRequest *ApiRequest
			var apiResponse *ApiResponse

			if apiRequest, err = NewApiRequest(model.EndpointRequestFormat(), Files{imgName}, model.EndpointFileScheme()); err != nil {
				return result, err
			if apiRequest, err = NewApiRequest(model.EndpointRequestFormat(), images, model.EndpointFileScheme()); err != nil {
				return result, model, err
			}

			if model.Name != "" {
@@ -28,19 +38,23 @@ func Caption(imgName string, src media.Src) (result CaptionResult, err error) {

			if model.Version != "" {
				apiRequest.Version = model.Version
			} else {
				apiRequest.Version = "latest"
			}

			if model.Prompt != "" {
				apiRequest.Prompt = model.Prompt
			} else {
				apiRequest.Prompt = CaptionPromptDefault
			}

			// Log JSON request data in trace mode.
			apiRequest.WriteLog()

			if apiResponse, err = PerformApiRequest(apiRequest, uri, method, model.EndpointKey()); err != nil {
				return result, err
				return result, model, err
			} else if apiResponse.Result.Caption == nil {
				return result, errors.New("invalid caption model response")
				return result, model, errors.New("invalid caption model response")
			}

			// Set image as the default caption source.
@@ -48,13 +62,13 @@ func Caption(imgName string, src media.Src) (result CaptionResult, err error) {
				apiResponse.Result.Caption.Source = entity.SrcImage
			}

			result = *apiResponse.Result.Caption
			result = apiResponse.Result.Caption
		} else {
			return result, errors.New("invalid caption model configuration")
			return result, model, errors.New("invalid caption model configuration")
		}
	} else {
		return result, errors.New("missing caption model")
		return result, model, errors.New("missing caption model")
	}

	return result, nil
	return result, model, nil
}

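As a usage sketch (not part of this commit), a caller inside the vision package now receives the producing model alongside the pointer result and should handle all three return values; the helper name and file path below are placeholders.

package vision

import (
	"fmt"

	"github.com/photoprism/photoprism/pkg/media"
)

// describeImage is a hypothetical helper that shows how the new signature is
// consumed: check err first, then nil-check the pointer result.
func describeImage(fileName string) {
	result, model, err := Caption(Files{fileName}, media.SrcLocal)

	if err != nil {
		fmt.Println("caption failed:", err)
		return
	}

	if result != nil && model != nil {
		fmt.Printf("caption from %s (%s): %q, confidence %.2f\n",
			model.Name, model.Version, result.Text, result.Confidence)
	}
}
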
@@ -20,9 +20,10 @@ func TestCaption(t *testing.T) {
	t.Run("Success", func(t *testing.T) {
		expectedText := "An image of sound waves"

		result, err := Caption("https://dl.photoprism.app/img/artwork/colorwaves-400.jpg", media.SrcRemote)
		result, model, err := Caption(Files{"https://dl.photoprism.app/img/artwork/colorwaves-400.jpg"}, media.SrcRemote)

		assert.NoError(t, err)
		assert.NotNil(t, model)
		assert.IsType(t, CaptionResult{}, result)
		assert.LessOrEqual(t, float32(0.0), result.Confidence)

@@ -31,9 +32,10 @@ func TestCaption(t *testing.T) {
		assert.Equal(t, expectedText, result.Text)
	})
	t.Run("Invalid", func(t *testing.T) {
		result, err := Caption("", media.SrcLocal)
		result, model, err := Caption(nil, media.SrcLocal)

		assert.Error(t, err)
		assert.Nil(t, model)
		assert.IsType(t, CaptionResult{}, result)
		assert.Equal(t, "", result.Text)
		assert.Equal(t, float32(0.0), result.Confidence)

@@ -14,6 +14,9 @@ import (

var modelMutex = sync.Mutex{}

// ModelVersionDefault is the default model version.
var ModelVersionDefault = "latest"

// Model represents a computer vision model configuration.
type Model struct {
	Type ModelType `yaml:"Type,omitempty" json:"type,omitempty"`

@@ -32,8 +32,9 @@ var (
	CaptionModel = &Model{
		Type:       ModelTypeCaption,
		Resolution: 224,
		Name:       "qwen2.5vl",
		Version:    "latest",
		Name:       CaptionModelDefault,
		Version:    ModelVersionDefault,
		Prompt:     CaptionPromptDefault,
		Service: Service{
			Uri:    "http://photoprism-vision:5000/api/v1/vision/caption",
			Method: http.MethodPost,

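A short sketch (not part of this commit) of how a custom caption model could reuse the new package-level defaults; the endpoint URI is a placeholder, and the field set mirrors the CaptionModel definition above.

package vision

import "net/http"

// customCaptionModel falls back to the new defaults for name, version, and
// prompt while pointing at a locally running caption service.
var customCaptionModel = &Model{
	Type:       ModelTypeCaption,
	Name:       CaptionModelDefault, // "qwen2.5vl"
	Version:    ModelVersionDefault, // "latest"
	Prompt:     CaptionPromptDefault,
	Resolution: 224,
	Service: Service{
		Uri:    "http://localhost:5000/api/v1/vision/caption",
		Method: http.MethodPost,
	},
}
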
internal/ai/vision/testdata/vision.yml (vendored)
@@ -18,6 +18,12 @@ Models:
  - Type: caption
    Name: qwen2.5vl
    Version: latest
    Prompt: Create a caption that sounds natural and briefly describes the main content
      of the image in up to three sentences for use in a photo management application.
      Begin with the type or number of subjects and action. Omit text formatting and
      avoid meta-language such as "this picture", "the picture", "the photo", "there
      are", "here is", or "a picture of". Use explicit language to describe the scene
      if it helps to properly understand the picture.
    Resolution: 224
    Service:
      Uri: http://photoprism-vision:5000/api/v1/vision/caption

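A sketch (not part of this commit) of how a configuration entry like the one above could be decoded into the Model struct. The wrapper struct, the yaml tags on Model's remaining fields, and the use of gopkg.in/yaml.v2 are assumptions made for illustration, based on the tagged Type field shown earlier in this diff.

package vision

import (
	"fmt"

	"gopkg.in/yaml.v2"
)

// loadCaptionModel decodes a vision.yml-style document and returns the first
// configured caption model, if any.
func loadCaptionModel(data []byte) (*Model, error) {
	var cfg struct {
		Models []Model `yaml:"Models"`
	}

	if err := yaml.Unmarshal(data, &cfg); err != nil {
		return nil, err
	}

	for i := range cfg.Models {
		if cfg.Models[i].Type == ModelTypeCaption {
			return &cfg.Models[i], nil
		}
	}

	return nil, fmt.Errorf("no caption model configured")
}
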
@@ -8,6 +8,7 @@ import (
	"github.com/photoprism/photoprism/internal/ai/vision"
	"github.com/photoprism/photoprism/internal/auth/acl"
	"github.com/photoprism/photoprism/internal/photoprism/get"
	"github.com/photoprism/photoprism/pkg/media"
	"github.com/photoprism/photoprism/pkg/media/http/header"
)

@@ -51,18 +52,30 @@ func PostVisionCaption(router *gin.RouterGroup) {
			return
		}

		// TODO: Return error code 501 until this service is implemented.
		code := http.StatusNotImplemented
		// Run inference to generate a caption.
		result, model, err := vision.Caption(request.Images, media.SrcRemote)

		// Generate Vision API service response.
		response := vision.ApiResponse{
			Id:     request.GetId(),
			Code:   code,
			Error:  http.StatusText(http.StatusNotImplemented),
			Model:  &vision.Model{Type: vision.ModelTypeCaption},
			Result: vision.ApiResult{Caption: &vision.CaptionResult{Text: "This is a test.", Confidence: 0.14159265359}},
		if err != nil {
			log.Errorf("vision: %s (caption)", err)
			c.JSON(http.StatusBadRequest, vision.NewApiError(request.GetId(), http.StatusBadRequest))
			return
		} else if model == nil {
			log.Errorf("vision: no model specified (caption)")
			c.JSON(http.StatusInternalServerError, vision.NewApiError(request.GetId(), http.StatusInternalServerError))
			return
		} else if result == nil {
			log.Errorf("vision: no result (caption)")
			c.JSON(http.StatusInternalServerError, vision.NewApiError(request.GetId(), http.StatusInternalServerError))
			return
		}

		c.JSON(code, response)
		// Generate Vision API service response.
		response := vision.NewCaptionResponse(
			request.GetId(),
			&vision.Model{Type: model.Type, Name: model.Name, Version: model.Version},
			result,
		)

		c.JSON(http.StatusOK, response)
	})
}

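To summarize the new control flow, here is a hedged sketch (not part of this commit) of the status mapping the handler now implements; the helper name and the api package name are assumptions for illustration only.

package api

import (
	"net/http"

	"github.com/photoprism/photoprism/internal/ai/vision"
)

// captionStatus mirrors the handler's branching: request or inference errors
// map to 400, a missing model or missing result maps to 500, success to 200.
func captionStatus(result *vision.CaptionResult, model *vision.Model, err error) int {
	switch {
	case err != nil:
		return http.StatusBadRequest
	case model == nil, result == nil:
		return http.StatusInternalServerError
	default:
		return http.StatusOK
	}
}
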
@@ -63,7 +63,11 @@ func PostVisionLabels(router *gin.RouterGroup) {
		}

		// Generate Vision API service response.
		response := vision.NewLabelsResponse(request.GetId(), vision.NasnetModel, labels)
		response := vision.NewLabelsResponse(
			request.GetId(),
			&vision.Model{Type: vision.ModelTypeLabels},
			labels,
		)

		c.JSON(http.StatusOK, response)
	})

@@ -9,7 +9,7 @@ import (
)

// Caption returns generated caption for the specified media file.
func (ind *Index) Caption(file *MediaFile) (caption vision.CaptionResult, err error) {
func (ind *Index) Caption(file *MediaFile) (caption *vision.CaptionResult, err error) {
	start := time.Now()

	size := vision.Thumb(vision.ModelTypeCaption)
@@ -22,7 +22,7 @@ func (ind *Index) Caption(file *MediaFile) (caption vision.CaptionResult, err error) {
	}

	// Get the generated caption from the computer vision model.
	if caption, err = vision.Caption(fileName, media.SrcLocal); err != nil {
	if caption, _, err = vision.Caption(vision.Files{fileName}, media.SrcLocal); err != nil {
	} else if caption.Text != "" {
		log.Infof("vision: generated caption for %s [%s]", clean.Log(file.BaseName()), time.Since(start))
	}

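Finally, a brief sketch (not part of this commit) of how indexing code should treat the pointer result now returned by Index.Caption; the helper function is hypothetical and the surrounding indexing logic is omitted.

package photoprism

// captionText is a hypothetical helper: with the result now a pointer, callers
// nil-check it before reading Text so that a missing caption yields an empty
// string instead of a panic.
func captionText(ind *Index, file *MediaFile) string {
	caption, err := ind.Caption(file)

	if err != nil || caption == nil {
		return ""
	}

	return caption.Text
}
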