AI: Update photoprism vision defaults for captioning #3438 #5011

Signed-off-by: Michael Mayer <michael@photoprism.app>
This commit is contained in:
Michael Mayer
2025-07-15 00:51:46 +02:00
parent e029a64632
commit ff229e1bd0
9 changed files with 81 additions and 28 deletions

View File

@@ -129,3 +129,13 @@ func NewLabelsResponse(id string, model *Model, results classify.Labels) ApiResp
Result: ApiResult{Labels: labels},
}
}
// NewCaptionResponse generates a new Vision API image caption service response.
//
// The id is sanitized with clean.Type before being echoed back to the caller,
// and the model metadata (name, version, resolution) is copied from the model
// that produced the result so clients can tell which model generated the caption.
//
// NOTE(review): result may be nil if the caller did not check the inference
// outcome; the response is still well-formed in that case (Caption is nil).
func NewCaptionResponse(id string, model *Model, result *CaptionResult) ApiResponse {
	return ApiResponse{
		Id:   clean.Type(id),
		Code: http.StatusOK,
		// Report ModelTypeCaption here: the original used ModelTypeLabels,
		// an apparent copy-paste from NewLabelsResponse, which mislabeled
		// caption responses (callers pass a caption model, see PostVisionCaption).
		Model:  &Model{Type: ModelTypeCaption, Name: model.Name, Version: model.Version, Resolution: model.Resolution},
		Result: ApiResult{Caption: result},
	}
}

View File

@@ -7,19 +7,29 @@ import (
"github.com/photoprism/photoprism/pkg/media"
)
// CaptionPromptDefault is the default prompt used to generate captions.
var CaptionPromptDefault = `Create a caption that sounds natural and briefly describes the main content of the image in up to` +
` three sentences for use in a photo management application. Begin with the type or number of subjects and` +
` action. Omit text formatting and avoid meta-language such as "this picture", "the picture", "the photo",` +
` "there are", "here is", or "a picture of". Use explicit language to describe the scene if it helps to` +
` properly understand the picture.`
// CaptionModelDefault is the default model used to generate captions.
var CaptionModelDefault = "qwen2.5vl"
// Caption returns generated captions for the specified images.
func Caption(imgName string, src media.Src) (result CaptionResult, err error) {
func Caption(images Files, src media.Src) (result *CaptionResult, model *Model, err error) {
// Return if there is no configuration or no image classification models are configured.
if Config == nil {
return result, errors.New("vision service is not configured")
} else if model := Config.Model(ModelTypeCaption); model != nil {
return result, model, errors.New("vision service is not configured")
} else if model = Config.Model(ModelTypeCaption); model != nil {
// Use remote service API if a server endpoint has been configured.
if uri, method := model.Endpoint(); uri != "" && method != "" {
var apiRequest *ApiRequest
var apiResponse *ApiResponse
if apiRequest, err = NewApiRequest(model.EndpointRequestFormat(), Files{imgName}, model.EndpointFileScheme()); err != nil {
return result, err
if apiRequest, err = NewApiRequest(model.EndpointRequestFormat(), images, model.EndpointFileScheme()); err != nil {
return result, model, err
}
if model.Name != "" {
@@ -28,19 +38,23 @@ func Caption(imgName string, src media.Src) (result CaptionResult, err error) {
if model.Version != "" {
apiRequest.Version = model.Version
} else {
apiRequest.Version = "latest"
}
if model.Prompt != "" {
apiRequest.Prompt = model.Prompt
} else {
apiRequest.Prompt = CaptionPromptDefault
}
// Log JSON request data in trace mode.
apiRequest.WriteLog()
if apiResponse, err = PerformApiRequest(apiRequest, uri, method, model.EndpointKey()); err != nil {
return result, err
return result, model, err
} else if apiResponse.Result.Caption == nil {
return result, errors.New("invalid caption model response")
return result, model, errors.New("invalid caption model response")
}
// Set image as the default caption source.
@@ -48,13 +62,13 @@ func Caption(imgName string, src media.Src) (result CaptionResult, err error) {
apiResponse.Result.Caption.Source = entity.SrcImage
}
result = *apiResponse.Result.Caption
result = apiResponse.Result.Caption
} else {
return result, errors.New("invalid caption model configuration")
return result, model, errors.New("invalid caption model configuration")
}
} else {
return result, errors.New("missing caption model")
return result, model, errors.New("missing caption model")
}
return result, nil
return result, model, nil
}

View File

@@ -20,9 +20,10 @@ func TestCaption(t *testing.T) {
t.Run("Success", func(t *testing.T) {
expectedText := "An image of sound waves"
result, err := Caption("https://dl.photoprism.app/img/artwork/colorwaves-400.jpg", media.SrcRemote)
result, model, err := Caption(Files{"https://dl.photoprism.app/img/artwork/colorwaves-400.jpg"}, media.SrcRemote)
assert.NoError(t, err)
assert.NotNil(t, model)
assert.IsType(t, CaptionResult{}, result)
assert.LessOrEqual(t, float32(0.0), result.Confidence)
@@ -31,9 +32,10 @@ func TestCaption(t *testing.T) {
assert.Equal(t, expectedText, result.Text)
})
t.Run("Invalid", func(t *testing.T) {
result, err := Caption("", media.SrcLocal)
result, model, err := Caption(nil, media.SrcLocal)
assert.Error(t, err)
assert.Nil(t, model)
assert.IsType(t, CaptionResult{}, result)
assert.Equal(t, "", result.Text)
assert.Equal(t, float32(0.0), result.Confidence)

View File

@@ -14,6 +14,9 @@ import (
var modelMutex = sync.Mutex{}
// ModelVersionDefault is the default model version.
var ModelVersionDefault = "latest"
// Model represents a computer vision model configuration.
type Model struct {
Type ModelType `yaml:"Type,omitempty" json:"type,omitempty"`

View File

@@ -32,8 +32,9 @@ var (
CaptionModel = &Model{
Type: ModelTypeCaption,
Resolution: 224,
Name: "qwen2.5vl",
Version: "latest",
Name: CaptionModelDefault,
Version: ModelVersionDefault,
Prompt: CaptionPromptDefault,
Service: Service{
Uri: "http://photoprism-vision:5000/api/v1/vision/caption",
Method: http.MethodPost,

View File

@@ -18,6 +18,12 @@ Models:
- Type: caption
Name: qwen2.5vl
Version: latest
Prompt: Create a caption that sounds natural and briefly describes the main content
of the image in up to three sentences for use in a photo management application.
Begin with the type or number of subjects and action. Omit text formatting and
avoid meta-language such as "this picture", "the picture", "the photo", "there
are", "here is", or "a picture of". Use explicit language to describe the scene
if it helps to properly understand the picture.
Resolution: 224
Service:
Uri: http://photoprism-vision:5000/api/v1/vision/caption

View File

@@ -8,6 +8,7 @@ import (
"github.com/photoprism/photoprism/internal/ai/vision"
"github.com/photoprism/photoprism/internal/auth/acl"
"github.com/photoprism/photoprism/internal/photoprism/get"
"github.com/photoprism/photoprism/pkg/media"
"github.com/photoprism/photoprism/pkg/media/http/header"
)
@@ -51,18 +52,30 @@ func PostVisionCaption(router *gin.RouterGroup) {
return
}
// TODO: Return error code 501 until this service is implemented.
code := http.StatusNotImplemented
// Run inference to generate a caption.
result, model, err := vision.Caption(request.Images, media.SrcRemote)
if err != nil {
log.Errorf("vision: %s (caption)", err)
c.JSON(http.StatusBadRequest, vision.NewApiError(request.GetId(), http.StatusBadRequest))
return
} else if model == nil {
log.Errorf("vision: no model specified (caption)")
c.JSON(http.StatusInternalServerError, vision.NewApiError(request.GetId(), http.StatusInternalServerError))
return
} else if result == nil {
log.Errorf("vision: no result (caption)")
c.JSON(http.StatusInternalServerError, vision.NewApiError(request.GetId(), http.StatusInternalServerError))
return
}
// Generate Vision API service response.
response := vision.ApiResponse{
Id: request.GetId(),
Code: code,
Error: http.StatusText(http.StatusNotImplemented),
Model: &vision.Model{Type: vision.ModelTypeCaption},
Result: vision.ApiResult{Caption: &vision.CaptionResult{Text: "This is a test.", Confidence: 0.14159265359}},
}
response := vision.NewCaptionResponse(
request.GetId(),
&vision.Model{Type: model.Type, Name: model.Name, Version: model.Version},
result,
)
c.JSON(code, response)
c.JSON(http.StatusOK, response)
})
}

View File

@@ -63,7 +63,11 @@ func PostVisionLabels(router *gin.RouterGroup) {
}
// Generate Vision API service response.
response := vision.NewLabelsResponse(request.GetId(), vision.NasnetModel, labels)
response := vision.NewLabelsResponse(
request.GetId(),
&vision.Model{Type: vision.ModelTypeLabels},
labels,
)
c.JSON(http.StatusOK, response)
})

View File

@@ -9,7 +9,7 @@ import (
)
// Caption returns generated caption for the specified media file.
func (ind *Index) Caption(file *MediaFile) (caption vision.CaptionResult, err error) {
func (ind *Index) Caption(file *MediaFile) (caption *vision.CaptionResult, err error) {
start := time.Now()
size := vision.Thumb(vision.ModelTypeCaption)
@@ -22,7 +22,7 @@ func (ind *Index) Caption(file *MediaFile) (caption vision.CaptionResult, err er
}
// Get matching labels from computer vision model.
if caption, err = vision.Caption(fileName, media.SrcLocal); err != nil {
if caption, _, err = vision.Caption(vision.Files{fileName}, media.SrcLocal); err != nil {
} else if caption.Text != "" {
log.Infof("vision: generated caption for %s [%s]", clean.Log(file.BaseName()), time.Since(start))
}