Signed-off-by: Michael Mayer <michael@photoprism.app>
@@ -129,3 +129,13 @@ func NewLabelsResponse(id string, model *Model, results classify.Labels) ApiResponse {
		Result: ApiResult{Labels: labels},
	}
}

// NewCaptionResponse generates a new Vision API image caption service response.
func NewCaptionResponse(id string, model *Model, result *CaptionResult) ApiResponse {
	return ApiResponse{
		Id:     clean.Type(id),
		Code:   http.StatusOK,
		Model:  &Model{Type: ModelTypeCaption, Name: model.Name, Version: model.Version, Resolution: model.Resolution},
		Result: ApiResult{Caption: result},
	}
}

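For reviewers, here is a minimal usage sketch (not part of this commit) of how the new constructor could be exercised from within the vision package. The request id, model values, and caption text are placeholders, and the example assumes CaptionResult exposes the Text and Confidence fields used elsewhere in this diff.

package vision

import (
	"encoding/json"
	"fmt"
)

// ExampleNewCaptionResponse shows the shape of the response returned to API
// clients. The concrete values are placeholders.
func ExampleNewCaptionResponse() {
	model := &Model{Type: ModelTypeCaption, Name: "qwen2.5vl", Version: "latest", Resolution: 224}
	result := &CaptionResult{Text: "Two dogs playing in the snow.", Confidence: 0.87}

	resp := NewCaptionResponse("req-123", model, result)

	// Serialize the response the same way the HTTP handler would.
	b, _ := json.Marshal(resp)
	fmt.Println(string(b))
}
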
@@ -7,19 +7,29 @@ import (
	"github.com/photoprism/photoprism/pkg/media"
)

// CaptionPromptDefault is the default prompt used to generate captions.
var CaptionPromptDefault = `Create a caption that sounds natural and briefly describes the main content of the image in up to` +
	` three sentences for use in a photo management application. Begin with the type or number of subjects and` +
	` action. Omit text formatting and avoid meta-language such as "this picture", "the picture", "the photo",` +
	` "there are", "here is", or "a picture of". Use explicit language to describe the scene if it helps to` +
	` properly understand the picture.`

// CaptionModelDefault is the default model used to generate captions.
var CaptionModelDefault = "qwen2.5vl"

// Caption returns generated captions for the specified images.
func Caption(imgName string, src media.Src) (result CaptionResult, err error) {
func Caption(images Files, src media.Src) (result *CaptionResult, model *Model, err error) {
	// Return if there is no configuration or no caption model is configured.
	if Config == nil {
		return result, errors.New("vision service is not configured")
		return result, model, errors.New("vision service is not configured")
	} else if model := Config.Model(ModelTypeCaption); model != nil {
	} else if model = Config.Model(ModelTypeCaption); model != nil {
		// Use remote service API if a server endpoint has been configured.
		if uri, method := model.Endpoint(); uri != "" && method != "" {
			var apiRequest *ApiRequest
			var apiResponse *ApiResponse

			if apiRequest, err = NewApiRequest(model.EndpointRequestFormat(), Files{imgName}, model.EndpointFileScheme()); err != nil {
				return result, err
			if apiRequest, err = NewApiRequest(model.EndpointRequestFormat(), images, model.EndpointFileScheme()); err != nil {
				return result, model, err
			}

			if model.Name != "" {
@@ -28,19 +38,23 @@ func Caption(imgName string, src media.Src) (result CaptionResult, err error) {

			if model.Version != "" {
				apiRequest.Version = model.Version
			} else {
				apiRequest.Version = "latest"
			}

			if model.Prompt != "" {
				apiRequest.Prompt = model.Prompt
			} else {
				apiRequest.Prompt = CaptionPromptDefault
			}

			// Log JSON request data in trace mode.
			apiRequest.WriteLog()

			if apiResponse, err = PerformApiRequest(apiRequest, uri, method, model.EndpointKey()); err != nil {
				return result, err
				return result, model, err
			} else if apiResponse.Result.Caption == nil {
				return result, errors.New("invalid caption model response")
				return result, model, errors.New("invalid caption model response")
			}

			// Set image as the default caption source.
@@ -48,13 +62,13 @@ func Caption(imgName string, src media.Src) (result CaptionResult, err error) {
				apiResponse.Result.Caption.Source = entity.SrcImage
			}

			result = *apiResponse.Result.Caption
			result = apiResponse.Result.Caption
		} else {
			return result, errors.New("invalid caption model configuration")
			return result, model, errors.New("invalid caption model configuration")
		}
	} else {
		return result, errors.New("missing caption model")
		return result, model, errors.New("missing caption model")
	}

	return result, nil
	return result, model, nil
}

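As a usage sketch (not part of this commit), a caller inside the vision package now receives the producing model alongside the pointer result and should handle all three return values; the helper name and file path below are placeholders.

package vision

import (
	"fmt"

	"github.com/photoprism/photoprism/pkg/media"
)

// describeImage is a hypothetical helper that shows how the new signature is
// consumed: check err first, then nil-check the pointer result.
func describeImage(fileName string) {
	result, model, err := Caption(Files{fileName}, media.SrcLocal)

	if err != nil {
		fmt.Println("caption failed:", err)
		return
	}

	if result != nil && model != nil {
		fmt.Printf("caption from %s (%s): %q, confidence %.2f\n",
			model.Name, model.Version, result.Text, result.Confidence)
	}
}
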
@@ -20,9 +20,10 @@ func TestCaption(t *testing.T) {
	t.Run("Success", func(t *testing.T) {
		expectedText := "An image of sound waves"

		result, err := Caption("https://dl.photoprism.app/img/artwork/colorwaves-400.jpg", media.SrcRemote)
		result, model, err := Caption(Files{"https://dl.photoprism.app/img/artwork/colorwaves-400.jpg"}, media.SrcRemote)

		assert.NoError(t, err)
		assert.NotNil(t, model)
		assert.IsType(t, CaptionResult{}, result)
		assert.LessOrEqual(t, float32(0.0), result.Confidence)

@@ -31,9 +32,10 @@ func TestCaption(t *testing.T) {
		assert.Equal(t, expectedText, result.Text)
	})
	t.Run("Invalid", func(t *testing.T) {
		result, err := Caption("", media.SrcLocal)
		result, model, err := Caption(nil, media.SrcLocal)

		assert.Error(t, err)
		assert.Nil(t, model)
		assert.IsType(t, CaptionResult{}, result)
		assert.Equal(t, "", result.Text)
		assert.Equal(t, float32(0.0), result.Confidence)

@@ -14,6 +14,9 @@ import (

var modelMutex = sync.Mutex{}

// ModelVersionDefault is the default model version.
var ModelVersionDefault = "latest"

// Model represents a computer vision model configuration.
type Model struct {
	Type ModelType `yaml:"Type,omitempty" json:"type,omitempty"`

@@ -32,8 +32,9 @@ var (
	CaptionModel = &Model{
		Type:       ModelTypeCaption,
		Resolution: 224,
		Name:       "qwen2.5vl",
		Version:    "latest",
		Name:       CaptionModelDefault,
		Version:    ModelVersionDefault,
		Prompt:     CaptionPromptDefault,
		Service: Service{
			Uri:    "http://photoprism-vision:5000/api/v1/vision/caption",
			Method: http.MethodPost,

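A short sketch (not part of this commit) of how a custom caption model could reuse the new package-level defaults; the endpoint URI is a placeholder, and the field set mirrors the CaptionModel definition above.

package vision

import "net/http"

// customCaptionModel falls back to the new defaults for name, version, and
// prompt while pointing at a locally running caption service.
var customCaptionModel = &Model{
	Type:       ModelTypeCaption,
	Name:       CaptionModelDefault, // "qwen2.5vl"
	Version:    ModelVersionDefault, // "latest"
	Prompt:     CaptionPromptDefault,
	Resolution: 224,
	Service: Service{
		Uri:    "http://localhost:5000/api/v1/vision/caption",
		Method: http.MethodPost,
	},
}
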
internal/ai/vision/testdata/vision.yml (vendored)
@@ -18,6 +18,12 @@ Models:
  - Type: caption
    Name: qwen2.5vl
    Version: latest
    Prompt: Create a caption that sounds natural and briefly describes the main content
      of the image in up to three sentences for use in a photo management application.
      Begin with the type or number of subjects and action. Omit text formatting and
      avoid meta-language such as "this picture", "the picture", "the photo", "there
      are", "here is", or "a picture of". Use explicit language to describe the scene
      if it helps to properly understand the picture.
    Resolution: 224
    Service:
      Uri: http://photoprism-vision:5000/api/v1/vision/caption

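A sketch (not part of this commit) of how a configuration entry like the one above could be decoded into the Model struct. The wrapper struct, the yaml tags on Model's remaining fields, and the use of gopkg.in/yaml.v2 are assumptions made for illustration, based on the tagged Type field shown earlier in this diff.

package vision

import (
	"fmt"

	"gopkg.in/yaml.v2"
)

// loadCaptionModel decodes a vision.yml-style document and returns the first
// configured caption model, if any.
func loadCaptionModel(data []byte) (*Model, error) {
	var cfg struct {
		Models []Model `yaml:"Models"`
	}

	if err := yaml.Unmarshal(data, &cfg); err != nil {
		return nil, err
	}

	for i := range cfg.Models {
		if cfg.Models[i].Type == ModelTypeCaption {
			return &cfg.Models[i], nil
		}
	}

	return nil, fmt.Errorf("no caption model configured")
}
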
@@ -8,6 +8,7 @@ import (
	"github.com/photoprism/photoprism/internal/ai/vision"
	"github.com/photoprism/photoprism/internal/auth/acl"
	"github.com/photoprism/photoprism/internal/photoprism/get"
	"github.com/photoprism/photoprism/pkg/media"
	"github.com/photoprism/photoprism/pkg/media/http/header"
)

@@ -51,18 +52,30 @@ func PostVisionCaption(router *gin.RouterGroup) {
			return
		}

		// TODO: Return error code 501 until this service is implemented.
		code := http.StatusNotImplemented
		// Run inference to generate a caption.
		result, model, err := vision.Caption(request.Images, media.SrcRemote)

		// Generate Vision API service response.
		response := vision.ApiResponse{
			Id:     request.GetId(),
			Code:   code,
			Error:  http.StatusText(http.StatusNotImplemented),
			Model:  &vision.Model{Type: vision.ModelTypeCaption},
			Result: vision.ApiResult{Caption: &vision.CaptionResult{Text: "This is a test.", Confidence: 0.14159265359}},
		if err != nil {
			log.Errorf("vision: %s (caption)", err)
			c.JSON(http.StatusBadRequest, vision.NewApiError(request.GetId(), http.StatusBadRequest))
			return
		} else if model == nil {
			log.Errorf("vision: no model specified (caption)")
			c.JSON(http.StatusInternalServerError, vision.NewApiError(request.GetId(), http.StatusInternalServerError))
			return
		} else if result == nil {
			log.Errorf("vision: no result (caption)")
			c.JSON(http.StatusInternalServerError, vision.NewApiError(request.GetId(), http.StatusInternalServerError))
			return
		}

		c.JSON(code, response)
		// Generate Vision API service response.
		response := vision.NewCaptionResponse(
			request.GetId(),
			&vision.Model{Type: model.Type, Name: model.Name, Version: model.Version},
			result,
		)

		c.JSON(http.StatusOK, response)
	})
}

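To summarize the new control flow, here is a hedged sketch (not part of this commit) of the status mapping the handler now implements; the helper name and the api package name are assumptions for illustration only.

package api

import (
	"net/http"

	"github.com/photoprism/photoprism/internal/ai/vision"
)

// captionStatus mirrors the handler's branching: request or inference errors
// map to 400, a missing model or missing result maps to 500, success to 200.
func captionStatus(result *vision.CaptionResult, model *vision.Model, err error) int {
	switch {
	case err != nil:
		return http.StatusBadRequest
	case model == nil, result == nil:
		return http.StatusInternalServerError
	default:
		return http.StatusOK
	}
}
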
@@ -63,7 +63,11 @@ func PostVisionLabels(router *gin.RouterGroup) {
		}

		// Generate Vision API service response.
		response := vision.NewLabelsResponse(request.GetId(), vision.NasnetModel, labels)
		response := vision.NewLabelsResponse(
			request.GetId(),
			&vision.Model{Type: vision.ModelTypeLabels},
			labels,
		)

		c.JSON(http.StatusOK, response)
	})

@@ -9,7 +9,7 @@ import (
)

// Caption returns generated caption for the specified media file.
func (ind *Index) Caption(file *MediaFile) (caption vision.CaptionResult, err error) {
func (ind *Index) Caption(file *MediaFile) (caption *vision.CaptionResult, err error) {
	start := time.Now()

	size := vision.Thumb(vision.ModelTypeCaption)
@@ -22,7 +22,7 @@ func (ind *Index) Caption(file *MediaFile) (caption vision.CaptionResult, err error) {
	}

	// Get the generated caption from the computer vision model.
	if caption, err = vision.Caption(fileName, media.SrcLocal); err != nil {
	if caption, _, err = vision.Caption(vision.Files{fileName}, media.SrcLocal); err != nil {
	} else if caption.Text != "" {
		log.Infof("vision: generated caption for %s [%s]", clean.Log(file.BaseName()), time.Since(start))
	}

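Finally, a brief sketch (not part of this commit) of how indexing code should treat the pointer result now returned by Index.Caption; the helper function is hypothetical and the surrounding indexing logic is omitted.

package photoprism

// captionText is a hypothetical helper: with the result now a pointer, callers
// nil-check it before reading Text so that a missing caption yields an empty
// string instead of a panic.
func captionText(ind *Index, file *MediaFile) string {
	caption, err := ind.Caption(file)

	if err != nil || caption == nil {
		return ""
	}

	return caption.Text
}
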