mirror of
https://github.com/photoprism/photoprism.git
synced 2025-12-12 00:34:13 +01:00
Signed-off-by: Michael Mayer <michael@photoprism.app>
This commit is contained in:
@@ -129,3 +129,13 @@ func NewLabelsResponse(id string, model *Model, results classify.Labels) ApiResp
|
|||||||
Result: ApiResult{Labels: labels},
|
Result: ApiResult{Labels: labels},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NewCaptionResponse generates a new Vision API image caption service response.
|
||||||
|
func NewCaptionResponse(id string, model *Model, result *CaptionResult) ApiResponse {
|
||||||
|
return ApiResponse{
|
||||||
|
Id: clean.Type(id),
|
||||||
|
Code: http.StatusOK,
|
||||||
|
Model: &Model{Type: ModelTypeLabels, Name: model.Name, Version: model.Version, Resolution: model.Resolution},
|
||||||
|
Result: ApiResult{Caption: result},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -7,19 +7,29 @@ import (
|
|||||||
"github.com/photoprism/photoprism/pkg/media"
|
"github.com/photoprism/photoprism/pkg/media"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// CaptionPromptDefault is the default prompt used to generate captions.
|
||||||
|
var CaptionPromptDefault = `Create a caption that sounds natural and briefly describes the main content of the image in up to` +
|
||||||
|
` three sentences for use in a photo management application. Begin with the type or number of subjects and` +
|
||||||
|
` action. Omit text formatting and avoid meta-language such as "this picture", "the picture", "the photo",` +
|
||||||
|
` "there are", "here is", or "a picture of". Use explicit language to describe the scene if it helps to` +
|
||||||
|
` properly understand the picture.`
|
||||||
|
|
||||||
|
// CaptionModelDefault is the default model used to generate captions.
|
||||||
|
var CaptionModelDefault = "qwen2.5vl"
|
||||||
|
|
||||||
// Caption returns generated captions for the specified images.
|
// Caption returns generated captions for the specified images.
|
||||||
func Caption(imgName string, src media.Src) (result CaptionResult, err error) {
|
func Caption(images Files, src media.Src) (result *CaptionResult, model *Model, err error) {
|
||||||
// Return if there is no configuration or no image classification models are configured.
|
// Return if there is no configuration or no image classification models are configured.
|
||||||
if Config == nil {
|
if Config == nil {
|
||||||
return result, errors.New("vision service is not configured")
|
return result, model, errors.New("vision service is not configured")
|
||||||
} else if model := Config.Model(ModelTypeCaption); model != nil {
|
} else if model = Config.Model(ModelTypeCaption); model != nil {
|
||||||
// Use remote service API if a server endpoint has been configured.
|
// Use remote service API if a server endpoint has been configured.
|
||||||
if uri, method := model.Endpoint(); uri != "" && method != "" {
|
if uri, method := model.Endpoint(); uri != "" && method != "" {
|
||||||
var apiRequest *ApiRequest
|
var apiRequest *ApiRequest
|
||||||
var apiResponse *ApiResponse
|
var apiResponse *ApiResponse
|
||||||
|
|
||||||
if apiRequest, err = NewApiRequest(model.EndpointRequestFormat(), Files{imgName}, model.EndpointFileScheme()); err != nil {
|
if apiRequest, err = NewApiRequest(model.EndpointRequestFormat(), images, model.EndpointFileScheme()); err != nil {
|
||||||
return result, err
|
return result, model, err
|
||||||
}
|
}
|
||||||
|
|
||||||
if model.Name != "" {
|
if model.Name != "" {
|
||||||
@@ -28,19 +38,23 @@ func Caption(imgName string, src media.Src) (result CaptionResult, err error) {
|
|||||||
|
|
||||||
if model.Version != "" {
|
if model.Version != "" {
|
||||||
apiRequest.Version = model.Version
|
apiRequest.Version = model.Version
|
||||||
|
} else {
|
||||||
|
apiRequest.Version = "latest"
|
||||||
}
|
}
|
||||||
|
|
||||||
if model.Prompt != "" {
|
if model.Prompt != "" {
|
||||||
apiRequest.Prompt = model.Prompt
|
apiRequest.Prompt = model.Prompt
|
||||||
|
} else {
|
||||||
|
apiRequest.Prompt = CaptionPromptDefault
|
||||||
}
|
}
|
||||||
|
|
||||||
// Log JSON request data in trace mode.
|
// Log JSON request data in trace mode.
|
||||||
apiRequest.WriteLog()
|
apiRequest.WriteLog()
|
||||||
|
|
||||||
if apiResponse, err = PerformApiRequest(apiRequest, uri, method, model.EndpointKey()); err != nil {
|
if apiResponse, err = PerformApiRequest(apiRequest, uri, method, model.EndpointKey()); err != nil {
|
||||||
return result, err
|
return result, model, err
|
||||||
} else if apiResponse.Result.Caption == nil {
|
} else if apiResponse.Result.Caption == nil {
|
||||||
return result, errors.New("invalid caption model response")
|
return result, model, errors.New("invalid caption model response")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set image as the default caption source.
|
// Set image as the default caption source.
|
||||||
@@ -48,13 +62,13 @@ func Caption(imgName string, src media.Src) (result CaptionResult, err error) {
|
|||||||
apiResponse.Result.Caption.Source = entity.SrcImage
|
apiResponse.Result.Caption.Source = entity.SrcImage
|
||||||
}
|
}
|
||||||
|
|
||||||
result = *apiResponse.Result.Caption
|
result = apiResponse.Result.Caption
|
||||||
} else {
|
} else {
|
||||||
return result, errors.New("invalid caption model configuration")
|
return result, model, errors.New("invalid caption model configuration")
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
return result, errors.New("missing caption model")
|
return result, model, errors.New("missing caption model")
|
||||||
}
|
}
|
||||||
|
|
||||||
return result, nil
|
return result, model, nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -20,9 +20,10 @@ func TestCaption(t *testing.T) {
|
|||||||
t.Run("Success", func(t *testing.T) {
|
t.Run("Success", func(t *testing.T) {
|
||||||
expectedText := "An image of sound waves"
|
expectedText := "An image of sound waves"
|
||||||
|
|
||||||
result, err := Caption("https://dl.photoprism.app/img/artwork/colorwaves-400.jpg", media.SrcRemote)
|
result, model, err := Caption(Files{"https://dl.photoprism.app/img/artwork/colorwaves-400.jpg"}, media.SrcRemote)
|
||||||
|
|
||||||
assert.NoError(t, err)
|
assert.NoError(t, err)
|
||||||
|
assert.NotNil(t, model)
|
||||||
assert.IsType(t, CaptionResult{}, result)
|
assert.IsType(t, CaptionResult{}, result)
|
||||||
assert.LessOrEqual(t, float32(0.0), result.Confidence)
|
assert.LessOrEqual(t, float32(0.0), result.Confidence)
|
||||||
|
|
||||||
@@ -31,9 +32,10 @@ func TestCaption(t *testing.T) {
|
|||||||
assert.Equal(t, expectedText, result.Text)
|
assert.Equal(t, expectedText, result.Text)
|
||||||
})
|
})
|
||||||
t.Run("Invalid", func(t *testing.T) {
|
t.Run("Invalid", func(t *testing.T) {
|
||||||
result, err := Caption("", media.SrcLocal)
|
result, model, err := Caption(nil, media.SrcLocal)
|
||||||
|
|
||||||
assert.Error(t, err)
|
assert.Error(t, err)
|
||||||
|
assert.Nil(t, model)
|
||||||
assert.IsType(t, CaptionResult{}, result)
|
assert.IsType(t, CaptionResult{}, result)
|
||||||
assert.Equal(t, "", result.Text)
|
assert.Equal(t, "", result.Text)
|
||||||
assert.Equal(t, float32(0.0), result.Confidence)
|
assert.Equal(t, float32(0.0), result.Confidence)
|
||||||
|
|||||||
@@ -14,6 +14,9 @@ import (
|
|||||||
|
|
||||||
var modelMutex = sync.Mutex{}
|
var modelMutex = sync.Mutex{}
|
||||||
|
|
||||||
|
// ModelVersionDefault is the default model version.
|
||||||
|
var ModelVersionDefault = "latest"
|
||||||
|
|
||||||
// Model represents a computer vision model configuration.
|
// Model represents a computer vision model configuration.
|
||||||
type Model struct {
|
type Model struct {
|
||||||
Type ModelType `yaml:"Type,omitempty" json:"type,omitempty"`
|
Type ModelType `yaml:"Type,omitempty" json:"type,omitempty"`
|
||||||
|
|||||||
@@ -32,8 +32,9 @@ var (
|
|||||||
CaptionModel = &Model{
|
CaptionModel = &Model{
|
||||||
Type: ModelTypeCaption,
|
Type: ModelTypeCaption,
|
||||||
Resolution: 224,
|
Resolution: 224,
|
||||||
Name: "qwen2.5vl",
|
Name: CaptionModelDefault,
|
||||||
Version: "latest",
|
Version: ModelVersionDefault,
|
||||||
|
Prompt: CaptionPromptDefault,
|
||||||
Service: Service{
|
Service: Service{
|
||||||
Uri: "http://photoprism-vision:5000/api/v1/vision/caption",
|
Uri: "http://photoprism-vision:5000/api/v1/vision/caption",
|
||||||
Method: http.MethodPost,
|
Method: http.MethodPost,
|
||||||
|
|||||||
6
internal/ai/vision/testdata/vision.yml
vendored
6
internal/ai/vision/testdata/vision.yml
vendored
@@ -18,6 +18,12 @@ Models:
|
|||||||
- Type: caption
|
- Type: caption
|
||||||
Name: qwen2.5vl
|
Name: qwen2.5vl
|
||||||
Version: latest
|
Version: latest
|
||||||
|
Prompt: Create a caption that sounds natural and briefly describes the main content
|
||||||
|
of the image in up to three sentences for use in a photo management application.
|
||||||
|
Begin with the type or number of subjects and action. Omit text formatting and
|
||||||
|
avoid meta-language such as "this picture", "the picture", "the photo", "there
|
||||||
|
are", "here is", or "a picture of". Use explicit language to describe the scene
|
||||||
|
if it helps to properly understand the picture.
|
||||||
Resolution: 224
|
Resolution: 224
|
||||||
Service:
|
Service:
|
||||||
Uri: http://photoprism-vision:5000/api/v1/vision/caption
|
Uri: http://photoprism-vision:5000/api/v1/vision/caption
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ import (
|
|||||||
"github.com/photoprism/photoprism/internal/ai/vision"
|
"github.com/photoprism/photoprism/internal/ai/vision"
|
||||||
"github.com/photoprism/photoprism/internal/auth/acl"
|
"github.com/photoprism/photoprism/internal/auth/acl"
|
||||||
"github.com/photoprism/photoprism/internal/photoprism/get"
|
"github.com/photoprism/photoprism/internal/photoprism/get"
|
||||||
|
"github.com/photoprism/photoprism/pkg/media"
|
||||||
"github.com/photoprism/photoprism/pkg/media/http/header"
|
"github.com/photoprism/photoprism/pkg/media/http/header"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -51,18 +52,30 @@ func PostVisionCaption(router *gin.RouterGroup) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Return error code 501 until this service is implemented.
|
// Run inference to generate a caption.
|
||||||
code := http.StatusNotImplemented
|
result, model, err := vision.Caption(request.Images, media.SrcRemote)
|
||||||
|
|
||||||
// Generate Vision API service response.
|
if err != nil {
|
||||||
response := vision.ApiResponse{
|
log.Errorf("vision: %s (caption)", err)
|
||||||
Id: request.GetId(),
|
c.JSON(http.StatusBadRequest, vision.NewApiError(request.GetId(), http.StatusBadRequest))
|
||||||
Code: code,
|
return
|
||||||
Error: http.StatusText(http.StatusNotImplemented),
|
} else if model == nil {
|
||||||
Model: &vision.Model{Type: vision.ModelTypeCaption},
|
log.Errorf("vision: no model specified (caption)")
|
||||||
Result: vision.ApiResult{Caption: &vision.CaptionResult{Text: "This is a test.", Confidence: 0.14159265359}},
|
c.JSON(http.StatusInternalServerError, vision.NewApiError(request.GetId(), http.StatusInternalServerError))
|
||||||
|
return
|
||||||
|
} else if result == nil {
|
||||||
|
log.Errorf("vision: no result (caption)")
|
||||||
|
c.JSON(http.StatusInternalServerError, vision.NewApiError(request.GetId(), http.StatusInternalServerError))
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
c.JSON(code, response)
|
// Generate Vision API service response.
|
||||||
|
response := vision.NewCaptionResponse(
|
||||||
|
request.GetId(),
|
||||||
|
&vision.Model{Type: model.Type, Name: model.Name, Version: model.Version},
|
||||||
|
result,
|
||||||
|
)
|
||||||
|
|
||||||
|
c.JSON(http.StatusOK, response)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -63,7 +63,11 @@ func PostVisionLabels(router *gin.RouterGroup) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Generate Vision API service response.
|
// Generate Vision API service response.
|
||||||
response := vision.NewLabelsResponse(request.GetId(), vision.NasnetModel, labels)
|
response := vision.NewLabelsResponse(
|
||||||
|
request.GetId(),
|
||||||
|
&vision.Model{Type: vision.ModelTypeLabels},
|
||||||
|
labels,
|
||||||
|
)
|
||||||
|
|
||||||
c.JSON(http.StatusOK, response)
|
c.JSON(http.StatusOK, response)
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
// Caption returns generated caption for the specified media file.
|
// Caption returns generated caption for the specified media file.
|
||||||
func (ind *Index) Caption(file *MediaFile) (caption vision.CaptionResult, err error) {
|
func (ind *Index) Caption(file *MediaFile) (caption *vision.CaptionResult, err error) {
|
||||||
start := time.Now()
|
start := time.Now()
|
||||||
|
|
||||||
size := vision.Thumb(vision.ModelTypeCaption)
|
size := vision.Thumb(vision.ModelTypeCaption)
|
||||||
@@ -22,7 +22,7 @@ func (ind *Index) Caption(file *MediaFile) (caption vision.CaptionResult, err er
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Get matching labels from computer vision model.
|
// Get matching labels from computer vision model.
|
||||||
if caption, err = vision.Caption(fileName, media.SrcLocal); err != nil {
|
if caption, _, err = vision.Caption(vision.Files{fileName}, media.SrcLocal); err != nil {
|
||||||
} else if caption.Text != "" {
|
} else if caption.Text != "" {
|
||||||
log.Infof("vision: generated caption for %s [%s]", clean.Log(file.BaseName()), time.Since(start))
|
log.Infof("vision: generated caption for %s [%s]", clean.Log(file.BaseName()), time.Since(start))
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user