diff --git a/Makefile b/Makefile index 8e077c9ba..482df8a22 100644 --- a/Makefile +++ b/Makefile @@ -399,9 +399,9 @@ docker-build: $(DOCKER_COMPOSE) build --pull docker-nvidia: docker-nvidia-up docker-nvidia-up: - docker compose -f compose.nvidia.yaml up + docker compose --profile=vision -f compose.nvidia.yaml up docker-nvidia-build: - docker compose -f compose.nvidia.yaml up + docker compose --profile=vision -f compose.nvidia.yaml build docker-intel: docker-intel-up docker-intel-up: docker compose -f compose.intel.yaml up diff --git a/compose.nvidia.yaml b/compose.nvidia.yaml index 1a62442ca..1706df1f3 100644 --- a/compose.nvidia.yaml +++ b/compose.nvidia.yaml @@ -22,6 +22,8 @@ services: links: - "traefik:localssl.dev" - "traefik:app.localssl.dev" + - "traefik:vision.localssl.dev" + - "traefik:qdrant.localssl.dev" - "traefik:keycloak.localssl.dev" - "traefik:dummy-oidc.localssl.dev" - "traefik:dummy-webdav.localssl.dev" @@ -112,7 +114,7 @@ services: TF_CPP_MIN_LOG_LEVEL: 0 # show TensorFlow log messages for development ## Nvidia Video Transcoding (https://docs.photoprism.app/getting-started/advanced/transcoding/#nvidia-container-toolkit): NVIDIA_VISIBLE_DEVICES: "all" - NVIDIA_DRIVER_CAPABILITIES: "compute,video,utility" + NVIDIA_DRIVER_CAPABILITIES: "all" PHOTOPRISM_FFMPEG_ENCODER: "nvidia" # H.264/AVC encoder (software, intel, nvidia, apple, raspberry, or vaapi) PHOTOPRISM_FFMPEG_SIZE: "1920" # video size limit in pixels (720-7680) (default: 3840) PHOTOPRISM_FFMPEG_BITRATE: "50" # video bitrate limit in Mbit/s (default: 50) @@ -144,7 +146,24 @@ services: extends: file: ./compose.yaml service: mariadb + photoprism-vision: + profiles: ["all", "vision"] + environment: + TF_CPP_MIN_LOG_LEVEL: 2 + NVIDIA_VISIBLE_DEVICES: "all" + NVIDIA_DRIVER_CAPABILITIES: "all" + deploy: + resources: + reservations: + devices: + - driver: "nvidia" + count: 1 + capabilities: [ gpu ] + extends: + file: ./compose.yaml + service: photoprism-vision qdrant: + profiles: ["all", "vision"] 
## PhotoPrism® Computer Vision API
## See: https://github.com/photoprism/photoprism-vision
  photoprism-vision:
    image: photoprism/vision:latest
    ## Only started when the "all" or "vision" Compose profile is enabled.
    profiles: ["all", "vision"]
    stop_grace_period: 5s
    working_dir: "/app"
    links:
      - "traefik:localssl.dev"
      - "traefik:app.localssl.dev"
      - "traefik:qdrant.localssl.dev"
    labels:
      - "traefik.enable=true"
      ## The router and service are named "vision"; reusing the "qdrant" names would
      ## collide with the router and service declared by the qdrant container in this file.
      - "traefik.http.services.vision.loadbalancer.server.port=5000"
      - "traefik.http.services.vision.loadbalancer.server.scheme=http"
      - "traefik.http.routers.vision.entrypoints=websecure"
      - "traefik.http.routers.vision.rule=Host(`vision.localssl.dev`)"
      - "traefik.http.routers.vision.priority=3"
      - "traefik.http.routers.vision.tls.domains[0].main=localssl.dev"
      - "traefik.http.routers.vision.tls.domains[0].sans=*.localssl.dev"
      - "traefik.http.routers.vision.tls=true"
    expose:
      - 5000
    environment:
      ## NOTE(review): presumably reduces TensorFlow log output compared to level 0 — confirm.
      TF_CPP_MIN_LOG_LEVEL: 2
type ApiRequest struct {
	// Id identifies the request; Caption sets it to a random UUID.
	Id string `form:"id" yaml:"Id,omitempty" json:"id,omitempty"`
	// Model is the name of the model to use; Caption sets it to the configured caption model name.
	Model string `form:"model" yaml:"Model,omitempty" json:"model,omitempty"`
	// Url references a single image; Caption sets it to either a download URL
	// for a local file or a validated http/https image URL.
	Url string `form:"url" yaml:"Url,omitempty" json:"url,omitempty"`
	// Images is a list of strings (Files); presumably image references or
	// encoded image data for requests with multiple images — TODO confirm against callers.
	Images Files `form:"images" yaml:"Images,omitempty" json:"images,omitempty"`
}
"github.com/photoprism/photoprism/pkg/media/http/scheme" + "github.com/photoprism/photoprism/pkg/rnd" +) + +// Caption returns generated captions for the specified images. +func Caption(imgName string, src media.Src) (result CaptionResult, err error) { + // Return if there is no configuration or no image classification models are configured. + if Config == nil { + return result, errors.New("vision service is not configured") + } else if model := Config.Model(ModelTypeCaption); model != nil { + // Use remote service API if a server endpoint has been configured. + if uri, method := model.Endpoint(); uri != "" && method != "" { + var imgUrl string + + switch src { + case media.SrcLocal: + // Return if no thumbnail filenames were given. + if !fs.FileExistsNotEmpty(imgName) { + return result, errors.New("invalid image file name") + } + + dlId, dlErr := download.Register(imgName) + + if dlErr != nil { + return result, fmt.Errorf("%s (create download url)", err) + } + + imgUrl = fmt.Sprintf("%s/%s", DownloadUrl, dlId) + case media.SrcRemote: + var u *url.URL + if u, err = url.Parse(imgName); err != nil { + return result, fmt.Errorf("%s (invalid image url)", err) + } else if !slices.Contains(scheme.HttpsHttp, u.Scheme) { + return result, fmt.Errorf("unsupported image url scheme %s", clean.Log(u.Scheme)) + } else { + imgUrl = u.String() + } + default: + return result, fmt.Errorf("unsupported media source type %s", clean.Log(src)) + } + + apiRequest := &ApiRequest{ + Id: rnd.UUID(), + Model: model.Name, + Url: imgUrl, + } + + if json, _ := apiRequest.MarshalJSON(); len(json) > 0 { + log.Debugf("request: %s", json) + } + + apiResponse, apiErr := PerformApiRequest(apiRequest, uri, method, model.EndpointKey()) + + if apiErr != nil { + return result, apiErr + } else if apiResponse.Result.Caption == nil { + return result, errors.New("invalid caption model response") + } + + result = *apiResponse.Result.Caption + } else { + return result, errors.New("invalid caption model 
configuration") + } + } else { + return result, errors.New("missing caption model") + } + + return result, nil +} diff --git a/internal/ai/vision/caption_test.go b/internal/ai/vision/caption_test.go new file mode 100644 index 000000000..fba4f0c2f --- /dev/null +++ b/internal/ai/vision/caption_test.go @@ -0,0 +1,41 @@ +package vision + +import ( + "net" + "testing" + "time" + + "github.com/stretchr/testify/assert" + + "github.com/photoprism/photoprism/pkg/media" +) + +func TestCaption(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } else if _, err := net.DialTimeout("tcp", "photoprism-vision:5000", 10*time.Second); err != nil { + t.Skip("skipping test because photoprism-vision is not running.") + } + + t.Run("Success", func(t *testing.T) { + expectedText := "An image of sound waves" + + result, err := Caption("https://dl.photoprism.app/img/artwork/colorwaves-400.jpg", media.SrcRemote) + + assert.NoError(t, err) + assert.IsType(t, CaptionResult{}, result) + assert.LessOrEqual(t, float32(0.0), result.Confidence) + + t.Logf("caption: %#v", result) + + assert.Equal(t, expectedText, result.Text) + }) + t.Run("Invalid", func(t *testing.T) { + result, err := Caption("", media.SrcLocal) + + assert.Error(t, err) + assert.IsType(t, CaptionResult{}, result) + assert.Equal(t, "", result.Text) + assert.Equal(t, float32(0.0), result.Confidence) + }) +} diff --git a/internal/ai/vision/face_embeddings_test.go b/internal/ai/vision/face_embeddings_test.go new file mode 100644 index 000000000..41c803a2d --- /dev/null +++ b/internal/ai/vision/face_embeddings_test.go @@ -0,0 +1,38 @@ +package vision + +import ( + "os" + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/photoprism/photoprism/internal/ai/face" + "github.com/photoprism/photoprism/pkg/fs" +) + +func TestFaceEmbeddings(t *testing.T) { + t.Run("Success", func(t *testing.T) { + img, imgErr := os.ReadFile(fs.Abs("./testdata/face_160x160.jpg")) + + if imgErr != nil { + 
t.Fatal(imgErr) + } + + result, err := FaceEmbeddings(img) + + assert.NoError(t, err) + assert.IsType(t, face.Embeddings{}, result) + assert.Equal(t, 1, len(result)) + + // t.Log(result) + }) + t.Run("InvalidFile", func(t *testing.T) { + result, err := FaceEmbeddings([]byte{}) + + assert.Error(t, err) + assert.IsType(t, face.Embeddings{}, result) + assert.Equal(t, 0, len(result)) + + // t.Log(result) + }) +} diff --git a/internal/ai/vision/models.go b/internal/ai/vision/models.go index ee44a3a75..db1e53ab3 100644 --- a/internal/ai/vision/models.go +++ b/internal/ai/vision/models.go @@ -29,8 +29,8 @@ var ( } CaptionModel = &Model{ Type: ModelTypeCaption, - Name: "Caption", - Uri: "http://photoprism-vision/api/v1/vision/describe", + Name: "kosmos-2", + Uri: "http://photoprism-vision:5000/api/v1/vision/caption", Method: http.MethodPost, Resolution: 720, } diff --git a/pkg/media/http/header/auth.go b/pkg/media/http/header/auth.go index 9f4f7a815..58e732439 100644 --- a/pkg/media/http/header/auth.go +++ b/pkg/media/http/header/auth.go @@ -67,7 +67,7 @@ func Authorization(c *gin.Context) (authType, authToken string) { return "", "" } -// SetAuthorization adds a bearer token authorization header to a request. +// SetAuthorization adds a bearer token authorization header to the given request. func SetAuthorization(r *http.Request, authToken string) { if authToken != "" { r.Header.Add(Auth, fmt.Sprintf("%s %s", AuthBearer, authToken)) diff --git a/pkg/media/http/header/content_types.go b/pkg/media/http/header/content_types.go index df5618fb0..a857cb31d 100644 --- a/pkg/media/http/header/content_types.go +++ b/pkg/media/http/header/content_types.go @@ -117,3 +117,10 @@ func HasContentType(header *http.Header, contentType string) bool { return false } + +// SetContentType adds a content type header to the given request. 
+func SetContentType(r *http.Request, contentType string) { + if contentType != "" { + r.Header.Add(ContentType, contentType) + } +} diff --git a/pkg/media/http/scheme/const.go b/pkg/media/http/scheme/const.go index 54e03baa3..92194cab4 100644 --- a/pkg/media/http/scheme/const.go +++ b/pkg/media/http/scheme/const.go @@ -14,4 +14,5 @@ const ( var ( HttpsData = []string{Https, Data} + HttpsHttp = []string{Https, Http} )