From 93c5825364b8daff2a570a4a004d0164f41c5f17 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Sat, 10 Aug 2024 08:55:16 +0530 Subject: [PATCH 1/7] Add MobileCLIP URLs --- desktop/src/main/services/ml-worker.ts | 32 ++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/desktop/src/main/services/ml-worker.ts b/desktop/src/main/services/ml-worker.ts index f4b9221f64..d6e61eeb7d 100644 --- a/desktop/src/main/services/ml-worker.ts +++ b/desktop/src/main/services/ml-worker.ts @@ -201,9 +201,15 @@ const createInferenceSession = async (modelPath: string) => { }); }; +// TODO-ML: Remove me +// const cachedCLIPImageSessionOAI = makeCachedInferenceSession( +// "clip-image-vit-32-float32.onnx", +// 351468764 /* 335 MB */, +// ); + const cachedCLIPImageSession = makeCachedInferenceSession( - "clip-image-vit-32-float32.onnx", - 351468764 /* 335.2 MB */, + "mobileclip_s2_image.onnx", + 143061211 /* 143 MB */, ); /** @@ -223,9 +229,27 @@ export const computeCLIPImageEmbedding = async (input: Float32Array) => { return ensure(results.output).data as Float32Array; }; +// TODO-ML: Remove me +// const cachedCLIPTextSessionOAIQ = makeCachedInferenceSession( +// "clip-text-vit-32-uint8.onnx", +// 64173509 /* 61 MB */, +// ); + +// TODO-ML: Remove me +// const cachedCLIPTextSessionOAI = makeCachedInferenceSession( +// "clip-text-vit-32-float32-int32.onnx", +// 254069585 /* 254 MB */, +// ); + +// TODO-ML: Remove me +// const cachedCLIPTextSession = makeCachedInferenceSession( +// "mobileclip_s2_text.onnx", +// 253895732 /* 253 MB */, +// ); + const cachedCLIPTextSession = makeCachedInferenceSession( - "clip-text-vit-32-uint8.onnx", - 64173509 /* 61.2 MB */, + "mobileclip_s2_text_int32.onnx", + 253895600 /* 253 MB */, ); let _tokenizer: Tokenizer | undefined; From 5ce8d9838fa56c324bc17b16ddd1b818cc620e80 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Sat, 10 Aug 2024 09:48:07 +0530 Subject: [PATCH 2/7] 224 => 256 https://github.com/apple/ml-mobileclip/blob/main/mobileclip/configs/mobileclip_s2.json --- desktop/src/main/services/ml-worker.ts | 2 +- web/packages/new/photos/services/ml/clip.ts | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/desktop/src/main/services/ml-worker.ts b/desktop/src/main/services/ml-worker.ts index d6e61eeb7d..40c1c5fb5e 100644 --- a/desktop/src/main/services/ml-worker.ts +++ b/desktop/src/main/services/ml-worker.ts @@ -220,7 +220,7 @@ const cachedCLIPImageSession = makeCachedInferenceSession( export const computeCLIPImageEmbedding = async (input: Float32Array) => { const session = await cachedCLIPImageSession(); const feeds = { - input: new ort.Tensor("float32", input, [1, 3, 224, 224]), + input: new ort.Tensor("float32", input, [1, 3, 256, 256]), }; const t = Date.now(); const results = await session.run(feeds); diff --git a/web/packages/new/photos/services/ml/clip.ts b/web/packages/new/photos/services/ml/clip.ts index 78eff1c04d..c61cb5b535 100644 --- a/web/packages/new/photos/services/ml/clip.ts +++ b/web/packages/new/photos/services/ml/clip.ts @@ -120,8 +120,7 @@ const computeEmbedding = async ( * Convert {@link imageData} into the format that the CLIP model expects. 
*/ const convertToCLIPInput = (imageData: ImageData) => { - const requiredWidth = 224; - const requiredHeight = 224; + const [requiredWidth, requiredHeight] = [256, 256]; const mean = [0.48145466, 0.4578275, 0.40821073] as const; const std = [0.26862954, 0.26130258, 0.27577711] as const; From b503f7599952947b41a76fa366951030a6ddabc1 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Sat, 10 Aug 2024 13:09:23 +0530 Subject: [PATCH 3/7] Don't need the mean/std --- web/packages/new/photos/services/ml/clip.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/web/packages/new/photos/services/ml/clip.ts b/web/packages/new/photos/services/ml/clip.ts index c61cb5b535..1cd14047f4 100644 --- a/web/packages/new/photos/services/ml/clip.ts +++ b/web/packages/new/photos/services/ml/clip.ts @@ -122,8 +122,10 @@ const computeEmbedding = async ( const convertToCLIPInput = (imageData: ImageData) => { const [requiredWidth, requiredHeight] = [256, 256]; - const mean = [0.48145466, 0.4578275, 0.40821073] as const; - const std = [0.26862954, 0.26130258, 0.27577711] as const; + // const mean = [0.48145466, 0.4578275, 0.40821073] as const; + const mean = [0, 0, 0] as const; + // const std = [0.26862954, 0.26130258, 0.27577711] as const; + const std = [1, 1, 1] as const; const { width, height, data: pixelData } = imageData; From 1f28fdada2bd249bbabffcf4f02cf37596ec5385 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Sat, 10 Aug 2024 13:11:02 +0530 Subject: [PATCH 4/7] Bilinear --- web/packages/new/photos/services/ml/clip.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/web/packages/new/photos/services/ml/clip.ts b/web/packages/new/photos/services/ml/clip.ts index 1cd14047f4..4c19e0395b 100644 --- a/web/packages/new/photos/services/ml/clip.ts +++ b/web/packages/new/photos/services/ml/clip.ts @@ -1,7 +1,7 @@ import type { ElectronMLWorker } from "@/base/types/ipc"; import type { ImageBitmapAndData } from "./blob"; import { clipIndexes } from "./db"; -import { pixelRGBBicubic } from "./image"; +import { pixelRGBBilinear } from "./image"; import { dotProduct, norm } from "./math"; import type { CLIPMatches } from "./worker-types"; @@ -145,7 +145,7 @@ const convertToCLIPInput = (imageData: ImageData) => { const cOffsetB = 2 * requiredHeight * requiredWidth; // ChannelOffsetBlue for (let h = 0 + heightOffset; h < scaledHeight - heightOffset; h++) { for (let w = 0 + widthOffset; w < scaledWidth - widthOffset; w++) { - const { r, g, b } = pixelRGBBicubic( + const { r, g, b } = pixelRGBBilinear( w / scale, h / scale, pixelData, From 5bbc2615e432d4190519edb0c3249e4e002d038f Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Sat, 10 Aug 2024 13:39:14 +0530 Subject: [PATCH 5/7] Tune the threshold for MobileCLIP Experimentation. - 0.15 was noisy - 0.23 was too strict --- web/packages/new/photos/services/ml/clip.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/packages/new/photos/services/ml/clip.ts b/web/packages/new/photos/services/ml/clip.ts index 4c19e0395b..e0d1211fbe 100644 --- a/web/packages/new/photos/services/ml/clip.ts +++ b/web/packages/new/photos/services/ml/clip.ts @@ -190,5 +190,5 @@ export const clipMatches = async ( // This code is on the hot path, so these optimizations help. 
[fileID, dotProduct(embedding, textEmbedding)] as const, ); - return new Map(items.filter(([, score]) => score >= 0.23)); + return new Map(items.filter(([, score]) => score >= 0.2)); }; From 72bce123a5fc6b32e208086aad3c7025e2428992 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Sat, 10 Aug 2024 13:42:26 +0530 Subject: [PATCH 6/7] Cleanup --- desktop/src/main/services/ml-worker.ts | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/desktop/src/main/services/ml-worker.ts b/desktop/src/main/services/ml-worker.ts index 40c1c5fb5e..68a8eb8349 100644 --- a/desktop/src/main/services/ml-worker.ts +++ b/desktop/src/main/services/ml-worker.ts @@ -201,12 +201,6 @@ const createInferenceSession = async (modelPath: string) => { }); }; -// TODO-ML: Remove me -// const cachedCLIPImageSessionOAI = makeCachedInferenceSession( -// "clip-image-vit-32-float32.onnx", -// 351468764 /* 335 MB */, -// ); - const cachedCLIPImageSession = makeCachedInferenceSession( "mobileclip_s2_image.onnx", 143061211 /* 143 MB */, @@ -229,24 +223,6 @@ export const computeCLIPImageEmbedding = async (input: Float32Array) => { return ensure(results.output).data as Float32Array; }; -// TODO-ML: Remove me -// const cachedCLIPTextSessionOAIQ = makeCachedInferenceSession( -// "clip-text-vit-32-uint8.onnx", -// 64173509 /* 61 MB */, -// ); - -// TODO-ML: Remove me -// const cachedCLIPTextSessionOAI = makeCachedInferenceSession( -// "clip-text-vit-32-float32-int32.onnx", -// 254069585 /* 254 MB */, -// ); - -// TODO-ML: Remove me -// const cachedCLIPTextSession = makeCachedInferenceSession( -// "mobileclip_s2_text.onnx", -// 253895732 /* 253 MB */, -// ); - const cachedCLIPTextSession = makeCachedInferenceSession( "mobileclip_s2_text_int32.onnx", 253895600 /* 253 MB */, @@ -294,7 +270,7 @@ export const computeCLIPTextEmbeddingIfAvailable = async (text: string) => { const cachedFaceDetectionSession = makeCachedInferenceSession( "yolov5s_face_640_640_dynamic.onnx", - 30762872 /* 29.3 MB */, + 30762872 /* 29 MB */, ); /** From ac8a5b491d0f02d349d21c1f6538d06e0445afb5 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Sat, 10 Aug 2024 13:45:03 +0530 Subject: [PATCH 7/7] Update refs --- desktop/src/main/services/ml-worker.ts | 4 ++-- web/packages/new/photos/services/ml/clip.ts | 18 +++++++----------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/desktop/src/main/services/ml-worker.ts b/desktop/src/main/services/ml-worker.ts index 68a8eb8349..cd9a3e0404 100644 --- a/desktop/src/main/services/ml-worker.ts +++ b/desktop/src/main/services/ml-worker.ts @@ -209,7 +209,7 @@ const cachedCLIPImageSession = makeCachedInferenceSession( /** * Compute CLIP embeddings for an image. * - * The embeddings are computed using ONNX runtime, with CLIP as the model. + * The embeddings are computed using ONNX runtime, with MobileCLIP as the model. */ export const computeCLIPImageEmbedding = async (input: Float32Array) => { const session = await cachedCLIPImageSession(); @@ -237,7 +237,7 @@ const getTokenizer = () => { /** * Compute CLIP embeddings for an text snippet. * - * The embeddings are computed using ONNX runtime, with CLIP as the model. + * The embeddings are computed using ONNX runtime, with MobileCLIP as the model. 
*/ export const computeCLIPTextEmbeddingIfAvailable = async (text: string) => { const sessionOrSkip = await Promise.race([ diff --git a/web/packages/new/photos/services/ml/clip.ts b/web/packages/new/photos/services/ml/clip.ts index e0d1211fbe..b226ef10cb 100644 --- a/web/packages/new/photos/services/ml/clip.ts +++ b/web/packages/new/photos/services/ml/clip.ts @@ -39,8 +39,9 @@ export const clipIndexingVersion = 1; * initial launch of this feature using the GGML runtime. * * Since the initial launch, we've switched over to another runtime, - * [ONNX](https://onnxruntime.ai) and have made other implementation changes, - * but the overall gist remains the same. + * [ONNX](https://onnxruntime.ai), started using Apple's + * [MobileCLIP](https://github.com/apple/ml-mobileclip/) as the model and have + * made other implementation changes, but the overall gist remains the same. * * Note that we don't train the neural network - we only use one of the publicly * available pre-trained neural networks for inference. These neural networks @@ -117,16 +118,11 @@ const computeEmbedding = async ( }; /** - * Convert {@link imageData} into the format that the CLIP model expects. + * Convert {@link imageData} into the format that the MobileCLIP model expects. */ const convertToCLIPInput = (imageData: ImageData) => { const [requiredWidth, requiredHeight] = [256, 256]; - // const mean = [0.48145466, 0.4578275, 0.40821073] as const; - const mean = [0, 0, 0] as const; - // const std = [0.26862954, 0.26130258, 0.27577711] as const; - const std = [1, 1, 1] as const; - const { width, height, data: pixelData } = imageData; // Maintain aspect ratio. @@ -152,9 +148,9 @@ const convertToCLIPInput = (imageData: ImageData) => { width, height, ); - clipInput[pi] = (r / 255.0 - mean[0]) / std[0]; - clipInput[pi + cOffsetG] = (g / 255.0 - mean[1]) / std[1]; - clipInput[pi + cOffsetB] = (b / 255.0 - mean[2]) / std[2]; + clipInput[pi] = r / 255.0; + clipInput[pi + cOffsetG] = g / 255.0; + clipInput[pi + cOffsetB] = b / 255.0; pi++; } }
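
Taken together, patches 2, 3, 4 and 7 move the image preprocessing over to what the MobileCLIP S2 image encoder expects: a 256×256 scale-to-cover with bilinear resampling and plain /255 scaling, with no ImageNet mean/std. Below is a minimal, self-contained sketch of that pipeline; the helper bilinearRGB, the function name toMobileCLIPImageInput, and the cover/crop arithmetic are illustrative stand-ins based on the hunks above, not the repository's pixelRGBBilinear and convertToCLIPInput code.

// A minimal sketch, not the repository code. `bilinearRGB` stands in for the
// `pixelRGBBilinear` helper referenced by the diffs, and the cover/crop
// arithmetic is an assumption inferred from the hunks shown above.

/** Bilinearly sample the RGB value at (fx, fy) from RGBA `data` of size w×h. */
const bilinearRGB = (
    fx: number,
    fy: number,
    data: Uint8ClampedArray,
    w: number,
    h: number,
) => {
    const x0 = Math.max(0, Math.min(Math.floor(fx), w - 1));
    const y0 = Math.max(0, Math.min(Math.floor(fy), h - 1));
    const x1 = Math.min(x0 + 1, w - 1);
    const y1 = Math.min(y0 + 1, h - 1);
    const dx = fx - x0;
    const dy = fy - y0;
    const at = (x: number, y: number, c: number) => data[(y * w + x) * 4 + c];
    const lerp2 = (c: number) =>
        (1 - dy) * ((1 - dx) * at(x0, y0, c) + dx * at(x1, y0, c)) +
        dy * ((1 - dx) * at(x0, y1, c) + dx * at(x1, y1, c));
    return { r: lerp2(0), g: lerp2(1), b: lerp2(2) };
};

/**
 * Convert an ImageData into the [1, 3, 256, 256] CHW float32 input that the
 * MobileCLIP S2 image encoder expects: scale to cover 256×256, center crop,
 * bilinear resampling, and plain /255 scaling (no ImageNet mean/std).
 */
const toMobileCLIPImageInput = (imageData: ImageData): Float32Array => {
    const [requiredWidth, requiredHeight] = [256, 256];
    const { width, height, data: pixelData } = imageData;

    // Scale preserving aspect ratio so both sides cover the target, then
    // center crop the overflow along the longer side.
    const scale = Math.max(requiredWidth / width, requiredHeight / height);
    const widthOffset = (width * scale - requiredWidth) / 2;
    const heightOffset = (height * scale - requiredHeight) / 2;

    // Planar (CHW) layout: all red values, then all green, then all blue.
    const clipInput = new Float32Array(3 * requiredWidth * requiredHeight);
    const cOffsetG = requiredHeight * requiredWidth;
    const cOffsetB = 2 * requiredHeight * requiredWidth;

    let pi = 0;
    for (let y = 0; y < requiredHeight; y++) {
        for (let x = 0; x < requiredWidth; x++) {
            const { r, g, b } = bilinearRGB(
                (x + widthOffset) / scale,
                (y + heightOffset) / scale,
                pixelData,
                width,
                height,
            );
            clipInput[pi] = r / 255.0;
            clipInput[pi + cOffsetG] = g / 255.0;
            clipInput[pi + cOffsetB] = b / 255.0;
            pi++;
        }
    }
    return clipInput;
};

The resulting Float32Array is what gets fed to the ONNX session as the [1, 3, 256, 256] tensor in computeCLIPImageEmbedding; dropping the OpenAI CLIP mean/std in favour of bare /255 scaling is exactly what the "Don't need the mean/std" commit does.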
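
The 0.2 cutoff from patch 5 gates a simple cosine-similarity scan at query time. A rough sketch of that step follows, assuming (as the surrounding module documents) that the stored image embeddings and the text embedding are already normalized; matchingFileIDs and the inlined dotProduct are illustrative names here, not the module's exports.

// Sketch under the stated assumption: both sides are normalized, so the dot
// product is the cosine similarity.
const dotProduct = (a: Float32Array, b: Float32Array) => {
    let sum = 0;
    for (let i = 0; i < a.length; i++) sum += a[i] * b[i];
    return sum;
};

const matchingFileIDs = (
    rows: { fileID: number; embedding: Float32Array }[],
    textEmbedding: Float32Array,
): Map<number, number> =>
    new Map(
        rows
            .map(
                ({ fileID, embedding }) =>
                    [fileID, dotProduct(embedding, textEmbedding)] as const,
            )
            // 0.2 is the tuned MobileCLIP cutoff from patch 5: 0.15 was
            // noisy, 0.23 was too strict.
            .filter(([, score]) => score >= 0.2),
    );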