From 93c5825364b8daff2a570a4a004d0164f41c5f17 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Sat, 10 Aug 2024 08:55:16 +0530 Subject: [PATCH 1/7] Add MobileCLIP URLs --- desktop/src/main/services/ml-worker.ts | 32 ++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/desktop/src/main/services/ml-worker.ts b/desktop/src/main/services/ml-worker.ts index f4b9221f64..d6e61eeb7d 100644 --- a/desktop/src/main/services/ml-worker.ts +++ b/desktop/src/main/services/ml-worker.ts @@ -201,9 +201,15 @@ const createInferenceSession = async (modelPath: string) => { }); }; +// TODO-ML: Remove me +// const cachedCLIPImageSessionOAI = makeCachedInferenceSession( +// "clip-image-vit-32-float32.onnx", +// 351468764 /* 335 MB */, +// ); + const cachedCLIPImageSession = makeCachedInferenceSession( - "clip-image-vit-32-float32.onnx", - 351468764 /* 335.2 MB */, + "mobileclip_s2_image.onnx", + 143061211 /* 143 MB */, ); /** @@ -223,9 +229,27 @@ export const computeCLIPImageEmbedding = async (input: Float32Array) => { return ensure(results.output).data as Float32Array; }; +// TODO-ML: Remove me +// const cachedCLIPTextSessionOAIQ = makeCachedInferenceSession( +// "clip-text-vit-32-uint8.onnx", +// 64173509 /* 61 MB */, +// ); + +// TODO-ML: Remove me +// const cachedCLIPTextSessionOAI = makeCachedInferenceSession( +// "clip-text-vit-32-float32-int32.onnx", +// 254069585 /* 254 MB */, +// ); + +// TODO-ML: Remove me +// const cachedCLIPTextSession = makeCachedInferenceSession( +// "mobileclip_s2_text.onnx", +// 253895732 /* 253 MB */, +// ); + const cachedCLIPTextSession = makeCachedInferenceSession( - "clip-text-vit-32-uint8.onnx", - 64173509 /* 61.2 MB */, + "mobileclip_s2_text_int32.onnx", + 253895600 /* 253 MB */, ); let _tokenizer: Tokenizer | undefined; From 5ce8d9838fa56c324bc17b16ddd1b818cc620e80 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Sat, 10 Aug 2024 09:48:07 +0530 Subject: [PATCH 2/7] 224 => 256 https://github.com/apple/ml-mobileclip/blob/main/mobileclip/configs/mobileclip_s2.json --- desktop/src/main/services/ml-worker.ts | 2 +- web/packages/new/photos/services/ml/clip.ts | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/desktop/src/main/services/ml-worker.ts b/desktop/src/main/services/ml-worker.ts index d6e61eeb7d..40c1c5fb5e 100644 --- a/desktop/src/main/services/ml-worker.ts +++ b/desktop/src/main/services/ml-worker.ts @@ -220,7 +220,7 @@ const cachedCLIPImageSession = makeCachedInferenceSession( export const computeCLIPImageEmbedding = async (input: Float32Array) => { const session = await cachedCLIPImageSession(); const feeds = { - input: new ort.Tensor("float32", input, [1, 3, 224, 224]), + input: new ort.Tensor("float32", input, [1, 3, 256, 256]), }; const t = Date.now(); const results = await session.run(feeds); diff --git a/web/packages/new/photos/services/ml/clip.ts b/web/packages/new/photos/services/ml/clip.ts index 78eff1c04d..c61cb5b535 100644 --- a/web/packages/new/photos/services/ml/clip.ts +++ b/web/packages/new/photos/services/ml/clip.ts @@ -120,8 +120,7 @@ const computeEmbedding = async ( * Convert {@link imageData} into the format that the CLIP model expects. 
*/ const convertToCLIPInput = (imageData: ImageData) => { - const requiredWidth = 224; - const requiredHeight = 224; + const [requiredWidth, requiredHeight] = [256, 256]; const mean = [0.48145466, 0.4578275, 0.40821073] as const; const std = [0.26862954, 0.26130258, 0.27577711] as const; From b503f7599952947b41a76fa366951030a6ddabc1 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Sat, 10 Aug 2024 13:09:23 +0530 Subject: [PATCH 3/7] Don't need the mean/std --- web/packages/new/photos/services/ml/clip.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/web/packages/new/photos/services/ml/clip.ts b/web/packages/new/photos/services/ml/clip.ts index c61cb5b535..1cd14047f4 100644 --- a/web/packages/new/photos/services/ml/clip.ts +++ b/web/packages/new/photos/services/ml/clip.ts @@ -122,8 +122,10 @@ const computeEmbedding = async ( const convertToCLIPInput = (imageData: ImageData) => { const [requiredWidth, requiredHeight] = [256, 256]; - const mean = [0.48145466, 0.4578275, 0.40821073] as const; - const std = [0.26862954, 0.26130258, 0.27577711] as const; + // const mean = [0.48145466, 0.4578275, 0.40821073] as const; + const mean = [0, 0, 0] as const; + // const std = [0.26862954, 0.26130258, 0.27577711] as const; + const std = [1, 1, 1] as const; const { width, height, data: pixelData } = imageData; From 1f28fdada2bd249bbabffcf4f02cf37596ec5385 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Sat, 10 Aug 2024 13:11:02 +0530 Subject: [PATCH 4/7] Bilinear --- web/packages/new/photos/services/ml/clip.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/web/packages/new/photos/services/ml/clip.ts b/web/packages/new/photos/services/ml/clip.ts index 1cd14047f4..4c19e0395b 100644 --- a/web/packages/new/photos/services/ml/clip.ts +++ b/web/packages/new/photos/services/ml/clip.ts @@ -1,7 +1,7 @@ import type { ElectronMLWorker } from "@/base/types/ipc"; import type { ImageBitmapAndData } from "./blob"; import { clipIndexes } from "./db"; -import { pixelRGBBicubic } from "./image"; +import { pixelRGBBilinear } from "./image"; import { dotProduct, norm } from "./math"; import type { CLIPMatches } from "./worker-types"; @@ -145,7 +145,7 @@ const convertToCLIPInput = (imageData: ImageData) => { const cOffsetB = 2 * requiredHeight * requiredWidth; // ChannelOffsetBlue for (let h = 0 + heightOffset; h < scaledHeight - heightOffset; h++) { for (let w = 0 + widthOffset; w < scaledWidth - widthOffset; w++) { - const { r, g, b } = pixelRGBBicubic( + const { r, g, b } = pixelRGBBilinear( w / scale, h / scale, pixelData, From 5bbc2615e432d4190519edb0c3249e4e002d038f Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Sat, 10 Aug 2024 13:39:14 +0530 Subject: [PATCH 5/7] Tune the threshold for MobileCLIP Experimentation. - 0.15 was noisy - 0.23 was too strict --- web/packages/new/photos/services/ml/clip.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/packages/new/photos/services/ml/clip.ts b/web/packages/new/photos/services/ml/clip.ts index 4c19e0395b..e0d1211fbe 100644 --- a/web/packages/new/photos/services/ml/clip.ts +++ b/web/packages/new/photos/services/ml/clip.ts @@ -190,5 +190,5 @@ export const clipMatches = async ( // This code is on the hot path, so these optimizations help. 
[fileID, dotProduct(embedding, textEmbedding)] as const, ); - return new Map(items.filter(([, score]) => score >= 0.23)); + return new Map(items.filter(([, score]) => score >= 0.2)); }; From 72bce123a5fc6b32e208086aad3c7025e2428992 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Sat, 10 Aug 2024 13:42:26 +0530 Subject: [PATCH 6/7] Cleanup --- desktop/src/main/services/ml-worker.ts | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/desktop/src/main/services/ml-worker.ts b/desktop/src/main/services/ml-worker.ts index 40c1c5fb5e..68a8eb8349 100644 --- a/desktop/src/main/services/ml-worker.ts +++ b/desktop/src/main/services/ml-worker.ts @@ -201,12 +201,6 @@ const createInferenceSession = async (modelPath: string) => { }); }; -// TODO-ML: Remove me -// const cachedCLIPImageSessionOAI = makeCachedInferenceSession( -// "clip-image-vit-32-float32.onnx", -// 351468764 /* 335 MB */, -// ); - const cachedCLIPImageSession = makeCachedInferenceSession( "mobileclip_s2_image.onnx", 143061211 /* 143 MB */, @@ -229,24 +223,6 @@ export const computeCLIPImageEmbedding = async (input: Float32Array) => { return ensure(results.output).data as Float32Array; }; -// TODO-ML: Remove me -// const cachedCLIPTextSessionOAIQ = makeCachedInferenceSession( -// "clip-text-vit-32-uint8.onnx", -// 64173509 /* 61 MB */, -// ); - -// TODO-ML: Remove me -// const cachedCLIPTextSessionOAI = makeCachedInferenceSession( -// "clip-text-vit-32-float32-int32.onnx", -// 254069585 /* 254 MB */, -// ); - -// TODO-ML: Remove me -// const cachedCLIPTextSession = makeCachedInferenceSession( -// "mobileclip_s2_text.onnx", -// 253895732 /* 253 MB */, -// ); - const cachedCLIPTextSession = makeCachedInferenceSession( "mobileclip_s2_text_int32.onnx", 253895600 /* 253 MB */, @@ -294,7 +270,7 @@ export const computeCLIPTextEmbeddingIfAvailable = async (text: string) => { const cachedFaceDetectionSession = makeCachedInferenceSession( "yolov5s_face_640_640_dynamic.onnx", - 30762872 /* 29.3 MB */, + 30762872 /* 29 MB */, ); /** From ac8a5b491d0f02d349d21c1f6538d06e0445afb5 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Sat, 10 Aug 2024 13:45:03 +0530 Subject: [PATCH 7/7] Update refs --- desktop/src/main/services/ml-worker.ts | 4 ++-- web/packages/new/photos/services/ml/clip.ts | 18 +++++++----------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/desktop/src/main/services/ml-worker.ts b/desktop/src/main/services/ml-worker.ts index 68a8eb8349..cd9a3e0404 100644 --- a/desktop/src/main/services/ml-worker.ts +++ b/desktop/src/main/services/ml-worker.ts @@ -209,7 +209,7 @@ const cachedCLIPImageSession = makeCachedInferenceSession( /** * Compute CLIP embeddings for an image. * - * The embeddings are computed using ONNX runtime, with CLIP as the model. + * The embeddings are computed using ONNX runtime, with MobileCLIP as the model. */ export const computeCLIPImageEmbedding = async (input: Float32Array) => { const session = await cachedCLIPImageSession(); @@ -237,7 +237,7 @@ const getTokenizer = () => { /** * Compute CLIP embeddings for an text snippet. * - * The embeddings are computed using ONNX runtime, with CLIP as the model. + * The embeddings are computed using ONNX runtime, with MobileCLIP as the model. 
*/ export const computeCLIPTextEmbeddingIfAvailable = async (text: string) => { const sessionOrSkip = await Promise.race([ diff --git a/web/packages/new/photos/services/ml/clip.ts b/web/packages/new/photos/services/ml/clip.ts index e0d1211fbe..b226ef10cb 100644 --- a/web/packages/new/photos/services/ml/clip.ts +++ b/web/packages/new/photos/services/ml/clip.ts @@ -39,8 +39,9 @@ export const clipIndexingVersion = 1; * initial launch of this feature using the GGML runtime. * * Since the initial launch, we've switched over to another runtime, - * [ONNX](https://onnxruntime.ai) and have made other implementation changes, - * but the overall gist remains the same. + * [ONNX](https://onnxruntime.ai), started using Apple's + * [MobileCLIP](https://github.com/apple/ml-mobileclip/) as the model and have + * made other implementation changes, but the overall gist remains the same. * * Note that we don't train the neural network - we only use one of the publicly * available pre-trained neural networks for inference. These neural networks @@ -117,16 +118,11 @@ const computeEmbedding = async ( }; /** - * Convert {@link imageData} into the format that the CLIP model expects. + * Convert {@link imageData} into the format that the MobileCLIP model expects. */ const convertToCLIPInput = (imageData: ImageData) => { const [requiredWidth, requiredHeight] = [256, 256]; - // const mean = [0.48145466, 0.4578275, 0.40821073] as const; - const mean = [0, 0, 0] as const; - // const std = [0.26862954, 0.26130258, 0.27577711] as const; - const std = [1, 1, 1] as const; - const { width, height, data: pixelData } = imageData; // Maintain aspect ratio. @@ -152,9 +148,9 @@ const convertToCLIPInput = (imageData: ImageData) => { width, height, ); - clipInput[pi] = (r / 255.0 - mean[0]) / std[0]; - clipInput[pi + cOffsetG] = (g / 255.0 - mean[1]) / std[1]; - clipInput[pi + cOffsetB] = (b / 255.0 - mean[2]) / std[2]; + clipInput[pi] = r / 255.0; + clipInput[pi + cOffsetG] = g / 255.0; + clipInput[pi + cOffsetB] = b / 255.0; pi++; } }
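
Taken together, patches 2, 3, 4 and 7 move the image preprocessing over to what the MobileCLIP S2 image encoder expects: a 256×256 scale-to-cover with bilinear resampling and plain /255 scaling, with no ImageNet mean/std. Below is a minimal, self-contained sketch of that pipeline; the helper bilinearRGB, the function name toMobileCLIPImageInput, and the cover/crop arithmetic are illustrative stand-ins based on the hunks above, not the repository's pixelRGBBilinear and convertToCLIPInput code.

// A minimal sketch, not the repository code. `bilinearRGB` stands in for the
// `pixelRGBBilinear` helper referenced by the diffs, and the cover/crop
// arithmetic is an assumption inferred from the hunks shown above.

/** Bilinearly sample the RGB value at (fx, fy) from RGBA `data` of size w×h. */
const bilinearRGB = (
    fx: number,
    fy: number,
    data: Uint8ClampedArray,
    w: number,
    h: number,
) => {
    const x0 = Math.max(0, Math.min(Math.floor(fx), w - 1));
    const y0 = Math.max(0, Math.min(Math.floor(fy), h - 1));
    const x1 = Math.min(x0 + 1, w - 1);
    const y1 = Math.min(y0 + 1, h - 1);
    const dx = fx - x0;
    const dy = fy - y0;
    const at = (x: number, y: number, c: number) => data[(y * w + x) * 4 + c];
    const lerp2 = (c: number) =>
        (1 - dy) * ((1 - dx) * at(x0, y0, c) + dx * at(x1, y0, c)) +
        dy * ((1 - dx) * at(x0, y1, c) + dx * at(x1, y1, c));
    return { r: lerp2(0), g: lerp2(1), b: lerp2(2) };
};

/**
 * Convert an ImageData into the [1, 3, 256, 256] CHW float32 input that the
 * MobileCLIP S2 image encoder expects: scale to cover 256×256, center crop,
 * bilinear resampling, and plain /255 scaling (no ImageNet mean/std).
 */
const toMobileCLIPImageInput = (imageData: ImageData): Float32Array => {
    const [requiredWidth, requiredHeight] = [256, 256];
    const { width, height, data: pixelData } = imageData;

    // Scale preserving aspect ratio so both sides cover the target, then
    // center crop the overflow along the longer side.
    const scale = Math.max(requiredWidth / width, requiredHeight / height);
    const widthOffset = (width * scale - requiredWidth) / 2;
    const heightOffset = (height * scale - requiredHeight) / 2;

    // Planar (CHW) layout: all red values, then all green, then all blue.
    const clipInput = new Float32Array(3 * requiredWidth * requiredHeight);
    const cOffsetG = requiredHeight * requiredWidth;
    const cOffsetB = 2 * requiredHeight * requiredWidth;

    let pi = 0;
    for (let y = 0; y < requiredHeight; y++) {
        for (let x = 0; x < requiredWidth; x++) {
            const { r, g, b } = bilinearRGB(
                (x + widthOffset) / scale,
                (y + heightOffset) / scale,
                pixelData,
                width,
                height,
            );
            clipInput[pi] = r / 255.0;
            clipInput[pi + cOffsetG] = g / 255.0;
            clipInput[pi + cOffsetB] = b / 255.0;
            pi++;
        }
    }
    return clipInput;
};

The resulting Float32Array is what gets fed to the ONNX session as the [1, 3, 256, 256] tensor in computeCLIPImageEmbedding; dropping the OpenAI CLIP mean/std in favour of bare /255 scaling is exactly what the "Don't need the mean/std" commit does.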
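
The 0.2 cutoff from patch 5 gates a simple cosine-similarity scan at query time. A rough sketch of that step follows, assuming (as the surrounding module documents) that the stored image embeddings and the text embedding are already normalized; matchingFileIDs and the inlined dotProduct are illustrative names here, not the module's exports.

// Sketch under the stated assumption: both sides are normalized, so the dot
// product is the cosine similarity.
const dotProduct = (a: Float32Array, b: Float32Array) => {
    let sum = 0;
    for (let i = 0; i < a.length; i++) sum += a[i] * b[i];
    return sum;
};

const matchingFileIDs = (
    rows: { fileID: number; embedding: Float32Array }[],
    textEmbedding: Float32Array,
): Map<number, number> =>
    new Map(
        rows
            .map(
                ({ fileID, embedding }) =>
                    [fileID, dotProduct(embedding, textEmbedding)] as const,
            )
            // 0.2 is the tuned MobileCLIP cutoff from patch 5: 0.15 was
            // noisy, 0.23 was too strict.
            .filter(([, score]) => score >= 0.2),
    );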