From cda925fc80965322efd170c5b0e063ff1525f5a0 Mon Sep 17 00:00:00 2001
From: Manav Rathi
Date: Thu, 26 Sep 2024 07:25:25 +0530
Subject: [PATCH] Tweaks (non-functional)

---
 desktop/src/main/services/ml-worker.ts      | 3 +--
 web/packages/base/types/ipc.ts              | 5 +++--
 web/packages/new/photos/services/ml/clip.ts | 8 +++++++-
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/desktop/src/main/services/ml-worker.ts b/desktop/src/main/services/ml-worker.ts
index fb4ec1817a..fcbd73576c 100644
--- a/desktop/src/main/services/ml-worker.ts
+++ b/desktop/src/main/services/ml-worker.ts
@@ -203,8 +203,7 @@ const createInferenceSession = async (modelPath: string) => {
 
 const cachedCLIPImageSession = makeCachedInferenceSession(
     "mobileclip_s2_image_opset18_rgba_sim.onnx",
-    143061211 /* 143 MB */,
-    // TODO: manav: check above number, because I got 143093992 but might be calculating wrong
+    143093992 /* 143 MB */,
 );
 
 /**
diff --git a/web/packages/base/types/ipc.ts b/web/packages/base/types/ipc.ts
index d5251f6e6f..303251bb1e 100644
--- a/web/packages/base/types/ipc.ts
+++ b/web/packages/base/types/ipc.ts
@@ -553,8 +553,9 @@ export interface ElectronMLWorker {
      * See: [Note: Natural language search using CLIP]
      *
      * The input is an opaque float32 array representing the image. The layout
-     * and exact encoding of the input is specific to our implementation and the
-     * ML model (CLIP) we use.
+     * and exact encoding of the input is specific to the runtime (ONNX) and the
+     * ML model (a MobileCLIP variant) we use. In particular, the image
+     * pre-processing happens within our model itself.
      *
      * @returns A CLIP embedding (an array of 512 floating point values).
      */
diff --git a/web/packages/new/photos/services/ml/clip.ts b/web/packages/new/photos/services/ml/clip.ts
index c8e40ddf91..59df54af90 100644
--- a/web/packages/new/photos/services/ml/clip.ts
+++ b/web/packages/new/photos/services/ml/clip.ts
@@ -112,9 +112,15 @@ const computeEmbedding = async (
     imageData: ImageData,
     electron: ElectronMLWorker,
 ): Promise<Float32Array> => {
+    // In contrast to the face detection model, the image pre-processing
+    // happens within the model itself, using ONNX primitives. This is more
+    // performant and also saves us from having to reinvent (say) the
+    // antialiasing wheels.
     const { height, width, data: pixelData } = imageData;
     const inputShape = [height, width, 4]; // [H, W, C]
-    return normalized(await electron.computeCLIPImageEmbedding(pixelData, inputShape));
+    return normalized(
+        await electron.computeCLIPImageEmbedding(pixelData, inputShape),
+    );
 };
 
 const normalized = (embedding: Float32Array) => {
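
For context, the diff truncates at the `normalized` helper. A minimal sketch of
what such a helper plausibly does: L2-normalize the embedding so that cosine
similarity between two embeddings reduces to a plain dot product. The body
below is an assumption from the surrounding code, not the repository's actual
implementation.

    // Sketch (assumed, not the repository's code): L2-normalize a CLIP
    // embedding so downstream similarity can be computed as a dot product.
    const normalized = (embedding: Float32Array) => {
        const norm = Math.sqrt(embedding.reduce((a, v) => a + v * v, 0));
        return embedding.map((v) => v / norm);
    };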
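
On the desktop side, `computeCLIPImageEmbedding` presumably wraps the
[H, W, 4] float32 payload in an ONNX tensor and runs it through the cached
session. A rough sketch under that assumption follows; only
`cachedCLIPImageSession` and the input shape come from the diff, while the
accessor call and the use of the first input/output names are illustrative.

    import * as ort from "onnxruntime-node";

    // Sketch, not the actual implementation: run the MobileCLIP image
    // encoder (RGBA input, pre-processing inside the model) on a
    // [H, W, 4] float32 input and return the raw embedding.
    const computeCLIPImageEmbedding = async (
        input: Float32Array,
        inputShape: number[],
    ): Promise<Float32Array> => {
        // Assumed accessor: the value returned by makeCachedInferenceSession
        // is treated here as a () => Promise<ort.InferenceSession>.
        const session = await cachedCLIPImageSession();
        const feeds = {
            [session.inputNames[0]]: new ort.Tensor("float32", input, inputShape),
        };
        const results = await session.run(feeds);
        return results[session.outputNames[0]].data as Float32Array;
    };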