Tweaks (non-functional)

Manav Rathi
2024-09-26 07:25:25 +05:30
parent c8ab6be9f8
commit cda925fc80
3 changed files with 11 additions and 5 deletions


@@ -203,8 +203,7 @@ const createInferenceSession = async (modelPath: string) => {
 const cachedCLIPImageSession = makeCachedInferenceSession(
     "mobileclip_s2_image_opset18_rgba_sim.onnx",
-    143061211 /* 143 MB */,
-    // TODO: manav: check above number, because I got 143093992 but might be calculating wrong
+    143093992 /* 143 MB */,
 );
 /**
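For context, a minimal sketch of what a helper like makeCachedInferenceSession might do, assuming onnxruntime-node: memoize the session creation, and use the expected byte size to detect a truncated earlier download. The path resolution and fetch steps (modelPathFor, downloadModel) are hypothetical stand-ins, not the repository's actual code.

import * as os from "node:os";
import * as path from "node:path";
import { existsSync, statSync } from "node:fs";
import * as ort from "onnxruntime-node";

// Hypothetical stand-ins for the app's actual model storage and fetch logic.
const modelPathFor = (name: string) => path.join(os.tmpdir(), name);
declare function downloadModel(name: string, dest: string): Promise<void>;

const makeCachedInferenceSession = (name: string, expectedByteSize: number) => {
    let session: Promise<ort.InferenceSession> | undefined;
    return () =>
        (session ??= (async () => {
            const modelPath = modelPathFor(name);
            // Re-fetch if the file is missing, or if its on-disk size doesn't
            // match the expected byte count (an interrupted download).
            if (
                !existsSync(modelPath) ||
                statSync(modelPath).size != expectedByteSize
            )
                await downloadModel(name, modelPath);
            return ort.InferenceSession.create(modelPath);
        })());
};

This would also explain why getting the byte count exactly right matters: a mismatched size would cause the model to be re-downloaded on every check.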


@@ -553,8 +553,9 @@ export interface ElectronMLWorker {
  * See: [Note: Natural language search using CLIP]
  *
  * The input is an opaque float32 array representing the image. The layout
- * and exact encoding of the input is specific to our implementation and the
- * ML model (CLIP) we use.
+ * and exact encoding of the input is specific to the runtime (ONNX) and the
+ * ML model (a MobileCLIP variant) we use. In particular, the image
+ * pre-processing happens within our model itself.
  *
  * @returns A CLIP embedding (an array of 512 floating point values).
  */
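Piecing this docstring together with the call site in the next hunk, the method being documented plausibly has a shape like the following. This is inferred, not quoted from the source: the parameter types come from ImageData.data (a Uint8ClampedArray) and the [height, width, 4] shape array passed below.

export interface ElectronMLWorker {
    /**
     * Compute a CLIP embedding for an RGBA image.
     *
     * @param input Raw RGBA pixel data (e.g. ImageData.data).
     * @param inputShape The dimensions of input, as [height, width, 4].
     *
     * @returns A CLIP embedding (an array of 512 floating point values).
     */
    computeCLIPImageEmbedding: (
        input: Uint8ClampedArray,
        inputShape: number[],
    ) => Promise<Float32Array>;
}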


@@ -112,9 +112,15 @@ const computeEmbedding = async (
     imageData: ImageData,
     electron: ElectronMLWorker,
 ): Promise<Float32Array> => {
+    // In contrast to the face detection model, the image pre-processing
+    // happens within the model itself, using ONNX primitives. This is more
+    // performant and also saves us from having to reinvent (say) the
+    // antialiasing wheel.
     const { height, width, data: pixelData } = imageData;
     const inputShape = [height, width, 4]; // [H, W, C]
-    return normalized(await electron.computeCLIPImageEmbedding(pixelData, inputShape));
+    return normalized(
+        await electron.computeCLIPImageEmbedding(pixelData, inputShape),
+    );
 };
 
 const normalized = (embedding: Float32Array) => {
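The hunk ends just as normalized begins. CLIP embeddings are conventionally scaled to unit length so that cosine similarity reduces to a dot product; a plausible body, assuming plain L2 normalization (the diff does not show the real one):

const normalized = (embedding: Float32Array) => {
    // L2 norm: the square root of the sum of squared components.
    const norm = Math.sqrt(embedding.reduce((a, v) => a + v * v, 0));
    // Divide each component by the norm to obtain a unit vector.
    return embedding.map((v) => v / norm);
};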