From 56fe538f07b937bde6259c87f82548533b1925b1 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Tue, 9 Jul 2024 14:55:27 +0530 Subject: [PATCH] Tie --- desktop/src/main/ipc.ts | 6 +-- desktop/src/main/services/ml-clip.ts | 42 +++---------------- desktop/src/preload.ts | 4 +- web/apps/photos/src/services/searchService.ts | 2 +- web/packages/next/types/ipc.ts | 11 +++-- 5 files changed, 16 insertions(+), 49 deletions(-) diff --git a/desktop/src/main/ipc.ts b/desktop/src/main/ipc.ts index d6672fa3ab..641ce9963d 100644 --- a/desktop/src/main/ipc.ts +++ b/desktop/src/main/ipc.ts @@ -186,10 +186,8 @@ export const attachIPCHandlers = () => { // - ML - ipcMain.handle( - "computeCLIPImageEmbedding", - (_, jpegImageData: Uint8Array) => - computeCLIPImageEmbedding(jpegImageData), + ipcMain.handle("computeCLIPImageEmbedding", (_, input: Float32Array) => + computeCLIPImageEmbedding(input), ); ipcMain.handle("computeCLIPTextEmbeddingIfAvailable", (_, text: string) => diff --git a/desktop/src/main/services/ml-clip.ts b/desktop/src/main/services/ml-clip.ts index 655f9b2437..c51d952fe8 100644 --- a/desktop/src/main/services/ml-clip.ts +++ b/desktop/src/main/services/ml-clip.ts @@ -1,23 +1,12 @@ -// TODO: These arise from the array indexing in the pre-processing code. Isolate -// once that code settles down to its final place (currently duplicated across -// web and desktop). -/* eslint-disable @typescript-eslint/no-non-null-assertion */ - /** * @file Compute CLIP embeddings for images and text. * * The embeddings are computed using ONNX runtime, with CLIP as the model. - * - * @see `web/apps/photos/src/services/clip-service.ts` for more details. 
*/ import Tokenizer from "clip-bpe-js"; -import jpeg from "jpeg-js"; -import fs from "node:fs/promises"; import * as ort from "onnxruntime-node"; import log from "../log"; -import { writeStream } from "../stream"; import { ensure, wait } from "../utils/common"; -import { deleteTempFile, makeTempFilePath } from "../utils/temp"; import { makeCachedInferenceSession } from "./ml"; const cachedCLIPImageSession = makeCachedInferenceSession( @@ -25,36 +14,18 @@ const cachedCLIPImageSession = makeCachedInferenceSession( 351468764 /* 335.2 MB */, ); -export const computeCLIPImageEmbedding = async (jpegImageData: Uint8Array) => { - const tempFilePath = await makeTempFilePath(); - const imageStream = new Response(jpegImageData.buffer).body; - await writeStream(tempFilePath, ensure(imageStream)); - try { - return await clipImageEmbedding_(tempFilePath); - } finally { - await deleteTempFile(tempFilePath); - } -}; - -const clipImageEmbedding_ = async (jpegFilePath: string) => { +export const computeCLIPImageEmbedding = async (input: Float32Array) => { const session = await cachedCLIPImageSession(); - const t1 = Date.now(); - const rgbData = await getRGBData(jpegFilePath); + const t = Date.now(); const feeds = { - input: new ort.Tensor("float32", rgbData, [1, 3, 224, 224]), + input: new ort.Tensor("float32", input, [1, 3, 224, 224]), }; - const t2 = Date.now(); const results = await session.run(feeds); - log.debug( - () => - `ONNX/CLIP image embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, - ); + log.debug(() => `ONNX/CLIP image embedding took ${Date.now() - t} ms`); /* Need these model specific casts to type the result */ - const imageEmbedding = ensure(results.output).data as Float32Array; - return normalizeEmbedding(imageEmbedding); + return ensure(results.output).data as Float32Array; }; - const cachedCLIPTextSession = makeCachedInferenceSession( "clip-text-vit-32-uint8.onnx", 64173509 /* 61.2 MB */, @@ -95,6 +66,5 @@ export const 
computeCLIPTextEmbeddingIfAvailable = async (text: string) => { () => `ONNX/CLIP text embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, ); - const textEmbedding = ensure(results.output).data as Float32Array; - return normalizeEmbedding(textEmbedding); + return ensure(results.output).data as Float32Array; }; diff --git a/desktop/src/preload.ts b/desktop/src/preload.ts index 29bf9c0946..be11c2d8d4 100644 --- a/desktop/src/preload.ts +++ b/desktop/src/preload.ts @@ -163,8 +163,8 @@ const ffmpegExec = ( // - ML -const computeCLIPImageEmbedding = (jpegImageData: Uint8Array) => - ipcRenderer.invoke("computeCLIPImageEmbedding", jpegImageData); +const computeCLIPImageEmbedding = (input: Float32Array) => + ipcRenderer.invoke("computeCLIPImageEmbedding", input); const computeCLIPTextEmbeddingIfAvailable = (text: string) => ipcRenderer.invoke("computeCLIPTextEmbeddingIfAvailable", text); diff --git a/web/apps/photos/src/services/searchService.ts b/web/apps/photos/src/services/searchService.ts index b2d2c6aad3..84a38e144a 100644 --- a/web/apps/photos/src/services/searchService.ts +++ b/web/apps/photos/src/services/searchService.ts @@ -402,7 +402,7 @@ const searchClip = async ( // }; // getTextEmbeddingIfAvailable = async (text: string) => { -// return ensureElectron().computeCLIPTextEmbeddingIfAvailable(text); +// return normalizeEmbedding(ensureElectron().computeCLIPTextEmbeddingIfAvailable(text)); // }; // export const computeClipMatchScore = async ( diff --git a/web/packages/next/types/ipc.ts b/web/packages/next/types/ipc.ts index be53612981..7a11553835 100644 --- a/web/packages/next/types/ipc.ts +++ b/web/packages/next/types/ipc.ts @@ -339,14 +339,13 @@ export interface Electron { * * See: [Note: Natural language search using CLIP] * - * The input is a opaque binary data whose internal structure is specific to - * our implementation and the ML model (CLIP) we use. + * The input is an opaque float32 array representing the image. 
The layout + * and exact encoding of the input is specific to our implementation and the + * ML model (CLIP) we use. * - * @returns A CLIP embedding. + * @returns A CLIP embedding (an array of 512 floating point values). */ - computeCLIPImageEmbedding: ( - jpegImageData: Uint8Array, - ) => Promise; + computeCLIPImageEmbedding: (input: Float32Array) => Promise; /** * Return a CLIP embedding of the given image if we already have the model