From cda925fc80965322efd170c5b0e063ff1525f5a0 Mon Sep 17 00:00:00 2001
From: Manav Rathi
Date: Thu, 26 Sep 2024 07:25:25 +0530
Subject: [PATCH] Tweaks (non-functional)

---
 desktop/src/main/services/ml-worker.ts      | 3 +--
 web/packages/base/types/ipc.ts              | 5 +++--
 web/packages/new/photos/services/ml/clip.ts | 8 +++++++-
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/desktop/src/main/services/ml-worker.ts b/desktop/src/main/services/ml-worker.ts
index fb4ec1817a..fcbd73576c 100644
--- a/desktop/src/main/services/ml-worker.ts
+++ b/desktop/src/main/services/ml-worker.ts
@@ -203,8 +203,7 @@ const createInferenceSession = async (modelPath: string) => {
 
 const cachedCLIPImageSession = makeCachedInferenceSession(
     "mobileclip_s2_image_opset18_rgba_sim.onnx",
-    143061211 /* 143 MB */,
-    // TODO: manav: check above number, because I got 143093992 but might be calculating wrong
+    143093992 /* 143 MB */,
 );
 
 /**
diff --git a/web/packages/base/types/ipc.ts b/web/packages/base/types/ipc.ts
index d5251f6e6f..303251bb1e 100644
--- a/web/packages/base/types/ipc.ts
+++ b/web/packages/base/types/ipc.ts
@@ -553,8 +553,9 @@ export interface ElectronMLWorker {
      * See: [Note: Natural language search using CLIP]
      *
      * The input is an opaque float32 array representing the image. The layout
-     * and exact encoding of the input is specific to our implementation and the
-     * ML model (CLIP) we use.
+     * and exact encoding of the input is specific to the runtime (ONNX) and the
+     * ML model (a MobileCLIP variant) we use. In particular, the image
+     * pre-processing happens within our model itself.
      *
      * @returns A CLIP embedding (an array of 512 floating point values).
      */
diff --git a/web/packages/new/photos/services/ml/clip.ts b/web/packages/new/photos/services/ml/clip.ts
index c8e40ddf91..59df54af90 100644
--- a/web/packages/new/photos/services/ml/clip.ts
+++ b/web/packages/new/photos/services/ml/clip.ts
@@ -112,9 +112,15 @@ const computeEmbedding = async (
     imageData: ImageData,
     electron: ElectronMLWorker,
 ): Promise<Float32Array> => {
+    // In contrast to the face detection model, the image pre-processing
+    // happens within the model itself, using ONNX primitives. This is more
+    // performant and also saves us from having to reinvent (say) the
+    // antialiasing wheels.
     const { height, width, data: pixelData } = imageData;
     const inputShape = [height, width, 4]; // [H, W, C]
-    return normalized(await electron.computeCLIPImageEmbedding(pixelData, inputShape));
+    return normalized(
+        await electron.computeCLIPImageEmbedding(pixelData, inputShape),
+    );
 };
 
 const normalized = (embedding: Float32Array) => {
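
For context, the diff truncates at the `normalized` helper. A minimal sketch of
what such a helper plausibly does: L2-normalize the embedding so that cosine
similarity between two embeddings reduces to a plain dot product. The body
below is an assumption from the surrounding code, not the repository's actual
implementation.

    // Sketch (assumed, not the repository's code): L2-normalize a CLIP
    // embedding so downstream similarity can be computed as a dot product.
    const normalized = (embedding: Float32Array) => {
        const norm = Math.sqrt(embedding.reduce((a, v) => a + v * v, 0));
        return embedding.map((v) => v / norm);
    };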
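
On the desktop side, `computeCLIPImageEmbedding` presumably wraps the
[H, W, 4] float32 payload in an ONNX tensor and runs it through the cached
session. A rough sketch under that assumption follows; only
`cachedCLIPImageSession` and the input shape come from the diff, while the
accessor call and the use of the first input/output names are illustrative.

    import * as ort from "onnxruntime-node";

    // Sketch, not the actual implementation: run the MobileCLIP image
    // encoder (RGBA input, pre-processing inside the model) on a
    // [H, W, 4] float32 input and return the raw embedding.
    const computeCLIPImageEmbedding = async (
        input: Float32Array,
        inputShape: number[],
    ): Promise<Float32Array> => {
        // Assumed accessor: the value returned by makeCachedInferenceSession
        // is treated here as a () => Promise<ort.InferenceSession>.
        const session = await cachedCLIPImageSession();
        const feeds = {
            [session.inputNames[0]]: new ort.Tensor("float32", input, inputShape),
        };
        const results = await session.run(feeds);
        return results[session.outputNames[0]].data as Float32Array;
    };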