From 56fe538f07b937bde6259c87f82548533b1925b1 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Tue, 9 Jul 2024 14:55:27 +0530 Subject: [PATCH] Tie --- desktop/src/main/ipc.ts | 6 +-- desktop/src/main/services/ml-clip.ts | 42 +++---------------- desktop/src/preload.ts | 4 +- web/apps/photos/src/services/searchService.ts | 2 +- web/packages/next/types/ipc.ts | 11 +++-- 5 files changed, 16 insertions(+), 49 deletions(-) diff --git a/desktop/src/main/ipc.ts b/desktop/src/main/ipc.ts index d6672fa3ab..641ce9963d 100644 --- a/desktop/src/main/ipc.ts +++ b/desktop/src/main/ipc.ts @@ -186,10 +186,8 @@ export const attachIPCHandlers = () => { // - ML - ipcMain.handle( - "computeCLIPImageEmbedding", - (_, jpegImageData: Uint8Array) => - computeCLIPImageEmbedding(jpegImageData), + ipcMain.handle("computeCLIPImageEmbedding", (_, input: Float32Array) => + computeCLIPImageEmbedding(input), ); ipcMain.handle("computeCLIPTextEmbeddingIfAvailable", (_, text: string) => diff --git a/desktop/src/main/services/ml-clip.ts b/desktop/src/main/services/ml-clip.ts index 655f9b2437..c51d952fe8 100644 --- a/desktop/src/main/services/ml-clip.ts +++ b/desktop/src/main/services/ml-clip.ts @@ -1,23 +1,12 @@ -// TODO: These arise from the array indexing in the pre-processing code. Isolate -// once that code settles down to its final place (currently duplicated across -// web and desktop). -/* eslint-disable @typescript-eslint/no-non-null-assertion */ - /** * @file Compute CLIP embeddings for images and text. * * The embeddings are computed using ONNX runtime, with CLIP as the model. - * - * @see `web/apps/photos/src/services/clip-service.ts` for more details. 
*/ import Tokenizer from "clip-bpe-js"; -import jpeg from "jpeg-js"; -import fs from "node:fs/promises"; import * as ort from "onnxruntime-node"; import log from "../log"; -import { writeStream } from "../stream"; import { ensure, wait } from "../utils/common"; -import { deleteTempFile, makeTempFilePath } from "../utils/temp"; import { makeCachedInferenceSession } from "./ml"; const cachedCLIPImageSession = makeCachedInferenceSession( @@ -25,36 +14,18 @@ const cachedCLIPImageSession = makeCachedInferenceSession( 351468764 /* 335.2 MB */, ); -export const computeCLIPImageEmbedding = async (jpegImageData: Uint8Array) => { - const tempFilePath = await makeTempFilePath(); - const imageStream = new Response(jpegImageData.buffer).body; - await writeStream(tempFilePath, ensure(imageStream)); - try { - return await clipImageEmbedding_(tempFilePath); - } finally { - await deleteTempFile(tempFilePath); - } -}; - -const clipImageEmbedding_ = async (jpegFilePath: string) => { +export const computeCLIPImageEmbedding = async (input: Float32Array) => { const session = await cachedCLIPImageSession(); - const t1 = Date.now(); - const rgbData = await getRGBData(jpegFilePath); + const t = Date.now(); const feeds = { - input: new ort.Tensor("float32", rgbData, [1, 3, 224, 224]), + input: new ort.Tensor("float32", input, [1, 3, 224, 224]), }; - const t2 = Date.now(); const results = await session.run(feeds); - log.debug( - () => - `ONNX/CLIP image embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, - ); + log.debug(() => `ONNX/CLIP image embedding took ${Date.now() - t} ms`); /* Need these model specific casts to type the result */ - const imageEmbedding = ensure(results.output).data as Float32Array; - return normalizeEmbedding(imageEmbedding); + return ensure(results.output).data as Float32Array; }; - const cachedCLIPTextSession = makeCachedInferenceSession( "clip-text-vit-32-uint8.onnx", 64173509 /* 61.2 MB */, @@ -95,6 +66,5 @@ export const 
computeCLIPTextEmbeddingIfAvailable = async (text: string) => { () => `ONNX/CLIP text embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, ); - const textEmbedding = ensure(results.output).data as Float32Array; - return normalizeEmbedding(textEmbedding); + return ensure(results.output).data as Float32Array; }; diff --git a/desktop/src/preload.ts b/desktop/src/preload.ts index 29bf9c0946..be11c2d8d4 100644 --- a/desktop/src/preload.ts +++ b/desktop/src/preload.ts @@ -163,8 +163,8 @@ const ffmpegExec = ( // - ML -const computeCLIPImageEmbedding = (jpegImageData: Uint8Array) => - ipcRenderer.invoke("computeCLIPImageEmbedding", jpegImageData); +const computeCLIPImageEmbedding = (input: Float32Array) => + ipcRenderer.invoke("computeCLIPImageEmbedding", input); const computeCLIPTextEmbeddingIfAvailable = (text: string) => ipcRenderer.invoke("computeCLIPTextEmbeddingIfAvailable", text); diff --git a/web/apps/photos/src/services/searchService.ts b/web/apps/photos/src/services/searchService.ts index b2d2c6aad3..84a38e144a 100644 --- a/web/apps/photos/src/services/searchService.ts +++ b/web/apps/photos/src/services/searchService.ts @@ -402,7 +402,7 @@ const searchClip = async ( // }; // getTextEmbeddingIfAvailable = async (text: string) => { -// return ensureElectron().computeCLIPTextEmbeddingIfAvailable(text); +// return normalizeEmbedding(ensureElectron().computeCLIPTextEmbeddingIfAvailable(text)); // }; // export const computeClipMatchScore = async ( diff --git a/web/packages/next/types/ipc.ts b/web/packages/next/types/ipc.ts index be53612981..7a11553835 100644 --- a/web/packages/next/types/ipc.ts +++ b/web/packages/next/types/ipc.ts @@ -339,14 +339,13 @@ export interface Electron { * * See: [Note: Natural language search using CLIP] * - * The input is a opaque binary data whose internal structure is specific to - * our implementation and the ML model (CLIP) we use. + * The input is an opaque float32 array representing the image. 
The layout + * and exact encoding of the input is specific to our implementation and the + * ML model (CLIP) we use. * - * @returns A CLIP embedding. + * @returns A CLIP embedding (an array of 512 floating point values). */ - computeCLIPImageEmbedding: ( - jpegImageData: Uint8Array, - ) => Promise; + computeCLIPImageEmbedding: (input: Float32Array) => Promise; /** * Return a CLIP embedding of the given image if we already have the model