From 544e6be3fdfd279c1946e6c6a9f37afa4dc132fd Mon Sep 17 00:00:00 2001
From: Manav Rathi
Date: Tue, 9 Apr 2024 15:38:47 +0530
Subject: [PATCH] Document

---
 web/apps/photos/src/services/clipService.ts | 41 +++++++++++++++++++++
 web/packages/next/types/ipc.ts              | 16 +++++++-
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/web/apps/photos/src/services/clipService.ts b/web/apps/photos/src/services/clipService.ts
index 53e026d4ff..e35f74f071 100644
--- a/web/apps/photos/src/services/clipService.ts
+++ b/web/apps/photos/src/services/clipService.ts
@@ -21,6 +21,47 @@ export interface ClipExtractionStatus {
     indexed: number;
 }
 
+/**
+ * Use a CLIP based neural network for natural language search.
+ *
+ * [Note: CLIP based magic search]
+ *
+ * CLIP (Contrastive Language-Image Pretraining) is a neural network trained on
+ * (image, text) pairs. It can be thought of as two separate (but jointly
+ * trained) encoders - one for images, and one for text - that both map to the
+ * same embedding space.
+ *
+ * We use this for natural language search within the app (aka "magic search"):
+ *
+ * 1. Pre-compute an embedding for each image.
+ *
+ * 2. When the user searches, compute an embedding for the search term.
+ *
+ * 3. Use cosine similarity to find the image (embedding) closest to the
+ *    text (embedding).
+ *
+ * More details are in the blog post that describes the initial launch of this
+ * feature using the GGML runtime:
+ * https://ente.io/blog/image-search-with-clip-ggml/
+ *
+ * Since the initial launch, we've added support for another runtime, ONNX.
+ *
+ * Note that we don't train the neural network - we use one of the publicly
+ * available pre-trained neural networks (which are wholly defined by their
+ * connectivity and weights), and use one of the standard ML runtimes to load
+ * these weights and instantiate a running network that we can use to compute
+ * the embeddings. Theoretically, the same CLIP model can be loaded by different
+ * frameworks / runtimes, but in practice each runtime has its own preferred
+ * format, and there are also quantization tradeoffs. So for each runtime that
+ * we support we download a distinct model (binary encoding of weights).
+ *
+ * Currently supported runtimes are:
+ *
+ * - [GGML](https://github.com/monatis/clip.cpp)
+ * - [ONNX](https://onnx.ai)
+ *
+ * Both these currently have one (and only one) associated model.
+ */
 class ClipServiceImpl {
     private embeddingExtractionInProgress: AbortController | null = null;
     private reRunNeeded = false;
diff --git a/web/packages/next/types/ipc.ts b/web/packages/next/types/ipc.ts
index d13c775f40..43eaee575a 100644
--- a/web/packages/next/types/ipc.ts
+++ b/web/packages/next/types/ipc.ts
@@ -10,6 +10,10 @@ export interface AppUpdateInfo {
     version: string;
 }
 
+export type CLIPModel = "ggml-clip" | "onnx-clip";
+
+export const isCLIPModel = (s: unknown) => s == "ggml-clip" || s == "onnx-clip";
+
 export enum Model {
     GGML_CLIP = "ggml-clip",
     ONNX_CLIP = "onnx-clip",
@@ -147,9 +151,19 @@ export interface Electron {
 
     // - ML
 
+    /**
+     * Compute and return a CLIP embedding of the given image.
+     *
+     * See: [Note: CLIP based magic search]
+     *
+     * @param model The CLIP model and ML runtime combination to use.
+     * @param jpegImageData The raw bytes of the image encoded as a JPEG.
+     *
+     * @returns A CLIP embedding.
+     */
     computeImageEmbedding: (
         model: Model,
-        imageData: Uint8Array,
+        jpegImageData: Uint8Array,
    ) => Promise<Float32Array>;
 
    computeTextEmbedding: (model: Model, text: string) => Promise<Float32Array>;
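
The three-step flow in [Note: CLIP based magic search] bottoms out in a cosine
similarity ranking. A minimal sketch of what step 3 entails, in TypeScript -
this is illustrative only, not code from this patch; the `cosineSimilarity`
and `rankBySimilarity` helpers and the Map keyed by file ID are assumptions:

    /** Cosine similarity between two equal-length embeddings. */
    const cosineSimilarity = (a: Float32Array, b: Float32Array): number => {
        let dot = 0, a2 = 0, b2 = 0;
        for (let i = 0; i < a.length; i++) {
            dot += a[i] * b[i];
            a2 += a[i] * a[i];
            b2 += b[i] * b[i];
        }
        return dot / (Math.sqrt(a2) * Math.sqrt(b2));
    };

    /**
     * Rank pre-computed image embeddings (keyed by file ID) against a text
     * embedding, best match first. Returns [fileID, score] pairs.
     */
    const rankBySimilarity = (
        textEmbedding: Float32Array,
        imageEmbeddings: Map<number, Float32Array>,
    ): [number, number][] =>
        [...imageEmbeddings.entries()]
            .map(([id, e]): [number, number] =>
                [id, cosineSimilarity(textEmbedding, e)])
            .sort((x, y) => y[1] - x[1]);

If the embeddings are L2-normalized up front, the cosine reduces to a plain
dot product, which matters when ranking a query against a large library.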
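
Similarly, the newly documented `computeImageEmbedding` / `computeTextEmbedding`
pair would be driven roughly as below. This is a hedged sketch: the `electron`
bridge object, the import path, and the choice of `Model.ONNX_CLIP` are
assumptions for illustration, not part of this patch.

    import { Model, type Electron } from "@/next/types/ipc";

    // Assumption: the desktop app exposes the IPC surface as a global.
    declare const electron: Electron;

    /** Step 1: pre-compute the embedding for one image (JPEG bytes). */
    const indexImage = async (jpegImageData: Uint8Array) =>
        await electron.computeImageEmbedding(Model.ONNX_CLIP, jpegImageData);

    /** Steps 2 and 3: embed the query, then rank (see sketch above). */
    const search = async (
        searchTerm: string,
        imageEmbeddings: Map<number, Float32Array>,
    ) => {
        const textEmbedding = await electron.computeTextEmbedding(
            Model.ONNX_CLIP,
            searchTerm,
        );
        return rankBySimilarity(textEmbedding, imageEmbeddings);
    };

Both calls should use the same `Model`: as the doc comment notes, each runtime
downloads a distinct model binary, so embeddings produced by different models
are not guaranteed to lie in comparable spaces.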
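
One small observation on the ipc.ts hunk: as declared, `isCLIPModel` returns a
plain boolean, so TypeScript will not narrow its argument to `CLIPModel` at
call sites. A type-predicate signature would enable that narrowing (a possible
variant, not what the patch does):

    export const isCLIPModel = (s: unknown): s is CLIPModel =>
        s == "ggml-clip" || s == "onnx-clip";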