From 544e6be3fdfd279c1946e6c6a9f37afa4dc132fd Mon Sep 17 00:00:00 2001
From: Manav Rathi
Date: Tue, 9 Apr 2024 15:38:47 +0530
Subject: [PATCH] Document

---
 web/apps/photos/src/services/clipService.ts | 41 +++++++++++++++++++++
 web/packages/next/types/ipc.ts              | 16 +++++++-
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/web/apps/photos/src/services/clipService.ts b/web/apps/photos/src/services/clipService.ts
index 53e026d4ff..e35f74f071 100644
--- a/web/apps/photos/src/services/clipService.ts
+++ b/web/apps/photos/src/services/clipService.ts
@@ -21,6 +21,47 @@ export interface ClipExtractionStatus {
     indexed: number;
 }
 
+/**
+ * Use a CLIP based neural network for natural language search.
+ *
+ * [Note: CLIP based magic search]
+ *
+ * CLIP (Contrastive Language-Image Pretraining) is a neural network trained on
+ * (image, text) pairs. It can be thought of as two separate (but jointly
+ * trained) encoders - one for images, and one for text - that both map to the
+ * same embedding space.
+ *
+ * We use this for natural language search within the app (aka "magic search"):
+ *
+ * 1. Pre-compute an embedding for each image.
+ *
+ * 2. When the user searches, compute an embedding for the search term.
+ *
+ * 3. Use cosine similarity to find the image (embedding) closest to the
+ *    text (embedding).
+ *
+ * More details are in the blog post that describes the initial launch of this
+ * feature using the GGML runtime:
+ * https://ente.io/blog/image-search-with-clip-ggml/
+ *
+ * Since the initial launch, we've added support for another runtime, ONNX.
+ *
+ * Note that we don't train the neural network - we use one of the publicly
+ * available pre-trained neural networks (which are wholly defined by their
+ * connectivity and weights), and use one of the standard ML runtimes to load
+ * these weights and instantiate a running network that we can use to compute
+ * the embeddings. Theoretically, the same CLIP model can be loaded by different
+ * frameworks / runtimes, but in practice each runtime has its own preferred
+ * format, and there are also quantization tradeoffs. So for each runtime that
+ * we support we download a distinct model (binary encoding of weights).
+ *
+ * Currently supported runtimes are:
+ *
+ * - [GGML](https://github.com/monatis/clip.cpp)
+ * - [ONNX](https://onnx.ai)
+ *
+ * Both these currently have one (and only one) associated model.
+ */
 class ClipServiceImpl {
     private embeddingExtractionInProgress: AbortController | null = null;
     private reRunNeeded = false;
diff --git a/web/packages/next/types/ipc.ts b/web/packages/next/types/ipc.ts
index d13c775f40..43eaee575a 100644
--- a/web/packages/next/types/ipc.ts
+++ b/web/packages/next/types/ipc.ts
@@ -10,6 +10,10 @@ export interface AppUpdateInfo {
     version: string;
 }
 
+export type CLIPModel = "ggml-clip" | "onnx-clip";
+
+export const isCLIPModel = (s: unknown) => s == "ggml-clip" || s == "onnx-clip";
+
 export enum Model {
     GGML_CLIP = "ggml-clip",
     ONNX_CLIP = "onnx-clip",
@@ -147,9 +151,19 @@ export interface Electron {
 
     // - ML
 
+    /**
+     * Compute and return a CLIP embedding of the given image.
+     *
+     * See: [Note: CLIP based magic search]
+     *
+     * @param model The CLIP model and ML runtime combination to use.
+     * @param jpegImageData The raw bytes of the image encoded as a JPEG.
+     *
+     * @returns A CLIP embedding.
+     */
     computeImageEmbedding: (
         model: Model,
-        imageData: Uint8Array,
+        jpegImageData: Uint8Array,
    ) => Promise<Float32Array>;
 
    computeTextEmbedding: (model: Model, text: string) => Promise<Float32Array>;
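
The three-step flow in [Note: CLIP based magic search] bottoms out in a cosine
similarity ranking. A minimal sketch of what step 3 entails, in TypeScript -
this is illustrative only, not code from this patch; the `cosineSimilarity`
and `rankBySimilarity` helpers and the Map keyed by file ID are assumptions:

    /** Cosine similarity between two equal-length embeddings. */
    const cosineSimilarity = (a: Float32Array, b: Float32Array): number => {
        let dot = 0, a2 = 0, b2 = 0;
        for (let i = 0; i < a.length; i++) {
            dot += a[i] * b[i];
            a2 += a[i] * a[i];
            b2 += b[i] * b[i];
        }
        return dot / (Math.sqrt(a2) * Math.sqrt(b2));
    };

    /**
     * Rank pre-computed image embeddings (keyed by file ID) against a text
     * embedding, best match first. Returns [fileID, score] pairs.
     */
    const rankBySimilarity = (
        textEmbedding: Float32Array,
        imageEmbeddings: Map<number, Float32Array>,
    ): [number, number][] =>
        [...imageEmbeddings.entries()]
            .map(([id, e]): [number, number] =>
                [id, cosineSimilarity(textEmbedding, e)])
            .sort((x, y) => y[1] - x[1]);

If the embeddings are L2-normalized up front, the cosine reduces to a plain
dot product, which matters when ranking a query against a large library.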
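
Similarly, the newly documented `computeImageEmbedding` / `computeTextEmbedding`
pair would be driven roughly as below. This is a hedged sketch: the `electron`
bridge object, the import path, and the choice of `Model.ONNX_CLIP` are
assumptions for illustration, not part of this patch.

    import { Model, type Electron } from "@/next/types/ipc";

    // Assumption: the desktop app exposes the IPC surface as a global.
    declare const electron: Electron;

    /** Step 1: pre-compute the embedding for one image (JPEG bytes). */
    const indexImage = async (jpegImageData: Uint8Array) =>
        await electron.computeImageEmbedding(Model.ONNX_CLIP, jpegImageData);

    /** Steps 2 and 3: embed the query, then rank (see sketch above). */
    const search = async (
        searchTerm: string,
        imageEmbeddings: Map<number, Float32Array>,
    ) => {
        const textEmbedding = await electron.computeTextEmbedding(
            Model.ONNX_CLIP,
            searchTerm,
        );
        return rankBySimilarity(textEmbedding, imageEmbeddings);
    };

Both calls should use the same `Model`: as the doc comment notes, each runtime
downloads a distinct model binary, so embeddings produced by different models
are not guaranteed to lie in comparable spaces.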
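
One small observation on the ipc.ts hunk: as declared, `isCLIPModel` returns a
plain boolean, so TypeScript will not narrow its argument to `CLIPModel` at
call sites. A type-predicate signature would enable that narrowing (a possible
variant, not what the patch does):

    export const isCLIPModel = (s: unknown): s is CLIPModel =>
        s == "ggml-clip" || s == "onnx-clip";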