From d3eb85be8d653022eeffe7af8b36c1f3cdd8f89e Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 13:17:56 +0530 Subject: [PATCH 01/17] Split --- desktop/src/main/ipc.ts | 2 +- desktop/src/main/services/clip.ts | 288 ------------------------------ 2 files changed, 1 insertion(+), 289 deletions(-) delete mode 100644 desktop/src/main/services/clip.ts diff --git a/desktop/src/main/ipc.ts b/desktop/src/main/ipc.ts index 2b328bb986..1d863b8e9a 100644 --- a/desktop/src/main/ipc.ts +++ b/desktop/src/main/ipc.ts @@ -36,7 +36,7 @@ import { updateAndRestart, updateOnNextRestart, } from "./services/app-update"; -import { clipImageEmbedding, clipTextEmbedding } from "./services/clip"; +import { clipImageEmbedding, clipTextEmbedding } from "./services/ml-clip"; import { runFFmpegCmd } from "./services/ffmpeg"; import { getDirFiles } from "./services/fs"; import { diff --git a/desktop/src/main/services/clip.ts b/desktop/src/main/services/clip.ts deleted file mode 100644 index 525e613424..0000000000 --- a/desktop/src/main/services/clip.ts +++ /dev/null @@ -1,288 +0,0 @@ -/** - * @file Compute CLIP embeddings - * - * @see `web/apps/photos/src/services/clip-service.ts` for more details. This - * file implements the Node.js implementation of the actual embedding - * computation. By doing it in the Node.js layer, we can use the binary ONNX - * runtimes which are 10-20x faster than the WASM based web ones. - * - * The embeddings are computed using ONNX runtime. The model itself is not - * shipped with the app but is downloaded on demand. - */ -import { app, net } from "electron/main"; -import { existsSync } from "fs"; -import jpeg from "jpeg-js"; -import fs from "node:fs/promises"; -import path from "node:path"; -import * as ort from "onnxruntime-node"; -import Tokenizer from "../../thirdparty/clip-bpe-ts/mod"; -import { CustomErrors } from "../../types/ipc"; -import { writeStream } from "../fs"; -import log from "../log"; -import { generateTempFilePath } from "../temp"; -import { deleteTempFile } from "./ffmpeg"; - -const textModelName = "clip-text-vit-32-uint8.onnx"; -const textModelByteSize = 64173509; // 61.2 MB - -const imageModelName = "clip-image-vit-32-float32.onnx"; -const imageModelByteSize = 351468764; // 335.2 MB - -/** Return the path where the given {@link modelName} is meant to be saved */ -const modelSavePath = (modelName: string) => - path.join(app.getPath("userData"), "models", modelName); - -const downloadModel = async (saveLocation: string, name: string) => { - // `mkdir -p` the directory where we want to save the model. 
- const saveDir = path.dirname(saveLocation); - await fs.mkdir(saveDir, { recursive: true }); - // Download - log.info(`Downloading CLIP model from ${name}`); - const url = `https://models.ente.io/${name}`; - const res = await net.fetch(url); - if (!res.ok) throw new Error(`Failed to fetch ${url}: HTTP ${res.status}`); - // Save - await writeStream(saveLocation, res.body); - log.info(`Downloaded CLIP model ${name}`); -}; - -let activeImageModelDownload: Promise | undefined; - -const imageModelPathDownloadingIfNeeded = async () => { - try { - const modelPath = modelSavePath(imageModelName); - if (activeImageModelDownload) { - log.info("Waiting for CLIP image model download to finish"); - await activeImageModelDownload; - } else { - if (!existsSync(modelPath)) { - log.info("CLIP image model not found, downloading"); - activeImageModelDownload = downloadModel( - modelPath, - imageModelName, - ); - await activeImageModelDownload; - } else { - const localFileSize = (await fs.stat(modelPath)).size; - if (localFileSize !== imageModelByteSize) { - log.error( - `CLIP image model size ${localFileSize} does not match the expected size, downloading again`, - ); - activeImageModelDownload = downloadModel( - modelPath, - imageModelName, - ); - await activeImageModelDownload; - } - } - } - return modelPath; - } finally { - activeImageModelDownload = undefined; - } -}; - -let textModelDownloadInProgress = false; - -const textModelPathDownloadingIfNeeded = async () => { - if (textModelDownloadInProgress) - throw Error(CustomErrors.MODEL_DOWNLOAD_PENDING); - - const modelPath = modelSavePath(textModelName); - if (!existsSync(modelPath)) { - log.info("CLIP text model not found, downloading"); - textModelDownloadInProgress = true; - downloadModel(modelPath, textModelName) - .catch((e) => { - // log but otherwise ignore - log.error("CLIP text model download failed", e); - }) - .finally(() => { - textModelDownloadInProgress = false; - }); - throw Error(CustomErrors.MODEL_DOWNLOAD_PENDING); - } else { - const localFileSize = (await fs.stat(modelPath)).size; - if (localFileSize !== textModelByteSize) { - log.error( - `CLIP text model size ${localFileSize} does not match the expected size, downloading again`, - ); - textModelDownloadInProgress = true; - downloadModel(modelPath, textModelName) - .catch((e) => { - // log but otherwise ignore - log.error("CLIP text model download failed", e); - }) - .finally(() => { - textModelDownloadInProgress = false; - }); - throw Error(CustomErrors.MODEL_DOWNLOAD_PENDING); - } - } - - return modelPath; -}; - -const createInferenceSession = async (modelPath: string) => { - return await ort.InferenceSession.create(modelPath, { - intraOpNumThreads: 1, - enableCpuMemArena: false, - }); -}; - -let imageSessionPromise: Promise | undefined; - -const onnxImageSession = async () => { - if (!imageSessionPromise) { - imageSessionPromise = (async () => { - const modelPath = await imageModelPathDownloadingIfNeeded(); - return createInferenceSession(modelPath); - })(); - } - return imageSessionPromise; -}; - -let _textSession: any = null; - -const onnxTextSession = async () => { - if (!_textSession) { - const modelPath = await textModelPathDownloadingIfNeeded(); - _textSession = await createInferenceSession(modelPath); - } - return _textSession; -}; - -export const clipImageEmbedding = async (jpegImageData: Uint8Array) => { - const tempFilePath = await generateTempFilePath(""); - const imageStream = new Response(jpegImageData.buffer).body; - await writeStream(tempFilePath, imageStream); - try 
{ - return await clipImageEmbedding_(tempFilePath); - } finally { - await deleteTempFile(tempFilePath); - } -}; - -const clipImageEmbedding_ = async (jpegFilePath: string) => { - const imageSession = await onnxImageSession(); - const t1 = Date.now(); - const rgbData = await getRGBData(jpegFilePath); - const feeds = { - input: new ort.Tensor("float32", rgbData, [1, 3, 224, 224]), - }; - const t2 = Date.now(); - const results = await imageSession.run(feeds); - log.debug( - () => - `CLIP image embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, - ); - const imageEmbedding = results["output"].data; // Float32Array - return normalizeEmbedding(imageEmbedding); -}; - -const getRGBData = async (jpegFilePath: string) => { - const jpegData = await fs.readFile(jpegFilePath); - const rawImageData = jpeg.decode(jpegData, { - useTArray: true, - formatAsRGBA: false, - }); - - const nx: number = rawImageData.width; - const ny: number = rawImageData.height; - const inputImage: Uint8Array = rawImageData.data; - - const nx2: number = 224; - const ny2: number = 224; - const totalSize: number = 3 * nx2 * ny2; - - const result: number[] = Array(totalSize).fill(0); - const scale: number = Math.max(nx, ny) / 224; - - const nx3: number = Math.round(nx / scale); - const ny3: number = Math.round(ny / scale); - - const mean: number[] = [0.48145466, 0.4578275, 0.40821073]; - const std: number[] = [0.26862954, 0.26130258, 0.27577711]; - - for (let y = 0; y < ny3; y++) { - for (let x = 0; x < nx3; x++) { - for (let c = 0; c < 3; c++) { - // Linear interpolation - const sx: number = (x + 0.5) * scale - 0.5; - const sy: number = (y + 0.5) * scale - 0.5; - - const x0: number = Math.max(0, Math.floor(sx)); - const y0: number = Math.max(0, Math.floor(sy)); - - const x1: number = Math.min(x0 + 1, nx - 1); - const y1: number = Math.min(y0 + 1, ny - 1); - - const dx: number = sx - x0; - const dy: number = sy - y0; - - const j00: number = 3 * (y0 * nx + x0) + c; - const j01: number = 3 * (y0 * nx + x1) + c; - const j10: number = 3 * (y1 * nx + x0) + c; - const j11: number = 3 * (y1 * nx + x1) + c; - - const v00: number = inputImage[j00]; - const v01: number = inputImage[j01]; - const v10: number = inputImage[j10]; - const v11: number = inputImage[j11]; - - const v0: number = v00 * (1 - dx) + v01 * dx; - const v1: number = v10 * (1 - dx) + v11 * dx; - - const v: number = v0 * (1 - dy) + v1 * dy; - - const v2: number = Math.min(Math.max(Math.round(v), 0), 255); - - // createTensorWithDataList is dumb compared to reshape and - // hence has to be given with one channel after another - const i: number = y * nx3 + x + (c % 3) * 224 * 224; - - result[i] = (v2 / 255 - mean[c]) / std[c]; - } - } - } - - return result; -}; - -const normalizeEmbedding = (embedding: Float32Array) => { - let normalization = 0; - for (let index = 0; index < embedding.length; index++) { - normalization += embedding[index] * embedding[index]; - } - const sqrtNormalization = Math.sqrt(normalization); - for (let index = 0; index < embedding.length; index++) { - embedding[index] = embedding[index] / sqrtNormalization; - } - return embedding; -}; - -let _tokenizer: Tokenizer = null; -const getTokenizer = () => { - if (!_tokenizer) { - _tokenizer = new Tokenizer(); - } - return _tokenizer; -}; - -export const clipTextEmbedding = async (text: string) => { - const imageSession = await onnxTextSession(); - const t1 = Date.now(); - const tokenizer = getTokenizer(); - const tokenizedText = 
Int32Array.from(tokenizer.encodeForCLIP(text)); - const feeds = { - input: new ort.Tensor("int32", tokenizedText, [1, 77]), - }; - const t2 = Date.now(); - const results = await imageSession.run(feeds); - log.debug( - () => - `CLIP text embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, - ); - const textEmbedding = results["output"].data; - return normalizeEmbedding(textEmbedding); -}; From 2b6047a979bcd46170e1e8e6d23706c7f7f55d45 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 13:40:35 +0530 Subject: [PATCH 02/17] Split --- desktop/src/main/services/ml-clip.ts | 248 +++++++++++++++++++++++++++ desktop/src/main/services/ml-face.ts | 77 +++++++++ desktop/src/main/services/ml.ts | 79 +++++++++ 3 files changed, 404 insertions(+) create mode 100644 desktop/src/main/services/ml-clip.ts create mode 100644 desktop/src/main/services/ml-face.ts create mode 100644 desktop/src/main/services/ml.ts diff --git a/desktop/src/main/services/ml-clip.ts b/desktop/src/main/services/ml-clip.ts new file mode 100644 index 0000000000..3fe6da2eb2 --- /dev/null +++ b/desktop/src/main/services/ml-clip.ts @@ -0,0 +1,248 @@ +/** + * @file Compute CLIP embeddings for images and text. + * + * The embeddings are computed using ONNX runtime, with CLIP as the model. + * + * @see `web/apps/photos/src/services/clip-service.ts` for more details. + */ +import { existsSync } from "fs"; +import jpeg from "jpeg-js"; +import fs from "node:fs/promises"; +import * as ort from "onnxruntime-node"; +import Tokenizer from "../../thirdparty/clip-bpe-ts/mod"; +import { CustomErrors } from "../../types/ipc"; +import { writeStream } from "../fs"; +import log from "../log"; +import { generateTempFilePath } from "../temp"; +import { deleteTempFile } from "./ffmpeg"; +import { + createInferenceSession, + downloadModel, + modelPathDownloadingIfNeeded, + modelSavePath, +} from "./ml"; + +const textModelName = "clip-text-vit-32-uint8.onnx"; +const textModelByteSize = 64173509; // 61.2 MB + +const imageModelName = "clip-image-vit-32-float32.onnx"; +const imageModelByteSize = 351468764; // 335.2 MB + +let activeImageModelDownload: Promise | undefined; + +const imageModelPathDownloadingIfNeeded = async () => { + try { + if (activeImageModelDownload) { + log.info("Waiting for CLIP image model download to finish"); + await activeImageModelDownload; + } else { + activeImageModelDownload = modelPathDownloadingIfNeeded( + imageModelName, + imageModelByteSize, + ); + return await activeImageModelDownload; + } + } finally { + activeImageModelDownload = undefined; + } +}; + +let textModelDownloadInProgress = false; + +/* TODO(MR): use the generic method. 
Then we can remove the exports for the + internal details functions that we use here */ +const textModelPathDownloadingIfNeeded = async () => { + if (textModelDownloadInProgress) + throw Error(CustomErrors.MODEL_DOWNLOAD_PENDING); + + const modelPath = modelSavePath(textModelName); + if (!existsSync(modelPath)) { + log.info("CLIP text model not found, downloading"); + textModelDownloadInProgress = true; + downloadModel(modelPath, textModelName) + .catch((e) => { + // log but otherwise ignore + log.error("CLIP text model download failed", e); + }) + .finally(() => { + textModelDownloadInProgress = false; + }); + throw Error(CustomErrors.MODEL_DOWNLOAD_PENDING); + } else { + const localFileSize = (await fs.stat(modelPath)).size; + if (localFileSize !== textModelByteSize) { + log.error( + `CLIP text model size ${localFileSize} does not match the expected size, downloading again`, + ); + textModelDownloadInProgress = true; + downloadModel(modelPath, textModelName) + .catch((e) => { + // log but otherwise ignore + log.error("CLIP text model download failed", e); + }) + .finally(() => { + textModelDownloadInProgress = false; + }); + throw Error(CustomErrors.MODEL_DOWNLOAD_PENDING); + } + } + + return modelPath; +}; + +let imageSessionPromise: Promise | undefined; + +const onnxImageSession = async () => { + if (!imageSessionPromise) { + imageSessionPromise = (async () => { + const modelPath = await imageModelPathDownloadingIfNeeded(); + return createInferenceSession(modelPath); + })(); + } + return imageSessionPromise; +}; + +let _textSession: any = null; + +const onnxTextSession = async () => { + if (!_textSession) { + const modelPath = await textModelPathDownloadingIfNeeded(); + _textSession = await createInferenceSession(modelPath); + } + return _textSession; +}; + +export const clipImageEmbedding = async (jpegImageData: Uint8Array) => { + const tempFilePath = await generateTempFilePath(""); + const imageStream = new Response(jpegImageData.buffer).body; + await writeStream(tempFilePath, imageStream); + try { + return await clipImageEmbedding_(tempFilePath); + } finally { + await deleteTempFile(tempFilePath); + } +}; + +const clipImageEmbedding_ = async (jpegFilePath: string) => { + const imageSession = await onnxImageSession(); + const t1 = Date.now(); + const rgbData = await getRGBData(jpegFilePath); + const feeds = { + input: new ort.Tensor("float32", rgbData, [1, 3, 224, 224]), + }; + const t2 = Date.now(); + const results = await imageSession.run(feeds); + log.debug( + () => + `CLIP image embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, + ); + const imageEmbedding = results["output"].data; // Float32Array + return normalizeEmbedding(imageEmbedding); +}; + +const getRGBData = async (jpegFilePath: string) => { + const jpegData = await fs.readFile(jpegFilePath); + const rawImageData = jpeg.decode(jpegData, { + useTArray: true, + formatAsRGBA: false, + }); + + const nx: number = rawImageData.width; + const ny: number = rawImageData.height; + const inputImage: Uint8Array = rawImageData.data; + + const nx2: number = 224; + const ny2: number = 224; + const totalSize: number = 3 * nx2 * ny2; + + const result: number[] = Array(totalSize).fill(0); + const scale: number = Math.max(nx, ny) / 224; + + const nx3: number = Math.round(nx / scale); + const ny3: number = Math.round(ny / scale); + + const mean: number[] = [0.48145466, 0.4578275, 0.40821073]; + const std: number[] = [0.26862954, 0.26130258, 0.27577711]; + + for (let y = 0; y < ny3; y++) { + for 
(let x = 0; x < nx3; x++) { + for (let c = 0; c < 3; c++) { + // Linear interpolation + const sx: number = (x + 0.5) * scale - 0.5; + const sy: number = (y + 0.5) * scale - 0.5; + + const x0: number = Math.max(0, Math.floor(sx)); + const y0: number = Math.max(0, Math.floor(sy)); + + const x1: number = Math.min(x0 + 1, nx - 1); + const y1: number = Math.min(y0 + 1, ny - 1); + + const dx: number = sx - x0; + const dy: number = sy - y0; + + const j00: number = 3 * (y0 * nx + x0) + c; + const j01: number = 3 * (y0 * nx + x1) + c; + const j10: number = 3 * (y1 * nx + x0) + c; + const j11: number = 3 * (y1 * nx + x1) + c; + + const v00: number = inputImage[j00]; + const v01: number = inputImage[j01]; + const v10: number = inputImage[j10]; + const v11: number = inputImage[j11]; + + const v0: number = v00 * (1 - dx) + v01 * dx; + const v1: number = v10 * (1 - dx) + v11 * dx; + + const v: number = v0 * (1 - dy) + v1 * dy; + + const v2: number = Math.min(Math.max(Math.round(v), 0), 255); + + // createTensorWithDataList is dumb compared to reshape and + // hence has to be given with one channel after another + const i: number = y * nx3 + x + (c % 3) * 224 * 224; + + result[i] = (v2 / 255 - mean[c]) / std[c]; + } + } + } + + return result; +}; + +const normalizeEmbedding = (embedding: Float32Array) => { + let normalization = 0; + for (let index = 0; index < embedding.length; index++) { + normalization += embedding[index] * embedding[index]; + } + const sqrtNormalization = Math.sqrt(normalization); + for (let index = 0; index < embedding.length; index++) { + embedding[index] = embedding[index] / sqrtNormalization; + } + return embedding; +}; + +let _tokenizer: Tokenizer = null; +const getTokenizer = () => { + if (!_tokenizer) { + _tokenizer = new Tokenizer(); + } + return _tokenizer; +}; + +export const clipTextEmbedding = async (text: string) => { + const imageSession = await onnxTextSession(); + const t1 = Date.now(); + const tokenizer = getTokenizer(); + const tokenizedText = Int32Array.from(tokenizer.encodeForCLIP(text)); + const feeds = { + input: new ort.Tensor("int32", tokenizedText, [1, 77]), + }; + const t2 = Date.now(); + const results = await imageSession.run(feeds); + log.debug( + () => + `CLIP text embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, + ); + const textEmbedding = results["output"].data; + return normalizeEmbedding(textEmbedding); +}; diff --git a/desktop/src/main/services/ml-face.ts b/desktop/src/main/services/ml-face.ts new file mode 100644 index 0000000000..c547885bb0 --- /dev/null +++ b/desktop/src/main/services/ml-face.ts @@ -0,0 +1,77 @@ +/** + * @file Various face recognition related tasks. + * + * - Face detection with the YOLO model. + * - Face embedding with the mobilefacenet model. + * + * The runtime used is ONNX. 
+ */ +import * as ort from "onnxruntime-node"; +import log from "../log"; +import { createInferenceSession, modelPathDownloadingIfNeeded } from "./ml"; + +const faceDetectionModelName = "yolov5s_face_640_640_dynamic.onnx"; +const faceDetectionModelByteSize = 30762872; // 29.3 MB + +const faceEmbeddingModelName = "mobilefacenet_opset15.onnx"; +const faceEmbeddingModelByteSize = 5286998; // 5 MB + +let activeFaceDetectionModelDownload: Promise | undefined; + +const faceDetectionModelPathDownloadingIfNeeded = async () => { + try { + if (activeFaceDetectionModelDownload) { + log.info("Waiting for face detection model download to finish"); + await activeFaceDetectionModelDownload; + } else { + activeFaceDetectionModelDownload = modelPathDownloadingIfNeeded( + faceDetectionModelName, + faceDetectionModelByteSize, + ); + return await activeFaceDetectionModelDownload; + } + } finally { + activeFaceDetectionModelDownload = undefined; + } +}; + +let _faceDetectionSession: Promise | undefined; + +const faceDetectionSession = async () => { + if (!_faceDetectionSession) { + _faceDetectionSession = + faceDetectionModelPathDownloadingIfNeeded().then((modelPath) => + createInferenceSession(modelPath), + ); + } + return _faceDetectionSession; +}; + + +// export const clipImageEmbedding = async (jpegImageData: Uint8Array) => { +// const tempFilePath = await generateTempFilePath(""); +// const imageStream = new Response(jpegImageData.buffer).body; +// await writeStream(tempFilePath, imageStream); +// try { +// return await clipImageEmbedding_(tempFilePath); +// } finally { +// await deleteTempFile(tempFilePath); +// } +// }; + +// const clipImageEmbedding_ = async (jpegFilePath: string) => { +// const imageSession = await onnxImageSession(); +// const t1 = Date.now(); +// const rgbData = await getRGBData(jpegFilePath); +// const feeds = { +// input: new ort.Tensor("float32", rgbData, [1, 3, 224, 224]), +// }; +// const t2 = Date.now(); +// const results = await imageSession.run(feeds); +// log.debug( +// () => +// `CLIP image embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, +// ); +// const imageEmbedding = results["output"].data; // Float32Array +// return normalizeEmbedding(imageEmbedding); +// }; diff --git a/desktop/src/main/services/ml.ts b/desktop/src/main/services/ml.ts new file mode 100644 index 0000000000..10402db217 --- /dev/null +++ b/desktop/src/main/services/ml.ts @@ -0,0 +1,79 @@ +/** + * @file AI/ML related functionality. + * + * @see also `ml-clip.ts`, `ml-face.ts`. + * + * The ML runtime we use for inference is [ONNX](https://onnxruntime.ai). Models + * for various tasks are not shipped with the app but are downloaded on demand. + * + * The primary reason for doing these tasks in the Node.js layer is so that we + * can use the binary ONNX runtime which is 10-20x faster than the WASM based + * web one. + */ +import { app, net } from "electron/main"; +import { existsSync } from "fs"; +import fs from "node:fs/promises"; +import path from "node:path"; +import * as ort from "onnxruntime-node"; +import { writeStream } from "../fs"; +import log from "../log"; + +/** + * Download the model named {@link modelName} if we don't already have it. + * + * Also verify that the size of the model we get matches {@expectedByteSize} (if + * not, redownload it). + * + * @returns the path to the model on the local machine. 
+ */ +export const modelPathDownloadingIfNeeded = async ( + modelName: string, + expectedByteSize: number, +) => { + const modelPath = modelSavePath(modelName); + + if (!existsSync(modelPath)) { + log.info("CLIP image model not found, downloading"); + await downloadModel(modelPath, modelName); + } else { + const size = (await fs.stat(modelPath)).size; + if (size !== expectedByteSize) { + log.error( + `The size ${size} of model ${modelName} does not match the expected size, downloading again`, + ); + await downloadModel(modelPath, modelName); + } + } + + return modelPath; +}; + +/** Return the path where the given {@link modelName} is meant to be saved */ +export const modelSavePath = (modelName: string) => + path.join(app.getPath("userData"), "models", modelName); + +export const downloadModel = async (saveLocation: string, name: string) => { + // `mkdir -p` the directory where we want to save the model. + const saveDir = path.dirname(saveLocation); + await fs.mkdir(saveDir, { recursive: true }); + // Download + log.info(`Downloading ML model from ${name}`); + const url = `https://models.ente.io/${name}`; + const res = await net.fetch(url); + if (!res.ok) throw new Error(`Failed to fetch ${url}: HTTP ${res.status}`); + // Save + await writeStream(saveLocation, res.body); + log.info(`Downloaded CLIP model ${name}`); +}; + +/** + * Crete an ONNX {@link InferenceSession} with some defaults. + */ +export const createInferenceSession = async (modelPath: string) => { + return await ort.InferenceSession.create(modelPath, { + // Restrict the number of threads to 1 + intraOpNumThreads: 1, + // Be more conservative with RAM usage + enableCpuMemArena: false, + }); +}; From 7bf8912dbc3278496f723ac740ca68c8af4a70f5 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 13:45:02 +0530 Subject: [PATCH 03/17] Duplicate for now --- desktop/src/main/services/ml-face.ts | 30 ++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/desktop/src/main/services/ml-face.ts b/desktop/src/main/services/ml-face.ts index c547885bb0..f88f432ee8 100644 --- a/desktop/src/main/services/ml-face.ts +++ b/desktop/src/main/services/ml-face.ts @@ -47,6 +47,36 @@ const faceDetectionSession = async () => { return _faceDetectionSession; }; +let activeFaceEmbeddingModelDownload: Promise | undefined; + +const faceEmbeddingModelPathDownloadingIfNeeded = async () => { + try { + if (activeFaceEmbeddingModelDownload) { + log.info("Waiting for face embedding model download to finish"); + await activeFaceEmbeddingModelDownload; + } else { + activeFaceEmbeddingModelDownload = modelPathDownloadingIfNeeded( + faceEmbeddingModelName, + faceEmbeddingModelByteSize, + ); + return await activeFaceEmbeddingModelDownload; + } + } finally { + activeFaceEmbeddingModelDownload = undefined; + } +}; + +let _faceEmbeddingSession: Promise | undefined; + +const faceEmbeddingSession = async () => { + if (!_faceEmbeddingSession) { + _faceEmbeddingSession = + faceEmbeddingModelPathDownloadingIfNeeded().then((modelPath) => + createInferenceSession(modelPath), + ); + } + return _faceEmbeddingSession; +}; // export const clipImageEmbedding = async (jpegImageData: Uint8Array) => { // const tempFilePath = await generateTempFilePath(""); From 2bb9e77e34e1334712647fc37868ecc4d7cddfdb Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 13:46:34 +0530 Subject: [PATCH 04/17] Remove unused code --- .../yoloFaceDetectionService.ts | 37 ------------------- 1 file changed, 37 deletions(-) diff --git 
a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts index 9fc0f7ad24..71b51f674e 100644 --- a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts +++ b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts @@ -156,43 +156,6 @@ class YoloFaceDetectionService implements FaceDetectionService { }; } - /** - * @deprecated The method should not be used - */ - private imageBitmapToTensorData(imageBitmap) { - // Create an OffscreenCanvas and set its size - const offscreenCanvas = new OffscreenCanvas( - imageBitmap.width, - imageBitmap.height, - ); - const ctx = offscreenCanvas.getContext("2d"); - ctx.drawImage(imageBitmap, 0, 0, imageBitmap.width, imageBitmap.height); - const imageData = ctx.getImageData( - 0, - 0, - imageBitmap.width, - imageBitmap.height, - ); - const pixelData = imageData.data; - const data = new Float32Array( - 1 * 3 * imageBitmap.width * imageBitmap.height, - ); - // Populate the Float32Array with normalized pixel values - for (let i = 0; i < pixelData.length; i += 4) { - // Normalize pixel values to the range [0, 1] - data[i / 4] = pixelData[i] / 255.0; // Red channel - data[i / 4 + imageBitmap.width * imageBitmap.height] = - pixelData[i + 1] / 255.0; // Green channel - data[i / 4 + 2 * imageBitmap.width * imageBitmap.height] = - pixelData[i + 2] / 255.0; // Blue channel - } - - return { - data: data, - shape: [1, 3, imageBitmap.width, imageBitmap.height], - }; - } - // The rowOutput is a Float32Array of shape [25200, 16], where each row represents a bounding box. private getFacesFromYoloOutput( rowOutput: Float32Array, From a88f551b6a6b8bc6f3ca76f1b4af1d188ffdaa0a Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 13:58:52 +0530 Subject: [PATCH 05/17] WIP IPC API --- desktop/src/main/services/ml-face.ts | 24 ++++ .../machineLearning/machineLearningFactory.ts | 3 - .../mobileFaceNetEmbeddingService.ts | 6 - .../yoloFaceDetectionService.ts | 116 ++++++------------ .../photos/src/types/machineLearning/index.ts | 3 +- web/packages/next/types/ipc.ts | 21 +++- 6 files changed, 79 insertions(+), 94 deletions(-) diff --git a/desktop/src/main/services/ml-face.ts b/desktop/src/main/services/ml-face.ts index f88f432ee8..bf8eea1625 100644 --- a/desktop/src/main/services/ml-face.ts +++ b/desktop/src/main/services/ml-face.ts @@ -78,6 +78,30 @@ const faceEmbeddingSession = async () => { return _faceEmbeddingSession; }; +private async initOnnx() { + console.log("start ort"); + this.onnxInferenceSession = await ort.InferenceSession.create( + "/models/yoloface/yolov5s_face_640_640_dynamic.onnx", + ); + const data = new Float32Array(1 * 3 * 640 * 640); + const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]); + // TODO(MR): onnx-yolo + // const feeds: Record = {}; + const feeds: Record = {}; + const name = this.onnxInferenceSession.inputNames[0]; + feeds[name] = inputTensor; + await this.onnxInferenceSession.run(feeds); + console.log("start end"); +} + +private async getOnnxInferenceSession() { + if (!this.onnxInferenceSession) { + await this.initOnnx(); + } + return this.onnxInferenceSession; +} + + // export const clipImageEmbedding = async (jpegImageData: Uint8Array) => { // const tempFilePath = await generateTempFilePath(""); // const imageStream = new Response(jpegImageData.buffer).body; diff --git a/web/apps/photos/src/services/machineLearning/machineLearningFactory.ts 
b/web/apps/photos/src/services/machineLearning/machineLearningFactory.ts index 36e37d9b83..991ae68087 100644 --- a/web/apps/photos/src/services/machineLearning/machineLearningFactory.ts +++ b/web/apps/photos/src/services/machineLearning/machineLearningFactory.ts @@ -203,9 +203,6 @@ export class LocalMLSyncContext implements MLSyncContext { } public async dispose() { - // await this.faceDetectionService.dispose(); - // await this.faceEmbeddingService.dispose(); - this.localFilesMap = undefined; await this.syncQueue.onIdle(); this.syncQueue.removeAllListeners(); diff --git a/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts b/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts index 39953689e6..6b2450a24b 100644 --- a/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts +++ b/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts @@ -96,12 +96,6 @@ class MobileFaceNetEmbeddingService implements FaceEmbeddingService { } return embeddings; } - - public async dispose() { - const inferenceSession = await this.getOnnxInferenceSession(); - inferenceSession?.release(); - this.onnxInferenceSession = undefined; - } } export default new MobileFaceNetEmbeddingService(); diff --git a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts index 71b51f674e..02e5bb02b2 100644 --- a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts +++ b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts @@ -1,4 +1,5 @@ import { MAX_FACE_DISTANCE_PERCENT } from "constants/mlConfig"; +import { euclidean } from "hdbscan"; import { Matrix, applyToPoint, @@ -21,17 +22,7 @@ import { import { newBox } from "utils/machineLearning"; import { Box, Point } from "../../../thirdparty/face-api/classes"; -// TODO(MR): onnx-yolo -// import * as ort from "onnxruntime-web"; -// import { env } from "onnxruntime-web"; -const ort: any = {}; - -// TODO(MR): onnx-yolo -// env.wasm.wasmPaths = "/js/onnx/"; class YoloFaceDetectionService implements FaceDetectionService { - // TODO(MR): onnx-yolo - // private onnxInferenceSession?: ort.InferenceSession; - private onnxInferenceSession?: any; public method: Versioned; public constructor() { @@ -41,27 +32,44 @@ class YoloFaceDetectionService implements FaceDetectionService { }; } - private async initOnnx() { - console.log("start ort"); - this.onnxInferenceSession = await ort.InferenceSession.create( - "/models/yoloface/yolov5s_face_640_640_dynamic.onnx", - ); - const data = new Float32Array(1 * 3 * 640 * 640); + public async detectFaces( + imageBitmap: ImageBitmap, + ): Promise> { + const maxFaceDistance = imageBitmap.width * MAX_FACE_DISTANCE_PERCENT; + const preprocessResult = + this.preprocessImageBitmapToFloat32ChannelsFirst( + imageBitmap, + 640, + 640, + ); + const data = preprocessResult.data; + const resized = preprocessResult.newSize; const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]); // TODO(MR): onnx-yolo // const feeds: Record = {}; const feeds: Record = {}; - const name = this.onnxInferenceSession.inputNames[0]; - feeds[name] = inputTensor; - await this.onnxInferenceSession.run(feeds); - console.log("start end"); - } - - private async getOnnxInferenceSession() { - if (!this.onnxInferenceSession) { - await this.initOnnx(); - } - return this.onnxInferenceSession; + feeds["input"] = inputTensor; + const inferenceSession = await 
this.getOnnxInferenceSession(); + const runout = await inferenceSession.run(feeds); + const outputData = runout.output.data; + const faces = this.getFacesFromYoloOutput( + outputData as Float32Array, + 0.7, + ); + const inBox = newBox(0, 0, resized.width, resized.height); + const toBox = newBox(0, 0, imageBitmap.width, imageBitmap.height); + const transform = computeTransformToBox(inBox, toBox); + const faceDetections: Array = faces?.map((f) => { + const box = transformBox(f.box, transform); + const normLandmarks = f.landmarks; + const landmarks = transformPoints(normLandmarks, transform); + return { + box, + landmarks, + probability: f.probability as number, + } as FaceDetection; + }); + return removeDuplicateDetections(faceDetections, maxFaceDistance); } private preprocessImageBitmapToFloat32ChannelsFirst( @@ -233,64 +241,10 @@ class YoloFaceDetectionService implements FaceDetectionService { probability: faceDetection.probability, }; } - - private async estimateOnnx(imageBitmap: ImageBitmap) { - const maxFaceDistance = imageBitmap.width * MAX_FACE_DISTANCE_PERCENT; - const preprocessResult = - this.preprocessImageBitmapToFloat32ChannelsFirst( - imageBitmap, - 640, - 640, - ); - const data = preprocessResult.data; - const resized = preprocessResult.newSize; - const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]); - // TODO(MR): onnx-yolo - // const feeds: Record = {}; - const feeds: Record = {}; - feeds["input"] = inputTensor; - const inferenceSession = await this.getOnnxInferenceSession(); - const runout = await inferenceSession.run(feeds); - const outputData = runout.output.data; - const faces = this.getFacesFromYoloOutput( - outputData as Float32Array, - 0.7, - ); - const inBox = newBox(0, 0, resized.width, resized.height); - const toBox = newBox(0, 0, imageBitmap.width, imageBitmap.height); - const transform = computeTransformToBox(inBox, toBox); - const faceDetections: Array = faces?.map((f) => { - const box = transformBox(f.box, transform); - const normLandmarks = f.landmarks; - const landmarks = transformPoints(normLandmarks, transform); - return { - box, - landmarks, - probability: f.probability as number, - } as FaceDetection; - }); - return removeDuplicateDetections(faceDetections, maxFaceDistance); - } - - public async detectFaces( - imageBitmap: ImageBitmap, - ): Promise> { - // measure time taken - const facesFromOnnx = await this.estimateOnnx(imageBitmap); - return facesFromOnnx; - } - - public async dispose() { - const inferenceSession = await this.getOnnxInferenceSession(); - inferenceSession?.release(); - this.onnxInferenceSession = undefined; - } } export default new YoloFaceDetectionService(); -import { euclidean } from "hdbscan"; - /** * Removes duplicate face detections from an array of detections. 
* diff --git a/web/apps/photos/src/types/machineLearning/index.ts b/web/apps/photos/src/types/machineLearning/index.ts index 3def20a088..399990696c 100644 --- a/web/apps/photos/src/types/machineLearning/index.ts +++ b/web/apps/photos/src/types/machineLearning/index.ts @@ -261,13 +261,12 @@ export declare type MLIndex = "files" | "people"; export interface FaceDetectionService { method: Versioned; - // init(): Promise; + detectFaces(image: ImageBitmap): Promise>; getRelativeDetection( faceDetection: FaceDetection, imageDimensions: Dimensions, ): FaceDetection; - dispose(): Promise; } export interface FaceCropService { diff --git a/web/packages/next/types/ipc.ts b/web/packages/next/types/ipc.ts index a0bc07d9a8..83d9ee6bdd 100644 --- a/web/packages/next/types/ipc.ts +++ b/web/packages/next/types/ipc.ts @@ -196,7 +196,7 @@ export interface Electron { // - ML /** - * Compute and return a CLIP embedding of the given image. + * Return a CLIP embedding of the given image. * * See: [Note: CLIP based magic search] * @@ -207,7 +207,7 @@ export interface Electron { clipImageEmbedding: (jpegImageData: Uint8Array) => Promise; /** - * Compute and return a CLIP embedding of the given image. + * Return a CLIP embedding of the given image. * * See: [Note: CLIP based magic search] * @@ -217,6 +217,23 @@ export interface Electron { */ clipTextEmbedding: (text: string) => Promise; + /** + * Detect faces in the given image using YOLO. + * + * Both the input and output are opaque binary data whose internal structure + * is model (YOLO) and our implementation specific. That said, specifically + * the {@link inputImage} a particular bitmap encoding of an image. + */ + detectFaces: (inputImage: Uint8Array) => Promise; + + /** + * Return a mobilefacenet embedding for the given face data. + * + * Both the input and output are opaque binary data whose internal structure + * is model (mobilefacenet) and our implementation specific. 
+ */ + faceEmbedding: (input: Float32Array) => Promise; + // - File selection // TODO: Deprecated - use dialogs on the renderer process itself From 41f7b30ca078f2262a1c19ebf453360a55f2fa45 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 14:22:52 +0530 Subject: [PATCH 06/17] Wire together --- desktop/src/main/ipc.ts | 11 ++++++++++- desktop/src/main/services/ml-face.ts | 12 +++++++++++- desktop/src/preload.ts | 8 ++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/desktop/src/main/ipc.ts b/desktop/src/main/ipc.ts index 1d863b8e9a..b6e8848183 100644 --- a/desktop/src/main/ipc.ts +++ b/desktop/src/main/ipc.ts @@ -36,13 +36,14 @@ import { updateAndRestart, updateOnNextRestart, } from "./services/app-update"; -import { clipImageEmbedding, clipTextEmbedding } from "./services/ml-clip"; import { runFFmpegCmd } from "./services/ffmpeg"; import { getDirFiles } from "./services/fs"; import { convertToJPEG, generateImageThumbnail, } from "./services/imageProcessor"; +import { clipImageEmbedding, clipTextEmbedding } from "./services/ml-clip"; +import { detectFaces, faceEmbedding } from "./services/ml-face"; import { clearStores, encryptionKey, @@ -146,6 +147,14 @@ export const attachIPCHandlers = () => { clipTextEmbedding(text), ); + ipcMain.handle("detectFaces", (_, imageData: Uint8Array) => + detectFaces(imageData), + ); + + ipcMain.handle("faceEmbedding", (_, input: Float32Array) => + faceEmbedding(input), + ); + // - File selection ipcMain.handle("selectDirectory", () => selectDirectory()); diff --git a/desktop/src/main/services/ml-face.ts b/desktop/src/main/services/ml-face.ts index bf8eea1625..066f5406bf 100644 --- a/desktop/src/main/services/ml-face.ts +++ b/desktop/src/main/services/ml-face.ts @@ -78,6 +78,16 @@ const faceEmbeddingSession = async () => { return _faceEmbeddingSession; }; +export const detectFaces = async (inputImage: Uint8Array) => { + throw new Error("test"); +}; + +export const faceEmbedding = async (input: Float32Array) => { + throw new Error("test"); +}; + +/* + private async initOnnx() { console.log("start ort"); this.onnxInferenceSession = await ort.InferenceSession.create( @@ -100,7 +110,7 @@ private async getOnnxInferenceSession() { } return this.onnxInferenceSession; } - +*/ // export const clipImageEmbedding = async (jpegImageData: Uint8Array) => { // const tempFilePath = await generateTempFilePath(""); diff --git a/desktop/src/preload.ts b/desktop/src/preload.ts index 07736502bd..bea5c9e18f 100644 --- a/desktop/src/preload.ts +++ b/desktop/src/preload.ts @@ -143,6 +143,12 @@ const clipImageEmbedding = (jpegImageData: Uint8Array): Promise => const clipTextEmbedding = (text: string): Promise => ipcRenderer.invoke("clipTextEmbedding", text); +const detectFaces = (imageData: Uint8Array): Promise => + ipcRenderer.invoke("detectFaces", imageData); + +const faceEmbedding = (input: Float32Array): Promise => + ipcRenderer.invoke("faceEmbedding", input); + // - File selection // TODO: Deprecated - use dialogs on the renderer process itself @@ -322,6 +328,8 @@ contextBridge.exposeInMainWorld("electron", { // - ML clipImageEmbedding, clipTextEmbedding, + detectFaces, + faceEmbedding, // - File selection selectDirectory, From a1d6ef43b4565733010e76b413090cdf61681729 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 14:37:44 +0530 Subject: [PATCH 07/17] Roundtrip --- desktop/src/main/ipc.ts | 4 +- desktop/src/main/services/ml-face.ts | 38 +++++++++++-------- desktop/src/preload.ts | 4 +- .../yoloFaceDetectionService.ts | 10 
+---- web/packages/next/types/ipc.ts | 9 ++--- 5 files changed, 33 insertions(+), 32 deletions(-) diff --git a/desktop/src/main/ipc.ts b/desktop/src/main/ipc.ts index b6e8848183..180e68cdcf 100644 --- a/desktop/src/main/ipc.ts +++ b/desktop/src/main/ipc.ts @@ -147,8 +147,8 @@ export const attachIPCHandlers = () => { clipTextEmbedding(text), ); - ipcMain.handle("detectFaces", (_, imageData: Uint8Array) => - detectFaces(imageData), + ipcMain.handle("detectFaces", (_, input: Float32Array) => + detectFaces(input), ); ipcMain.handle("faceEmbedding", (_, input: Float32Array) => diff --git a/desktop/src/main/services/ml-face.ts b/desktop/src/main/services/ml-face.ts index 066f5406bf..78eb82bd15 100644 --- a/desktop/src/main/services/ml-face.ts +++ b/desktop/src/main/services/ml-face.ts @@ -78,8 +78,29 @@ const faceEmbeddingSession = async () => { return _faceEmbeddingSession; }; -export const detectFaces = async (inputImage: Uint8Array) => { - throw new Error("test"); +export const detectFaces = async (input: Float32Array) => { + // console.log("start ort"); + // this.onnxInferenceSession = await ort.InferenceSession.create( + // "/models/yoloface/yolov5s_face_640_640_dynamic.onnx", + // ); + // const data = new Float32Array(1 * 3 * 640 * 640); + // const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]); + // // TODO(MR): onnx-yolo + // // const feeds: Record = {}; + // const feeds: Record = {}; + // const name = this.onnxInferenceSession.inputNames[0]; + // feeds[name] = inputTensor; + // await this.onnxInferenceSession.run(feeds); + // console.log("start end"); + + const session = await faceDetectionSession(); + const t = Date.now(); + const feeds = { + input: new ort.Tensor("float32", input, [1, 3, 640, 640]), + }; + const results = await session.run(feeds); + log.debug(() => `onnx/yolo inference took ${Date.now() - t} ms`); + return results["output"].data; }; export const faceEmbedding = async (input: Float32Array) => { @@ -89,19 +110,6 @@ export const faceEmbedding = async (input: Float32Array) => { /* private async initOnnx() { - console.log("start ort"); - this.onnxInferenceSession = await ort.InferenceSession.create( - "/models/yoloface/yolov5s_face_640_640_dynamic.onnx", - ); - const data = new Float32Array(1 * 3 * 640 * 640); - const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]); - // TODO(MR): onnx-yolo - // const feeds: Record = {}; - const feeds: Record = {}; - const name = this.onnxInferenceSession.inputNames[0]; - feeds[name] = inputTensor; - await this.onnxInferenceSession.run(feeds); - console.log("start end"); } private async getOnnxInferenceSession() { diff --git a/desktop/src/preload.ts b/desktop/src/preload.ts index bea5c9e18f..2db39e2290 100644 --- a/desktop/src/preload.ts +++ b/desktop/src/preload.ts @@ -143,8 +143,8 @@ const clipImageEmbedding = (jpegImageData: Uint8Array): Promise => const clipTextEmbedding = (text: string): Promise => ipcRenderer.invoke("clipTextEmbedding", text); -const detectFaces = (imageData: Uint8Array): Promise => - ipcRenderer.invoke("detectFaces", imageData); +const detectFaces = (input: Float32Array): Promise => + ipcRenderer.invoke("detectFaces", input); const faceEmbedding = (input: Float32Array): Promise => ipcRenderer.invoke("faceEmbedding", input); diff --git a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts index 02e5bb02b2..9efd31cbb7 100644 --- 
a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts +++ b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts @@ -1,3 +1,4 @@ +import { ensureElectron } from "@/next/electron"; import { MAX_FACE_DISTANCE_PERCENT } from "constants/mlConfig"; import { euclidean } from "hdbscan"; import { @@ -44,14 +45,7 @@ class YoloFaceDetectionService implements FaceDetectionService { ); const data = preprocessResult.data; const resized = preprocessResult.newSize; - const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]); - // TODO(MR): onnx-yolo - // const feeds: Record = {}; - const feeds: Record = {}; - feeds["input"] = inputTensor; - const inferenceSession = await this.getOnnxInferenceSession(); - const runout = await inferenceSession.run(feeds); - const outputData = runout.output.data; + const outputData = await ensureElectron().detectFaces(data); const faces = this.getFacesFromYoloOutput( outputData as Float32Array, 0.7, diff --git a/web/packages/next/types/ipc.ts b/web/packages/next/types/ipc.ts index 83d9ee6bdd..5b0979eaa2 100644 --- a/web/packages/next/types/ipc.ts +++ b/web/packages/next/types/ipc.ts @@ -221,16 +221,15 @@ export interface Electron { * Detect faces in the given image using YOLO. * * Both the input and output are opaque binary data whose internal structure - * is model (YOLO) and our implementation specific. That said, specifically - * the {@link inputImage} a particular bitmap encoding of an image. + * is specific to our implementation and the model (YOLO) we use. */ - detectFaces: (inputImage: Uint8Array) => Promise; + detectFaces: (input: Float32Array) => Promise; /** - * Return a mobilefacenet embedding for the given face data. + * Return a MobileFaceNet embedding for the given face data. * * Both the input and output are opaque binary data whose internal structure - * is model (mobilefacenet) and our implementation specific. + * is specific to our implementation and the model (MobileFaceNet) we use. 
*/ faceEmbedding: (input: Float32Array) => Promise; From 9ff4989d81df88609890ff43eb7d88c45d51c025 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 14:39:59 +0530 Subject: [PATCH 08/17] Cleanup --- desktop/src/main/services/ml-clip.ts | 4 +- desktop/src/main/services/ml-face.ts | 45 +------------------ .../services/machineLearning/faceService.ts | 5 +-- 3 files changed, 6 insertions(+), 48 deletions(-) diff --git a/desktop/src/main/services/ml-clip.ts b/desktop/src/main/services/ml-clip.ts index 3fe6da2eb2..63fa751482 100644 --- a/desktop/src/main/services/ml-clip.ts +++ b/desktop/src/main/services/ml-clip.ts @@ -134,7 +134,7 @@ const clipImageEmbedding_ = async (jpegFilePath: string) => { const results = await imageSession.run(feeds); log.debug( () => - `CLIP image embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, + `onnx/clip image embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, ); const imageEmbedding = results["output"].data; // Float32Array return normalizeEmbedding(imageEmbedding); @@ -241,7 +241,7 @@ export const clipTextEmbedding = async (text: string) => { const results = await imageSession.run(feeds); log.debug( () => - `CLIP text embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, + `onnx/clip text embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, ); const textEmbedding = results["output"].data; return normalizeEmbedding(textEmbedding); diff --git a/desktop/src/main/services/ml-face.ts b/desktop/src/main/services/ml-face.ts index 78eb82bd15..c79ae591f8 100644 --- a/desktop/src/main/services/ml-face.ts +++ b/desktop/src/main/services/ml-face.ts @@ -2,7 +2,7 @@ * @file Various face recognition related tasks. * * - Face detection with the YOLO model. - * - Face embedding with the mobilefacenet model. + * - Face embedding with the MobileFaceNet model. * * The runtime used is ONNX. 
*/ @@ -99,51 +99,10 @@ export const detectFaces = async (input: Float32Array) => { input: new ort.Tensor("float32", input, [1, 3, 640, 640]), }; const results = await session.run(feeds); - log.debug(() => `onnx/yolo inference took ${Date.now() - t} ms`); + log.debug(() => `onnx/yolo face detection took ${Date.now() - t} ms`); return results["output"].data; }; export const faceEmbedding = async (input: Float32Array) => { throw new Error("test"); }; - -/* - -private async initOnnx() { -} - -private async getOnnxInferenceSession() { - if (!this.onnxInferenceSession) { - await this.initOnnx(); - } - return this.onnxInferenceSession; -} -*/ - -// export const clipImageEmbedding = async (jpegImageData: Uint8Array) => { -// const tempFilePath = await generateTempFilePath(""); -// const imageStream = new Response(jpegImageData.buffer).body; -// await writeStream(tempFilePath, imageStream); -// try { -// return await clipImageEmbedding_(tempFilePath); -// } finally { -// await deleteTempFile(tempFilePath); -// } -// }; - -// const clipImageEmbedding_ = async (jpegFilePath: string) => { -// const imageSession = await onnxImageSession(); -// const t1 = Date.now(); -// const rgbData = await getRGBData(jpegFilePath); -// const feeds = { -// input: new ort.Tensor("float32", rgbData, [1, 3, 224, 224]), -// }; -// const t2 = Date.now(); -// const results = await imageSession.run(feeds); -// log.debug( -// () => -// `CLIP image embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, -// ); -// const imageEmbedding = results["output"].data; // Float32Array -// return normalizeEmbedding(imageEmbedding); -// }; diff --git a/web/apps/photos/src/services/machineLearning/faceService.ts b/web/apps/photos/src/services/machineLearning/faceService.ts index 449ae0b964..0f26950f8a 100644 --- a/web/apps/photos/src/services/machineLearning/faceService.ts +++ b/web/apps/photos/src/services/machineLearning/faceService.ts @@ -55,7 +55,7 @@ class FaceService { await syncContext.faceDetectionService.detectFaces(imageBitmap); console.timeEnd(timerId); console.log("faceDetections: ", faceDetections?.length); - // log.info('3 TF Memory stats: ',JSON.stringify(tf.memory())); + // TODO: reenable faces filtering based on width const detectedFaces = faceDetections?.map((detection) => { return { @@ -150,7 +150,7 @@ class FaceService { imageBitmap.close(); log.info("[MLService] alignedFaces: ", newMlFile.faces?.length); - // log.info('4 TF Memory stats: ',JSON.stringify(tf.memory())); + return faceImages; } @@ -187,7 +187,6 @@ class FaceService { newMlFile.faces.forEach((f, i) => (f.embedding = embeddings[i])); log.info("[MLService] facesWithEmbeddings: ", newMlFile.faces.length); - // log.info('5 TF Memory stats: ',JSON.stringify(tf.memory())); } async syncFileFaceMakeRelativeDetections( From 52727f2255624e88dae26c5f7b0675a4aa7911ae Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 15:02:14 +0530 Subject: [PATCH 09/17] Also move the embedding --- desktop/src/main/services/ml-face.ts | 37 ++++++++- web/apps/photos/src/constants/mlConfig.ts | 11 --- .../laplacianBlurDetectionService.ts | 4 +- .../mobileFaceNetEmbeddingService.ts | 76 ++----------------- .../yoloFaceDetectionService.ts | 4 +- .../photos/src/types/machineLearning/index.ts | 3 +- 6 files changed, 49 insertions(+), 86 deletions(-) diff --git a/desktop/src/main/services/ml-face.ts b/desktop/src/main/services/ml-face.ts index c79ae591f8..63b7a9d02f 100644 --- a/desktop/src/main/services/ml-face.ts +++ 
b/desktop/src/main/services/ml-face.ts @@ -104,5 +104,40 @@ export const detectFaces = async (input: Float32Array) => { }; export const faceEmbedding = async (input: Float32Array) => { - throw new Error("test"); + // console.log("start ort mobilefacenet"); + // this.onnxInferenceSession = await ort.InferenceSession.create( + // "/models/mobilefacenet/mobilefacenet_opset15.onnx", + // ); + // const faceBatchSize = 1; + // const data = new Float32Array( + // faceBatchSize * 3 * this.faceSize * this.faceSize, + // ); + // const inputTensor = new ort.Tensor("float32", data, [ + // faceBatchSize, + // this.faceSize, + // this.faceSize, + // 3, + // ]); + // // TODO(MR): onnx-yolo + // // const feeds: Record = {}; + // const feeds: Record = {}; + // const name = this.onnxInferenceSession.inputNames[0]; + // feeds[name] = inputTensor; + // await this.onnxInferenceSession.run(feeds); + // console.log("start end mobilefacenet"); + + // Dimension of each face (alias) + const mobileFaceNetFaceSize = 112; + // Smaller alias + const z = mobileFaceNetFaceSize; + // Size of each face's data in the batch + const n = Math.round(input.length / (z * z * 3)); + const inputTensor = new ort.Tensor("float32", input, [n, z, z, 3]); + + const session = await faceEmbeddingSession(); + const t = Date.now(); + const feeds = { img_inputs: inputTensor }; + const results = await session.run(feeds); + log.debug(() => `onnx/yolo face embedding took ${Date.now() - t} ms`); + return results.embeddings["cpuData"]; // as Float32Array; }; diff --git a/web/apps/photos/src/constants/mlConfig.ts b/web/apps/photos/src/constants/mlConfig.ts index ff3eed264a..929594e1c1 100644 --- a/web/apps/photos/src/constants/mlConfig.ts +++ b/web/apps/photos/src/constants/mlConfig.ts @@ -53,15 +53,4 @@ export const DEFAULT_ML_SEARCH_CONFIG: MLSearchConfig = { enabled: false, }; -export const ML_SYNC_DOWNLOAD_TIMEOUT_MS = 300000; - -export const MAX_FACE_DISTANCE_PERCENT = Math.sqrt(2) / 100; - export const MAX_ML_SYNC_ERROR_COUNT = 1; - -export const TEXT_DETECTION_TIMEOUT_MS = [10000, 30000, 60000, 120000, 240000]; - -export const MOBILEFACENET_FACE_SIZE = 112; -export const MOBILEFACENET_EMBEDDING_SIZE = 192; - -export const BATCHES_BEFORE_SYNCING_INDEX = 5; diff --git a/web/apps/photos/src/services/machineLearning/laplacianBlurDetectionService.ts b/web/apps/photos/src/services/machineLearning/laplacianBlurDetectionService.ts index b5842f70c2..14178a5351 100644 --- a/web/apps/photos/src/services/machineLearning/laplacianBlurDetectionService.ts +++ b/web/apps/photos/src/services/machineLearning/laplacianBlurDetectionService.ts @@ -1,10 +1,10 @@ -import { MOBILEFACENET_FACE_SIZE } from "constants/mlConfig"; import { BlurDetectionMethod, BlurDetectionService, Versioned, } from "types/machineLearning"; import { createGrayscaleIntMatrixFromNormalized2List } from "utils/image"; +import { mobileFaceNetFaceSize } from "./mobileFaceNetEmbeddingService"; class LaplacianBlurDetectionService implements BlurDetectionService { public method: Versioned; @@ -19,7 +19,7 @@ class LaplacianBlurDetectionService implements BlurDetectionService { public detectBlur(alignedFaces: Float32Array): number[] { const numFaces = Math.round( alignedFaces.length / - (MOBILEFACENET_FACE_SIZE * MOBILEFACENET_FACE_SIZE * 3), + (mobileFaceNetFaceSize * mobileFaceNetFaceSize * 3), ); const blurValues: number[] = []; for (let i = 0; i < numFaces; i++) { diff --git a/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts 
b/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts index 6b2450a24b..7daa7d8444 100644 --- a/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts +++ b/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts @@ -1,7 +1,4 @@ -import { - MOBILEFACENET_EMBEDDING_SIZE, - MOBILEFACENET_FACE_SIZE, -} from "constants/mlConfig"; +import { ensureElectron } from "@/next/electron"; import { FaceEmbedding, FaceEmbeddingMethod, @@ -9,17 +6,9 @@ import { Versioned, } from "types/machineLearning"; -// TODO(MR): onnx-yolo -// import * as ort from "onnxruntime-web"; -// import { env } from "onnxruntime-web"; -const ort: any = {}; +export const mobileFaceNetFaceSize = 112; -// TODO(MR): onnx-yolo -// env.wasm.wasmPaths = "/js/onnx/"; class MobileFaceNetEmbeddingService implements FaceEmbeddingService { - // TODO(MR): onnx-yolo - // private onnxInferenceSession?: ort.InferenceSession; - private onnxInferenceSession?: any; public method: Versioned; public faceSize: number; @@ -28,70 +17,21 @@ class MobileFaceNetEmbeddingService implements FaceEmbeddingService { value: "MobileFaceNet", version: 2, }; - this.faceSize = MOBILEFACENET_FACE_SIZE; - // TODO: set timeout - } - - private async initOnnx() { - console.log("start ort mobilefacenet"); - this.onnxInferenceSession = await ort.InferenceSession.create( - "/models/mobilefacenet/mobilefacenet_opset15.onnx", - ); - const faceBatchSize = 1; - const data = new Float32Array( - faceBatchSize * 3 * this.faceSize * this.faceSize, - ); - const inputTensor = new ort.Tensor("float32", data, [ - faceBatchSize, - this.faceSize, - this.faceSize, - 3, - ]); - // TODO(MR): onnx-yolo - // const feeds: Record = {}; - const feeds: Record = {}; - const name = this.onnxInferenceSession.inputNames[0]; - feeds[name] = inputTensor; - await this.onnxInferenceSession.run(feeds); - console.log("start end mobilefacenet"); - } - - private async getOnnxInferenceSession() { - if (!this.onnxInferenceSession) { - await this.initOnnx(); - } - return this.onnxInferenceSession; + this.faceSize = mobileFaceNetFaceSize; } public async getFaceEmbeddings( faceData: Float32Array, ): Promise> { - const inputTensor = new ort.Tensor("float32", faceData, [ - Math.round(faceData.length / (this.faceSize * this.faceSize * 3)), - this.faceSize, - this.faceSize, - 3, - ]); - // TODO(MR): onnx-yolo - // const feeds: Record = {}; - const feeds: Record = {}; - feeds["img_inputs"] = inputTensor; - const inferenceSession = await this.getOnnxInferenceSession(); - // TODO(MR): onnx-yolo - // const runout: ort.InferenceSession.OnnxValueMapType = - const runout: any = await inferenceSession.run(feeds); - // const test = runout.embeddings; - // const test2 = test.cpuData; - const outputData = runout.embeddings["cpuData"] as Float32Array; + const outputData = await ensureElectron().faceEmbedding(faceData); + + const embeddingSize = 192; const embeddings = new Array( - outputData.length / MOBILEFACENET_EMBEDDING_SIZE, + outputData.length / embeddingSize, ); for (let i = 0; i < embeddings.length; i++) { embeddings[i] = new Float32Array( - outputData.slice( - i * MOBILEFACENET_EMBEDDING_SIZE, - (i + 1) * MOBILEFACENET_EMBEDDING_SIZE, - ), + outputData.slice(i * embeddingSize, (i + 1) * embeddingSize), ); } return embeddings; diff --git a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts index 9efd31cbb7..fdbb3f102b 100644 --- 
a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts +++ b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts @@ -1,5 +1,4 @@ import { ensureElectron } from "@/next/electron"; -import { MAX_FACE_DISTANCE_PERCENT } from "constants/mlConfig"; import { euclidean } from "hdbscan"; import { Matrix, @@ -36,7 +35,8 @@ class YoloFaceDetectionService implements FaceDetectionService { public async detectFaces( imageBitmap: ImageBitmap, ): Promise> { - const maxFaceDistance = imageBitmap.width * MAX_FACE_DISTANCE_PERCENT; + const maxFaceDistancePercent = Math.sqrt(2) / 100; + const maxFaceDistance = imageBitmap.width * maxFaceDistancePercent; const preprocessResult = this.preprocessImageBitmapToFloat32ChannelsFirst( imageBitmap, diff --git a/web/apps/photos/src/types/machineLearning/index.ts b/web/apps/photos/src/types/machineLearning/index.ts index 399990696c..436585bbae 100644 --- a/web/apps/photos/src/types/machineLearning/index.ts +++ b/web/apps/photos/src/types/machineLearning/index.ts @@ -287,9 +287,8 @@ export interface FaceAlignmentService { export interface FaceEmbeddingService { method: Versioned; faceSize: number; - // init(): Promise; + getFaceEmbeddings(faceImages: Float32Array): Promise>; - dispose(): Promise; } export interface BlurDetectionService { From f5bf776848653f23d293042ef64e02c3c0e69c0d Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 15:06:03 +0530 Subject: [PATCH 10/17] lint --- desktop/docs/dependencies.md | 10 +++++----- desktop/src/main/services/ml-face.ts | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/desktop/docs/dependencies.md b/desktop/docs/dependencies.md index 9cced1f818..62f70e8e46 100644 --- a/desktop/docs/dependencies.md +++ b/desktop/docs/dependencies.md @@ -1,8 +1,8 @@ # Dependencies -* [Electron](#electron) -* [Dev dependencies](#dev) -* [Functionality](#functionality) +- [Electron](#electron) +- [Dev dependencies](#dev) +- [Functionality](#functionality) ## Electron @@ -114,8 +114,8 @@ available on the host machine, and is not bundled with our app. AI/ML runtime. It powers both natural language searches (using CLIP) and face detection (using YOLO). -[jpeg-js](https://github.com/jpeg-js/jpeg-js#readme) is used for decoding -JPEG data into raw RGB bytes before passing it to ONNX. +[jpeg-js](https://github.com/jpeg-js/jpeg-js#readme) is used for decoding JPEG +data into raw RGB bytes before passing it to ONNX. html-entities is used by the bundled clip-bpe-ts tokenizer for CLIP. diff --git a/desktop/src/main/services/ml-face.ts b/desktop/src/main/services/ml-face.ts index 63b7a9d02f..62865ff236 100644 --- a/desktop/src/main/services/ml-face.ts +++ b/desktop/src/main/services/ml-face.ts @@ -139,5 +139,6 @@ export const faceEmbedding = async (input: Float32Array) => { const feeds = { img_inputs: inputTensor }; const results = await session.run(feeds); log.debug(() => `onnx/yolo face embedding took ${Date.now() - t} ms`); - return results.embeddings["cpuData"]; // as Float32Array; + // TODO: What's with this type? 
+ return (results.embeddings as unknown as any)["cpuData"]; // as Float32Array; }; From 33e3265db6351f2cf92da227c8c8e2f1b2deba06 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 15:17:50 +0530 Subject: [PATCH 11/17] Migration for existing configs --- web/apps/photos/src/types/machineLearning/index.ts | 9 +++------ web/apps/photos/src/utils/storage/mlIDbStorage.ts | 3 +++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/web/apps/photos/src/types/machineLearning/index.ts b/web/apps/photos/src/types/machineLearning/index.ts index 436585bbae..d0c902333c 100644 --- a/web/apps/photos/src/types/machineLearning/index.ts +++ b/web/apps/photos/src/types/machineLearning/index.ts @@ -45,16 +45,13 @@ export declare type Landmark = Point; export declare type ImageType = "Original" | "Preview"; -export declare type FaceDetectionMethod = "FaceApiSSD" | "YoloFace"; +export declare type FaceDetectionMethod = "YoloFace"; export declare type FaceCropMethod = "ArcFace"; -export declare type FaceAlignmentMethod = - | "ArcFace" - | "FaceApiDlib" - | "RotatedFaceApiDlib"; +export declare type FaceAlignmentMethod = "ArcFace"; -export declare type FaceEmbeddingMethod = "MobileFaceNet" | "FaceApiDlib"; +export declare type FaceEmbeddingMethod = "MobileFaceNet"; export declare type BlurDetectionMethod = "Laplacian"; diff --git a/web/apps/photos/src/utils/storage/mlIDbStorage.ts b/web/apps/photos/src/utils/storage/mlIDbStorage.ts index d7e24cbe80..bba71c4ff5 100644 --- a/web/apps/photos/src/utils/storage/mlIDbStorage.ts +++ b/web/apps/photos/src/utils/storage/mlIDbStorage.ts @@ -124,6 +124,9 @@ class MLIDbStorage { .add(DEFAULT_ML_SEARCH_CONFIG, ML_SEARCH_CONFIG_NAME); } if (oldVersion < 4) { + db.deleteObjectStore("configs"); + db.createObjectStore("configs"); + db.deleteObjectStore("things"); } From ff66a2f44caf6fec50eba3b64163bc6b6ee99bba Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 15:27:02 +0530 Subject: [PATCH 12/17] The ML code runs in workers --- .../services/machineLearning/mobileFaceNetEmbeddingService.ts | 4 ++-- .../src/services/machineLearning/yoloFaceDetectionService.ts | 4 ++-- web/apps/photos/src/utils/storage/mlIDbStorage.ts | 1 + web/packages/next/worker/comlink-worker.ts | 3 +++ 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts b/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts index 7daa7d8444..818b8a5d12 100644 --- a/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts +++ b/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts @@ -1,4 +1,4 @@ -import { ensureElectron } from "@/next/electron"; +import { workerBridge } from "@/next/worker/worker-bridge"; import { FaceEmbedding, FaceEmbeddingMethod, @@ -23,7 +23,7 @@ class MobileFaceNetEmbeddingService implements FaceEmbeddingService { public async getFaceEmbeddings( faceData: Float32Array, ): Promise> { - const outputData = await ensureElectron().faceEmbedding(faceData); + const outputData = await workerBridge.faceEmbedding(faceData); const embeddingSize = 192; const embeddings = new Array( diff --git a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts index fdbb3f102b..3e7d282fb1 100644 --- a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts +++ 
b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts @@ -1,4 +1,3 @@ -import { ensureElectron } from "@/next/electron"; import { euclidean } from "hdbscan"; import { Matrix, @@ -21,6 +20,7 @@ import { } from "utils/image"; import { newBox } from "utils/machineLearning"; import { Box, Point } from "../../../thirdparty/face-api/classes"; +import { workerBridge } from "@/next/worker/worker-bridge"; class YoloFaceDetectionService implements FaceDetectionService { public method: Versioned; @@ -45,7 +45,7 @@ class YoloFaceDetectionService implements FaceDetectionService { ); const data = preprocessResult.data; const resized = preprocessResult.newSize; - const outputData = await ensureElectron().detectFaces(data); + const outputData = await workerBridge.detectFaces(data); const faces = this.getFacesFromYoloOutput( outputData as Float32Array, 0.7, diff --git a/web/apps/photos/src/utils/storage/mlIDbStorage.ts b/web/apps/photos/src/utils/storage/mlIDbStorage.ts index bba71c4ff5..8be60afacf 100644 --- a/web/apps/photos/src/utils/storage/mlIDbStorage.ts +++ b/web/apps/photos/src/utils/storage/mlIDbStorage.ts @@ -124,6 +124,7 @@ class MLIDbStorage { .add(DEFAULT_ML_SEARCH_CONFIG, ML_SEARCH_CONFIG_NAME); } if (oldVersion < 4) { + // TODO(MR): This loses the user's settings. db.deleteObjectStore("configs"); db.createObjectStore("configs"); diff --git a/web/packages/next/worker/comlink-worker.ts b/web/packages/next/worker/comlink-worker.ts index 033c79fa8c..ad340c2094 100644 --- a/web/packages/next/worker/comlink-worker.ts +++ b/web/packages/next/worker/comlink-worker.ts @@ -46,6 +46,9 @@ const workerBridge = { logToDisk, convertToJPEG: (inputFileData: Uint8Array, filename: string) => ensureElectron().convertToJPEG(inputFileData, filename), + detectFaces: (input: Float32Array) => ensureElectron().detectFaces(input), + faceEmbedding: (input: Float32Array) => + ensureElectron().faceEmbedding(input), }; export type WorkerBridge = typeof workerBridge; From ef4462553c0595c61a0f54e0f133b366a914c590 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 15:35:25 +0530 Subject: [PATCH 13/17] Fix incorrect typecheck that fails on undefined --- web/apps/photos/src/services/machineLearning/mlWorkManager.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/web/apps/photos/src/services/machineLearning/mlWorkManager.ts b/web/apps/photos/src/services/machineLearning/mlWorkManager.ts index d62d6f829e..c5df14b224 100644 --- a/web/apps/photos/src/services/machineLearning/mlWorkManager.ts +++ b/web/apps/photos/src/services/machineLearning/mlWorkManager.ts @@ -186,8 +186,7 @@ class MLWorkManager { return mlWorker.syncLocalFile(token, userID, enteFile, localFile); }); - // @ts-expect-error "TODO: Fix ML related type errors" - if ("message" in result) { + if (result instanceof Error) { // TODO: redirect/refresh to gallery in case of session_expired // may not be required as uploader should anyways take care of this console.error("Error while syncing local file: ", result); From e58e96091f26db7c9a4c56d4d41f6fb3d30f1042 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 16:15:08 +0530 Subject: [PATCH 14/17] Ignore (expected) errors when trying to cache face crops --- .../services/machineLearning/faceService.ts | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/web/apps/photos/src/services/machineLearning/faceService.ts b/web/apps/photos/src/services/machineLearning/faceService.ts index 0f26950f8a..3116ac23c2 100644 --- 
a/web/apps/photos/src/services/machineLearning/faceService.ts +++ b/web/apps/photos/src/services/machineLearning/faceService.ts @@ -225,11 +225,21 @@ class FaceService { face.detection, syncContext.config.faceCrop, ); - face.crop = await storeFaceCrop( - face.id, - faceCrop, - syncContext.config.faceCrop.blobOptions, - ); + try { + face.crop = await storeFaceCrop( + face.id, + faceCrop, + syncContext.config.faceCrop.blobOptions, + ); + } catch (e) { + // TODO(MR): Temporarily ignoring errors about failing cache puts + // when using a custom scheme in Electron. Needs an alternative + // approach, perhaps OPFS. + console.error( + "Ignoring error when caching face crop, the face crop will not be available", + e, + ); + } const blob = await imageBitmapToBlob(faceCrop.image); faceCrop.image.close(); return blob; From 320db9f8b741973492b56aa4dd9d019e90b0c716 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 16:22:38 +0530 Subject: [PATCH 15/17] Fix the putEmbeddings API calls for now --- web/apps/photos/src/services/embeddingService.ts | 6 +++++- web/packages/next/worker/comlink-worker.ts | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/web/apps/photos/src/services/embeddingService.ts b/web/apps/photos/src/services/embeddingService.ts index b93b01532b..a4309e314c 100644 --- a/web/apps/photos/src/services/embeddingService.ts +++ b/web/apps/photos/src/services/embeddingService.ts @@ -1,4 +1,6 @@ +import { inWorker } from "@/next/env"; import log from "@/next/log"; +import { workerBridge } from "@/next/worker/worker-bridge"; import ComlinkCryptoWorker from "@ente/shared/crypto"; import { CustomError } from "@ente/shared/error"; import HTTPService from "@ente/shared/network/HTTPService"; @@ -262,7 +264,9 @@ export const putEmbedding = async ( putEmbeddingReq: PutEmbeddingRequest, ): Promise => { try { - const token = getToken(); + const token = inWorker() + ? await workerBridge.getAuthToken() + : getToken(); if (!token) { log.info("putEmbedding failed: token not found"); throw Error(CustomError.TOKEN_MISSING); diff --git a/web/packages/next/worker/comlink-worker.ts b/web/packages/next/worker/comlink-worker.ts index ad340c2094..f082ac1145 100644 --- a/web/packages/next/worker/comlink-worker.ts +++ b/web/packages/next/worker/comlink-worker.ts @@ -35,6 +35,19 @@ export class ComlinkWorker InstanceType> { } } +// TODO(MR): Temporary method to forward auth tokens to workers +const getAuthToken = () => { + // LS_KEYS.USER + const userJSONString = localStorage.getItem("user"); + if (!userJSONString) return undefined; + const json: unknown = JSON.parse(userJSONString); + if (!json || typeof json != "object" || !("token" in json)) + return undefined; + const token = json.token; + if (typeof token != "string") return undefined; + return token; +}; + /** * A minimal set of utility functions that we expose to all workers that we * create. 
@@ -44,6 +57,7 @@ export class ComlinkWorker InstanceType> { */ const workerBridge = { logToDisk, + getAuthToken, convertToJPEG: (inputFileData: Uint8Array, filename: string) => ensureElectron().convertToJPEG(inputFileData, filename), detectFaces: (input: Float32Array) => ensureElectron().detectFaces(input), From 33c84f7a08e99135f5e9ea54fb3b9e7ac120d4d9 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 16:27:27 +0530 Subject: [PATCH 16/17] Prevent undefined errors --- web/apps/photos/src/utils/common/job.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/apps/photos/src/utils/common/job.ts b/web/apps/photos/src/utils/common/job.ts index f549966b66..365f879e95 100644 --- a/web/apps/photos/src/utils/common/job.ts +++ b/web/apps/photos/src/utils/common/job.ts @@ -50,7 +50,7 @@ export class SimpleJob { try { const jobResult = await this.runCallback(); - if (jobResult.shouldBackoff) { + if (jobResult && jobResult.shouldBackoff) { this.intervalSec = Math.min( this.config.maxItervalSec, this.intervalSec * this.config.backoffMultiplier, From 61143c9c62b30e7fa06365447aa1b7cac5a680f4 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 16:36:56 +0530 Subject: [PATCH 17/17] Cleanup --- desktop/src/main/services/ml-face.ts | 38 +------------------ .../yoloFaceDetectionService.ts | 2 +- 2 files changed, 2 insertions(+), 38 deletions(-) diff --git a/desktop/src/main/services/ml-face.ts b/desktop/src/main/services/ml-face.ts index 62865ff236..1f007c5fd8 100644 --- a/desktop/src/main/services/ml-face.ts +++ b/desktop/src/main/services/ml-face.ts @@ -79,20 +79,6 @@ const faceEmbeddingSession = async () => { }; export const detectFaces = async (input: Float32Array) => { - // console.log("start ort"); - // this.onnxInferenceSession = await ort.InferenceSession.create( - // "/models/yoloface/yolov5s_face_640_640_dynamic.onnx", - // ); - // const data = new Float32Array(1 * 3 * 640 * 640); - // const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]); - // // TODO(MR): onnx-yolo - // // const feeds: Record = {}; - // const feeds: Record = {}; - // const name = this.onnxInferenceSession.inputNames[0]; - // feeds[name] = inputTensor; - // await this.onnxInferenceSession.run(feeds); - // console.log("start end"); - const session = await faceDetectionSession(); const t = Date.now(); const feeds = { @@ -104,28 +90,6 @@ export const detectFaces = async (input: Float32Array) => { }; export const faceEmbedding = async (input: Float32Array) => { - // console.log("start ort mobilefacenet"); - // this.onnxInferenceSession = await ort.InferenceSession.create( - // "/models/mobilefacenet/mobilefacenet_opset15.onnx", - // ); - // const faceBatchSize = 1; - // const data = new Float32Array( - // faceBatchSize * 3 * this.faceSize * this.faceSize, - // ); - // const inputTensor = new ort.Tensor("float32", data, [ - // faceBatchSize, - // this.faceSize, - // this.faceSize, - // 3, - // ]); - // // TODO(MR): onnx-yolo - // // const feeds: Record = {}; - // const feeds: Record = {}; - // const name = this.onnxInferenceSession.inputNames[0]; - // feeds[name] = inputTensor; - // await this.onnxInferenceSession.run(feeds); - // console.log("start end mobilefacenet"); - // Dimension of each face (alias) const mobileFaceNetFaceSize = 112; // Smaller alias @@ -139,6 +103,6 @@ export const faceEmbedding = async (input: Float32Array) => { const feeds = { img_inputs: inputTensor }; const results = await session.run(feeds); log.debug(() => `onnx/yolo face embedding took 
${Date.now() - t} ms`); - // TODO: What's with this type? + // TODO: What's with this type? It works in practice, but double check. return (results.embeddings as unknown as any)["cpuData"]; // as Float32Array; }; diff --git a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts index 3e7d282fb1..4fa840749d 100644 --- a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts +++ b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts @@ -1,3 +1,4 @@ +import { workerBridge } from "@/next/worker/worker-bridge"; import { euclidean } from "hdbscan"; import { Matrix, @@ -20,7 +21,6 @@ import { } from "utils/image"; import { newBox } from "utils/machineLearning"; import { Box, Point } from "../../../thirdparty/face-api/classes"; -import { workerBridge } from "@/next/worker/worker-bridge"; class YoloFaceDetectionService implements FaceDetectionService { public method: Versioned;
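
Taken together, the patches above settle on a simple contract between the web worker and the Electron main process: the worker hands over n aligned faces as one flat Float32Array of n × 112 × 112 × 3 values, and receives back n × 192 embedding floats. The sketch below illustrates that contract from the caller's side. It is an assumption-laden summary rather than code from the repository: `embedFaces` and the `bridge` parameter are hypothetical stand-ins for the real workerBridge wiring shown in the diffs; only the 112/192 sizes, the face-count calculation, and the per-face slicing come from the patches themselves.

// Illustrative sketch only (not part of any patch): the data contract
// between the web worker and the Electron main process after this series.
//
// Input:  n faces, channels-last, 112 x 112 x 3 floats per face.
// Output: n embeddings, 192 floats per face.

const faceSize = 112; // mobileFaceNetFaceSize in the diffs above
const embeddingSize = 192;

// Hypothetical caller-side helper; `bridge` stands in for the workerBridge
// object that forwards faceEmbedding to ensureElectron().faceEmbedding.
const embedFaces = async (
    bridge: { faceEmbedding: (input: Float32Array) => Promise<Float32Array> },
    faceData: Float32Array,
): Promise<Float32Array[]> => {
    // Number of faces packed into the flat input array.
    const faceCount = Math.round(faceData.length / (faceSize * faceSize * 3));
    // Forwarded over the worker bridge to the Electron main process, which
    // runs the ONNX session (ml-face.ts above).
    const outputData = await bridge.faceEmbedding(faceData);
    // Split the flat output into one 192-dimensional embedding per face.
    const embeddings: Float32Array[] = [];
    for (let i = 0; i < faceCount; i++) {
        embeddings.push(
            new Float32Array(
                outputData.slice(i * embeddingSize, (i + 1) * embeddingSize),
            ),
        );
    }
    return embeddings;
};

On the main-process side (ml-face.ts above), the same flat array is wrapped in an ort.Tensor of shape [n, 112, 112, 3] and fed to the session under the img_inputs input name.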