From d3eb85be8d653022eeffe7af8b36c1f3cdd8f89e Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 13:17:56 +0530 Subject: [PATCH 01/17] Split --- desktop/src/main/ipc.ts | 2 +- desktop/src/main/services/clip.ts | 288 ------------------------------ 2 files changed, 1 insertion(+), 289 deletions(-) delete mode 100644 desktop/src/main/services/clip.ts diff --git a/desktop/src/main/ipc.ts b/desktop/src/main/ipc.ts index 2b328bb986..1d863b8e9a 100644 --- a/desktop/src/main/ipc.ts +++ b/desktop/src/main/ipc.ts @@ -36,7 +36,7 @@ import { updateAndRestart, updateOnNextRestart, } from "./services/app-update"; -import { clipImageEmbedding, clipTextEmbedding } from "./services/clip"; +import { clipImageEmbedding, clipTextEmbedding } from "./services/ml-clip"; import { runFFmpegCmd } from "./services/ffmpeg"; import { getDirFiles } from "./services/fs"; import { diff --git a/desktop/src/main/services/clip.ts b/desktop/src/main/services/clip.ts deleted file mode 100644 index 525e613424..0000000000 --- a/desktop/src/main/services/clip.ts +++ /dev/null @@ -1,288 +0,0 @@ -/** - * @file Compute CLIP embeddings - * - * @see `web/apps/photos/src/services/clip-service.ts` for more details. This - * file implements the Node.js implementation of the actual embedding - * computation. By doing it in the Node.js layer, we can use the binary ONNX - * runtimes which are 10-20x faster than the WASM based web ones. - * - * The embeddings are computed using ONNX runtime. The model itself is not - * shipped with the app but is downloaded on demand. - */ -import { app, net } from "electron/main"; -import { existsSync } from "fs"; -import jpeg from "jpeg-js"; -import fs from "node:fs/promises"; -import path from "node:path"; -import * as ort from "onnxruntime-node"; -import Tokenizer from "../../thirdparty/clip-bpe-ts/mod"; -import { CustomErrors } from "../../types/ipc"; -import { writeStream } from "../fs"; -import log from "../log"; -import { generateTempFilePath } from "../temp"; -import { deleteTempFile } from "./ffmpeg"; - -const textModelName = "clip-text-vit-32-uint8.onnx"; -const textModelByteSize = 64173509; // 61.2 MB - -const imageModelName = "clip-image-vit-32-float32.onnx"; -const imageModelByteSize = 351468764; // 335.2 MB - -/** Return the path where the given {@link modelName} is meant to be saved */ -const modelSavePath = (modelName: string) => - path.join(app.getPath("userData"), "models", modelName); - -const downloadModel = async (saveLocation: string, name: string) => { - // `mkdir -p` the directory where we want to save the model. 
- const saveDir = path.dirname(saveLocation); - await fs.mkdir(saveDir, { recursive: true }); - // Download - log.info(`Downloading CLIP model from ${name}`); - const url = `https://models.ente.io/${name}`; - const res = await net.fetch(url); - if (!res.ok) throw new Error(`Failed to fetch ${url}: HTTP ${res.status}`); - // Save - await writeStream(saveLocation, res.body); - log.info(`Downloaded CLIP model ${name}`); -}; - -let activeImageModelDownload: Promise | undefined; - -const imageModelPathDownloadingIfNeeded = async () => { - try { - const modelPath = modelSavePath(imageModelName); - if (activeImageModelDownload) { - log.info("Waiting for CLIP image model download to finish"); - await activeImageModelDownload; - } else { - if (!existsSync(modelPath)) { - log.info("CLIP image model not found, downloading"); - activeImageModelDownload = downloadModel( - modelPath, - imageModelName, - ); - await activeImageModelDownload; - } else { - const localFileSize = (await fs.stat(modelPath)).size; - if (localFileSize !== imageModelByteSize) { - log.error( - `CLIP image model size ${localFileSize} does not match the expected size, downloading again`, - ); - activeImageModelDownload = downloadModel( - modelPath, - imageModelName, - ); - await activeImageModelDownload; - } - } - } - return modelPath; - } finally { - activeImageModelDownload = undefined; - } -}; - -let textModelDownloadInProgress = false; - -const textModelPathDownloadingIfNeeded = async () => { - if (textModelDownloadInProgress) - throw Error(CustomErrors.MODEL_DOWNLOAD_PENDING); - - const modelPath = modelSavePath(textModelName); - if (!existsSync(modelPath)) { - log.info("CLIP text model not found, downloading"); - textModelDownloadInProgress = true; - downloadModel(modelPath, textModelName) - .catch((e) => { - // log but otherwise ignore - log.error("CLIP text model download failed", e); - }) - .finally(() => { - textModelDownloadInProgress = false; - }); - throw Error(CustomErrors.MODEL_DOWNLOAD_PENDING); - } else { - const localFileSize = (await fs.stat(modelPath)).size; - if (localFileSize !== textModelByteSize) { - log.error( - `CLIP text model size ${localFileSize} does not match the expected size, downloading again`, - ); - textModelDownloadInProgress = true; - downloadModel(modelPath, textModelName) - .catch((e) => { - // log but otherwise ignore - log.error("CLIP text model download failed", e); - }) - .finally(() => { - textModelDownloadInProgress = false; - }); - throw Error(CustomErrors.MODEL_DOWNLOAD_PENDING); - } - } - - return modelPath; -}; - -const createInferenceSession = async (modelPath: string) => { - return await ort.InferenceSession.create(modelPath, { - intraOpNumThreads: 1, - enableCpuMemArena: false, - }); -}; - -let imageSessionPromise: Promise | undefined; - -const onnxImageSession = async () => { - if (!imageSessionPromise) { - imageSessionPromise = (async () => { - const modelPath = await imageModelPathDownloadingIfNeeded(); - return createInferenceSession(modelPath); - })(); - } - return imageSessionPromise; -}; - -let _textSession: any = null; - -const onnxTextSession = async () => { - if (!_textSession) { - const modelPath = await textModelPathDownloadingIfNeeded(); - _textSession = await createInferenceSession(modelPath); - } - return _textSession; -}; - -export const clipImageEmbedding = async (jpegImageData: Uint8Array) => { - const tempFilePath = await generateTempFilePath(""); - const imageStream = new Response(jpegImageData.buffer).body; - await writeStream(tempFilePath, imageStream); - try 
{ - return await clipImageEmbedding_(tempFilePath); - } finally { - await deleteTempFile(tempFilePath); - } -}; - -const clipImageEmbedding_ = async (jpegFilePath: string) => { - const imageSession = await onnxImageSession(); - const t1 = Date.now(); - const rgbData = await getRGBData(jpegFilePath); - const feeds = { - input: new ort.Tensor("float32", rgbData, [1, 3, 224, 224]), - }; - const t2 = Date.now(); - const results = await imageSession.run(feeds); - log.debug( - () => - `CLIP image embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, - ); - const imageEmbedding = results["output"].data; // Float32Array - return normalizeEmbedding(imageEmbedding); -}; - -const getRGBData = async (jpegFilePath: string) => { - const jpegData = await fs.readFile(jpegFilePath); - const rawImageData = jpeg.decode(jpegData, { - useTArray: true, - formatAsRGBA: false, - }); - - const nx: number = rawImageData.width; - const ny: number = rawImageData.height; - const inputImage: Uint8Array = rawImageData.data; - - const nx2: number = 224; - const ny2: number = 224; - const totalSize: number = 3 * nx2 * ny2; - - const result: number[] = Array(totalSize).fill(0); - const scale: number = Math.max(nx, ny) / 224; - - const nx3: number = Math.round(nx / scale); - const ny3: number = Math.round(ny / scale); - - const mean: number[] = [0.48145466, 0.4578275, 0.40821073]; - const std: number[] = [0.26862954, 0.26130258, 0.27577711]; - - for (let y = 0; y < ny3; y++) { - for (let x = 0; x < nx3; x++) { - for (let c = 0; c < 3; c++) { - // Linear interpolation - const sx: number = (x + 0.5) * scale - 0.5; - const sy: number = (y + 0.5) * scale - 0.5; - - const x0: number = Math.max(0, Math.floor(sx)); - const y0: number = Math.max(0, Math.floor(sy)); - - const x1: number = Math.min(x0 + 1, nx - 1); - const y1: number = Math.min(y0 + 1, ny - 1); - - const dx: number = sx - x0; - const dy: number = sy - y0; - - const j00: number = 3 * (y0 * nx + x0) + c; - const j01: number = 3 * (y0 * nx + x1) + c; - const j10: number = 3 * (y1 * nx + x0) + c; - const j11: number = 3 * (y1 * nx + x1) + c; - - const v00: number = inputImage[j00]; - const v01: number = inputImage[j01]; - const v10: number = inputImage[j10]; - const v11: number = inputImage[j11]; - - const v0: number = v00 * (1 - dx) + v01 * dx; - const v1: number = v10 * (1 - dx) + v11 * dx; - - const v: number = v0 * (1 - dy) + v1 * dy; - - const v2: number = Math.min(Math.max(Math.round(v), 0), 255); - - // createTensorWithDataList is dumb compared to reshape and - // hence has to be given with one channel after another - const i: number = y * nx3 + x + (c % 3) * 224 * 224; - - result[i] = (v2 / 255 - mean[c]) / std[c]; - } - } - } - - return result; -}; - -const normalizeEmbedding = (embedding: Float32Array) => { - let normalization = 0; - for (let index = 0; index < embedding.length; index++) { - normalization += embedding[index] * embedding[index]; - } - const sqrtNormalization = Math.sqrt(normalization); - for (let index = 0; index < embedding.length; index++) { - embedding[index] = embedding[index] / sqrtNormalization; - } - return embedding; -}; - -let _tokenizer: Tokenizer = null; -const getTokenizer = () => { - if (!_tokenizer) { - _tokenizer = new Tokenizer(); - } - return _tokenizer; -}; - -export const clipTextEmbedding = async (text: string) => { - const imageSession = await onnxTextSession(); - const t1 = Date.now(); - const tokenizer = getTokenizer(); - const tokenizedText = 
Int32Array.from(tokenizer.encodeForCLIP(text)); - const feeds = { - input: new ort.Tensor("int32", tokenizedText, [1, 77]), - }; - const t2 = Date.now(); - const results = await imageSession.run(feeds); - log.debug( - () => - `CLIP text embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, - ); - const textEmbedding = results["output"].data; - return normalizeEmbedding(textEmbedding); -}; From 2b6047a979bcd46170e1e8e6d23706c7f7f55d45 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 13:40:35 +0530 Subject: [PATCH 02/17] Split --- desktop/src/main/services/ml-clip.ts | 248 +++++++++++++++++++++++++++ desktop/src/main/services/ml-face.ts | 77 +++++++++ desktop/src/main/services/ml.ts | 79 +++++++++ 3 files changed, 404 insertions(+) create mode 100644 desktop/src/main/services/ml-clip.ts create mode 100644 desktop/src/main/services/ml-face.ts create mode 100644 desktop/src/main/services/ml.ts diff --git a/desktop/src/main/services/ml-clip.ts b/desktop/src/main/services/ml-clip.ts new file mode 100644 index 0000000000..3fe6da2eb2 --- /dev/null +++ b/desktop/src/main/services/ml-clip.ts @@ -0,0 +1,248 @@ +/** + * @file Compute CLIP embeddings for images and text. + * + * The embeddings are computed using ONNX runtime, with CLIP as the model. + * + * @see `web/apps/photos/src/services/clip-service.ts` for more details. + */ +import { existsSync } from "fs"; +import jpeg from "jpeg-js"; +import fs from "node:fs/promises"; +import * as ort from "onnxruntime-node"; +import Tokenizer from "../../thirdparty/clip-bpe-ts/mod"; +import { CustomErrors } from "../../types/ipc"; +import { writeStream } from "../fs"; +import log from "../log"; +import { generateTempFilePath } from "../temp"; +import { deleteTempFile } from "./ffmpeg"; +import { + createInferenceSession, + downloadModel, + modelPathDownloadingIfNeeded, + modelSavePath, +} from "./ml"; + +const textModelName = "clip-text-vit-32-uint8.onnx"; +const textModelByteSize = 64173509; // 61.2 MB + +const imageModelName = "clip-image-vit-32-float32.onnx"; +const imageModelByteSize = 351468764; // 335.2 MB + +let activeImageModelDownload: Promise | undefined; + +const imageModelPathDownloadingIfNeeded = async () => { + try { + if (activeImageModelDownload) { + log.info("Waiting for CLIP image model download to finish"); + await activeImageModelDownload; + } else { + activeImageModelDownload = modelPathDownloadingIfNeeded( + imageModelName, + imageModelByteSize, + ); + return await activeImageModelDownload; + } + } finally { + activeImageModelDownload = undefined; + } +}; + +let textModelDownloadInProgress = false; + +/* TODO(MR): use the generic method. 
Then we can remove the exports for the + internal details functions that we use here */ +const textModelPathDownloadingIfNeeded = async () => { + if (textModelDownloadInProgress) + throw Error(CustomErrors.MODEL_DOWNLOAD_PENDING); + + const modelPath = modelSavePath(textModelName); + if (!existsSync(modelPath)) { + log.info("CLIP text model not found, downloading"); + textModelDownloadInProgress = true; + downloadModel(modelPath, textModelName) + .catch((e) => { + // log but otherwise ignore + log.error("CLIP text model download failed", e); + }) + .finally(() => { + textModelDownloadInProgress = false; + }); + throw Error(CustomErrors.MODEL_DOWNLOAD_PENDING); + } else { + const localFileSize = (await fs.stat(modelPath)).size; + if (localFileSize !== textModelByteSize) { + log.error( + `CLIP text model size ${localFileSize} does not match the expected size, downloading again`, + ); + textModelDownloadInProgress = true; + downloadModel(modelPath, textModelName) + .catch((e) => { + // log but otherwise ignore + log.error("CLIP text model download failed", e); + }) + .finally(() => { + textModelDownloadInProgress = false; + }); + throw Error(CustomErrors.MODEL_DOWNLOAD_PENDING); + } + } + + return modelPath; +}; + +let imageSessionPromise: Promise | undefined; + +const onnxImageSession = async () => { + if (!imageSessionPromise) { + imageSessionPromise = (async () => { + const modelPath = await imageModelPathDownloadingIfNeeded(); + return createInferenceSession(modelPath); + })(); + } + return imageSessionPromise; +}; + +let _textSession: any = null; + +const onnxTextSession = async () => { + if (!_textSession) { + const modelPath = await textModelPathDownloadingIfNeeded(); + _textSession = await createInferenceSession(modelPath); + } + return _textSession; +}; + +export const clipImageEmbedding = async (jpegImageData: Uint8Array) => { + const tempFilePath = await generateTempFilePath(""); + const imageStream = new Response(jpegImageData.buffer).body; + await writeStream(tempFilePath, imageStream); + try { + return await clipImageEmbedding_(tempFilePath); + } finally { + await deleteTempFile(tempFilePath); + } +}; + +const clipImageEmbedding_ = async (jpegFilePath: string) => { + const imageSession = await onnxImageSession(); + const t1 = Date.now(); + const rgbData = await getRGBData(jpegFilePath); + const feeds = { + input: new ort.Tensor("float32", rgbData, [1, 3, 224, 224]), + }; + const t2 = Date.now(); + const results = await imageSession.run(feeds); + log.debug( + () => + `CLIP image embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, + ); + const imageEmbedding = results["output"].data; // Float32Array + return normalizeEmbedding(imageEmbedding); +}; + +const getRGBData = async (jpegFilePath: string) => { + const jpegData = await fs.readFile(jpegFilePath); + const rawImageData = jpeg.decode(jpegData, { + useTArray: true, + formatAsRGBA: false, + }); + + const nx: number = rawImageData.width; + const ny: number = rawImageData.height; + const inputImage: Uint8Array = rawImageData.data; + + const nx2: number = 224; + const ny2: number = 224; + const totalSize: number = 3 * nx2 * ny2; + + const result: number[] = Array(totalSize).fill(0); + const scale: number = Math.max(nx, ny) / 224; + + const nx3: number = Math.round(nx / scale); + const ny3: number = Math.round(ny / scale); + + const mean: number[] = [0.48145466, 0.4578275, 0.40821073]; + const std: number[] = [0.26862954, 0.26130258, 0.27577711]; + + for (let y = 0; y < ny3; y++) { + for 
(let x = 0; x < nx3; x++) { + for (let c = 0; c < 3; c++) { + // Linear interpolation + const sx: number = (x + 0.5) * scale - 0.5; + const sy: number = (y + 0.5) * scale - 0.5; + + const x0: number = Math.max(0, Math.floor(sx)); + const y0: number = Math.max(0, Math.floor(sy)); + + const x1: number = Math.min(x0 + 1, nx - 1); + const y1: number = Math.min(y0 + 1, ny - 1); + + const dx: number = sx - x0; + const dy: number = sy - y0; + + const j00: number = 3 * (y0 * nx + x0) + c; + const j01: number = 3 * (y0 * nx + x1) + c; + const j10: number = 3 * (y1 * nx + x0) + c; + const j11: number = 3 * (y1 * nx + x1) + c; + + const v00: number = inputImage[j00]; + const v01: number = inputImage[j01]; + const v10: number = inputImage[j10]; + const v11: number = inputImage[j11]; + + const v0: number = v00 * (1 - dx) + v01 * dx; + const v1: number = v10 * (1 - dx) + v11 * dx; + + const v: number = v0 * (1 - dy) + v1 * dy; + + const v2: number = Math.min(Math.max(Math.round(v), 0), 255); + + // createTensorWithDataList is dumb compared to reshape and + // hence has to be given with one channel after another + const i: number = y * nx3 + x + (c % 3) * 224 * 224; + + result[i] = (v2 / 255 - mean[c]) / std[c]; + } + } + } + + return result; +}; + +const normalizeEmbedding = (embedding: Float32Array) => { + let normalization = 0; + for (let index = 0; index < embedding.length; index++) { + normalization += embedding[index] * embedding[index]; + } + const sqrtNormalization = Math.sqrt(normalization); + for (let index = 0; index < embedding.length; index++) { + embedding[index] = embedding[index] / sqrtNormalization; + } + return embedding; +}; + +let _tokenizer: Tokenizer = null; +const getTokenizer = () => { + if (!_tokenizer) { + _tokenizer = new Tokenizer(); + } + return _tokenizer; +}; + +export const clipTextEmbedding = async (text: string) => { + const imageSession = await onnxTextSession(); + const t1 = Date.now(); + const tokenizer = getTokenizer(); + const tokenizedText = Int32Array.from(tokenizer.encodeForCLIP(text)); + const feeds = { + input: new ort.Tensor("int32", tokenizedText, [1, 77]), + }; + const t2 = Date.now(); + const results = await imageSession.run(feeds); + log.debug( + () => + `CLIP text embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, + ); + const textEmbedding = results["output"].data; + return normalizeEmbedding(textEmbedding); +}; diff --git a/desktop/src/main/services/ml-face.ts b/desktop/src/main/services/ml-face.ts new file mode 100644 index 0000000000..c547885bb0 --- /dev/null +++ b/desktop/src/main/services/ml-face.ts @@ -0,0 +1,77 @@ +/** + * @file Various face recognition related tasks. + * + * - Face detection with the YOLO model. + * - Face embedding with the mobilefacenet model. + * + * The runtime used is ONNX. 
+ */ +import * as ort from "onnxruntime-node"; +import log from "../log"; +import { createInferenceSession, modelPathDownloadingIfNeeded } from "./ml"; + +const faceDetectionModelName = "yolov5s_face_640_640_dynamic.onnx"; +const faceDetectionModelByteSize = 30762872; // 29.3 MB + +const faceEmbeddingModelName = "mobilefacenet_opset15.onnx"; +const faceEmbeddingModelByteSize = 5286998; // 5 MB + +let activeFaceDetectionModelDownload: Promise | undefined; + +const faceDetectionModelPathDownloadingIfNeeded = async () => { + try { + if (activeFaceDetectionModelDownload) { + log.info("Waiting for face detection model download to finish"); + await activeFaceDetectionModelDownload; + } else { + activeFaceDetectionModelDownload = modelPathDownloadingIfNeeded( + faceDetectionModelName, + faceDetectionModelByteSize, + ); + return await activeFaceDetectionModelDownload; + } + } finally { + activeFaceDetectionModelDownload = undefined; + } +}; + +let _faceDetectionSession: Promise | undefined; + +const faceDetectionSession = async () => { + if (!_faceDetectionSession) { + _faceDetectionSession = + faceDetectionModelPathDownloadingIfNeeded().then((modelPath) => + createInferenceSession(modelPath), + ); + } + return _faceDetectionSession; +}; + + +// export const clipImageEmbedding = async (jpegImageData: Uint8Array) => { +// const tempFilePath = await generateTempFilePath(""); +// const imageStream = new Response(jpegImageData.buffer).body; +// await writeStream(tempFilePath, imageStream); +// try { +// return await clipImageEmbedding_(tempFilePath); +// } finally { +// await deleteTempFile(tempFilePath); +// } +// }; + +// const clipImageEmbedding_ = async (jpegFilePath: string) => { +// const imageSession = await onnxImageSession(); +// const t1 = Date.now(); +// const rgbData = await getRGBData(jpegFilePath); +// const feeds = { +// input: new ort.Tensor("float32", rgbData, [1, 3, 224, 224]), +// }; +// const t2 = Date.now(); +// const results = await imageSession.run(feeds); +// log.debug( +// () => +// `CLIP image embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, +// ); +// const imageEmbedding = results["output"].data; // Float32Array +// return normalizeEmbedding(imageEmbedding); +// }; diff --git a/desktop/src/main/services/ml.ts b/desktop/src/main/services/ml.ts new file mode 100644 index 0000000000..10402db217 --- /dev/null +++ b/desktop/src/main/services/ml.ts @@ -0,0 +1,79 @@ +/** + * @file AI/ML related functionality. + * + * @see also `ml-clip.ts`, `ml-face.ts`. + * + * The ML runtime we use for inference is [ONNX](https://onnxruntime.ai). Models + * for various tasks are not shipped with the app but are downloaded on demand. + * + * The primary reason for doing these tasks in the Node.js layer is so that we + * can use the binary ONNX runtime which is 10-20x faster than the WASM based + * web one. + */ +import { app, net } from "electron/main"; +import { existsSync } from "fs"; +import fs from "node:fs/promises"; +import path from "node:path"; +import * as ort from "onnxruntime-node"; +import { writeStream } from "../fs"; +import log from "../log"; + +/** + * Download the model named {@link modelName} if we don't already have it. + * + * Also verify that the size of the model we get matches {@expectedByteSize} (if + * not, redownload it). + * + * @returns the path to the model on the local machine. 
+ */ +export const modelPathDownloadingIfNeeded = async ( + modelName: string, + expectedByteSize: number, +) => { + const modelPath = modelSavePath(modelName); + + if (!existsSync(modelPath)) { + log.info("CLIP image model not found, downloading"); + await downloadModel(modelPath, modelName); + } else { + const size = (await fs.stat(modelPath)).size; + if (size !== expectedByteSize) { + log.error( + `The size ${size} of model ${modelName} does not match the expected size, downloading again`, + ); + await downloadModel(modelPath, modelName); + } + } + + return modelPath; +}; + +/** Return the path where the given {@link modelName} is meant to be saved */ +export const modelSavePath = (modelName: string) => + path.join(app.getPath("userData"), "models", modelName); + +export const downloadModel = async (saveLocation: string, name: string) => { + // `mkdir -p` the directory where we want to save the model. + const saveDir = path.dirname(saveLocation); + await fs.mkdir(saveDir, { recursive: true }); + // Download + log.info(`Downloading ML model from ${name}`); + const url = `https://models.ente.io/${name}`; + const res = await net.fetch(url); + if (!res.ok) throw new Error(`Failed to fetch ${url}: HTTP ${res.status}`); + // Save + await writeStream(saveLocation, res.body); + log.info(`Downloaded CLIP model ${name}`); +}; + +/** + * Crete an ONNX {@link InferenceSession} with some defaults. + */ +export const createInferenceSession = async (modelPath: string) => { + return await ort.InferenceSession.create(modelPath, { + // Restrict the number of threads to 1 + intraOpNumThreads: 1, + // Be more conservative with RAM usage + enableCpuMemArena: false, + }); +}; From 7bf8912dbc3278496f723ac740ca68c8af4a70f5 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 13:45:02 +0530 Subject: [PATCH 03/17] Duplicate for now --- desktop/src/main/services/ml-face.ts | 30 ++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/desktop/src/main/services/ml-face.ts b/desktop/src/main/services/ml-face.ts index c547885bb0..f88f432ee8 100644 --- a/desktop/src/main/services/ml-face.ts +++ b/desktop/src/main/services/ml-face.ts @@ -47,6 +47,36 @@ const faceDetectionSession = async () => { return _faceDetectionSession; }; +let activeFaceEmbeddingModelDownload: Promise | undefined; + +const faceEmbeddingModelPathDownloadingIfNeeded = async () => { + try { + if (activeFaceEmbeddingModelDownload) { + log.info("Waiting for face embedding model download to finish"); + await activeFaceEmbeddingModelDownload; + } else { + activeFaceEmbeddingModelDownload = modelPathDownloadingIfNeeded( + faceEmbeddingModelName, + faceEmbeddingModelByteSize, + ); + return await activeFaceEmbeddingModelDownload; + } + } finally { + activeFaceEmbeddingModelDownload = undefined; + } +}; + +let _faceEmbeddingSession: Promise | undefined; + +const faceEmbeddingSession = async () => { + if (!_faceEmbeddingSession) { + _faceEmbeddingSession = + faceEmbeddingModelPathDownloadingIfNeeded().then((modelPath) => + createInferenceSession(modelPath), + ); + } + return _faceEmbeddingSession; +}; // export const clipImageEmbedding = async (jpegImageData: Uint8Array) => { // const tempFilePath = await generateTempFilePath(""); From 2bb9e77e34e1334712647fc37868ecc4d7cddfdb Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 13:46:34 +0530 Subject: [PATCH 04/17] Remove unused code --- .../yoloFaceDetectionService.ts | 37 ------------------- 1 file changed, 37 deletions(-) diff --git 
a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts index 9fc0f7ad24..71b51f674e 100644 --- a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts +++ b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts @@ -156,43 +156,6 @@ class YoloFaceDetectionService implements FaceDetectionService { }; } - /** - * @deprecated The method should not be used - */ - private imageBitmapToTensorData(imageBitmap) { - // Create an OffscreenCanvas and set its size - const offscreenCanvas = new OffscreenCanvas( - imageBitmap.width, - imageBitmap.height, - ); - const ctx = offscreenCanvas.getContext("2d"); - ctx.drawImage(imageBitmap, 0, 0, imageBitmap.width, imageBitmap.height); - const imageData = ctx.getImageData( - 0, - 0, - imageBitmap.width, - imageBitmap.height, - ); - const pixelData = imageData.data; - const data = new Float32Array( - 1 * 3 * imageBitmap.width * imageBitmap.height, - ); - // Populate the Float32Array with normalized pixel values - for (let i = 0; i < pixelData.length; i += 4) { - // Normalize pixel values to the range [0, 1] - data[i / 4] = pixelData[i] / 255.0; // Red channel - data[i / 4 + imageBitmap.width * imageBitmap.height] = - pixelData[i + 1] / 255.0; // Green channel - data[i / 4 + 2 * imageBitmap.width * imageBitmap.height] = - pixelData[i + 2] / 255.0; // Blue channel - } - - return { - data: data, - shape: [1, 3, imageBitmap.width, imageBitmap.height], - }; - } - // The rowOutput is a Float32Array of shape [25200, 16], where each row represents a bounding box. private getFacesFromYoloOutput( rowOutput: Float32Array, From a88f551b6a6b8bc6f3ca76f1b4af1d188ffdaa0a Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 13:58:52 +0530 Subject: [PATCH 05/17] WIP IPC API --- desktop/src/main/services/ml-face.ts | 24 ++++ .../machineLearning/machineLearningFactory.ts | 3 - .../mobileFaceNetEmbeddingService.ts | 6 - .../yoloFaceDetectionService.ts | 116 ++++++------------ .../photos/src/types/machineLearning/index.ts | 3 +- web/packages/next/types/ipc.ts | 21 +++- 6 files changed, 79 insertions(+), 94 deletions(-) diff --git a/desktop/src/main/services/ml-face.ts b/desktop/src/main/services/ml-face.ts index f88f432ee8..bf8eea1625 100644 --- a/desktop/src/main/services/ml-face.ts +++ b/desktop/src/main/services/ml-face.ts @@ -78,6 +78,30 @@ const faceEmbeddingSession = async () => { return _faceEmbeddingSession; }; +private async initOnnx() { + console.log("start ort"); + this.onnxInferenceSession = await ort.InferenceSession.create( + "/models/yoloface/yolov5s_face_640_640_dynamic.onnx", + ); + const data = new Float32Array(1 * 3 * 640 * 640); + const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]); + // TODO(MR): onnx-yolo + // const feeds: Record = {}; + const feeds: Record = {}; + const name = this.onnxInferenceSession.inputNames[0]; + feeds[name] = inputTensor; + await this.onnxInferenceSession.run(feeds); + console.log("start end"); +} + +private async getOnnxInferenceSession() { + if (!this.onnxInferenceSession) { + await this.initOnnx(); + } + return this.onnxInferenceSession; +} + + // export const clipImageEmbedding = async (jpegImageData: Uint8Array) => { // const tempFilePath = await generateTempFilePath(""); // const imageStream = new Response(jpegImageData.buffer).body; diff --git a/web/apps/photos/src/services/machineLearning/machineLearningFactory.ts 
b/web/apps/photos/src/services/machineLearning/machineLearningFactory.ts index 36e37d9b83..991ae68087 100644 --- a/web/apps/photos/src/services/machineLearning/machineLearningFactory.ts +++ b/web/apps/photos/src/services/machineLearning/machineLearningFactory.ts @@ -203,9 +203,6 @@ export class LocalMLSyncContext implements MLSyncContext { } public async dispose() { - // await this.faceDetectionService.dispose(); - // await this.faceEmbeddingService.dispose(); - this.localFilesMap = undefined; await this.syncQueue.onIdle(); this.syncQueue.removeAllListeners(); diff --git a/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts b/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts index 39953689e6..6b2450a24b 100644 --- a/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts +++ b/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts @@ -96,12 +96,6 @@ class MobileFaceNetEmbeddingService implements FaceEmbeddingService { } return embeddings; } - - public async dispose() { - const inferenceSession = await this.getOnnxInferenceSession(); - inferenceSession?.release(); - this.onnxInferenceSession = undefined; - } } export default new MobileFaceNetEmbeddingService(); diff --git a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts index 71b51f674e..02e5bb02b2 100644 --- a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts +++ b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts @@ -1,4 +1,5 @@ import { MAX_FACE_DISTANCE_PERCENT } from "constants/mlConfig"; +import { euclidean } from "hdbscan"; import { Matrix, applyToPoint, @@ -21,17 +22,7 @@ import { import { newBox } from "utils/machineLearning"; import { Box, Point } from "../../../thirdparty/face-api/classes"; -// TODO(MR): onnx-yolo -// import * as ort from "onnxruntime-web"; -// import { env } from "onnxruntime-web"; -const ort: any = {}; - -// TODO(MR): onnx-yolo -// env.wasm.wasmPaths = "/js/onnx/"; class YoloFaceDetectionService implements FaceDetectionService { - // TODO(MR): onnx-yolo - // private onnxInferenceSession?: ort.InferenceSession; - private onnxInferenceSession?: any; public method: Versioned; public constructor() { @@ -41,27 +32,44 @@ class YoloFaceDetectionService implements FaceDetectionService { }; } - private async initOnnx() { - console.log("start ort"); - this.onnxInferenceSession = await ort.InferenceSession.create( - "/models/yoloface/yolov5s_face_640_640_dynamic.onnx", - ); - const data = new Float32Array(1 * 3 * 640 * 640); + public async detectFaces( + imageBitmap: ImageBitmap, + ): Promise> { + const maxFaceDistance = imageBitmap.width * MAX_FACE_DISTANCE_PERCENT; + const preprocessResult = + this.preprocessImageBitmapToFloat32ChannelsFirst( + imageBitmap, + 640, + 640, + ); + const data = preprocessResult.data; + const resized = preprocessResult.newSize; const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]); // TODO(MR): onnx-yolo // const feeds: Record = {}; const feeds: Record = {}; - const name = this.onnxInferenceSession.inputNames[0]; - feeds[name] = inputTensor; - await this.onnxInferenceSession.run(feeds); - console.log("start end"); - } - - private async getOnnxInferenceSession() { - if (!this.onnxInferenceSession) { - await this.initOnnx(); - } - return this.onnxInferenceSession; + feeds["input"] = inputTensor; + const inferenceSession = await 
this.getOnnxInferenceSession(); + const runout = await inferenceSession.run(feeds); + const outputData = runout.output.data; + const faces = this.getFacesFromYoloOutput( + outputData as Float32Array, + 0.7, + ); + const inBox = newBox(0, 0, resized.width, resized.height); + const toBox = newBox(0, 0, imageBitmap.width, imageBitmap.height); + const transform = computeTransformToBox(inBox, toBox); + const faceDetections: Array = faces?.map((f) => { + const box = transformBox(f.box, transform); + const normLandmarks = f.landmarks; + const landmarks = transformPoints(normLandmarks, transform); + return { + box, + landmarks, + probability: f.probability as number, + } as FaceDetection; + }); + return removeDuplicateDetections(faceDetections, maxFaceDistance); } private preprocessImageBitmapToFloat32ChannelsFirst( @@ -233,64 +241,10 @@ class YoloFaceDetectionService implements FaceDetectionService { probability: faceDetection.probability, }; } - - private async estimateOnnx(imageBitmap: ImageBitmap) { - const maxFaceDistance = imageBitmap.width * MAX_FACE_DISTANCE_PERCENT; - const preprocessResult = - this.preprocessImageBitmapToFloat32ChannelsFirst( - imageBitmap, - 640, - 640, - ); - const data = preprocessResult.data; - const resized = preprocessResult.newSize; - const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]); - // TODO(MR): onnx-yolo - // const feeds: Record = {}; - const feeds: Record = {}; - feeds["input"] = inputTensor; - const inferenceSession = await this.getOnnxInferenceSession(); - const runout = await inferenceSession.run(feeds); - const outputData = runout.output.data; - const faces = this.getFacesFromYoloOutput( - outputData as Float32Array, - 0.7, - ); - const inBox = newBox(0, 0, resized.width, resized.height); - const toBox = newBox(0, 0, imageBitmap.width, imageBitmap.height); - const transform = computeTransformToBox(inBox, toBox); - const faceDetections: Array = faces?.map((f) => { - const box = transformBox(f.box, transform); - const normLandmarks = f.landmarks; - const landmarks = transformPoints(normLandmarks, transform); - return { - box, - landmarks, - probability: f.probability as number, - } as FaceDetection; - }); - return removeDuplicateDetections(faceDetections, maxFaceDistance); - } - - public async detectFaces( - imageBitmap: ImageBitmap, - ): Promise> { - // measure time taken - const facesFromOnnx = await this.estimateOnnx(imageBitmap); - return facesFromOnnx; - } - - public async dispose() { - const inferenceSession = await this.getOnnxInferenceSession(); - inferenceSession?.release(); - this.onnxInferenceSession = undefined; - } } export default new YoloFaceDetectionService(); -import { euclidean } from "hdbscan"; - /** * Removes duplicate face detections from an array of detections. 
* diff --git a/web/apps/photos/src/types/machineLearning/index.ts b/web/apps/photos/src/types/machineLearning/index.ts index 3def20a088..399990696c 100644 --- a/web/apps/photos/src/types/machineLearning/index.ts +++ b/web/apps/photos/src/types/machineLearning/index.ts @@ -261,13 +261,12 @@ export declare type MLIndex = "files" | "people"; export interface FaceDetectionService { method: Versioned; - // init(): Promise; + detectFaces(image: ImageBitmap): Promise>; getRelativeDetection( faceDetection: FaceDetection, imageDimensions: Dimensions, ): FaceDetection; - dispose(): Promise; } export interface FaceCropService { diff --git a/web/packages/next/types/ipc.ts b/web/packages/next/types/ipc.ts index a0bc07d9a8..83d9ee6bdd 100644 --- a/web/packages/next/types/ipc.ts +++ b/web/packages/next/types/ipc.ts @@ -196,7 +196,7 @@ export interface Electron { // - ML /** - * Compute and return a CLIP embedding of the given image. + * Return a CLIP embedding of the given image. * * See: [Note: CLIP based magic search] * @@ -207,7 +207,7 @@ export interface Electron { clipImageEmbedding: (jpegImageData: Uint8Array) => Promise; /** - * Compute and return a CLIP embedding of the given image. + * Return a CLIP embedding of the given image. * * See: [Note: CLIP based magic search] * @@ -217,6 +217,23 @@ export interface Electron { */ clipTextEmbedding: (text: string) => Promise; + /** + * Detect faces in the given image using YOLO. + * + * Both the input and output are opaque binary data whose internal structure + * is model (YOLO) and our implementation specific. That said, specifically + * the {@link inputImage} a particular bitmap encoding of an image. + */ + detectFaces: (inputImage: Uint8Array) => Promise; + + /** + * Return a mobilefacenet embedding for the given face data. + * + * Both the input and output are opaque binary data whose internal structure + * is model (mobilefacenet) and our implementation specific. 
+ */ + faceEmbedding: (input: Float32Array) => Promise; + // - File selection // TODO: Deprecated - use dialogs on the renderer process itself From 41f7b30ca078f2262a1c19ebf453360a55f2fa45 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 14:22:52 +0530 Subject: [PATCH 06/17] Wire together --- desktop/src/main/ipc.ts | 11 ++++++++++- desktop/src/main/services/ml-face.ts | 12 +++++++++++- desktop/src/preload.ts | 8 ++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/desktop/src/main/ipc.ts b/desktop/src/main/ipc.ts index 1d863b8e9a..b6e8848183 100644 --- a/desktop/src/main/ipc.ts +++ b/desktop/src/main/ipc.ts @@ -36,13 +36,14 @@ import { updateAndRestart, updateOnNextRestart, } from "./services/app-update"; -import { clipImageEmbedding, clipTextEmbedding } from "./services/ml-clip"; import { runFFmpegCmd } from "./services/ffmpeg"; import { getDirFiles } from "./services/fs"; import { convertToJPEG, generateImageThumbnail, } from "./services/imageProcessor"; +import { clipImageEmbedding, clipTextEmbedding } from "./services/ml-clip"; +import { detectFaces, faceEmbedding } from "./services/ml-face"; import { clearStores, encryptionKey, @@ -146,6 +147,14 @@ export const attachIPCHandlers = () => { clipTextEmbedding(text), ); + ipcMain.handle("detectFaces", (_, imageData: Uint8Array) => + detectFaces(imageData), + ); + + ipcMain.handle("faceEmbedding", (_, input: Float32Array) => + faceEmbedding(input), + ); + // - File selection ipcMain.handle("selectDirectory", () => selectDirectory()); diff --git a/desktop/src/main/services/ml-face.ts b/desktop/src/main/services/ml-face.ts index bf8eea1625..066f5406bf 100644 --- a/desktop/src/main/services/ml-face.ts +++ b/desktop/src/main/services/ml-face.ts @@ -78,6 +78,16 @@ const faceEmbeddingSession = async () => { return _faceEmbeddingSession; }; +export const detectFaces = async (inputImage: Uint8Array) => { + throw new Error("test"); +}; + +export const faceEmbedding = async (input: Float32Array) => { + throw new Error("test"); +}; + +/* + private async initOnnx() { console.log("start ort"); this.onnxInferenceSession = await ort.InferenceSession.create( @@ -100,7 +110,7 @@ private async getOnnxInferenceSession() { } return this.onnxInferenceSession; } - +*/ // export const clipImageEmbedding = async (jpegImageData: Uint8Array) => { // const tempFilePath = await generateTempFilePath(""); diff --git a/desktop/src/preload.ts b/desktop/src/preload.ts index 07736502bd..bea5c9e18f 100644 --- a/desktop/src/preload.ts +++ b/desktop/src/preload.ts @@ -143,6 +143,12 @@ const clipImageEmbedding = (jpegImageData: Uint8Array): Promise => const clipTextEmbedding = (text: string): Promise => ipcRenderer.invoke("clipTextEmbedding", text); +const detectFaces = (imageData: Uint8Array): Promise => + ipcRenderer.invoke("detectFaces", imageData); + +const faceEmbedding = (input: Float32Array): Promise => + ipcRenderer.invoke("faceEmbedding", input); + // - File selection // TODO: Deprecated - use dialogs on the renderer process itself @@ -322,6 +328,8 @@ contextBridge.exposeInMainWorld("electron", { // - ML clipImageEmbedding, clipTextEmbedding, + detectFaces, + faceEmbedding, // - File selection selectDirectory, From a1d6ef43b4565733010e76b413090cdf61681729 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 14:37:44 +0530 Subject: [PATCH 07/17] Roundtrip --- desktop/src/main/ipc.ts | 4 +- desktop/src/main/services/ml-face.ts | 38 +++++++++++-------- desktop/src/preload.ts | 4 +- .../yoloFaceDetectionService.ts | 10 
+---- web/packages/next/types/ipc.ts | 9 ++--- 5 files changed, 33 insertions(+), 32 deletions(-) diff --git a/desktop/src/main/ipc.ts b/desktop/src/main/ipc.ts index b6e8848183..180e68cdcf 100644 --- a/desktop/src/main/ipc.ts +++ b/desktop/src/main/ipc.ts @@ -147,8 +147,8 @@ export const attachIPCHandlers = () => { clipTextEmbedding(text), ); - ipcMain.handle("detectFaces", (_, imageData: Uint8Array) => - detectFaces(imageData), + ipcMain.handle("detectFaces", (_, input: Float32Array) => + detectFaces(input), ); ipcMain.handle("faceEmbedding", (_, input: Float32Array) => diff --git a/desktop/src/main/services/ml-face.ts b/desktop/src/main/services/ml-face.ts index 066f5406bf..78eb82bd15 100644 --- a/desktop/src/main/services/ml-face.ts +++ b/desktop/src/main/services/ml-face.ts @@ -78,8 +78,29 @@ const faceEmbeddingSession = async () => { return _faceEmbeddingSession; }; -export const detectFaces = async (inputImage: Uint8Array) => { - throw new Error("test"); +export const detectFaces = async (input: Float32Array) => { + // console.log("start ort"); + // this.onnxInferenceSession = await ort.InferenceSession.create( + // "/models/yoloface/yolov5s_face_640_640_dynamic.onnx", + // ); + // const data = new Float32Array(1 * 3 * 640 * 640); + // const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]); + // // TODO(MR): onnx-yolo + // // const feeds: Record = {}; + // const feeds: Record = {}; + // const name = this.onnxInferenceSession.inputNames[0]; + // feeds[name] = inputTensor; + // await this.onnxInferenceSession.run(feeds); + // console.log("start end"); + + const session = await faceDetectionSession(); + const t = Date.now(); + const feeds = { + input: new ort.Tensor("float32", input, [1, 3, 640, 640]), + }; + const results = await session.run(feeds); + log.debug(() => `onnx/yolo inference took ${Date.now() - t} ms`); + return results["output"].data; }; export const faceEmbedding = async (input: Float32Array) => { @@ -89,19 +110,6 @@ export const faceEmbedding = async (input: Float32Array) => { /* private async initOnnx() { - console.log("start ort"); - this.onnxInferenceSession = await ort.InferenceSession.create( - "/models/yoloface/yolov5s_face_640_640_dynamic.onnx", - ); - const data = new Float32Array(1 * 3 * 640 * 640); - const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]); - // TODO(MR): onnx-yolo - // const feeds: Record = {}; - const feeds: Record = {}; - const name = this.onnxInferenceSession.inputNames[0]; - feeds[name] = inputTensor; - await this.onnxInferenceSession.run(feeds); - console.log("start end"); } private async getOnnxInferenceSession() { diff --git a/desktop/src/preload.ts b/desktop/src/preload.ts index bea5c9e18f..2db39e2290 100644 --- a/desktop/src/preload.ts +++ b/desktop/src/preload.ts @@ -143,8 +143,8 @@ const clipImageEmbedding = (jpegImageData: Uint8Array): Promise => const clipTextEmbedding = (text: string): Promise => ipcRenderer.invoke("clipTextEmbedding", text); -const detectFaces = (imageData: Uint8Array): Promise => - ipcRenderer.invoke("detectFaces", imageData); +const detectFaces = (input: Float32Array): Promise => + ipcRenderer.invoke("detectFaces", input); const faceEmbedding = (input: Float32Array): Promise => ipcRenderer.invoke("faceEmbedding", input); diff --git a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts index 02e5bb02b2..9efd31cbb7 100644 --- 
a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts +++ b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts @@ -1,3 +1,4 @@ +import { ensureElectron } from "@/next/electron"; import { MAX_FACE_DISTANCE_PERCENT } from "constants/mlConfig"; import { euclidean } from "hdbscan"; import { @@ -44,14 +45,7 @@ class YoloFaceDetectionService implements FaceDetectionService { ); const data = preprocessResult.data; const resized = preprocessResult.newSize; - const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]); - // TODO(MR): onnx-yolo - // const feeds: Record = {}; - const feeds: Record = {}; - feeds["input"] = inputTensor; - const inferenceSession = await this.getOnnxInferenceSession(); - const runout = await inferenceSession.run(feeds); - const outputData = runout.output.data; + const outputData = await ensureElectron().detectFaces(data); const faces = this.getFacesFromYoloOutput( outputData as Float32Array, 0.7, diff --git a/web/packages/next/types/ipc.ts b/web/packages/next/types/ipc.ts index 83d9ee6bdd..5b0979eaa2 100644 --- a/web/packages/next/types/ipc.ts +++ b/web/packages/next/types/ipc.ts @@ -221,16 +221,15 @@ export interface Electron { * Detect faces in the given image using YOLO. * * Both the input and output are opaque binary data whose internal structure - * is model (YOLO) and our implementation specific. That said, specifically - * the {@link inputImage} a particular bitmap encoding of an image. + * is specific to our implementation and the model (YOLO) we use. */ - detectFaces: (inputImage: Uint8Array) => Promise; + detectFaces: (input: Float32Array) => Promise; /** - * Return a mobilefacenet embedding for the given face data. + * Return a MobileFaceNet embedding for the given face data. * * Both the input and output are opaque binary data whose internal structure - * is model (mobilefacenet) and our implementation specific. + * is specific to our implementation and the model (MobileFaceNet) we use. 
*/ faceEmbedding: (input: Float32Array) => Promise; From 9ff4989d81df88609890ff43eb7d88c45d51c025 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 14:39:59 +0530 Subject: [PATCH 08/17] Cleanup --- desktop/src/main/services/ml-clip.ts | 4 +- desktop/src/main/services/ml-face.ts | 45 +------------------ .../services/machineLearning/faceService.ts | 5 +-- 3 files changed, 6 insertions(+), 48 deletions(-) diff --git a/desktop/src/main/services/ml-clip.ts b/desktop/src/main/services/ml-clip.ts index 3fe6da2eb2..63fa751482 100644 --- a/desktop/src/main/services/ml-clip.ts +++ b/desktop/src/main/services/ml-clip.ts @@ -134,7 +134,7 @@ const clipImageEmbedding_ = async (jpegFilePath: string) => { const results = await imageSession.run(feeds); log.debug( () => - `CLIP image embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, + `onnx/clip image embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, ); const imageEmbedding = results["output"].data; // Float32Array return normalizeEmbedding(imageEmbedding); @@ -241,7 +241,7 @@ export const clipTextEmbedding = async (text: string) => { const results = await imageSession.run(feeds); log.debug( () => - `CLIP text embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, + `onnx/clip text embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, ); const textEmbedding = results["output"].data; return normalizeEmbedding(textEmbedding); diff --git a/desktop/src/main/services/ml-face.ts b/desktop/src/main/services/ml-face.ts index 78eb82bd15..c79ae591f8 100644 --- a/desktop/src/main/services/ml-face.ts +++ b/desktop/src/main/services/ml-face.ts @@ -2,7 +2,7 @@ * @file Various face recognition related tasks. * * - Face detection with the YOLO model. - * - Face embedding with the mobilefacenet model. + * - Face embedding with the MobileFaceNet model. * * The runtime used is ONNX. 
*/ @@ -99,51 +99,10 @@ export const detectFaces = async (input: Float32Array) => { input: new ort.Tensor("float32", input, [1, 3, 640, 640]), }; const results = await session.run(feeds); - log.debug(() => `onnx/yolo inference took ${Date.now() - t} ms`); + log.debug(() => `onnx/yolo face detection took ${Date.now() - t} ms`); return results["output"].data; }; export const faceEmbedding = async (input: Float32Array) => { throw new Error("test"); }; - -/* - -private async initOnnx() { -} - -private async getOnnxInferenceSession() { - if (!this.onnxInferenceSession) { - await this.initOnnx(); - } - return this.onnxInferenceSession; -} -*/ - -// export const clipImageEmbedding = async (jpegImageData: Uint8Array) => { -// const tempFilePath = await generateTempFilePath(""); -// const imageStream = new Response(jpegImageData.buffer).body; -// await writeStream(tempFilePath, imageStream); -// try { -// return await clipImageEmbedding_(tempFilePath); -// } finally { -// await deleteTempFile(tempFilePath); -// } -// }; - -// const clipImageEmbedding_ = async (jpegFilePath: string) => { -// const imageSession = await onnxImageSession(); -// const t1 = Date.now(); -// const rgbData = await getRGBData(jpegFilePath); -// const feeds = { -// input: new ort.Tensor("float32", rgbData, [1, 3, 224, 224]), -// }; -// const t2 = Date.now(); -// const results = await imageSession.run(feeds); -// log.debug( -// () => -// `CLIP image embedding took ${Date.now() - t1} ms (prep: ${t2 - t1} ms, inference: ${Date.now() - t2} ms)`, -// ); -// const imageEmbedding = results["output"].data; // Float32Array -// return normalizeEmbedding(imageEmbedding); -// }; diff --git a/web/apps/photos/src/services/machineLearning/faceService.ts b/web/apps/photos/src/services/machineLearning/faceService.ts index 449ae0b964..0f26950f8a 100644 --- a/web/apps/photos/src/services/machineLearning/faceService.ts +++ b/web/apps/photos/src/services/machineLearning/faceService.ts @@ -55,7 +55,7 @@ class FaceService { await syncContext.faceDetectionService.detectFaces(imageBitmap); console.timeEnd(timerId); console.log("faceDetections: ", faceDetections?.length); - // log.info('3 TF Memory stats: ',JSON.stringify(tf.memory())); + // TODO: reenable faces filtering based on width const detectedFaces = faceDetections?.map((detection) => { return { @@ -150,7 +150,7 @@ class FaceService { imageBitmap.close(); log.info("[MLService] alignedFaces: ", newMlFile.faces?.length); - // log.info('4 TF Memory stats: ',JSON.stringify(tf.memory())); + return faceImages; } @@ -187,7 +187,6 @@ class FaceService { newMlFile.faces.forEach((f, i) => (f.embedding = embeddings[i])); log.info("[MLService] facesWithEmbeddings: ", newMlFile.faces.length); - // log.info('5 TF Memory stats: ',JSON.stringify(tf.memory())); } async syncFileFaceMakeRelativeDetections( From 52727f2255624e88dae26c5f7b0675a4aa7911ae Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 15:02:14 +0530 Subject: [PATCH 09/17] Also move the embedding --- desktop/src/main/services/ml-face.ts | 37 ++++++++- web/apps/photos/src/constants/mlConfig.ts | 11 --- .../laplacianBlurDetectionService.ts | 4 +- .../mobileFaceNetEmbeddingService.ts | 76 ++----------------- .../yoloFaceDetectionService.ts | 4 +- .../photos/src/types/machineLearning/index.ts | 3 +- 6 files changed, 49 insertions(+), 86 deletions(-) diff --git a/desktop/src/main/services/ml-face.ts b/desktop/src/main/services/ml-face.ts index c79ae591f8..63b7a9d02f 100644 --- a/desktop/src/main/services/ml-face.ts +++ 
b/desktop/src/main/services/ml-face.ts @@ -104,5 +104,40 @@ export const detectFaces = async (input: Float32Array) => { }; export const faceEmbedding = async (input: Float32Array) => { - throw new Error("test"); + // console.log("start ort mobilefacenet"); + // this.onnxInferenceSession = await ort.InferenceSession.create( + // "/models/mobilefacenet/mobilefacenet_opset15.onnx", + // ); + // const faceBatchSize = 1; + // const data = new Float32Array( + // faceBatchSize * 3 * this.faceSize * this.faceSize, + // ); + // const inputTensor = new ort.Tensor("float32", data, [ + // faceBatchSize, + // this.faceSize, + // this.faceSize, + // 3, + // ]); + // // TODO(MR): onnx-yolo + // // const feeds: Record = {}; + // const feeds: Record = {}; + // const name = this.onnxInferenceSession.inputNames[0]; + // feeds[name] = inputTensor; + // await this.onnxInferenceSession.run(feeds); + // console.log("start end mobilefacenet"); + + // Dimension of each face (alias) + const mobileFaceNetFaceSize = 112; + // Smaller alias + const z = mobileFaceNetFaceSize; + // Size of each face's data in the batch + const n = Math.round(input.length / (z * z * 3)); + const inputTensor = new ort.Tensor("float32", input, [n, z, z, 3]); + + const session = await faceEmbeddingSession(); + const t = Date.now(); + const feeds = { img_inputs: inputTensor }; + const results = await session.run(feeds); + log.debug(() => `onnx/yolo face embedding took ${Date.now() - t} ms`); + return results.embeddings["cpuData"]; // as Float32Array; }; diff --git a/web/apps/photos/src/constants/mlConfig.ts b/web/apps/photos/src/constants/mlConfig.ts index ff3eed264a..929594e1c1 100644 --- a/web/apps/photos/src/constants/mlConfig.ts +++ b/web/apps/photos/src/constants/mlConfig.ts @@ -53,15 +53,4 @@ export const DEFAULT_ML_SEARCH_CONFIG: MLSearchConfig = { enabled: false, }; -export const ML_SYNC_DOWNLOAD_TIMEOUT_MS = 300000; - -export const MAX_FACE_DISTANCE_PERCENT = Math.sqrt(2) / 100; - export const MAX_ML_SYNC_ERROR_COUNT = 1; - -export const TEXT_DETECTION_TIMEOUT_MS = [10000, 30000, 60000, 120000, 240000]; - -export const MOBILEFACENET_FACE_SIZE = 112; -export const MOBILEFACENET_EMBEDDING_SIZE = 192; - -export const BATCHES_BEFORE_SYNCING_INDEX = 5; diff --git a/web/apps/photos/src/services/machineLearning/laplacianBlurDetectionService.ts b/web/apps/photos/src/services/machineLearning/laplacianBlurDetectionService.ts index b5842f70c2..14178a5351 100644 --- a/web/apps/photos/src/services/machineLearning/laplacianBlurDetectionService.ts +++ b/web/apps/photos/src/services/machineLearning/laplacianBlurDetectionService.ts @@ -1,10 +1,10 @@ -import { MOBILEFACENET_FACE_SIZE } from "constants/mlConfig"; import { BlurDetectionMethod, BlurDetectionService, Versioned, } from "types/machineLearning"; import { createGrayscaleIntMatrixFromNormalized2List } from "utils/image"; +import { mobileFaceNetFaceSize } from "./mobileFaceNetEmbeddingService"; class LaplacianBlurDetectionService implements BlurDetectionService { public method: Versioned; @@ -19,7 +19,7 @@ class LaplacianBlurDetectionService implements BlurDetectionService { public detectBlur(alignedFaces: Float32Array): number[] { const numFaces = Math.round( alignedFaces.length / - (MOBILEFACENET_FACE_SIZE * MOBILEFACENET_FACE_SIZE * 3), + (mobileFaceNetFaceSize * mobileFaceNetFaceSize * 3), ); const blurValues: number[] = []; for (let i = 0; i < numFaces; i++) { diff --git a/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts 
b/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts index 6b2450a24b..7daa7d8444 100644 --- a/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts +++ b/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts @@ -1,7 +1,4 @@ -import { - MOBILEFACENET_EMBEDDING_SIZE, - MOBILEFACENET_FACE_SIZE, -} from "constants/mlConfig"; +import { ensureElectron } from "@/next/electron"; import { FaceEmbedding, FaceEmbeddingMethod, @@ -9,17 +6,9 @@ import { Versioned, } from "types/machineLearning"; -// TODO(MR): onnx-yolo -// import * as ort from "onnxruntime-web"; -// import { env } from "onnxruntime-web"; -const ort: any = {}; +export const mobileFaceNetFaceSize = 112; -// TODO(MR): onnx-yolo -// env.wasm.wasmPaths = "/js/onnx/"; class MobileFaceNetEmbeddingService implements FaceEmbeddingService { - // TODO(MR): onnx-yolo - // private onnxInferenceSession?: ort.InferenceSession; - private onnxInferenceSession?: any; public method: Versioned; public faceSize: number; @@ -28,70 +17,21 @@ class MobileFaceNetEmbeddingService implements FaceEmbeddingService { value: "MobileFaceNet", version: 2, }; - this.faceSize = MOBILEFACENET_FACE_SIZE; - // TODO: set timeout - } - - private async initOnnx() { - console.log("start ort mobilefacenet"); - this.onnxInferenceSession = await ort.InferenceSession.create( - "/models/mobilefacenet/mobilefacenet_opset15.onnx", - ); - const faceBatchSize = 1; - const data = new Float32Array( - faceBatchSize * 3 * this.faceSize * this.faceSize, - ); - const inputTensor = new ort.Tensor("float32", data, [ - faceBatchSize, - this.faceSize, - this.faceSize, - 3, - ]); - // TODO(MR): onnx-yolo - // const feeds: Record = {}; - const feeds: Record = {}; - const name = this.onnxInferenceSession.inputNames[0]; - feeds[name] = inputTensor; - await this.onnxInferenceSession.run(feeds); - console.log("start end mobilefacenet"); - } - - private async getOnnxInferenceSession() { - if (!this.onnxInferenceSession) { - await this.initOnnx(); - } - return this.onnxInferenceSession; + this.faceSize = mobileFaceNetFaceSize; } public async getFaceEmbeddings( faceData: Float32Array, ): Promise> { - const inputTensor = new ort.Tensor("float32", faceData, [ - Math.round(faceData.length / (this.faceSize * this.faceSize * 3)), - this.faceSize, - this.faceSize, - 3, - ]); - // TODO(MR): onnx-yolo - // const feeds: Record = {}; - const feeds: Record = {}; - feeds["img_inputs"] = inputTensor; - const inferenceSession = await this.getOnnxInferenceSession(); - // TODO(MR): onnx-yolo - // const runout: ort.InferenceSession.OnnxValueMapType = - const runout: any = await inferenceSession.run(feeds); - // const test = runout.embeddings; - // const test2 = test.cpuData; - const outputData = runout.embeddings["cpuData"] as Float32Array; + const outputData = await ensureElectron().faceEmbedding(faceData); + + const embeddingSize = 192; const embeddings = new Array( - outputData.length / MOBILEFACENET_EMBEDDING_SIZE, + outputData.length / embeddingSize, ); for (let i = 0; i < embeddings.length; i++) { embeddings[i] = new Float32Array( - outputData.slice( - i * MOBILEFACENET_EMBEDDING_SIZE, - (i + 1) * MOBILEFACENET_EMBEDDING_SIZE, - ), + outputData.slice(i * embeddingSize, (i + 1) * embeddingSize), ); } return embeddings; diff --git a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts index 9efd31cbb7..fdbb3f102b 100644 --- 
a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts +++ b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts @@ -1,5 +1,4 @@ import { ensureElectron } from "@/next/electron"; -import { MAX_FACE_DISTANCE_PERCENT } from "constants/mlConfig"; import { euclidean } from "hdbscan"; import { Matrix, @@ -36,7 +35,8 @@ class YoloFaceDetectionService implements FaceDetectionService { public async detectFaces( imageBitmap: ImageBitmap, ): Promise> { - const maxFaceDistance = imageBitmap.width * MAX_FACE_DISTANCE_PERCENT; + const maxFaceDistancePercent = Math.sqrt(2) / 100; + const maxFaceDistance = imageBitmap.width * maxFaceDistancePercent; const preprocessResult = this.preprocessImageBitmapToFloat32ChannelsFirst( imageBitmap, diff --git a/web/apps/photos/src/types/machineLearning/index.ts b/web/apps/photos/src/types/machineLearning/index.ts index 399990696c..436585bbae 100644 --- a/web/apps/photos/src/types/machineLearning/index.ts +++ b/web/apps/photos/src/types/machineLearning/index.ts @@ -287,9 +287,8 @@ export interface FaceAlignmentService { export interface FaceEmbeddingService { method: Versioned; faceSize: number; - // init(): Promise; + getFaceEmbeddings(faceImages: Float32Array): Promise>; - dispose(): Promise; } export interface BlurDetectionService { From f5bf776848653f23d293042ef64e02c3c0e69c0d Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 15:06:03 +0530 Subject: [PATCH 10/17] lint --- desktop/docs/dependencies.md | 10 +++++----- desktop/src/main/services/ml-face.ts | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/desktop/docs/dependencies.md b/desktop/docs/dependencies.md index 9cced1f818..62f70e8e46 100644 --- a/desktop/docs/dependencies.md +++ b/desktop/docs/dependencies.md @@ -1,8 +1,8 @@ # Dependencies -* [Electron](#electron) -* [Dev dependencies](#dev) -* [Functionality](#functionality) +- [Electron](#electron) +- [Dev dependencies](#dev) +- [Functionality](#functionality) ## Electron @@ -114,8 +114,8 @@ available on the host machine, and is not bundled with our app. AI/ML runtime. It powers both natural language searches (using CLIP) and face detection (using YOLO). -[jpeg-js](https://github.com/jpeg-js/jpeg-js#readme) is used for decoding -JPEG data into raw RGB bytes before passing it to ONNX. +[jpeg-js](https://github.com/jpeg-js/jpeg-js#readme) is used for decoding JPEG +data into raw RGB bytes before passing it to ONNX. html-entities is used by the bundled clip-bpe-ts tokenizer for CLIP. diff --git a/desktop/src/main/services/ml-face.ts b/desktop/src/main/services/ml-face.ts index 63b7a9d02f..62865ff236 100644 --- a/desktop/src/main/services/ml-face.ts +++ b/desktop/src/main/services/ml-face.ts @@ -139,5 +139,6 @@ export const faceEmbedding = async (input: Float32Array) => { const feeds = { img_inputs: inputTensor }; const results = await session.run(feeds); log.debug(() => `onnx/yolo face embedding took ${Date.now() - t} ms`); - return results.embeddings["cpuData"]; // as Float32Array; + // TODO: What's with this type? 
+ return (results.embeddings as unknown as any)["cpuData"]; // as Float32Array; }; From 33e3265db6351f2cf92da227c8c8e2f1b2deba06 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 15:17:50 +0530 Subject: [PATCH 11/17] Migration for existing configs --- web/apps/photos/src/types/machineLearning/index.ts | 9 +++------ web/apps/photos/src/utils/storage/mlIDbStorage.ts | 3 +++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/web/apps/photos/src/types/machineLearning/index.ts b/web/apps/photos/src/types/machineLearning/index.ts index 436585bbae..d0c902333c 100644 --- a/web/apps/photos/src/types/machineLearning/index.ts +++ b/web/apps/photos/src/types/machineLearning/index.ts @@ -45,16 +45,13 @@ export declare type Landmark = Point; export declare type ImageType = "Original" | "Preview"; -export declare type FaceDetectionMethod = "FaceApiSSD" | "YoloFace"; +export declare type FaceDetectionMethod = "YoloFace"; export declare type FaceCropMethod = "ArcFace"; -export declare type FaceAlignmentMethod = - | "ArcFace" - | "FaceApiDlib" - | "RotatedFaceApiDlib"; +export declare type FaceAlignmentMethod = "ArcFace"; -export declare type FaceEmbeddingMethod = "MobileFaceNet" | "FaceApiDlib"; +export declare type FaceEmbeddingMethod = "MobileFaceNet"; export declare type BlurDetectionMethod = "Laplacian"; diff --git a/web/apps/photos/src/utils/storage/mlIDbStorage.ts b/web/apps/photos/src/utils/storage/mlIDbStorage.ts index d7e24cbe80..bba71c4ff5 100644 --- a/web/apps/photos/src/utils/storage/mlIDbStorage.ts +++ b/web/apps/photos/src/utils/storage/mlIDbStorage.ts @@ -124,6 +124,9 @@ class MLIDbStorage { .add(DEFAULT_ML_SEARCH_CONFIG, ML_SEARCH_CONFIG_NAME); } if (oldVersion < 4) { + db.deleteObjectStore("configs"); + db.createObjectStore("configs"); + db.deleteObjectStore("things"); } From ff66a2f44caf6fec50eba3b64163bc6b6ee99bba Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 15:27:02 +0530 Subject: [PATCH 12/17] The ML code runs in workers --- .../services/machineLearning/mobileFaceNetEmbeddingService.ts | 4 ++-- .../src/services/machineLearning/yoloFaceDetectionService.ts | 4 ++-- web/apps/photos/src/utils/storage/mlIDbStorage.ts | 1 + web/packages/next/worker/comlink-worker.ts | 3 +++ 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts b/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts index 7daa7d8444..818b8a5d12 100644 --- a/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts +++ b/web/apps/photos/src/services/machineLearning/mobileFaceNetEmbeddingService.ts @@ -1,4 +1,4 @@ -import { ensureElectron } from "@/next/electron"; +import { workerBridge } from "@/next/worker/worker-bridge"; import { FaceEmbedding, FaceEmbeddingMethod, @@ -23,7 +23,7 @@ class MobileFaceNetEmbeddingService implements FaceEmbeddingService { public async getFaceEmbeddings( faceData: Float32Array, ): Promise> { - const outputData = await ensureElectron().faceEmbedding(faceData); + const outputData = await workerBridge.faceEmbedding(faceData); const embeddingSize = 192; const embeddings = new Array( diff --git a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts index fdbb3f102b..3e7d282fb1 100644 --- a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts +++ 
b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts @@ -1,4 +1,3 @@ -import { ensureElectron } from "@/next/electron"; import { euclidean } from "hdbscan"; import { Matrix, @@ -21,6 +20,7 @@ import { } from "utils/image"; import { newBox } from "utils/machineLearning"; import { Box, Point } from "../../../thirdparty/face-api/classes"; +import { workerBridge } from "@/next/worker/worker-bridge"; class YoloFaceDetectionService implements FaceDetectionService { public method: Versioned; @@ -45,7 +45,7 @@ class YoloFaceDetectionService implements FaceDetectionService { ); const data = preprocessResult.data; const resized = preprocessResult.newSize; - const outputData = await ensureElectron().detectFaces(data); + const outputData = await workerBridge.detectFaces(data); const faces = this.getFacesFromYoloOutput( outputData as Float32Array, 0.7, diff --git a/web/apps/photos/src/utils/storage/mlIDbStorage.ts b/web/apps/photos/src/utils/storage/mlIDbStorage.ts index bba71c4ff5..8be60afacf 100644 --- a/web/apps/photos/src/utils/storage/mlIDbStorage.ts +++ b/web/apps/photos/src/utils/storage/mlIDbStorage.ts @@ -124,6 +124,7 @@ class MLIDbStorage { .add(DEFAULT_ML_SEARCH_CONFIG, ML_SEARCH_CONFIG_NAME); } if (oldVersion < 4) { + // TODO(MR): This loses the user's settings. db.deleteObjectStore("configs"); db.createObjectStore("configs"); diff --git a/web/packages/next/worker/comlink-worker.ts b/web/packages/next/worker/comlink-worker.ts index 033c79fa8c..ad340c2094 100644 --- a/web/packages/next/worker/comlink-worker.ts +++ b/web/packages/next/worker/comlink-worker.ts @@ -46,6 +46,9 @@ const workerBridge = { logToDisk, convertToJPEG: (inputFileData: Uint8Array, filename: string) => ensureElectron().convertToJPEG(inputFileData, filename), + detectFaces: (input: Float32Array) => ensureElectron().detectFaces(input), + faceEmbedding: (input: Float32Array) => + ensureElectron().faceEmbedding(input), }; export type WorkerBridge = typeof workerBridge; From ef4462553c0595c61a0f54e0f133b366a914c590 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 15:35:25 +0530 Subject: [PATCH 13/17] Fix incorrect typecheck that fails on undefined --- web/apps/photos/src/services/machineLearning/mlWorkManager.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/web/apps/photos/src/services/machineLearning/mlWorkManager.ts b/web/apps/photos/src/services/machineLearning/mlWorkManager.ts index d62d6f829e..c5df14b224 100644 --- a/web/apps/photos/src/services/machineLearning/mlWorkManager.ts +++ b/web/apps/photos/src/services/machineLearning/mlWorkManager.ts @@ -186,8 +186,7 @@ class MLWorkManager { return mlWorker.syncLocalFile(token, userID, enteFile, localFile); }); - // @ts-expect-error "TODO: Fix ML related type errors" - if ("message" in result) { + if (result instanceof Error) { // TODO: redirect/refresh to gallery in case of session_expired // may not be required as uploader should anyways take care of this console.error("Error while syncing local file: ", result); From e58e96091f26db7c9a4c56d4d41f6fb3d30f1042 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 16:15:08 +0530 Subject: [PATCH 14/17] Ignore (expected) errors when trying to cache face crops --- .../services/machineLearning/faceService.ts | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/web/apps/photos/src/services/machineLearning/faceService.ts b/web/apps/photos/src/services/machineLearning/faceService.ts index 0f26950f8a..3116ac23c2 100644 --- 
a/web/apps/photos/src/services/machineLearning/faceService.ts +++ b/web/apps/photos/src/services/machineLearning/faceService.ts @@ -225,11 +225,21 @@ class FaceService { face.detection, syncContext.config.faceCrop, ); - face.crop = await storeFaceCrop( - face.id, - faceCrop, - syncContext.config.faceCrop.blobOptions, - ); + try { + face.crop = await storeFaceCrop( + face.id, + faceCrop, + syncContext.config.faceCrop.blobOptions, + ); + } catch (e) { + // TODO(MR): Temporarily ignoring errors about failing cache puts + // when using a custom scheme in Electron. Needs an alternative + // approach, perhaps OPFS. + console.error( + "Ignoring error when caching face crop, the face crop will not be available", + e, + ); + } const blob = await imageBitmapToBlob(faceCrop.image); faceCrop.image.close(); return blob; From 320db9f8b741973492b56aa4dd9d019e90b0c716 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 16:22:38 +0530 Subject: [PATCH 15/17] Fix the putEmbeddings API calls for now --- web/apps/photos/src/services/embeddingService.ts | 6 +++++- web/packages/next/worker/comlink-worker.ts | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/web/apps/photos/src/services/embeddingService.ts b/web/apps/photos/src/services/embeddingService.ts index b93b01532b..a4309e314c 100644 --- a/web/apps/photos/src/services/embeddingService.ts +++ b/web/apps/photos/src/services/embeddingService.ts @@ -1,4 +1,6 @@ +import { inWorker } from "@/next/env"; import log from "@/next/log"; +import { workerBridge } from "@/next/worker/worker-bridge"; import ComlinkCryptoWorker from "@ente/shared/crypto"; import { CustomError } from "@ente/shared/error"; import HTTPService from "@ente/shared/network/HTTPService"; @@ -262,7 +264,9 @@ export const putEmbedding = async ( putEmbeddingReq: PutEmbeddingRequest, ): Promise => { try { - const token = getToken(); + const token = inWorker() + ? await workerBridge.getAuthToken() + : getToken(); if (!token) { log.info("putEmbedding failed: token not found"); throw Error(CustomError.TOKEN_MISSING); diff --git a/web/packages/next/worker/comlink-worker.ts b/web/packages/next/worker/comlink-worker.ts index ad340c2094..f082ac1145 100644 --- a/web/packages/next/worker/comlink-worker.ts +++ b/web/packages/next/worker/comlink-worker.ts @@ -35,6 +35,19 @@ export class ComlinkWorker InstanceType> { } } +// TODO(MR): Temporary method to forward auth tokens to workers +const getAuthToken = () => { + // LS_KEYS.USER + const userJSONString = localStorage.getItem("user"); + if (!userJSONString) return undefined; + const json: unknown = JSON.parse(userJSONString); + if (!json || typeof json != "object" || !("token" in json)) + return undefined; + const token = json.token; + if (typeof token != "string") return undefined; + return token; +}; + /** * A minimal set of utility functions that we expose to all workers that we * create. 
@@ -44,6 +57,7 @@ export class ComlinkWorker InstanceType> { */ const workerBridge = { logToDisk, + getAuthToken, convertToJPEG: (inputFileData: Uint8Array, filename: string) => ensureElectron().convertToJPEG(inputFileData, filename), detectFaces: (input: Float32Array) => ensureElectron().detectFaces(input), From 33c84f7a08e99135f5e9ea54fb3b9e7ac120d4d9 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 16:27:27 +0530 Subject: [PATCH 16/17] Prevent undefined errors --- web/apps/photos/src/utils/common/job.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/apps/photos/src/utils/common/job.ts b/web/apps/photos/src/utils/common/job.ts index f549966b66..365f879e95 100644 --- a/web/apps/photos/src/utils/common/job.ts +++ b/web/apps/photos/src/utils/common/job.ts @@ -50,7 +50,7 @@ export class SimpleJob { try { const jobResult = await this.runCallback(); - if (jobResult.shouldBackoff) { + if (jobResult && jobResult.shouldBackoff) { this.intervalSec = Math.min( this.config.maxItervalSec, this.intervalSec * this.config.backoffMultiplier, From 61143c9c62b30e7fa06365447aa1b7cac5a680f4 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 11 Apr 2024 16:36:56 +0530 Subject: [PATCH 17/17] Cleanup --- desktop/src/main/services/ml-face.ts | 38 +------------------ .../yoloFaceDetectionService.ts | 2 +- 2 files changed, 2 insertions(+), 38 deletions(-) diff --git a/desktop/src/main/services/ml-face.ts b/desktop/src/main/services/ml-face.ts index 62865ff236..1f007c5fd8 100644 --- a/desktop/src/main/services/ml-face.ts +++ b/desktop/src/main/services/ml-face.ts @@ -79,20 +79,6 @@ const faceEmbeddingSession = async () => { }; export const detectFaces = async (input: Float32Array) => { - // console.log("start ort"); - // this.onnxInferenceSession = await ort.InferenceSession.create( - // "/models/yoloface/yolov5s_face_640_640_dynamic.onnx", - // ); - // const data = new Float32Array(1 * 3 * 640 * 640); - // const inputTensor = new ort.Tensor("float32", data, [1, 3, 640, 640]); - // // TODO(MR): onnx-yolo - // // const feeds: Record = {}; - // const feeds: Record = {}; - // const name = this.onnxInferenceSession.inputNames[0]; - // feeds[name] = inputTensor; - // await this.onnxInferenceSession.run(feeds); - // console.log("start end"); - const session = await faceDetectionSession(); const t = Date.now(); const feeds = { @@ -104,28 +90,6 @@ export const detectFaces = async (input: Float32Array) => { }; export const faceEmbedding = async (input: Float32Array) => { - // console.log("start ort mobilefacenet"); - // this.onnxInferenceSession = await ort.InferenceSession.create( - // "/models/mobilefacenet/mobilefacenet_opset15.onnx", - // ); - // const faceBatchSize = 1; - // const data = new Float32Array( - // faceBatchSize * 3 * this.faceSize * this.faceSize, - // ); - // const inputTensor = new ort.Tensor("float32", data, [ - // faceBatchSize, - // this.faceSize, - // this.faceSize, - // 3, - // ]); - // // TODO(MR): onnx-yolo - // // const feeds: Record = {}; - // const feeds: Record = {}; - // const name = this.onnxInferenceSession.inputNames[0]; - // feeds[name] = inputTensor; - // await this.onnxInferenceSession.run(feeds); - // console.log("start end mobilefacenet"); - // Dimension of each face (alias) const mobileFaceNetFaceSize = 112; // Smaller alias @@ -139,6 +103,6 @@ export const faceEmbedding = async (input: Float32Array) => { const feeds = { img_inputs: inputTensor }; const results = await session.run(feeds); log.debug(() => `onnx/yolo face embedding took 
${Date.now() - t} ms`); - // TODO: What's with this type? + // TODO: What's with this type? It works in practice, but double check. return (results.embeddings as unknown as any)["cpuData"]; // as Float32Array; }; diff --git a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts index 3e7d282fb1..4fa840749d 100644 --- a/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts +++ b/web/apps/photos/src/services/machineLearning/yoloFaceDetectionService.ts @@ -1,3 +1,4 @@ +import { workerBridge } from "@/next/worker/worker-bridge"; import { euclidean } from "hdbscan"; import { Matrix, @@ -20,7 +21,6 @@ import { } from "utils/image"; import { newBox } from "utils/machineLearning"; import { Box, Point } from "../../../thirdparty/face-api/classes"; -import { workerBridge } from "@/next/worker/worker-bridge"; class YoloFaceDetectionService implements FaceDetectionService { public method: Versioned;
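
Taken together, the patches above settle on a simple contract between the web worker and the Electron main process: the worker hands over n aligned faces as one flat Float32Array of n × 112 × 112 × 3 values, and receives back n × 192 embedding floats. The sketch below illustrates that contract from the caller's side. It is an assumption-laden summary rather than code from the repository: `embedFaces` and the `bridge` parameter are hypothetical stand-ins for the real workerBridge wiring shown in the diffs; only the 112/192 sizes, the face-count calculation, and the per-face slicing come from the patches themselves.

// Illustrative sketch only (not part of any patch): the data contract
// between the web worker and the Electron main process after this series.
//
// Input:  n faces, channels-last, 112 x 112 x 3 floats per face.
// Output: n embeddings, 192 floats per face.

const faceSize = 112; // mobileFaceNetFaceSize in the diffs above
const embeddingSize = 192;

// Hypothetical caller-side helper; `bridge` stands in for the workerBridge
// object that forwards faceEmbedding to ensureElectron().faceEmbedding.
const embedFaces = async (
    bridge: { faceEmbedding: (input: Float32Array) => Promise<Float32Array> },
    faceData: Float32Array,
): Promise<Float32Array[]> => {
    // Number of faces packed into the flat input array.
    const faceCount = Math.round(faceData.length / (faceSize * faceSize * 3));
    // Forwarded over the worker bridge to the Electron main process, which
    // runs the ONNX session (ml-face.ts above).
    const outputData = await bridge.faceEmbedding(faceData);
    // Split the flat output into one 192-dimensional embedding per face.
    const embeddings: Float32Array[] = [];
    for (let i = 0; i < faceCount; i++) {
        embeddings.push(
            new Float32Array(
                outputData.slice(i * embeddingSize, (i + 1) * embeddingSize),
            ),
        );
    }
    return embeddings;
};

On the main-process side (ml-face.ts above), the same flat array is wrapped in an ort.Tensor of shape [n, 112, 112, 3] and fed to the session under the img_inputs input name.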