Split

2024-08-12 20:07:22 +05:30
parent fa91430276
commit 520647d77f
2 changed files with 114 additions and 110 deletions
--- a/web/packages/new/photos/services/file-data.ts
+++ b/web/packages/new/photos/services/file-data.ts
@@ -0,0 +1,112 @@
+import { encryptFileEmbedding } from "@/base/crypto/ente";
+import { authenticatedRequestHeaders, ensureOk } from "@/base/http";
+import { apiURL } from "@/base/origins";
+import type { EnteFile } from "@/new/photos/types/file";
+import { z } from "zod";
+
+/**
+ * [Note: Derived embeddings model]
+ *
+ * The API endpoints related to embeddings are parameterized by a "model"
+ * enum. This is a bit of a misnomer, since the contents of the payload are not
+ * just the raw embeddings themselves, but also additional data generated by the
+ * ML model.
+ *
+ * For example, the face indexing process generates multiple face embeddings per
+ * file, each with an associated detection box. So instead of storing just a
+ * singular embedding, the data is an entire face index structure containing
+ * multiple embeddings and associated data.
+ *
+ * Further down, it was realized that the fan out caused on remote when trying
+ * to fetch both CLIP and face embeddings was problematic, and also that their
+ * raw JSON was unnecessarily big. To deal with these better, we now have a
+ * single "derived" model type, whose data is a gzipped map of the form:
+ *
+ *     {
+ *       "face": ... the face indexing result ...
+ *       "clip": ... the CLIP indexing result ...
+ *       ... more in the future ...
+ *     }
+ */
+type FileDataType = "mldata";
+
+/** Zod schema for an item in the "data" array of remote's fetch response. */
+const RemoteFileData = z.object({
+    /** The ID of the file to which this data belongs. */
+    fileID: z.number(),
+    /** Base64 representation of the encrypted (type specific) file data. */
+    encryptedData: z.string(),
+    /**
+     * Base64 representation of the header that should be passed when decrypting
+     * {@link encryptedData}. See the {@link decryptMetadata} function in the
+     * crypto layer.
+     */
+    decryptionHeader: z.string(),
+});
+
+/** The TypeScript type inferred from the {@link RemoteFileData} schema. */
+type RemoteFileData = z.infer<typeof RemoteFileData>;
+
+/**
+ * Fetch the {@link model} file data for the given list of files.
+ *
+ * @param model The {@link FileDataType} we want (sent as the request's "type").
+ *
+ * @param fileIDs The ids of the files for which we want the data.
+ *
+ * @returns a list of {@link RemoteFileData} for the files which had such data
+ * (and that remote was able to successfully retrieve). The order of this list
+ * is arbitrary, and the caller should use the {@link fileID} present within the
+ * {@link RemoteFileData} to associate an item in the result back to a file
+ * instead of relying on the order or count of items in the result.
+ */
+export const fetchFileData = async (
+    model: FileDataType,
+    fileIDs: number[],
+): Promise<RemoteFileData[]> => {
+    const res = await fetch(await apiURL("/files/data/fetch"), {
+        method: "POST",
+        headers: await authenticatedRequestHeaders(),
+        body: JSON.stringify({
+            type: model,
+            fileIDs,
+        }),
+    });
+    ensureOk(res);
+    return z.object({ data: z.array(RemoteFileData) }).parse(await res.json())
+        .data;
+};
+
+/**
+ * Upload file data to remote.
+ *
+ * This function will save or update the given data as the latest data
+ * associated with the given {@link enteFile} for {@link type}.
+ *
+ * @param enteFile {@link EnteFile} to which this data relates.
+ *
+ * @param type The {@link FileDataType} which we are uploading.
+ *
+ * @param embedding The binary data to upload. It is encrypted using the file's
+ * key before being sent. Its exact contents are {@link type} specific.
+ */
+export const putFileData = async (
+    enteFile: EnteFile,
+    type: FileDataType,
+    embedding: Uint8Array,
+) => {
+    const { encryptedDataB64, decryptionHeaderB64 } =
+        await encryptFileEmbedding({ data: embedding, keyB64: enteFile.key });
+
+    const res = await fetch(await apiURL("/files/data"), {
+        method: "PUT",
+        headers: await authenticatedRequestHeaders(),
+        body: JSON.stringify({
+            fileID: enteFile.id,
+            encryptedData: encryptedDataB64,
+            decryptionHeader: decryptionHeaderB64,
+            type,
+        }),
+    });
+    ensureOk(res);
+};
--- a/web/packages/new/photos/services/ml/embedding.ts
+++ b/web/packages/new/photos/services/ml/embedding.ts
@@ -1,57 +1,13 @@
-import { decryptFileEmbedding, encryptFileEmbedding } from "@/base/crypto/ente";
-import { authenticatedRequestHeaders, ensureOk } from "@/base/http";
+import { decryptFileEmbedding } from "@/base/crypto/ente";
 import log from "@/base/log";
-import { apiURL } from "@/base/origins";
 import type { EnteFile } from "@/new/photos/types/file";
 import { nullToUndefined } from "@/utils/transform";
 import { z } from "zod";
+import { fetchFileData, putFileData } from "../file-data";
 import { gunzip, gzip } from "../gzip";
 import { type RemoteCLIPIndex } from "./clip";
 import { type RemoteFaceIndex } from "./face";

-/**
- * [Note: Derived embeddings model]
- *
- * The API endpoints related to embeddings and are parameterized by a "model"
- * enum. This is a bit of misnomer, since the contents of the payload are not
- * just the raw embeddings themselves, but also additional data generated by the
- * ML model.
- *
- * For example, the face indexing process generates multiple face embeddings per
- * file, each with an associated detection box. So instead of storing just a
- * singular embedding, the data is an entire face index structure containing
- * multiple embeddings and associated data.
- *
- * Further down, it was realized that the fan out caused on remote when trying
- * to fetch both CLIP and face embeddings was problematic, and also that their
- * raw JSON was unnecessarily big. To deal with these better, we now have a
- * single "derived" model type, whose data is a gzipped map of the form:
- *
- *     {
- *       "face": ... the face indexing result ...
- *       "clip": ... the CLIP indexing result ...
- *       ... more in the future ...
- *     }
- */
-type FileDataType = "mldata";
-
-const RemoteFileData = z.object({
-    /** The ID of the file whose embedding this is. */
-    fileID: z.number(),
-    /**
-     * Base64 representation of the encrypted (model specific) embedding JSON.
-     */
-    encryptedData: z.string(),
-    /**
-     * Base64 representation of the header that should be passed when decrypting
-     * {@link encryptedData}. See the {@link decryptMetadata} function in the
-     * crypto layer.
-     */
-    decryptionHeader: z.string(),
-});
-
-type RemoteFileData = z.infer<typeof RemoteFileData>;
-
 export type RawRemoteMLData = Record<string, unknown>;

 export type ParsedRemoteMLData = Partial<{
@@ -224,36 +180,6 @@ const remoteMLDataFromJSONString = (jsonString: string) => {
    return { raw, parsed };
 };

-/**
- * Fetch {@link model} embeddings for the given list of files.
- *
- * @param model The {@link FileDataType} which we want.
- *
- * @param fileIDs The ids of the files for which we want the embeddings.
- *
- * @returns a list of {@link RemoteFileData} for the files which had embeddings
- * (and that remote was able to successfully retrieve). The order of this list
- * is arbitrary, and the caller should use the {@link fileID} present within the
- * {@link RemoteFileData} to associate an item in the result back to a file
- * instead of relying on the order or count of items in the result.
- */
-const fetchFileData = async (
-    model: FileDataType,
-    fileIDs: number[],
-): Promise<RemoteFileData[]> => {
-    const res = await fetch(await apiURL("/files/data/fetch"), {
-        method: "POST",
-        headers: await authenticatedRequestHeaders(),
-        body: JSON.stringify({
-            type: "mldata",
-            fileIDs,
-        }),
-    });
-    ensureOk(res);
-    return z.object({ data: z.array(RemoteFileData) }).parse(await res.json())
-        .data;
-};
-
 /**
 * Update the derived data stored for given {@link enteFile} on remote.
 *
@@ -270,37 +196,3 @@ export const putMLData = async (
    enteFile: EnteFile,
    derivedData: RawRemoteMLData,
 ) => putFileData(enteFile, "mldata", await gzip(JSON.stringify(derivedData)));
-
-/**
- * Upload an embedding to remote.
- *
- * This function will save or update the given embedding as the latest embedding
- * associated with the given {@link enteFile} for {@link type}.
- *
- * @param enteFile {@link EnteFile} to which this embedding relates to.
- *
- * @param type The {@link FileDataType} which we are uploading.
- *
- * @param embedding The binary data the embedding. The exact contents of the
- * embedding are {@link type} specific.
- */
-const putFileData = async (
-    enteFile: EnteFile,
-    type: FileDataType,
-    embedding: Uint8Array,
-) => {
-    const { encryptedDataB64, decryptionHeaderB64 } =
-        await encryptFileEmbedding({ data: embedding, keyB64: enteFile.key });
-
-    const res = await fetch(await apiURL("/files/data"), {
-        method: "PUT",
-        headers: await authenticatedRequestHeaders(),
-        body: JSON.stringify({
-            fileID: enteFile.id,
-            encryptedData: encryptedDataB64,
-            decryptionHeader: decryptionHeaderB64,
-            type: "mldata",
-        }),
-    });
-    ensureOk(res);
-};