Manav Rathi
2024-08-30 17:05:16 +05:30
parent 598d5aab10
commit 48e00a0ecc
7 changed files with 539 additions and 635 deletions

View File

@@ -0,0 +1,35 @@
import { Hdbscan, type DebugInfo } from "hdbscan";
/**
* Each "cluster" is a list of indexes of the embeddings belonging to that
* particular cluster.
*/
export type EmbeddingCluster = number[];
export interface ClusterHdbscanResult {
clusters: EmbeddingCluster[];
noise: number[];
debugInfo?: DebugInfo;
}
/**
* Cluster the given {@link embeddings} using hdbscan.
*/
export const clusterHdbscan = (
embeddings: number[][],
): ClusterHdbscanResult => {
const hdbscan = new Hdbscan({
input: embeddings,
minClusterSize: 3,
minSamples: 5,
clusterSelectionEpsilon: 0.6,
clusterSelectionMethod: "leaf",
debug: false,
});
return {
clusters: hdbscan.getClusters(),
noise: hdbscan.getNoise(),
debugInfo: hdbscan.getDebugInfo(),
};
};
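// Usage sketch (hedged): `embeddings` stands in for the unit-normalized
// face embeddings produced by the indexing pipeline, and each returned
// cluster is a list of indexes into that array.
//
//     declare const embeddings: number[][];
//     const { clusters, noise } = clusterHdbscan(embeddings);
//     clusters.forEach((c, i) =>
//         console.log(`cluster ${i} has ${c.length} faces`),
//     );
//     console.log(`${noise.length} embeddings were left as noise`);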

View File

@@ -1,603 +0,0 @@
import { newNonSecureID } from "@/base/id-worker";
import log from "@/base/log";
import { ensure } from "@/utils/ensure";
import { clusterFacesHdbscan } from "./cluster";
import { clusterGroups, faceClusters } from "./db";
import type { Face, FaceIndex } from "./face";
import { dotProduct } from "./math";
/**
* A face cluster is a set of faces.
*
* Each cluster has an id so that a {@link CGroup} can refer to it.
*
* The cluster is not directly synced to remote. Only clusters that the user
* interacts with get synced to remote, as part of a {@link CGroup}.
*/
export interface FaceCluster {
/**
* A nanoid for this cluster.
*/
id: string;
/**
* An unordered set of ids of the faces that belong to this cluster.
*
* For ergonomics of transportation and persistence this is an array, but it
* should conceptually be thought of as a set.
*/
faceIDs: string[];
}
/**
* A cgroup ("cluster group") is a group of clusters (possibly containing a
* single cluster) that the user has interacted with.
*
* Interactions include hiding, merging and giving a name and/or a cover photo.
*
* The most frequent interaction is naming a {@link FaceCluster}, which promotes
* it to become a {@link CGroup}. The promotion comes with the ability to be
* synced with remote (as a "cgroup" user entity).
*
* Thereafter, the user may attach more clusters to the same {@link CGroup}.
*
* > A named cluster group can be thought of as a "person", though this is not
* > necessarily an accurate characterization. e.g. there can be a named cluster
* > group that contains face clusters of pets.
*
* The other form of interaction is hiding. The user may hide a single (unnamed)
* cluster, or they may hide a named {@link CGroup}. In both cases, we promote
* the cluster to a CGroup if needed so that their request to hide gets synced.
*
* While in our local representation we separately maintain clusters and link to
* them from within CGroups by their clusterID, in the remote representation
* clusters themselves don't get synced. Instead, the "cgroup" entities synced
* with remote contain the clusters within themselves. So a group that gets
* synced with remote looks something like:
*
* { id, name, clusters: [{ clusterID, faceIDs }] }
*
*/
export interface CGroup {
/**
* A nanoid for this cluster group.
*
* This is the ID of the "cgroup" user entity (the envelope), and it is not
* contained as part of the group entity payload itself.
*/
id: string;
/**
* A name assigned by the user to this cluster group.
*
* The client should handle both empty strings and undefined as indicating a
* cgroup without a name. When the client needs to set this to an "empty"
* value, which happens when hiding an unnamed cluster, it should set it to
* an empty string. That is, expect `"" | undefined`, but set `""`.
*/
name: string | undefined;
/**
* An unordered set of ids of the clusters that belong to this group.
*
* For ergonomics of transportation and persistence this is an array, but it
* should conceptually be thought of as a set.
*/
clusterIDs: string[];
/**
* True if this cluster group should be hidden.
*
* The user can hide both named cluster groups and single unnamed clusters.
* If the user hides a single cluster that was offered as a suggestion to
* them on a client, the client will create a new unnamed cgroup containing
* it, and set its hidden flag to sync it with remote (so that other clients
* can also stop showing this cluster).
*/
isHidden: boolean;
/**
* The ID of the face that should be used as the cover photo for this
* cluster group (if the user has set one).
*
* This is similar to the {@link displayFaceID}, the difference being:
*
* - {@link avatarFaceID} is the face selected by the user.
*
* - {@link displayFaceID} is the automatic placeholder, and only comes
* into effect if the user has not explicitly selected a face.
*/
avatarFaceID: string | undefined;
/**
* Locally determined ID of the "best" face that should be used as the
* display face, to represent this cluster group in the UI.
*
* This property is not synced with remote. For more details, see
* {@link avatarFaceID}.
*/
displayFaceID: string | undefined;
}
export interface ClusteringOpts {
method: "linear" | "hdbscan";
batchSize: number;
joinThreshold: number;
}
export interface ClusterPreview {
clusterSize: number;
faces: ClusterPreviewFace[];
}
export interface ClusterPreviewFace {
face: Face;
cosineSimilarity: number;
wasMerged: boolean;
}
// Debug helper types used by clusterFaces below. They are not imported from
// elsewhere, so they are declared here; their shapes are inferred from usage.
export interface FaceNeighbour {
face: Face;
cosineSimilarity: number;
}
export interface FaceNeighbours {
face: Face;
neighbours: FaceNeighbour[];
}
/**
* Cluster faces into groups.
*
* [Note: Face clustering algorithm]
*
* A cgroup (cluster group) consists of clusters, each of which itself is a set
* of faces.
*
* cgroup << cluster << face
*
* The clusters are generated locally by clients using the following algorithm:
*
* 1. clusters = [] initially, or fetched from remote.
*
* 2. For each face, find its nearest neighbour in the embedding space.
*
* 3. If no such neighbour is found within our threshold, create a new cluster.
*
* 4. Otherwise assign this face to the same cluster as its nearest neighbour.
*
* The user can then tweak the output of the algorithm by performing the
* following actions on the list of clusters that they can see:
*
* - They can provide a name for a cluster ("name a person"). This upgrades a
* cluster into a "cgroup", which is an entity that gets synced via remote
* to the user's other clients.
*
* - They can attach more clusters to a cgroup ("merge clusters")
*
* - They can remove a cluster from a cgroup ("break clusters").
*
* After clustering, we also do some routine cleanup. Faces belonging to files
* that have been deleted (including those in Trash) should be pruned off.
*
* We should not make strict assumptions about the clusters we get from remote.
* In particular, the same face ID can be in different clusters. In such cases
* we should arbitrarily assign it to the last cluster we find it in.
* Such leeway is intentionally provided to allow clients some slack in how they
* implement the sync without needing to make a blocking API request for every
* user interaction.
*/
export const clusterFaces = async (faceIndexes: FaceIndex[]) => {
const t = Date.now();
// A flattened array of faces.
// TODO-Cluster note the 2k slice
const faces = [...enumerateFaces(faceIndexes)].slice(0, 2000);
// Start with the clusters we already have (either from a previous indexing,
// or fetched from remote).
const clusters = await faceClusters();
// For fast reverse lookup - map from cluster ids to their index in the
// clusters array.
const clusterIndexForClusterID = new Map(clusters.map((c, i) => [c.id, i]));
// For fast reverse lookup - map from face ids to the id of the cluster to
// which they belong.
const clusterIDForFaceID = new Map(
clusters.flatMap((c) => c.faceIDs.map((id) => [id, c.id] as const)),
);
// A function to generate new cluster IDs.
const newClusterID = () => newNonSecureID("cluster_");
const faceAndNeighbours: FaceNeighbours[] = [];
// For each face,
for (const [i, fi] of faces.entries()) {
// If the face is already part of a cluster, then skip it.
if (clusterIDForFaceID.get(fi.faceID)) continue;
// Find the nearest neighbour from among all the other faces.
let nn: Face | undefined;
let nnCosineSimilarity = 0;
let neighbours: FaceNeighbour[] = [];
for (let j = 0; j < faces.length; j++) {
// ! This is an O(n^2) loop, be careful when adding more code here.
// TODO-Cluster Commenting this here and moving it downward
// // Skip ourselves.
// if (i == j) continue;
// Can't find a way of avoiding the null assertion here.
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
const fj = faces[j]!;
// The vectors are already normalized, so we can directly use their
// dot product as their cosine similarity.
const csim = dotProduct(fi.embedding, fj.embedding);
// TODO-Cluster Delete me and uncomment the check above
// Skip ourselves.
if (i == j) {
neighbours.push({ face: fj, cosineSimilarity: csim });
continue;
}
const threshold = fi.blur < 100 || fj.blur < 100 ? 0.7 : 0.6;
if (csim > threshold && csim > nnCosineSimilarity) {
nn = fj;
nnCosineSimilarity = csim;
}
neighbours.push({ face: fj, cosineSimilarity: csim });
}
neighbours = neighbours.sort(
(a, b) => b.cosineSimilarity - a.cosineSimilarity,
);
faceAndNeighbours.push({ face: fi, neighbours });
const { faceID } = fi;
if (nn) {
// Found a neighbour near enough.
const nnFaceID = nn.faceID;
// Find the cluster the nearest neighbour belongs to, if any.
const nnClusterID = clusterIDForFaceID.get(nn.faceID);
if (nnClusterID) {
// If the neighbour is already part of a cluster, also add
// ourselves to that cluster.
const nnClusterIndex = ensure(
clusterIndexForClusterID.get(nnClusterID),
);
clusters[nnClusterIndex]?.faceIDs.push(faceID);
clusterIDForFaceID.set(faceID, nnClusterID);
} else {
// Otherwise create a new cluster with us and our nearest
// neighbour.
const cluster = {
id: newClusterID(),
faceIDs: [faceID, nnFaceID],
};
clusterIndexForClusterID.set(cluster.id, clusters.length);
clusterIDForFaceID.set(faceID, cluster.id);
clusterIDForFaceID.set(nnFaceID, cluster.id);
clusters.push(cluster);
}
} else {
// We didn't find a neighbour within the threshold. Create a new
// cluster with only this face.
const cluster = { id: newClusterID(), faceIDs: [faceID] };
clusterIndexForClusterID.set(cluster.id, clusters.length);
clusterIDForFaceID.set(faceID, cluster.id);
clusters.push(cluster);
}
}
// Prune too small clusters.
const validClusters = clusters.filter(({ faceIDs }) => faceIDs.length > 1);
let cgroups = await clusterGroups();
// TODO-Cluster - Currently we're not syncing with remote or saving anything
// locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
// cgroup, one per cluster.
cgroups = cgroups.concat(
validClusters.map((c) => ({
id: c.id,
name: undefined,
clusterIDs: [c.id],
isHidden: false,
avatarFaceID: undefined,
displayFaceID: undefined,
})),
);
// For each cluster group, use the highest scoring face in any of its
// clusters as its display face.
const faceForFaceID = new Map(faces.map((f) => [f.faceID, f]));
for (const cgroup of cgroups) {
cgroup.displayFaceID = cgroup.clusterIDs
.map((clusterID) => clusterIndexForClusterID.get(clusterID))
.filter((i) => i !== undefined) /* 0 is a valid index */
.flatMap((i) => clusters[i]?.faceIDs ?? [])
.map((faceID) => faceForFaceID.get(faceID))
.filter((face) => !!face)
.reduce((max, face) =>
max.score > face.score ? max : face,
).faceID;
}
log.info("ml/cluster", {
faces,
validClusters,
clusterIndexForClusterID: Object.fromEntries(clusterIndexForClusterID),
clusterIDForFaceID: Object.fromEntries(clusterIDForFaceID),
cgroups,
});
log.info(
`Clustered ${faces.length} faces into ${validClusters.length} clusters (${Date.now() - t} ms)`,
);
return { faces, clusters: validClusters, cgroups, faceAndNeighbours };
};
/**
* A generator function that returns a stream of faces, flattening all the
* faces present in the given {@link faceIndices}.
*/
function* enumerateFaces(faceIndices: FaceIndex[]) {
for (const fi of faceIndices) {
for (const f of fi.faces) {
yield f;
}
}
}
export const clusterFacesHdb = (
faceIndexes: FaceIndex[],
opts: ClusteringOpts,
) => {
// (opts is currently unused here; the batch size is hardcoded below.)
const t = Date.now();
// A flattened array of faces.
// TODO-Cluster ad-hoc filtering and slicing
const faces0 = [...enumerateFaces(faceIndexes)].filter((f) => f.blur > 99);
// .slice(0, 6000);
// TODO-Cluster testing code, can be removed once done
const faces = Array(1)
.fill(0)
.flatMap(() => faces0);
// For fast reverse lookup - map from face ids to the face.
const faceForFaceID = new Map(faces.map((f) => [f.faceID, f]));
const faceEmbeddings = faces.map(({ embedding }) => embedding);
// For fast reverse lookup - map from cluster ids to their index in the
// clusters array.
const clusterIndexForClusterID = new Map<string, number>();
// For fast reverse lookup - map from the id of a face to the id of the
// cluster to which it belongs.
const clusterIDForFaceID = new Map<string, string>();
// A function to chain two reverse lookups.
const firstFaceOfCluster = (cluster: FaceCluster) =>
ensure(faceForFaceID.get(ensure(cluster.faceIDs[0])));
// A function to generate new cluster IDs.
const newClusterID = () => newNonSecureID("cluster_");
// The resultant clusters.
// TODO-Cluster Later on, instead of starting from a blank slate, this will
// be the list of existing clusters we fetch from remote.
const clusters: FaceCluster[] = [];
// Process the faces in batches. The faces are already sorted by file ID,
// which is a monotonically increasing integer, so we will also have some
// temporal locality.
//
// The number 2500 was derived by ad-hoc observations and takes a few
// seconds. On a particular test dataset and a particular machine,
// clustering 1k took ~2 seconds, 10k took ~2 mins, while 20k took ~8 mins.
// Memory usage was constant in all these cases.
//
// At around 100k faces, the clustering starts taking hours, and we start
// running into stack overflows. The stack overflows can perhaps be avoided
// by restructuring the code, but hours of uninterruptible work is in any
// case not feasible.
const batchSize = 2500;
for (let i = 0; i < faceEmbeddings.length; i += batchSize) {
const it = Date.now();
const embeddings = faceEmbeddings.slice(i, i + batchSize);
const { clusters: hdbClusters } = clusterFacesHdbscan(embeddings);
log.info(
`hdbscan produced ${hdbClusters.length} clusters from ${embeddings.length} faces (${Date.now() - it} ms)`,
);
// Merge the new clusters we got from hdbscan into the existing clusters
// if they are "near" them (using some heuristic).
//
// We need to ensure we don't change any of the existing cluster IDs,
// since these might be existing clusters we got from remote.
for (const hdbCluster of hdbClusters) {
// Find the existing cluster whose (arbitrarily chosen) first face
// is the nearest neighbour of the (arbitrarily chosen) first face
// of the cluster produced by hdbscan.
const newFace = ensure(faces[i + ensure(hdbCluster[0])]);
let nnCluster: FaceCluster | undefined;
let nnCosineSimilarity = 0;
for (const existingCluster of clusters) {
const existingFace = firstFaceOfCluster(existingCluster);
// The vectors are already normalized, so we can directly use their
// dot product as their cosine similarity.
const csim = dotProduct(
existingFace.embedding,
newFace.embedding,
);
// Use a higher cosine similarity threshold if either of the two
// faces are blurry.
const threshold =
existingFace.blur < 200 || newFace.blur < 200 ? 0.9 : 0.7;
if (csim > threshold && csim > nnCosineSimilarity) {
nnCluster = existingCluster;
nnCosineSimilarity = csim;
}
}
if (nnCluster) {
// If we found an existing cluster that is near enough,
// subsume the cluster produced by hdbscan into that cluster.
for (const j of hdbCluster) {
const { faceID } = ensure(faces[i + j]);
nnCluster.faceIDs.push(faceID);
clusterIDForFaceID.set(faceID, nnCluster.id);
}
} else {
// Otherwise make a new cluster from the cluster produced by
// hdbscan.
const clusterID = newClusterID();
const faceIDs: string[] = [];
for (const j of hdbCluster) {
const { faceID } = ensure(faces[i + j]);
faceIDs.push(faceID);
clusterIDForFaceID.set(faceID, clusterID);
}
clusterIndexForClusterID.set(clusterID, clusters.length);
clusters.push({ id: clusterID, faceIDs });
}
}
}
// Convert into the data structure we're using to debug/visualize.
// const faceAndNeigbours: FaceNeighbours[] = [];
// const topFaces = faces.sort((a, b) => b.score - a.score).slice(0, 30);
// for (const fi of topFaces) {
// let neighbours: FaceNeighbour[] = [];
// for (const fj of faces) {
// // The vectors are already normalized, so we can directly use their
// // dot product as their cosine similarity.
// const csim = dotProduct(fi.embedding, fj.embedding);
// neighbours.push({ face: fj, cosineSimilarity: csim });
// }
// neighbours = neighbours
// .sort((a, b) => b.cosineSimilarity - a.cosineSimilarity)
// .slice(0, 30);
// faceAndNeigbours.push({ face: fi, neighbours });
// }
// Convert into the data structure we're using to debug/visualize.
//
// > Showing only top 30 and bottom 30 clusters (and only up to 50 faces in
// > each, sorted by cosine distance to highest scoring face in the
// > cluster).
const sortedClusters = clusters.sort(
(a, b) => b.faceIDs.length - a.faceIDs.length,
);
const debugClusters =
sortedClusters.length < 60
? sortedClusters
: sortedClusters.slice(0, 30).concat(sortedClusters.slice(-30));
const clusterPreviews: ClusterPreview[] = [];
for (const cluster of debugClusters) {
const faces = cluster.faceIDs.map((id) =>
ensure(faceForFaceID.get(id)),
);
const topFace = faces.reduce((max, face) =>
max.score > face.score ? max : face,
);
const previewFaces: ClusterPreviewFace[] = [];
for (const face of faces) {
const csim = dotProduct(topFace.embedding, face.embedding);
// wasMerged isn't tracked in this pass yet, so default it to false.
previewFaces.push({ face, cosineSimilarity: csim, wasMerged: false });
}
clusterPreviews.push({
clusterSize: cluster.faceIDs.length,
faces: previewFaces
.sort((a, b) => b.cosineSimilarity - a.cosineSimilarity)
.slice(0, 50),
});
}
// Prune too small clusters.
// TODO-Cluster this is likely not needed since hdbscan already has a min?
const validClusters = clusters.filter(({ faceIDs }) => faceIDs.length > 1);
// let cgroups = await clusterGroups();
// // TODO-Cluster - Currently we're not syncing with remote or saving anything
// // locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
// // cgroup, one per cluster.
// cgroups = cgroups.concat(
// validClusters.map((c) => ({
// id: c.id,
// name: undefined,
// clusterIDs: [c.id],
// isHidden: false,
// avatarFaceID: undefined,
// displayFaceID: undefined,
// })),
// );
// // For each cluster group, use the highest scoring face in any of its
// // clusters as its display face.
// for (const cgroup of cgroups) {
// cgroup.displayFaceID = cgroup.clusterIDs
// .map((clusterID) => clusterIndexForClusterID.get(clusterID))
// .filter((i) => i !== undefined) /* 0 is a valid index */
// .flatMap((i) => clusters[i]?.faceIDs ?? [])
// .map((faceID) => faceForFaceID.get(faceID))
// .filter((face) => !!face)
// .reduce((max, face) =>
// max.score > face.score ? max : face,
// ).faceID;
// }
// TODO-Cluster - Currently we're not syncing with remote or saving anything
// locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
// cgroup, one per cluster.
const cgroups: CGroup[] = [];
for (const cluster of sortedClusters) {
const faces = cluster.faceIDs.map((id) =>
ensure(faceForFaceID.get(id)),
);
const topFace = faces.reduce((max, face) =>
max.score > face.score ? max : face,
);
cgroups.push({
id: cluster.id,
name: undefined,
clusterIDs: [cluster.id],
isHidden: false,
avatarFaceID: undefined,
displayFaceID: topFace.faceID,
});
}
// log.info("ml/cluster", {
// faces,
// validClusters,
// clusterIndexForClusterID: Object.fromEntries(clusterIndexForClusterID),
// clusterIDForFaceID: Object.fromEntries(clusterIDForFaceID),
// cgroups,
// });
log.info(
`Clustered ${faces.length} faces into ${validClusters.length} clusters, with ${faces.length - clusterIDForFaceID.size} faces remaining unclustered (${Date.now() - t} ms)`,
);
const clusteredCount = clusterIDForFaceID.size;
const unclusteredCount = faces.length - clusteredCount;
return {
// faces,
clusteredCount,
unclusteredCount,
clusters: validClusters,
cgroups,
clusterPreviews,
clusterIDForFaceID,
};
};

View File

@@ -1,35 +1,507 @@
import { Hdbscan, type DebugInfo } from "hdbscan";
import { newNonSecureID } from "@/base/id-worker";
import log from "@/base/log";
import { ensure } from "@/utils/ensure";
import { type EmbeddingCluster, clusterHdbscan } from "./cluster-hdb";
import type { Face, FaceIndex } from "./face";
import { dotProduct } from "./math";
export type Cluster = number[];
export interface ClusterFacesResult {
clusters: Cluster[];
noise: Cluster;
debugInfo?: DebugInfo;
/**
* A face cluster is a set of faces.
*
* Each cluster has an id so that a {@link CGroup} can refer to it.
*
* The cluster is not directly synced to remote. Only clusters that the user
* interacts with get synced to remote, as part of a {@link CGroup}.
*/
export interface FaceCluster {
/**
* A nanoid for this cluster.
*/
id: string;
/**
* An unordered set of ids of the faces that belong to this cluster.
*
* For ergonomics of transportation and persistence this is an array, but it
* should conceptually be thought of as a set.
*/
faceIDs: string[];
}
/**
* Cluster the given {@link faceEmbeddings}.
* A cgroup ("cluster group") is a group of clusters (possibly containing a
* single cluster) that the user has interacted with.
*
* Interactions include hiding, merging and giving a name and/or a cover photo.
*
* The most frequent interaction is naming a {@link FaceCluster}, which promotes
* it to become a {@link CGroup}. The promotion comes with the ability to be
* synced with remote (as a "cgroup" user entity).
*
* Thereafter, the user may attach more clusters to the same {@link CGroup}.
*
* > A named cluster group can be thought of as a "person", though this is not
* > necessarily an accurate characterization. e.g. there can be a named cluster
* > group that contains face clusters of pets.
*
* The other form of interaction is hiding. The user may hide a single (unnamed)
* cluster, or they may hide a named {@link CGroup}. In both cases, we promote
* the cluster to a CGroup if needed so that their request to hide gets synced.
*
* While in our local representation we separately maintain clusters and link to
* them from within CGroups by their clusterID, in the remote representation
* clusters themselves don't get synced. Instead, the "cgroup" entities synced
* with remote contain the clusters within themselves. So a group that gets
* synced with remote looks something like:
*
* { id, name, clusters: [{ clusterID, faceIDs }] }
*
* @param faceEmbeddings An array of embeddings produced by our face indexing
* pipeline. Each embedding is for a face detected in an image (a single image
* may have multiple faces detected within it).
*/
export const clusterFacesHdbscan = (
faceEmbeddings: number[][],
): ClusterFacesResult => {
const hdbscan = new Hdbscan({
input: faceEmbeddings,
minClusterSize: 3,
minSamples: 5,
clusterSelectionEpsilon: 0.6,
clusterSelectionMethod: "leaf",
debug: false,
});
export interface CGroup {
/**
* A nanoid for this cluster group.
*
* This is the ID of the "cgroup" user entity (the envelope), and it is not
* contained as part of the group entity payload itself.
*/
id: string;
/**
* A name assigned by the user to this cluster group.
*
* The client should handle both empty strings and undefined as indicating a
* cgroup without a name. When the client needs to set this to an "empty"
* value, which happens when hiding an unnamed cluster, it should set it to
* an empty string. That is, expect `"" | undefined`, but set `""`.
*/
name: string | undefined;
/**
* An unordered set of ids of the clusters that belong to this group.
*
* For ergonomics of transportation and persistence this is an array, but it
* should conceptually be thought of as a set.
*/
clusterIDs: string[];
/**
* True if this cluster group should be hidden.
*
* The user can hide both named cluster groups and single unnamed clusters.
* If the user hides a single cluster that was offered as a suggestion to
* them on a client, the client will create a new unnamed cgroup containing
* it, and set its hidden flag to sync it with remote (so that other clients
* can also stop showing this cluster).
*/
isHidden: boolean;
/**
* The ID of the face that should be used as the cover photo for this
* cluster group (if the user has set one).
*
* This is similar to the {@link displayFaceID}, the difference being:
*
* - {@link avatarFaceID} is the face selected by the user.
*
* - {@link displayFaceID} is the automatic placeholder, and only comes
* into effect if the user has not explicitly selected a face.
*/
avatarFaceID: string | undefined;
/**
* Locally determined ID of the "best" face that should be used as the
* display face, to represent this cluster group in the UI.
*
* This property is not synced with remote. For more details, see
* {@link avatarFaceID}.
*/
displayFaceID: string | undefined;
}
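// A hedged sketch of the remote "cgroup" payload implied by the doc comment
// above ({ id, name, clusters: [{ clusterID, faceIDs }] }). These exact
// field names are illustrative assumptions, not a confirmed wire format;
// note that the envelope `id` is not part of the payload itself.
//
//     interface RemoteFaceCluster {
//         clusterID: string;
//         faceIDs: string[];
//     }
//
//     interface RemoteCGroupPayload {
//         name?: string;
//         clusters: RemoteFaceCluster[];
//         isHidden?: boolean;
//         avatarFaceID?: string;
//     }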
export interface ClusteringOpts {
method: "linear" | "hdbscan";
batchSize: number;
joinThreshold: number;
}
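// Example (hedged): an illustrative set of options. The values mirror the
// ad-hoc defaults seen elsewhere in this file and are not prescriptive.
//
//     const opts: ClusteringOpts = {
//         method: "hdbscan",
//         batchSize: 2500,
//         joinThreshold: 0.7,
//     };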
export interface ClusterPreview {
clusterSize: number;
faces: ClusterPreviewFace[];
}
export interface ClusterPreviewFace {
face: Face;
cosineSimilarity: number;
wasMerged: boolean;
}
/**
* Cluster faces into groups.
*
* [Note: Face clustering algorithm]
*
* A cgroup (cluster group) consists of clusters, each of which itself is a set
* of faces.
*
* cgroup << cluster << face
*
* The clusters are generated locally by clients using the following algorithm:
*
* 1. clusters = [] initially, or fetched from remote.
*
* 2. For each face, find its nearest neighbour in the embedding space.
*
* 3. If no such neighbour is found within our threshold, create a new cluster.
*
* 4. Otherwise assign this face to the same cluster as its nearest neighbour.
*
* The user can then tweak the output of the algorithm by performing the
* following actions on the list of clusters that they can see:
*
* - They can provide a name for a cluster ("name a person"). This upgrades a
* cluster into a "cgroup", which is an entity that gets synced via remote
* to the user's other clients.
*
* - They can attach more clusters to a cgroup ("merge clusters")
*
* - They can remove a cluster from a cgroup ("break clusters").
*
* After clustering, we also do some routine cleanup. Faces belonging to files
* that have been deleted (including those in Trash) should be pruned off.
*
* We should not make strict assumptions about the clusters we get from remote.
* In particular, the same face ID can be in different clusters. In such cases
* we should arbitrarily assign it to the last cluster we find it in.
* Such leeway is intentionally provided to allow clients some slack in how they
* implement the sync without needing to make a blocking API request for every
* user interaction.
*/
export const clusterFaces = (
faceIndexes: FaceIndex[],
opts: ClusteringOpts,
) => {
const { batchSize, joinThreshold } = opts;
const t = Date.now();
// A flattened array of faces.
// TODO-Cluster ad-hoc filtering and slicing
const faces0 = [...enumerateFaces(faceIndexes)].filter((f) => f.blur > 99);
// .slice(0, 6000);
// TODO-Cluster testing code, can be removed once done
const faces = Array(1)
.fill(0)
.flatMap(() => faces0);
// For fast reverse lookup - map from face ids to the face.
const faceForFaceID = new Map(faces.map((f) => [f.faceID, f]));
const faceEmbeddings = faces.map(({ embedding }) => embedding);
// For fast reverse lookup - map from cluster ids to their index in the
// clusters array.
const clusterIndexForClusterID = new Map<string, number>();
// For fast reverse lookup - map from the id of a face to the id of the
// cluster to which it belongs.
const clusterIDForFaceID = new Map<string, string>();
// A function to chain two reverse lookups.
const firstFaceOfCluster = (cluster: FaceCluster) =>
ensure(faceForFaceID.get(ensure(cluster.faceIDs[0])));
// A function to generate new cluster IDs.
const newClusterID = () => newNonSecureID("cluster_");
// The resultant clusters.
// TODO-Cluster Later on, instead of starting from a blank slate, this will
// be the list of existing clusters we fetch from remote.
const clusters: FaceCluster[] = [];
// Process the faces in batches (using the batchSize from the provided
// options). The faces are already sorted by file ID, which is a
// monotonically increasing integer, so we will also have some temporal
// locality.
//
// The suggested default of 2500 was derived by ad-hoc observations and
// takes a few seconds. On a particular test dataset and a particular
// machine, clustering 1k faces took ~2 seconds, 10k took ~2 mins, while 20k
// took ~8 mins. Memory usage was constant in all these cases.
//
// At around 100k faces, the clustering starts taking hours, and we start
// running into stack overflows. The stack overflows could perhaps be
// avoided by restructuring the code, but hours of uninterruptible work is
// in any case not feasible.
for (let i = 0; i < faceEmbeddings.length; i += batchSize) {
const it = Date.now();
const embeddings = faceEmbeddings.slice(i, i + batchSize);
const { clusters: hdbClusters } = clusterHdbscan(embeddings);
log.info(
`hdbscan produced ${hdbClusters.length} clusters from ${embeddings.length} faces (${Date.now() - it} ms)`,
);
// Merge the new clusters we got from hdbscan into the existing clusters
// if they are "near" them (using some heuristic).
//
// We need to ensure we don't change any of the existing cluster IDs,
// since these might be existing clusters we got from remote.
for (const hdbCluster of hdbClusters) {
// Find the existing cluster whose (arbitrarily chosen) first face
// is the nearest neighbour of the (arbitrarily chosen) first face
// of the cluster produced by hdbscan.
const newFace = ensure(faces[i + ensure(hdbCluster[0])]);
let nnCluster: FaceCluster | undefined;
let nnCosineSimilarity = 0;
for (const existingCluster of clusters) {
const existingFace = firstFaceOfCluster(existingCluster);
// The vectors are already normalized, so we can directly use their
// dot product as their cosine similarity.
const csim = dotProduct(
existingFace.embedding,
newFace.embedding,
);
// Use a higher cosine similarity threshold if either of the two
// faces are blurry.
const threshold =
existingFace.blur < 200 || newFace.blur < 200 ? 0.9 : 0.7;
if (csim > threshold && csim > nnCosineSimilarity) {
nnCluster = existingCluster;
nnCosineSimilarity = csim;
}
}
if (nnCluster) {
// If we found an existing cluster that is near enough,
// subsume the cluster produced by hdbscan into that cluster.
for (const j of hdbCluster) {
const { faceID } = ensure(faces[i + j]);
nnCluster.faceIDs.push(faceID);
clusterIDForFaceID.set(faceID, nnCluster.id);
}
} else {
// Otherwise make a new cluster from the cluster produced by
// hdbscan.
const clusterID = newClusterID();
const faceIDs: string[] = [];
for (const j of hdbCluster) {
const { faceID } = ensure(faces[i + j]);
faceIDs.push(faceID);
clusterIDForFaceID.set(faceID, clusterID);
}
clusterIndexForClusterID.set(clusterID, clusters.length);
clusters.push({ id: clusterID, faceIDs });
}
}
}
// Convert into the data structure we're using to debug/visualize.
// const faceAndNeigbours: FaceNeighbours[] = [];
// const topFaces = faces.sort((a, b) => b.score - a.score).slice(0, 30);
// for (const fi of topFaces) {
// let neighbours: FaceNeighbour[] = [];
// for (const fj of faces) {
// // The vectors are already normalized, so we can directly use their
// // dot product as their cosine similarity.
// const csim = dotProduct(fi.embedding, fj.embedding);
// neighbours.push({ face: fj, cosineSimilarity: csim });
// }
// neighbours = neighbours
// .sort((a, b) => b.cosineSimilarity - a.cosineSimilarity)
// .slice(0, 30);
// faceAndNeigbours.push({ face: fi, neighbours });
// }
// Convert into the data structure we're using to debug/visualize.
//
// > Showing only top 30 and bottom 30 clusters (and only up to 50 faces in
// > each, sorted by cosine distance to highest scoring face in the
// > cluster).
const sortedClusters = clusters.sort(
(a, b) => b.faceIDs.length - a.faceIDs.length,
);
const debugClusters =
sortedClusters.length < 60
? sortedClusters
: sortedClusters.slice(0, 30).concat(sortedClusters.slice(-30));
const clusterPreviews: ClusterPreview[] = [];
for (const cluster of debugClusters) {
const faces = cluster.faceIDs.map((id) =>
ensure(faceForFaceID.get(id)),
);
const topFace = faces.reduce((max, face) =>
max.score > face.score ? max : face,
);
const previewFaces: ClusterPreviewFace[] = [];
for (const face of faces) {
const csim = dotProduct(topFace.embedding, face.embedding);
// wasMerged isn't tracked in this pass yet, so default it to false.
previewFaces.push({ face, cosineSimilarity: csim, wasMerged: false });
}
clusterPreviews.push({
clusterSize: cluster.faceIDs.length,
faces: previewFaces
.sort((a, b) => b.cosineSimilarity - a.cosineSimilarity)
.slice(0, 50),
});
}
// Prune too small clusters.
// TODO-Cluster this is likely not needed since hdbscan already has a min?
const validClusters = clusters.filter(({ faceIDs }) => faceIDs.length > 1);
// let cgroups = await clusterGroups();
// // TODO-Cluster - Currently we're not syncing with remote or saving anything
// // locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
// // cgroup, one per cluster.
// cgroups = cgroups.concat(
// validClusters.map((c) => ({
// id: c.id,
// name: undefined,
// clusterIDs: [c.id],
// isHidden: false,
// avatarFaceID: undefined,
// displayFaceID: undefined,
// })),
// );
// // For each cluster group, use the highest scoring face in any of its
// // clusters as its display face.
// for (const cgroup of cgroups) {
// cgroup.displayFaceID = cgroup.clusterIDs
// .map((clusterID) => clusterIndexForClusterID.get(clusterID))
// .filter((i) => i !== undefined) /* 0 is a valid index */
// .flatMap((i) => clusters[i]?.faceIDs ?? [])
// .map((faceID) => faceForFaceID.get(faceID))
// .filter((face) => !!face)
// .reduce((max, face) =>
// max.score > face.score ? max : face,
// ).faceID;
// }
// TODO-Cluster - Currently we're not syncing with remote or saving anything
// locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
// cgroup, one per cluster.
const cgroups: CGroup[] = [];
for (const cluster of sortedClusters) {
const faces = cluster.faceIDs.map((id) =>
ensure(faceForFaceID.get(id)),
);
const topFace = faces.reduce((max, face) =>
max.score > face.score ? max : face,
);
cgroups.push({
id: cluster.id,
name: undefined,
clusterIDs: [cluster.id],
isHidden: false,
avatarFaceID: undefined,
displayFaceID: topFace.faceID,
});
}
// log.info("ml/cluster", {
// faces,
// validClusters,
// clusterIndexForClusterID: Object.fromEntries(clusterIndexForClusterID),
// clusterIDForFaceID: Object.fromEntries(clusterIDForFaceID),
// cgroups,
// });
log.info(
`Clustered ${faces.length} faces into ${validClusters.length} clusters, with ${faces.length - clusterIDForFaceID.size} faces remaining unclustered (${Date.now() - t} ms)`,
);
const clusteredCount = clusterIDForFaceID.size;
const unclusteredCount = faces.length - clusteredCount;
return {
clusters: hdbscan.getClusters(),
noise: hdbscan.getNoise(),
debugInfo: hdbscan.getDebugInfo(),
// faces,
clusteredCount,
unclusteredCount,
clusters: validClusters,
cgroups,
clusterPreviews,
clusterIDForFaceID,
};
};
/**
* A generator function that returns a stream of faces, flattening all the
* faces present in the given {@link faceIndices}.
*/
function* enumerateFaces(faceIndices: FaceIndex[]) {
for (const fi of faceIndices) {
for (const f of fi.faces) {
yield f;
}
}
}
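// Example (hedged): flattening the faces nested inside the locally stored
// face indexes, as done at the top of clusterFaces above.
//
//     declare const faceIndexes: FaceIndex[];
//     const allFaces = [...enumerateFaces(faceIndexes)];
//     // allFaces is a flat array of faces across all the indexes.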
interface ClusterLinearResult {
clusters: EmbeddingCluster[];
}
const clusterLinear = (
embeddings: number[][],
threshold: number,
): ClusterLinearResult => {
const clusters: EmbeddingCluster[] = [];
const clusterIndexForEmbeddingIndex = new Map<number, number>();
// For each embedding
for (const [i, ei] of embeddings.entries()) {
// If the embedding is already part of a cluster, then skip it.
if (clusterIndexForEmbeddingIndex.has(i)) continue;
// Find the nearest neighbour from among all the other embeddings.
let nnIndex: number | undefined;
let nnCosineSimilarity = 0;
for (const [j, ej] of embeddings.entries()) {
// ! This is an O(n^2) loop, be careful when adding more code here.
// Skip ourselves.
if (i == j) continue;
// The vectors are already normalized, so we can directly use their
// dot product as their cosine similarity.
const csim = dotProduct(ei, ej);
if (csim > threshold && csim > nnCosineSimilarity) {
nnIndex = j;
nnCosineSimilarity = csim;
}
}
// (Compare against undefined explicitly since 0 is a valid index.)
if (nnIndex !== undefined) {
// Find the cluster the nearest neighbour belongs to, if any.
const nnClusterIndex = clusterIndexForEmbeddingIndex.get(nnIndex);
if (nnClusterIndex !== undefined) {
// If the neighbour is already part of a cluster, also add
// ourselves to that cluster.
ensure(clusters[nnClusterIndex]).push(i);
clusterIndexForEmbeddingIndex.set(i, nnClusterIndex);
} else {
// Otherwise create a new cluster with us and our nearest
// neighbour.
clusterIndexForEmbeddingIndex.set(i, clusters.length);
clusterIndexForEmbeddingIndex.set(nnIndex, clusters.length);
clusters.push([i, nnIndex]);
}
} else {
// We didn't find a neighbour within the threshold. Create a new
// cluster with only this embedding.
clusterIndexForEmbeddingIndex.set(i, clusters.length);
clusters.push([i]);
}
}
// Prune singleton clusters.
const validClusters = clusters.filter((cs) => cs.length > 1);
return { clusters: validClusters };
};
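// Usage sketch (hedged): for unit vectors, cosine similarity
// cos(θ) = (a · b) / (|a||b|) reduces to the plain dot product, which is
// why dotProduct suffices above. A toy 2D example with an arbitrary 0.9
// threshold:
//
//     const embeddings = [
//         [1, 0],
//         [0.996, 0.087], // ~5° away from [1, 0]
//         [0, 1],
//     ];
//     const { clusters } = clusterLinear(embeddings, 0.9);
//     // => [[0, 1]]; the third embedding forms a singleton cluster,
//     // which is then pruned.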

View File

@@ -3,7 +3,7 @@ import log from "@/base/log";
import localForage from "@ente/shared/storage/localForage";
import { deleteDB, openDB, type DBSchema } from "idb";
import type { LocalCLIPIndex } from "./clip";
import type { CGroup, FaceCluster } from "./cluster-new";
import type { CGroup, FaceCluster } from "./cluster";
import type { LocalFaceIndex } from "./face";
/**

View File

@@ -24,7 +24,7 @@ import {
type ClusteringOpts,
type ClusterPreviewFace,
type FaceCluster,
} from "./cluster-new";
} from "./cluster";
import { regenerateFaceCrops } from "./crop";
import { clearMLDB, faceIndex, indexableAndIndexedCounts } from "./db";
import type { Face } from "./face";
@@ -386,7 +386,7 @@ export const wipClusterDebugPageContents = async (
clusters,
cgroups,
unclusteredFaces,
} = await worker().then((w) => w.clusterFacesHdb(opts));
} = await worker().then((w) => w.clusterFaces(opts));
const localFiles = await getAllLocalFiles();
const localFileByID = new Map(localFiles.map((f) => [f.id, f]));

View File

@@ -24,7 +24,7 @@ import {
indexCLIP,
type CLIPIndex,
} from "./clip";
import { clusterFacesHdb, type ClusteringOpts } from "./cluster-new";
import { type ClusteringOpts } from "./cluster";
import { saveFaceCrops } from "./crop";
import {
faceIndexes,
@@ -276,8 +276,8 @@ export class MLWorker {
}
// TODO-Cluster
async clusterFacesHdb(opts: ClusteringOpts) {
return clusterFacesHdb(await faceIndexes(), opts);
async clusterFaces(opts: ClusteringOpts) {
return clusterFace(await faceIndexes(), opts);
}
}

View File

@@ -12,7 +12,7 @@ import { ensure } from "@/utils/ensure";
import { nullToUndefined } from "@/utils/transform";
import { z } from "zod";
import { gunzip } from "./gzip";
import type { CGroup } from "./ml/cluster-new";
import type { CGroup } from "./ml/cluster";
import { applyCGroupDiff } from "./ml/db";
/**