From 4f4eb773fc6c9b6969285cfa7fa0629fea523e5d Mon Sep 17 00:00:00 2001
From: Manav Rathi <manav@mrmr.io>
Date: Fri, 30 Aug 2024 17:24:49 +0530
Subject: [PATCH] Clean up face clustering debug code

---
 web/apps/photos/src/pages/cluster-debug.tsx   |   4 +-
 .../new/photos/services/ml/cluster.ts         | 139 ++++--------------
 web/packages/new/photos/services/ml/index.ts  |  10 +-
 web/packages/new/photos/services/ml/worker.ts |   4 +-
 4 files changed, 41 insertions(+), 116 deletions(-)
diff --git a/web/apps/photos/src/pages/cluster-debug.tsx b/web/apps/photos/src/pages/cluster-debug.tsx
index 7f84b0c5f2..60efbb0118 100644
--- a/web/apps/photos/src/pages/cluster-debug.tsx
+++ b/web/apps/photos/src/pages/cluster-debug.tsx
@@ -6,7 +6,7 @@ import {
     type ClusterDebugPageContents,
     type ClusterPreviewFaceWithFile,
 } from "@/new/photos/services/ml";
-import { type ClusteringOpts } from "@/new/photos/services/ml/cluster-new";
+import { type ClusteringOpts } from "@/new/photos/services/ml/cluster";
 import { faceDirection } from "@/new/photos/services/ml/face";
 import {
     FlexWrapper,
@@ -297,7 +297,7 @@ const Header: React.FC<HeaderProps> = ({ clusterRes, onCluster }) => {
     const clusterInfo = clusterRes && (
         <Stack m={1}>
             <Typography variant="small" mb={1}>
-                {`${clusterRes.clusters.length} clusters from ${clusterRes.clusteredFaceCount} faces. ${clusterRes.unclusteredFaceCount} unclustered faces.`}
+                {`${clusterRes.clusters.length} clusters from ${clusterRes.clusteredFaceCount} faces in ${(clusterRes.timeTakenMs / 1000).toFixed(0)} seconds. ${clusterRes.unclusteredFaceCount} unclustered faces.`}
             </Typography>
             <Typography variant="small" color="text.muted">
                 Showing only top 30 and bottom 30 clusters.
diff --git a/web/packages/new/photos/services/ml/cluster.ts b/web/packages/new/photos/services/ml/cluster.ts
index f13b889aa1..7eec7af886 100644
--- a/web/packages/new/photos/services/ml/cluster.ts
+++ b/web/packages/new/photos/services/ml/cluster.ts
@@ -174,17 +174,11 @@ export const clusterFaces = (
     faceIndexes: FaceIndex[],
     opts: ClusteringOpts,
 ) => {
-    const { batchSize, joinThreshold } = opts;
+    const { method, batchSize, joinThreshold } = opts;
     const t = Date.now();
 
     // A flattened array of faces.
-    // TODO-Cluster ad-hoc filtering and slicing
-    const faces0 = [...enumerateFaces(faceIndexes)].filter((f) => f.blur > 99);
-    // .slice(0, 6000);
-    // TODO-Cluster testing code, can be removed once done
-    const faces = Array(1)
-        .fill(0)
-        .flatMap(() => faces0);
+    const faces = [...enumerateFaces(faceIndexes)].filter((f) => f.blur > 99);
 
     // For fast reverse lookup - map from face ids to the face.
     const faceForFaceID = new Map(faces.map((f) => [f.faceID, f]));
@@ -199,6 +193,10 @@ export const clusterFaces = (
     // cluster to which it belongs.
     const clusterIDForFaceID = new Map<string, string>();
 
+    // Keeps track of which faces were found by the original clustering
+    // algorithm, and which were merged in from a later match.
+    const wasMergedFaceIDs = new Set<string>();
+
     // A function to chain two reverse lookup.
     const firstFaceOfCluster = (cluster: FaceCluster) =>
         ensure(faceForFaceID.get(ensure(cluster.faceIDs[0])));
@@ -214,18 +212,7 @@ export const clusterFaces = (
     // Process the faces in batches. The faces are already sorted by file ID,
     // which is a monotonically increasing integer, so we will also have some
     // temporal locality.
-    //
-    // The number 2500 was derived by ad-hoc observations and takes a few
-    // seconds. On a particular test dataset and a particular machine,
-    // clustering 1k took ~2 seconds, 10k took ~2 mins, while 20k took ~8 mins.
-    // Memory usage was constant in all these cases.
-    //
-    // At around 100k faces, the clustering starts taking hours, and we start
-    // running into stack overflows. The stack overflows can perhaps be avoided
-    // by restructuring the code, but hours of uninterruptible work is anyways
-    // not feasible.
 
-    const batchSize = 2500;
     for (let i = 0; i < faceEmbeddings.length; i += batchSize) {
         const it = Date.now();
         const embeddings = faceEmbeddings.slice(i, i + batchSize);
@@ -294,92 +281,34 @@ export const clusterFaces = (
         }
     }
 
-    // Convert into the data structure we're using to debug/visualize.
-    // const faceAndNeigbours: FaceNeighbours[] = [];
-    // const topFaces = faces.sort((a, b) => b.score - a.score).slice(0, 30);
-    // for (const fi of topFaces) {
-    //     let neighbours: FaceNeighbour[] = [];
-    //     for (const fj of faces) {
-    //         // The vectors are already normalized, so we can directly use their
-    //         // dot product as their cosine similarity.
-    //         const csim = dotProduct(fi.embedding, fj.embedding);
-    //         neighbours.push({ face: fj, cosineSimilarity: csim });
-    //     }
-
-    //     neighbours = neighbours
-    //         .sort((a, b) => b.cosineSimilarity - a.cosineSimilarity)
-    //         .slice(0, 30);
-
-    //     faceAndNeigbours.push({ face: fi, neighbours });
-    // }
-
-    // Convert into the data structure we're using to debug/visualize.
-    //
-    // > Showing only top 30 and bottom 30 clusters (and only up to 50 faces in
-    // > each, sorted by cosine distance to highest scoring face in the
-    // > cluster).
-
     const sortedClusters = clusters.sort(
         (a, b) => b.faceIDs.length - a.faceIDs.length,
     );
-    const debugClusters =
+
+    // Convert into the data structure we're using to debug/visualize.
+    const clusterPreviewClusters =
         sortedClusters.length < 60
             ? sortedClusters
             : sortedClusters.slice(0, 30).concat(sortedClusters.slice(-30));
-    const clusterPreviews: ClusterPreview[] = [];
-    for (const cluster of debugClusters) {
+    const clusterPreviews = clusterPreviewClusters.map((cluster) => {
         const faces = cluster.faceIDs.map((id) =>
             ensure(faceForFaceID.get(id)),
         );
-        const topFace = faces.reduce((max, face) =>
-            max.score > face.score ? max : face,
+        const topFace = faces.reduce((top, face) =>
+            top.score > face.score ? top : face,
         );
-        const previewFaces: ClusterPreviewFace[] = [];
-        for (const face of faces) {
+        const previewFaces: ClusterPreviewFace[] = faces.map((face) => {
             const csim = dotProduct(topFace.embedding, face.embedding);
-            previewFaces.push({ face, cosineSimilarity: csim });
-        }
-        clusterPreviews.push({
+            const wasMerged = wasMergedFaceIDs.has(face.faceID);
+            return { face, cosineSimilarity: csim, wasMerged };
+        });
+        return {
             clusterSize: cluster.faceIDs.length,
             faces: previewFaces
                 .sort((a, b) => b.cosineSimilarity - a.cosineSimilarity)
                 .slice(0, 50),
-        });
-    }
-
-    // Prune too small clusters.
-    // TODO-Cluster this is likely not needed since hdbscan already has a min?
-    const validClusters = clusters.filter(({ faceIDs }) => faceIDs.length > 1);
-
-    // let cgroups = await clusterGroups();
-
-    // // TODO-Cluster - Currently we're not syncing with remote or saving anything
-    // // locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
-    // // cgroup, one per cluster.
-    // cgroups = cgroups.concat(
-    //     validClusters.map((c) => ({
-    //         id: c.id,
-    //         name: undefined,
-    //         clusterIDs: [c.id],
-    //         isHidden: false,
-    //         avatarFaceID: undefined,
-    //         displayFaceID: undefined,
-    //     })),
-    // );
-
-    // // For each cluster group, use the highest scoring face in any of its
-    // // clusters as its display face.
-    // for (const cgroup of cgroups) {
-    //     cgroup.displayFaceID = cgroup.clusterIDs
-    //         .map((clusterID) => clusterIndexForClusterID.get(clusterID))
-    //         .filter((i) => i !== undefined) /* 0 is a valid index */
-    //         .flatMap((i) => clusters[i]?.faceIDs ?? [])
-    //         .map((faceID) => faceForFaceID.get(faceID))
-    //         .filter((face) => !!face)
-    //         .reduce((max, face) =>
-    //             max.score > face.score ? max : face,
-    //         ).faceID;
-    // }
+        };
+    });
 
     // TODO-Cluster - Currently we're not syncing with remote or saving anything
     // locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
@@ -390,8 +319,8 @@ export const clusterFaces = (
         const faces = cluster.faceIDs.map((id) =>
             ensure(faceForFaceID.get(id)),
         );
-        const topFace = faces.reduce((max, face) =>
-            max.score > face.score ? max : face,
+        const topFace = faces.reduce((top, face) =>
+            top.score > face.score ? top : face,
         );
         cgroups.push({
             id: cluster.id,
@@ -403,28 +332,22 @@ export const clusterFaces = (
         });
     }
 
-    // log.info("ml/cluster", {
-    //     faces,
-    //     validClusters,
-    //     clusterIndexForClusterID: Object.fromEntries(clusterIndexForClusterID),
-    //     clusterIDForFaceID: Object.fromEntries(clusterIDForFaceID),
-    //     cgroups,
-    // });
+    const timeTakenMs = Date.now() - t;
     log.info(
-        `Clustered ${faces.length} faces into ${validClusters.length} clusters, with ${faces.length - clusterIDForFaceID.size} faces remaining unclustered (${Date.now() - t} ms)`,
+        `Clustered ${faces.length} faces into ${clusters.length} clusters, with ${faces.length - clusterIDForFaceID.size} faces remaining unclustered (${timeTakenMs} ms)`,
     );
 
-    const clusteredCount = clusterIDForFaceID.size;
-    const unclusteredCount = faces.length - clusteredCount;
+    const clusteredFaceCount = clusterIDForFaceID.size;
+    const unclusteredFaceCount = faces.length - clusteredFaceCount;
 
     return {
-        // faces,
-        clusteredCount,
-        unclusteredCount,
-        clusters: validClusters,
-        cgroups,
+        clusteredFaceCount,
+        unclusteredFaceCount,
         clusterPreviews,
-        clusterIDForFaceID,
+        clusters: sortedClusters,
+        cgroups,
+        unclusteredFaces: [],
+        timeTakenMs,
     };
 };
 
diff --git a/web/packages/new/photos/services/ml/index.ts b/web/packages/new/photos/services/ml/index.ts
index d4f3c862e3..836eba693e 100644
--- a/web/packages/new/photos/services/ml/index.ts
+++ b/web/packages/new/photos/services/ml/index.ts
@@ -365,7 +365,8 @@ export interface ClusterDebugPageContents {
     unclusteredFacesWithFile: {
         face: Face;
         enteFile: EnteFile;
-    };
+    }[];
+    timeTakenMs: number;
 }
 
 export const wipClusterDebugPageContents = async (
@@ -378,7 +379,6 @@ export const wipClusterDebugPageContents = async (
     _wip_searchPersons = undefined;
     triggerStatusUpdate();
 
-    // const { faceAndNeigbours, clusters, cgroups } = await clusterFaces(
     const {
         clusteredFaceCount,
         unclusteredFaceCount,
@@ -386,6 +386,7 @@ export const wipClusterDebugPageContents = async (
         clusters,
         cgroups,
         unclusteredFaces,
+        timeTakenMs,
     } = await worker().then((w) => w.clusterFaces(opts));
 
     const localFiles = await getAllLocalFiles();
@@ -396,10 +397,10 @@ export const wipClusterDebugPageContents = async (
     const clusterPreviewsWithFile = clusterPreviews.map(
         ({ clusterSize, faces }) => ({
             clusterSize,
-            faces: faces.map(({ face, cosineSimilarity }) => ({
+            faces: faces.map(({ face, ...rest }) => ({
                 face,
                 enteFile: fileForFace(face),
-                cosineSimilarity,
+                ...rest,
             })),
         }),
     );
@@ -445,6 +446,7 @@ export const wipClusterDebugPageContents = async (
         clusters,
         clusterPreviewsWithFile,
         unclusteredFacesWithFile,
+        timeTakenMs,
     };
 };
 
diff --git a/web/packages/new/photos/services/ml/worker.ts b/web/packages/new/photos/services/ml/worker.ts
index 518bfb2804..c663abc2c9 100644
--- a/web/packages/new/photos/services/ml/worker.ts
+++ b/web/packages/new/photos/services/ml/worker.ts
@@ -24,7 +24,7 @@ import {
     indexCLIP,
     type CLIPIndex,
 } from "./clip";
-import { type ClusteringOpts } from "./cluster";
+import { clusterFaces, type ClusteringOpts } from "./cluster";
 import { saveFaceCrops } from "./crop";
 import {
     faceIndexes,
@@ -277,7 +277,7 @@ export class MLWorker {
 
     // TODO-Cluster
     async clusterFaces(opts: ClusteringOpts) {
-        return clusterFace(await faceIndexes(), opts);
+        return clusterFaces(await faceIndexes(), opts);
     }
 }