From d5a1187e13a20c3c58d73b01314eb3447ce14843 Mon Sep 17 00:00:00 2001
From: Manav Rathi
Date: Thu, 29 Aug 2024 20:33:18 +0530
Subject: [PATCH] Prep

---
 web/apps/photos/src/pages/cluster-debug.tsx    | 24 +++++++++-----
 .../new/photos/components/MLSettings.tsx       | 15 +++++----
 .../new/photos/services/ml/cluster-new.ts      | 32 +++++++++----------
 3 files changed, 40 insertions(+), 31 deletions(-)

diff --git a/web/apps/photos/src/pages/cluster-debug.tsx b/web/apps/photos/src/pages/cluster-debug.tsx
index db187751de..fcbeadfcab 100644
--- a/web/apps/photos/src/pages/cluster-debug.tsx
+++ b/web/apps/photos/src/pages/cluster-debug.tsx
@@ -49,14 +49,22 @@ export default function ClusterDebug() {
     }
     return (
         <>
-
-                {`${clusterRes.clusters.length} clusters from ${clusterRes.clusteredCount} faces. ${clusterRes.unclusteredCount} unclustered faces.`}
-
-
-                Showing only top 20 and bottom 10 clusters (and only up to 50
-                faces in each, sorted by cosine distance to highest scoring face
-                in the cluster).
-
+
+
+                    {`${clusterRes.clusters.length} clusters from ${clusterRes.clusteredCount} faces. ${clusterRes.unclusteredCount} unclustered faces.`}
+
+
+                    Showing only top 30 and bottom 30 clusters.
+
+
+                    For each cluster showing only up to 50 faces, sorted by
+                    cosine similarity to highest scoring face in the cluster.
+
+
+                    Below each face is its{" "}
+                    blur - score - cosineSimilarity - direction
+
+
diff --git a/web/packages/new/photos/components/MLSettings.tsx b/web/packages/new/photos/components/MLSettings.tsx
index eeff4d1be8..c8785110b6 100644
--- a/web/packages/new/photos/components/MLSettings.tsx
+++ b/web/packages/new/photos/components/MLSettings.tsx
@@ -8,7 +8,6 @@ import {
     enableML,
     mlStatusSnapshot,
     mlStatusSubscribe,
-    wipCluster,
     wipClusterEnable,
     type MLStatus,
 } from "@/new/photos/services/ml";
@@ -341,7 +340,7 @@ const ManageML: React.FC = ({

     // TODO-Cluster
     const router = useRouter();
-    const wipClusterNow = () => wipCluster();
+    // const wipClusterNow = () => wipCluster();
     const wipClusterShowNow = () => router.push("/cluster-debug");

     return (
@@ -391,18 +390,20 @@ const ManageML: React.FC = ({
                 )}
-                {showClusterOpt && (
+                {/* {showClusterOpt && (
                             = ({
                         )}
                     />
-                )}
+                )} */}
     );
};

diff --git a/web/packages/new/photos/services/ml/cluster-new.ts b/web/packages/new/photos/services/ml/cluster-new.ts
index df97ca4bd9..d6e1dc505a 100644
--- a/web/packages/new/photos/services/ml/cluster-new.ts
+++ b/web/packages/new/photos/services/ml/cluster-new.ts
@@ -348,14 +348,13 @@ function* enumerateFaces(faceIndices: FaceIndex[]) {
     }
 }

-export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => {
+export const clusterFacesHdb = (faceIndexes: FaceIndex[]) => {
     const t = Date.now();

     // A flattened array of faces.
     // TODO-Cluster ad-hoc filtering and slicing
-    const faces0 = [...enumerateFaces(faceIndexes)]
-        .filter((f) => f.blur > 99)
-        .slice(0, 6000);
+    const faces0 = [...enumerateFaces(faceIndexes)].filter((f) => f.blur > 99);
+    // .slice(0, 6000);
     // TODO-Cluster testing code, can be removed once done
     const faces = Array(1)
         .fill(0)
@@ -386,27 +385,28 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => {
     // be list of existing clusters we fetch from remote.
     const clusters: FaceCluster[] = [];

-    // Process the faces in batches of 10k. The faces are already sorted by file
-    // ID, which is a monotonically increasing integer, so we will also have
-    // some temporal locality.
+    // Process the faces in batches. The faces are already sorted by file ID,
+    // which is a monotonically increasing integer, so we will also have some
+    // temporal locality.
     //
-    // The number 10k was derived by ad-hoc observations. On a particular test
-    // dataset, clustering 10k took ~2 mins, while 20k took ~8 mins. Memory
-    // usage was constant in both cases.
+    // The number 2500 was derived by ad-hoc observations and takes a few
+    // seconds. On a particular test dataset and a particular machine,
+    // clustering 1k took ~2 seconds, 10k took ~2 mins, while 20k took ~8 mins.
+    // Memory usage was constant in all these cases.
     //
     // At around 100k faces, the clustering starts taking hours, and we start
     // running into stack overflows. The stack overflows can perhaps be avoided
     // by restructuring the code, but hours of uninterruptible work is anyways
     // not feasible.
-    // const batchSize = 10_000;
     // TODO-Cluster
-    const batchSize = 1_000;
+    const batchSize = 2500;

     for (let i = 0; i < faceEmbeddings.length; i += batchSize) {
+        const it = Date.now();
         const embeddings = faceEmbeddings.slice(i, i + batchSize);
         const { clusters: hdbClusters } = clusterFacesHdbscan(embeddings);
         log.info(
-            `hdbscan produced ${hdbClusters.length} clusters from ${embeddings.length} faces (${Date.now() - t} ms)`,
+            `hdbscan produced ${hdbClusters.length} clusters from ${embeddings.length} faces (${Date.now() - it} ms)`,
         );

         // Merge the new clusters we got from hdbscan into the existing clusters
@@ -489,7 +489,7 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => {
     // Convert into the data structure we're using to debug/visualize.
     //
-    // > Showing only top 20 and bottom 10 clusters (and only up to 50 faces in
+    // > Showing only top 30 and bottom 30 clusters (and only up to 50 faces in
     // > each, sorted by cosine distance to highest scoring face in the
     // > cluster).

     const sortedClusters = clusters.sort(
         (a, b) => b.faceIDs.length - a.faceIDs.length,
     );
     const debugClusters =
-        sortedClusters.length < 30
+        sortedClusters.length < 60
             ? sortedClusters
-            : sortedClusters.slice(0, 20).concat(sortedClusters.slice(-10));
+            : sortedClusters.slice(0, 30).concat(sortedClusters.slice(-30));
     const clusterPreviews: ClusterPreview[] = [];
     for (const cluster of debugClusters) {
         const faces = cluster.faceIDs.map((id) =>
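
Reviewer note, not part of the patch: a minimal TypeScript sketch of the batching pattern that the updated comment in cluster-new.ts describes, i.e. clustering fixed-size slices of the embeddings and timing each slice. Here `clusterBatch` is a stand-in for the real `clusterFacesHdbscan`, and the `Embedding` type and logging are illustrative only.

type Embedding = number[];

// Cluster embeddings in fixed-size batches; the indices returned by
// clusterBatch are relative to the batch, so offset them back into the
// full embeddings array before collecting them.
const clusterInBatches = (
    embeddings: Embedding[],
    clusterBatch: (batch: Embedding[]) => number[][],
    batchSize = 2500, // the ad-hoc value picked in the patch; tune per machine
): number[][] => {
    const allClusters: number[][] = [];
    for (let i = 0; i < embeddings.length; i += batchSize) {
        const t = Date.now();
        const batch = embeddings.slice(i, i + batchSize);
        const clusters = clusterBatch(batch).map((c) => c.map((j) => i + j));
        allClusters.push(...clusters);
        console.log(
            `clustered ${batch.length} faces into ${clusters.length} clusters (${Date.now() - t} ms)`,
        );
    }
    return allClusters;
};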
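
Also not from the patch: the per-cluster ordering that the new debug copy describes ("up to 50 faces, sorted by cosine similarity to highest scoring face in the cluster"), sketched under the assumption that embeddings are unit-normalized so a dot product equals cosine similarity. The `DebugFace` shape and the helper names are hypothetical.

interface DebugFace {
    score: number;
    embedding: number[];
}

const dot = (a: number[], b: number[]) =>
    a.reduce((sum, x, i) => sum + x * (b[i] ?? 0), 0);

// Order a (non-empty) cluster's faces by cosine similarity to its highest
// scoring face, keeping at most the 50 that the debug page would show.
const orderByCosineToTopFace = (faces: DebugFace[]): DebugFace[] => {
    const top = faces.reduce((a, b) => (b.score > a.score ? b : a));
    return [...faces]
        .sort(
            (a, b) =>
                dot(b.embedding, top.embedding) - dot(a.embedding, top.embedding),
        )
        .slice(0, 50);
};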