Prep

2024-08-29 20:33:18 +05:30
parent d6c7ab0735
commit d5a1187e13
3 changed files with 40 additions and 31 deletions
--- a/web/apps/photos/src/pages/cluster-debug.tsx
+++ b/web/apps/photos/src/pages/cluster-debug.tsx
@@ -49,14 +49,22 @@ export default function ClusterDebug() {
    }
    return (
        <>
-            <Typography variant="small">
-                {`${clusterRes.clusters.length} clusters from ${clusterRes.clusteredCount} faces. ${clusterRes.unclusteredCount} unclustered faces.`}
-            </Typography>
-            <Typography variant="small" color="text.muted">
-                Showing only top 20 and bottom 10 clusters (and only up to 50
-                faces in each, sorted by cosine distance to highest scoring face
-                in the cluster).
-            </Typography>
+            <Stack m={1}>
+                <Typography variant="small" mb={1}>
+                    {`${clusterRes.clusters.length} clusters from ${clusterRes.clusteredCount} faces. ${clusterRes.unclusteredCount} unclustered faces.`}
+                </Typography>
+                <Typography variant="small" color="text.muted">
+                    Showing only top 30 and bottom 30 clusters.
+                </Typography>
+                <Typography variant="small" color="text.muted">
+                    For each cluster showing only up to 50 faces, sorted by
+                    cosine similarity to highest scoring face in the cluster.
+                </Typography>
+                <Typography variant="small" color="text.muted">
+                    Below each face is its{" "}
+                    <b>blur - score - cosineSimilarity - direction</b>
+                </Typography>
+            </Stack>
            <hr />
            <Container>
                <AutoSizer>
--- a/web/packages/new/photos/components/MLSettings.tsx
+++ b/web/packages/new/photos/components/MLSettings.tsx
@@ -8,7 +8,6 @@ import {
    enableML,
    mlStatusSnapshot,
    mlStatusSubscribe,
-    wipCluster,
    wipClusterEnable,
    type MLStatus,
 } from "@/new/photos/services/ml";
@@ -341,7 +340,7 @@ const ManageML: React.FC<ManageMLProps> = ({

    // TODO-Cluster
    const router = useRouter();
-    const wipClusterNow = () => wipCluster();
+    // const wipClusterNow = () => wipCluster();
    const wipClusterShowNow = () => router.push("/cluster-debug");

    return (
@@ -391,18 +390,20 @@ const ManageML: React.FC<ManageMLProps> = ({
                <Box>
                    <MenuItemGroup>
                        <EnteMenuItem
-                            label={ut("Create clusters • internal only option")}
-                            onClick={wipClusterNow}
+                            label={ut(
+                                "Create clusters   • internal only option",
+                            )}
+                            onClick={wipClusterShowNow}
                        />
                    </MenuItemGroup>
                    <MenuSectionTitle
                        title={ut(
-                            "Create in-memory clusters from arbitrary 2k photos. Nothing will be saved or synced to remote. You can view the results in search dropdown.",
+                            "Create and show in-memory clusters. Takes ~ 1 min. Nothing will be saved or synced to remote. You can also view all clusters in the search dropdown later.",
                        )}
                    />
                </Box>
            )}
-            {showClusterOpt && (
+            {/* {showClusterOpt && (
                <Box>
                    <MenuItemGroup>
                        <EnteMenuItem
@@ -416,7 +417,7 @@ const ManageML: React.FC<ManageMLProps> = ({
                        )}
                    />
                </Box>
-            )}
+            )} */}
        </Stack>
    );
 };
--- a/web/packages/new/photos/services/ml/cluster-new.ts
+++ b/web/packages/new/photos/services/ml/cluster-new.ts
@@ -348,14 +348,13 @@ function* enumerateFaces(faceIndices: FaceIndex[]) {
    }
 }

-export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => {
+export const clusterFacesHdb = (faceIndexes: FaceIndex[]) => {
    const t = Date.now();

    // A flattened array of faces.
    // TODO-Cluster ad-hoc filtering and slicing
-    const faces0 = [...enumerateFaces(faceIndexes)]
-        .filter((f) => f.blur > 99)
-        .slice(0, 6000);
+    const faces0 = [...enumerateFaces(faceIndexes)].filter((f) => f.blur > 99);
+    // .slice(0, 6000);
    // TODO-Cluster testing code, can be removed once done
    const faces = Array(1)
        .fill(0)
@@ -386,27 +385,28 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => {
    // be list of existing clusters we fetch from remote.
    const clusters: FaceCluster[] = [];

-    // Process the faces in batches of 10k. The faces are already sorted by file
-    // ID, which is a monotonically increasing integer, so we will also have
-    // some temporal locality.
+    // Process the faces in batches. The faces are already sorted by file ID,
+    // which is a monotonically increasing integer, so we will also have some
+    // temporal locality.
    //
-    // The number 10k was derived by ad-hoc observations. On a particular test
-    // dataset, clustering 10k took ~2 mins, while 20k took ~8 mins. Memory
-    // usage was constant in both cases.
+    // The batch size of 2500 was derived from ad-hoc observations; a batch of
+    // this size takes a few seconds. On a particular test dataset and machine,
+    // clustering 1k took ~2 seconds, 10k took ~2 mins, while 20k took ~8 mins.
+    // Memory usage was constant in all these cases.
    //
    // At around 100k faces, the clustering starts taking hours, and we start
    // running into stack overflows. The stack overflows can perhaps be avoided
    // by restructuring the code, but hours of uninterruptible work is anyways
    // not feasible.

-    // const batchSize = 10_000; // TODO-Cluster
-    const batchSize = 1_000;
+    const batchSize = 2500;
    for (let i = 0; i < faceEmbeddings.length; i += batchSize) {
+        const it = Date.now();
        const embeddings = faceEmbeddings.slice(i, i + batchSize);
        const { clusters: hdbClusters } = clusterFacesHdbscan(embeddings);

        log.info(
-            `hdbscan produced ${hdbClusters.length} clusters from ${embeddings.length} faces (${Date.now() - t} ms)`,
+            `hdbscan produced ${hdbClusters.length} clusters from ${embeddings.length} faces (${Date.now() - it} ms)`,
        );

        // Merge the new clusters we got from hdbscan into the existing clusters
@@ -489,7 +489,7 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => {

    // Convert into the data structure we're using to debug/visualize.
    //
-    // > Showing only top 20 and bottom 10 clusters (and only up to 50 faces in
+    // > Showing only top 30 and bottom 30 clusters (and only up to 50 faces in
    // > each, sorted by cosine distance to highest scoring face in the
    // > cluster).

@@ -497,9 +497,9 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => {
        (a, b) => b.faceIDs.length - a.faceIDs.length,
    );
    const debugClusters =
-        sortedClusters.length < 30
+        sortedClusters.length < 60
            ? sortedClusters
-            : sortedClusters.slice(0, 20).concat(sortedClusters.slice(-10));
+            : sortedClusters.slice(0, 30).concat(sortedClusters.slice(-30));
    const clusterPreviews: ClusterPreview[] = [];
    for (const cluster of debugClusters) {
        const faces = cluster.faceIDs.map((id) =>