[desktop] Clustering WIP - Part x/x (#3040)

2024-08-29 21:03:09 +05:30
parent d413ed2de0 d5a1187e13
commit 883bdcd845
7 changed files with 374 additions and 146 deletions
--- a/web/apps/photos/src/pages/cluster-debug.tsx
+++ b/web/apps/photos/src/pages/cluster-debug.tsx
@@ -4,10 +4,10 @@ import {
    faceCrop,
    wipClusterDebugPageContents,
    type ClusterDebugPageContents,
-    type FaceFileNeighbour,
-    type FaceFileNeighbours,
+    type ClusterPreviewFaceWF,
+    type ClusterPreviewWF,
 } from "@/new/photos/services/ml";
-import type { Face } from "@/new/photos/services/ml/face";
+import { faceDirection } from "@/new/photos/services/ml/face";
 import {
    FlexWrapper,
    FluidContainer,
@@ -15,7 +15,7 @@ import {
 } from "@ente/shared/components/Container";
 import EnteSpinner from "@ente/shared/components/EnteSpinner";
 import BackButton from "@mui/icons-material/ArrowBackOutlined";
-import { Box, IconButton, styled, Typography } from "@mui/material";
+import { Box, IconButton, Stack, styled, Typography } from "@mui/material";
 import { useRouter } from "next/router";
 import { AppContext } from "pages/_app";
 import React, { useContext, useEffect, useMemo, useRef, useState } from "react";
@@ -49,13 +49,22 @@ export default function ClusterDebug() {
    }
    return (
        <>
-            <Typography variant="small">
-                {`${clusterRes.clusters.length} clusters`}
-            </Typography>
-            <Typography variant="small" color="text.muted">
-                Showing only upto first 30 faces (and only upto 30 nearest
-                neighbours of each).
-            </Typography>
+            <Stack m={1}>
+                <Typography variant="small" mb={1}>
+                    {`${clusterRes.clusters.length} clusters from ${clusterRes.clusteredCount} faces. ${clusterRes.unclusteredCount} unclustered faces.`}
+                </Typography>
+                <Typography variant="small" color="text.muted">
+                    Showing only top 30 and bottom 30 clusters.
+                </Typography>
+                <Typography variant="small" color="text.muted">
+                    For each cluster showing only up to 50 faces, sorted by
+                    cosine similarity to highest scoring face in the cluster.
+                </Typography>
+                <Typography variant="small" color="text.muted">
+                    Below each face is its{" "}
+                    <b>blur - score - cosineSimilarity - direction</b>
+                </Typography>
+            </Stack>
            <hr />
            <Container>
                <AutoSizer>
@@ -112,7 +121,7 @@ const ClusterPhotoList: React.FC<ClusterPhotoListProps> = ({
    width,
    clusterRes,
 }) => {
-    const { faceFNs, clusterIDForFaceID } = clusterRes;
+    const { clusterPreviewWFs, clusterIDForFaceID } = clusterRes;
    const [itemList, setItemList] = useState<ItemListItem[]>([]);
    const listRef = useRef(null);

@@ -125,8 +134,8 @@ const ClusterPhotoList: React.FC<ClusterPhotoListProps> = ({
    const listItemHeight = 120 * shrinkRatio + 24 + 4;

    useEffect(() => {
-        setItemList(itemListFromFaceFNs(faceFNs, columns));
-    }, [columns, faceFNs]);
+        setItemList(itemListFromClusterPreviewWFs(clusterPreviewWFs, columns));
+    }, [columns, clusterPreviewWFs]);

    useEffect(() => {
        listRef.current?.resetAfterIndex(0);
@@ -138,7 +147,7 @@ const ClusterPhotoList: React.FC<ClusterPhotoListProps> = ({
    const generateKey = (i: number) =>
        Array.isArray(itemList[i])
            ? `${itemList[i][0].enteFile.id}/${itemList[i][0].face.faceID}-${itemList[i].slice(-1)[0].enteFile.id}/${itemList[i].slice(-1)[0].face.faceID}-${i}`
-            : `${itemList[i].faceID}-${i}`;
+            : `${itemList[i]}-${i}`;

    return (
        <VariableSizeList
@@ -163,13 +172,13 @@ const ClusterPhotoList: React.FC<ClusterPhotoListProps> = ({
                        >
                            {!Array.isArray(item) ? (
                                <LabelContainer span={columns}>
-                                    {`score ${item.score.toFixed(2)} blur ${item.blur.toFixed(0)}`}
+                                    {`cluster size ${item.toFixed(2)}`}
                                </LabelContainer>
                            ) : (
-                                item.map((faceFN, i) => (
+                                item.map((faceWF, i) => (
                                    <FaceItem
                                        key={i.toString()}
-                                        {...{ faceFN, clusterIDForFaceID }}
+                                        {...{ faceWF, clusterIDForFaceID }}
                                    />
                                ))
                            )}
@@ -181,19 +190,20 @@ const ClusterPhotoList: React.FC<ClusterPhotoListProps> = ({
    );
 };

-type ItemListItem = Face | FaceFileNeighbour[];
+// type ItemListItem = Face | FaceFileNeighbour[];
+type ItemListItem = number | ClusterPreviewFaceWF[];

-const itemListFromFaceFNs = (
-    faceFNs: FaceFileNeighbours[],
+const itemListFromClusterPreviewWFs = (
+    clusterPreviewWFs: ClusterPreviewWF[],
    columns: number,
 ) => {
    const result: ItemListItem[] = [];
-    for (let index = 0; index < faceFNs.length; index++) {
-        const { face, neighbours } = faceFNs[index];
-        result.push(face);
+    for (let index = 0; index < clusterPreviewWFs.length; index++) {
+        const { clusterSize, faces } = clusterPreviewWFs[index];
+        result.push(clusterSize);
        let lastIndex = 0;
-        while (lastIndex < neighbours.length) {
-            result.push(neighbours.slice(lastIndex, lastIndex + columns));
+        while (lastIndex < faces.length) {
+            result.push(faces.slice(lastIndex, lastIndex + columns));
            lastIndex += columns;
        }
    }
@@ -210,12 +220,12 @@ const getShrinkRatio = (width: number, columns: number) =>
    (columns * 120);

 interface FaceItemProps {
-    faceFN: FaceFileNeighbour;
+    faceWF: ClusterPreviewFaceWF;
    clusterIDForFaceID: Map<string, string>;
 }

-const FaceItem: React.FC<FaceItemProps> = ({ faceFN, clusterIDForFaceID }) => {
-    const { face, enteFile, cosineSimilarity } = faceFN;
+const FaceItem: React.FC<FaceItemProps> = ({ faceWF, clusterIDForFaceID }) => {
+    const { face, enteFile, cosineSimilarity } = faceWF;
    const { faceID } = face;

    const [objectURL, setObjectURL] = useState<string | undefined>();
@@ -235,6 +245,8 @@ const FaceItem: React.FC<FaceItemProps> = ({ faceFN, clusterIDForFaceID }) => {
        };
    }, [faceID, enteFile]);

+    const fd = faceDirection(face.detection);
+    const d = fd == "straight" ? "•" : fd == "left" ? "←" : "→";
    return (
        <FaceChip
            style={{
@@ -252,9 +264,20 @@ const FaceItem: React.FC<FaceItemProps> = ({ faceFN, clusterIDForFaceID }) => {
                    src={objectURL}
                />
            )}
-            <Typography variant="small" color="text.muted" textAlign="right">
-                {cosineSimilarity.toFixed(2)}
-            </Typography>
+            <Stack direction="row" justifyContent="space-between">
+                <Typography variant="small" color="text.muted">
+                    {`b${face.blur.toFixed(0)} `}
+                </Typography>
+                <Typography variant="small" color="text.muted">
+                    {`s${face.score.toFixed(1)}`}
+                </Typography>
+                <Typography variant="small" color="text.muted">
+                    {`c${cosineSimilarity.toFixed(1)}`}
+                </Typography>
+                <Typography variant="small" color="text.muted">
+                    {`d${d}`}
+                </Typography>
+            </Stack>
        </FaceChip>
    );
 };
--- a/web/packages/new/photos/components/MLSettings.tsx
+++ b/web/packages/new/photos/components/MLSettings.tsx
@@ -8,7 +8,6 @@ import {
    enableML,
    mlStatusSnapshot,
    mlStatusSubscribe,
-    wipCluster,
    wipClusterEnable,
    type MLStatus,
 } from "@/new/photos/services/ml";
@@ -341,7 +340,7 @@ const ManageML: React.FC<ManageMLProps> = ({

    // TODO-Cluster
    const router = useRouter();
-    const wipClusterNow = () => wipCluster();
+    // const wipClusterNow = () => wipCluster();
    const wipClusterShowNow = () => router.push("/cluster-debug");

    return (
@@ -391,18 +390,20 @@ const ManageML: React.FC<ManageMLProps> = ({
                <Box>
                    <MenuItemGroup>
                        <EnteMenuItem
-                            label={ut("Create clusters • internal only option")}
-                            onClick={wipClusterNow}
+                            label={ut(
+                                "Create clusters   • internal only option",
+                            )}
+                            onClick={wipClusterShowNow}
                        />
                    </MenuItemGroup>
                    <MenuSectionTitle
                        title={ut(
-                            "Create in-memory clusters from arbitrary 2k photos. Nothing will be saved or synced to remote. You can view the results in search dropdown.",
+                            "Create and show in-memory clusters. Takes ~ 1 min. Nothing will be saved or synced to remote. You can also view all clusters in the search dropdown later.",
                        )}
                    />
                </Box>
            )}
-            {showClusterOpt && (
+            {/* {showClusterOpt && (
                <Box>
                    <MenuItemGroup>
                        <EnteMenuItem
@@ -416,7 +417,7 @@ const ManageML: React.FC<ManageMLProps> = ({
                        )}
                    />
                </Box>
-            )}
+            )} */}
        </Stack>
    );
 };
--- a/web/packages/new/photos/services/ml/cluster-new.ts
+++ b/web/packages/new/photos/services/ml/cluster-new.ts
@@ -124,6 +124,16 @@ interface FaceNeighbour {
    cosineSimilarity: number;
 }

+export interface ClusterPreview {
+    clusterSize: number;
+    faces: ClusterPreviewFace[];
+}
+
+interface ClusterPreviewFace {
+    face: Face;
+    cosineSimilarity: number;
+}
+
 /**
 * Cluster faces into groups.
 *
@@ -338,115 +348,256 @@ function* enumerateFaces(faceIndices: FaceIndex[]) {
    }
 }

-export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => {
+export const clusterFacesHdb = (faceIndexes: FaceIndex[]) => {
    const t = Date.now();

    // A flattened array of faces.
-    // TODO-Cluster note the 2k slice
-    const faces = [...enumerateFaces(faceIndexes)].slice(0, 2000);
+    // TODO-Cluster ad-hoc filtering and slicing
+    const faces0 = [...enumerateFaces(faceIndexes)].filter((f) => f.blur > 99);
+    // .slice(0, 6000);
+    // TODO-Cluster testing code, can be removed once done
+    const faces = Array(1)
+        .fill(0)
+        .flatMap(() => faces0);
+
+    // For fast reverse lookup - map from face ids to the face.
+    const faceForFaceID = new Map(faces.map((f) => [f.faceID, f]));

    const faceEmbeddings = faces.map(({ embedding }) => embedding);

-    const {
-        clusters: clusterIndices,
-        noise,
-        debugInfo,
-    } = clusterFacesHdbscan(faceEmbeddings);
-
-    log.info({ method: "hdbscan", clusterIndices, noise, debugInfo });
-    log.info(
-        `Clustered ${faces.length} faces into ${clusterIndices.length} clusters (${Date.now() - t} ms)`,
-    );
-
    // For fast reverse lookup - map from cluster ids to their index in the
    // clusters array.
    const clusterIndexForClusterID = new Map<string, number>();

-    // For fast reverse lookup - map from face ids to the id of the cluster to
-    // which they belong.
+    // For fast reverse lookup - map from the id of a face to the id of the
+    // cluster to which it belongs.
    const clusterIDForFaceID = new Map<string, string>();

+    // A function to chain two reverse lookups.
+    const firstFaceOfCluster = (cluster: FaceCluster) =>
+        ensure(faceForFaceID.get(ensure(cluster.faceIDs[0])));
+
    // A function to generate new cluster IDs.
    const newClusterID = () => newNonSecureID("cluster_");

-    // Convert the numerical face indices into the result.
+    // The resultant clusters.
+    // TODO-Cluster Later on, instead of starting from a blank slate, this will
+    // be list of existing clusters we fetch from remote.
    const clusters: FaceCluster[] = [];
-    for (const [ci, faceIndices] of clusterIndices.entries()) {
-        const clusterID = newClusterID();
-        const faceIDs: string[] = [];
-        clusterIndexForClusterID.set(clusterID, ci);
-        for (const fi of faceIndices) {
-            // Can't find a way of avoiding the null assertion here.
-            // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
-            const face = faces[fi]!;
-            clusterIDForFaceID.set(face.faceID, clusterID);
-            faceIDs.push(face.faceID);
+
+    // Process the faces in batches. The faces are already sorted by file ID,
+    // which is a monotonically increasing integer, so we will also have some
+    // temporal locality.
+    //
+    // The number 2500 was derived by ad-hoc observations and takes a few
+    // seconds. On a particular test dataset and a particular machine,
+    // clustering 1k took ~2 seconds, 10k took ~2 mins, while 20k took ~8 mins.
+    // Memory usage was constant in all these cases.
+    //
+    // At around 100k faces, the clustering starts taking hours, and we start
+    // running into stack overflows. The stack overflows can perhaps be avoided
+    // by restructuring the code, but hours of uninterruptible work is anyways
+    // not feasible.
+
+    const batchSize = 2500;
+    for (let i = 0; i < faceEmbeddings.length; i += batchSize) {
+        const it = Date.now();
+        const embeddings = faceEmbeddings.slice(i, i + batchSize);
+        const { clusters: hdbClusters } = clusterFacesHdbscan(embeddings);
+
+        log.info(
+            `hdbscan produced ${hdbClusters.length} clusters from ${embeddings.length} faces (${Date.now() - it} ms)`,
+        );
+
+        // Merge the new clusters we got from hdbscan into the existing clusters
+        // if they are "near" them (using some heuristic).
+        //
+        // We need to ensure we don't change any of the existing cluster IDs,
+        // since these might be existing clusters we got from remote.
+
+        for (const hdbCluster of hdbClusters) {
+            // Find the existing cluster whose (arbitrarily chosen) first face
+            // is the nearest neighbour of the (arbitrarily chosen) first face
+            // of the cluster produced by hdbscan.
+
+            const newFace = ensure(faces[i + ensure(hdbCluster[0])]);
+
+            let nnCluster: FaceCluster | undefined;
+            let nnCosineSimilarity = 0;
+            for (const existingCluster of clusters) {
+                const existingFace = firstFaceOfCluster(existingCluster);
+
+                // The vectors are already normalized, so we can directly use their
+                // dot product as their cosine similarity.
+                const csim = dotProduct(
+                    existingFace.embedding,
+                    newFace.embedding,
+                );
+
+                // Use a higher cosine similarity threshold if either of the two
+                // faces is blurry.
+                const threshold =
+                    existingFace.blur < 200 || newFace.blur < 200 ? 0.9 : 0.7;
+                if (csim > threshold && csim > nnCosineSimilarity) {
+                    nnCluster = existingCluster;
+                    nnCosineSimilarity = csim;
+                }
+            }
+
+            if (nnCluster) {
+                // If we found an existing cluster that is near enough,
+                // sublimate the cluster produced by hdbscan into that cluster.
+                for (const j of hdbCluster) {
+                    const { faceID } = ensure(faces[i + j]);
+                    nnCluster.faceIDs.push(faceID);
+                    clusterIDForFaceID.set(faceID, nnCluster.id);
+                }
+            } else {
+                // Otherwise make a new cluster from the cluster produced by
+                // hdbscan.
+                const clusterID = newClusterID();
+                const faceIDs: string[] = [];
+                for (const j of hdbCluster) {
+                    const { faceID } = ensure(faces[i + j]);
+                    faceIDs.push(faceID);
+                    clusterIDForFaceID.set(faceID, clusterID);
+                }
+                clusterIndexForClusterID.set(clusterID, clusters.length);
+                clusters.push({ id: clusterID, faceIDs });
+            }
        }
-        clusters.push({ id: clusterID, faceIDs });
    }

    // Convert into the data structure we're using to debug/visualize.
-    const faceAndNeigbours: FaceNeighbours[] = [];
-    for (const fi of faces) {
-        let neighbours: FaceNeighbour[] = [];
-        for (const fj of faces) {
-            // The vectors are already normalized, so we can directly use their
-            // dot product as their cosine similarity.
-            const csim = dotProduct(fi.embedding, fj.embedding);
-            neighbours.push({ face: fj, cosineSimilarity: csim });
-        }
+    // const faceAndNeigbours: FaceNeighbours[] = [];
+    // const topFaces = faces.sort((a, b) => b.score - a.score).slice(0, 30);
+    // for (const fi of topFaces) {
+    //     let neighbours: FaceNeighbour[] = [];
+    //     for (const fj of faces) {
+    //         // The vectors are already normalized, so we can directly use their
+    //         // dot product as their cosine similarity.
+    //         const csim = dotProduct(fi.embedding, fj.embedding);
+    //         neighbours.push({ face: fj, cosineSimilarity: csim });
+    //     }

-        neighbours = neighbours.sort(
-            (a, b) => b.cosineSimilarity - a.cosineSimilarity,
+    //     neighbours = neighbours
+    //         .sort((a, b) => b.cosineSimilarity - a.cosineSimilarity)
+    //         .slice(0, 30);
+
+    //     faceAndNeigbours.push({ face: fi, neighbours });
+    // }
+
+    // Convert into the data structure we're using to debug/visualize.
+    //
+    // > Showing only top 30 and bottom 30 clusters (and only up to 50 faces in
+    // > each, sorted by cosine similarity to highest scoring face in the
+    // > cluster).
+
+    const sortedClusters = clusters.sort(
+        (a, b) => b.faceIDs.length - a.faceIDs.length,
+    );
+    const debugClusters =
+        sortedClusters.length < 60
+            ? sortedClusters
+            : sortedClusters.slice(0, 30).concat(sortedClusters.slice(-30));
+    const clusterPreviews: ClusterPreview[] = [];
+    for (const cluster of debugClusters) {
+        const faces = cluster.faceIDs.map((id) =>
+            ensure(faceForFaceID.get(id)),
        );
-
-        faceAndNeigbours.push({ face: fi, neighbours });
+        const topFace = faces.reduce((max, face) =>
+            max.score > face.score ? max : face,
+        );
+        const previewFaces: ClusterPreviewFace[] = [];
+        for (const face of faces) {
+            const csim = dotProduct(topFace.embedding, face.embedding);
+            previewFaces.push({ face, cosineSimilarity: csim });
+        }
+        clusterPreviews.push({
+            clusterSize: cluster.faceIDs.length,
+            faces: previewFaces
+                .sort((a, b) => b.cosineSimilarity - a.cosineSimilarity)
+                .slice(0, 50),
+        });
    }

    // Prune too small clusters.
+    // TODO-Cluster this is likely not needed since hdbscan already has a min?
    const validClusters = clusters.filter(({ faceIDs }) => faceIDs.length > 1);

-    let cgroups = await clusterGroups();
+    // let cgroups = await clusterGroups();
+
+    // // TODO-Cluster - Currently we're not syncing with remote or saving anything
+    // // locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
+    // // cgroup, one per cluster.
+    // cgroups = cgroups.concat(
+    //     validClusters.map((c) => ({
+    //         id: c.id,
+    //         name: undefined,
+    //         clusterIDs: [c.id],
+    //         isHidden: false,
+    //         avatarFaceID: undefined,
+    //         displayFaceID: undefined,
+    //     })),
+    // );
+
+    // // For each cluster group, use the highest scoring face in any of its
+    // // clusters as its display face.
+    // for (const cgroup of cgroups) {
+    //     cgroup.displayFaceID = cgroup.clusterIDs
+    //         .map((clusterID) => clusterIndexForClusterID.get(clusterID))
+    //         .filter((i) => i !== undefined) /* 0 is a valid index */
+    //         .flatMap((i) => clusters[i]?.faceIDs ?? [])
+    //         .map((faceID) => faceForFaceID.get(faceID))
+    //         .filter((face) => !!face)
+    //         .reduce((max, face) =>
+    //             max.score > face.score ? max : face,
+    //         ).faceID;
+    // }

    // TODO-Cluster - Currently we're not syncing with remote or saving anything
    // locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
    // cgroup, one per cluster.
-    cgroups = cgroups.concat(
-        validClusters.map((c) => ({
-            id: c.id,
+
+    const cgroups: CGroup[] = [];
+    for (const cluster of sortedClusters) {
+        const faces = cluster.faceIDs.map((id) =>
+            ensure(faceForFaceID.get(id)),
+        );
+        const topFace = faces.reduce((max, face) =>
+            max.score > face.score ? max : face,
+        );
+        cgroups.push({
+            id: cluster.id,
            name: undefined,
-            clusterIDs: [c.id],
+            clusterIDs: [cluster.id],
            isHidden: false,
            avatarFaceID: undefined,
-            displayFaceID: undefined,
-        })),
-    );
-
-    // For each cluster group, use the highest scoring face in any of its
-    // clusters as its display face.
-    const faceForFaceID = new Map(faces.map((f) => [f.faceID, f]));
-    for (const cgroup of cgroups) {
-        cgroup.displayFaceID = cgroup.clusterIDs
-            .map((clusterID) => clusterIndexForClusterID.get(clusterID))
-            .filter((i) => i !== undefined) /* 0 is a valid index */
-            .flatMap((i) => clusters[i]?.faceIDs ?? [])
-            .map((faceID) => faceForFaceID.get(faceID))
-            .filter((face) => !!face)
-            .reduce((max, face) =>
-                max.score > face.score ? max : face,
-            ).faceID;
+            displayFaceID: topFace.faceID,
+        });
    }

-    log.info("ml/cluster", {
-        faces,
-        validClusters,
-        clusterIndexForClusterID: Object.fromEntries(clusterIndexForClusterID),
-        clusterIDForFaceID: Object.fromEntries(clusterIDForFaceID),
-        cgroups,
-    });
+    // log.info("ml/cluster", {
+    //     faces,
+    //     validClusters,
+    //     clusterIndexForClusterID: Object.fromEntries(clusterIndexForClusterID),
+    //     clusterIDForFaceID: Object.fromEntries(clusterIDForFaceID),
+    //     cgroups,
+    // });
    log.info(
-        `Clustered ${faces.length} faces into ${validClusters.length} clusters (${Date.now() - t} ms)`,
+        `Clustered ${faces.length} faces into ${validClusters.length} clusters, with ${faces.length - clusterIDForFaceID.size} faces remaining unclustered (${Date.now() - t} ms)`,
    );

-    return { faces, clusters: validClusters, cgroups, faceAndNeigbours };
+    const clusteredCount = clusterIDForFaceID.size;
+    const unclusteredCount = faces.length - clusteredCount;
+
+    return {
+        // faces,
+        clusteredCount,
+        unclusteredCount,
+        clusters: validClusters,
+        cgroups,
+        clusterPreviews,
+        clusterIDForFaceID,
+    };
 };
--- a/web/packages/new/photos/services/ml/cluster.ts
+++ b/web/packages/new/photos/services/ml/cluster.ts
@@ -24,7 +24,7 @@ export const clusterFacesHdbscan = (
        minSamples: 5,
        clusterSelectionEpsilon: 0.6,
        clusterSelectionMethod: "leaf",
-        debug: true,
+        debug: false,
    });

    return {
--- a/web/packages/new/photos/services/ml/face.ts
+++ b/web/packages/new/photos/services/ml/face.ts
@@ -714,7 +714,7 @@ const detectBlur = (

 type FaceDirection = "left" | "right" | "straight";

-const faceDirection = ({ landmarks }: FaceDetection): FaceDirection => {
+export const faceDirection = ({ landmarks }: FaceDetection): FaceDirection => {
    const leftEye = landmarks[0]!;
    const rightEye = landmarks[1]!;
    const nose = landmarks[2]!;
--- a/web/packages/new/photos/services/ml/index.ts
+++ b/web/packages/new/photos/services/ml/index.ts
@@ -20,14 +20,9 @@ import { getAllLocalFiles } from "../files";
 import { getRemoteFlag, updateRemoteFlag } from "../remote-store";
 import type { SearchPerson } from "../search/types";
 import type { UploadItem } from "../upload/types";
-import { clusterFacesHdb, type CGroup, type FaceCluster } from "./cluster-new";
+import { type CGroup, type FaceCluster } from "./cluster-new";
 import { regenerateFaceCrops } from "./crop";
-import {
-    clearMLDB,
-    faceIndex,
-    faceIndexes,
-    indexableAndIndexedCounts,
-} from "./db";
+import { clearMLDB, faceIndex, indexableAndIndexedCounts } from "./db";
 import type { Face } from "./face";
 import { MLWorker } from "./worker";
 import type { CLIPMatches } from "./worker-types";
@@ -360,8 +355,23 @@ export interface FaceFileNeighbour {
    cosineSimilarity: number;
 }

+// The "WF" suffix is short for "with file".
+export interface ClusterPreviewWF {
+    clusterSize: number;
+    faces: ClusterPreviewFaceWF[];
+}
+
+export interface ClusterPreviewFaceWF {
+    face: Face;
+    enteFile: EnteFile;
+    cosineSimilarity: number;
+}
+
 export interface ClusterDebugPageContents {
-    faceFNs: FaceFileNeighbours[];
+    clusteredCount: number;
+    unclusteredCount: number;
+    // faceFNs: FaceFileNeighbours[];
+    clusterPreviewWFs: ClusterPreviewWF[];
    clusters: FaceCluster[];
    clusterIDForFaceID: Map<string, string>;
 }
@@ -377,48 +387,84 @@ export const wipClusterDebugPageContents = async (): Promise<
    triggerStatusUpdate();

    // const { faceAndNeigbours, clusters, cgroups } = await clusterFaces(
-    const { faceAndNeigbours, clusters, cgroups } = await clusterFacesHdb(
-        await faceIndexes(),
-    );
-    const searchPersons = await convertToSearchPersons(clusters, cgroups);
+    const {
+        clusteredCount,
+        unclusteredCount,
+        clusterPreviews,
+        clusters,
+        cgroups,
+        clusterIDForFaceID,
+    } = await worker().then((w) => w.clusterFacesHdb());
+
+    // const searchPersons = await convertToSearchPersons(clusters, cgroups);

    const localFiles = await getAllLocalFiles();
    const localFileByID = new Map(localFiles.map((f) => [f.id, f]));
    const fileForFace = ({ faceID }: Face) =>
        ensure(localFileByID.get(ensure(fileIDFromFaceID(faceID))));

-    const faceFNs = faceAndNeigbours
-        .map(({ face, neighbours }) => ({
+    // const faceFNs = faceAndNeigbours.map(
+    //     ({ topFace: face, faces: neighbours }) => ({
+    //         face,
+    //         neighbours: neighbours.map(({ face, cosineSimilarity }) => ({
+    //             face,
+    //             enteFile: fileForFace(face),
+    //             cosineSimilarity,
+    //         })),
+    //     }),
+    // );
+    const clusterPreviewWFs = clusterPreviews.map(({ clusterSize, faces }) => ({
+        clusterSize,
+        faces: faces.map(({ face, cosineSimilarity }) => ({
            face,
-            neighbours: neighbours.map(({ face, cosineSimilarity }) => ({
-                face,
-                enteFile: fileForFace(face),
-                cosineSimilarity,
-            })),
-        }))
-        .sort((a, b) => b.face.score - a.face.score);
+            enteFile: fileForFace(face),
+            cosineSimilarity,
+        })),
+    }));

-    const clusterIDForFaceID = new Map(
-        clusters.flatMap((cluster) =>
-            cluster.faceIDs.map((id) => [id, cluster.id]),
-        ),
-    );
+    const clusterByID = new Map(clusters.map((c) => [c.id, c]));
+
+    const searchPersons = cgroups
+        .map((cgroup) => {
+            const faceID = ensure(cgroup.displayFaceID);
+            const fileID = ensure(fileIDFromFaceID(faceID));
+            const file = ensure(localFileByID.get(fileID));
+
+            const faceIDs = cgroup.clusterIDs
+                .map((id) => ensure(clusterByID.get(id)))
+                .flatMap((cluster) => cluster.faceIDs);
+            const fileIDs = faceIDs
+                .map((faceID) => fileIDFromFaceID(faceID))
+                .filter((fileID) => fileID !== undefined);
+
+            return {
+                id: cgroup.id,
+                name: cgroup.name,
+                faceIDs,
+                files: [...new Set(fileIDs)],
+                displayFaceID: faceID,
+                displayFaceFile: file,
+            };
+        })
+        .sort((a, b) => b.faceIDs.length - a.faceIDs.length);

    _wip_isClustering = false;
    _wip_searchPersons = searchPersons;
    triggerStatusUpdate();

-    const prunedFaceFNs = faceFNs.slice(0, 30).map(({ face, neighbours }) => ({
-        face,
-        neighbours: neighbours.slice(0, 30),
-    }));
-
-    return { faceFNs: prunedFaceFNs, clusters, clusterIDForFaceID };
+    return {
+        clusteredCount,
+        unclusteredCount,
+        clusterPreviewWFs,
+        clusters,
+        clusterIDForFaceID,
+    };
 };

 export const wipCluster = () => void wipClusterDebugPageContents();

-const convertToSearchPersons = async (
+// TODO-Cluster remove me
+export const convertToSearchPersons = async (
    clusters: FaceCluster[],
    cgroups: CGroup[],
 ) => {
--- a/web/packages/new/photos/services/ml/worker.ts
+++ b/web/packages/new/photos/services/ml/worker.ts
@@ -24,8 +24,10 @@ import {
    indexCLIP,
    type CLIPIndex,
 } from "./clip";
+import { clusterFacesHdb } from "./cluster-new";
 import { saveFaceCrops } from "./crop";
 import {
+    faceIndexes,
    indexableFileIDs,
    markIndexingFailed,
    saveIndexes,
@@ -272,6 +274,11 @@ export class MLWorker {
            remoteMLData: mlDataByID.get(id),
        }));
    }
+
+    // TODO-Cluster
+    async clusterFacesHdb() {
+        return clusterFacesHdb(await faceIndexes());
+    }
 }

 expose(MLWorker);