[desktop] Clustering WIP - Part x/x (#3093)

- Prefer existing clusters when adding - Add min threshold
2024-09-02 14:08:27 +05:30
parent 7500fdd380 c0ad778c90
commit 5b0c32084a
2 changed files with 108 additions and 4 deletions
--- a/web/apps/photos/src/pages/cluster-debug.tsx
+++ b/web/apps/photos/src/pages/cluster-debug.tsx
@@ -112,6 +112,7 @@ const OptionsForm: React.FC<OptionsFormProps> = ({ onCluster }) => {
                method: "linear",
                minBlur: 10,
                minScore: 0.8,
+                minClusterSize: 2,
                joinThreshold: 0.7,
                batchSize: 12500,
            },
@@ -120,6 +121,7 @@ const OptionsForm: React.FC<OptionsFormProps> = ({ onCluster }) => {
                    method: values.method,
                    minBlur: toFloat(values.minBlur),
                    minScore: toFloat(values.minScore),
+                    minClusterSize: toFloat(values.minClusterSize),
                    joinThreshold: toFloat(values.joinThreshold),
                    batchSize: toFloat(values.batchSize),
                }),
@@ -162,6 +164,13 @@ const OptionsForm: React.FC<OptionsFormProps> = ({ onCluster }) => {
                        size="small"
                        onChange={handleChange}
                    />
+                    <TextField
+                        name="minClusterSize"
+                        label="minClusterSize"
+                        value={values.minClusterSize}
+                        size="small"
+                        onChange={handleChange}
+                    />
                    <TextField
                        name="joinThreshold"
                        label="joinThreshold"
--- a/web/packages/new/photos/services/ml/cluster.ts
+++ b/web/packages/new/photos/services/ml/cluster.ts
@@ -116,6 +116,7 @@ export interface ClusteringOpts {
    method: "linear" | "hdbscan";
    minBlur: number;
    minScore: number;
+    minClusterSize: number;
    batchSize: number;
    joinThreshold: number;
 }
@@ -176,7 +177,14 @@ export const clusterFaces = (
    faceIndexes: FaceIndex[],
    opts: ClusteringOpts,
 ) => {
-    const { method, batchSize, minBlur, minScore, joinThreshold } = opts;
+    const {
+        method,
+        batchSize,
+        minBlur,
+        minScore,
+        minClusterSize,
+        joinThreshold,
+    } = opts;
    const t = Date.now();

    // A flattened array of faces.
@@ -299,7 +307,12 @@ export const clusterFaces = (
        }
    }

-    const sortedClusters = clusters.sort(
+    // Prune clusters that are smaller than the threshold.
+    const validClusters = clusters.filter(
+        (cs) => cs.faceIDs.length > minClusterSize,
+    );
+
+    const sortedClusters = validClusters.sort(
        (a, b) => b.faceIDs.length - a.faceIDs.length,
    );

@@ -361,7 +374,7 @@ export const clusterFaces = (

    const timeTakenMs = Date.now() - t;
    log.info(
-        `Clustered ${faces.length} faces into ${clusters.length} clusters, ${faces.length - clusterIDForFaceID.size} faces remain unclustered (${timeTakenMs} ms)`,
+        `Clustered ${faces.length} faces into ${sortedClusters.length} clusters, ${faces.length - clusterIDForFaceID.size} faces remain unclustered (${timeTakenMs} ms)`,
    );

    return {
@@ -393,7 +406,8 @@ interface ClusterLinearResult {
    clusters: EmbeddingCluster[];
 }

-const clusterLinear = (
+// TODO-Cluster remove me
+export const clusterLinear_Direct = (
    embeddings: number[][],
    threshold: number,
 ): ClusterLinearResult => {
@@ -454,3 +468,84 @@ const clusterLinear = (

    return { clusters: validClusters };
 };
+
+/**
+ * Cluster embeddings, preferring to add each one to the existing cluster of
+ * its most similar clustered neighbour over starting a new cluster.
+ *
+ * Indexes are compared against `undefined` instead of by truthiness, because
+ * 0 is both a valid embedding index and a valid cluster index.
+ *
+ * @param embeddings Vectors whose pairwise dot product is used as their
+ * cosine similarity (so they are expected to already be normalized).
+ * @param threshold Similarity a pair must exceed to count as neighbours.
+ * @returns The clusters (arrays of indexes into `embeddings`) that ended up
+ * with more than one member.
+ */
+const clusterLinear = (
+    embeddings: number[][],
+    threshold: number,
+): ClusterLinearResult => {
+    const clusters: EmbeddingCluster[] = [];
+    const clusterIndexForEmbeddingIndex = new Map<number, number>();
+    // For each embedding
+    for (const [i, ei] of embeddings.entries()) {
+        // If the embedding is already part of a cluster, then skip it.
+        if (clusterIndexForEmbeddingIndex.has(i)) continue;

+        // Find the nearest neighbour from among all the other embeddings.
+        let nnIndex: number | undefined;
+        let nnCosineSimilarity = 0;
+        // Find the nearest cluster from among all the existing clusters.
+        let nClusterIndex: number | undefined;
+        let nClusterCosineSimilarity = 0;
+        for (const [j, ej] of embeddings.entries()) {
+            // ! This is an O(n^2) loop, be careful when adding more code here.

+            // Skip ourselves.
+            if (i === j) continue;

+            // The vectors are already normalized, so we can directly use their
+            // dot product as their cosine similarity.
+            const csim = dotProduct(ei, ej);
+            if (csim > threshold) {
+                if (csim > nnCosineSimilarity) {
+                    nnIndex = j;
+                    nnCosineSimilarity = csim;
+                }
+                if (csim > nClusterCosineSimilarity) {
+                    const jClusterIndex = clusterIndexForEmbeddingIndex.get(j);
+                    if (jClusterIndex !== undefined) {
+                        nClusterIndex = jClusterIndex;
+                        nClusterCosineSimilarity = csim;
+                    }
+                }
+            }
+        }

+        if (nClusterIndex !== undefined) {
+            // Found a neighbouring cluster close enough, add ourselves to that.
+            ensure(clusters[nClusterIndex]).push(i);
+            clusterIndexForEmbeddingIndex.set(i, nClusterIndex);
+        } else if (nnIndex !== undefined) {
+            // A clustered nearest neighbour would have set nClusterIndex above,
+            // so reaching here with one means the bookkeeping is inconsistent.
+            if (clusterIndexForEmbeddingIndex.has(nnIndex)) {
+                // TODO-Cluster remove this case.
+                throw new Error("We shouldn't have reached here");
+            }
+            // Create a new cluster with us and our nearest neighbour.
+            clusterIndexForEmbeddingIndex.set(i, clusters.length);
+            clusterIndexForEmbeddingIndex.set(nnIndex, clusters.length);
+            clusters.push([i, nnIndex]);
+        } else {
+            // We didn't find a neighbour within the threshold. Create a new
+            // cluster with only this embedding.
+            clusterIndexForEmbeddingIndex.set(i, clusters.length);
+            clusters.push([i]);
+        }
+    }

+    // Prune singleton clusters.
+    return { clusters: clusters.filter((cs) => cs.length > 1) };
+};