diff --git a/web/packages/new/photos/services/ml/cluster-hdb.ts b/web/packages/new/photos/services/ml/cluster-hdb.ts new file mode 100644 index 0000000000..3ecda4b5bc --- /dev/null +++ b/web/packages/new/photos/services/ml/cluster-hdb.ts @@ -0,0 +1,35 @@ +import { Hdbscan, type DebugInfo } from "hdbscan"; + +/** + * Each "cluster" is a list of indexes of the embeddings belonging to that + * particular cluster. + */ +export type EmbeddingCluster = number[]; + +export interface ClusterHdbscanResult { + clusters: EmbeddingCluster[]; + noise: number[]; + debugInfo?: DebugInfo; +} + +/** + * Cluster the given {@link embeddings} using hdbscan. + */ +export const clusterHdbscan = ( + embeddings: number[][], +): ClusterHdbscanResult => { + const hdbscan = new Hdbscan({ + input: embeddings, + minClusterSize: 3, + minSamples: 5, + clusterSelectionEpsilon: 0.6, + clusterSelectionMethod: "leaf", + debug: false, + }); + + return { + clusters: hdbscan.getClusters(), + noise: hdbscan.getNoise(), + debugInfo: hdbscan.getDebugInfo(), + }; +}; diff --git a/web/packages/new/photos/services/ml/cluster-new.ts b/web/packages/new/photos/services/ml/cluster-new.ts deleted file mode 100644 index 8bfb00b164..0000000000 --- a/web/packages/new/photos/services/ml/cluster-new.ts +++ /dev/null @@ -1,603 +0,0 @@ -import { newNonSecureID } from "@/base/id-worker"; -import log from "@/base/log"; -import { ensure } from "@/utils/ensure"; -import { clusterFacesHdbscan } from "./cluster"; -import { clusterGroups, faceClusters } from "./db"; -import type { Face, FaceIndex } from "./face"; -import { dotProduct } from "./math"; - -/** - * A face cluster is an set of faces. - * - * Each cluster has an id so that a {@link CGroup} can refer to it. - * - * The cluster is not directly synced to remote. Only clusters that the user - * interacts with get synced to remote, as part of a {@link CGroup}. - */ -export interface FaceCluster { - /** - * A nanoid for this cluster. 
- */ - id: string; - /** - * An unordered set of ids of the faces that belong to this cluster. - * - * For ergonomics of transportation and persistence this is an array, but it - * should conceptually be thought of as a set. - */ - faceIDs: string[]; -} - -/** - * A cgroup ("cluster group") is a group of clusters (possibly containing a - * single cluster) that the user has interacted with. - * - * Interactions include hiding, merging and giving a name and/or a cover photo. - * - * The most frequent interaction is naming a {@link FaceCluster}, which promotes - * it to a become a {@link CGroup}. The promotion comes with the ability to be - * synced with remote (as a "cgroup" user entity). - * - * There after, the user may attach more clusters to the same {@link CGroup}. - * - * > A named cluster group can be thought of as a "person", though this is not - * > necessarily an accurate characterization. e.g. there can be a named cluster - * > group that contains face clusters of pets. - * - * The other form of interaction is hiding. The user may hide a single (unnamed) - * cluster, or they may hide an named {@link CGroup}. In both cases, we promote - * the cluster to a CGroup if needed so that their request to hide gets synced. - * - * While in our local representation we separately maintain clusters and link to - * them from within CGroups by their clusterID, in the remote representation - * clusters themselves don't get synced. Instead, the "cgroup" entities synced - * with remote contain the clusters within themselves. So a group that gets - * synced with remote looks something like: - * - * { id, name, clusters: [{ clusterID, faceIDs }] } - * - */ -export interface CGroup { - /** - * A nanoid for this cluster group. - * - * This is the ID of the "cgroup" user entity (the envelope), and it is not - * contained as part of the group entity payload itself. - */ - id: string; - /** - * A name assigned by the user to this cluster group. 
- * - * The client should handle both empty strings and undefined as indicating a - * cgroup without a name. When the client needs to set this to an "empty" - * value, which happens when hiding an unnamed cluster, it should it to an - * empty string. That is, expect `"" | undefined`, but set `""`. - */ - name: string | undefined; - /** - * An unordered set of ids of the clusters that belong to this group. - * - * For ergonomics of transportation and persistence this is an array, but it - * should conceptually be thought of as a set. - */ - clusterIDs: string[]; - /** - * True if this cluster group should be hidden. - * - * The user can hide both named cluster groups and single unnamed clusters. - * If the user hides a single cluster that was offered as a suggestion to - * them on a client, the client will create a new unnamed cgroup containing - * it, and set its hidden flag to sync it with remote (so that other clients - * can also stop showing this cluster). - */ - isHidden: boolean; - /** - * The ID of the face that should be used as the cover photo for this - * cluster group (if the user has set one). - * - * This is similar to the [@link displayFaceID}, the difference being: - * - * - {@link avatarFaceID} is the face selected by the user. - * - * - {@link displayFaceID} is the automatic placeholder, and only comes - * into effect if the user has not explicitly selected a face. - */ - avatarFaceID: string | undefined; - /** - * Locally determined ID of the "best" face that should be used as the - * display face, to represent this cluster group in the UI. - * - * This property is not synced with remote. For more details, see - * {@link avatarFaceID}. 
- */ - displayFaceID: string | undefined; -} - -export interface ClusteringOpts { - method: "linear" | "hdbscan"; - batchSize: number; - joinThreshold: number; -} - -export interface ClusterPreview { - clusterSize: number; - faces: ClusterPreviewFace[]; -} - -export interface ClusterPreviewFace { - face: Face; - cosineSimilarity: number; - wasMerged: boolean; -} - -/** - * Cluster faces into groups. - * - * [Note: Face clustering algorithm] - * - * A cgroup (cluster group) consists of clusters, each of which itself is a set - * of faces. - * - * cgroup << cluster << face - * - * The clusters are generated locally by clients using the following algorithm: - * - * 1. clusters = [] initially, or fetched from remote. - * - * 2. For each face, find its nearest neighbour in the embedding space. - * - * 3. If no such neighbour is found within our threshold, create a new cluster. - * - * 4. Otherwise assign this face to the same cluster as its nearest neighbour. - * - * This user can then tweak the output of the algorithm by performing the - * following actions to the list of clusters that they can see: - * - * - They can provide a name for a cluster ("name a person"). This upgrades a - * cluster into a "cgroup", which is an entity that gets synced via remote - * to the user's other clients. - * - * - They can attach more clusters to a cgroup ("merge clusters") - * - * - They can remove a cluster from a cgroup ("break clusters"). - * - * After clustering, we also do some routine cleanup. Faces belonging to files - * that have been deleted (including those in Trash) should be pruned off. - * - * We should not make strict assumptions about the clusters we get from remote. - * In particular, the same face ID can be in different clusters. In such cases - * we should assign it arbitrarily assign it to the last cluster we find it in. 
- * Such leeway is intentionally provided to allow clients some slack in how they - * implement the sync without needing to make an blocking API request for every - * user interaction. - */ -export const clusterFaces = async (faceIndexes: FaceIndex[]) => { - const t = Date.now(); - - // A flattened array of faces. - // TODO-Cluster note the 2k slice - const faces = [...enumerateFaces(faceIndexes)].slice(0, 2000); - - // Start with the clusters we already have (either from a previous indexing, - // or fetched from remote). - const clusters = await faceClusters(); - - // For fast reverse lookup - map from cluster ids to their index in the - // clusters array. - const clusterIndexForClusterID = new Map(clusters.map((c, i) => [c.id, i])); - - // For fast reverse lookup - map from face ids to the id of the cluster to - // which they belong. - const clusterIDForFaceID = new Map( - clusters.flatMap((c) => c.faceIDs.map((id) => [id, c.id] as const)), - ); - - // A function to generate new cluster IDs. - const newClusterID = () => newNonSecureID("cluster_"); - - const faceAndNeigbours: FaceNeighbours[] = []; - - // For each face, - for (const [i, fi] of faces.entries()) { - // If the face is already part of a cluster, then skip it. - if (clusterIDForFaceID.get(fi.faceID)) continue; - - // Find the nearest neighbour from among all the other faces. - let nn: Face | undefined; - let nnCosineSimilarity = 0; - let neighbours: FaceNeighbour[] = []; - for (let j = 0; j < faces.length; j++) { - // ! This is an O(n^2) loop, be careful when adding more code here. - - // TODO-Cluster Commenting this here and moving it downward - // // Skip ourselves. - // if (i == j) continue; - - // Can't find a way of avoiding the null assertion here. - // eslint-disable-next-line @typescript-eslint/no-non-null-assertion - const fj = faces[j]!; - - // The vectors are already normalized, so we can directly use their - // dot product as their cosine similarity. 
- const csim = dotProduct(fi.embedding, fj.embedding); - - // TODO-Cluster Delete me and uncomment the check above - // Skip ourselves. - if (i == j) { - neighbours.push({ face: fj, cosineSimilarity: csim }); - continue; - } - - const threshold = fi.blur < 100 || fj.blur < 100 ? 0.7 : 0.6; - if (csim > threshold && csim > nnCosineSimilarity) { - nn = fj; - nnCosineSimilarity = csim; - } - - neighbours.push({ face: fj, cosineSimilarity: csim }); - } - - neighbours = neighbours.sort( - (a, b) => b.cosineSimilarity - a.cosineSimilarity, - ); - faceAndNeigbours.push({ face: fi, neighbours }); - - const { faceID } = fi; - - if (nn) { - // Found a neighbour near enough. - const nnFaceID = nn.faceID; - - // Find the cluster the nearest neighbour belongs to, if any. - const nnClusterID = clusterIDForFaceID.get(nn.faceID); - - if (nnClusterID) { - // If the neighbour is already part of a cluster, also add - // ourselves to that cluster. - - const nnClusterIndex = ensure( - clusterIndexForClusterID.get(nnClusterID), - ); - clusters[nnClusterIndex]?.faceIDs.push(faceID); - clusterIDForFaceID.set(faceID, nnClusterID); - } else { - // Otherwise create a new cluster with us and our nearest - // neighbour. - - const cluster = { - id: newClusterID(), - faceIDs: [faceID, nnFaceID], - }; - clusterIndexForClusterID.set(cluster.id, clusters.length); - clusterIDForFaceID.set(faceID, cluster.id); - clusterIDForFaceID.set(nnFaceID, cluster.id); - clusters.push(cluster); - } - } else { - // We didn't find a neighbour within the threshold. Create a new - // cluster with only this face. - - const cluster = { id: newClusterID(), faceIDs: [faceID] }; - clusterIndexForClusterID.set(cluster.id, clusters.length); - clusterIDForFaceID.set(faceID, cluster.id); - clusters.push(cluster); - } - } - - // Prune too small clusters. 
- const validClusters = clusters.filter(({ faceIDs }) => faceIDs.length > 1); - - let cgroups = await clusterGroups(); - - // TODO-Cluster - Currently we're not syncing with remote or saving anything - // locally, so cgroups will be empty. Create a temporary (unsaved, unsynced) - // cgroup, one per cluster. - cgroups = cgroups.concat( - validClusters.map((c) => ({ - id: c.id, - name: undefined, - clusterIDs: [c.id], - isHidden: false, - avatarFaceID: undefined, - displayFaceID: undefined, - })), - ); - - // For each cluster group, use the highest scoring face in any of its - // clusters as its display face. - const faceForFaceID = new Map(faces.map((f) => [f.faceID, f])); - for (const cgroup of cgroups) { - cgroup.displayFaceID = cgroup.clusterIDs - .map((clusterID) => clusterIndexForClusterID.get(clusterID)) - .filter((i) => i !== undefined) /* 0 is a valid index */ - .flatMap((i) => clusters[i]?.faceIDs ?? []) - .map((faceID) => faceForFaceID.get(faceID)) - .filter((face) => !!face) - .reduce((max, face) => - max.score > face.score ? max : face, - ).faceID; - } - - log.info("ml/cluster", { - faces, - validClusters, - clusterIndexForClusterID: Object.fromEntries(clusterIndexForClusterID), - clusterIDForFaceID: Object.fromEntries(clusterIDForFaceID), - cgroups, - }); - log.info( - `Clustered ${faces.length} faces into ${validClusters.length} clusters (${Date.now() - t} ms)`, - ); - - return { faces, clusters: validClusters, cgroups, faceAndNeigbours }; -}; - -/** - * A generator function that returns a stream of {faceID, embedding} values, - * flattening all the the faces present in the given {@link faceIndices}. - */ -function* enumerateFaces(faceIndices: FaceIndex[]) { - for (const fi of faceIndices) { - for (const f of fi.faces) { - yield f; - } - } -} - -export const clusterFacesHdb = ( - faceIndexes: FaceIndex[], - opts: ClusteringOpts, -) => { - const { batch } = opts; - const t = Date.now(); - - // A flattened array of faces. 
- // TODO-Cluster ad-hoc filtering and slicing - const faces0 = [...enumerateFaces(faceIndexes)].filter((f) => f.blur > 99); - // .slice(0, 6000); - // TODO-Cluster testing code, can be removed once done - const faces = Array(1) - .fill(0) - .flatMap(() => faces0); - - // For fast reverse lookup - map from face ids to the face. - const faceForFaceID = new Map(faces.map((f) => [f.faceID, f])); - - const faceEmbeddings = faces.map(({ embedding }) => embedding); - - // For fast reverse lookup - map from cluster ids to their index in the - // clusters array. - const clusterIndexForClusterID = new Map(); - - // For fast reverse lookup - map from the id of a face to the id of the - // cluster to which it belongs. - const clusterIDForFaceID = new Map(); - - // A function to chain two reverse lookup. - const firstFaceOfCluster = (cluster: FaceCluster) => - ensure(faceForFaceID.get(ensure(cluster.faceIDs[0]))); - - // A function to generate new cluster IDs. - const newClusterID = () => newNonSecureID("cluster_"); - - // The resultant clusters. - // TODO-Cluster Later on, instead of starting from a blank slate, this will - // be list of existing clusters we fetch from remote. - const clusters: FaceCluster[] = []; - - // Process the faces in batches. The faces are already sorted by file ID, - // which is a monotonically increasing integer, so we will also have some - // temporal locality. - // - // The number 2500 was derived by ad-hoc observations and takes a few - // seconds. On a particular test dataset and a particular machine, - // clustering 1k took ~2 seconds, 10k took ~2 mins, while 20k took ~8 mins. - // Memory usage was constant in all these cases. - // - // At around 100k faces, the clustering starts taking hours, and we start - // running into stack overflows. The stack overflows can perhaps be avoided - // by restructuring the code, but hours of uninterruptible work is anyways - // not feasible. 
- - const batchSize = 2500; - for (let i = 0; i < faceEmbeddings.length; i += batchSize) { - const it = Date.now(); - const embeddings = faceEmbeddings.slice(i, i + batchSize); - const { clusters: hdbClusters } = clusterFacesHdbscan(embeddings); - - log.info( - `hdbscan produced ${hdbClusters.length} clusters from ${embeddings.length} faces (${Date.now() - it} ms)`, - ); - - // Merge the new clusters we got from hdbscan into the existing clusters - // if they are "near" them (using some heuristic). - // - // We need to ensure we don't change any of the existing cluster IDs, - // since these might be existing clusters we got from remote. - - for (const hdbCluster of hdbClusters) { - // Find the existing cluster whose (arbitrarily chosen) first face - // is the nearest neighbour of the (arbitrarily chosen) first face - // of the cluster produced by hdbscan. - - const newFace = ensure(faces[i + ensure(hdbCluster[0])]); - - let nnCluster: FaceCluster | undefined; - let nnCosineSimilarity = 0; - for (const existingCluster of clusters) { - const existingFace = firstFaceOfCluster(existingCluster); - - // The vectors are already normalized, so we can directly use their - // dot product as their cosine similarity. - const csim = dotProduct( - existingFace.embedding, - newFace.embedding, - ); - - // Use a higher cosine similarity threshold if either of the two - // faces are blurry. - const threshold = - existingFace.blur < 200 || newFace.blur < 200 ? 0.9 : 0.7; - if (csim > threshold && csim > nnCosineSimilarity) { - nnCluster = existingCluster; - nnCosineSimilarity = csim; - } - } - - if (nnCluster) { - // If we found an existing cluster that is near enough, - // sublimate the cluster produced by hdbscan into that cluster. - for (const j of hdbCluster) { - const { faceID } = ensure(faces[i + j]); - nnCluster.faceIDs.push(faceID); - clusterIDForFaceID.set(faceID, nnCluster.id); - } - } else { - // Otherwise make a new cluster from the cluster produced by - // hdbscan. 
- const clusterID = newClusterID(); - const faceIDs: string[] = []; - for (const j of hdbCluster) { - const { faceID } = ensure(faces[i + j]); - faceIDs.push(faceID); - clusterIDForFaceID.set(faceID, clusterID); - } - clusterIndexForClusterID.set(clusterID, clusters.length); - clusters.push({ id: clusterID, faceIDs }); - } - } - } - - // Convert into the data structure we're using to debug/visualize. - // const faceAndNeigbours: FaceNeighbours[] = []; - // const topFaces = faces.sort((a, b) => b.score - a.score).slice(0, 30); - // for (const fi of topFaces) { - // let neighbours: FaceNeighbour[] = []; - // for (const fj of faces) { - // // The vectors are already normalized, so we can directly use their - // // dot product as their cosine similarity. - // const csim = dotProduct(fi.embedding, fj.embedding); - // neighbours.push({ face: fj, cosineSimilarity: csim }); - // } - - // neighbours = neighbours - // .sort((a, b) => b.cosineSimilarity - a.cosineSimilarity) - // .slice(0, 30); - - // faceAndNeigbours.push({ face: fi, neighbours }); - // } - - // Convert into the data structure we're using to debug/visualize. - // - // > Showing only top 30 and bottom 30 clusters (and only up to 50 faces in - // > each, sorted by cosine distance to highest scoring face in the - // > cluster). - - const sortedClusters = clusters.sort( - (a, b) => b.faceIDs.length - a.faceIDs.length, - ); - const debugClusters = - sortedClusters.length < 60 - ? sortedClusters - : sortedClusters.slice(0, 30).concat(sortedClusters.slice(-30)); - const clusterPreviews: ClusterPreview[] = []; - for (const cluster of debugClusters) { - const faces = cluster.faceIDs.map((id) => - ensure(faceForFaceID.get(id)), - ); - const topFace = faces.reduce((max, face) => - max.score > face.score ? 
max : face, - ); - const previewFaces: ClusterPreviewFace[] = []; - for (const face of faces) { - const csim = dotProduct(topFace.embedding, face.embedding); - previewFaces.push({ face, cosineSimilarity: csim }); - } - clusterPreviews.push({ - clusterSize: cluster.faceIDs.length, - faces: previewFaces - .sort((a, b) => b.cosineSimilarity - a.cosineSimilarity) - .slice(0, 50), - }); - } - - // Prune too small clusters. - // TODO-Cluster this is likely not needed since hdbscan already has a min? - const validClusters = clusters.filter(({ faceIDs }) => faceIDs.length > 1); - - // let cgroups = await clusterGroups(); - - // // TODO-Cluster - Currently we're not syncing with remote or saving anything - // // locally, so cgroups will be empty. Create a temporary (unsaved, unsynced) - // // cgroup, one per cluster. - // cgroups = cgroups.concat( - // validClusters.map((c) => ({ - // id: c.id, - // name: undefined, - // clusterIDs: [c.id], - // isHidden: false, - // avatarFaceID: undefined, - // displayFaceID: undefined, - // })), - // ); - - // // For each cluster group, use the highest scoring face in any of its - // // clusters as its display face. - // for (const cgroup of cgroups) { - // cgroup.displayFaceID = cgroup.clusterIDs - // .map((clusterID) => clusterIndexForClusterID.get(clusterID)) - // .filter((i) => i !== undefined) /* 0 is a valid index */ - // .flatMap((i) => clusters[i]?.faceIDs ?? []) - // .map((faceID) => faceForFaceID.get(faceID)) - // .filter((face) => !!face) - // .reduce((max, face) => - // max.score > face.score ? max : face, - // ).faceID; - // } - - // TODO-Cluster - Currently we're not syncing with remote or saving anything - // locally, so cgroups will be empty. Create a temporary (unsaved, unsynced) - // cgroup, one per cluster. 
- - const cgroups: CGroup[] = []; - for (const cluster of sortedClusters) { - const faces = cluster.faceIDs.map((id) => - ensure(faceForFaceID.get(id)), - ); - const topFace = faces.reduce((max, face) => - max.score > face.score ? max : face, - ); - cgroups.push({ - id: cluster.id, - name: undefined, - clusterIDs: [cluster.id], - isHidden: false, - avatarFaceID: undefined, - displayFaceID: topFace.faceID, - }); - } - - // log.info("ml/cluster", { - // faces, - // validClusters, - // clusterIndexForClusterID: Object.fromEntries(clusterIndexForClusterID), - // clusterIDForFaceID: Object.fromEntries(clusterIDForFaceID), - // cgroups, - // }); - log.info( - `Clustered ${faces.length} faces into ${validClusters.length} clusters, with ${faces.length - clusterIDForFaceID.size} faces remaining unclustered (${Date.now() - t} ms)`, - ); - - const clusteredCount = clusterIDForFaceID.size; - const unclusteredCount = faces.length - clusteredCount; - - return { - // faces, - clusteredCount, - unclusteredCount, - clusters: validClusters, - cgroups, - clusterPreviews, - clusterIDForFaceID, - }; -}; diff --git a/web/packages/new/photos/services/ml/cluster.ts b/web/packages/new/photos/services/ml/cluster.ts index 53e4930d94..f13b889aa1 100644 --- a/web/packages/new/photos/services/ml/cluster.ts +++ b/web/packages/new/photos/services/ml/cluster.ts @@ -1,35 +1,507 @@ -import { Hdbscan, type DebugInfo } from "hdbscan"; +import { newNonSecureID } from "@/base/id-worker"; +import log from "@/base/log"; +import { ensure } from "@/utils/ensure"; +import { type EmbeddingCluster, clusterHdbscan } from "./cluster-hdb"; +import type { Face, FaceIndex } from "./face"; +import { dotProduct } from "./math"; -export type Cluster = number[]; - -export interface ClusterFacesResult { - clusters: Cluster[]; - noise: Cluster; - debugInfo?: DebugInfo; +/** + * A face cluster is an set of faces. + * + * Each cluster has an id so that a {@link CGroup} can refer to it. 
+ * + * The cluster is not directly synced to remote. Only clusters that the user + * interacts with get synced to remote, as part of a {@link CGroup}. + */ +export interface FaceCluster { + /** + * A nanoid for this cluster. + */ + id: string; + /** + * An unordered set of ids of the faces that belong to this cluster. + * + * For ergonomics of transportation and persistence this is an array, but it + * should conceptually be thought of as a set. + */ + faceIDs: string[]; } /** - * Cluster the given {@link faceEmbeddings}. + * A cgroup ("cluster group") is a group of clusters (possibly containing a + * single cluster) that the user has interacted with. + * + * Interactions include hiding, merging and giving a name and/or a cover photo. + * + * The most frequent interaction is naming a {@link FaceCluster}, which promotes + * it to a become a {@link CGroup}. The promotion comes with the ability to be + * synced with remote (as a "cgroup" user entity). + * + * There after, the user may attach more clusters to the same {@link CGroup}. + * + * > A named cluster group can be thought of as a "person", though this is not + * > necessarily an accurate characterization. e.g. there can be a named cluster + * > group that contains face clusters of pets. + * + * The other form of interaction is hiding. The user may hide a single (unnamed) + * cluster, or they may hide an named {@link CGroup}. In both cases, we promote + * the cluster to a CGroup if needed so that their request to hide gets synced. + * + * While in our local representation we separately maintain clusters and link to + * them from within CGroups by their clusterID, in the remote representation + * clusters themselves don't get synced. Instead, the "cgroup" entities synced + * with remote contain the clusters within themselves. 
So a group that gets + * synced with remote looks something like: + * + * { id, name, clusters: [{ clusterID, faceIDs }] } * - * @param faceEmbeddings An array of embeddings produced by our face indexing - * pipeline. Each embedding is for a face detected in an image (a single image - * may have multiple faces detected within it). */ -export const clusterFacesHdbscan = ( - faceEmbeddings: number[][], -): ClusterFacesResult => { - const hdbscan = new Hdbscan({ - input: faceEmbeddings, - minClusterSize: 3, - minSamples: 5, - clusterSelectionEpsilon: 0.6, - clusterSelectionMethod: "leaf", - debug: false, - }); +export interface CGroup { + /** + * A nanoid for this cluster group. + * + * This is the ID of the "cgroup" user entity (the envelope), and it is not + * contained as part of the group entity payload itself. + */ + id: string; + /** + * A name assigned by the user to this cluster group. + * + * The client should handle both empty strings and undefined as indicating a + * cgroup without a name. When the client needs to set this to an "empty" + * value, which happens when hiding an unnamed cluster, it should it to an + * empty string. That is, expect `"" | undefined`, but set `""`. + */ + name: string | undefined; + /** + * An unordered set of ids of the clusters that belong to this group. + * + * For ergonomics of transportation and persistence this is an array, but it + * should conceptually be thought of as a set. + */ + clusterIDs: string[]; + /** + * True if this cluster group should be hidden. + * + * The user can hide both named cluster groups and single unnamed clusters. + * If the user hides a single cluster that was offered as a suggestion to + * them on a client, the client will create a new unnamed cgroup containing + * it, and set its hidden flag to sync it with remote (so that other clients + * can also stop showing this cluster). 
+ */ + isHidden: boolean; + /** + * The ID of the face that should be used as the cover photo for this + * cluster group (if the user has set one). + * + * This is similar to the [@link displayFaceID}, the difference being: + * + * - {@link avatarFaceID} is the face selected by the user. + * + * - {@link displayFaceID} is the automatic placeholder, and only comes + * into effect if the user has not explicitly selected a face. + */ + avatarFaceID: string | undefined; + /** + * Locally determined ID of the "best" face that should be used as the + * display face, to represent this cluster group in the UI. + * + * This property is not synced with remote. For more details, see + * {@link avatarFaceID}. + */ + displayFaceID: string | undefined; +} + +export interface ClusteringOpts { + method: "linear" | "hdbscan"; + batchSize: number; + joinThreshold: number; +} + +export interface ClusterPreview { + clusterSize: number; + faces: ClusterPreviewFace[]; +} + +export interface ClusterPreviewFace { + face: Face; + cosineSimilarity: number; + wasMerged: boolean; +} + +/** + * Cluster faces into groups. + * + * [Note: Face clustering algorithm] + * + * A cgroup (cluster group) consists of clusters, each of which itself is a set + * of faces. + * + * cgroup << cluster << face + * + * The clusters are generated locally by clients using the following algorithm: + * + * 1. clusters = [] initially, or fetched from remote. + * + * 2. For each face, find its nearest neighbour in the embedding space. + * + * 3. If no such neighbour is found within our threshold, create a new cluster. + * + * 4. Otherwise assign this face to the same cluster as its nearest neighbour. + * + * This user can then tweak the output of the algorithm by performing the + * following actions to the list of clusters that they can see: + * + * - They can provide a name for a cluster ("name a person"). 
This upgrades a + * cluster into a "cgroup", which is an entity that gets synced via remote + * to the user's other clients. + * + * - They can attach more clusters to a cgroup ("merge clusters") + * + * - They can remove a cluster from a cgroup ("break clusters"). + * + * After clustering, we also do some routine cleanup. Faces belonging to files + * that have been deleted (including those in Trash) should be pruned off. + * + * We should not make strict assumptions about the clusters we get from remote. + * In particular, the same face ID can be in different clusters. In such cases + * we should assign it arbitrarily assign it to the last cluster we find it in. + * Such leeway is intentionally provided to allow clients some slack in how they + * implement the sync without needing to make an blocking API request for every + * user interaction. + */ +export const clusterFaces = ( + faceIndexes: FaceIndex[], + opts: ClusteringOpts, +) => { + const { batchSize, joinThreshold } = opts; + const t = Date.now(); + + // A flattened array of faces. + // TODO-Cluster ad-hoc filtering and slicing + const faces0 = [...enumerateFaces(faceIndexes)].filter((f) => f.blur > 99); + // .slice(0, 6000); + // TODO-Cluster testing code, can be removed once done + const faces = Array(1) + .fill(0) + .flatMap(() => faces0); + + // For fast reverse lookup - map from face ids to the face. + const faceForFaceID = new Map(faces.map((f) => [f.faceID, f])); + + const faceEmbeddings = faces.map(({ embedding }) => embedding); + + // For fast reverse lookup - map from cluster ids to their index in the + // clusters array. + const clusterIndexForClusterID = new Map(); + + // For fast reverse lookup - map from the id of a face to the id of the + // cluster to which it belongs. + const clusterIDForFaceID = new Map(); + + // A function to chain two reverse lookup. 
+ const firstFaceOfCluster = (cluster: FaceCluster) => + ensure(faceForFaceID.get(ensure(cluster.faceIDs[0]))); + + // A function to generate new cluster IDs. + const newClusterID = () => newNonSecureID("cluster_"); + + // The resultant clusters. + // TODO-Cluster Later on, instead of starting from a blank slate, this will + // be list of existing clusters we fetch from remote. + const clusters: FaceCluster[] = []; + + // Process the faces in batches. The faces are already sorted by file ID, + // which is a monotonically increasing integer, so we will also have some + // temporal locality. + // + // The number 2500 was derived by ad-hoc observations and takes a few + // seconds. On a particular test dataset and a particular machine, + // clustering 1k took ~2 seconds, 10k took ~2 mins, while 20k took ~8 mins. + // Memory usage was constant in all these cases. + // + // At around 100k faces, the clustering starts taking hours, and we start + // running into stack overflows. The stack overflows can perhaps be avoided + // by restructuring the code, but hours of uninterruptible work is anyways + // not feasible. + + const batchSize = 2500; + for (let i = 0; i < faceEmbeddings.length; i += batchSize) { + const it = Date.now(); + const embeddings = faceEmbeddings.slice(i, i + batchSize); + const { clusters: hdbClusters } = clusterHdbscan(embeddings); + + log.info( + `hdbscan produced ${hdbClusters.length} clusters from ${embeddings.length} faces (${Date.now() - it} ms)`, + ); + + // Merge the new clusters we got from hdbscan into the existing clusters + // if they are "near" them (using some heuristic). + // + // We need to ensure we don't change any of the existing cluster IDs, + // since these might be existing clusters we got from remote. + + for (const hdbCluster of hdbClusters) { + // Find the existing cluster whose (arbitrarily chosen) first face + // is the nearest neighbour of the (arbitrarily chosen) first face + // of the cluster produced by hdbscan. 
+ + const newFace = ensure(faces[i + ensure(hdbCluster[0])]); + + let nnCluster: FaceCluster | undefined; + let nnCosineSimilarity = 0; + for (const existingCluster of clusters) { + const existingFace = firstFaceOfCluster(existingCluster); + + // The vectors are already normalized, so we can directly use their + // dot product as their cosine similarity. + const csim = dotProduct( + existingFace.embedding, + newFace.embedding, + ); + + // Use a higher cosine similarity threshold if either of the two + // faces are blurry. + const threshold = + existingFace.blur < 200 || newFace.blur < 200 ? 0.9 : 0.7; + if (csim > threshold && csim > nnCosineSimilarity) { + nnCluster = existingCluster; + nnCosineSimilarity = csim; + } + } + + if (nnCluster) { + // If we found an existing cluster that is near enough, + // sublimate the cluster produced by hdbscan into that cluster. + for (const j of hdbCluster) { + const { faceID } = ensure(faces[i + j]); + nnCluster.faceIDs.push(faceID); + clusterIDForFaceID.set(faceID, nnCluster.id); + } + } else { + // Otherwise make a new cluster from the cluster produced by + // hdbscan. + const clusterID = newClusterID(); + const faceIDs: string[] = []; + for (const j of hdbCluster) { + const { faceID } = ensure(faces[i + j]); + faceIDs.push(faceID); + clusterIDForFaceID.set(faceID, clusterID); + } + clusterIndexForClusterID.set(clusterID, clusters.length); + clusters.push({ id: clusterID, faceIDs }); + } + } + } + + // Convert into the data structure we're using to debug/visualize. + // const faceAndNeigbours: FaceNeighbours[] = []; + // const topFaces = faces.sort((a, b) => b.score - a.score).slice(0, 30); + // for (const fi of topFaces) { + // let neighbours: FaceNeighbour[] = []; + // for (const fj of faces) { + // // The vectors are already normalized, so we can directly use their + // // dot product as their cosine similarity. 
+ // const csim = dotProduct(fi.embedding, fj.embedding); + // neighbours.push({ face: fj, cosineSimilarity: csim }); + // } + + // neighbours = neighbours + // .sort((a, b) => b.cosineSimilarity - a.cosineSimilarity) + // .slice(0, 30); + + // faceAndNeigbours.push({ face: fi, neighbours }); + // } + + // Convert into the data structure we're using to debug/visualize. + // + // > Showing only top 30 and bottom 30 clusters (and only up to 50 faces in + // > each, sorted by cosine distance to highest scoring face in the + // > cluster). + + const sortedClusters = clusters.sort( + (a, b) => b.faceIDs.length - a.faceIDs.length, + ); + const debugClusters = + sortedClusters.length < 60 + ? sortedClusters + : sortedClusters.slice(0, 30).concat(sortedClusters.slice(-30)); + const clusterPreviews: ClusterPreview[] = []; + for (const cluster of debugClusters) { + const faces = cluster.faceIDs.map((id) => + ensure(faceForFaceID.get(id)), + ); + const topFace = faces.reduce((max, face) => + max.score > face.score ? max : face, + ); + const previewFaces: ClusterPreviewFace[] = []; + for (const face of faces) { + const csim = dotProduct(topFace.embedding, face.embedding); + previewFaces.push({ face, cosineSimilarity: csim }); + } + clusterPreviews.push({ + clusterSize: cluster.faceIDs.length, + faces: previewFaces + .sort((a, b) => b.cosineSimilarity - a.cosineSimilarity) + .slice(0, 50), + }); + } + + // Prune too small clusters. + // TODO-Cluster this is likely not needed since hdbscan already has a min? + const validClusters = clusters.filter(({ faceIDs }) => faceIDs.length > 1); + + // let cgroups = await clusterGroups(); + + // // TODO-Cluster - Currently we're not syncing with remote or saving anything + // // locally, so cgroups will be empty. Create a temporary (unsaved, unsynced) + // // cgroup, one per cluster. 
+ // cgroups = cgroups.concat( + // validClusters.map((c) => ({ + // id: c.id, + // name: undefined, + // clusterIDs: [c.id], + // isHidden: false, + // avatarFaceID: undefined, + // displayFaceID: undefined, + // })), + // ); + + // // For each cluster group, use the highest scoring face in any of its + // // clusters as its display face. + // for (const cgroup of cgroups) { + // cgroup.displayFaceID = cgroup.clusterIDs + // .map((clusterID) => clusterIndexForClusterID.get(clusterID)) + // .filter((i) => i !== undefined) /* 0 is a valid index */ + // .flatMap((i) => clusters[i]?.faceIDs ?? []) + // .map((faceID) => faceForFaceID.get(faceID)) + // .filter((face) => !!face) + // .reduce((max, face) => + // max.score > face.score ? max : face, + // ).faceID; + // } + + // TODO-Cluster - Currently we're not syncing with remote or saving anything + // locally, so cgroups will be empty. Create a temporary (unsaved, unsynced) + // cgroup, one per cluster. + + const cgroups: CGroup[] = []; + for (const cluster of sortedClusters) { + const faces = cluster.faceIDs.map((id) => + ensure(faceForFaceID.get(id)), + ); + const topFace = faces.reduce((max, face) => + max.score > face.score ? 
max : face, + ); + cgroups.push({ + id: cluster.id, + name: undefined, + clusterIDs: [cluster.id], + isHidden: false, + avatarFaceID: undefined, + displayFaceID: topFace.faceID, + }); + } + + // log.info("ml/cluster", { + // faces, + // validClusters, + // clusterIndexForClusterID: Object.fromEntries(clusterIndexForClusterID), + // clusterIDForFaceID: Object.fromEntries(clusterIDForFaceID), + // cgroups, + // }); + log.info( + `Clustered ${faces.length} faces into ${validClusters.length} clusters, with ${faces.length - clusterIDForFaceID.size} faces remaining unclustered (${Date.now() - t} ms)`, + ); + + const clusteredCount = clusterIDForFaceID.size; + const unclusteredCount = faces.length - clusteredCount; return { - clusters: hdbscan.getClusters(), - noise: hdbscan.getNoise(), - debugInfo: hdbscan.getDebugInfo(), + // faces, + clusteredCount, + unclusteredCount, + clusters: validClusters, + cgroups, + clusterPreviews, + clusterIDForFaceID, }; }; + +/** + * A generator function that returns a stream of {faceID, embedding} values, + * flattening all the the faces present in the given {@link faceIndices}. + */ +function* enumerateFaces(faceIndices: FaceIndex[]) { + for (const fi of faceIndices) { + for (const f of fi.faces) { + yield f; + } + } +} + +interface ClusterLinearResult { + clusters: EmbeddingCluster[]; +} + +const clusterLinear = ( + embeddings: number[][], + threshold: number, +): ClusterLinearResult => { + const clusters: EmbeddingCluster[] = []; + const clusterIndexForEmbeddingIndex = new Map(); + // For each embedding + for (const [i, ei] of embeddings.entries()) { + // If the embedding is already part of a cluster, then skip it. + if (clusterIndexForEmbeddingIndex.get(i)) continue; + + // Find the nearest neighbour from among all the other embeddings. + let nnIndex: number | undefined; + let nnCosineSimilarity = 0; + for (const [j, ej] of embeddings.entries()) { + // ! This is an O(n^2) loop, be careful when adding more code here. 
+ + // Skip ourselves. + if (i == j) continue; + + // The vectors are already normalized, so we can directly use their + // dot product as their cosine similarity. + const csim = dotProduct(ei, ej); + if (csim > threshold && csim > nnCosineSimilarity) { + nnIndex = j; + nnCosineSimilarity = csim; + } + } + + if (nnIndex) { + // Find the cluster the nearest neighbour belongs to, if any. + const nnClusterIndex = clusterIndexForEmbeddingIndex.get(nnIndex); + + if (nnClusterIndex) { + // If the neighbour is already part of a cluster, also add + // ourselves to that cluster. + + ensure(clusters[nnClusterIndex]).push(i); + clusterIndexForEmbeddingIndex.set(i, nnClusterIndex); + } else { + // Otherwise create a new cluster with us and our nearest + // neighbour. + + clusterIndexForEmbeddingIndex.set(i, clusters.length); + clusterIndexForEmbeddingIndex.set(nnIndex, clusters.length); + clusters.push([i, nnIndex]); + } + } else { + // We didn't find a neighbour within the threshold. Create a new + // cluster with only this embedding. + + clusterIndexForEmbeddingIndex.set(i, clusters.length); + clusters.push([i]); + } + } + + // Prune singletone clusters. 
+    const validClusters = clusters.filter((cs) => cs.length > 1);
+
+    return { clusters: validClusters };
+};
diff --git a/web/packages/new/photos/services/ml/db.ts b/web/packages/new/photos/services/ml/db.ts
index f6d2043752..5f57ea30e1 100644
--- a/web/packages/new/photos/services/ml/db.ts
+++ b/web/packages/new/photos/services/ml/db.ts
@@ -3,7 +3,7 @@ import log from "@/base/log";
 import localForage from "@ente/shared/storage/localForage";
 import { deleteDB, openDB, type DBSchema } from "idb";
 import type { LocalCLIPIndex } from "./clip";
-import type { CGroup, FaceCluster } from "./cluster-new";
+import type { CGroup, FaceCluster } from "./cluster";
 import type { LocalFaceIndex } from "./face";
 
 /**
diff --git a/web/packages/new/photos/services/ml/index.ts b/web/packages/new/photos/services/ml/index.ts
index c5ff83c2ef..d4f3c862e3 100644
--- a/web/packages/new/photos/services/ml/index.ts
+++ b/web/packages/new/photos/services/ml/index.ts
@@ -24,7 +24,7 @@ import {
     type ClusteringOpts,
     type ClusterPreviewFace,
     type FaceCluster,
-} from "./cluster-new";
+} from "./cluster";
 import { regenerateFaceCrops } from "./crop";
 import { clearMLDB, faceIndex, indexableAndIndexedCounts } from "./db";
 import type { Face } from "./face";
@@ -386,7 +386,7 @@ export const wipClusterDebugPageContents = async (
         clusters,
         cgroups,
         unclusteredFaces,
-    } = await worker().then((w) => w.clusterFacesHdb(opts));
+    } = await worker().then((w) => w.clusterFaces(opts));
 
     const localFiles = await getAllLocalFiles();
     const localFileByID = new Map(localFiles.map((f) => [f.id, f]));
diff --git a/web/packages/new/photos/services/ml/worker.ts b/web/packages/new/photos/services/ml/worker.ts
index 6eff182347..518bfb2804 100644
--- a/web/packages/new/photos/services/ml/worker.ts
+++ b/web/packages/new/photos/services/ml/worker.ts
@@ -24,7 +24,7 @@ import {
     indexCLIP,
     type CLIPIndex,
 } from "./clip";
-import { clusterFacesHdb, type ClusteringOpts } from "./cluster-new";
+import { clusterFaces, type ClusteringOpts } from "./cluster";
 import { saveFaceCrops } from "./crop";
 import {
     faceIndexes,
@@ -276,8 +276,8 @@ export class MLWorker {
     }
 
     // TODO-Cluster
-    async clusterFacesHdb(opts: ClusteringOpts) {
-        return clusterFacesHdb(await faceIndexes(), opts);
+    async clusterFaces(opts: ClusteringOpts) {
+        return clusterFaces(await faceIndexes(), opts);
     }
 }
diff --git a/web/packages/new/photos/services/user-entity.ts b/web/packages/new/photos/services/user-entity.ts
index 7e26726dd5..121171d214 100644
--- a/web/packages/new/photos/services/user-entity.ts
+++ b/web/packages/new/photos/services/user-entity.ts
@@ -12,7 +12,7 @@ import { ensure } from "@/utils/ensure";
 import { nullToUndefined } from "@/utils/transform";
 import { z } from "zod";
 import { gunzip } from "./gzip";
-import type { CGroup } from "./ml/cluster-new";
+import type { CGroup } from "./ml/cluster";
 import { applyCGroupDiff } from "./ml/db";
 
 /**