Manav Rathi
2024-08-30 17:05:16 +05:30
parent 598d5aab10
commit 48e00a0ecc
7 changed files with 539 additions and 635 deletions

View File

@@ -0,0 +1,35 @@
import { Hdbscan, type DebugInfo } from "hdbscan";
/**
* Each "cluster" is a list of indexes of the embeddings belonging to that
* particular cluster.
*/
export type EmbeddingCluster = number[];
export interface ClusterHdbscanResult {
clusters: EmbeddingCluster[];
noise: number[];
debugInfo?: DebugInfo;
}
/**
* Cluster the given {@link embeddings} using hdbscan.
*/
export const clusterHdbscan = (
embeddings: number[][],
): ClusterHdbscanResult => {
const hdbscan = new Hdbscan({
input: embeddings,
minClusterSize: 3,
minSamples: 5,
clusterSelectionEpsilon: 0.6,
clusterSelectionMethod: "leaf",
debug: false,
});
return {
clusters: hdbscan.getClusters(),
noise: hdbscan.getNoise(),
debugInfo: hdbscan.getDebugInfo(),
};
};
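// Usage sketch (hedged): `embeddings` stands in for the unit-normalized
// face embeddings produced by the indexing pipeline, and each returned
// cluster is a list of indexes into that array.
//
//     declare const embeddings: number[][];
//     const { clusters, noise } = clusterHdbscan(embeddings);
//     clusters.forEach((c, i) =>
//         console.log(`cluster ${i} has ${c.length} faces`),
//     );
//     console.log(`${noise.length} embeddings were left as noise`);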

View File

@@ -1,603 +0,0 @@
import { newNonSecureID } from "@/base/id-worker";
import log from "@/base/log";
import { ensure } from "@/utils/ensure";
import { clusterFacesHdbscan } from "./cluster";
import { clusterGroups, faceClusters } from "./db";
import type { Face, FaceIndex } from "./face";
import { dotProduct } from "./math";
/**
* A face cluster is a set of faces.
*
* Each cluster has an id so that a {@link CGroup} can refer to it.
*
* The cluster is not directly synced to remote. Only clusters that the user
* interacts with get synced to remote, as part of a {@link CGroup}.
*/
export interface FaceCluster {
/**
* A nanoid for this cluster.
*/
id: string;
/**
* An unordered set of ids of the faces that belong to this cluster.
*
* For ergonomics of transportation and persistence this is an array, but it
* should conceptually be thought of as a set.
*/
faceIDs: string[];
}
/**
* A cgroup ("cluster group") is a group of clusters (possibly containing a
* single cluster) that the user has interacted with.
*
* Interactions include hiding, merging and giving a name and/or a cover photo.
*
* The most frequent interaction is naming a {@link FaceCluster}, which promotes
* it to become a {@link CGroup}. The promotion comes with the ability to be
* synced with remote (as a "cgroup" user entity).
*
* Thereafter, the user may attach more clusters to the same {@link CGroup}.
*
* > A named cluster group can be thought of as a "person", though this is not
* > necessarily an accurate characterization. e.g. there can be a named cluster
* > group that contains face clusters of pets.
*
* The other form of interaction is hiding. The user may hide a single (unnamed)
* cluster, or they may hide a named {@link CGroup}. In both cases, we promote
* the cluster to a CGroup if needed so that their request to hide gets synced.
*
* While in our local representation we separately maintain clusters and link to
* them from within CGroups by their clusterID, in the remote representation
* clusters themselves don't get synced. Instead, the "cgroup" entities synced
* with remote contain the clusters within themselves. So a group that gets
* synced with remote looks something like:
*
* { id, name, clusters: [{ clusterID, faceIDs }] }
*
*/
export interface CGroup {
/**
* A nanoid for this cluster group.
*
* This is the ID of the "cgroup" user entity (the envelope), and it is not
* contained as part of the group entity payload itself.
*/
id: string;
/**
* A name assigned by the user to this cluster group.
*
* The client should handle both empty strings and undefined as indicating a
* cgroup without a name. When the client needs to set this to an "empty"
* value, which happens when hiding an unnamed cluster, it should set it to
* an empty string. That is, expect `"" | undefined`, but set `""`.
*/
name: string | undefined;
/**
* An unordered set of ids of the clusters that belong to this group.
*
* For ergonomics of transportation and persistence this is an array, but it
* should conceptually be thought of as a set.
*/
clusterIDs: string[];
/**
* True if this cluster group should be hidden.
*
* The user can hide both named cluster groups and single unnamed clusters.
* If the user hides a single cluster that was offered as a suggestion to
* them on a client, the client will create a new unnamed cgroup containing
* it, and set its hidden flag to sync it with remote (so that other clients
* can also stop showing this cluster).
*/
isHidden: boolean;
/**
* The ID of the face that should be used as the cover photo for this
* cluster group (if the user has set one).
*
* This is similar to the {@link displayFaceID}, the difference being:
*
* - {@link avatarFaceID} is the face selected by the user.
*
* - {@link displayFaceID} is the automatic placeholder, and only comes
* into effect if the user has not explicitly selected a face.
*/
avatarFaceID: string | undefined;
/**
* Locally determined ID of the "best" face that should be used as the
* display face, to represent this cluster group in the UI.
*
* This property is not synced with remote. For more details, see
* {@link avatarFaceID}.
*/
displayFaceID: string | undefined;
}
export interface ClusteringOpts {
method: "linear" | "hdbscan";
batchSize: number;
joinThreshold: number;
}
export interface ClusterPreview {
clusterSize: number;
faces: ClusterPreviewFace[];
}
export interface ClusterPreviewFace {
face: Face;
cosineSimilarity: number;
wasMerged: boolean;
}
// Debug helper types used by clusterFaces below. They are not imported from
// elsewhere, so they are declared here; their shapes are inferred from usage.
export interface FaceNeighbour {
face: Face;
cosineSimilarity: number;
}
export interface FaceNeighbours {
face: Face;
neighbours: FaceNeighbour[];
}
/**
* Cluster faces into groups.
*
* [Note: Face clustering algorithm]
*
* A cgroup (cluster group) consists of clusters, each of which itself is a set
* of faces.
*
* cgroup << cluster << face
*
* The clusters are generated locally by clients using the following algorithm:
*
* 1. clusters = [] initially, or fetched from remote.
*
* 2. For each face, find its nearest neighbour in the embedding space.
*
* 3. If no such neighbour is found within our threshold, create a new cluster.
*
* 4. Otherwise assign this face to the same cluster as its nearest neighbour.
*
* The user can then tweak the output of the algorithm by performing the
* following actions on the list of clusters that they can see:
*
* - They can provide a name for a cluster ("name a person"). This upgrades a
* cluster into a "cgroup", which is an entity that gets synced via remote
* to the user's other clients.
*
* - They can attach more clusters to a cgroup ("merge clusters")
*
* - They can remove a cluster from a cgroup ("break clusters").
*
* After clustering, we also do some routine cleanup. Faces belonging to files
* that have been deleted (including those in Trash) should be pruned off.
*
* We should not make strict assumptions about the clusters we get from remote.
* In particular, the same face ID can be in different clusters. In such cases
* we should arbitrarily assign it to the last cluster we find it in.
* Such leeway is intentionally provided to allow clients some slack in how they
* implement the sync without needing to make a blocking API request for every
* user interaction.
*/
export const clusterFaces = async (faceIndexes: FaceIndex[]) => {
const t = Date.now();
// A flattened array of faces.
// TODO-Cluster note the 2k slice
const faces = [...enumerateFaces(faceIndexes)].slice(0, 2000);
// Start with the clusters we already have (either from a previous indexing,
// or fetched from remote).
const clusters = await faceClusters();
// For fast reverse lookup - map from cluster ids to their index in the
// clusters array.
const clusterIndexForClusterID = new Map(clusters.map((c, i) => [c.id, i]));
// For fast reverse lookup - map from face ids to the id of the cluster to
// which they belong.
const clusterIDForFaceID = new Map(
clusters.flatMap((c) => c.faceIDs.map((id) => [id, c.id] as const)),
);
// A function to generate new cluster IDs.
const newClusterID = () => newNonSecureID("cluster_");
const faceAndNeighbours: FaceNeighbours[] = [];
// For each face,
for (const [i, fi] of faces.entries()) {
// If the face is already part of a cluster, then skip it.
if (clusterIDForFaceID.get(fi.faceID)) continue;
// Find the nearest neighbour from among all the other faces.
let nn: Face | undefined;
let nnCosineSimilarity = 0;
let neighbours: FaceNeighbour[] = [];
for (let j = 0; j < faces.length; j++) {
// ! This is an O(n^2) loop, be careful when adding more code here.
// TODO-Cluster Commenting this here and moving it downward
// // Skip ourselves.
// if (i == j) continue;
// Can't find a way of avoiding the null assertion here.
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
const fj = faces[j]!;
// The vectors are already normalized, so we can directly use their
// dot product as their cosine similarity.
const csim = dotProduct(fi.embedding, fj.embedding);
// TODO-Cluster Delete me and uncomment the check above
// Skip ourselves.
if (i == j) {
neighbours.push({ face: fj, cosineSimilarity: csim });
continue;
}
const threshold = fi.blur < 100 || fj.blur < 100 ? 0.7 : 0.6;
if (csim > threshold && csim > nnCosineSimilarity) {
nn = fj;
nnCosineSimilarity = csim;
}
neighbours.push({ face: fj, cosineSimilarity: csim });
}
neighbours = neighbours.sort(
(a, b) => b.cosineSimilarity - a.cosineSimilarity,
);
faceAndNeighbours.push({ face: fi, neighbours });
const { faceID } = fi;
if (nn) {
// Found a neighbour near enough.
const nnFaceID = nn.faceID;
// Find the cluster the nearest neighbour belongs to, if any.
const nnClusterID = clusterIDForFaceID.get(nn.faceID);
if (nnClusterID) {
// If the neighbour is already part of a cluster, also add
// ourselves to that cluster.
const nnClusterIndex = ensure(
clusterIndexForClusterID.get(nnClusterID),
);
clusters[nnClusterIndex]?.faceIDs.push(faceID);
clusterIDForFaceID.set(faceID, nnClusterID);
} else {
// Otherwise create a new cluster with us and our nearest
// neighbour.
const cluster = {
id: newClusterID(),
faceIDs: [faceID, nnFaceID],
};
clusterIndexForClusterID.set(cluster.id, clusters.length);
clusterIDForFaceID.set(faceID, cluster.id);
clusterIDForFaceID.set(nnFaceID, cluster.id);
clusters.push(cluster);
}
} else {
// We didn't find a neighbour within the threshold. Create a new
// cluster with only this face.
const cluster = { id: newClusterID(), faceIDs: [faceID] };
clusterIndexForClusterID.set(cluster.id, clusters.length);
clusterIDForFaceID.set(faceID, cluster.id);
clusters.push(cluster);
}
}
// Prune too small clusters.
const validClusters = clusters.filter(({ faceIDs }) => faceIDs.length > 1);
let cgroups = await clusterGroups();
// TODO-Cluster - Currently we're not syncing with remote or saving anything
// locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
// cgroup, one per cluster.
cgroups = cgroups.concat(
validClusters.map((c) => ({
id: c.id,
name: undefined,
clusterIDs: [c.id],
isHidden: false,
avatarFaceID: undefined,
displayFaceID: undefined,
})),
);
// For each cluster group, use the highest scoring face in any of its
// clusters as its display face.
const faceForFaceID = new Map(faces.map((f) => [f.faceID, f]));
for (const cgroup of cgroups) {
cgroup.displayFaceID = cgroup.clusterIDs
.map((clusterID) => clusterIndexForClusterID.get(clusterID))
.filter((i) => i !== undefined) /* 0 is a valid index */
.flatMap((i) => clusters[i]?.faceIDs ?? [])
.map((faceID) => faceForFaceID.get(faceID))
.filter((face) => !!face)
.reduce((max, face) =>
max.score > face.score ? max : face,
).faceID;
}
log.info("ml/cluster", {
faces,
validClusters,
clusterIndexForClusterID: Object.fromEntries(clusterIndexForClusterID),
clusterIDForFaceID: Object.fromEntries(clusterIDForFaceID),
cgroups,
});
log.info(
`Clustered ${faces.length} faces into ${validClusters.length} clusters (${Date.now() - t} ms)`,
);
return { faces, clusters: validClusters, cgroups, faceAndNeighbours };
};
/**
* A generator function that returns a stream of faces, flattening all the
* faces present in the given {@link faceIndices}.
*/
function* enumerateFaces(faceIndices: FaceIndex[]) {
for (const fi of faceIndices) {
for (const f of fi.faces) {
yield f;
}
}
}
export const clusterFacesHdb = (
faceIndexes: FaceIndex[],
opts: ClusteringOpts,
) => {
// (opts is currently unused here; the batch size is hardcoded below.)
const t = Date.now();
// A flattened array of faces.
// TODO-Cluster ad-hoc filtering and slicing
const faces0 = [...enumerateFaces(faceIndexes)].filter((f) => f.blur > 99);
// .slice(0, 6000);
// TODO-Cluster testing code, can be removed once done
const faces = Array(1)
.fill(0)
.flatMap(() => faces0);
// For fast reverse lookup - map from face ids to the face.
const faceForFaceID = new Map(faces.map((f) => [f.faceID, f]));
const faceEmbeddings = faces.map(({ embedding }) => embedding);
// For fast reverse lookup - map from cluster ids to their index in the
// clusters array.
const clusterIndexForClusterID = new Map<string, number>();
// For fast reverse lookup - map from the id of a face to the id of the
// cluster to which it belongs.
const clusterIDForFaceID = new Map<string, string>();
// A function to chain two reverse lookups.
const firstFaceOfCluster = (cluster: FaceCluster) =>
ensure(faceForFaceID.get(ensure(cluster.faceIDs[0])));
// A function to generate new cluster IDs.
const newClusterID = () => newNonSecureID("cluster_");
// The resultant clusters.
// TODO-Cluster Later on, instead of starting from a blank slate, this will
// be the list of existing clusters we fetch from remote.
const clusters: FaceCluster[] = [];
// Process the faces in batches. The faces are already sorted by file ID,
// which is a monotonically increasing integer, so we will also have some
// temporal locality.
//
// The number 2500 was derived by ad-hoc observations and takes a few
// seconds. On a particular test dataset and a particular machine,
// clustering 1k took ~2 seconds, 10k took ~2 mins, while 20k took ~8 mins.
// Memory usage was constant in all these cases.
//
// At around 100k faces, the clustering starts taking hours, and we start
// running into stack overflows. The stack overflows can perhaps be avoided
// by restructuring the code, but hours of uninterruptible work is in any
// case not feasible.
const batchSize = 2500;
for (let i = 0; i < faceEmbeddings.length; i += batchSize) {
const it = Date.now();
const embeddings = faceEmbeddings.slice(i, i + batchSize);
const { clusters: hdbClusters } = clusterFacesHdbscan(embeddings);
log.info(
`hdbscan produced ${hdbClusters.length} clusters from ${embeddings.length} faces (${Date.now() - it} ms)`,
);
// Merge the new clusters we got from hdbscan into the existing clusters
// if they are "near" them (using some heuristic).
//
// We need to ensure we don't change any of the existing cluster IDs,
// since these might be existing clusters we got from remote.
for (const hdbCluster of hdbClusters) {
// Find the existing cluster whose (arbitrarily chosen) first face
// is the nearest neighbour of the (arbitrarily chosen) first face
// of the cluster produced by hdbscan.
const newFace = ensure(faces[i + ensure(hdbCluster[0])]);
let nnCluster: FaceCluster | undefined;
let nnCosineSimilarity = 0;
for (const existingCluster of clusters) {
const existingFace = firstFaceOfCluster(existingCluster);
// The vectors are already normalized, so we can directly use their
// dot product as their cosine similarity.
const csim = dotProduct(
existingFace.embedding,
newFace.embedding,
);
// Use a higher cosine similarity threshold if either of the two
// faces are blurry.
const threshold =
existingFace.blur < 200 || newFace.blur < 200 ? 0.9 : 0.7;
if (csim > threshold && csim > nnCosineSimilarity) {
nnCluster = existingCluster;
nnCosineSimilarity = csim;
}
}
if (nnCluster) {
// If we found an existing cluster that is near enough,
// subsume the cluster produced by hdbscan into that cluster.
for (const j of hdbCluster) {
const { faceID } = ensure(faces[i + j]);
nnCluster.faceIDs.push(faceID);
clusterIDForFaceID.set(faceID, nnCluster.id);
}
} else {
// Otherwise make a new cluster from the cluster produced by
// hdbscan.
const clusterID = newClusterID();
const faceIDs: string[] = [];
for (const j of hdbCluster) {
const { faceID } = ensure(faces[i + j]);
faceIDs.push(faceID);
clusterIDForFaceID.set(faceID, clusterID);
}
clusterIndexForClusterID.set(clusterID, clusters.length);
clusters.push({ id: clusterID, faceIDs });
}
}
}
// Convert into the data structure we're using to debug/visualize.
// const faceAndNeigbours: FaceNeighbours[] = [];
// const topFaces = faces.sort((a, b) => b.score - a.score).slice(0, 30);
// for (const fi of topFaces) {
// let neighbours: FaceNeighbour[] = [];
// for (const fj of faces) {
// // The vectors are already normalized, so we can directly use their
// // dot product as their cosine similarity.
// const csim = dotProduct(fi.embedding, fj.embedding);
// neighbours.push({ face: fj, cosineSimilarity: csim });
// }
// neighbours = neighbours
// .sort((a, b) => b.cosineSimilarity - a.cosineSimilarity)
// .slice(0, 30);
// faceAndNeigbours.push({ face: fi, neighbours });
// }
// Convert into the data structure we're using to debug/visualize.
//
// > Showing only top 30 and bottom 30 clusters (and only up to 50 faces in
// > each, sorted by cosine distance to highest scoring face in the
// > cluster).
const sortedClusters = clusters.sort(
(a, b) => b.faceIDs.length - a.faceIDs.length,
);
const debugClusters =
sortedClusters.length < 60
? sortedClusters
: sortedClusters.slice(0, 30).concat(sortedClusters.slice(-30));
const clusterPreviews: ClusterPreview[] = [];
for (const cluster of debugClusters) {
const faces = cluster.faceIDs.map((id) =>
ensure(faceForFaceID.get(id)),
);
const topFace = faces.reduce((max, face) =>
max.score > face.score ? max : face,
);
const previewFaces: ClusterPreviewFace[] = [];
for (const face of faces) {
const csim = dotProduct(topFace.embedding, face.embedding);
// wasMerged isn't tracked in this pass yet, so default it to false.
previewFaces.push({ face, cosineSimilarity: csim, wasMerged: false });
}
clusterPreviews.push({
clusterSize: cluster.faceIDs.length,
faces: previewFaces
.sort((a, b) => b.cosineSimilarity - a.cosineSimilarity)
.slice(0, 50),
});
}
// Prune too small clusters.
// TODO-Cluster this is likely not needed since hdbscan already has a min?
const validClusters = clusters.filter(({ faceIDs }) => faceIDs.length > 1);
// let cgroups = await clusterGroups();
// // TODO-Cluster - Currently we're not syncing with remote or saving anything
// // locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
// // cgroup, one per cluster.
// cgroups = cgroups.concat(
// validClusters.map((c) => ({
// id: c.id,
// name: undefined,
// clusterIDs: [c.id],
// isHidden: false,
// avatarFaceID: undefined,
// displayFaceID: undefined,
// })),
// );
// // For each cluster group, use the highest scoring face in any of its
// // clusters as its display face.
// for (const cgroup of cgroups) {
// cgroup.displayFaceID = cgroup.clusterIDs
// .map((clusterID) => clusterIndexForClusterID.get(clusterID))
// .filter((i) => i !== undefined) /* 0 is a valid index */
// .flatMap((i) => clusters[i]?.faceIDs ?? [])
// .map((faceID) => faceForFaceID.get(faceID))
// .filter((face) => !!face)
// .reduce((max, face) =>
// max.score > face.score ? max : face,
// ).faceID;
// }
// TODO-Cluster - Currently we're not syncing with remote or saving anything
// locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
// cgroup, one per cluster.
const cgroups: CGroup[] = [];
for (const cluster of sortedClusters) {
const faces = cluster.faceIDs.map((id) =>
ensure(faceForFaceID.get(id)),
);
const topFace = faces.reduce((max, face) =>
max.score > face.score ? max : face,
);
cgroups.push({
id: cluster.id,
name: undefined,
clusterIDs: [cluster.id],
isHidden: false,
avatarFaceID: undefined,
displayFaceID: topFace.faceID,
});
}
// log.info("ml/cluster", {
// faces,
// validClusters,
// clusterIndexForClusterID: Object.fromEntries(clusterIndexForClusterID),
// clusterIDForFaceID: Object.fromEntries(clusterIDForFaceID),
// cgroups,
// });
log.info(
`Clustered ${faces.length} faces into ${validClusters.length} clusters, with ${faces.length - clusterIDForFaceID.size} faces remaining unclustered (${Date.now() - t} ms)`,
);
const clusteredCount = clusterIDForFaceID.size;
const unclusteredCount = faces.length - clusteredCount;
return {
// faces,
clusteredCount,
unclusteredCount,
clusters: validClusters,
cgroups,
clusterPreviews,
clusterIDForFaceID,
};
};

View File

@@ -1,35 +1,507 @@
import { Hdbscan, type DebugInfo } from "hdbscan";
import { newNonSecureID } from "@/base/id-worker";
import log from "@/base/log";
import { ensure } from "@/utils/ensure";
import { type EmbeddingCluster, clusterHdbscan } from "./cluster-hdb";
import type { Face, FaceIndex } from "./face";
import { dotProduct } from "./math";
export type Cluster = number[];
export interface ClusterFacesResult {
clusters: Cluster[];
noise: Cluster;
debugInfo?: DebugInfo;
/**
* A face cluster is a set of faces.
*
* Each cluster has an id so that a {@link CGroup} can refer to it.
*
* The cluster is not directly synced to remote. Only clusters that the user
* interacts with get synced to remote, as part of a {@link CGroup}.
*/
export interface FaceCluster {
/**
* A nanoid for this cluster.
*/
id: string;
/**
* An unordered set of ids of the faces that belong to this cluster.
*
* For ergonomics of transportation and persistence this is an array, but it
* should conceptually be thought of as a set.
*/
faceIDs: string[];
}
/**
* Cluster the given {@link faceEmbeddings}.
* A cgroup ("cluster group") is a group of clusters (possibly containing a
* single cluster) that the user has interacted with.
*
* Interactions include hiding, merging and giving a name and/or a cover photo.
*
* The most frequent interaction is naming a {@link FaceCluster}, which promotes
* it to become a {@link CGroup}. The promotion comes with the ability to be
* synced with remote (as a "cgroup" user entity).
*
* Thereafter, the user may attach more clusters to the same {@link CGroup}.
*
* > A named cluster group can be thought of as a "person", though this is not
* > necessarily an accurate characterization. e.g. there can be a named cluster
* > group that contains face clusters of pets.
*
* The other form of interaction is hiding. The user may hide a single (unnamed)
* cluster, or they may hide a named {@link CGroup}. In both cases, we promote
* the cluster to a CGroup if needed so that their request to hide gets synced.
*
* While in our local representation we separately maintain clusters and link to
* them from within CGroups by their clusterID, in the remote representation
* clusters themselves don't get synced. Instead, the "cgroup" entities synced
* with remote contain the clusters within themselves. So a group that gets
* synced with remote looks something like:
*
* { id, name, clusters: [{ clusterID, faceIDs }] }
*
* @param faceEmbeddings An array of embeddings produced by our face indexing
* pipeline. Each embedding is for a face detected in an image (a single image
* may have multiple faces detected within it).
*/
export const clusterFacesHdbscan = (
faceEmbeddings: number[][],
): ClusterFacesResult => {
const hdbscan = new Hdbscan({
input: faceEmbeddings,
minClusterSize: 3,
minSamples: 5,
clusterSelectionEpsilon: 0.6,
clusterSelectionMethod: "leaf",
debug: false,
});
export interface CGroup {
/**
* A nanoid for this cluster group.
*
* This is the ID of the "cgroup" user entity (the envelope), and it is not
* contained as part of the group entity payload itself.
*/
id: string;
/**
* A name assigned by the user to this cluster group.
*
* The client should handle both empty strings and undefined as indicating a
* cgroup without a name. When the client needs to set this to an "empty"
* value, which happens when hiding an unnamed cluster, it should set it to
* an empty string. That is, expect `"" | undefined`, but set `""`.
*/
name: string | undefined;
/**
* An unordered set of ids of the clusters that belong to this group.
*
* For ergonomics of transportation and persistence this is an array, but it
* should conceptually be thought of as a set.
*/
clusterIDs: string[];
/**
* True if this cluster group should be hidden.
*
* The user can hide both named cluster groups and single unnamed clusters.
* If the user hides a single cluster that was offered as a suggestion to
* them on a client, the client will create a new unnamed cgroup containing
* it, and set its hidden flag to sync it with remote (so that other clients
* can also stop showing this cluster).
*/
isHidden: boolean;
/**
* The ID of the face that should be used as the cover photo for this
* cluster group (if the user has set one).
*
* This is similar to the {@link displayFaceID}, the difference being:
*
* - {@link avatarFaceID} is the face selected by the user.
*
* - {@link displayFaceID} is the automatic placeholder, and only comes
* into effect if the user has not explicitly selected a face.
*/
avatarFaceID: string | undefined;
/**
* Locally determined ID of the "best" face that should be used as the
* display face, to represent this cluster group in the UI.
*
* This property is not synced with remote. For more details, see
* {@link avatarFaceID}.
*/
displayFaceID: string | undefined;
}
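// A hedged sketch of the remote "cgroup" payload implied by the doc comment
// above ({ id, name, clusters: [{ clusterID, faceIDs }] }). These exact
// field names are illustrative assumptions, not a confirmed wire format;
// note that the envelope `id` is not part of the payload itself.
//
//     interface RemoteFaceCluster {
//         clusterID: string;
//         faceIDs: string[];
//     }
//
//     interface RemoteCGroupPayload {
//         name?: string;
//         clusters: RemoteFaceCluster[];
//         isHidden?: boolean;
//         avatarFaceID?: string;
//     }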
export interface ClusteringOpts {
method: "linear" | "hdbscan";
batchSize: number;
joinThreshold: number;
}
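// Example (hedged): an illustrative set of options. The values mirror the
// ad-hoc defaults seen elsewhere in this file and are not prescriptive.
//
//     const opts: ClusteringOpts = {
//         method: "hdbscan",
//         batchSize: 2500,
//         joinThreshold: 0.7,
//     };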
export interface ClusterPreview {
clusterSize: number;
faces: ClusterPreviewFace[];
}
export interface ClusterPreviewFace {
face: Face;
cosineSimilarity: number;
wasMerged: boolean;
}
/**
* Cluster faces into groups.
*
* [Note: Face clustering algorithm]
*
* A cgroup (cluster group) consists of clusters, each of which itself is a set
* of faces.
*
* cgroup << cluster << face
*
* The clusters are generated locally by clients using the following algorithm:
*
* 1. clusters = [] initially, or fetched from remote.
*
* 2. For each face, find its nearest neighbour in the embedding space.
*
* 3. If no such neighbour is found within our threshold, create a new cluster.
*
* 4. Otherwise assign this face to the same cluster as its nearest neighbour.
*
* The user can then tweak the output of the algorithm by performing the
* following actions on the list of clusters that they can see:
*
* - They can provide a name for a cluster ("name a person"). This upgrades a
* cluster into a "cgroup", which is an entity that gets synced via remote
* to the user's other clients.
*
* - They can attach more clusters to a cgroup ("merge clusters")
*
* - They can remove a cluster from a cgroup ("break clusters").
*
* After clustering, we also do some routine cleanup. Faces belonging to files
* that have been deleted (including those in Trash) should be pruned off.
*
* We should not make strict assumptions about the clusters we get from remote.
* In particular, the same face ID can be in different clusters. In such cases
* we should arbitrarily assign it to the last cluster we find it in.
* Such leeway is intentionally provided to allow clients some slack in how they
* implement the sync without needing to make a blocking API request for every
* user interaction.
*/
export const clusterFaces = (
faceIndexes: FaceIndex[],
opts: ClusteringOpts,
) => {
const { batchSize, joinThreshold } = opts;
const t = Date.now();
// A flattened array of faces.
// TODO-Cluster ad-hoc filtering and slicing
const faces0 = [...enumerateFaces(faceIndexes)].filter((f) => f.blur > 99);
// .slice(0, 6000);
// TODO-Cluster testing code, can be removed once done
const faces = Array(1)
.fill(0)
.flatMap(() => faces0);
// For fast reverse lookup - map from face ids to the face.
const faceForFaceID = new Map(faces.map((f) => [f.faceID, f]));
const faceEmbeddings = faces.map(({ embedding }) => embedding);
// For fast reverse lookup - map from cluster ids to their index in the
// clusters array.
const clusterIndexForClusterID = new Map<string, number>();
// For fast reverse lookup - map from the id of a face to the id of the
// cluster to which it belongs.
const clusterIDForFaceID = new Map<string, string>();
// A function to chain two reverse lookups.
const firstFaceOfCluster = (cluster: FaceCluster) =>
ensure(faceForFaceID.get(ensure(cluster.faceIDs[0])));
// A function to generate new cluster IDs.
const newClusterID = () => newNonSecureID("cluster_");
// The resultant clusters.
// TODO-Cluster Later on, instead of starting from a blank slate, this will
// be the list of existing clusters we fetch from remote.
const clusters: FaceCluster[] = [];
// Process the faces in batches (using the batchSize from the provided
// options). The faces are already sorted by file ID, which is a
// monotonically increasing integer, so we will also have some temporal
// locality.
//
// The suggested default of 2500 was derived by ad-hoc observations and
// takes a few seconds. On a particular test dataset and a particular
// machine, clustering 1k faces took ~2 seconds, 10k took ~2 mins, while 20k
// took ~8 mins. Memory usage was constant in all these cases.
//
// At around 100k faces, the clustering starts taking hours, and we start
// running into stack overflows. The stack overflows could perhaps be
// avoided by restructuring the code, but hours of uninterruptible work is
// in any case not feasible.
for (let i = 0; i < faceEmbeddings.length; i += batchSize) {
const it = Date.now();
const embeddings = faceEmbeddings.slice(i, i + batchSize);
const { clusters: hdbClusters } = clusterHdbscan(embeddings);
log.info(
`hdbscan produced ${hdbClusters.length} clusters from ${embeddings.length} faces (${Date.now() - it} ms)`,
);
// Merge the new clusters we got from hdbscan into the existing clusters
// if they are "near" them (using some heuristic).
//
// We need to ensure we don't change any of the existing cluster IDs,
// since these might be existing clusters we got from remote.
for (const hdbCluster of hdbClusters) {
// Find the existing cluster whose (arbitrarily chosen) first face
// is the nearest neighbour of the (arbitrarily chosen) first face
// of the cluster produced by hdbscan.
const newFace = ensure(faces[i + ensure(hdbCluster[0])]);
let nnCluster: FaceCluster | undefined;
let nnCosineSimilarity = 0;
for (const existingCluster of clusters) {
const existingFace = firstFaceOfCluster(existingCluster);
// The vectors are already normalized, so we can directly use their
// dot product as their cosine similarity.
const csim = dotProduct(
existingFace.embedding,
newFace.embedding,
);
// Use a higher cosine similarity threshold if either of the two
// faces are blurry.
const threshold =
existingFace.blur < 200 || newFace.blur < 200 ? 0.9 : 0.7;
if (csim > threshold && csim > nnCosineSimilarity) {
nnCluster = existingCluster;
nnCosineSimilarity = csim;
}
}
if (nnCluster) {
// If we found an existing cluster that is near enough,
// subsume the cluster produced by hdbscan into that cluster.
for (const j of hdbCluster) {
const { faceID } = ensure(faces[i + j]);
nnCluster.faceIDs.push(faceID);
clusterIDForFaceID.set(faceID, nnCluster.id);
}
} else {
// Otherwise make a new cluster from the cluster produced by
// hdbscan.
const clusterID = newClusterID();
const faceIDs: string[] = [];
for (const j of hdbCluster) {
const { faceID } = ensure(faces[i + j]);
faceIDs.push(faceID);
clusterIDForFaceID.set(faceID, clusterID);
}
clusterIndexForClusterID.set(clusterID, clusters.length);
clusters.push({ id: clusterID, faceIDs });
}
}
}
// Convert into the data structure we're using to debug/visualize.
// const faceAndNeigbours: FaceNeighbours[] = [];
// const topFaces = faces.sort((a, b) => b.score - a.score).slice(0, 30);
// for (const fi of topFaces) {
// let neighbours: FaceNeighbour[] = [];
// for (const fj of faces) {
// // The vectors are already normalized, so we can directly use their
// // dot product as their cosine similarity.
// const csim = dotProduct(fi.embedding, fj.embedding);
// neighbours.push({ face: fj, cosineSimilarity: csim });
// }
// neighbours = neighbours
// .sort((a, b) => b.cosineSimilarity - a.cosineSimilarity)
// .slice(0, 30);
// faceAndNeigbours.push({ face: fi, neighbours });
// }
// Convert into the data structure we're using to debug/visualize.
//
// > Showing only top 30 and bottom 30 clusters (and only up to 50 faces in
// > each, sorted by cosine distance to highest scoring face in the
// > cluster).
const sortedClusters = clusters.sort(
(a, b) => b.faceIDs.length - a.faceIDs.length,
);
const debugClusters =
sortedClusters.length < 60
? sortedClusters
: sortedClusters.slice(0, 30).concat(sortedClusters.slice(-30));
const clusterPreviews: ClusterPreview[] = [];
for (const cluster of debugClusters) {
const faces = cluster.faceIDs.map((id) =>
ensure(faceForFaceID.get(id)),
);
const topFace = faces.reduce((max, face) =>
max.score > face.score ? max : face,
);
const previewFaces: ClusterPreviewFace[] = [];
for (const face of faces) {
const csim = dotProduct(topFace.embedding, face.embedding);
// wasMerged isn't tracked in this pass yet, so default it to false.
previewFaces.push({ face, cosineSimilarity: csim, wasMerged: false });
}
clusterPreviews.push({
clusterSize: cluster.faceIDs.length,
faces: previewFaces
.sort((a, b) => b.cosineSimilarity - a.cosineSimilarity)
.slice(0, 50),
});
}
// Prune too small clusters.
// TODO-Cluster this is likely not needed since hdbscan already has a min?
const validClusters = clusters.filter(({ faceIDs }) => faceIDs.length > 1);
// let cgroups = await clusterGroups();
// // TODO-Cluster - Currently we're not syncing with remote or saving anything
// // locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
// // cgroup, one per cluster.
// cgroups = cgroups.concat(
// validClusters.map((c) => ({
// id: c.id,
// name: undefined,
// clusterIDs: [c.id],
// isHidden: false,
// avatarFaceID: undefined,
// displayFaceID: undefined,
// })),
// );
// // For each cluster group, use the highest scoring face in any of its
// // clusters as its display face.
// for (const cgroup of cgroups) {
// cgroup.displayFaceID = cgroup.clusterIDs
// .map((clusterID) => clusterIndexForClusterID.get(clusterID))
// .filter((i) => i !== undefined) /* 0 is a valid index */
// .flatMap((i) => clusters[i]?.faceIDs ?? [])
// .map((faceID) => faceForFaceID.get(faceID))
// .filter((face) => !!face)
// .reduce((max, face) =>
// max.score > face.score ? max : face,
// ).faceID;
// }
// TODO-Cluster - Currently we're not syncing with remote or saving anything
// locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
// cgroup, one per cluster.
const cgroups: CGroup[] = [];
for (const cluster of sortedClusters) {
const faces = cluster.faceIDs.map((id) =>
ensure(faceForFaceID.get(id)),
);
const topFace = faces.reduce((max, face) =>
max.score > face.score ? max : face,
);
cgroups.push({
id: cluster.id,
name: undefined,
clusterIDs: [cluster.id],
isHidden: false,
avatarFaceID: undefined,
displayFaceID: topFace.faceID,
});
}
// log.info("ml/cluster", {
// faces,
// validClusters,
// clusterIndexForClusterID: Object.fromEntries(clusterIndexForClusterID),
// clusterIDForFaceID: Object.fromEntries(clusterIDForFaceID),
// cgroups,
// });
log.info(
`Clustered ${faces.length} faces into ${validClusters.length} clusters, with ${faces.length - clusterIDForFaceID.size} faces remaining unclustered (${Date.now() - t} ms)`,
);
const clusteredCount = clusterIDForFaceID.size;
const unclusteredCount = faces.length - clusteredCount;
return {
clusters: hdbscan.getClusters(),
noise: hdbscan.getNoise(),
debugInfo: hdbscan.getDebugInfo(),
// faces,
clusteredCount,
unclusteredCount,
clusters: validClusters,
cgroups,
clusterPreviews,
clusterIDForFaceID,
};
};
/**
* A generator function that returns a stream of faces, flattening all the
* faces present in the given {@link faceIndices}.
*/
function* enumerateFaces(faceIndices: FaceIndex[]) {
for (const fi of faceIndices) {
for (const f of fi.faces) {
yield f;
}
}
}
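// Example (hedged): flattening the faces nested inside the locally stored
// face indexes, as done at the top of clusterFaces above.
//
//     declare const faceIndexes: FaceIndex[];
//     const allFaces = [...enumerateFaces(faceIndexes)];
//     // allFaces is a flat array of faces across all the indexes.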
interface ClusterLinearResult {
clusters: EmbeddingCluster[];
}
const clusterLinear = (
embeddings: number[][],
threshold: number,
): ClusterLinearResult => {
const clusters: EmbeddingCluster[] = [];
const clusterIndexForEmbeddingIndex = new Map<number, number>();
// For each embedding
for (const [i, ei] of embeddings.entries()) {
// If the embedding is already part of a cluster, then skip it.
if (clusterIndexForEmbeddingIndex.has(i)) continue;
// Find the nearest neighbour from among all the other embeddings.
let nnIndex: number | undefined;
let nnCosineSimilarity = 0;
for (const [j, ej] of embeddings.entries()) {
// ! This is an O(n^2) loop, be careful when adding more code here.
// Skip ourselves.
if (i == j) continue;
// The vectors are already normalized, so we can directly use their
// dot product as their cosine similarity.
const csim = dotProduct(ei, ej);
if (csim > threshold && csim > nnCosineSimilarity) {
nnIndex = j;
nnCosineSimilarity = csim;
}
}
// (Compare against undefined explicitly since 0 is a valid index.)
if (nnIndex !== undefined) {
// Find the cluster the nearest neighbour belongs to, if any.
const nnClusterIndex = clusterIndexForEmbeddingIndex.get(nnIndex);
if (nnClusterIndex !== undefined) {
// If the neighbour is already part of a cluster, also add
// ourselves to that cluster.
ensure(clusters[nnClusterIndex]).push(i);
clusterIndexForEmbeddingIndex.set(i, nnClusterIndex);
} else {
// Otherwise create a new cluster with us and our nearest
// neighbour.
clusterIndexForEmbeddingIndex.set(i, clusters.length);
clusterIndexForEmbeddingIndex.set(nnIndex, clusters.length);
clusters.push([i, nnIndex]);
}
} else {
// We didn't find a neighbour within the threshold. Create a new
// cluster with only this embedding.
clusterIndexForEmbeddingIndex.set(i, clusters.length);
clusters.push([i]);
}
}
// Prune singleton clusters.
const validClusters = clusters.filter((cs) => cs.length > 1);
return { clusters: validClusters };
};
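// Usage sketch (hedged): for unit vectors, cosine similarity
// cos(θ) = (a · b) / (|a||b|) reduces to the plain dot product, which is
// why dotProduct suffices above. A toy 2D example with an arbitrary 0.9
// threshold:
//
//     const embeddings = [
//         [1, 0],
//         [0.996, 0.087], // ~5° away from [1, 0]
//         [0, 1],
//     ];
//     const { clusters } = clusterLinear(embeddings, 0.9);
//     // => [[0, 1]]; the third embedding forms a singleton cluster,
//     // which is then pruned.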

View File

@@ -3,7 +3,7 @@ import log from "@/base/log";
import localForage from "@ente/shared/storage/localForage";
import { deleteDB, openDB, type DBSchema } from "idb";
import type { LocalCLIPIndex } from "./clip";
import type { CGroup, FaceCluster } from "./cluster-new";
import type { CGroup, FaceCluster } from "./cluster";
import type { LocalFaceIndex } from "./face";
/**

View File

@@ -24,7 +24,7 @@ import {
type ClusteringOpts,
type ClusterPreviewFace,
type FaceCluster,
} from "./cluster-new";
} from "./cluster";
import { regenerateFaceCrops } from "./crop";
import { clearMLDB, faceIndex, indexableAndIndexedCounts } from "./db";
import type { Face } from "./face";
@@ -386,7 +386,7 @@ export const wipClusterDebugPageContents = async (
clusters,
cgroups,
unclusteredFaces,
} = await worker().then((w) => w.clusterFacesHdb(opts));
} = await worker().then((w) => w.clusterFaces(opts));
const localFiles = await getAllLocalFiles();
const localFileByID = new Map(localFiles.map((f) => [f.id, f]));

View File

@@ -24,7 +24,7 @@ import {
indexCLIP,
type CLIPIndex,
} from "./clip";
import { clusterFacesHdb, type ClusteringOpts } from "./cluster-new";
import { type ClusteringOpts } from "./cluster";
import { saveFaceCrops } from "./crop";
import {
faceIndexes,
@@ -276,8 +276,8 @@ export class MLWorker {
}
// TODO-Cluster
async clusterFacesHdb(opts: ClusteringOpts) {
return clusterFacesHdb(await faceIndexes(), opts);
async clusterFaces(opts: ClusteringOpts) {
return clusterFace(await faceIndexes(), opts);
}
}

View File

@@ -12,7 +12,7 @@ import { ensure } from "@/utils/ensure";
import { nullToUndefined } from "@/utils/transform";
import { z } from "zod";
import { gunzip } from "./gzip";
import type { CGroup } from "./ml/cluster-new";
import type { CGroup } from "./ml/cluster";
import { applyCGroupDiff } from "./ml/db";
/**