Linear
35  web/packages/new/photos/services/ml/cluster-hdb.ts  Normal file
@@ -0,0 +1,35 @@
+import { Hdbscan, type DebugInfo } from "hdbscan";
+
+/**
+ * Each "cluster" is a list of indexes of the embeddings belonging to that
+ * particular cluster.
+ */
+export type EmbeddingCluster = number[];
+
+export interface ClusterHdbscanResult {
+    clusters: EmbeddingCluster[];
+    noise: number[];
+    debugInfo?: DebugInfo;
+}
+
+/**
+ * Cluster the given {@link embeddings} using hdbscan.
+ */
+export const clusterHdbscan = (
+    embeddings: number[][],
+): ClusterHdbscanResult => {
+    const hdbscan = new Hdbscan({
+        input: embeddings,
+        minClusterSize: 3,
+        minSamples: 5,
+        clusterSelectionEpsilon: 0.6,
+        clusterSelectionMethod: "leaf",
+        debug: false,
+    });
+
+    return {
+        clusters: hdbscan.getClusters(),
+        noise: hdbscan.getNoise(),
+        debugInfo: hdbscan.getDebugInfo(),
+    };
+};
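For reference, a minimal usage sketch of this new wrapper. The toy embeddings below are hypothetical (real inputs are the high-dimensional, normalized face embeddings produced by the indexing pipeline), and the printed output is only indicative:

    import { clusterHdbscan } from "./cluster-hdb";

    // Six hypothetical unit vectors forming two tight groups of three
    // (matching the minClusterSize of 3 configured above).
    const embeddings: number[][] = [
        [1, 0],
        [0.995, 0.1],
        [0.98, 0.199],
        [0, 1],
        [0.1, 0.995],
        [0.199, 0.98],
    ];

    const { clusters, noise } = clusterHdbscan(embeddings);
    // `clusters` holds lists of embedding indexes, e.g. [[0, 1, 2], [3, 4, 5]],
    // while `noise` holds the indexes that were not assigned to any cluster.
    console.log(clusters, noise);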
@@ -1,603 +0,0 @@
-import { newNonSecureID } from "@/base/id-worker";
-import log from "@/base/log";
-import { ensure } from "@/utils/ensure";
-import { clusterFacesHdbscan } from "./cluster";
-import { clusterGroups, faceClusters } from "./db";
-import type { Face, FaceIndex } from "./face";
-import { dotProduct } from "./math";
-
-/**
- * A face cluster is a set of faces.
- *
- * Each cluster has an id so that a {@link CGroup} can refer to it.
- *
- * The cluster is not directly synced to remote. Only clusters that the user
- * interacts with get synced to remote, as part of a {@link CGroup}.
- */
-export interface FaceCluster {
-    /**
-     * A nanoid for this cluster.
-     */
-    id: string;
-    /**
-     * An unordered set of ids of the faces that belong to this cluster.
-     *
-     * For ergonomics of transportation and persistence this is an array, but it
-     * should conceptually be thought of as a set.
-     */
-    faceIDs: string[];
-}
-
-/**
- * A cgroup ("cluster group") is a group of clusters (possibly containing a
- * single cluster) that the user has interacted with.
- *
- * Interactions include hiding, merging and giving a name and/or a cover photo.
- *
- * The most frequent interaction is naming a {@link FaceCluster}, which promotes
- * it to become a {@link CGroup}. The promotion comes with the ability to be
- * synced with remote (as a "cgroup" user entity).
- *
- * Thereafter, the user may attach more clusters to the same {@link CGroup}.
- *
- * > A named cluster group can be thought of as a "person", though this is not
- * > necessarily an accurate characterization. e.g. there can be a named cluster
- * > group that contains face clusters of pets.
- *
- * The other form of interaction is hiding. The user may hide a single (unnamed)
- * cluster, or they may hide a named {@link CGroup}. In both cases, we promote
- * the cluster to a CGroup if needed so that their request to hide gets synced.
- *
- * While in our local representation we separately maintain clusters and link to
- * them from within CGroups by their clusterID, in the remote representation
- * clusters themselves don't get synced. Instead, the "cgroup" entities synced
- * with remote contain the clusters within themselves. So a group that gets
- * synced with remote looks something like:
- *
- * { id, name, clusters: [{ clusterID, faceIDs }] }
- *
- */
-export interface CGroup {
-    /**
-     * A nanoid for this cluster group.
-     *
-     * This is the ID of the "cgroup" user entity (the envelope), and it is not
-     * contained as part of the group entity payload itself.
-     */
-    id: string;
-    /**
-     * A name assigned by the user to this cluster group.
-     *
-     * The client should handle both empty strings and undefined as indicating a
-     * cgroup without a name. When the client needs to set this to an "empty"
-     * value, which happens when hiding an unnamed cluster, it should set it to
-     * an empty string. That is, expect `"" | undefined`, but set `""`.
-     */
-    name: string | undefined;
-    /**
-     * An unordered set of ids of the clusters that belong to this group.
-     *
-     * For ergonomics of transportation and persistence this is an array, but it
-     * should conceptually be thought of as a set.
-     */
-    clusterIDs: string[];
-    /**
-     * True if this cluster group should be hidden.
-     *
-     * The user can hide both named cluster groups and single unnamed clusters.
-     * If the user hides a single cluster that was offered as a suggestion to
-     * them on a client, the client will create a new unnamed cgroup containing
-     * it, and set its hidden flag to sync it with remote (so that other clients
-     * can also stop showing this cluster).
-     */
-    isHidden: boolean;
-    /**
-     * The ID of the face that should be used as the cover photo for this
-     * cluster group (if the user has set one).
-     *
-     * This is similar to {@link displayFaceID}, the difference being:
-     *
-     * - {@link avatarFaceID} is the face selected by the user.
-     *
-     * - {@link displayFaceID} is the automatic placeholder, and only comes
-     *   into effect if the user has not explicitly selected a face.
-     */
-    avatarFaceID: string | undefined;
-    /**
-     * Locally determined ID of the "best" face that should be used as the
-     * display face, to represent this cluster group in the UI.
-     *
-     * This property is not synced with remote. For more details, see
-     * {@link avatarFaceID}.
-     */
-    displayFaceID: string | undefined;
-}
-
-export interface ClusteringOpts {
-    method: "linear" | "hdbscan";
-    batchSize: number;
-    joinThreshold: number;
-}
-
-export interface ClusterPreview {
-    clusterSize: number;
-    faces: ClusterPreviewFace[];
-}
-
-export interface ClusterPreviewFace {
-    face: Face;
-    cosineSimilarity: number;
-    wasMerged: boolean;
-}
-
-/**
- * Cluster faces into groups.
- *
- * [Note: Face clustering algorithm]
- *
- * A cgroup (cluster group) consists of clusters, each of which itself is a set
- * of faces.
- *
- * cgroup << cluster << face
- *
- * The clusters are generated locally by clients using the following algorithm:
- *
- * 1. clusters = [] initially, or fetched from remote.
- *
- * 2. For each face, find its nearest neighbour in the embedding space.
- *
- * 3. If no such neighbour is found within our threshold, create a new cluster.
- *
- * 4. Otherwise assign this face to the same cluster as its nearest neighbour.
- *
- * The user can then tweak the output of the algorithm by performing the
- * following actions on the list of clusters that they can see:
- *
- * - They can provide a name for a cluster ("name a person"). This upgrades a
- *   cluster into a "cgroup", which is an entity that gets synced via remote
- *   to the user's other clients.
- *
- * - They can attach more clusters to a cgroup ("merge clusters").
- *
- * - They can remove a cluster from a cgroup ("break clusters").
- *
- * After clustering, we also do some routine cleanup. Faces belonging to files
- * that have been deleted (including those in Trash) should be pruned off.
- *
- * We should not make strict assumptions about the clusters we get from remote.
- * In particular, the same face ID can be in different clusters. In such cases
- * we should arbitrarily assign it to the last cluster we find it in. Such
- * leeway is intentionally provided to allow clients some slack in how they
- * implement the sync without needing to make a blocking API request for every
- * user interaction.
- */
-export const clusterFaces = async (faceIndexes: FaceIndex[]) => {
-    const t = Date.now();
-
-    // A flattened array of faces.
-    // TODO-Cluster note the 2k slice
-    const faces = [...enumerateFaces(faceIndexes)].slice(0, 2000);
-
-    // Start with the clusters we already have (either from a previous indexing,
-    // or fetched from remote).
-    const clusters = await faceClusters();
-
-    // For fast reverse lookup - map from cluster ids to their index in the
-    // clusters array.
-    const clusterIndexForClusterID = new Map(clusters.map((c, i) => [c.id, i]));
-
-    // For fast reverse lookup - map from face ids to the id of the cluster to
-    // which they belong.
-    const clusterIDForFaceID = new Map(
-        clusters.flatMap((c) => c.faceIDs.map((id) => [id, c.id] as const)),
-    );
-
-    // A function to generate new cluster IDs.
-    const newClusterID = () => newNonSecureID("cluster_");
-
-    const faceAndNeigbours: FaceNeighbours[] = [];
-
-    // For each face,
-    for (const [i, fi] of faces.entries()) {
-        // If the face is already part of a cluster, then skip it.
-        if (clusterIDForFaceID.get(fi.faceID)) continue;
-
-        // Find the nearest neighbour from among all the other faces.
-        let nn: Face | undefined;
-        let nnCosineSimilarity = 0;
-        let neighbours: FaceNeighbour[] = [];
-        for (let j = 0; j < faces.length; j++) {
-            // ! This is an O(n^2) loop, be careful when adding more code here.
-
-            // TODO-Cluster Commenting this here and moving it downward
-            // // Skip ourselves.
-            // if (i == j) continue;
-
-            // Can't find a way of avoiding the null assertion here.
-            // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
-            const fj = faces[j]!;
-
-            // The vectors are already normalized, so we can directly use their
-            // dot product as their cosine similarity.
-            const csim = dotProduct(fi.embedding, fj.embedding);
-
-            // TODO-Cluster Delete me and uncomment the check above
-            // Skip ourselves.
-            if (i == j) {
-                neighbours.push({ face: fj, cosineSimilarity: csim });
-                continue;
-            }
-
-            const threshold = fi.blur < 100 || fj.blur < 100 ? 0.7 : 0.6;
-            if (csim > threshold && csim > nnCosineSimilarity) {
-                nn = fj;
-                nnCosineSimilarity = csim;
-            }
-
-            neighbours.push({ face: fj, cosineSimilarity: csim });
-        }
-
-        neighbours = neighbours.sort(
-            (a, b) => b.cosineSimilarity - a.cosineSimilarity,
-        );
-        faceAndNeigbours.push({ face: fi, neighbours });
-
-        const { faceID } = fi;
-
-        if (nn) {
-            // Found a neighbour near enough.
-            const nnFaceID = nn.faceID;
-
-            // Find the cluster the nearest neighbour belongs to, if any.
-            const nnClusterID = clusterIDForFaceID.get(nn.faceID);
-
-            if (nnClusterID) {
-                // If the neighbour is already part of a cluster, also add
-                // ourselves to that cluster.
-
-                const nnClusterIndex = ensure(
-                    clusterIndexForClusterID.get(nnClusterID),
-                );
-                clusters[nnClusterIndex]?.faceIDs.push(faceID);
-                clusterIDForFaceID.set(faceID, nnClusterID);
-            } else {
-                // Otherwise create a new cluster with us and our nearest
-                // neighbour.
-
-                const cluster = {
-                    id: newClusterID(),
-                    faceIDs: [faceID, nnFaceID],
-                };
-                clusterIndexForClusterID.set(cluster.id, clusters.length);
-                clusterIDForFaceID.set(faceID, cluster.id);
-                clusterIDForFaceID.set(nnFaceID, cluster.id);
-                clusters.push(cluster);
-            }
-        } else {
-            // We didn't find a neighbour within the threshold. Create a new
-            // cluster with only this face.
-
-            const cluster = { id: newClusterID(), faceIDs: [faceID] };
-            clusterIndexForClusterID.set(cluster.id, clusters.length);
-            clusterIDForFaceID.set(faceID, cluster.id);
-            clusters.push(cluster);
-        }
-    }
-
-    // Prune too small clusters.
-    const validClusters = clusters.filter(({ faceIDs }) => faceIDs.length > 1);
-
-    let cgroups = await clusterGroups();
-
-    // TODO-Cluster - Currently we're not syncing with remote or saving anything
-    // locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
-    // cgroup, one per cluster.
-    cgroups = cgroups.concat(
-        validClusters.map((c) => ({
-            id: c.id,
-            name: undefined,
-            clusterIDs: [c.id],
-            isHidden: false,
-            avatarFaceID: undefined,
-            displayFaceID: undefined,
-        })),
-    );
-
-    // For each cluster group, use the highest scoring face in any of its
-    // clusters as its display face.
-    const faceForFaceID = new Map(faces.map((f) => [f.faceID, f]));
-    for (const cgroup of cgroups) {
-        cgroup.displayFaceID = cgroup.clusterIDs
-            .map((clusterID) => clusterIndexForClusterID.get(clusterID))
-            .filter((i) => i !== undefined) /* 0 is a valid index */
-            .flatMap((i) => clusters[i]?.faceIDs ?? [])
-            .map((faceID) => faceForFaceID.get(faceID))
-            .filter((face) => !!face)
-            .reduce((max, face) =>
-                max.score > face.score ? max : face,
-            ).faceID;
-    }
-
-    log.info("ml/cluster", {
-        faces,
-        validClusters,
-        clusterIndexForClusterID: Object.fromEntries(clusterIndexForClusterID),
-        clusterIDForFaceID: Object.fromEntries(clusterIDForFaceID),
-        cgroups,
-    });
-    log.info(
-        `Clustered ${faces.length} faces into ${validClusters.length} clusters (${Date.now() - t} ms)`,
-    );
-
-    return { faces, clusters: validClusters, cgroups, faceAndNeigbours };
-};
-
-/**
- * A generator function that returns a stream of {faceID, embedding} values,
- * flattening all the faces present in the given {@link faceIndices}.
- */
-function* enumerateFaces(faceIndices: FaceIndex[]) {
-    for (const fi of faceIndices) {
-        for (const f of fi.faces) {
-            yield f;
-        }
-    }
-}
-
-export const clusterFacesHdb = (
-    faceIndexes: FaceIndex[],
-    opts: ClusteringOpts,
-) => {
-    const { batch } = opts;
-    const t = Date.now();
-
-    // A flattened array of faces.
-    // TODO-Cluster ad-hoc filtering and slicing
-    const faces0 = [...enumerateFaces(faceIndexes)].filter((f) => f.blur > 99);
-    // .slice(0, 6000);
-    // TODO-Cluster testing code, can be removed once done
-    const faces = Array(1)
-        .fill(0)
-        .flatMap(() => faces0);
-
-    // For fast reverse lookup - map from face ids to the face.
-    const faceForFaceID = new Map(faces.map((f) => [f.faceID, f]));
-
-    const faceEmbeddings = faces.map(({ embedding }) => embedding);
-
-    // For fast reverse lookup - map from cluster ids to their index in the
-    // clusters array.
-    const clusterIndexForClusterID = new Map<string, number>();
-
-    // For fast reverse lookup - map from the id of a face to the id of the
-    // cluster to which it belongs.
-    const clusterIDForFaceID = new Map<string, string>();
-
-    // A function chaining the two reverse lookups.
-    const firstFaceOfCluster = (cluster: FaceCluster) =>
-        ensure(faceForFaceID.get(ensure(cluster.faceIDs[0])));
-
-    // A function to generate new cluster IDs.
-    const newClusterID = () => newNonSecureID("cluster_");
-
-    // The resultant clusters.
-    // TODO-Cluster Later on, instead of starting from a blank slate, this will
-    // be the list of existing clusters we fetch from remote.
-    const clusters: FaceCluster[] = [];
-
-    // Process the faces in batches. The faces are already sorted by file ID,
-    // which is a monotonically increasing integer, so we will also have some
-    // temporal locality.
-    //
-    // The number 2500 was derived by ad-hoc observations and takes a few
-    // seconds. On a particular test dataset and a particular machine,
-    // clustering 1k took ~2 seconds, 10k took ~2 mins, while 20k took ~8 mins.
-    // Memory usage was constant in all these cases.
-    //
-    // At around 100k faces, the clustering starts taking hours, and we start
-    // running into stack overflows. The stack overflows can perhaps be avoided
-    // by restructuring the code, but hours of uninterruptible work is anyway
-    // not feasible.
-    const batchSize = 2500;
-    for (let i = 0; i < faceEmbeddings.length; i += batchSize) {
-        const it = Date.now();
-        const embeddings = faceEmbeddings.slice(i, i + batchSize);
-        const { clusters: hdbClusters } = clusterFacesHdbscan(embeddings);
-
-        log.info(
-            `hdbscan produced ${hdbClusters.length} clusters from ${embeddings.length} faces (${Date.now() - it} ms)`,
-        );
-
-        // Merge the new clusters we got from hdbscan into the existing clusters
-        // if they are "near" them (using some heuristic).
-        //
-        // We need to ensure we don't change any of the existing cluster IDs,
-        // since these might be existing clusters we got from remote.
-        for (const hdbCluster of hdbClusters) {
-            // Find the existing cluster whose (arbitrarily chosen) first face
-            // is the nearest neighbour of the (arbitrarily chosen) first face
-            // of the cluster produced by hdbscan.
-            const newFace = ensure(faces[i + ensure(hdbCluster[0])]);
-
-            let nnCluster: FaceCluster | undefined;
-            let nnCosineSimilarity = 0;
-            for (const existingCluster of clusters) {
-                const existingFace = firstFaceOfCluster(existingCluster);
-
-                // The vectors are already normalized, so we can directly use their
-                // dot product as their cosine similarity.
-                const csim = dotProduct(
-                    existingFace.embedding,
-                    newFace.embedding,
-                );
-
-                // Use a higher cosine similarity threshold if either of the two
-                // faces is blurry.
-                const threshold =
-                    existingFace.blur < 200 || newFace.blur < 200 ? 0.9 : 0.7;
-                if (csim > threshold && csim > nnCosineSimilarity) {
-                    nnCluster = existingCluster;
-                    nnCosineSimilarity = csim;
-                }
-            }
-
-            if (nnCluster) {
-                // If we found an existing cluster that is near enough,
-                // sublimate the cluster produced by hdbscan into that cluster.
-                for (const j of hdbCluster) {
-                    const { faceID } = ensure(faces[i + j]);
-                    nnCluster.faceIDs.push(faceID);
-                    clusterIDForFaceID.set(faceID, nnCluster.id);
-                }
-            } else {
-                // Otherwise make a new cluster from the cluster produced by
-                // hdbscan.
-                const clusterID = newClusterID();
-                const faceIDs: string[] = [];
-                for (const j of hdbCluster) {
-                    const { faceID } = ensure(faces[i + j]);
-                    faceIDs.push(faceID);
-                    clusterIDForFaceID.set(faceID, clusterID);
-                }
-                clusterIndexForClusterID.set(clusterID, clusters.length);
-                clusters.push({ id: clusterID, faceIDs });
-            }
-        }
-    }
-
-    // Convert into the data structure we're using to debug/visualize.
-    // const faceAndNeigbours: FaceNeighbours[] = [];
-    // const topFaces = faces.sort((a, b) => b.score - a.score).slice(0, 30);
-    // for (const fi of topFaces) {
-    //     let neighbours: FaceNeighbour[] = [];
-    //     for (const fj of faces) {
-    //         // The vectors are already normalized, so we can directly use their
-    //         // dot product as their cosine similarity.
-    //         const csim = dotProduct(fi.embedding, fj.embedding);
-    //         neighbours.push({ face: fj, cosineSimilarity: csim });
-    //     }
-
-    //     neighbours = neighbours
-    //         .sort((a, b) => b.cosineSimilarity - a.cosineSimilarity)
-    //         .slice(0, 30);
-
-    //     faceAndNeigbours.push({ face: fi, neighbours });
-    // }
-
-    // Convert into the data structure we're using to debug/visualize.
-    //
-    // > Showing only top 30 and bottom 30 clusters (and only up to 50 faces in
-    // > each, sorted by cosine distance to highest scoring face in the
-    // > cluster).
-    const sortedClusters = clusters.sort(
-        (a, b) => b.faceIDs.length - a.faceIDs.length,
-    );
-    const debugClusters =
-        sortedClusters.length < 60
-            ? sortedClusters
-            : sortedClusters.slice(0, 30).concat(sortedClusters.slice(-30));
-    const clusterPreviews: ClusterPreview[] = [];
-    for (const cluster of debugClusters) {
-        const faces = cluster.faceIDs.map((id) =>
-            ensure(faceForFaceID.get(id)),
-        );
-        const topFace = faces.reduce((max, face) =>
-            max.score > face.score ? max : face,
-        );
-        const previewFaces: ClusterPreviewFace[] = [];
-        for (const face of faces) {
-            const csim = dotProduct(topFace.embedding, face.embedding);
-            previewFaces.push({ face, cosineSimilarity: csim });
-        }
-        clusterPreviews.push({
-            clusterSize: cluster.faceIDs.length,
-            faces: previewFaces
-                .sort((a, b) => b.cosineSimilarity - a.cosineSimilarity)
-                .slice(0, 50),
-        });
-    }
-
-    // Prune too small clusters.
-    // TODO-Cluster this is likely not needed since hdbscan already has a min?
-    const validClusters = clusters.filter(({ faceIDs }) => faceIDs.length > 1);
-
-    // let cgroups = await clusterGroups();
-
-    // // TODO-Cluster - Currently we're not syncing with remote or saving anything
-    // // locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
-    // // cgroup, one per cluster.
-    // cgroups = cgroups.concat(
-    //     validClusters.map((c) => ({
-    //         id: c.id,
-    //         name: undefined,
-    //         clusterIDs: [c.id],
-    //         isHidden: false,
-    //         avatarFaceID: undefined,
-    //         displayFaceID: undefined,
-    //     })),
-    // );
-
-    // // For each cluster group, use the highest scoring face in any of its
-    // // clusters as its display face.
-    // for (const cgroup of cgroups) {
-    //     cgroup.displayFaceID = cgroup.clusterIDs
-    //         .map((clusterID) => clusterIndexForClusterID.get(clusterID))
-    //         .filter((i) => i !== undefined) /* 0 is a valid index */
-    //         .flatMap((i) => clusters[i]?.faceIDs ?? [])
-    //         .map((faceID) => faceForFaceID.get(faceID))
-    //         .filter((face) => !!face)
-    //         .reduce((max, face) =>
-    //             max.score > face.score ? max : face,
-    //         ).faceID;
-    // }
-
-    // TODO-Cluster - Currently we're not syncing with remote or saving anything
-    // locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
-    // cgroup, one per cluster.
-    const cgroups: CGroup[] = [];
-    for (const cluster of sortedClusters) {
-        const faces = cluster.faceIDs.map((id) =>
-            ensure(faceForFaceID.get(id)),
-        );
-        const topFace = faces.reduce((max, face) =>
-            max.score > face.score ? max : face,
-        );
-        cgroups.push({
-            id: cluster.id,
-            name: undefined,
-            clusterIDs: [cluster.id],
-            isHidden: false,
-            avatarFaceID: undefined,
-            displayFaceID: topFace.faceID,
-        });
-    }
-
-    // log.info("ml/cluster", {
-    //     faces,
-    //     validClusters,
-    //     clusterIndexForClusterID: Object.fromEntries(clusterIndexForClusterID),
-    //     clusterIDForFaceID: Object.fromEntries(clusterIDForFaceID),
-    //     cgroups,
-    // });
-    log.info(
-        `Clustered ${faces.length} faces into ${validClusters.length} clusters, with ${faces.length - clusterIDForFaceID.size} faces remaining unclustered (${Date.now() - t} ms)`,
-    );
-
-    const clusteredCount = clusterIDForFaceID.size;
-    const unclusteredCount = faces.length - clusteredCount;
-
-    return {
-        // faces,
-        clusteredCount,
-        unclusteredCount,
-        clusters: validClusters,
-        cgroups,
-        clusterPreviews,
-        clusterIDForFaceID,
-    };
-};
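The remote representation described in the {@link CGroup} documentation above can be sketched as TypeScript types. This is an illustration, not the actual wire format: only the { id, name, clusters: [{ clusterID, faceIDs }] } shape comes from the comment, and the type names here are assumptions:

    // Hypothetical sketch of a synced "cgroup" user entity. Per the comment
    // above, the id lives on the user entity envelope, not in the payload.
    interface RemoteFaceCluster {
        clusterID: string;
        faceIDs: string[];
    }

    interface RemoteCGroupPayload {
        name?: string; // "" or absent for unnamed (e.g. hidden) groups
        clusters: RemoteFaceCluster[];
    }

    const example: { id: string; payload: RemoteCGroupPayload } = {
        id: "cgroup_example",
        payload: {
            name: "Alice",
            clusters: [
                { clusterID: "cluster_example", faceIDs: ["face_1", "face_2"] },
            ],
        },
    };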
@@ -1,35 +1,507 @@
-import { Hdbscan, type DebugInfo } from "hdbscan";
+import { newNonSecureID } from "@/base/id-worker";
+import log from "@/base/log";
+import { ensure } from "@/utils/ensure";
+import { type EmbeddingCluster, clusterHdbscan } from "./cluster-hdb";
+import type { Face, FaceIndex } from "./face";
+import { dotProduct } from "./math";
 
-export type Cluster = number[];
-
-export interface ClusterFacesResult {
-    clusters: Cluster[];
-    noise: Cluster;
-    debugInfo?: DebugInfo;
-}
+/**
+ * A face cluster is a set of faces.
+ *
+ * Each cluster has an id so that a {@link CGroup} can refer to it.
+ *
+ * The cluster is not directly synced to remote. Only clusters that the user
+ * interacts with get synced to remote, as part of a {@link CGroup}.
+ */
+export interface FaceCluster {
+    /**
+     * A nanoid for this cluster.
+     */
+    id: string;
+    /**
+     * An unordered set of ids of the faces that belong to this cluster.
+     *
+     * For ergonomics of transportation and persistence this is an array, but it
+     * should conceptually be thought of as a set.
+     */
+    faceIDs: string[];
+}
 
 /**
- * Cluster the given {@link faceEmbeddings}.
+ * A cgroup ("cluster group") is a group of clusters (possibly containing a
+ * single cluster) that the user has interacted with.
+ *
+ * Interactions include hiding, merging and giving a name and/or a cover photo.
+ *
+ * The most frequent interaction is naming a {@link FaceCluster}, which promotes
+ * it to become a {@link CGroup}. The promotion comes with the ability to be
+ * synced with remote (as a "cgroup" user entity).
+ *
+ * Thereafter, the user may attach more clusters to the same {@link CGroup}.
+ *
+ * > A named cluster group can be thought of as a "person", though this is not
+ * > necessarily an accurate characterization. e.g. there can be a named cluster
+ * > group that contains face clusters of pets.
+ *
+ * The other form of interaction is hiding. The user may hide a single (unnamed)
+ * cluster, or they may hide a named {@link CGroup}. In both cases, we promote
+ * the cluster to a CGroup if needed so that their request to hide gets synced.
+ *
+ * While in our local representation we separately maintain clusters and link to
+ * them from within CGroups by their clusterID, in the remote representation
+ * clusters themselves don't get synced. Instead, the "cgroup" entities synced
+ * with remote contain the clusters within themselves. So a group that gets
+ * synced with remote looks something like:
+ *
+ * { id, name, clusters: [{ clusterID, faceIDs }] }
 *
- * @param faceEmbeddings An array of embeddings produced by our face indexing
- * pipeline. Each embedding is for a face detected in an image (a single image
- * may have multiple faces detected within it).
 */
-export const clusterFacesHdbscan = (
-    faceEmbeddings: number[][],
-): ClusterFacesResult => {
-    const hdbscan = new Hdbscan({
-        input: faceEmbeddings,
-        minClusterSize: 3,
-        minSamples: 5,
-        clusterSelectionEpsilon: 0.6,
-        clusterSelectionMethod: "leaf",
-        debug: false,
-    });
+export interface CGroup {
+    /**
+     * A nanoid for this cluster group.
+     *
+     * This is the ID of the "cgroup" user entity (the envelope), and it is not
+     * contained as part of the group entity payload itself.
+     */
+    id: string;
+    /**
+     * A name assigned by the user to this cluster group.
+     *
+     * The client should handle both empty strings and undefined as indicating a
+     * cgroup without a name. When the client needs to set this to an "empty"
+     * value, which happens when hiding an unnamed cluster, it should set it to
+     * an empty string. That is, expect `"" | undefined`, but set `""`.
+     */
+    name: string | undefined;
+    /**
+     * An unordered set of ids of the clusters that belong to this group.
+     *
+     * For ergonomics of transportation and persistence this is an array, but it
+     * should conceptually be thought of as a set.
+     */
+    clusterIDs: string[];
+    /**
+     * True if this cluster group should be hidden.
+     *
+     * The user can hide both named cluster groups and single unnamed clusters.
+     * If the user hides a single cluster that was offered as a suggestion to
+     * them on a client, the client will create a new unnamed cgroup containing
+     * it, and set its hidden flag to sync it with remote (so that other clients
+     * can also stop showing this cluster).
+     */
+    isHidden: boolean;
+    /**
+     * The ID of the face that should be used as the cover photo for this
+     * cluster group (if the user has set one).
+     *
+     * This is similar to {@link displayFaceID}, the difference being:
+     *
+     * - {@link avatarFaceID} is the face selected by the user.
+     *
+     * - {@link displayFaceID} is the automatic placeholder, and only comes
+     *   into effect if the user has not explicitly selected a face.
+     */
+    avatarFaceID: string | undefined;
+    /**
+     * Locally determined ID of the "best" face that should be used as the
+     * display face, to represent this cluster group in the UI.
+     *
+     * This property is not synced with remote. For more details, see
+     * {@link avatarFaceID}.
+     */
+    displayFaceID: string | undefined;
+}
+
+export interface ClusteringOpts {
+    method: "linear" | "hdbscan";
+    batchSize: number;
+    joinThreshold: number;
+}
+
+export interface ClusterPreview {
+    clusterSize: number;
+    faces: ClusterPreviewFace[];
+}
+
+export interface ClusterPreviewFace {
+    face: Face;
+    cosineSimilarity: number;
+    wasMerged?: boolean;
+}
+
+/**
+ * Cluster faces into groups.
+ *
+ * [Note: Face clustering algorithm]
+ *
+ * A cgroup (cluster group) consists of clusters, each of which itself is a set
+ * of faces.
+ *
+ * cgroup << cluster << face
+ *
+ * The clusters are generated locally by clients using the following algorithm:
+ *
+ * 1. clusters = [] initially, or fetched from remote.
+ *
+ * 2. For each face, find its nearest neighbour in the embedding space.
+ *
+ * 3. If no such neighbour is found within our threshold, create a new cluster.
+ *
+ * 4. Otherwise assign this face to the same cluster as its nearest neighbour.
+ *
+ * The user can then tweak the output of the algorithm by performing the
+ * following actions on the list of clusters that they can see:
+ *
+ * - They can provide a name for a cluster ("name a person"). This upgrades a
+ *   cluster into a "cgroup", which is an entity that gets synced via remote
+ *   to the user's other clients.
+ *
+ * - They can attach more clusters to a cgroup ("merge clusters").
+ *
+ * - They can remove a cluster from a cgroup ("break clusters").
+ *
+ * After clustering, we also do some routine cleanup. Faces belonging to files
+ * that have been deleted (including those in Trash) should be pruned off.
+ *
+ * We should not make strict assumptions about the clusters we get from remote.
+ * In particular, the same face ID can be in different clusters. In such cases
+ * we should arbitrarily assign it to the last cluster we find it in. Such
+ * leeway is intentionally provided to allow clients some slack in how they
+ * implement the sync without needing to make a blocking API request for every
+ * user interaction.
+ */
+export const clusterFaces = (
+    faceIndexes: FaceIndex[],
+    opts: ClusteringOpts,
+) => {
+    const { batchSize, joinThreshold } = opts;
+    const t = Date.now();
+
+    // A flattened array of faces.
+    // TODO-Cluster ad-hoc filtering and slicing
+    const faces0 = [...enumerateFaces(faceIndexes)].filter((f) => f.blur > 99);
+    // .slice(0, 6000);
+    // TODO-Cluster testing code, can be removed once done
+    const faces = Array(1)
+        .fill(0)
+        .flatMap(() => faces0);
+
+    // For fast reverse lookup - map from face ids to the face.
+    const faceForFaceID = new Map(faces.map((f) => [f.faceID, f]));
+
+    const faceEmbeddings = faces.map(({ embedding }) => embedding);
+
+    // For fast reverse lookup - map from cluster ids to their index in the
+    // clusters array.
+    const clusterIndexForClusterID = new Map<string, number>();
+
+    // For fast reverse lookup - map from the id of a face to the id of the
+    // cluster to which it belongs.
+    const clusterIDForFaceID = new Map<string, string>();
+
+    // A function chaining the two reverse lookups.
+    const firstFaceOfCluster = (cluster: FaceCluster) =>
+        ensure(faceForFaceID.get(ensure(cluster.faceIDs[0])));
+
+    // A function to generate new cluster IDs.
+    const newClusterID = () => newNonSecureID("cluster_");
+
+    // The resultant clusters.
+    // TODO-Cluster Later on, instead of starting from a blank slate, this will
+    // be the list of existing clusters we fetch from remote.
+    const clusters: FaceCluster[] = [];
+
+    // Process the faces in batches. The faces are already sorted by file ID,
+    // which is a monotonically increasing integer, so we will also have some
+    // temporal locality.
+    //
+    // A batch size of 2500 was derived by ad-hoc observations and takes a few
+    // seconds. On a particular test dataset and a particular machine,
+    // clustering 1k took ~2 seconds, 10k took ~2 mins, while 20k took ~8 mins.
+    // Memory usage was constant in all these cases.
+    //
+    // At around 100k faces, the clustering starts taking hours, and we start
+    // running into stack overflows. The stack overflows can perhaps be avoided
+    // by restructuring the code, but hours of uninterruptible work is anyway
+    // not feasible.
+    for (let i = 0; i < faceEmbeddings.length; i += batchSize) {
+        const it = Date.now();
+        const embeddings = faceEmbeddings.slice(i, i + batchSize);
+        const { clusters: hdbClusters } = clusterHdbscan(embeddings);
+
+        log.info(
+            `hdbscan produced ${hdbClusters.length} clusters from ${embeddings.length} faces (${Date.now() - it} ms)`,
+        );
+
+        // Merge the new clusters we got from hdbscan into the existing clusters
+        // if they are "near" them (using some heuristic).
+        //
+        // We need to ensure we don't change any of the existing cluster IDs,
+        // since these might be existing clusters we got from remote.
+        for (const hdbCluster of hdbClusters) {
+            // Find the existing cluster whose (arbitrarily chosen) first face
+            // is the nearest neighbour of the (arbitrarily chosen) first face
+            // of the cluster produced by hdbscan.
+            const newFace = ensure(faces[i + ensure(hdbCluster[0])]);
+
+            let nnCluster: FaceCluster | undefined;
+            let nnCosineSimilarity = 0;
+            for (const existingCluster of clusters) {
+                const existingFace = firstFaceOfCluster(existingCluster);
+
+                // The vectors are already normalized, so we can directly use their
+                // dot product as their cosine similarity.
+                const csim = dotProduct(
+                    existingFace.embedding,
+                    newFace.embedding,
+                );
+
+                // Use a higher cosine similarity threshold if either of the two
+                // faces is blurry.
+                const threshold =
+                    existingFace.blur < 200 || newFace.blur < 200 ? 0.9 : 0.7;
+                if (csim > threshold && csim > nnCosineSimilarity) {
+                    nnCluster = existingCluster;
+                    nnCosineSimilarity = csim;
+                }
+            }
+
+            if (nnCluster) {
+                // If we found an existing cluster that is near enough,
+                // sublimate the cluster produced by hdbscan into that cluster.
+                for (const j of hdbCluster) {
+                    const { faceID } = ensure(faces[i + j]);
+                    nnCluster.faceIDs.push(faceID);
+                    clusterIDForFaceID.set(faceID, nnCluster.id);
+                }
+            } else {
+                // Otherwise make a new cluster from the cluster produced by
+                // hdbscan.
+                const clusterID = newClusterID();
+                const faceIDs: string[] = [];
+                for (const j of hdbCluster) {
+                    const { faceID } = ensure(faces[i + j]);
+                    faceIDs.push(faceID);
+                    clusterIDForFaceID.set(faceID, clusterID);
+                }
+                clusterIndexForClusterID.set(clusterID, clusters.length);
+                clusters.push({ id: clusterID, faceIDs });
+            }
+        }
+    }
+
+    // Convert into the data structure we're using to debug/visualize.
+    // const faceAndNeigbours: FaceNeighbours[] = [];
+    // const topFaces = faces.sort((a, b) => b.score - a.score).slice(0, 30);
+    // for (const fi of topFaces) {
+    //     let neighbours: FaceNeighbour[] = [];
+    //     for (const fj of faces) {
+    //         // The vectors are already normalized, so we can directly use their
+    //         // dot product as their cosine similarity.
+    //         const csim = dotProduct(fi.embedding, fj.embedding);
+    //         neighbours.push({ face: fj, cosineSimilarity: csim });
+    //     }
+
+    //     neighbours = neighbours
+    //         .sort((a, b) => b.cosineSimilarity - a.cosineSimilarity)
+    //         .slice(0, 30);
+
+    //     faceAndNeigbours.push({ face: fi, neighbours });
+    // }
+
+    // Convert into the data structure we're using to debug/visualize.
+    //
+    // > Showing only top 30 and bottom 30 clusters (and only up to 50 faces in
+    // > each, sorted by cosine distance to highest scoring face in the
+    // > cluster).
+    const sortedClusters = clusters.sort(
+        (a, b) => b.faceIDs.length - a.faceIDs.length,
+    );
+    const debugClusters =
+        sortedClusters.length < 60
+            ? sortedClusters
+            : sortedClusters.slice(0, 30).concat(sortedClusters.slice(-30));
+    const clusterPreviews: ClusterPreview[] = [];
+    for (const cluster of debugClusters) {
+        const faces = cluster.faceIDs.map((id) =>
+            ensure(faceForFaceID.get(id)),
+        );
+        const topFace = faces.reduce((max, face) =>
+            max.score > face.score ? max : face,
+        );
+        const previewFaces: ClusterPreviewFace[] = [];
+        for (const face of faces) {
+            const csim = dotProduct(topFace.embedding, face.embedding);
+            previewFaces.push({ face, cosineSimilarity: csim });
+        }
+        clusterPreviews.push({
+            clusterSize: cluster.faceIDs.length,
+            faces: previewFaces
+                .sort((a, b) => b.cosineSimilarity - a.cosineSimilarity)
+                .slice(0, 50),
+        });
+    }
+
+    // Prune too small clusters.
+    // TODO-Cluster this is likely not needed since hdbscan already has a min?
+    const validClusters = clusters.filter(({ faceIDs }) => faceIDs.length > 1);
+
+    // let cgroups = await clusterGroups();
+
+    // // TODO-Cluster - Currently we're not syncing with remote or saving anything
+    // // locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
+    // // cgroup, one per cluster.
+    // cgroups = cgroups.concat(
+    //     validClusters.map((c) => ({
+    //         id: c.id,
+    //         name: undefined,
+    //         clusterIDs: [c.id],
+    //         isHidden: false,
+    //         avatarFaceID: undefined,
+    //         displayFaceID: undefined,
+    //     })),
+    // );
+
+    // // For each cluster group, use the highest scoring face in any of its
+    // // clusters as its display face.
+    // for (const cgroup of cgroups) {
+    //     cgroup.displayFaceID = cgroup.clusterIDs
+    //         .map((clusterID) => clusterIndexForClusterID.get(clusterID))
+    //         .filter((i) => i !== undefined) /* 0 is a valid index */
+    //         .flatMap((i) => clusters[i]?.faceIDs ?? [])
+    //         .map((faceID) => faceForFaceID.get(faceID))
+    //         .filter((face) => !!face)
+    //         .reduce((max, face) =>
+    //             max.score > face.score ? max : face,
+    //         ).faceID;
+    // }
+
+    // TODO-Cluster - Currently we're not syncing with remote or saving anything
+    // locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
+    // cgroup, one per cluster.
+    const cgroups: CGroup[] = [];
+    for (const cluster of sortedClusters) {
+        const faces = cluster.faceIDs.map((id) =>
+            ensure(faceForFaceID.get(id)),
+        );
+        const topFace = faces.reduce((max, face) =>
+            max.score > face.score ? max : face,
+        );
+        cgroups.push({
+            id: cluster.id,
+            name: undefined,
+            clusterIDs: [cluster.id],
+            isHidden: false,
+            avatarFaceID: undefined,
+            displayFaceID: topFace.faceID,
+        });
+    }
+
+    // log.info("ml/cluster", {
+    //     faces,
+    //     validClusters,
+    //     clusterIndexForClusterID: Object.fromEntries(clusterIndexForClusterID),
+    //     clusterIDForFaceID: Object.fromEntries(clusterIDForFaceID),
+    //     cgroups,
+    // });
+    log.info(
+        `Clustered ${faces.length} faces into ${validClusters.length} clusters, with ${faces.length - clusterIDForFaceID.size} faces remaining unclustered (${Date.now() - t} ms)`,
+    );
+
+    const clusteredCount = clusterIDForFaceID.size;
+    const unclusteredCount = faces.length - clusteredCount;
+
     return {
-        clusters: hdbscan.getClusters(),
-        noise: hdbscan.getNoise(),
-        debugInfo: hdbscan.getDebugInfo(),
+        // faces,
+        clusteredCount,
+        unclusteredCount,
+        clusters: validClusters,
+        cgroups,
+        clusterPreviews,
+        clusterIDForFaceID,
     };
 };
+
+/**
+ * A generator function that returns a stream of {faceID, embedding} values,
+ * flattening all the faces present in the given {@link faceIndices}.
+ */
+function* enumerateFaces(faceIndices: FaceIndex[]) {
+    for (const fi of faceIndices) {
+        for (const f of fi.faces) {
+            yield f;
+        }
+    }
+}
+
+interface ClusterLinearResult {
+    clusters: EmbeddingCluster[];
+}
+
+const clusterLinear = (
+    embeddings: number[][],
+    threshold: number,
+): ClusterLinearResult => {
+    const clusters: EmbeddingCluster[] = [];
+    const clusterIndexForEmbeddingIndex = new Map<number, number>();
+    // For each embedding
+    for (const [i, ei] of embeddings.entries()) {
+        // If the embedding is already part of a cluster, then skip it.
+        if (clusterIndexForEmbeddingIndex.has(i)) continue;
+
+        // Find the nearest neighbour from among all the other embeddings.
+        let nnIndex: number | undefined;
+        let nnCosineSimilarity = 0;
+        for (const [j, ej] of embeddings.entries()) {
+            // ! This is an O(n^2) loop, be careful when adding more code here.
+
+            // Skip ourselves.
+            if (i == j) continue;
+
+            // The vectors are already normalized, so we can directly use their
+            // dot product as their cosine similarity.
+            const csim = dotProduct(ei, ej);
+            if (csim > threshold && csim > nnCosineSimilarity) {
+                nnIndex = j;
+                nnCosineSimilarity = csim;
+            }
+        }
+
+        if (nnIndex !== undefined) {
+            // Find the cluster the nearest neighbour belongs to, if any.
+            // 0 is a valid index, so compare against undefined explicitly.
+            const nnClusterIndex = clusterIndexForEmbeddingIndex.get(nnIndex);
+
+            if (nnClusterIndex !== undefined) {
+                // If the neighbour is already part of a cluster, also add
+                // ourselves to that cluster.
+                ensure(clusters[nnClusterIndex]).push(i);
+                clusterIndexForEmbeddingIndex.set(i, nnClusterIndex);
+            } else {
+                // Otherwise create a new cluster with us and our nearest
+                // neighbour.
+                clusterIndexForEmbeddingIndex.set(i, clusters.length);
+                clusterIndexForEmbeddingIndex.set(nnIndex, clusters.length);
+                clusters.push([i, nnIndex]);
+            }
+        } else {
+            // We didn't find a neighbour within the threshold. Create a new
+            // cluster with only this embedding.
+            clusterIndexForEmbeddingIndex.set(i, clusters.length);
+            clusters.push([i]);
+        }
+    }
+
+    // Prune singleton clusters.
+    const validClusters = clusters.filter((cs) => cs.length > 1);
+
+    return { clusters: validClusters };
+};
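A small worked sketch of the linear method, assuming clusterLinear were exported (it is module-private above). Because the embeddings are unit-normalized, dotProduct(a, b) equals their cosine similarity, so the threshold is a cosine-similarity cutoff:

    // Toy unit vectors; real embeddings are high-dimensional.
    const embeddings = [
        [1, 0], // 0
        [0.995, 0.1], // 1: cosine similarity ~0.995 with embedding 0
        [0, 1], // 2: cosine similarity 0 with embedding 0
    ];

    // With a threshold of 0.9, embeddings 0 and 1 join one cluster, while
    // embedding 2 has no neighbour above the threshold and gets pruned as a
    // singleton.
    const { clusters } = clusterLinear(embeddings, 0.9);
    // clusters == [[0, 1]]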
@@ -3,7 +3,7 @@ import log from "@/base/log";
 import localForage from "@ente/shared/storage/localForage";
 import { deleteDB, openDB, type DBSchema } from "idb";
 import type { LocalCLIPIndex } from "./clip";
-import type { CGroup, FaceCluster } from "./cluster-new";
+import type { CGroup, FaceCluster } from "./cluster";
 import type { LocalFaceIndex } from "./face";
 
 /**
@@ -24,7 +24,7 @@ import {
     type ClusteringOpts,
     type ClusterPreviewFace,
     type FaceCluster,
-} from "./cluster-new";
+} from "./cluster";
 import { regenerateFaceCrops } from "./crop";
 import { clearMLDB, faceIndex, indexableAndIndexedCounts } from "./db";
 import type { Face } from "./face";
@@ -386,7 +386,7 @@ export const wipClusterDebugPageContents = async (
         clusters,
         cgroups,
         unclusteredFaces,
-    } = await worker().then((w) => w.clusterFacesHdb(opts));
+    } = await worker().then((w) => w.clusterFaces(opts));
 
     const localFiles = await getAllLocalFiles();
     const localFileByID = new Map(localFiles.map((f) => [f.id, f]));
@@ -24,7 +24,7 @@ import {
     indexCLIP,
     type CLIPIndex,
 } from "./clip";
-import { clusterFacesHdb, type ClusteringOpts } from "./cluster-new";
+import { clusterFaces, type ClusteringOpts } from "./cluster";
 import { saveFaceCrops } from "./crop";
 import {
     faceIndexes,
@@ -276,8 +276,8 @@ export class MLWorker {
     }
 
     // TODO-Cluster
-    async clusterFacesHdb(opts: ClusteringOpts) {
-        return clusterFacesHdb(await faceIndexes(), opts);
+    async clusterFaces(opts: ClusteringOpts) {
+        return clusterFaces(await faceIndexes(), opts);
     }
 }
@@ -12,7 +12,7 @@ import { ensure } from "@/utils/ensure";
 import { nullToUndefined } from "@/utils/transform";
 import { z } from "zod";
 import { gunzip } from "./gzip";
-import type { CGroup } from "./ml/cluster-new";
+import type { CGroup } from "./ml/cluster";
 import { applyCGroupDiff } from "./ml/db";
 
 /**