This commit is contained in:
Manav Rathi
2024-08-30 17:24:49 +05:30
parent 48e00a0ecc
commit 4f4eb773fc
4 changed files with 41 additions and 116 deletions

View File

@@ -6,7 +6,7 @@ import {
type ClusterDebugPageContents,
type ClusterPreviewFaceWithFile,
} from "@/new/photos/services/ml";
import { type ClusteringOpts } from "@/new/photos/services/ml/cluster-new";
import { type ClusteringOpts } from "@/new/photos/services/ml/cluster";
import { faceDirection } from "@/new/photos/services/ml/face";
import {
FlexWrapper,
@@ -297,7 +297,7 @@ const Header: React.FC<HeaderProps> = ({ clusterRes, onCluster }) => {
const clusterInfo = clusterRes && (
<Stack m={1}>
<Typography variant="small" mb={1}>
{`${clusterRes.clusters.length} clusters from ${clusterRes.clusteredFaceCount} faces. ${clusterRes.unclusteredFaceCount} unclustered faces.`}
{`${clusterRes.clusters.length} clusters from ${clusterRes.clusteredFaceCount} faces in ${(clusterRes.timeTakenMs / 1000).toFixed(0)} seconds. ${clusterRes.unclusteredFaceCount} unclustered faces.`}
</Typography>
<Typography variant="small" color="text.muted">
Showing only top 30 and bottom 30 clusters.

View File

@@ -174,17 +174,11 @@ export const clusterFaces = (
faceIndexes: FaceIndex[],
opts: ClusteringOpts,
) => {
const { batchSize, joinThreshold } = opts;
const { method, batchSize, joinThreshold } = opts;
const t = Date.now();
// A flattened array of faces.
// TODO-Cluster ad-hoc filtering and slicing
const faces0 = [...enumerateFaces(faceIndexes)].filter((f) => f.blur > 99);
// .slice(0, 6000);
// TODO-Cluster testing code, can be removed once done
const faces = Array(1)
.fill(0)
.flatMap(() => faces0);
const faces = [...enumerateFaces(faceIndexes)].filter((f) => f.blur > 99);
// For fast reverse lookup - map from face ids to the face.
const faceForFaceID = new Map(faces.map((f) => [f.faceID, f]));
@@ -199,6 +193,10 @@ export const clusterFaces = (
// cluster to which it belongs.
const clusterIDForFaceID = new Map<string, string>();
// Keeps track of which faces were found by the OG clustering algorithm, and
// which were merged in from a later match.
const wasMergedFaceIDs = new Set<string>();
// A function to chain two reverse lookups.
const firstFaceOfCluster = (cluster: FaceCluster) =>
ensure(faceForFaceID.get(ensure(cluster.faceIDs[0])));
@@ -214,18 +212,7 @@ export const clusterFaces = (
// Process the faces in batches. The faces are already sorted by file ID,
// which is a monotonically increasing integer, so we will also have some
// temporal locality.
//
// The number 2500 was derived by ad-hoc observations and takes a few
// seconds. On a particular test dataset and a particular machine,
// clustering 1k took ~2 seconds, 10k took ~2 mins, while 20k took ~8 mins.
// Memory usage was constant in all these cases.
//
// At around 100k faces, the clustering starts taking hours, and we start
// running into stack overflows. The stack overflows can perhaps be avoided
// by restructuring the code, but hours of uninterruptible work is anyways
// not feasible.
const batchSize = 2500;
for (let i = 0; i < faceEmbeddings.length; i += batchSize) {
const it = Date.now();
const embeddings = faceEmbeddings.slice(i, i + batchSize);
@@ -294,92 +281,34 @@ export const clusterFaces = (
}
}
// Convert into the data structure we're using to debug/visualize.
// const faceAndNeigbours: FaceNeighbours[] = [];
// const topFaces = faces.sort((a, b) => b.score - a.score).slice(0, 30);
// for (const fi of topFaces) {
// let neighbours: FaceNeighbour[] = [];
// for (const fj of faces) {
// // The vectors are already normalized, so we can directly use their
// // dot product as their cosine similarity.
// const csim = dotProduct(fi.embedding, fj.embedding);
// neighbours.push({ face: fj, cosineSimilarity: csim });
// }
// neighbours = neighbours
// .sort((a, b) => b.cosineSimilarity - a.cosineSimilarity)
// .slice(0, 30);
// faceAndNeigbours.push({ face: fi, neighbours });
// }
// Convert into the data structure we're using to debug/visualize.
//
// > Showing only top 30 and bottom 30 clusters (and only up to 50 faces in
// > each, sorted by cosine distance to highest scoring face in the
// > cluster).
const sortedClusters = clusters.sort(
(a, b) => b.faceIDs.length - a.faceIDs.length,
);
const debugClusters =
// Convert into the data structure we're using to debug/visualize.
const clusterPreviewClusters =
sortedClusters.length < 60
? sortedClusters
: sortedClusters.slice(0, 30).concat(sortedClusters.slice(-30));
const clusterPreviews: ClusterPreview[] = [];
for (const cluster of debugClusters) {
const clusterPreviews = clusterPreviewClusters.map((cluster) => {
const faces = cluster.faceIDs.map((id) =>
ensure(faceForFaceID.get(id)),
);
const topFace = faces.reduce((max, face) =>
max.score > face.score ? max : face,
const topFace = faces.reduce((top, face) =>
top.score > face.score ? top : face,
);
const previewFaces: ClusterPreviewFace[] = [];
for (const face of faces) {
const previewFaces: ClusterPreviewFace[] = faces.map((face) => {
const csim = dotProduct(topFace.embedding, face.embedding);
previewFaces.push({ face, cosineSimilarity: csim });
}
clusterPreviews.push({
const wasMerged = wasMergedFaceIDs.has(face.faceID);
return { face, cosineSimilarity: csim, wasMerged };
});
return {
clusterSize: cluster.faceIDs.length,
faces: previewFaces
.sort((a, b) => b.cosineSimilarity - a.cosineSimilarity)
.slice(0, 50),
});
}
// Prune too small clusters.
// TODO-Cluster this is likely not needed since hdbscan already has a min?
const validClusters = clusters.filter(({ faceIDs }) => faceIDs.length > 1);
// let cgroups = await clusterGroups();
// // TODO-Cluster - Currently we're not syncing with remote or saving anything
// // locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
// // cgroup, one per cluster.
// cgroups = cgroups.concat(
// validClusters.map((c) => ({
// id: c.id,
// name: undefined,
// clusterIDs: [c.id],
// isHidden: false,
// avatarFaceID: undefined,
// displayFaceID: undefined,
// })),
// );
// // For each cluster group, use the highest scoring face in any of its
// // clusters as its display face.
// for (const cgroup of cgroups) {
// cgroup.displayFaceID = cgroup.clusterIDs
// .map((clusterID) => clusterIndexForClusterID.get(clusterID))
// .filter((i) => i !== undefined) /* 0 is a valid index */
// .flatMap((i) => clusters[i]?.faceIDs ?? [])
// .map((faceID) => faceForFaceID.get(faceID))
// .filter((face) => !!face)
// .reduce((max, face) =>
// max.score > face.score ? max : face,
// ).faceID;
// }
};
});
// TODO-Cluster - Currently we're not syncing with remote or saving anything
// locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
@@ -390,8 +319,8 @@ export const clusterFaces = (
const faces = cluster.faceIDs.map((id) =>
ensure(faceForFaceID.get(id)),
);
const topFace = faces.reduce((max, face) =>
max.score > face.score ? max : face,
const topFace = faces.reduce((top, face) =>
top.score > face.score ? top : face,
);
cgroups.push({
id: cluster.id,
@@ -403,28 +332,22 @@ export const clusterFaces = (
});
}
// log.info("ml/cluster", {
// faces,
// validClusters,
// clusterIndexForClusterID: Object.fromEntries(clusterIndexForClusterID),
// clusterIDForFaceID: Object.fromEntries(clusterIDForFaceID),
// cgroups,
// });
const timeTakenMs = Date.now() - t;
log.info(
`Clustered ${faces.length} faces into ${validClusters.length} clusters, with ${faces.length - clusterIDForFaceID.size} faces remaining unclustered (${Date.now() - t} ms)`,
`Clustered ${faces.length} faces into ${clusters.length} clusters, with ${faces.length - clusterIDForFaceID.size} faces remaining unclustered (${timeTakenMs} ms)`,
);
const clusteredCount = clusterIDForFaceID.size;
const unclusteredCount = faces.length - clusteredCount;
const clusteredFaceCount = clusterIDForFaceID.size;
const unclusteredFaceCount = faces.length - clusteredFaceCount;
return {
// faces,
clusteredCount,
unclusteredCount,
clusters: validClusters,
cgroups,
clusteredFaceCount,
unclusteredFaceCount,
clusterPreviews,
clusterIDForFaceID,
clusters: sortedClusters,
cgroups,
unclusteredFaces: [],
timeTakenMs,
};
};

View File

@@ -365,7 +365,8 @@ export interface ClusterDebugPageContents {
unclusteredFacesWithFile: {
face: Face;
enteFile: EnteFile;
};
}[];
timeTakenMs: number;
}
export const wipClusterDebugPageContents = async (
@@ -378,7 +379,6 @@ export const wipClusterDebugPageContents = async (
_wip_searchPersons = undefined;
triggerStatusUpdate();
// const { faceAndNeigbours, clusters, cgroups } = await clusterFaces(
const {
clusteredFaceCount,
unclusteredFaceCount,
@@ -386,6 +386,7 @@ export const wipClusterDebugPageContents = async (
clusters,
cgroups,
unclusteredFaces,
timeTakenMs,
} = await worker().then((w) => w.clusterFaces(opts));
const localFiles = await getAllLocalFiles();
@@ -396,10 +397,10 @@ export const wipClusterDebugPageContents = async (
const clusterPreviewsWithFile = clusterPreviews.map(
({ clusterSize, faces }) => ({
clusterSize,
faces: faces.map(({ face, cosineSimilarity }) => ({
faces: faces.map(({ face, ...rest }) => ({
face,
enteFile: fileForFace(face),
cosineSimilarity,
...rest,
})),
}),
);
@@ -445,6 +446,7 @@ export const wipClusterDebugPageContents = async (
clusters,
clusterPreviewsWithFile,
unclusteredFacesWithFile,
timeTakenMs,
};
};

View File

@@ -24,7 +24,7 @@ import {
indexCLIP,
type CLIPIndex,
} from "./clip";
import { type ClusteringOpts } from "./cluster";
import { clusterFaces, type ClusteringOpts } from "./cluster";
import { saveFaceCrops } from "./crop";
import {
faceIndexes,
@@ -277,7 +277,7 @@ export class MLWorker {
// TODO-Cluster
async clusterFaces(opts: ClusteringOpts) {
return clusterFace(await faceIndexes(), opts);
return clusterFaces(await faceIndexes(), opts);
}
}