Clean
This commit is contained in:
@@ -6,7 +6,7 @@ import {
|
||||
type ClusterDebugPageContents,
|
||||
type ClusterPreviewFaceWithFile,
|
||||
} from "@/new/photos/services/ml";
|
||||
import { type ClusteringOpts } from "@/new/photos/services/ml/cluster-new";
|
||||
import { type ClusteringOpts } from "@/new/photos/services/ml/cluster";
|
||||
import { faceDirection } from "@/new/photos/services/ml/face";
|
||||
import {
|
||||
FlexWrapper,
|
||||
@@ -297,7 +297,7 @@ const Header: React.FC<HeaderProps> = ({ clusterRes, onCluster }) => {
|
||||
const clusterInfo = clusterRes && (
|
||||
<Stack m={1}>
|
||||
<Typography variant="small" mb={1}>
|
||||
{`${clusterRes.clusters.length} clusters from ${clusterRes.clusteredFaceCount} faces. ${clusterRes.unclusteredFaceCount} unclustered faces.`}
|
||||
{`${clusterRes.clusters.length} clusters from ${clusterRes.clusteredFaceCount} faces in ${(clusterRes.timeTakenMs / 1000).toFixed(0)} seconds. ${clusterRes.unclusteredFaceCount} unclustered faces.`}
|
||||
</Typography>
|
||||
<Typography variant="small" color="text.muted">
|
||||
Showing only top 30 and bottom 30 clusters.
|
||||
|
||||
@@ -174,17 +174,11 @@ export const clusterFaces = (
|
||||
faceIndexes: FaceIndex[],
|
||||
opts: ClusteringOpts,
|
||||
) => {
|
||||
const { batchSize, joinThreshold } = opts;
|
||||
const { method, batchSize, joinThreshold } = opts;
|
||||
const t = Date.now();
|
||||
|
||||
// A flattened array of faces.
|
||||
// TODO-Cluster ad-hoc filtering and slicing
|
||||
const faces0 = [...enumerateFaces(faceIndexes)].filter((f) => f.blur > 99);
|
||||
// .slice(0, 6000);
|
||||
// TODO-Cluster testing code, can be removed once done
|
||||
const faces = Array(1)
|
||||
.fill(0)
|
||||
.flatMap(() => faces0);
|
||||
const faces = [...enumerateFaces(faceIndexes)].filter((f) => f.blur > 99);
|
||||
|
||||
// For fast reverse lookup - map from face ids to the face.
|
||||
const faceForFaceID = new Map(faces.map((f) => [f.faceID, f]));
|
||||
@@ -199,6 +193,10 @@ export const clusterFaces = (
|
||||
// cluster to which it belongs.
|
||||
const clusterIDForFaceID = new Map<string, string>();
|
||||
|
||||
// Keeps track of which faces were found by the original clustering algorithm, and
|
||||
// which were merged in from a later match.
|
||||
const wasMergedFaceIDs = new Set<string>();
|
||||
|
||||
// A function to chain two reverse lookups.
|
||||
const firstFaceOfCluster = (cluster: FaceCluster) =>
|
||||
ensure(faceForFaceID.get(ensure(cluster.faceIDs[0])));
|
||||
@@ -214,18 +212,7 @@ export const clusterFaces = (
|
||||
// Process the faces in batches. The faces are already sorted by file ID,
|
||||
// which is a monotonically increasing integer, so we will also have some
|
||||
// temporal locality.
|
||||
//
|
||||
// The number 2500 was derived by ad-hoc observations and takes a few
|
||||
// seconds. On a particular test dataset and a particular machine,
|
||||
// clustering 1k took ~2 seconds, 10k took ~2 mins, while 20k took ~8 mins.
|
||||
// Memory usage was constant in all these cases.
|
||||
//
|
||||
// At around 100k faces, the clustering starts taking hours, and we start
|
||||
// running into stack overflows. The stack overflows can perhaps be avoided
|
||||
// by restructuring the code, but hours of uninterruptible work is anyways
|
||||
// not feasible.
|
||||
|
||||
const batchSize = 2500;
|
||||
for (let i = 0; i < faceEmbeddings.length; i += batchSize) {
|
||||
const it = Date.now();
|
||||
const embeddings = faceEmbeddings.slice(i, i + batchSize);
|
||||
@@ -294,92 +281,34 @@ export const clusterFaces = (
|
||||
}
|
||||
}
|
||||
|
||||
// Convert into the data structure we're using to debug/visualize.
|
||||
// const faceAndNeigbours: FaceNeighbours[] = [];
|
||||
// const topFaces = faces.sort((a, b) => b.score - a.score).slice(0, 30);
|
||||
// for (const fi of topFaces) {
|
||||
// let neighbours: FaceNeighbour[] = [];
|
||||
// for (const fj of faces) {
|
||||
// // The vectors are already normalized, so we can directly use their
|
||||
// // dot product as their cosine similarity.
|
||||
// const csim = dotProduct(fi.embedding, fj.embedding);
|
||||
// neighbours.push({ face: fj, cosineSimilarity: csim });
|
||||
// }
|
||||
|
||||
// neighbours = neighbours
|
||||
// .sort((a, b) => b.cosineSimilarity - a.cosineSimilarity)
|
||||
// .slice(0, 30);
|
||||
|
||||
// faceAndNeigbours.push({ face: fi, neighbours });
|
||||
// }
|
||||
|
||||
// Convert into the data structure we're using to debug/visualize.
|
||||
//
|
||||
// > Showing only top 30 and bottom 30 clusters (and only up to 50 faces in
|
||||
// > each, sorted by cosine distance to highest scoring face in the
|
||||
// > cluster).
|
||||
|
||||
const sortedClusters = clusters.sort(
|
||||
(a, b) => b.faceIDs.length - a.faceIDs.length,
|
||||
);
|
||||
const debugClusters =
|
||||
|
||||
// Convert into the data structure we're using to debug/visualize.
|
||||
const clusterPreviewClusters =
|
||||
sortedClusters.length < 60
|
||||
? sortedClusters
|
||||
: sortedClusters.slice(0, 30).concat(sortedClusters.slice(-30));
|
||||
const clusterPreviews: ClusterPreview[] = [];
|
||||
for (const cluster of debugClusters) {
|
||||
const clusterPreviews = clusterPreviewClusters.map((cluster) => {
|
||||
const faces = cluster.faceIDs.map((id) =>
|
||||
ensure(faceForFaceID.get(id)),
|
||||
);
|
||||
const topFace = faces.reduce((max, face) =>
|
||||
max.score > face.score ? max : face,
|
||||
const topFace = faces.reduce((top, face) =>
|
||||
top.score > face.score ? top : face,
|
||||
);
|
||||
const previewFaces: ClusterPreviewFace[] = [];
|
||||
for (const face of faces) {
|
||||
const previewFaces: ClusterPreviewFace[] = faces.map((face) => {
|
||||
const csim = dotProduct(topFace.embedding, face.embedding);
|
||||
previewFaces.push({ face, cosineSimilarity: csim });
|
||||
}
|
||||
clusterPreviews.push({
|
||||
const wasMerged = wasMergedFaceIDs.has(face.faceID);
|
||||
return { face, cosineSimilarity: csim, wasMerged };
|
||||
});
|
||||
return {
|
||||
clusterSize: cluster.faceIDs.length,
|
||||
faces: previewFaces
|
||||
.sort((a, b) => b.cosineSimilarity - a.cosineSimilarity)
|
||||
.slice(0, 50),
|
||||
});
|
||||
}
|
||||
|
||||
// Prune too small clusters.
|
||||
// TODO-Cluster this is likely not needed since hdbscan already has a min?
|
||||
const validClusters = clusters.filter(({ faceIDs }) => faceIDs.length > 1);
|
||||
|
||||
// let cgroups = await clusterGroups();
|
||||
|
||||
// // TODO-Cluster - Currently we're not syncing with remote or saving anything
|
||||
// // locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
|
||||
// // cgroup, one per cluster.
|
||||
// cgroups = cgroups.concat(
|
||||
// validClusters.map((c) => ({
|
||||
// id: c.id,
|
||||
// name: undefined,
|
||||
// clusterIDs: [c.id],
|
||||
// isHidden: false,
|
||||
// avatarFaceID: undefined,
|
||||
// displayFaceID: undefined,
|
||||
// })),
|
||||
// );
|
||||
|
||||
// // For each cluster group, use the highest scoring face in any of its
|
||||
// // clusters as its display face.
|
||||
// for (const cgroup of cgroups) {
|
||||
// cgroup.displayFaceID = cgroup.clusterIDs
|
||||
// .map((clusterID) => clusterIndexForClusterID.get(clusterID))
|
||||
// .filter((i) => i !== undefined) /* 0 is a valid index */
|
||||
// .flatMap((i) => clusters[i]?.faceIDs ?? [])
|
||||
// .map((faceID) => faceForFaceID.get(faceID))
|
||||
// .filter((face) => !!face)
|
||||
// .reduce((max, face) =>
|
||||
// max.score > face.score ? max : face,
|
||||
// ).faceID;
|
||||
// }
|
||||
};
|
||||
});
|
||||
|
||||
// TODO-Cluster - Currently we're not syncing with remote or saving anything
|
||||
// locally, so cgroups will be empty. Create a temporary (unsaved, unsynced)
|
||||
@@ -390,8 +319,8 @@ export const clusterFaces = (
|
||||
const faces = cluster.faceIDs.map((id) =>
|
||||
ensure(faceForFaceID.get(id)),
|
||||
);
|
||||
const topFace = faces.reduce((max, face) =>
|
||||
max.score > face.score ? max : face,
|
||||
const topFace = faces.reduce((top, face) =>
|
||||
top.score > face.score ? top : face,
|
||||
);
|
||||
cgroups.push({
|
||||
id: cluster.id,
|
||||
@@ -403,28 +332,22 @@ export const clusterFaces = (
|
||||
});
|
||||
}
|
||||
|
||||
// log.info("ml/cluster", {
|
||||
// faces,
|
||||
// validClusters,
|
||||
// clusterIndexForClusterID: Object.fromEntries(clusterIndexForClusterID),
|
||||
// clusterIDForFaceID: Object.fromEntries(clusterIDForFaceID),
|
||||
// cgroups,
|
||||
// });
|
||||
const timeTakenMs = Date.now() - t;
|
||||
log.info(
|
||||
`Clustered ${faces.length} faces into ${validClusters.length} clusters, with ${faces.length - clusterIDForFaceID.size} faces remaining unclustered (${Date.now() - t} ms)`,
|
||||
`Clustered ${faces.length} faces into ${clusters.length} clusters, with ${faces.length - clusterIDForFaceID.size} faces remaining unclustered (${timeTakenMs} ms)`,
|
||||
);
|
||||
|
||||
const clusteredCount = clusterIDForFaceID.size;
|
||||
const unclusteredCount = faces.length - clusteredCount;
|
||||
const clusteredFaceCount = clusterIDForFaceID.size;
|
||||
const unclusteredFaceCount = faces.length - clusteredFaceCount;
|
||||
|
||||
return {
|
||||
// faces,
|
||||
clusteredCount,
|
||||
unclusteredCount,
|
||||
clusters: validClusters,
|
||||
cgroups,
|
||||
clusteredFaceCount,
|
||||
unclusteredFaceCount,
|
||||
clusterPreviews,
|
||||
clusterIDForFaceID,
|
||||
clusters: sortedClusters,
|
||||
cgroups,
|
||||
unclusteredFaces: [],
|
||||
timeTakenMs,
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
@@ -365,7 +365,8 @@ export interface ClusterDebugPageContents {
|
||||
unclusteredFacesWithFile: {
|
||||
face: Face;
|
||||
enteFile: EnteFile;
|
||||
};
|
||||
}[];
|
||||
timeTakenMs: number;
|
||||
}
|
||||
|
||||
export const wipClusterDebugPageContents = async (
|
||||
@@ -378,7 +379,6 @@ export const wipClusterDebugPageContents = async (
|
||||
_wip_searchPersons = undefined;
|
||||
triggerStatusUpdate();
|
||||
|
||||
// const { faceAndNeigbours, clusters, cgroups } = await clusterFaces(
|
||||
const {
|
||||
clusteredFaceCount,
|
||||
unclusteredFaceCount,
|
||||
@@ -386,6 +386,7 @@ export const wipClusterDebugPageContents = async (
|
||||
clusters,
|
||||
cgroups,
|
||||
unclusteredFaces,
|
||||
timeTakenMs,
|
||||
} = await worker().then((w) => w.clusterFaces(opts));
|
||||
|
||||
const localFiles = await getAllLocalFiles();
|
||||
@@ -396,10 +397,10 @@ export const wipClusterDebugPageContents = async (
|
||||
const clusterPreviewsWithFile = clusterPreviews.map(
|
||||
({ clusterSize, faces }) => ({
|
||||
clusterSize,
|
||||
faces: faces.map(({ face, cosineSimilarity }) => ({
|
||||
faces: faces.map(({ face, ...rest }) => ({
|
||||
face,
|
||||
enteFile: fileForFace(face),
|
||||
cosineSimilarity,
|
||||
...rest,
|
||||
})),
|
||||
}),
|
||||
);
|
||||
@@ -445,6 +446,7 @@ export const wipClusterDebugPageContents = async (
|
||||
clusters,
|
||||
clusterPreviewsWithFile,
|
||||
unclusteredFacesWithFile,
|
||||
timeTakenMs,
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ import {
|
||||
indexCLIP,
|
||||
type CLIPIndex,
|
||||
} from "./clip";
|
||||
import { type ClusteringOpts } from "./cluster";
|
||||
import { clusterFaces, type ClusteringOpts } from "./cluster";
|
||||
import { saveFaceCrops } from "./crop";
|
||||
import {
|
||||
faceIndexes,
|
||||
@@ -277,7 +277,7 @@ export class MLWorker {
|
||||
|
||||
// TODO-Cluster
|
||||
async clusterFaces(opts: ClusteringOpts) {
|
||||
return clusterFace(await faceIndexes(), opts);
|
||||
return clusterFaces(await faceIndexes(), opts);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user