This commit is contained in:
Manav Rathi
2024-08-29 20:33:18 +05:30
parent d6c7ab0735
commit d5a1187e13
3 changed files with 40 additions and 31 deletions

View File

@@ -49,14 +49,22 @@ export default function ClusterDebug() {
}
return (
<>
<Typography variant="small">
{`${clusterRes.clusters.length} clusters from ${clusterRes.clusteredCount} faces. ${clusterRes.unclusteredCount} unclustered faces.`}
</Typography>
<Typography variant="small" color="text.muted">
Showing only top 20 and bottom 10 clusters (and only up to 50
faces in each, sorted by cosine distance to highest scoring face
in the cluster).
</Typography>
<Stack m={1}>
<Typography variant="small" mb={1}>
{`${clusterRes.clusters.length} clusters from ${clusterRes.clusteredCount} faces. ${clusterRes.unclusteredCount} unclustered faces.`}
</Typography>
<Typography variant="small" color="text.muted">
Showing only top 30 and bottom 30 clusters.
</Typography>
<Typography variant="small" color="text.muted">
For each cluster showing only up to 50 faces, sorted by
cosine similarity to highest scoring face in the cluster.
</Typography>
<Typography variant="small" color="text.muted">
Below each face is its{" "}
<b>blur - score - cosineSimilarity - direction</b>
</Typography>
</Stack>
<hr />
<Container>
<AutoSizer>

View File

@@ -8,7 +8,6 @@ import {
enableML,
mlStatusSnapshot,
mlStatusSubscribe,
wipCluster,
wipClusterEnable,
type MLStatus,
} from "@/new/photos/services/ml";
@@ -341,7 +340,7 @@ const ManageML: React.FC<ManageMLProps> = ({
// TODO-Cluster
const router = useRouter();
const wipClusterNow = () => wipCluster();
// const wipClusterNow = () => wipCluster();
const wipClusterShowNow = () => router.push("/cluster-debug");
return (
@@ -391,18 +390,20 @@ const ManageML: React.FC<ManageMLProps> = ({
<Box>
<MenuItemGroup>
<EnteMenuItem
label={ut("Create clusters • internal only option")}
onClick={wipClusterNow}
label={ut(
"Create clusters • internal only option",
)}
onClick={wipClusterShowNow}
/>
</MenuItemGroup>
<MenuSectionTitle
title={ut(
"Create in-memory clusters from arbitrary 2k photos. Nothing will be saved or synced to remote. You can view the results in search dropdown.",
"Create and show in-memory clusters. Takes ~ 1 min. Nothing will be saved or synced to remote. You can also view all clusters in the search dropdown later.",
)}
/>
</Box>
)}
{showClusterOpt && (
{/* {showClusterOpt && (
<Box>
<MenuItemGroup>
<EnteMenuItem
@@ -416,7 +417,7 @@ const ManageML: React.FC<ManageMLProps> = ({
)}
/>
</Box>
)}
)} */}
</Stack>
);
};

View File

@@ -348,14 +348,13 @@ function* enumerateFaces(faceIndices: FaceIndex[]) {
}
}
export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => {
export const clusterFacesHdb = (faceIndexes: FaceIndex[]) => {
const t = Date.now();
// A flattened array of faces.
// TODO-Cluster ad-hoc filtering and slicing
const faces0 = [...enumerateFaces(faceIndexes)]
.filter((f) => f.blur > 99)
.slice(0, 6000);
const faces0 = [...enumerateFaces(faceIndexes)].filter((f) => f.blur > 99);
// .slice(0, 6000);
// TODO-Cluster testing code, can be removed once done
const faces = Array(1)
.fill(0)
@@ -386,27 +385,28 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => {
// be list of existing clusters we fetch from remote.
const clusters: FaceCluster[] = [];
// Process the faces in batches of 10k. The faces are already sorted by file
// ID, which is a monotonically increasing integer, so we will also have
// some temporal locality.
// Process the faces in batches. The faces are already sorted by file ID,
// which is a monotonically increasing integer, so we will also have some
// temporal locality.
//
// The number 10k was derived by ad-hoc observations. On a particular test
// dataset, clustering 10k took ~2 mins, while 20k took ~8 mins. Memory
// usage was constant in both cases.
// The number 2500 was derived by ad-hoc observations and takes a few
// seconds. On a particular test dataset and a particular machine,
// clustering 1k took ~2 seconds, 10k took ~2 mins, while 20k took ~8 mins.
// Memory usage was constant in all these cases.
//
// At around 100k faces, the clustering starts taking hours, and we start
// running into stack overflows. The stack overflows can perhaps be avoided
// by restructuring the code, but hours of uninterruptible work is anyways
// not feasible.
// const batchSize = 10_000; // TODO-Cluster
const batchSize = 1_000;
const batchSize = 2500;
for (let i = 0; i < faceEmbeddings.length; i += batchSize) {
const it = Date.now();
const embeddings = faceEmbeddings.slice(i, i + batchSize);
const { clusters: hdbClusters } = clusterFacesHdbscan(embeddings);
log.info(
`hdbscan produced ${hdbClusters.length} clusters from ${embeddings.length} faces (${Date.now() - t} ms)`,
`hdbscan produced ${hdbClusters.length} clusters from ${embeddings.length} faces (${Date.now() - it} ms)`,
);
// Merge the new clusters we got from hdbscan into the existing clusters
@@ -489,7 +489,7 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => {
// Convert into the data structure we're using to debug/visualize.
//
// > Showing only top 20 and bottom 10 clusters (and only up to 50 faces in
// > Showing only top 30 and bottom 30 clusters (and only up to 50 faces in
// > each, sorted by cosine distance to highest scoring face in the
// > cluster).
@@ -497,9 +497,9 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => {
(a, b) => b.faceIDs.length - a.faceIDs.length,
);
const debugClusters =
sortedClusters.length < 30
sortedClusters.length < 60
? sortedClusters
: sortedClusters.slice(0, 20).concat(sortedClusters.slice(-10));
: sortedClusters.slice(0, 30).concat(sortedClusters.slice(-30));
const clusterPreviews: ClusterPreview[] = [];
for (const cluster of debugClusters) {
const faces = cluster.faceIDs.map((id) =>