Prep
This commit is contained in:
@@ -49,14 +49,22 @@ export default function ClusterDebug() {
|
||||
}
|
||||
return (
|
||||
<>
|
||||
<Typography variant="small">
|
||||
{`${clusterRes.clusters.length} clusters from ${clusterRes.clusteredCount} faces. ${clusterRes.unclusteredCount} unclustered faces.`}
|
||||
</Typography>
|
||||
<Typography variant="small" color="text.muted">
|
||||
Showing only top 20 and bottom 10 clusters (and only up to 50
|
||||
faces in each, sorted by cosine distance to highest scoring face
|
||||
in the cluster).
|
||||
</Typography>
|
||||
<Stack m={1}>
|
||||
<Typography variant="small" mb={1}>
|
||||
{`${clusterRes.clusters.length} clusters from ${clusterRes.clusteredCount} faces. ${clusterRes.unclusteredCount} unclustered faces.`}
|
||||
</Typography>
|
||||
<Typography variant="small" color="text.muted">
|
||||
Showing only top 30 and bottom 30 clusters.
|
||||
</Typography>
|
||||
<Typography variant="small" color="text.muted">
|
||||
For each cluster showing only up to 50 faces, sorted by
|
||||
cosine similarity to highest scoring face in the cluster.
|
||||
</Typography>
|
||||
<Typography variant="small" color="text.muted">
|
||||
Below each face is its{" "}
|
||||
<b>blur - score - cosineSimilarity - direction</b>
|
||||
</Typography>
|
||||
</Stack>
|
||||
<hr />
|
||||
<Container>
|
||||
<AutoSizer>
|
||||
|
||||
@@ -8,7 +8,6 @@ import {
|
||||
enableML,
|
||||
mlStatusSnapshot,
|
||||
mlStatusSubscribe,
|
||||
wipCluster,
|
||||
wipClusterEnable,
|
||||
type MLStatus,
|
||||
} from "@/new/photos/services/ml";
|
||||
@@ -341,7 +340,7 @@ const ManageML: React.FC<ManageMLProps> = ({
|
||||
|
||||
// TODO-Cluster
|
||||
const router = useRouter();
|
||||
const wipClusterNow = () => wipCluster();
|
||||
// const wipClusterNow = () => wipCluster();
|
||||
const wipClusterShowNow = () => router.push("/cluster-debug");
|
||||
|
||||
return (
|
||||
@@ -391,18 +390,20 @@ const ManageML: React.FC<ManageMLProps> = ({
|
||||
<Box>
|
||||
<MenuItemGroup>
|
||||
<EnteMenuItem
|
||||
label={ut("Create clusters • internal only option")}
|
||||
onClick={wipClusterNow}
|
||||
label={ut(
|
||||
"Create clusters • internal only option",
|
||||
)}
|
||||
onClick={wipClusterShowNow}
|
||||
/>
|
||||
</MenuItemGroup>
|
||||
<MenuSectionTitle
|
||||
title={ut(
|
||||
"Create in-memory clusters from arbitrary 2k photos. Nothing will be saved or synced to remote. You can view the results in search dropdown.",
|
||||
"Create and show in-memory clusters. Takes ~ 1 min. Nothing will be saved or synced to remote. You can also view all clusters in the search dropdown later.",
|
||||
)}
|
||||
/>
|
||||
</Box>
|
||||
)}
|
||||
{showClusterOpt && (
|
||||
{/* {showClusterOpt && (
|
||||
<Box>
|
||||
<MenuItemGroup>
|
||||
<EnteMenuItem
|
||||
@@ -416,7 +417,7 @@ const ManageML: React.FC<ManageMLProps> = ({
|
||||
)}
|
||||
/>
|
||||
</Box>
|
||||
)}
|
||||
)} */}
|
||||
</Stack>
|
||||
);
|
||||
};
|
||||
|
||||
@@ -348,14 +348,13 @@ function* enumerateFaces(faceIndices: FaceIndex[]) {
|
||||
}
|
||||
}
|
||||
|
||||
export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => {
|
||||
export const clusterFacesHdb = (faceIndexes: FaceIndex[]) => {
|
||||
const t = Date.now();
|
||||
|
||||
// A flattened array of faces.
|
||||
// TODO-Cluster ad-hoc filtering and slicing
|
||||
const faces0 = [...enumerateFaces(faceIndexes)]
|
||||
.filter((f) => f.blur > 99)
|
||||
.slice(0, 6000);
|
||||
const faces0 = [...enumerateFaces(faceIndexes)].filter((f) => f.blur > 99);
|
||||
// .slice(0, 6000);
|
||||
// TODO-Cluster testing code, can be removed once done
|
||||
const faces = Array(1)
|
||||
.fill(0)
|
||||
@@ -386,27 +385,28 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => {
|
||||
// be list of existing clusters we fetch from remote.
|
||||
const clusters: FaceCluster[] = [];
|
||||
|
||||
// Process the faces in batches of 10k. The faces are already sorted by file
|
||||
// ID, which is a monotonically increasing integer, so we will also have
|
||||
// some temporal locality.
|
||||
// Process the faces in batches. The faces are already sorted by file ID,
|
||||
// which is a monotonically increasing integer, so we will also have some
|
||||
// temporal locality.
|
||||
//
|
||||
// The number 10k was derived by ad-hoc observations. On a particular test
|
||||
// dataset, clustering 10k took ~2 mins, while 20k took ~8 mins. Memory
|
||||
// usage was constant in both cases.
|
||||
// The number 2500 was derived by ad-hoc observations and takes a few
|
||||
// seconds. On a particular test dataset and a particular machine,
|
||||
// clustering 1k took ~2 seconds, 10k took ~2 mins, while 20k took ~8 mins.
|
||||
// Memory usage was constant in all these cases.
|
||||
//
|
||||
// At around 100k faces, the clustering starts taking hours, and we start
|
||||
// running into stack overflows. The stack overflows can perhaps be avoided
|
||||
// by restructuring the code, but hours of uninterruptible work is anyways
|
||||
// not feasible.
|
||||
|
||||
// const batchSize = 10_000; // TODO-Cluster
|
||||
const batchSize = 1_000;
|
||||
const batchSize = 2500;
|
||||
for (let i = 0; i < faceEmbeddings.length; i += batchSize) {
|
||||
const it = Date.now();
|
||||
const embeddings = faceEmbeddings.slice(i, i + batchSize);
|
||||
const { clusters: hdbClusters } = clusterFacesHdbscan(embeddings);
|
||||
|
||||
log.info(
|
||||
`hdbscan produced ${hdbClusters.length} clusters from ${embeddings.length} faces (${Date.now() - t} ms)`,
|
||||
`hdbscan produced ${hdbClusters.length} clusters from ${embeddings.length} faces (${Date.now() - it} ms)`,
|
||||
);
|
||||
|
||||
// Merge the new clusters we got from hdbscan into the existing clusters
|
||||
@@ -489,7 +489,7 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => {
|
||||
|
||||
// Convert into the data structure we're using to debug/visualize.
|
||||
//
|
||||
// > Showing only top 20 and bottom 10 clusters (and only up to 50 faces in
|
||||
// > Showing only top 30 and bottom 30 clusters (and only up to 50 faces in
|
||||
// > each, sorted by cosine similarity to highest scoring face in the
|
||||
// > cluster).
|
||||
|
||||
@@ -497,9 +497,9 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => {
|
||||
(a, b) => b.faceIDs.length - a.faceIDs.length,
|
||||
);
|
||||
const debugClusters =
|
||||
sortedClusters.length < 30
|
||||
sortedClusters.length < 60
|
||||
? sortedClusters
|
||||
: sortedClusters.slice(0, 20).concat(sortedClusters.slice(-10));
|
||||
: sortedClusters.slice(0, 30).concat(sortedClusters.slice(-30));
|
||||
const clusterPreviews: ClusterPreview[] = [];
|
||||
for (const cluster of debugClusters) {
|
||||
const faces = cluster.faceIDs.map((id) =>
|
||||
|
||||
Reference in New Issue
Block a user