From 67ea0cfe734f9128411cfc1f5e3dec303b09650a Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 29 Aug 2024 12:22:22 +0530 Subject: [PATCH 01/11] Debugging code --- .../new/photos/services/ml/cluster-new.ts | 26 ++++++++++--------- .../new/photos/services/ml/cluster.ts | 2 +- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/web/packages/new/photos/services/ml/cluster-new.ts b/web/packages/new/photos/services/ml/cluster-new.ts index 9e07b2812c..94e5efe11e 100644 --- a/web/packages/new/photos/services/ml/cluster-new.ts +++ b/web/packages/new/photos/services/ml/cluster-new.ts @@ -343,17 +343,18 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => { // A flattened array of faces. // TODO-Cluster note the 2k slice - const faces = [...enumerateFaces(faceIndexes)].slice(0, 2000); + const faces0 = [...enumerateFaces(faceIndexes)];//.slice(0, 2000); + const faces = Array(1).fill(0).flatMap(() => faces0); const faceEmbeddings = faces.map(({ embedding }) => embedding); const { clusters: clusterIndices, - noise, - debugInfo, + // noise, + // debugInfo, } = clusterFacesHdbscan(faceEmbeddings); - log.info({ method: "hdbscan", clusterIndices, noise, debugInfo }); + // log.info({ method: "hdbscan", clusterIndices, noise, debugInfo }); log.info( `Clustered ${faces.length} faces into ${clusterIndices.length} clusters (${Date.now() - t} ms)`, ); @@ -387,7 +388,8 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => { // Convert into the data structure we're using to debug/visualize. const faceAndNeigbours: FaceNeighbours[] = []; - for (const fi of faces) { + const topFaces = faces.sort((a, b) => b.score - a.score).slice(0, 30); + for (const fi of topFaces) { let neighbours: FaceNeighbour[] = []; for (const fj of faces) { // The vectors are already normalized, so we can directly use their @@ -437,13 +439,13 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => { ).faceID; } - log.info("ml/cluster", { - faces, - validClusters, - clusterIndexForClusterID: Object.fromEntries(clusterIndexForClusterID), - clusterIDForFaceID: Object.fromEntries(clusterIDForFaceID), - cgroups, - }); + // log.info("ml/cluster", { + // faces, + // validClusters, + // clusterIndexForClusterID: Object.fromEntries(clusterIndexForClusterID), + // clusterIDForFaceID: Object.fromEntries(clusterIDForFaceID), + // cgroups, + // }); log.info( `Clustered ${faces.length} faces into ${validClusters.length} clusters (${Date.now() - t} ms)`, ); diff --git a/web/packages/new/photos/services/ml/cluster.ts b/web/packages/new/photos/services/ml/cluster.ts index ff62f466a9..53e4930d94 100644 --- a/web/packages/new/photos/services/ml/cluster.ts +++ b/web/packages/new/photos/services/ml/cluster.ts @@ -24,7 +24,7 @@ export const clusterFacesHdbscan = ( minSamples: 5, clusterSelectionEpsilon: 0.6, clusterSelectionMethod: "leaf", - debug: true, + debug: false, }); return { From cd69e00451496ca4311f7a551a7e6ce2852bfe41 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 29 Aug 2024 18:27:44 +0530 Subject: [PATCH 02/11] Batch --- .../new/photos/services/ml/cluster-new.ts | 132 +++++++++++++----- 1 file changed, 100 insertions(+), 32 deletions(-) diff --git a/web/packages/new/photos/services/ml/cluster-new.ts b/web/packages/new/photos/services/ml/cluster-new.ts index 94e5efe11e..e934b54676 100644 --- a/web/packages/new/photos/services/ml/cluster-new.ts +++ b/web/packages/new/photos/services/ml/cluster-new.ts @@ -342,48 +342,116 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => { const t = Date.now(); // A flattened array of faces. - // TODO-Cluster note the 2k slice - const faces0 = [...enumerateFaces(faceIndexes)];//.slice(0, 2000); - const faces = Array(1).fill(0).flatMap(() => faces0); + const faces0 = [...enumerateFaces(faceIndexes)]; + // TODO-Cluster testing code, can be removed once done + const faces = Array(1) + .fill(0) + .flatMap(() => faces0); + + // For fast reverse lookup - map from face ids to the face. + const faceForFaceID = new Map(faces.map((f) => [f.faceID, f])); const faceEmbeddings = faces.map(({ embedding }) => embedding); - const { - clusters: clusterIndices, - // noise, - // debugInfo, - } = clusterFacesHdbscan(faceEmbeddings); - - // log.info({ method: "hdbscan", clusterIndices, noise, debugInfo }); - log.info( - `Clustered ${faces.length} faces into ${clusterIndices.length} clusters (${Date.now() - t} ms)`, - ); - // For fast reverse lookup - map from cluster ids to their index in the // clusters array. const clusterIndexForClusterID = new Map(); - // For fast reverse lookup - map from face ids to the id of the cluster to - // which they belong. + // For fast reverse lookup - map from the id of a face to the id of the + // cluster to which it belongs. const clusterIDForFaceID = new Map(); + // A function to chain two reverse lookup. + const firstFaceOfCluster = (cluster: FaceCluster) => + ensure(faceForFaceID.get(ensure(cluster.faceIDs[0]))); + // A function to generate new cluster IDs. const newClusterID = () => newNonSecureID("cluster_"); - // Convert the numerical face indices into the result. + // The resultant clusters. + // TODO-Cluster Later on, instead of starting from a blank slate, this will + // be list of existing clusters we fetch from remote. const clusters: FaceCluster[] = []; - for (const [ci, faceIndices] of clusterIndices.entries()) { - const clusterID = newClusterID(); - const faceIDs: string[] = []; - clusterIndexForClusterID.set(clusterID, ci); - for (const fi of faceIndices) { - // Can't find a way of avoiding the null assertion here. - // eslint-disable-next-line @typescript-eslint/no-non-null-assertion - const face = faces[fi]!; - clusterIDForFaceID.set(face.faceID, clusterID); - faceIDs.push(face.faceID); + + // Process the faces in batches of 10k. The faces are already sorted by file + // ID, which is a monotonically increasing integer, so we will also have + // some temporal locality. + // + // The number 10k was derived by ad-hoc observations. On a particular test + // dataset, clustering 10k took ~2 mins, while 20k took ~8 mins. Memory + // usage was constant in both cases. + // + // At around 100k faces, the clustering starts taking hours, and we start + // running into stack overflows. The stack overflows can perhaps be avoided + // by restructuring the code, but hours of uninterruptible work is anyways + // not feasible. + + const batchSize = 10_000; + for (let i = 0; i < faceEmbeddings.length; i += batchSize) { + const embeddings = faceEmbeddings.slice(i, i + batchSize); + const { clusters: hdbClusters } = clusterFacesHdbscan(embeddings); + + log.info( + `hdbscan produced ${hdbClusters.length} clusters from ${embeddings.length} faces (${Date.now() - t} ms)`, + ); + + // Merge the new clusters we got from hdbscan into the existing clusters + // if they are "near" them (using some heuristic). + // + // We need to ensure we don't change any of the existing cluster IDs, + // since these might be existing clusters we got from remote. + + for (const hdbCluster of hdbClusters) { + // Find the existing cluster whose (arbitrarily chosen) first face + // is the nearest neighbour of the (arbitrarily chosen) first face + // of the cluster produced by hdbscan. + + const newFace = ensure(faces[i + ensure(hdbCluster[0])]); + + let nnCluster: FaceCluster | undefined; + let nnCosineSimilarity = 0; + for (const existingCluster of clusters) { + const existingFace = firstFaceOfCluster(existingCluster); + + // The vectors are already normalized, so we can directly use their + // dot product as their cosine similarity. + const csim = dotProduct( + existingFace.embedding, + newFace.embedding, + ); + + // Use a higher cosine similarity threshold if either of the two + // faces are blurry. + const threshold = + existingFace.blur < 100 || newFace.blur < 100 ? 0.84 : 0.7; + if (csim > threshold && csim > nnCosineSimilarity) { + nnCluster = existingCluster; + nnCosineSimilarity = csim; + } + } + + if (nnCluster) { + // If we found an existing cluster that is near enough, + // sublimate the cluster produced by hdbscan into that cluster. + for (const j of hdbCluster) { + const { faceID } = ensure(faces[i + j]); + nnCluster.faceIDs.push(faceID); + clusterIDForFaceID.set(faceID, nnCluster.id); + } + } else { + // Otherwise make a new cluster from the cluster produced by + // hdbscan. + const clusterID = newClusterID(); + const faceIDs: string[] = []; + for (const j of hdbCluster) { + const { faceID } = ensure(faces[i + j]); + faceIDs.push(faceID); + clusterIDForFaceID.set(faceID, clusterID); + } + clusterIndexForClusterID.set(clusterID, clusters.length); + clusters.push({ id: clusterID, faceIDs }); + } } - clusters.push({ id: clusterID, faceIDs }); } // Convert into the data structure we're using to debug/visualize. @@ -398,14 +466,15 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => { neighbours.push({ face: fj, cosineSimilarity: csim }); } - neighbours = neighbours.sort( - (a, b) => b.cosineSimilarity - a.cosineSimilarity, - ); + neighbours = neighbours + .sort((a, b) => b.cosineSimilarity - a.cosineSimilarity) + .slice(0, 30); faceAndNeigbours.push({ face: fi, neighbours }); } // Prune too small clusters. + // TODO-Cluster this is likely not needed since hdbscan already has a min? const validClusters = clusters.filter(({ faceIDs }) => faceIDs.length > 1); let cgroups = await clusterGroups(); @@ -426,7 +495,6 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => { // For each cluster group, use the highest scoring face in any of its // clusters as its display face. - const faceForFaceID = new Map(faces.map((f) => [f.faceID, f])); for (const cgroup of cgroups) { cgroup.displayFaceID = cgroup.clusterIDs .map((clusterID) => clusterIndexForClusterID.get(clusterID)) From 89a5a9f42f09be8a348676b04f55099d46b87630 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 29 Aug 2024 18:42:17 +0530 Subject: [PATCH 03/11] Prune --- web/packages/new/photos/services/ml/index.ts | 23 +++++++------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/web/packages/new/photos/services/ml/index.ts b/web/packages/new/photos/services/ml/index.ts index 43d90578b8..53940232f6 100644 --- a/web/packages/new/photos/services/ml/index.ts +++ b/web/packages/new/photos/services/ml/index.ts @@ -387,16 +387,14 @@ export const wipClusterDebugPageContents = async (): Promise< const fileForFace = ({ faceID }: Face) => ensure(localFileByID.get(ensure(fileIDFromFaceID(faceID)))); - const faceFNs = faceAndNeigbours - .map(({ face, neighbours }) => ({ + const faceFNs = faceAndNeigbours.map(({ face, neighbours }) => ({ + face, + neighbours: neighbours.map(({ face, cosineSimilarity }) => ({ face, - neighbours: neighbours.map(({ face, cosineSimilarity }) => ({ - face, - enteFile: fileForFace(face), - cosineSimilarity, - })), - })) - .sort((a, b) => b.face.score - a.face.score); + enteFile: fileForFace(face), + cosineSimilarity, + })), + })); const clusterIDForFaceID = new Map( clusters.flatMap((cluster) => @@ -408,12 +406,7 @@ export const wipClusterDebugPageContents = async (): Promise< _wip_searchPersons = searchPersons; triggerStatusUpdate(); - const prunedFaceFNs = faceFNs.slice(0, 30).map(({ face, neighbours }) => ({ - face, - neighbours: neighbours.slice(0, 30), - })); - - return { faceFNs: prunedFaceFNs, clusters, clusterIDForFaceID }; + return { faceFNs, clusters, clusterIDForFaceID }; }; export const wipCluster = () => void wipClusterDebugPageContents(); From 2179b193d21af7a7f7218e889a6f16b37736245b Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 29 Aug 2024 19:07:54 +0530 Subject: [PATCH 04/11] Preview --- web/apps/photos/src/pages/cluster-debug.tsx | 4 +- .../new/photos/services/ml/cluster-new.ts | 76 +++++++++++++++---- web/packages/new/photos/services/ml/index.ts | 32 ++++++-- 3 files changed, 89 insertions(+), 23 deletions(-) diff --git a/web/apps/photos/src/pages/cluster-debug.tsx b/web/apps/photos/src/pages/cluster-debug.tsx index c6abe7226f..dcffaecfd2 100644 --- a/web/apps/photos/src/pages/cluster-debug.tsx +++ b/web/apps/photos/src/pages/cluster-debug.tsx @@ -53,8 +53,8 @@ export default function ClusterDebug() { {`${clusterRes.clusters.length} clusters`} - Showing only upto first 30 faces (and only upto 30 nearest - neighbours of each). + Showing only top 20 and bottom 10 clusters (and only up to 50 faces in + each, sorted by cosine distance to highest scoring face in the cluster).
diff --git a/web/packages/new/photos/services/ml/cluster-new.ts b/web/packages/new/photos/services/ml/cluster-new.ts index e934b54676..983c128a6a 100644 --- a/web/packages/new/photos/services/ml/cluster-new.ts +++ b/web/packages/new/photos/services/ml/cluster-new.ts @@ -124,6 +124,16 @@ interface FaceNeighbour { cosineSimilarity: number; } +export interface ClusterPreview { + clusterSize: number; + faces: ClusterPreviewFace[]; +} + +interface ClusterPreviewFace { + face: Face; + cosineSimilarity: number; +} + /** * Cluster faces into groups. * @@ -455,22 +465,56 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => { } // Convert into the data structure we're using to debug/visualize. - const faceAndNeigbours: FaceNeighbours[] = []; - const topFaces = faces.sort((a, b) => b.score - a.score).slice(0, 30); - for (const fi of topFaces) { - let neighbours: FaceNeighbour[] = []; - for (const fj of faces) { - // The vectors are already normalized, so we can directly use their - // dot product as their cosine similarity. - const csim = dotProduct(fi.embedding, fj.embedding); - neighbours.push({ face: fj, cosineSimilarity: csim }); + // const faceAndNeigbours: FaceNeighbours[] = []; + // const topFaces = faces.sort((a, b) => b.score - a.score).slice(0, 30); + // for (const fi of topFaces) { + // let neighbours: FaceNeighbour[] = []; + // for (const fj of faces) { + // // The vectors are already normalized, so we can directly use their + // // dot product as their cosine similarity. + // const csim = dotProduct(fi.embedding, fj.embedding); + // neighbours.push({ face: fj, cosineSimilarity: csim }); + // } + + // neighbours = neighbours + // .sort((a, b) => b.cosineSimilarity - a.cosineSimilarity) + // .slice(0, 30); + + // faceAndNeigbours.push({ face: fi, neighbours }); + // } + + // Convert into the data structure we're using to debug/visualize. + // + // > Showing only top 20 and bottom 10 clusters (and only up to 50 faces in + // > each, sorted by cosine distance to highest scoring face in the + // > cluster). + + const sortedClusters = clusters.sort( + (a, b) => b.faceIDs.length - a.faceIDs.length, + ); + const debugClusters = + sortedClusters.length < 30 + ? sortedClusters + : sortedClusters.slice(0, 20).concat(sortedClusters.slice(-10)); + const clusterPreviews: ClusterPreview[] = []; + for (const cluster of debugClusters) { + const faces = cluster.faceIDs.map((id) => + ensure(faceForFaceID.get(id)), + ); + const topFace = faces.reduce((max, face) => + max.score > face.score ? max : face, + ); + const previewFaces: ClusterPreviewFace[] = []; + for (const face of faces) { + const csim = dotProduct(topFace.embedding, face.embedding); + previewFaces.push({ face, cosineSimilarity: csim }); } - - neighbours = neighbours - .sort((a, b) => b.cosineSimilarity - a.cosineSimilarity) - .slice(0, 30); - - faceAndNeigbours.push({ face: fi, neighbours }); + clusterPreviews.push({ + clusterSize: cluster.faceIDs.length, + faces: previewFaces + .sort((a, b) => b.cosineSimilarity - a.cosineSimilarity) + .slice(0, 50), + }); } // Prune too small clusters. @@ -518,5 +562,5 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => { `Clustered ${faces.length} faces into ${validClusters.length} clusters (${Date.now() - t} ms)`, ); - return { faces, clusters: validClusters, cgroups, faceAndNeigbours }; + return { faces, clusters: validClusters, cgroups, clusterPreviews }; }; diff --git a/web/packages/new/photos/services/ml/index.ts b/web/packages/new/photos/services/ml/index.ts index 53940232f6..df8d08235c 100644 --- a/web/packages/new/photos/services/ml/index.ts +++ b/web/packages/new/photos/services/ml/index.ts @@ -360,6 +360,18 @@ export interface FaceFileNeighbour { cosineSimilarity: number; } +// "with file" +export interface ClusterPreviewWF { + clusterSize: number; + faces: ClusterPreviewFaceWF[]; +} + +interface ClusterPreviewFaceWF { + face: Face; + enteFile: EnteFile; + cosineSimilarity: number; +} + export interface ClusterDebugPageContents { faceFNs: FaceFileNeighbours[]; clusters: FaceCluster[]; @@ -377,7 +389,7 @@ export const wipClusterDebugPageContents = async (): Promise< triggerStatusUpdate(); // const { faceAndNeigbours, clusters, cgroups } = await clusterFaces( - const { faceAndNeigbours, clusters, cgroups } = await clusterFacesHdb( + const { clusterPreviews, clusters, cgroups } = await clusterFacesHdb( await faceIndexes(), ); const searchPersons = await convertToSearchPersons(clusters, cgroups); @@ -387,9 +399,19 @@ export const wipClusterDebugPageContents = async (): Promise< const fileForFace = ({ faceID }: Face) => ensure(localFileByID.get(ensure(fileIDFromFaceID(faceID)))); - const faceFNs = faceAndNeigbours.map(({ face, neighbours }) => ({ - face, - neighbours: neighbours.map(({ face, cosineSimilarity }) => ({ + // const faceFNs = faceAndNeigbours.map( + // ({ topFace: face, faces: neighbours }) => ({ + // face, + // neighbours: neighbours.map(({ face, cosineSimilarity }) => ({ + // face, + // enteFile: fileForFace(face), + // cosineSimilarity, + // })), + // }), + // ); + const clusterPreviewWFs = clusterPreviews.map(({ clusterSize, faces }) => ({ + clusterSize, + faces: faces.map(({ face, cosineSimilarity }) => ({ face, enteFile: fileForFace(face), cosineSimilarity, @@ -406,7 +428,7 @@ export const wipClusterDebugPageContents = async (): Promise< _wip_searchPersons = searchPersons; triggerStatusUpdate(); - return { faceFNs, clusters, clusterIDForFaceID }; + return { clusterPreviewWFs, clusters, clusterIDForFaceID }; }; export const wipCluster = () => void wipClusterDebugPageContents(); From 3d952120233fe9de9b5a8a2dfc46422bfb4aba08 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 29 Aug 2024 19:20:56 +0530 Subject: [PATCH 05/11] Preview --- web/apps/photos/src/pages/cluster-debug.tsx | 61 +++++++++++-------- .../new/photos/services/ml/cluster-new.ts | 7 ++- web/packages/new/photos/services/ml/index.ts | 5 +- 3 files changed, 42 insertions(+), 31 deletions(-) diff --git a/web/apps/photos/src/pages/cluster-debug.tsx b/web/apps/photos/src/pages/cluster-debug.tsx index dcffaecfd2..23930f1f03 100644 --- a/web/apps/photos/src/pages/cluster-debug.tsx +++ b/web/apps/photos/src/pages/cluster-debug.tsx @@ -4,10 +4,9 @@ import { faceCrop, wipClusterDebugPageContents, type ClusterDebugPageContents, - type FaceFileNeighbour, - type FaceFileNeighbours, + type ClusterPreviewFaceWF, + type ClusterPreviewWF, } from "@/new/photos/services/ml"; -import type { Face } from "@/new/photos/services/ml/face"; import { FlexWrapper, FluidContainer, @@ -15,7 +14,7 @@ import { } from "@ente/shared/components/Container"; import EnteSpinner from "@ente/shared/components/EnteSpinner"; import BackButton from "@mui/icons-material/ArrowBackOutlined"; -import { Box, IconButton, styled, Typography } from "@mui/material"; +import { Box, IconButton, Stack, styled, Typography } from "@mui/material"; import { useRouter } from "next/router"; import { AppContext } from "pages/_app"; import React, { useContext, useEffect, useMemo, useRef, useState } from "react"; @@ -53,8 +52,9 @@ export default function ClusterDebug() { {`${clusterRes.clusters.length} clusters`} - Showing only top 20 and bottom 10 clusters (and only up to 50 faces in - each, sorted by cosine distance to highest scoring face in the cluster). + Showing only top 20 and bottom 10 clusters (and only up to 50 + faces in each, sorted by cosine distance to highest scoring face + in the cluster).
@@ -112,7 +112,7 @@ const ClusterPhotoList: React.FC = ({ width, clusterRes, }) => { - const { faceFNs, clusterIDForFaceID } = clusterRes; + const { clusterPreviewWFs, clusterIDForFaceID } = clusterRes; const [itemList, setItemList] = useState([]); const listRef = useRef(null); @@ -125,8 +125,8 @@ const ClusterPhotoList: React.FC = ({ const listItemHeight = 120 * shrinkRatio + 24 + 4; useEffect(() => { - setItemList(itemListFromFaceFNs(faceFNs, columns)); - }, [columns, faceFNs]); + setItemList(itemListFromClusterPreviewWFs(clusterPreviewWFs, columns)); + }, [columns, clusterPreviewWFs]); useEffect(() => { listRef.current?.resetAfterIndex(0); @@ -138,7 +138,7 @@ const ClusterPhotoList: React.FC = ({ const generateKey = (i: number) => Array.isArray(itemList[i]) ? `${itemList[i][0].enteFile.id}/${itemList[i][0].face.faceID}-${itemList[i].slice(-1)[0].enteFile.id}/${itemList[i].slice(-1)[0].face.faceID}-${i}` - : `${itemList[i].faceID}-${i}`; + : `${itemList[i]}-${i}`; return ( = ({ > {!Array.isArray(item) ? ( - {`score ${item.score.toFixed(2)} blur ${item.blur.toFixed(0)}`} + {`cluster size ${item.toFixed(2)}`} ) : ( - item.map((faceFN, i) => ( + item.map((faceWF, i) => ( )) )} @@ -181,19 +181,20 @@ const ClusterPhotoList: React.FC = ({ ); }; -type ItemListItem = Face | FaceFileNeighbour[]; +// type ItemListItem = Face | FaceFileNeighbour[]; +type ItemListItem = number | ClusterPreviewFaceWF[]; -const itemListFromFaceFNs = ( - faceFNs: FaceFileNeighbours[], +const itemListFromClusterPreviewWFs = ( + clusterPreviewWFs: ClusterPreviewWF[], columns: number, ) => { const result: ItemListItem[] = []; - for (let index = 0; index < faceFNs.length; index++) { - const { face, neighbours } = faceFNs[index]; - result.push(face); + for (let index = 0; index < clusterPreviewWFs.length; index++) { + const { clusterSize, faces } = clusterPreviewWFs[index]; + result.push(clusterSize); let lastIndex = 0; - while (lastIndex < neighbours.length) { - result.push(neighbours.slice(lastIndex, lastIndex + columns)); + while (lastIndex < faces.length) { + result.push(faces.slice(lastIndex, lastIndex + columns)); lastIndex += columns; } } @@ -210,12 +211,12 @@ const getShrinkRatio = (width: number, columns: number) => (columns * 120); interface FaceItemProps { - faceFN: FaceFileNeighbour; + faceWF: ClusterPreviewFaceWF; clusterIDForFaceID: Map; } -const FaceItem: React.FC = ({ faceFN, clusterIDForFaceID }) => { - const { face, enteFile, cosineSimilarity } = faceFN; +const FaceItem: React.FC = ({ faceWF, clusterIDForFaceID }) => { + const { face, enteFile, cosineSimilarity } = faceWF; const { faceID } = face; const [objectURL, setObjectURL] = useState(); @@ -252,9 +253,15 @@ const FaceItem: React.FC = ({ faceFN, clusterIDForFaceID }) => { src={objectURL} /> )} - - {cosineSimilarity.toFixed(2)} - + + + {`${face.blur.toFixed(0)} blr`} + + + + {`cos ${cosineSimilarity.toFixed(2)}`} + + ); }; diff --git a/web/packages/new/photos/services/ml/cluster-new.ts b/web/packages/new/photos/services/ml/cluster-new.ts index 983c128a6a..e2bf78eb35 100644 --- a/web/packages/new/photos/services/ml/cluster-new.ts +++ b/web/packages/new/photos/services/ml/cluster-new.ts @@ -352,7 +352,10 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => { const t = Date.now(); // A flattened array of faces. - const faces0 = [...enumerateFaces(faceIndexes)]; + // TODO-Cluster ad-hoc filtering and slicing + const faces0 = [...enumerateFaces(faceIndexes)] + .filter((f) => f.blur > 50) + .slice(0, 1000); // TODO-Cluster testing code, can be removed once done const faces = Array(1) .fill(0) @@ -433,7 +436,7 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => { // Use a higher cosine similarity threshold if either of the two // faces are blurry. const threshold = - existingFace.blur < 100 || newFace.blur < 100 ? 0.84 : 0.7; + existingFace.blur < 100 || newFace.blur < 100 ? 0.9 : 0.7; if (csim > threshold && csim > nnCosineSimilarity) { nnCluster = existingCluster; nnCosineSimilarity = csim; diff --git a/web/packages/new/photos/services/ml/index.ts b/web/packages/new/photos/services/ml/index.ts index df8d08235c..699a9b9c14 100644 --- a/web/packages/new/photos/services/ml/index.ts +++ b/web/packages/new/photos/services/ml/index.ts @@ -366,14 +366,15 @@ export interface ClusterPreviewWF { faces: ClusterPreviewFaceWF[]; } -interface ClusterPreviewFaceWF { +export interface ClusterPreviewFaceWF { face: Face; enteFile: EnteFile; cosineSimilarity: number; } export interface ClusterDebugPageContents { - faceFNs: FaceFileNeighbours[]; + // faceFNs: FaceFileNeighbours[]; + clusterPreviewWFs: ClusterPreviewWF[]; clusters: FaceCluster[]; clusterIDForFaceID: Map; } From 29b5830e19a861e20fc646d7db5a8bfcbbab8382 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 29 Aug 2024 19:31:24 +0530 Subject: [PATCH 06/11] Print scores --- web/apps/photos/src/pages/cluster-debug.tsx | 8 +++++--- web/packages/new/photos/services/ml/cluster-new.ts | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/web/apps/photos/src/pages/cluster-debug.tsx b/web/apps/photos/src/pages/cluster-debug.tsx index 23930f1f03..62d2e05df4 100644 --- a/web/apps/photos/src/pages/cluster-debug.tsx +++ b/web/apps/photos/src/pages/cluster-debug.tsx @@ -255,11 +255,13 @@ const FaceItem: React.FC = ({ faceWF, clusterIDForFaceID }) => { )} - {`${face.blur.toFixed(0)} blr`} + {`b ${face.blur.toFixed(0)} b`} - - {`cos ${cosineSimilarity.toFixed(2)}`} + {`s ${face.score.toFixed(2)}`} + + + {`c ${cosineSimilarity.toFixed(2)}`} diff --git a/web/packages/new/photos/services/ml/cluster-new.ts b/web/packages/new/photos/services/ml/cluster-new.ts index e2bf78eb35..e2149db180 100644 --- a/web/packages/new/photos/services/ml/cluster-new.ts +++ b/web/packages/new/photos/services/ml/cluster-new.ts @@ -355,7 +355,7 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => { // TODO-Cluster ad-hoc filtering and slicing const faces0 = [...enumerateFaces(faceIndexes)] .filter((f) => f.blur > 50) - .slice(0, 1000); + .slice(0, 6000); // TODO-Cluster testing code, can be removed once done const faces = Array(1) .fill(0) From c9acda1b6d4b25149772039607bcbc8ff9e5dec9 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 29 Aug 2024 19:47:16 +0530 Subject: [PATCH 07/11] Show direction --- web/apps/photos/src/pages/cluster-debug.tsx | 15 ++++++++++++--- web/packages/new/photos/services/ml/face.ts | 2 +- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/web/apps/photos/src/pages/cluster-debug.tsx b/web/apps/photos/src/pages/cluster-debug.tsx index 62d2e05df4..0798ad608d 100644 --- a/web/apps/photos/src/pages/cluster-debug.tsx +++ b/web/apps/photos/src/pages/cluster-debug.tsx @@ -7,6 +7,7 @@ import { type ClusterPreviewFaceWF, type ClusterPreviewWF, } from "@/new/photos/services/ml"; +import { faceDirection } from "@/new/photos/services/ml/face"; import { FlexWrapper, FluidContainer, @@ -236,6 +237,8 @@ const FaceItem: React.FC = ({ faceWF, clusterIDForFaceID }) => { }; }, [faceID, enteFile]); + const fd = faceDirection(face.detection); + const d = fd == "straight" ? "•" : fd == "left" ? "←" : "→"; return ( = ({ faceWF, clusterIDForFaceID }) => { )} - {`b ${face.blur.toFixed(0)} b`} + {`b${face.blur.toFixed(0)} `} - {`s ${face.score.toFixed(2)}`} + {`s${face.score.toFixed(1)}`} - {`c ${cosineSimilarity.toFixed(2)}`} + {`c${cosineSimilarity.toFixed(1)}`} + + + {`c${cosineSimilarity.toFixed(1)}`} + + + {`d${d}`} diff --git a/web/packages/new/photos/services/ml/face.ts b/web/packages/new/photos/services/ml/face.ts index 891b605db2..d8616b7426 100644 --- a/web/packages/new/photos/services/ml/face.ts +++ b/web/packages/new/photos/services/ml/face.ts @@ -714,7 +714,7 @@ const detectBlur = ( type FaceDirection = "left" | "right" | "straight"; -const faceDirection = ({ landmarks }: FaceDetection): FaceDirection => { +export const faceDirection = ({ landmarks }: FaceDetection): FaceDirection => { const leftEye = landmarks[0]!; const rightEye = landmarks[1]!; const nose = landmarks[2]!; From 4fd32155dc98aee636a0f01e8933f18df91e3a7f Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 29 Aug 2024 19:55:05 +0530 Subject: [PATCH 08/11] Worker --- .../new/photos/services/ml/cluster-new.ts | 13 +++++++++--- web/packages/new/photos/services/ml/index.ts | 21 +++++-------------- web/packages/new/photos/services/ml/worker.ts | 7 +++++++ 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/web/packages/new/photos/services/ml/cluster-new.ts b/web/packages/new/photos/services/ml/cluster-new.ts index e2149db180..1258551439 100644 --- a/web/packages/new/photos/services/ml/cluster-new.ts +++ b/web/packages/new/photos/services/ml/cluster-new.ts @@ -399,7 +399,8 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => { // by restructuring the code, but hours of uninterruptible work is anyways // not feasible. - const batchSize = 10_000; + // const batchSize = 10_000; // TODO-Cluster + const batchSize = 1_000; for (let i = 0; i < faceEmbeddings.length; i += batchSize) { const embeddings = faceEmbeddings.slice(i, i + batchSize); const { clusters: hdbClusters } = clusterFacesHdbscan(embeddings); @@ -562,8 +563,14 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => { // cgroups, // }); log.info( - `Clustered ${faces.length} faces into ${validClusters.length} clusters (${Date.now() - t} ms)`, + `Clustered ${faces.length} faces into ${validClusters.length} clusters, with ${faces.length - clusterIDForFaceID.size} faces remaining unclustered (${Date.now() - t} ms)`, ); - return { faces, clusters: validClusters, cgroups, clusterPreviews }; + return { + faces, + clusters: validClusters, + cgroups, + clusterPreviews, + clusterIDForFaceID, + }; }; diff --git a/web/packages/new/photos/services/ml/index.ts b/web/packages/new/photos/services/ml/index.ts index 699a9b9c14..9f85f47119 100644 --- a/web/packages/new/photos/services/ml/index.ts +++ b/web/packages/new/photos/services/ml/index.ts @@ -20,14 +20,9 @@ import { getAllLocalFiles } from "../files"; import { getRemoteFlag, updateRemoteFlag } from "../remote-store"; import type { SearchPerson } from "../search/types"; import type { UploadItem } from "../upload/types"; -import { clusterFacesHdb, type CGroup, type FaceCluster } from "./cluster-new"; +import { type CGroup, type FaceCluster } from "./cluster-new"; import { regenerateFaceCrops } from "./crop"; -import { - clearMLDB, - faceIndex, - faceIndexes, - indexableAndIndexedCounts, -} from "./db"; +import { clearMLDB, faceIndex, indexableAndIndexedCounts } from "./db"; import type { Face } from "./face"; import { MLWorker } from "./worker"; import type { CLIPMatches } from "./worker-types"; @@ -390,9 +385,9 @@ export const wipClusterDebugPageContents = async (): Promise< triggerStatusUpdate(); // const { faceAndNeigbours, clusters, cgroups } = await clusterFaces( - const { clusterPreviews, clusters, cgroups } = await clusterFacesHdb( - await faceIndexes(), - ); + const { clusterPreviews, clusters, cgroups, clusterIDForFaceID } = + await worker().then((w) => w.clusterFacesHdb()); + const searchPersons = await convertToSearchPersons(clusters, cgroups); const localFiles = await getAllLocalFiles(); @@ -419,12 +414,6 @@ export const wipClusterDebugPageContents = async (): Promise< })), })); - const clusterIDForFaceID = new Map( - clusters.flatMap((cluster) => - cluster.faceIDs.map((id) => [id, cluster.id]), - ), - ); - _wip_isClustering = false; _wip_searchPersons = searchPersons; triggerStatusUpdate(); diff --git a/web/packages/new/photos/services/ml/worker.ts b/web/packages/new/photos/services/ml/worker.ts index f21f58d85a..e4a3e5ecab 100644 --- a/web/packages/new/photos/services/ml/worker.ts +++ b/web/packages/new/photos/services/ml/worker.ts @@ -24,8 +24,10 @@ import { indexCLIP, type CLIPIndex, } from "./clip"; +import { clusterFacesHdb } from "./cluster-new"; import { saveFaceCrops } from "./crop"; import { + faceIndexes, indexableFileIDs, markIndexingFailed, saveIndexes, @@ -272,6 +274,11 @@ export class MLWorker { remoteMLData: mlDataByID.get(id), })); } + + // TODO-Cluster + async clusterFacesHdb() { + return clusterFacesHdb(await faceIndexes()); + } } expose(MLWorker); From 15884597b4c27e4253f1db88eaf14360dcf27d51 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 29 Aug 2024 19:57:30 +0530 Subject: [PATCH 09/11] uc --- web/apps/photos/src/pages/cluster-debug.tsx | 5 +---- .../new/photos/services/ml/cluster-new.ts | 11 +++++++--- web/packages/new/photos/services/ml/index.ts | 20 ++++++++++++++++--- 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/web/apps/photos/src/pages/cluster-debug.tsx b/web/apps/photos/src/pages/cluster-debug.tsx index 0798ad608d..db187751de 100644 --- a/web/apps/photos/src/pages/cluster-debug.tsx +++ b/web/apps/photos/src/pages/cluster-debug.tsx @@ -50,7 +50,7 @@ export default function ClusterDebug() { return ( <> - {`${clusterRes.clusters.length} clusters`} + {`${clusterRes.clusters.length} clusters from ${clusterRes.clusteredCount} faces. ${clusterRes.unclusteredCount} unclustered faces.`} Showing only top 20 and bottom 10 clusters (and only up to 50 @@ -266,9 +266,6 @@ const FaceItem: React.FC = ({ faceWF, clusterIDForFaceID }) => { {`c${cosineSimilarity.toFixed(1)}`} - - {`c${cosineSimilarity.toFixed(1)}`} - {`d${d}`} diff --git a/web/packages/new/photos/services/ml/cluster-new.ts b/web/packages/new/photos/services/ml/cluster-new.ts index 1258551439..e49db72385 100644 --- a/web/packages/new/photos/services/ml/cluster-new.ts +++ b/web/packages/new/photos/services/ml/cluster-new.ts @@ -354,7 +354,7 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => { // A flattened array of faces. // TODO-Cluster ad-hoc filtering and slicing const faces0 = [...enumerateFaces(faceIndexes)] - .filter((f) => f.blur > 50) + .filter((f) => f.blur > 99) .slice(0, 6000); // TODO-Cluster testing code, can be removed once done const faces = Array(1) @@ -437,7 +437,7 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => { // Use a higher cosine similarity threshold if either of the two // faces are blurry. const threshold = - existingFace.blur < 100 || newFace.blur < 100 ? 0.9 : 0.7; + existingFace.blur < 200 || newFace.blur < 200 ? 0.9 : 0.7; if (csim > threshold && csim > nnCosineSimilarity) { nnCluster = existingCluster; nnCosineSimilarity = csim; @@ -566,8 +566,13 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => { `Clustered ${faces.length} faces into ${validClusters.length} clusters, with ${faces.length - clusterIDForFaceID.size} faces remaining unclustered (${Date.now() - t} ms)`, ); + const clusteredCount = clusterIDForFaceID.size + const unclusteredCount = faces.length - clusteredCount; + return { - faces, + // faces, + clusteredCount, + unclusteredCount, clusters: validClusters, cgroups, clusterPreviews, diff --git a/web/packages/new/photos/services/ml/index.ts b/web/packages/new/photos/services/ml/index.ts index 9f85f47119..3f588c09ad 100644 --- a/web/packages/new/photos/services/ml/index.ts +++ b/web/packages/new/photos/services/ml/index.ts @@ -368,6 +368,8 @@ export interface ClusterPreviewFaceWF { } export interface ClusterDebugPageContents { + clusteredCount: number; + unclusteredCount: number; // faceFNs: FaceFileNeighbours[]; clusterPreviewWFs: ClusterPreviewWF[]; clusters: FaceCluster[]; @@ -385,8 +387,14 @@ export const wipClusterDebugPageContents = async (): Promise< triggerStatusUpdate(); // const { faceAndNeigbours, clusters, cgroups } = await clusterFaces( - const { clusterPreviews, clusters, cgroups, clusterIDForFaceID } = - await worker().then((w) => w.clusterFacesHdb()); + const { + clusteredCount, + unclusteredCount, + clusterPreviews, + clusters, + cgroups, + clusterIDForFaceID, + } = await worker().then((w) => w.clusterFacesHdb()); const searchPersons = await convertToSearchPersons(clusters, cgroups); @@ -418,7 +426,13 @@ export const wipClusterDebugPageContents = async (): Promise< _wip_searchPersons = searchPersons; triggerStatusUpdate(); - return { clusterPreviewWFs, clusters, clusterIDForFaceID }; + return { + clusteredCount, + unclusteredCount, + clusterPreviewWFs, + clusters, + clusterIDForFaceID, + }; }; export const wipCluster = () => void wipClusterDebugPageContents(); From d6c7ab0735087aaff9eee6c9b574eaefcaa3fa19 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 29 Aug 2024 20:18:31 +0530 Subject: [PATCH 10/11] Inline --- .../new/photos/services/ml/cluster-new.ts | 66 ++++++++++++------- web/packages/new/photos/services/ml/index.ts | 31 ++++++++- 2 files changed, 73 insertions(+), 24 deletions(-) diff --git a/web/packages/new/photos/services/ml/cluster-new.ts b/web/packages/new/photos/services/ml/cluster-new.ts index e49db72385..df97ca4bd9 100644 --- a/web/packages/new/photos/services/ml/cluster-new.ts +++ b/web/packages/new/photos/services/ml/cluster-new.ts @@ -525,34 +525,56 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => { // TODO-Cluster this is likely not needed since hdbscan already has a min? const validClusters = clusters.filter(({ faceIDs }) => faceIDs.length > 1); - let cgroups = await clusterGroups(); + // let cgroups = await clusterGroups(); + + // // TODO-Cluster - Currently we're not syncing with remote or saving anything + // // locally, so cgroups will be empty. Create a temporary (unsaved, unsynced) + // // cgroup, one per cluster. + // cgroups = cgroups.concat( + // validClusters.map((c) => ({ + // id: c.id, + // name: undefined, + // clusterIDs: [c.id], + // isHidden: false, + // avatarFaceID: undefined, + // displayFaceID: undefined, + // })), + // ); + + // // For each cluster group, use the highest scoring face in any of its + // // clusters as its display face. + // for (const cgroup of cgroups) { + // cgroup.displayFaceID = cgroup.clusterIDs + // .map((clusterID) => clusterIndexForClusterID.get(clusterID)) + // .filter((i) => i !== undefined) /* 0 is a valid index */ + // .flatMap((i) => clusters[i]?.faceIDs ?? []) + // .map((faceID) => faceForFaceID.get(faceID)) + // .filter((face) => !!face) + // .reduce((max, face) => + // max.score > face.score ? max : face, + // ).faceID; + // } // TODO-Cluster - Currently we're not syncing with remote or saving anything // locally, so cgroups will be empty. Create a temporary (unsaved, unsynced) // cgroup, one per cluster. - cgroups = cgroups.concat( - validClusters.map((c) => ({ - id: c.id, + + const cgroups: CGroup[] = []; + for (const cluster of sortedClusters) { + const faces = cluster.faceIDs.map((id) => + ensure(faceForFaceID.get(id)), + ); + const topFace = faces.reduce((max, face) => + max.score > face.score ? max : face, + ); + cgroups.push({ + id: cluster.id, name: undefined, - clusterIDs: [c.id], + clusterIDs: [cluster.id], isHidden: false, avatarFaceID: undefined, - displayFaceID: undefined, - })), - ); - - // For each cluster group, use the highest scoring face in any of its - // clusters as its display face. - for (const cgroup of cgroups) { - cgroup.displayFaceID = cgroup.clusterIDs - .map((clusterID) => clusterIndexForClusterID.get(clusterID)) - .filter((i) => i !== undefined) /* 0 is a valid index */ - .flatMap((i) => clusters[i]?.faceIDs ?? []) - .map((faceID) => faceForFaceID.get(faceID)) - .filter((face) => !!face) - .reduce((max, face) => - max.score > face.score ? max : face, - ).faceID; + displayFaceID: topFace.faceID, + }); } // log.info("ml/cluster", { @@ -566,7 +588,7 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => { `Clustered ${faces.length} faces into ${validClusters.length} clusters, with ${faces.length - clusterIDForFaceID.size} faces remaining unclustered (${Date.now() - t} ms)`, ); - const clusteredCount = clusterIDForFaceID.size + const clusteredCount = clusterIDForFaceID.size; const unclusteredCount = faces.length - clusteredCount; return { diff --git a/web/packages/new/photos/services/ml/index.ts b/web/packages/new/photos/services/ml/index.ts index 3f588c09ad..4248c295f0 100644 --- a/web/packages/new/photos/services/ml/index.ts +++ b/web/packages/new/photos/services/ml/index.ts @@ -396,7 +396,7 @@ export const wipClusterDebugPageContents = async (): Promise< clusterIDForFaceID, } = await worker().then((w) => w.clusterFacesHdb()); - const searchPersons = await convertToSearchPersons(clusters, cgroups); + // const searchPersons = await convertToSearchPersons(clusters, cgroups); const localFiles = await getAllLocalFiles(); const localFileByID = new Map(localFiles.map((f) => [f.id, f])); @@ -422,6 +422,32 @@ export const wipClusterDebugPageContents = async (): Promise< })), })); + const clusterByID = new Map(clusters.map((c) => [c.id, c])); + + const searchPersons = cgroups + .map((cgroup) => { + const faceID = ensure(cgroup.displayFaceID); + const fileID = ensure(fileIDFromFaceID(faceID)); + const file = ensure(localFileByID.get(fileID)); + + const faceIDs = cgroup.clusterIDs + .map((id) => ensure(clusterByID.get(id))) + .flatMap((cluster) => cluster.faceIDs); + const fileIDs = faceIDs + .map((faceID) => fileIDFromFaceID(faceID)) + .filter((fileID) => fileID !== undefined); + + return { + id: cgroup.id, + name: cgroup.name, + faceIDs, + files: [...new Set(fileIDs)], + displayFaceID: faceID, + displayFaceFile: file, + }; + }) + .sort((a, b) => b.faceIDs.length - a.faceIDs.length); + _wip_isClustering = false; _wip_searchPersons = searchPersons; triggerStatusUpdate(); @@ -437,7 +463,8 @@ export const wipClusterDebugPageContents = async (): Promise< export const wipCluster = () => void wipClusterDebugPageContents(); -const convertToSearchPersons = async ( +// TODO-Cluster remove me +export const convertToSearchPersons = async ( clusters: FaceCluster[], cgroups: CGroup[], ) => { From d5a1187e13a20c3c58d73b01314eb3447ce14843 Mon Sep 17 00:00:00 2001 From: Manav Rathi Date: Thu, 29 Aug 2024 20:33:18 +0530 Subject: [PATCH 11/11] Prep --- web/apps/photos/src/pages/cluster-debug.tsx | 24 +++++++++----- .../new/photos/components/MLSettings.tsx | 15 +++++---- .../new/photos/services/ml/cluster-new.ts | 32 +++++++++---------- 3 files changed, 40 insertions(+), 31 deletions(-) diff --git a/web/apps/photos/src/pages/cluster-debug.tsx b/web/apps/photos/src/pages/cluster-debug.tsx index db187751de..fcbeadfcab 100644 --- a/web/apps/photos/src/pages/cluster-debug.tsx +++ b/web/apps/photos/src/pages/cluster-debug.tsx @@ -49,14 +49,22 @@ export default function ClusterDebug() { } return ( <> - - {`${clusterRes.clusters.length} clusters from ${clusterRes.clusteredCount} faces. ${clusterRes.unclusteredCount} unclustered faces.`} - - - Showing only top 20 and bottom 10 clusters (and only up to 50 - faces in each, sorted by cosine distance to highest scoring face - in the cluster). - + + + {`${clusterRes.clusters.length} clusters from ${clusterRes.clusteredCount} faces. ${clusterRes.unclusteredCount} unclustered faces.`} + + + Showing only top 30 and bottom 30 clusters. + + + For each cluster showing only up to 50 faces, sorted by + cosine similarity to highest scoring face in the cluster. + + + Below each face is its{" "} + blur - score - cosineSimilarity - direction + +
diff --git a/web/packages/new/photos/components/MLSettings.tsx b/web/packages/new/photos/components/MLSettings.tsx index eeff4d1be8..c8785110b6 100644 --- a/web/packages/new/photos/components/MLSettings.tsx +++ b/web/packages/new/photos/components/MLSettings.tsx @@ -8,7 +8,6 @@ import { enableML, mlStatusSnapshot, mlStatusSubscribe, - wipCluster, wipClusterEnable, type MLStatus, } from "@/new/photos/services/ml"; @@ -341,7 +340,7 @@ const ManageML: React.FC = ({ // TODO-Cluster const router = useRouter(); - const wipClusterNow = () => wipCluster(); + // const wipClusterNow = () => wipCluster(); const wipClusterShowNow = () => router.push("/cluster-debug"); return ( @@ -391,18 +390,20 @@ const ManageML: React.FC = ({ )} - {showClusterOpt && ( + {/* {showClusterOpt && ( = ({ )} /> - )} + )} */} ); }; diff --git a/web/packages/new/photos/services/ml/cluster-new.ts b/web/packages/new/photos/services/ml/cluster-new.ts index df97ca4bd9..d6e1dc505a 100644 --- a/web/packages/new/photos/services/ml/cluster-new.ts +++ b/web/packages/new/photos/services/ml/cluster-new.ts @@ -348,14 +348,13 @@ function* enumerateFaces(faceIndices: FaceIndex[]) { } } -export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => { +export const clusterFacesHdb = (faceIndexes: FaceIndex[]) => { const t = Date.now(); // A flattened array of faces. // TODO-Cluster ad-hoc filtering and slicing - const faces0 = [...enumerateFaces(faceIndexes)] - .filter((f) => f.blur > 99) - .slice(0, 6000); + const faces0 = [...enumerateFaces(faceIndexes)].filter((f) => f.blur > 99); + // .slice(0, 6000); // TODO-Cluster testing code, can be removed once done const faces = Array(1) .fill(0) @@ -386,27 +385,28 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => { // be list of existing clusters we fetch from remote. const clusters: FaceCluster[] = []; - // Process the faces in batches of 10k. The faces are already sorted by file - // ID, which is a monotonically increasing integer, so we will also have - // some temporal locality. + // Process the faces in batches. The faces are already sorted by file ID, + // which is a monotonically increasing integer, so we will also have some + // temporal locality. // - // The number 10k was derived by ad-hoc observations. On a particular test - // dataset, clustering 10k took ~2 mins, while 20k took ~8 mins. Memory - // usage was constant in both cases. + // The number 2500 was derived by ad-hoc observations and takes a few + // seconds. On a particular test dataset and a particular machine, + // clustering 1k took ~2 seconds, 10k took ~2 mins, while 20k took ~8 mins. + // Memory usage was constant in all these cases. // // At around 100k faces, the clustering starts taking hours, and we start // running into stack overflows. The stack overflows can perhaps be avoided // by restructuring the code, but hours of uninterruptible work is anyways // not feasible. - // const batchSize = 10_000; // TODO-Cluster - const batchSize = 1_000; + const batchSize = 2500; for (let i = 0; i < faceEmbeddings.length; i += batchSize) { + const it = Date.now(); const embeddings = faceEmbeddings.slice(i, i + batchSize); const { clusters: hdbClusters } = clusterFacesHdbscan(embeddings); log.info( - `hdbscan produced ${hdbClusters.length} clusters from ${embeddings.length} faces (${Date.now() - t} ms)`, + `hdbscan produced ${hdbClusters.length} clusters from ${embeddings.length} faces (${Date.now() - it} ms)`, ); // Merge the new clusters we got from hdbscan into the existing clusters @@ -489,7 +489,7 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => { // Convert into the data structure we're using to debug/visualize. // - // > Showing only top 20 and bottom 10 clusters (and only up to 50 faces in + // > Showing only top 30 and bottom 30 clusters (and only up to 50 faces in // > each, sorted by cosine distance to highest scoring face in the // > cluster). @@ -497,9 +497,9 @@ export const clusterFacesHdb = async (faceIndexes: FaceIndex[]) => { (a, b) => b.faceIDs.length - a.faceIDs.length, ); const debugClusters = - sortedClusters.length < 30 + sortedClusters.length < 60 ? sortedClusters - : sortedClusters.slice(0, 20).concat(sortedClusters.slice(-10)); + : sortedClusters.slice(0, 30).concat(sortedClusters.slice(-30)); const clusterPreviews: ClusterPreview[] = []; for (const cluster of debugClusters) { const faces = cluster.faceIDs.map((id) =>