Debugging shows that random sampling is taking a good fraction of the total prep time

prep is the phase that takes the most time, almost 10x that of loop/post
2024-10-16 20:49:20 +05:30
parent 1e72a3ba34
commit 15b151fcc0
1 changed files with 42 additions and 1 deletions
--- a/web/packages/new/photos/services/ml/people.ts
+++ b/web/packages/new/photos/services/ml/people.ts
@@ -366,6 +366,8 @@ export const _suggestionsAndChoicesForPerson = async (
    console.time("prep");
    const startTime = Date.now();

+    console.time("prep/1");
+
    const personClusters = person.cgroup.data.assigned;
    // TODO-Cluster: Persist this.
    const ignoredClusters: FaceCluster[] = [];
@@ -373,6 +375,9 @@ export const _suggestionsAndChoicesForPerson = async (
    const clusters = await savedFaceClusters();
    const faceIndexes = await savedFaceIndexes();

+    console.timeEnd("prep/1");
+    console.time("prep/2");
+
    const embeddingByFaceID = new Map(
        faceIndexes
            .map(({ faces }) =>
@@ -383,6 +388,9 @@ export const _suggestionsAndChoicesForPerson = async (
            .flat(),
    );

+    console.timeEnd("prep/2");
+    console.time("prep/3");
+
    const personClusterIDs = new Set(personClusters.map(({ id }) => id));
    const ignoredClusterIDs = new Set(ignoredClusters.map(({ id }) => id));

@@ -391,9 +399,13 @@ export const _suggestionsAndChoicesForPerson = async (
        .flat()
        .filter((e) => !!e);

+    console.timeEnd("prep/3");
+    console.time("prep/4");
+
    // Randomly sample faces to limit the O(n^2) cost.
    const sampledPersonEmbeddings = randomSample(personFaceEmbeddings, 50);

+    console.timeEnd("prep/4");
    console.timeEnd("prep");

    console.time("loop");
@@ -507,5 +519,34 @@ export const _suggestionsAndChoicesForPerson = async (
    return { choices, suggestions };
 };

-const randomSample = <T>(items: T[], n: number) =>
+/**
+ * Return a random sample of {@link n} elements from the given {@link items}.
+ *
+ * Functionally this is equivalent to `shuffled(items).slice(0, n)`, except it
+ * tries to be a bit faster for long arrays when we need only a small sample
+ * from it. If there are at most {@link n} items, they're returned as is.
+ */
+const randomSample = <T>(items: T[], n: number): T[] => {
+    if (items.length <= n) return items;
+    if (n <= 0) return [];
+
+    if (n > items.length / 3) {
+        // Avoid using the random sampling without replacement method if a
+        // significant proportion of the original items are needed, otherwise we
+        // might run into a long retry loop at the tail end (hitting the same
+        // indexes again and again).
+        return shuffled(items).slice(0, n);
+    }
+
+    const ix = new Set<number>();
+    while (ix.size < n) {
+        ix.add(Math.floor(Math.random() * items.length));
+    }
+    // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
+    return [...ix].map((i) => items[i]!);
+};
+
+// The earlier implementation of randomSample, presumably kept around for
+// comparing against the new one while debugging.
+const randomSampleOld = <T>(items: T[], n: number) =>
     items.length < n ? items : shuffled(items).slice(0, n);