[desktop] Indexing tweaks (#2749)

This commit is contained in:
Manav Rathi
2024-08-19 15:14:05 +05:30
committed by GitHub
10 changed files with 84 additions and 41 deletions

View File

@@ -52,9 +52,7 @@ const App: React.FC<AppProps> = ({ Component, pageProps }) => {
setDialogBoxAttributesV2,
};
const title = isI18nReady
? t("title", { context: "accounts" })
: staticAppTitle;
const title = isI18nReady ? t("title_accounts") : staticAppTitle;
return (
<>

View File

@@ -150,9 +150,7 @@ const App: React.FC<AppProps> = ({ Component, pageProps }) => {
somethingWentWrong,
};
const title = isI18nReady
? t("title", { context: "auth" })
: staticAppTitle;
const title = isI18nReady ? t("title_auth") : staticAppTitle;
return (
<>

View File

@@ -330,9 +330,7 @@ export default function App({ Component, pageProps }: AppProps) {
logout,
};
const title = isI18nReady
? t("title", { context: "photos" })
: staticAppTitle;
const title = isI18nReady ? t("title_photos") : staticAppTitle;
return (
<>

View File

@@ -193,6 +193,9 @@ export async function getMLStatusSuggestion(): Promise<Suggestion> {
case "indexing":
label = t("indexing_photos", status);
break;
case "fetching":
label = t("indexing_fetching", status);
break;
case "clustering":
label = t("indexing_people", status);
break;

View File

@@ -232,6 +232,7 @@
"PEOPLE": "People",
"indexing_scheduled": "Indexing is scheduled...",
"indexing_photos": "Indexing photos ({{nSyncedFiles, number}} / {{nTotalFiles, number}})",
"indexing_fetching": "Fetching indexes ({{nSyncedFiles, number}} / {{nTotalFiles, number}})",
"indexing_people": "Indexing people in {{nSyncedFiles, number}} photos...",
"indexing_done": "Indexed {{nSyncedFiles, number}} photos",
"UNIDENTIFIED_FACES": "Unidentified faces",
@@ -484,6 +485,7 @@
"indexing": "Indexing",
"processed": "Processed",
"indexing_status_running": "Running",
"indexing_status_fetching": "Fetching",
"indexing_status_scheduled": "Scheduled",
"indexing_status_done": "Done",
"ml_search_disable": "Disable machine learning",

View File

@@ -299,15 +299,18 @@ const ManageML: React.FC<ManageMLProps> = ({
let status: string;
switch (phase) {
case "indexing":
status = "running";
break;
case "scheduled":
status = "scheduled";
status = t("indexing_status_scheduled");
break;
case "fetching":
status = t("indexing_status_fetching");
break;
case "indexing":
status = t("indexing_status_running");
break;
// TODO: Clustering
default:
status = "done";
status = t("indexing_status_done");
break;
}
const processed = `${nSyncedFiles} / ${nTotalFiles}`;
@@ -351,9 +354,7 @@ const ManageML: React.FC<ManageMLProps> = ({
<Typography color="text.faint">
{t("indexing")}
</Typography>
<Typography>
{t("indexing_status", { context: status })}
</Typography>
<Typography>{status}</Typography>
</Stack>
<Divider sx={{ marginInlineStart: 2 }} />
<Stack

View File

@@ -394,14 +394,23 @@ export const indexableAndIndexedCounts = async () => {
* universe, we filter out the fileIDs of files which have already been
* indexed, or which should be ignored.
*
* @param count Limit the result to up to {@link count} items.
* @param count Limit the result to up to {@link count} items. If there are more
* than {@link count} items present, the files with the higher file IDs (which
* can be taken as an approximation of their creation order) are preferred.
*/
export const indexableFileIDs = async (count?: number) => {
export const indexableFileIDs = async (count: number) => {
const db = await mlDB();
const tx = db.transaction("file-status", "readonly");
return tx.store
let cursor = await tx.store
.index("status")
.getAllKeys(IDBKeyRange.only("indexable"), count);
.openKeyCursor(IDBKeyRange.only("indexable"), "prev");
const result: number[] = [];
while (cursor && count > 0) {
result.push(cursor.primaryKey);
cursor = await cursor.continue();
count -= 1;
}
return result;
};
/**

View File

@@ -89,7 +89,7 @@ const worker = () =>
const createComlinkWorker = async () => {
const electron = ensureElectron();
const delegate = {
workerDidProcessFile,
workerDidProcessFileOrIdle,
};
// Obtain a message port from the Electron layer.
@@ -404,13 +404,16 @@ export type MLStatus =
*
* - "indexing": The indexer is currently running.
*
* - "fetching": The indexer is currently running, but we're primarily
* fetching indexes for existing files.
*
* - "clustering": All files we know of have been indexed, and we are now
* clustering the faces that were found.
*
* - "done": ML indexing and face clustering is complete for the user's
* library.
*/
phase: "scheduled" | "indexing" | "clustering" | "done";
phase: "scheduled" | "indexing" | "fetching" | "clustering" | "done";
/** The number of files that have already been indexed. */
nSyncedFiles: number;
/** The total number of files that are eligible for indexing. */
@@ -476,12 +479,19 @@ const getMLStatus = async (): Promise<MLStatus> => {
const { indexedCount, indexableCount } = await indexableAndIndexedCounts();
// During live uploads, the indexable count remains zero even as the indexer
// is processing the newly uploaded items. This is because these "live
// queue" items do not yet have a "file-status" entry.
//
// So use the state of the worker as a guide for the phase, not the
// indexable count.
let phase: MLStatus["phase"];
if (indexableCount > 0) {
const isIndexing = await (await worker()).isIndexing();
phase = !isIndexing ? "scheduled" : "indexing";
const state = await (await worker()).state;
if (state == "indexing" || state == "fetching") {
phase = state;
} else {
phase = "done";
phase = indexableCount > 0 ? "scheduled" : "done";
}
return {
@@ -513,7 +523,7 @@ const setInterimScheduledStatus = () => {
setMLStatusSnapshot({ phase: "scheduled", nSyncedFiles, nTotalFiles });
};
const workerDidProcessFile = throttled(updateMLStatusSnapshot, 2000);
const workerDidProcessFileOrIdle = throttled(updateMLStatusSnapshot, 2000);
/**
* Use CLIP to perform a natural language search over image embeddings.

View File

@@ -8,11 +8,10 @@
*/
export interface MLWorkerDelegate {
/**
* Called whenever a file is processed during indexing.
*
* It is called both when the indexing was successful or it failed.
* Called whenever the worker processes a file during indexing (either
* successfully or with errors), or when it goes into the "idle" state.
*/
workerDidProcessFile: () => void;
workerDidProcessFileOrIdle: () => void;
}
/**

View File

@@ -1,4 +1,5 @@
import { clientPackageName } from "@/base/app";
import { assertionFailed } from "@/base/assert";
import { isHTTP4xxError } from "@/base/http";
import { getKVN } from "@/base/kv";
import { ensureAuthToken } from "@/base/local-user";
@@ -39,6 +40,20 @@ import {
} from "./ml-data";
import type { CLIPMatches, MLWorkerDelegate } from "./worker-types";
/**
* A rough hint at what the worker is up to.
*
* - "idle": Not doing anything
* - "tick": Transitioning to a new state
* - "indexing": Indexing
* - "fetching": A subset of indexing
*
* During indexing, the state is set to "fetching" whenever remote provided us
* data for more than 50% of the files that we requested from it in the last
* fetch during indexing.
*/
export type WorkerState = "idle" | "tick" | "indexing" | "fetching";
const idleDurationStart = 5; /* 5 seconds */
const idleDurationMax = 16 * 60; /* 16 minutes */
@@ -76,9 +91,11 @@ interface IndexableItem {
* particular, for finding the closest CLIP match when the user does a search.
*/
export class MLWorker {
/** The last known state of the worker. */
public state: WorkerState = "idle";
private electron: ElectronMLWorker | undefined;
private delegate: MLWorkerDelegate | undefined;
private state: "idle" | "tick" | "indexing" = "idle";
private liveQ: IndexableItem[] = [];
private idleTimeout: ReturnType<typeof setTimeout> | undefined;
private idleDuration = idleDurationStart; /* unit: seconds */
@@ -164,13 +181,6 @@ export class MLWorker {
}
}
/**
* Return true if we're currently indexing.
*/
isIndexing() {
return this.state == "indexing";
}
/**
* Find {@link CLIPMatches} for a given {@link searchPhrase}.
*/
@@ -223,6 +233,7 @@ export class MLWorker {
this.state = "idle";
this.idleDuration = Math.min(this.idleDuration * 2, idleDurationMax);
this.idleTimeout = setTimeout(scheduleTick, this.idleDuration * 1000);
this.delegate?.workerDidProcessFileOrIdle();
}
/** Return the next batch of items to backfill (if any). */
@@ -234,8 +245,20 @@ export class MLWorker {
200,
);
if (!filesByID.size) return [];
// Fetch their existing ML data (if any).
const mlDataByID = await fetchMLData(filesByID);
// If the number of files for which remote gave us data is more than 50%
// of what we asked of it, assume we are "fetching", not "indexing".
// This is a heuristic to try and show a better indexing state in the UI
// (so that the user does not think that their files are being
// unnecessarily reindexed).
if (this.state != "indexing" && this.state != "fetching")
assertionFailed(`Unexpected state ${this.state}`);
this.state =
mlDataByID.size * 2 > filesByID.size ? "fetching" : "indexing";
// Return files after annotating them with their existing ML data.
return Array.from(filesByID, ([id, file]) => ({
enteFile: file,
@@ -298,7 +321,7 @@ const indexNextBatch = async (
await Promise.race(tasks);
// Let the main thread know we're doing something.
delegate?.workerDidProcessFile();
delegate?.workerDidProcessFileOrIdle();
// Let us drain the microtask queue. This also gives a chance for other
// interactive tasks like `clipMatches` to run.
@@ -317,6 +340,8 @@ const indexNextBatch = async (
* about. Then return the next {@link count} files that still need to be
* indexed.
*
* When returning from amongst pending files, prefer the most recent ones first.
*
* For specifics of what a "sync" entails, see {@link updateAssumingLocalFiles}.
*
* @param userID Sync only files owned by a {@link userID} with the face DB.