[desktop] Indexing tweaks (#2749)

This commit is contained in:
Manav Rathi
2024-08-19 15:14:05 +05:30
committed by GitHub
10 changed files with 84 additions and 41 deletions

View File

@@ -52,9 +52,7 @@ const App: React.FC<AppProps> = ({ Component, pageProps }) => {
setDialogBoxAttributesV2,
};
const title = isI18nReady
? t("title", { context: "accounts" })
: staticAppTitle;
const title = isI18nReady ? t("title_accounts") : staticAppTitle;
return (
<>

View File

@@ -150,9 +150,7 @@ const App: React.FC<AppProps> = ({ Component, pageProps }) => {
somethingWentWrong,
};
const title = isI18nReady
? t("title", { context: "auth" })
: staticAppTitle;
const title = isI18nReady ? t("title_auth") : staticAppTitle;
return (
<>

View File

@@ -330,9 +330,7 @@ export default function App({ Component, pageProps }: AppProps) {
logout,
};
const title = isI18nReady
? t("title", { context: "photos" })
: staticAppTitle;
const title = isI18nReady ? t("title_photos") : staticAppTitle;
return (
<>

View File

@@ -193,6 +193,9 @@ export async function getMLStatusSuggestion(): Promise<Suggestion> {
case "indexing":
label = t("indexing_photos", status);
break;
case "fetching":
label = t("indexing_fetching", status);
break;
case "clustering":
label = t("indexing_people", status);
break;

View File

@@ -232,6 +232,7 @@
"PEOPLE": "People",
"indexing_scheduled": "Indexing is scheduled...",
"indexing_photos": "Indexing photos ({{nSyncedFiles, number}} / {{nTotalFiles, number}})",
"indexing_fetching": "Fetching indexes ({{nSyncedFiles, number}} / {{nTotalFiles, number}})",
"indexing_people": "Indexing people in {{nSyncedFiles, number}} photos...",
"indexing_done": "Indexed {{nSyncedFiles, number}} photos",
"UNIDENTIFIED_FACES": "Unidentified faces",
@@ -484,6 +485,7 @@
"indexing": "Indexing",
"processed": "Processed",
"indexing_status_running": "Running",
"indexing_status_fetching": "Fetching",
"indexing_status_scheduled": "Scheduled",
"indexing_status_done": "Done",
"ml_search_disable": "Disable machine learning",

View File

@@ -299,15 +299,18 @@ const ManageML: React.FC<ManageMLProps> = ({
let status: string;
switch (phase) {
case "indexing":
status = "running";
break;
case "scheduled":
status = "scheduled";
status = t("indexing_status_scheduled");
break;
case "fetching":
status = t("indexing_status_fetching");
break;
case "indexing":
status = t("indexing_status_running");
break;
// TODO: Clustering
default:
status = "done";
status = t("indexing_status_done");
break;
}
const processed = `${nSyncedFiles} / ${nTotalFiles}`;
@@ -351,9 +354,7 @@ const ManageML: React.FC<ManageMLProps> = ({
<Typography color="text.faint">
{t("indexing")}
</Typography>
<Typography>
{t("indexing_status", { context: status })}
</Typography>
<Typography>{status}</Typography>
</Stack>
<Divider sx={{ marginInlineStart: 2 }} />
<Stack

View File

@@ -394,14 +394,23 @@ export const indexableAndIndexedCounts = async () => {
* universe, we filter out the fileIDs of files which have already been
* indexed, or which should be ignored.
*
* @param count Limit the result to up to {@link count} items.
* @param count Limit the result to up to {@link count} items. If there are more
* than {@link count} items present, the files with the higher file IDs (which
* can be taken as an approximation of their creation order) are preferred.
*/
export const indexableFileIDs = async (count?: number) => {
export const indexableFileIDs = async (count: number) => {
const db = await mlDB();
const tx = db.transaction("file-status", "readonly");
return tx.store
let cursor = await tx.store
.index("status")
.getAllKeys(IDBKeyRange.only("indexable"), count);
.openKeyCursor(IDBKeyRange.only("indexable"), "prev");
const result: number[] = [];
while (cursor && count > 0) {
result.push(cursor.primaryKey);
cursor = await cursor.continue();
count -= 1;
}
return result;
};
/**

View File

@@ -89,7 +89,7 @@ const worker = () =>
const createComlinkWorker = async () => {
const electron = ensureElectron();
const delegate = {
workerDidProcessFile,
workerDidProcessFileOrIdle,
};
// Obtain a message port from the Electron layer.
@@ -404,13 +404,16 @@ export type MLStatus =
*
* - "indexing": The indexer is currently running.
*
* - "fetching": The indexer is currently running, but we're primarily
* fetching indexes for existing files.
*
* - "clustering": All files we know of have been indexed, and we are now
* clustering the faces that were found.
*
* - "done": ML indexing and face clustering is complete for the user's
* library.
*/
phase: "scheduled" | "indexing" | "clustering" | "done";
phase: "scheduled" | "indexing" | "fetching" | "clustering" | "done";
/** The number of files that have already been indexed. */
nSyncedFiles: number;
/** The total number of files that are eligible for indexing. */
@@ -476,12 +479,19 @@ const getMLStatus = async (): Promise<MLStatus> => {
const { indexedCount, indexableCount } = await indexableAndIndexedCounts();
// During live uploads, the indexable count remains zero even as the indexer
// is processing the newly uploaded items. This is because these "live
// queue" items do not yet have a "file-status" entry.
//
// So use the state of the worker as a guide for the phase, not the
// indexable count.
let phase: MLStatus["phase"];
if (indexableCount > 0) {
const isIndexing = await (await worker()).isIndexing();
phase = !isIndexing ? "scheduled" : "indexing";
const state = await (await worker()).state;
if (state == "indexing" || state == "fetching") {
phase = state;
} else {
phase = "done";
phase = indexableCount > 0 ? "scheduled" : "done";
}
return {
@@ -513,7 +523,7 @@ const setInterimScheduledStatus = () => {
setMLStatusSnapshot({ phase: "scheduled", nSyncedFiles, nTotalFiles });
};
const workerDidProcessFile = throttled(updateMLStatusSnapshot, 2000);
const workerDidProcessFileOrIdle = throttled(updateMLStatusSnapshot, 2000);
/**
* Use CLIP to perform a natural language search over image embeddings.

View File

@@ -8,11 +8,10 @@
*/
export interface MLWorkerDelegate {
/**
* Called whenever a file is processed during indexing.
*
* It is called both when the indexing was successful or it failed.
* Called whenever the worker processes a file during indexing (either
* successfully or with errors), or when it goes into the "idle" state.
*/
workerDidProcessFile: () => void;
workerDidProcessFileOrIdle: () => void;
}
/**

View File

@@ -1,4 +1,5 @@
import { clientPackageName } from "@/base/app";
import { assertionFailed } from "@/base/assert";
import { isHTTP4xxError } from "@/base/http";
import { getKVN } from "@/base/kv";
import { ensureAuthToken } from "@/base/local-user";
@@ -39,6 +40,20 @@ import {
} from "./ml-data";
import type { CLIPMatches, MLWorkerDelegate } from "./worker-types";
/**
* A rough hint at what the worker is up to.
*
* - "idle": Not doing anything
* - "tick": Transitioning to a new state
* - "indexing": Indexing
* - "fetching": A subset of indexing
*
* During indexing, the state is set to "fetching" whenever remote provided us
* data for more than 50% of the files that we requested from it in the last
* fetch during indexing.
*/
export type WorkerState = "idle" | "tick" | "indexing" | "fetching";
const idleDurationStart = 5; /* 5 seconds */
const idleDurationMax = 16 * 60; /* 16 minutes */
@@ -76,9 +91,11 @@ interface IndexableItem {
* particular, for finding the closest CLIP match when the user does a search.
*/
export class MLWorker {
/** The last known state of the worker. */
public state: WorkerState = "idle";
private electron: ElectronMLWorker | undefined;
private delegate: MLWorkerDelegate | undefined;
private state: "idle" | "tick" | "indexing" = "idle";
private liveQ: IndexableItem[] = [];
private idleTimeout: ReturnType<typeof setTimeout> | undefined;
private idleDuration = idleDurationStart; /* unit: seconds */
@@ -164,13 +181,6 @@ export class MLWorker {
}
}
/**
* Return true if we're currently indexing.
*/
isIndexing() {
return this.state == "indexing";
}
/**
* Find {@link CLIPMatches} for a given {@link searchPhrase}.
*/
@@ -223,6 +233,7 @@ export class MLWorker {
this.state = "idle";
this.idleDuration = Math.min(this.idleDuration * 2, idleDurationMax);
this.idleTimeout = setTimeout(scheduleTick, this.idleDuration * 1000);
this.delegate?.workerDidProcessFileOrIdle();
}
/** Return the next batch of items to backfill (if any). */
@@ -234,8 +245,20 @@ export class MLWorker {
200,
);
if (!filesByID.size) return [];
// Fetch their existing ML data (if any).
const mlDataByID = await fetchMLData(filesByID);
// If the number of files for which remote gave us data is more than 50%
// of what we asked of it, assume we are "fetching", not "indexing".
// This is a heuristic to try and show a better indexing state in the UI
// (so that the user does not think that their files are being
// unnecessarily reindexed).
if (this.state != "indexing" && this.state != "fetching")
assertionFailed(`Unexpected state ${this.state}`);
this.state =
mlDataByID.size * 2 > filesByID.size ? "fetching" : "indexing";
// Return files after annotating them with their existing ML data.
return Array.from(filesByID, ([id, file]) => ({
enteFile: file,
@@ -298,7 +321,7 @@ const indexNextBatch = async (
await Promise.race(tasks);
// Let the main thread know we're doing something.
delegate?.workerDidProcessFile();
delegate?.workerDidProcessFileOrIdle();
// Let us drain the microtask queue. This also gives a chance for other
// interactive tasks like `clipMatches` to run.
@@ -317,6 +340,8 @@ const indexNextBatch = async (
* about. Then return the next {@link count} files that still need to be
* indexed.
*
* When returning from amongst pending files, prefer the most recent ones first.
*
* For specifics of what a "sync" entails, see {@link updateAssumingLocalFiles}.
*
* @param userID Sync only files owned by a {@link userID} with the face DB.