Manav Rathi
2024-08-13 13:27:03 +05:30
parent fd6cab6c26
commit 3980d6b614
2 changed files with 27 additions and 28 deletions

View File

@@ -35,7 +35,7 @@ import { type RemoteFaceIndex } from "./face";
  *
  * which is then gzipped to get the plaintext data to upload.
  *
- * [Note: Preserve unknown derived data fields]
+ * [Note: Preserve unknown ML data fields]
  *
  * The (unzipped) remote mldata can contain arbitrary keys at the top level
  * apart from the ones that the current client knows about. We need to preserve
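To make the guarantee concrete, here is a minimal TypeScript sketch (not from the codebase; the `magic` key is a hypothetical field written by some newer client) of how spreading the fetched raw object preserves keys this client does not model:

    // Hypothetical round trip: "magic" was written by a newer client and
    // is unknown to us.
    const fetchedRaw: Record<string, unknown> = {
        magic: { version: 1 },
        face: { /* older face index */ },
    };

    // Upsert: spread the fetched object first, then overwrite only the
    // keys we actually produce. "magic" survives untouched.
    const updatedRaw = {
        ...fetchedRaw,
        face: { /* newly indexed */ },
        clip: { /* newly indexed */ },
    };

    console.log(Object.keys(updatedRaw)); // ["magic", "face", "clip"]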
@@ -214,7 +214,7 @@ const remoteMLDataFromJSONString = (jsonString: string) => {
  * client (us) to ensure that we preserve the parts of the pre-existing ML data
  * (if any) that we did not understand or touch.
  *
- * See: [Note: Preserve unknown derived data fields].
+ * See: [Note: Preserve unknown ML data fields].
  */
 export const putMLData = async (enteFile: EnteFile, mlData: RawRemoteMLData) =>
     putFileData(enteFile, "mldata", await gzip(JSON.stringify(mlData)));
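As a usage sketch of the read-modify-write contract described above (the `updateCLIP` wrapper is illustrative and its exact types are glossed over; `fetchMLData`, `putMLData`, and the `raw` field are the ones in this diff):

    // Illustrative caller honouring the preservation contract: fetch the
    // existing raw ML data, spread it, and overwrite only our own key.
    const updateCLIP = async (enteFile: EnteFile, clip: unknown) => {
        const byID = await fetchMLData(new Map([[enteFile.id, enteFile]]));
        const existingRaw = byID.get(enteFile.id)?.raw ?? {};
        await putMLData(enteFile, { ...existingRaw, clip });
    };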

View File

@@ -30,13 +30,13 @@ import {
     saveIndexes,
     updateAssumingLocalFiles,
 } from "./db";
+import { faceIndexingVersion, indexFaces, type FaceIndex } from "./face";
 import {
     fetchMLData,
     putMLData,
     type RawRemoteMLData,
     type RemoteMLData,
 } from "./ml-data";
-import { faceIndexingVersion, indexFaces, type FaceIndex } from "./face";
 import type { CLIPMatches, MLWorkerDelegate } from "./worker-types";

 const idleDurationStart = 5; /* 5 seconds */
@@ -47,8 +47,8 @@ interface IndexableItem {
     enteFile: EnteFile;
     /** If the file was uploaded from the current client, then its contents. */
     uploadItem: UploadItem | undefined;
-    /** The existing derived data on remote corresponding to this file. */
-    remoteDerivedData: RemoteMLData | undefined;
+    /** The existing ML data on remote corresponding to this file. */
+    remoteMLData: RemoteMLData | undefined;
 }

 /**
@@ -111,9 +111,9 @@ export class MLWorker {
      * This function enqueues a backfill attempt and returns immediately without
      * waiting for it to complete.
      *
-     * During a backfill, we first attempt to fetch derived data for files which
-     * don't have that data locally. If we fetch and find what we need, we save
-     * it locally. Otherwise we index them.
+     * During a backfill, we first attempt to fetch ML data for files which
+     * don't have that data locally. If on fetching we find what we need, we
+     * save it locally. Otherwise we index them.
      */
     sync() {
         this.wakeUp();
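Schematically, the backfill decision described in that comment looks like the sketch below (only `fetchMLData`'s result shape and `IndexableItem` come from this diff; both helpers are hypothetical stand-ins for the real logic in `index` further down):

    // Hypothetical helpers standing in for the real save/index paths.
    declare const saveFetchedIndexes: (item: IndexableItem) => Promise<void>;
    declare const indexAndUpload: (item: IndexableItem) => Promise<void>;

    const backfillOne = async (item: IndexableItem) => {
        const parsed = item.remoteMLData?.parsed;
        if (parsed?.face && parsed?.clip) {
            // Remote already has everything this client derives: just
            // save it locally, no indexing needed.
            await saveFetchedIndexes(item);
        } else {
            // Some ML data type is missing: run the indexing pipeline.
            await indexAndUpload(item);
        }
    };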
@@ -156,9 +156,8 @@
         // the live queue, it'll later get indexed anyway when we backfill.
         if (this.liveQ.length < 200) {
             // The file is just being uploaded, and so will not have any
-            // pre-existing derived data on remote.
-            const remoteDerivedData = undefined;
-            this.liveQ.push({ enteFile, uploadItem, remoteDerivedData });
+            // pre-existing ML data on remote.
+            this.liveQ.push({ enteFile, uploadItem, remoteMLData: undefined });
             this.wakeUp();
         } else {
             log.debug(() => "Ignoring upload item since liveQ is full");
@@ -235,13 +234,13 @@
             200,
         );
         if (!filesByID.size) return [];
-        // Fetch their existing derived data (if any).
-        const derivedDataByID = await fetchMLData(filesByID);
-        // Return files after annotating them with their existing derived data.
+        // Fetch their existing ML data (if any).
+        const mlDataByID = await fetchMLData(filesByID);
+        // Return files after annotating them with their existing ML data.
         return Array.from(filesByID, ([id, file]) => ({
             enteFile: file,
             uploadItem: undefined,
-            remoteDerivedData: derivedDataByID.get(id),
+            remoteMLData: mlDataByID.get(id),
         }));
     }
 }
@@ -387,7 +386,7 @@ const syncWithLocalFilesAndGetFilesToIndex = async (
  * then remote will return a 413 Request Entity Too Large).
  */
 const index = async (
-    { enteFile, uploadItem, remoteDerivedData }: IndexableItem,
+    { enteFile, uploadItem, remoteMLData }: IndexableItem,
     electron: ElectronMLWorker,
 ) => {
     const f = fileLogID(enteFile);
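Since remote replies with a 413 when the payload is too large, a caller could in principle guard before uploading. A purely hypothetical pre-flight check (the 4 MB cap is made up, and the real upload is gzipped, so this over-estimates):

    // Hypothetical size guard; nothing like this exists in the diff.
    const putMLDataGuarded = async (enteFile: EnteFile, data: RawRemoteMLData) => {
        const bytes = new TextEncoder().encode(JSON.stringify(data)).length;
        if (bytes > 4 * 1024 * 1024)
            throw new Error(`mldata for ${enteFile.id} would exceed remote limits`);
        await putMLData(enteFile, data);
    };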
@@ -399,8 +398,8 @@ const index = async (
     // Discard any existing data that was made by an older indexing pipeline.
     // See: [Note: Embedding versions]

-    const existingRemoteFaceIndex = remoteDerivedData?.parsed?.face;
-    const existingRemoteCLIPIndex = remoteDerivedData?.parsed?.clip;
+    const existingRemoteFaceIndex = remoteMLData?.parsed?.face;
+    const existingRemoteCLIPIndex = remoteMLData?.parsed?.clip;

     let existingFaceIndex: FaceIndex | undefined;
     if (
@@ -422,8 +421,8 @@
         existingCLIPIndex = { embedding };
     }

-    // If we already have all the derived data fields then just update our local
-    // db and return.
+    // If we already have all the ML data types then just update our local db
+    // and return.
     if (existingFaceIndex && existingCLIPIndex) {
         try {
@@ -438,7 +437,7 @@
         return;
     }

-    // There is at least one derived data type that still needs to be indexed.
+    // There is at least one ML data type that still needs to be indexed.

     const renderableBlob = await fetchRenderableBlob(
         enteFile,
@@ -501,22 +500,22 @@
     // Perform an "upsert" by using the existing raw data we got from the
     // remote as the base, and inserting or overwriting any newly indexed
-    // parts. See: [Note: Preserve unknown derived data fields].
-    const existingRawDerivedData = remoteDerivedData?.raw ?? {};
-    const rawDerivedData: RawRemoteMLData = {
-        ...existingRawDerivedData,
+    // parts. See: [Note: Preserve unknown ML data fields].
+    const existingRawMLData = remoteMLData?.raw ?? {};
+    const rawMLData: RawRemoteMLData = {
+        ...existingRawMLData,
         face: remoteFaceIndex,
         clip: remoteCLIPIndex,
     };

-    log.debug(() => ["Uploading derived data", rawDerivedData]);
+    log.debug(() => ["Uploading ML data", rawMLData]);

     try {
-        await putMLData(enteFile, rawDerivedData);
+        await putMLData(enteFile, rawMLData);
     } catch (e) {
         // See: [Note: Transient and permanent indexing failures]
-        log.error(`Failed to put derived data for ${f}`, e);
+        log.error(`Failed to put ML data for ${f}`, e);
         if (isHTTP4xxError(e)) await markIndexingFailed(enteFile.id);
         throw e;
     }