diff --git a/web/packages/new/photos/services/ml/embedding.ts b/web/packages/new/photos/services/ml/embedding.ts index 608e8c059e..00d0996060 100644 --- a/web/packages/new/photos/services/ml/embedding.ts +++ b/web/packages/new/photos/services/ml/embedding.ts @@ -99,25 +99,42 @@ const RemoteEmbedding = z.object({ type RemoteEmbedding = z.infer; +export type OriginalRemoteDerivedData = Record; + +export type ParsedRemoteDerivedData = Partial<{ + face: RemoteFaceIndex; + clip: RemoteCLIPIndex; +}>; + /** * The decrypted payload of a {@link RemoteEmbedding} for the "combined" * {@link EmbeddingModel}. * * [Note: Preserve unknown derived data fields] * - * There is one entry for each of the embedding types that the current client - * knows about. However, there might be other fields apart from the known ones - * at the top level, and we need to ensure that we preserve them verbatim when - * trying use {@link putDerivedData} with an {@link RemoteDerivedData} obtained - * from remote as the base, with locally indexed additions. + * The remote derived data can contain arbitrary keys at the top level apart from + * the ones that the current client knows about. We need to preserve these + * verbatim when we use {@link putDerivedData}. + * + * Thus we return two separate results from {@link fetchDerivedData}: + * + * - {@link OriginalRemoteDerivedData}: The original, unmodified JSON. + * + * - {@link ParsedRemoteDerivedData}: The particular fields that the current + * client knows about, parsed according to their expected structure. + * + * When integrating this information into our local state, we use the parsed + * version. And if we need to update the state on remote (e.g. if the current + * client notices an embedding type that was missing), then we use the original + * JSON as the base. 
*/ -export type RemoteDerivedData = Record & { - face: RemoteFaceIndex; - clip: RemoteCLIPIndex; -}; +export interface RemoteDerivedData { + original: OriginalRemoteDerivedData; + parsed: ParsedRemoteDerivedData | undefined; +} /** - * Zod schemas for the {@link RemoteFaceIndex} type. + * Zod schema for the {@link RemoteFaceIndex} type. * * [Note: Duplicated Zod schema and TypeScript type] * @@ -170,7 +187,7 @@ const RemoteFaceIndex = z.object({ }); /** - * Zod schemas for the {@link RemoteCLIPIndex} types. + * Zod schema for the {@link RemoteCLIPIndex} type. * * See: [Note: Duplicated Zod schema and TypeScript type] */ @@ -181,17 +198,17 @@ const RemoteCLIPIndex = z.object({ }); /** - * Zod schemas for a partial {@link RemoteCLIPIndex} type. Note that we need to - * preserve any top level fields in the JSON that we don't understand. - * - * See: [Note: Preserve unknown derived data fields] + * Zod schema for the {@link OriginalRemoteDerivedData} type. */ -const RemoteDerivedData = z - .object({ - face: RemoteFaceIndex.nullish().transform(nullToUndefined), - clip: RemoteCLIPIndex.nullish().transform(nullToUndefined), - }) - .passthrough(); +const OriginalRemoteDerivedData = z.object({}).passthrough(); + +/** + * Zod schema for the {@link ParsedRemoteDerivedData} type. + */ +const ParsedRemoteDerivedData = z.object({ + face: RemoteFaceIndex.nullish().transform(nullToUndefined), + clip: RemoteCLIPIndex.nullish().transform(nullToUndefined), +}); /** * Fetch derived data for the given files from remote. @@ -205,12 +222,14 @@ const RemoteDerivedData = z * fields set to optional (since a remote embedding may have a subset of the * fields that we locally generate). 
*/ -export const fetchDerivedData = async (filesByID: Map) => { +export const fetchDerivedData = async ( + filesByID: Map, +): Promise> => { const remoteEmbeddings = await fetchEmbeddings("combined", [ ...filesByID.keys(), ]); - const result = new Map>(); + const result = new Map(); for (const remoteEmbedding of remoteEmbeddings) { const { fileID } = remoteEmbedding; const file = filesByID.get(fileID); @@ -226,7 +245,12 @@ export const fetchDerivedData = async (filesByID: Map) => { file.key, ); const jsonString = await gunzip(decryptedBytes); - result.set(fileID, RemoteDerivedData.parse(JSON.parse(jsonString))); + const original = OriginalRemoteDerivedData.parse( + JSON.parse(jsonString), + ); + const parseResult = ParsedRemoteDerivedData.safeParse(original); + const parsed = parseResult.success ? parseResult.data : undefined; + result.set(fileID, { original, parsed }); } catch (e) { // This shouldn't happen. Likely some client has uploaded a // corrupted embedding. Ignore it so that it gets reindexed and diff --git a/web/packages/new/photos/services/ml/worker.ts b/web/packages/new/photos/services/ml/worker.ts index 51b4912c12..36f7e9f75a 100644 --- a/web/packages/new/photos/services/ml/worker.ts +++ b/web/packages/new/photos/services/ml/worker.ts @@ -139,8 +139,8 @@ export class MLWorker { // live queue is just an optimization: if a file doesn't get indexed via // the live queue, it'll later get indexed anyway when we backfill. if (this.liveQ.length < 200) { - // The file is just being uploaded, and it (logical bugs withstanding), - // will not exist on remote. + // The file is just being uploaded, and so will not have any + // pre-existing derived data on remote. const remoteDerivedData = undefined; this.liveQ.push({ enteFile, uploadItem, remoteDerivedData }); this.wakeUp();