Files
ente/web/packages/gallery/services/file-data.ts
2025-06-26 07:50:17 +05:30

429 lines
16 KiB
TypeScript

import { encryptBlob } from "ente-base/crypto";
import type { EncryptedBlobB64 } from "ente-base/crypto/types";
import {
authenticatedPublicAlbumsRequestHeaders,
authenticatedRequestHeaders,
ensureOk,
retryEnsuringHTTPOk,
type PublicAlbumsCredentials,
} from "ente-base/http";
import { apiURL } from "ente-base/origins";
import type { EnteFile } from "ente-media/file";
import { nullToUndefined } from "ente-utils/transform";
import { z } from "zod/v4";
/**
 * [Note: File data APIs]
 *
 * Some of the APIs provided by remote are phrased in terms of "file data".
 * This is the collective name for all the files (the original, and anything
 * derived from it) that are associated with an {@link EnteFile}.
 *
 * As an example, every original image uploaded by the user will have an
 * associated thumbnail file, possibly one or more preview files (optimized for
 * size, or for compatibility across devices), the various ML embeddings that
 * were generated for it, and potentially more such files in the future.
 *
 * Originals and thumbnails have their own specialized APIs for fetching and
 * uploading. For the rest of the associated data, the file data APIs can be
 * used.
 *
 * The types of file data that this module deals with are:
 *
 * - "mldata". See: [Note: "mldata" format]
 *
 * - "vid_preview". See: [Note: Video playlist and preview]
 */
type FileDataType = "mldata" | "vid_preview";
/**
 * Zod schema for a single file data entry in the responses that remote sends
 * us when we fetch file data.
 */
const RemoteFileData = z.object({
    /**
     * ID of the {@link EnteFile} that this file data belongs to.
     */
    fileID: z.number(),
    /**
     * The encrypted data, as a base64 string. What its plaintext contains
     * depends on the type of the file data.
     */
    encryptedData: z.string(),
    /**
     * The header (as a base64 string) that needs to be provided when
     * decrypting {@link encryptedData}. See the {@link decryptMetadata}
     * function in the crypto layer.
     */
    decryptionHeader: z.string(),
    /**
     * Epoch microseconds at which this file data entry was last upserted.
     *
     * [Note: PUT "mldata" version check]
     *
     * When a client PUTs mldata to remote, it is expected to also send the
     * updated at (i.e. this field) of the existing {@link RemoteFileData}
     * that it is updating, or 0 if it is creating a new entity.
     *
     * Remote can then detect, and reject, attempts by a client to overwrite a
     * version that the client has not pulled yet.
     *
     * On why this field is optional: newer museums are expected to always
     * include {@link updatedAt} in their response, but to make things easier
     * for self hosters we avoid a hard dependency on the latest museum and
     * accept responses without it. In such cases we will effectively end up
     * passing 0 as the {@link lastUpdatedAt} in the "mldata" PUT API call, but
     * that is fine since an old museum will ignore it anyway.
     *
     * > This note was added May 2025, and the optionality can be removed in a
     * > few months when museums should've updated (tag: Migration).
     */
    updatedAt: z.number().nullish().transform(nullToUndefined),
});

type RemoteFileData = z.infer<typeof RemoteFileData>;
/**
 * Fetch, in a single request, the file data of the given type for multiple
 * files.
 *
 * @param type The {@link FileDataType} to fetch.
 *
 * @param fileIDs IDs of the files whose file data we want.
 *
 * @returns the {@link RemoteFileData} items for those files that had file data
 * of the given type, and for which remote was able to successfully retrieve
 * it.
 *
 * The result is in an arbitrary order. Callers should not rely on the order
 * (or the count) of the returned items, and should instead use the
 * {@link fileID} within each {@link RemoteFileData} to map it back to a file.
 *
 * Rarely (when there are issues with the upstream object storage), remote
 * might not return an entry for a file even though the file does have
 * associated data of that type. The payload does mention such skipped
 * entries, but we do not parse that information currently because the higher
 * level code that uses this function already handles these rare skips
 * gracefully.
 */
export const fetchFilesData = async (
    type: FileDataType,
    fileIDs: number[],
): Promise<RemoteFileData[]> => {
    const url = await apiURL("/files/data/fetch");
    const headers = await authenticatedRequestHeaders();
    const res = await fetch(url, {
        method: "POST",
        headers,
        body: JSON.stringify({ type, fileIDs }),
    });
    ensureOk(res);

    // Only the "data" field of the payload is of interest to us.
    const FilesDataResponse = z.object({ data: z.array(RemoteFileData) });
    const { data } = FilesDataResponse.parse(await res.json());
    return data;
};
/**
 * The single file counterpart of {@link fetchFilesData}.
 *
 * In contrast to {@link fetchFilesData}, a HTTP GET request is used here.
 *
 * @param type The {@link FileDataType} to fetch.
 *
 * @param fileID ID of the file whose file data we want.
 *
 * @param publicAlbumsCredentials Credentials to use when we are running in the
 * context of the public albums app. When these are not provided, the
 * credentials of the logged in user get used.
 *
 * @returns the {@link RemoteFileData} for the file, or `undefined` if no file
 * data of the given type has been uploaded for this file yet (e.g. for the
 * "vid_preview" type, this would indicate that a video preview has not been
 * generated for this file yet).
 */
export const fetchFileData = async (
    type: FileDataType,
    fileID: number,
    publicAlbumsCredentials?: PublicAlbumsCredentials,
): Promise<RemoteFileData | undefined> => {
    const query = new URLSearchParams({
        type,
        fileID: fileID.toString(),
        // Ask museum to respond with 204 instead of 404 if no playlist exists
        // for the given file.
        preferNoContent: "true",
    }).toString();

    // The public albums app uses a different endpoint, and different
    // credentials, but the rest of the exchange is the same.
    const baseURL = await apiURL(
        publicAlbumsCredentials
            ? "/public-collection/files/data/fetch"
            : "/files/data/fetch",
    );
    const headers = publicAlbumsCredentials
        ? authenticatedPublicAlbumsRequestHeaders(publicAlbumsCredentials)
        : await authenticatedRequestHeaders();

    const res = await fetch(`${baseURL}?${query}`, { headers });

    // 204 is the expected "no such file data" response since we're passing
    // `preferNoContent`.
    //
    // However, this might be a self hoster running an older museum that does
    // not recognize that flag, so we also retain the old behavior of treating
    // a 404 the same way. This fallback can be removed in a few months (tag:
    // Migration, note added May 2025).
    if (res.status === 204 || res.status === 404) return undefined;

    ensureOk(res);

    const { data } = z.object({ data: RemoteFileData }).parse(await res.json());
    return data;
};
/**
 * Zod schema for an entry in the response of `/files/data/status-diff`.
 *
 * The actual structure sent by remote has more fields; these are just the
 * fields that we are interested in.
 */
const RemoteFDStatus = z.object({
    /**
     * ID of the file whose file data status is being reported.
     */
    fileID: z.number(),
    /**
     * The type of the file data. Expected to be one of {@link FileDataType}.
     */
    type: z.string(),
    /**
     * `true` if the file data has been deleted.
     *
     * This can be true during an in-progress partial deletion, in which the
     * deletion of the file data has been processed, but the deletion of the
     * file itself has not been processed yet.
     */
    isDeleted: z.boolean(),
    /**
     * Epoch microseconds at which this file data entry was added or updated.
     */
    updatedAt: z.number(),
});
/**
 * One page of the (paginated) results that remote sends us during
 * {@link syncUpdatedFileDataFileIDs}.
 */
export interface UpdatedFileDataFileIDsPage {
    /**
     * IDs of the files that have had a file data entry created or updated.
     */
    fileIDs: Set<number>;
    /**
     * The latest updatedAt (epoch microseconds) seen so far, including the
     * batch that was just fetched.
     *
     * All the entries in the batch are considered when computing this, not
     * only the ones that passed the filter and ended up in {@link fileIDs}.
     */
    lastUpdatedAt: number;
}
/**
 * Fetch the IDs of the files that have had file data entries of the given
 * {@link type} created or updated after the given {@link lastUpdatedAt}.
 *
 * Remote is queried in a paginated manner, and the {@link onPage} callback is
 * invoked (and awaited) for each page of new data as it arrives. Fetching
 * stops when remote returns an empty (or missing) diff.
 *
 * @param type The {@link FileDataType} whose creation or updates we are
 * interested in. The request to remote is not type specific; entries of other
 * types are filtered out locally.
 *
 * @param lastUpdatedAt Epoch microseconds. Remote is asked to only provide us
 * entries whose {@link updatedAt} is more than this value. Pass zero to start
 * from the beginning.
 *
 * @param onPage A callback that is invoked for each page of results received
 * from remote. It is passed the fileIDs (of the matching, non-deleted entries)
 * in the batch under consideration, and the largest updated time seen so far
 * across all entries (irrespective of their {@link type}), including the ones
 * in that batch.
 *
 * ----
 *
 * [Note: Pruning stale status-diff entries]
 *
 * Unlike other "diff" APIs, the diff API used here won't return tombstone
 * entries for deleted files. This is not a problem because there are no
 * current cases where existing playlists or ML indexes get deleted (unless the
 * underlying file is deleted). See: [Note: Caching HLS playlist data].
 *
 * Note that the "/files/data/status-diff" includes entries for files that are
 * in trash. This means that, while not a practical problem (because they are
 * just numeric ids), the number of fileIDs we store locally can grow unbounded
 * as files move to trash and then get deleted. So to prune them, we also add a
 * hook to the /trash/v2/diff processing, and prune any locally saved file IDs
 * which have been deleted from trash.
 */
export const syncUpdatedFileDataFileIDs = async (
    type: FileDataType,
    lastUpdatedAt: number,
    onPage: (page: UpdatedFileDataFileIDsPage) => Promise<void>,
): Promise<void> => {
    // The high watermark that we send to remote; it advances with each page.
    let cursor = lastUpdatedAt;

    for (;;) {
        const res = await fetch(await apiURL("/files/data/status-diff"), {
            method: "POST",
            headers: await authenticatedRequestHeaders(),
            body: JSON.stringify({ lastUpdatedAt: cursor }),
        });
        ensureOk(res);

        const { diff: entries } = z
            .object({ diff: RemoteFDStatus.array().nullish() })
            .parse(await res.json());

        // An empty (or missing) diff indicates that we're done.
        if (!entries?.length) return;

        // Entries of all types move the watermark forward.
        cursor = entries.reduce(
            (latest, entry) => Math.max(latest, entry.updatedAt),
            cursor,
        );

        // While we could prune isDeleted entries here, we can also rely on the
        // pruning that happens when the trash gets synced. See:
        // [Note: Pruning stale status-diff entries]
        const fileIDs = new Set<number>(
            entries
                .filter((entry) => entry.type == type && !entry.isDeleted)
                .map((entry) => entry.fileID),
        );

        await onPage({ fileIDs, lastUpdatedAt: cursor });
    }
};
/**
 * Upload, to remote, file data associated with the given file.
 *
 * The given data gets saved (or updated) as the latest file data of the given
 * {@link type} associated with the given {@link file}. Before being uploaded,
 * the data is encrypted using the {@link file}'s key.
 *
 * @param file The {@link EnteFile} that this data is associated with.
 *
 * @param type The {@link FileDataType} that is being uploaded.
 *
 * @param data The binary data to upload. Its exact contents are {@link type}
 * specific.
 *
 * @param lastUpdatedAt The {@link updatedAt} of the {@link RemoteFileData}
 * that is being updated, or 0 to indicate a new entity.
 */
export const putFileData = async (
    file: EnteFile,
    type: FileDataType,
    data: Uint8Array,
    lastUpdatedAt: number,
) => {
    const encrypted = await encryptBlob(data, file.key);

    const url = await apiURL("/files/data");
    const headers = await authenticatedRequestHeaders();
    const body = JSON.stringify({
        fileID: file.id,
        type,
        encryptedData: encrypted.encryptedData,
        decryptionHeader: encrypted.decryptionHeader,
        lastUpdatedAt,
    });

    ensureOk(await fetch(url, { method: "PUT", headers, body }));
};
/**
 * Fetch the URL of the file preview data for the given file.
 *
 * @param type The {@link FileDataType} which we want.
 *
 * @param fileID The ID of the file whose file preview data we want.
 *
 * @param publicAlbumsCredentials Credentials to use when we are running in the
 * context of the public albums app. When these are not provided, the
 * credentials of the logged in user get used.
 *
 * @returns the (pre-signed) URL to the preview data, or `undefined` if there
 * is no preview data of the given type for the given file yet (remote
 * indicates this with a 404).
 *
 * [Note: File data vs file preview data]
 *
 * In museum's ontology, there is a distinction between two concepts:
 *
 * S3 metadata (museum term, the APIs call it "file data") is data that museum
 * uploads on behalf of the client. e.g.,
 *
 * - ML data.
 *
 * - Preview video playlist.
 *
 * S3 file data (museum term, the APIs call it "file preview data") is data
 * that a client itself uploads. e.g.,
 *
 * - The preview video itself.
 *
 * - Additional preview images.
 *
 * [Note: Video playlist and preview]
 *
 * For a streaming video, both these concepts are needed:
 *
 * - The encrypted HLS playlist is stored as "file data" of type
 *   "vid_preview",
 *
 * - The encrypted video chunks that the playlist refers to are stored as
 *   "file preview data" of type "vid_preview".
 */
export const fetchFilePreviewData = async (
    type: FileDataType,
    fileID: number,
    publicAlbumsCredentials?: PublicAlbumsCredentials,
): Promise<string | undefined> => {
    const query = new URLSearchParams({
        type,
        fileID: fileID.toString(),
    }).toString();

    let res: Response;
    if (!publicAlbumsCredentials) {
        // Regular case: use the credentials of the logged in user.
        const baseURL = await apiURL("/files/data/preview");
        res = await fetch(`${baseURL}?${query}`, {
            headers: await authenticatedRequestHeaders(),
        });
    } else {
        // Public albums app: different endpoint, album specific credentials.
        const headers = authenticatedPublicAlbumsRequestHeaders(
            publicAlbumsCredentials,
        );
        const baseURL = await apiURL("/public-collection/files/data/preview");
        res = await fetch(`${baseURL}?${query}`, { headers });
    }

    // No preview data (of this type) exists for the file yet.
    if (res.status === 404) return undefined;

    ensureOk(res);

    const { url } = z.object({ url: z.string() }).parse(await res.json());
    return url;
};
/**
 * Save (or update), on remote, the video data associated with the given file.
 *
 * Video data refers to two things:
 *
 * - The encrypted HLS playlist ("file data" of type "vid_preview").
 *
 * - The object ID of an (already uploaded) "file preview data" file that
 *   contains the video segments.
 *
 * This function is similar to {@link putFileData}, except that it saves both
 * the playlist and the reference to its associated segment file, for the given
 * {@link file}, in one go.
 *
 * Unlike {@link putFileData}, no encryption happens here: the playlist is
 * expected to have already been encrypted by the caller, and is uploaded as
 * is.
 *
 * The PUT request is made via {@link retryEnsuringHTTPOk}, using the
 * "background" retry profile.
 *
 * @param file The {@link EnteFile} that this data is associated with.
 *
 * @param encryptedPlaylist The encrypted playlist data, along with the
 * decryption header that goes with it.
 *
 * @param objectID Object ID of an already uploaded "file preview data" (see
 * {@link getFilePreviewDataUploadURL}).
 *
 * @param objectSize The size (in bytes) of the file corresponding to
 * {@link objectID}.
 */
export const putVideoData = async (
    file: EnteFile,
    encryptedPlaylist: EncryptedBlobB64,
    objectID: string,
    objectSize: number,
) => {
    // A single attempt at the PUT. Everything, including the URL and the
    // request headers, is obtained afresh on each attempt.
    const putOnce = async () => {
        const url = await apiURL("/files/video-data");
        const headers = await authenticatedRequestHeaders();
        const body = JSON.stringify({
            fileID: file.id,
            objectID,
            objectSize,
            playlist: encryptedPlaylist.encryptedData,
            playlistHeader: encryptedPlaylist.decryptionHeader,
        });
        return fetch(url, { method: "PUT", headers, body });
    };

    return retryEnsuringHTTPOk(putOnce, { retryProfile: "background" });
};