Handle versioning

This commit is contained in:
Manav Rathi
2024-07-18 13:20:46 +05:30
parent 07c2d3cf28
commit e49ebdfc2e
3 changed files with 32 additions and 22 deletions

View File

@@ -47,26 +47,6 @@ import { type RemoteFaceIndex } from "./face";
*
* Thus, now this is best thought of a tag for a particular format of encoding
* all the derived data associated with a file.
*
* [Note: Handling versioning of embeddings]
*
* The embeddings themselves have a version embedded in them, so it is possible
* for us to make backward compatible updates to the indexing process on newer
* clients (There is also a top level version field too but that is not used).
*
* If we bump the version of same model (say when indexing on a newer client),
* the assumption will be that older client will be able to consume the
* response. e.g. Say if we improve blur detection, older client should just
* consume embeddings with a newer version and not try to index the file again
* locally.
*
* If we get an embedding with version that is older than the version the client
* supports, then the client should ignore it. This way, the file will get
* reindexed locally an embedding with a newer version will get put to remote.
*
* In the case where the changes are not backward compatible and can only be
* consumed by clients with the relevant scaffolding, then we change this
* "model" (i.e "type") field to create a new universe of embeddings.
*/
export type EmbeddingModel =
// TODO-ML: prune

View File

@@ -80,9 +80,30 @@ export type RemoteFaceIndex = FaceIndex & {
/**
* An integral version number of the indexing algorithm / pipeline.
*
* [Note: Embedding versions]
*
* Embeddings have an associated version so it is possible for us to make
* backward compatible updates to the indexing process on newer clients.
*
* Clients agree out of band what a particular version means, and guarantee
* that an embedding with a particular version will be the same (to epsilon
* cosine similarity) irrespective of the client that indexed the file.
*
* If we bump the version of same model (say when indexing on a newer
* client), we will do it in a manner that older client will be able to
* consume the response. The schema should not change in non-additive
* manners. For example, say if we improve blur detection, older client
* should just consume embeddings with a newer version and not try to index
* the file again locally.
*
* When fetching from remote, if we get an embedding with version that is
* older than the version the client supports, then the client should ignore
* it. This way, the file will get reindexed locally and an embedding with a
* newer version will also get saved to remote.
*
* In the case where the changes are not backward compatible and can only be
* consumed by clients by making code changes, then we will introduce a new
* subtype (top level key) in the derived data.
*/
version: number;
/**

View File

@@ -368,18 +368,27 @@ const index = async (
// Massage the existing data (if any) that we got from remote to the form
// that the rest of this function operates on.
//
// Discard any existing data that is made by an older indexing pipelines.
// See: [Note: Embedding versions]
const existingRemoteFaceIndex = remoteDerivedData?.parsed?.face;
const existingRemoteCLIPIndex = remoteDerivedData?.parsed?.clip;
let existingFaceIndex: FaceIndex | undefined;
if (existingRemoteFaceIndex) {
if (
existingRemoteFaceIndex &&
existingRemoteFaceIndex.version >= faceIndexingVersion
) {
const { width, height, faces } = existingRemoteFaceIndex;
existingFaceIndex = { width, height, faces };
}
let existingCLIPIndex: CLIPIndex | undefined;
if (existingRemoteCLIPIndex) {
if (
existingRemoteCLIPIndex &&
existingRemoteCLIPIndex.version >= clipIndexingVersion
) {
const { embedding } = existingRemoteCLIPIndex;
existingCLIPIndex = { embedding };
}