[desktop] Use clip-bpe-js as the Tokenizer

Replace the inlined code with the library. Tested by comparing the embeddings produced for a few sample prompts (they were exactly the same).
2024-07-01 13:03:07 +05:30
parent b1da1dfe05
commit bbc44d6ac1
8 changed files with 18 additions and 536 deletions
--- a/desktop/package.json
+++ b/desktop/package.json
@@ -27,6 +27,7 @@
        "any-shell-escape": "^0.1",
        "auto-launch": "^5.0",
        "chokidar": "^3.6",
+        "clip-bpe-js": "^0.0.6",
        "compare-versions": "^6.1",
        "electron-log": "^5.1",
        "electron-store": "^8.2",
--- a/desktop/src/main/services/ml-clip.ts
+++ b/desktop/src/main/services/ml-clip.ts
@@ -5,10 +5,10 @@
 *
 * @see `web/apps/photos/src/services/clip-service.ts` for more details.
 */
+import Tokenizer from "clip-bpe-js";
 import jpeg from "jpeg-js";
 import fs from "node:fs/promises";
 import * as ort from "onnxruntime-node";
-import Tokenizer from "../../thirdparty/clip-bpe-ts/mod";
 import log from "../log";
 import { writeStream } from "../stream";
 import { ensure, wait } from "../utils/common";
--- a/desktop/src/thirdparty/clip-bpe-ts/LICENSE
+++ b/desktop/src/thirdparty/clip-bpe-ts/LICENSE
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2023 josephrocca
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
--- a/desktop/src/thirdparty/clip-bpe-ts/README.md
+++ b/desktop/src/thirdparty/clip-bpe-ts/README.md
@@ -1,40 +0,0 @@
-# CLIP Byte Pair Encoding JavaScript Port
-
-A JavaScript port of
-[OpenAI's CLIP byte-pair-encoding tokenizer](https://github.com/openai/CLIP/blob/3bee28119e6b28e75b82b811b87b56935314e6a5/clip/simple_tokenizer.py).
-
-```js
-import Tokenizer from "https://deno.land/x/clip_bpe@v0.0.6/mod.js";
-let t = new Tokenizer();
-
-t.encode("hello"); // [3306]
-t.encode("magnificent"); // [10724]
-t.encode("magnificently"); // [9725, 2922]
-t.decode(t.encode("HELLO")); // "hello "
-t.decode(t.encode("abc123")); // "abc 1 2 3 "
-t.decode(st.encode("let's see here")); // "let 's see here "
-t.encode("hello world!"); // [3306, 1002, 256]
-
-// to encode for CLIP (trims to maximum of 77 tokens and adds start and end token, and pads with zeros if less than 77 tokens):
-t.encodeForCLIP("hello world!"); // [49406,3306,1002,256,49407,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-```
-
-This encoder/decoder behaves differently to the the GPT-2/3 tokenizer
-(JavaScript version of that
-[here](https://github.com/latitudegames/GPT-3-Encoder)). For example, it doesn't
-preserve capital letters, as shown above.
-
-The
-[Python version](https://github.com/openai/CLIP/blob/3bee28119e6b28e75b82b811b87b56935314e6a5/clip/simple_tokenizer.py)
-of this tokenizer uses the `ftfy` module to clean up the text before encoding
-it. I didn't include that module by default because currently the only version
-available in JavaScript is
-[this one](https://github.com/josephrocca/ftfy-pyodide), which requires
-importing a full Python runtime as a WebAssembly module. If you want the `ftfy`
-cleaning, just import it and clean your text with it before passing it to the
-`.encode()` method.
-
-# License
-
-To the extent that there is any original work in this repo, it is MIT Licensed,
-just like [openai/CLIP](https://github.com/openai/CLIP).
--- a/desktop/src/thirdparty/clip-bpe-ts/bpe_simple_vocab_16e6.ts
+++ b/desktop/src/thirdparty/clip-bpe-ts/bpe_simple_vocab_16e6.ts
--- a/desktop/src/thirdparty/clip-bpe-ts/mod.ts
+++ b/desktop/src/thirdparty/clip-bpe-ts/mod.ts
@@ -1,470 +0,0 @@
-/* eslint-disable */
-
-import * as htmlEntities from "html-entities";
-import bpeVocabData from "./bpe_simple_vocab_16e6";
-// import ftfy from "https://deno.land/x/ftfy_pyodide@v0.1.1/mod.js";
-
-function ord(c: string) {
-    return c.charCodeAt(0);
-}
-function range(start: number, stop?: number, step: number = 1) {
-    if (stop === undefined) {
-        stop = start;
-        start = 0;
-    }
-
-    if ((step > 0 && start >= stop) || (step < 0 && start <= stop)) {
-        return [];
-    }
-
-    const result: number[] = [];
-    for (let i = start; step > 0 ? i < stop : i > stop; i += step) {
-        result.push(i);
-    }
-
-    return result;
-}
-
-function bytesToUnicode() {
-    const bs = [
-        ...range(ord("!"), ord("~") + 1),
-        ...range(ord("¡"), ord("¬") + 1),
-        ...range(ord("®"), ord("ÿ") + 1),
-    ];
-    const cs = bs.slice(0);
-    let n = 0;
-    for (const b of range(2 ** 8)) {
-        if (!bs.includes(b)) {
-            bs.push(b);
-            cs.push(2 ** 8 + n);
-            n += 1;
-        }
-    }
-    const csString = cs.map((n) => String.fromCharCode(n));
-    return Object.fromEntries(bs.map((v, i) => [v, csString[i]]));
-}
-
-function getPairs(word: string | any[]) {
-    const pairs: [string, string][] = [];
-    let prevChar = word[0];
-    for (const char of word.slice(1)) {
-        pairs.push([prevChar, char]);
-        prevChar = char;
-    }
-    return pairs;
-}
-
-function basicClean(text: string) {
-    // text = ftfy.fix_text(text);
-    text = htmlEntities.decode(htmlEntities.decode(text));
-    return text.trim();
-}
-
-function whitespaceClean(text: string) {
-    return text.replace(/\s+/g, " ").trim();
-}
-
-export default class {
-    byteEncoder;
-    byteDecoder: {
-        [k: string]: number;
-    };
-    encoder;
-    decoder: any;
-    bpeRanks: any;
-    cache: Record<string, string>;
-    pat: RegExp;
-    constructor() {
-        this.byteEncoder = bytesToUnicode();
-        this.byteDecoder = Object.fromEntries(
-            Object.entries(this.byteEncoder).map(([k, v]) => [v, Number(k)]),
-        );
-        let merges = bpeVocabData.text.split("\n");
-        merges = merges.slice(1, 49152 - 256 - 2 + 1);
-        const mergedMerges = merges.map((merge) => merge.split(" "));
-        // There was a bug related to the ordering of Python's .values() output. I'm lazy do I've just copy-pasted the Python output:
-        let vocab = [
-            "!",
-            '"',
-            "#",
-            "$",
-            "%",
-            "&",
-            "'",
-            "(",
-            ")",
-            "*",
-            "+",
-            ",",
-            "-",
-            ".",
-            "/",
-            "0",
-            "1",
-            "2",
-            "3",
-            "4",
-            "5",
-            "6",
-            "7",
-            "8",
-            "9",
-            ":",
-            ";",
-            "<",
-            "=",
-            ">",
-            "?",
-            "@",
-            "A",
-            "B",
-            "C",
-            "D",
-            "E",
-            "F",
-            "G",
-            "H",
-            "I",
-            "J",
-            "K",
-            "L",
-            "M",
-            "N",
-            "O",
-            "P",
-            "Q",
-            "R",
-            "S",
-            "T",
-            "U",
-            "V",
-            "W",
-            "X",
-            "Y",
-            "Z",
-            "[",
-            "\\",
-            "]",
-            "^",
-            "_",
-            "`",
-            "a",
-            "b",
-            "c",
-            "d",
-            "e",
-            "f",
-            "g",
-            "h",
-            "i",
-            "j",
-            "k",
-            "l",
-            "m",
-            "n",
-            "o",
-            "p",
-            "q",
-            "r",
-            "s",
-            "t",
-            "u",
-            "v",
-            "w",
-            "x",
-            "y",
-            "z",
-            "{",
-            "|",
-            "}",
-            "~",
-            "¡",
-            "¢",
-            "£",
-            "¤",
-            "¥",
-            "¦",
-            "§",
-            "¨",
-            "©",
-            "ª",
-            "«",
-            "¬",
-            "®",
-            "¯",
-            "°",
-            "±",
-            "²",
-            "³",
-            "´",
-            "µ",
-            "¶",
-            "·",
-            "¸",
-            "¹",
-            "º",
-            "»",
-            "¼",
-            "½",
-            "¾",
-            "¿",
-            "À",
-            "Á",
-            "Â",
-            "Ã",
-            "Ä",
-            "Å",
-            "Æ",
-            "Ç",
-            "È",
-            "É",
-            "Ê",
-            "Ë",
-            "Ì",
-            "Í",
-            "Î",
-            "Ï",
-            "Ð",
-            "Ñ",
-            "Ò",
-            "Ó",
-            "Ô",
-            "Õ",
-            "Ö",
-            "×",
-            "Ø",
-            "Ù",
-            "Ú",
-            "Û",
-            "Ü",
-            "Ý",
-            "Þ",
-            "ß",
-            "à",
-            "á",
-            "â",
-            "ã",
-            "ä",
-            "å",
-            "æ",
-            "ç",
-            "è",
-            "é",
-            "ê",
-            "ë",
-            "ì",
-            "í",
-            "î",
-            "ï",
-            "ð",
-            "ñ",
-            "ò",
-            "ó",
-            "ô",
-            "õ",
-            "ö",
-            "÷",
-            "ø",
-            "ù",
-            "ú",
-            "û",
-            "ü",
-            "ý",
-            "þ",
-            "ÿ",
-            "Ā",
-            "ā",
-            "Ă",
-            "ă",
-            "Ą",
-            "ą",
-            "Ć",
-            "ć",
-            "Ĉ",
-            "ĉ",
-            "Ċ",
-            "ċ",
-            "Č",
-            "č",
-            "Ď",
-            "ď",
-            "Đ",
-            "đ",
-            "Ē",
-            "ē",
-            "Ĕ",
-            "ĕ",
-            "Ė",
-            "ė",
-            "Ę",
-            "ę",
-            "Ě",
-            "ě",
-            "Ĝ",
-            "ĝ",
-            "Ğ",
-            "ğ",
-            "Ġ",
-            "ġ",
-            "Ģ",
-            "ģ",
-            "Ĥ",
-            "ĥ",
-            "Ħ",
-            "ħ",
-            "Ĩ",
-            "ĩ",
-            "Ī",
-            "ī",
-            "Ĭ",
-            "ĭ",
-            "Į",
-            "į",
-            "İ",
-            "ı",
-            "Ĳ",
-            "ĳ",
-            "Ĵ",
-            "ĵ",
-            "Ķ",
-            "ķ",
-            "ĸ",
-            "Ĺ",
-            "ĺ",
-            "Ļ",
-            "ļ",
-            "Ľ",
-            "ľ",
-            "Ŀ",
-            "ŀ",
-            "Ł",
-            "ł",
-            "Ń",
-        ];
-        vocab = [...vocab, ...vocab.map((v) => v + "</w>")];
-        for (const merge of mergedMerges) {
-            vocab.push(merge.join(""));
-        }
-        vocab.push("<|startoftext|>", "<|endoftext|>");
-        this.encoder = Object.fromEntries(vocab.map((v, i) => [v, i]));
-        this.decoder = Object.fromEntries(
-            Object.entries(this.encoder).map(([k, v]) => [v, k]),
-        );
-        this.bpeRanks = Object.fromEntries(
-            mergedMerges.map((v, i) => [v.join("·😎·"), i]),
-        ); // ·😎· because js doesn't yet have tuples
-        this.cache = {
-            "<|startoftext|>": "<|startoftext|>",
-            "<|endoftext|>": "<|endoftext|>",
-        };
-        this.pat =
-            /<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+/giu;
-    }
-
-    bpe(token: string) {
-        if (this.cache[token] !== undefined) {
-            return this.cache[token];
-        }
-
-        let word = [...token.slice(0, -1), token.slice(-1) + "</w>"];
-        let pairs = getPairs(word);
-
-        if (pairs.length === 0) {
-            return token + "</w>";
-        }
-
-        while (1) {
-            let bigram: [string, string] | null = null;
-            let minRank = Infinity;
-            for (const p of pairs) {
-                const r = this.bpeRanks[p.join("·😎·")];
-                if (r === undefined) continue;
-                if (r < minRank) {
-                    minRank = r;
-                    bigram = p;
-                }
-            }
-
-            if (bigram === null) {
-                break;
-            }
-
-            const [first, second] = bigram;
-            const newWord: string[] = [];
-            let i = 0;
-            while (i < word.length) {
-                const j = word.indexOf(first, i);
-
-                if (j === -1) {
-                    newWord.push(...word.slice(i));
-                    break;
-                }
-
-                newWord.push(...word.slice(i, j));
-                i = j;
-
-                if (
-                    word[i] === first &&
-                    i < word.length - 1 &&
-                    word[i + 1] === second
-                ) {
-                    newWord.push(first + second);
-                    i += 2;
-                } else {
-                    // @ts-expect-error "Array indexing can return undefined but not modifying thirdparty code"
-                    newWord.push(word[i]);
-                    i += 1;
-                }
-            }
-            word = newWord;
-            if (word.length === 1) {
-                break;
-            } else {
-                pairs = getPairs(word);
-            }
-        }
-        const joinedWord = word.join(" ");
-        this.cache[token] = joinedWord;
-        return joinedWord;
-    }
-
-    encode(text: string) {
-        const bpeTokens: number[] = [];
-        text = whitespaceClean(basicClean(text)).toLowerCase();
-        for (let token of [...text.matchAll(this.pat)].map((m) => m[0])) {
-            token = [...token]
-                .map((b) => this.byteEncoder[b.charCodeAt(0) as number])
-                .join("");
-            bpeTokens.push(
-                // @ts-expect-error "Array indexing can return undefined but not modifying thirdparty code"
-                ...this.bpe(token)
-                    .split(" ")
-                    .map((bpeToken: string) => this.encoder[bpeToken]),
-            );
-        }
-        return bpeTokens;
-    }
-
-    // adds start and end token, and adds padding 0's and ensures it's 77 tokens long
-    encodeForCLIP(text: string) {
-        let tokens = this.encode(text);
-        tokens.unshift(49406); // start token
-        tokens = tokens.slice(0, 76);
-        tokens.push(49407); // end token
-        while (tokens.length < 77) tokens.push(0);
-        return tokens;
-    }
-
-    decode(tokens: any[]) {
-        let text = tokens
-            .map((token: string | number) => this.decoder[token])
-            .join("");
-        text = [...text]
-            .map((c) => this.byteDecoder[c])
-            // @ts-expect-error "Array indexing can return undefined but not modifying thirdparty code"
-            .map((v) => String.fromCharCode(v))
-            .join("")
-            .replace(/<\/w>/g, " ");
-        return text;
-    }
-}
--- a/desktop/src/types/clip-bpe-js.ts
+++ b/desktop/src/types/clip-bpe-js.ts
@@ -0,0 +1,11 @@
+/**
+ * @file Types for "clip-bpe-js"
+ *
+ * Non-exhaustive; declares only the method we need.
+ */
+
+declare module "clip-bpe-js" {
+    class Tokenizer {
+        encodeForCLIP(text: string): number[];
+    }
+}
--- a/desktop/yarn.lock
+++ b/desktop/yarn.lock
@@ -928,6 +928,11 @@ cli-truncate@^2.1.0:
    slice-ansi "^3.0.0"
    string-width "^4.2.0"

+clip-bpe-js@^0.0.6:
+  version "0.0.6"
+  resolved "https://registry.yarnpkg.com/clip-bpe-js/-/clip-bpe-js-0.0.6.tgz#a11c228e793fa29841f8cd4f8576fc1ff3403511"
+  integrity sha512-+0n0eeacgFmg9hKhHhXanKkRDhqRwzYK7dj46VYMzePxmN8zbbDgeTQZchfvARNpbBMRi7FfWRSW3lysOUDX/Q==
+
 cliui@^8.0.1:
  version "8.0.1"
  resolved "https://registry.yarnpkg.com/cliui/-/cliui-8.0.1.tgz#0c04b075db02cbfe60dc8e6cf2f5486b1a3608aa"