[desktop] Use clip-bpe-js as the Tokenizer

Replace the inlined code with the library.

Tested by comparing the produced embeddings on a few sample prompts (were
exactly the same).
This commit is contained in:
Manav Rathi
2024-07-01 13:03:07 +05:30
parent b1da1dfe05
commit bbc44d6ac1
8 changed files with 18 additions and 536 deletions

View File

@@ -27,6 +27,7 @@
"any-shell-escape": "^0.1",
"auto-launch": "^5.0",
"chokidar": "^3.6",
"clip-bpe-js": "^0.0.6",
"compare-versions": "^6.1",
"electron-log": "^5.1",
"electron-store": "^8.2",

View File

@@ -5,10 +5,10 @@
*
* @see `web/apps/photos/src/services/clip-service.ts` for more details.
*/
import Tokenizer from "clip-bpe-js";
import jpeg from "jpeg-js";
import fs from "node:fs/promises";
import * as ort from "onnxruntime-node";
import Tokenizer from "../../thirdparty/clip-bpe-ts/mod";
import log from "../log";
import { writeStream } from "../stream";
import { ensure, wait } from "../utils/common";

View File

@@ -1,21 +0,0 @@
MIT License
Copyright (c) 2023 josephrocca
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -1,40 +0,0 @@
# CLIP Byte Pair Encoding JavaScript Port
A JavaScript port of
[OpenAI's CLIP byte-pair-encoding tokenizer](https://github.com/openai/CLIP/blob/3bee28119e6b28e75b82b811b87b56935314e6a5/clip/simple_tokenizer.py).
```js
import Tokenizer from "https://deno.land/x/clip_bpe@v0.0.6/mod.js";
let t = new Tokenizer();
t.encode("hello"); // [3306]
t.encode("magnificent"); // [10724]
t.encode("magnificently"); // [9725, 2922]
t.decode(t.encode("HELLO")); // "hello "
t.decode(t.encode("abc123")); // "abc 1 2 3 "
t.decode(t.encode("let's see here")); // "let 's see here "
t.encode("hello world!"); // [3306, 1002, 256]
// to encode for CLIP (trims to maximum of 77 tokens and adds start and end token, and pads with zeros if less than 77 tokens):
t.encodeForCLIP("hello world!"); // [49406,3306,1002,256,49407,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
```
This encoder/decoder behaves differently to the GPT-2/3 tokenizer
(JavaScript version of that
[here](https://github.com/latitudegames/GPT-3-Encoder)). For example, it doesn't
preserve capital letters, as shown above.
The
[Python version](https://github.com/openai/CLIP/blob/3bee28119e6b28e75b82b811b87b56935314e6a5/clip/simple_tokenizer.py)
of this tokenizer uses the `ftfy` module to clean up the text before encoding
it. I didn't include that module by default because currently the only version
available in JavaScript is
[this one](https://github.com/josephrocca/ftfy-pyodide), which requires
importing a full Python runtime as a WebAssembly module. If you want the `ftfy`
cleaning, just import it and clean your text with it before passing it to the
`.encode()` method.
# License
To the extent that there is any original work in this repo, it is MIT Licensed,
just like [openai/CLIP](https://github.com/openai/CLIP).

File diff suppressed because one or more lines are too long

View File

@@ -1,470 +0,0 @@
/* eslint-disable */
import * as htmlEntities from "html-entities";
import bpeVocabData from "./bpe_simple_vocab_16e6";
// import ftfy from "https://deno.land/x/ftfy_pyodide@v0.1.1/mod.js";
/**
 * Return the UTF-16 code unit of the first character of {@link c}.
 *
 * Mirrors Python's `ord` for the single-character strings used in this file.
 * Yields `NaN` when the string is empty.
 */
function ord(c: string) {
    const firstCodeUnit = c.charCodeAt(0);
    return firstCodeUnit;
}
/**
 * Build an array of numbers in the manner of Python's `range`.
 *
 * - `range(n)` counts from 0 up to (but excluding) `n`.
 * - `range(a, b)` counts from `a` up to (but excluding) `b`.
 * - `range(a, b, step)` advances by `step`; a negative step counts down
 *   towards (but excluding) `b`.
 *
 * @param start First value, or the exclusive upper bound when {@link stop}
 * is omitted.
 * @param stop Exclusive end of the sequence.
 * @param step Increment applied between consecutive values.
 * @returns The generated values; empty when the bounds are already crossed.
 */
function range(start: number, stop?: number, step: number = 1) {
    const [from, to] = stop === undefined ? [0, start] : [start, stop];
    const ascending = step > 0;
    const values: number[] = [];
    let current = from;
    // The loop guard doubles as the "nothing to produce" check: when the
    // bounds are already crossed it fails on the very first evaluation.
    while (ascending ? current < to : current > to) {
        values.push(current);
        current += step;
    }
    return values;
}
/**
 * Build the table that maps every byte value (0-255) to a single character.
 *
 * Bytes in the ranges `!`..`~`, `¡`..`¬` and `®`..`ÿ` map to the character
 * with the same code. Each remaining byte, taken in ascending order, is
 * assigned the next character starting at code 256.
 *
 * @returns An object keyed by byte value whose values are one-character
 * strings.
 */
function bytesToUnicode() {
    const selfMapped = [
        ...range(ord("!"), ord("~") + 1),
        ...range(ord("¡"), ord("¬") + 1),
        ...range(ord("®"), ord("ÿ") + 1),
    ];
    const table: [number, string][] = selfMapped.map((byte) => [
        byte,
        String.fromCharCode(byte),
    ]);
    const alreadyMapped = new Set(selfMapped);
    let shifted = 0;
    for (const byte of range(2 ** 8)) {
        if (alreadyMapped.has(byte)) continue;
        // Bytes without a self-mapped character are moved past the 8-bit
        // range, in the order in which they are encountered.
        table.push([byte, String.fromCharCode(2 ** 8 + shifted)]);
        shifted += 1;
    }
    return Object.fromEntries(table);
}
/**
 * List every adjacent pair of symbols in {@link word}, in order.
 *
 * @param word A string, or an array of symbols.
 * @returns `[previous, current]` tuples for each symbol after the first;
 * empty when the word has fewer than two symbols.
 */
function getPairs(word: string | any[]) {
    const following = [...word.slice(1)];
    return following.map((symbol, idx): [string, string] => [
        idx === 0 ? word[0] : following[idx - 1],
        symbol,
    ]);
}
/**
 * Decode HTML entities in {@link text} (two passes, so doubly escaped
 * entities are resolved too) and trim surrounding whitespace.
 *
 * The `ftfy` text fix-up step stays disabled, as its import is commented out
 * at the top of the file.
 */
function basicClean(text: string) {
    // text = ftfy.fix_text(text);
    const decodedOnce = htmlEntities.decode(text);
    const decodedTwice = htmlEntities.decode(decodedOnce);
    return decodedTwice.trim();
}
/**
 * Collapse every run of whitespace in {@link text} into a single space and
 * drop leading and trailing whitespace.
 */
function whitespaceClean(text: string) {
    const words = text.trim().split(/\s+/);
    return words.join(" ");
}
/**
 * Byte-pair-encoding tokenizer for CLIP text input.
 *
 * Text is cleaned, lower-cased and split into coarse tokens with {@link pat}.
 * Each token is converted to its UTF-8 bytes, every byte is mapped to a
 * stand-in character ({@link byteEncoder}), and the ranked merges from the
 * bundled vocabulary data are then applied to obtain the sub-word pieces
 * whose indices in the vocabulary are the token ids.
 */
export default class {
    /** Byte value → stand-in character, as produced by `bytesToUnicode`. */
    byteEncoder;
    /** Stand-in character → byte value (inverse of {@link byteEncoder}). */
    byteDecoder: {
        [k: string]: number;
    };
    /** Vocabulary entry → token id. */
    encoder;
    /** Token id → vocabulary entry (inverse of {@link encoder}). */
    decoder: any;
    /** Merge pair (joined with "·😎·") → rank; lower ranks merge first. */
    bpeRanks: any;
    /** Memoised results of {@link bpe}, keyed by the byte-mapped token. */
    cache: Record<string, string>;
    /** Pattern that splits cleaned text into coarse tokens. */
    pat: RegExp;

    constructor() {
        this.byteEncoder = bytesToUnicode();
        this.byteDecoder = Object.fromEntries(
            Object.entries(this.byteEncoder).map(([k, v]) => [v, Number(k)]),
        );

        let merges = bpeVocabData.text.split("\n");
        // Skip the first line and keep exactly as many merges as fit in a
        // vocabulary of 49152 entries once the 256 byte symbols and the two
        // special tokens are accounted for.
        merges = merges.slice(1, 49152 - 256 - 2 + 1);
        const mergedMerges: string[][] = merges.map((merge: string) =>
            merge.split(" "),
        );

        // The 256 single-symbol entries come first, ordered by the code of
        // their stand-in character. Sorting explicitly is required because
        // plain-object key order places digit-like keys first; deriving the
        // list from the byte table also guarantees that every entry is
        // exactly one stand-in character.
        const baseVocab = Object.keys(this.byteDecoder).sort(
            (a, b) => a.charCodeAt(0) - b.charCodeAt(0),
        );
        const vocab = [...baseVocab, ...baseVocab.map((v) => v + "</w>")];
        for (const merge of mergedMerges) {
            vocab.push(merge.join(""));
        }
        vocab.push("<|startoftext|>", "<|endoftext|>");

        // Lookup tables keyed by arbitrary text have no prototype, so that a
        // word such as "constructor" cannot resolve to an inherited member
        // of Object.prototype.
        const encoder: { [k: string]: number } = Object.create(null);
        vocab.forEach((entry, id) => {
            // Later duplicates win, matching Object.fromEntries semantics.
            encoder[entry] = id;
        });
        this.encoder = encoder;
        this.decoder = Object.fromEntries(
            Object.entries(this.encoder).map(([k, v]) => [v, k]),
        );
        this.bpeRanks = Object.fromEntries(
            mergedMerges.map((v, i) => [v.join("·😎·"), i]),
        ); // ·😎· because js doesn't yet have tuples

        const cache: Record<string, string> = Object.create(null);
        cache["<|startoftext|>"] = "<|startoftext|>";
        cache["<|endoftext|>"] = "<|endoftext|>";
        this.cache = cache;

        this.pat =
            /<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+/giu;
    }

    /**
     * Apply the ranked merges to a single byte-mapped token.
     *
     * @param token A token whose characters are {@link byteEncoder} values.
     * @returns The resulting pieces joined by single spaces; the final piece
     * carries the `</w>` end-of-word marker.
     */
    bpe(token: string) {
        const cached = this.cache[token];
        if (cached !== undefined) {
            return cached;
        }

        let word: string[] = [...token.slice(0, -1), token.slice(-1) + "</w>"];
        let pairs = getPairs(word);
        if (pairs.length === 0) {
            return token + "</w>";
        }

        for (;;) {
            // Pick the adjacent pair with the lowest (best) merge rank.
            let bigram: [string, string] | null = null;
            let minRank = Infinity;
            for (const p of pairs) {
                const r = this.bpeRanks[p.join("·😎·")];
                if (r === undefined) continue;
                if (r < minRank) {
                    minRank = r;
                    bigram = p;
                }
            }
            if (bigram === null) {
                break;
            }

            // Rebuild the word, fusing every occurrence of the chosen pair.
            const [first, second] = bigram;
            const newWord: string[] = [];
            let i = 0;
            while (i < word.length) {
                const j = word.indexOf(first, i);
                if (j === -1) {
                    newWord.push(...word.slice(i));
                    break;
                }
                newWord.push(...word.slice(i, j));
                i = j;
                // word[i] is `first` here, courtesy of indexOf.
                if (i < word.length - 1 && word[i + 1] === second) {
                    newWord.push(first + second);
                    i += 2;
                } else {
                    newWord.push(first);
                    i += 1;
                }
            }
            word = newWord;
            if (word.length === 1) {
                break;
            }
            pairs = getPairs(word);
        }

        const joinedWord = word.join(" ");
        this.cache[token] = joinedWord;
        return joinedWord;
    }

    /**
     * Convert text into token ids (without start/end tokens or padding).
     *
     * @param text Arbitrary input text.
     * @returns The token ids, in order.
     */
    encode(text: string) {
        const bpeTokens: number[] = [];
        text = whitespaceClean(basicClean(text)).toLowerCase();
        const utf8 = new TextEncoder();
        for (const match of text.matchAll(this.pat)) {
            // Map the token's UTF-8 bytes (not its UTF-16 code units) through
            // the byte table, so that characters outside Latin-1 are encoded
            // rather than dropped.
            const token = Array.from(
                utf8.encode(match[0]),
                (byte) => this.byteEncoder[byte],
            ).join("");
            for (const piece of this.bpe(token).split(" ")) {
                const id = this.encoder[piece];
                // Defensive: every piece is expected to be a vocabulary
                // entry; never emit `undefined` as a token id.
                if (id !== undefined) bpeTokens.push(id);
            }
        }
        return bpeTokens;
    }

    /**
     * Encode text in the fixed-length form expected by CLIP: a start token,
     * at most 75 text tokens, an end token, then zero padding up to a total
     * length of 77.
     *
     * @param text Arbitrary input text.
     * @returns An array of exactly 77 token ids.
     */
    encodeForCLIP(text: string) {
        const startToken = 49406;
        const endToken = 49407;
        const tokens = [startToken, ...this.encode(text).slice(0, 75), endToken];
        while (tokens.length < 77) tokens.push(0);
        return tokens;
    }

    /**
     * Convert token ids back into text. Each end-of-word marker becomes a
     * space, so the result normally ends with a trailing space.
     *
     * @param tokens Token ids as produced by {@link encode}.
     * @returns The decoded text.
     */
    decode(tokens: any[]) {
        const symbols = tokens
            .map((token: string | number) => this.decoder[token])
            .join("");
        // Reverse the byte mapping, then interpret the bytes as UTF-8.
        // Malformed sequences decode to U+FFFD instead of throwing.
        const bytes = Uint8Array.from(
            [...symbols],
            (c) => this.byteDecoder[c] ?? 0,
        );
        return new TextDecoder("utf-8", { ignoreBOM: true })
            .decode(bytes)
            .replace(/<\/w>/g, " ");
    }
}

View File

@@ -0,0 +1,11 @@
/**
* @file Types for "clip-bpe-js"
*
* Non exhaustive, only the function we need.
*/
declare module "clip-bpe-js" {
    /**
     * Tokenizer exposed by the "clip-bpe-js" package.
     *
     * Only the single method that we call is declared here.
     */
    class Tokenizer {
        /**
         * Encode {@link text} for CLIP: add the start and end tokens, trim to
         * a maximum of 77 tokens, and pad with zeros when shorter.
         */
        encodeForCLIP(text: string): number[];
    }

    // Consumers load the tokenizer with `import Tokenizer from "clip-bpe-js"`,
    // so the class has to be declared as the module's default export.
    export default Tokenizer;
}

View File

@@ -928,6 +928,11 @@ cli-truncate@^2.1.0:
slice-ansi "^3.0.0"
string-width "^4.2.0"
clip-bpe-js@^0.0.6:
version "0.0.6"
resolved "https://registry.yarnpkg.com/clip-bpe-js/-/clip-bpe-js-0.0.6.tgz#a11c228e793fa29841f8cd4f8576fc1ff3403511"
integrity sha512-+0n0eeacgFmg9hKhHhXanKkRDhqRwzYK7dj46VYMzePxmN8zbbDgeTQZchfvARNpbBMRi7FfWRSW3lysOUDX/Q==
cliui@^8.0.1:
version "8.0.1"
resolved "https://registry.yarnpkg.com/cliui/-/cliui-8.0.1.tgz#0c04b075db02cbfe60dc8e6cf2f5486b1a3608aa"