[desktop] Use clip-bpe-js as the Tokenizer
Replace the inlined code with the library. Tested by comparing the produced embeddings on a few sample prompts (were exactly the same).
This commit is contained in:
@@ -27,6 +27,7 @@
|
||||
"any-shell-escape": "^0.1",
|
||||
"auto-launch": "^5.0",
|
||||
"chokidar": "^3.6",
|
||||
"clip-bpe-js": "^0.0.6",
|
||||
"compare-versions": "^6.1",
|
||||
"electron-log": "^5.1",
|
||||
"electron-store": "^8.2",
|
||||
|
||||
@@ -5,10 +5,10 @@
|
||||
*
|
||||
* @see `web/apps/photos/src/services/clip-service.ts` for more details.
|
||||
*/
|
||||
import Tokenizer from "clip-bpe-js";
|
||||
import jpeg from "jpeg-js";
|
||||
import fs from "node:fs/promises";
|
||||
import * as ort from "onnxruntime-node";
|
||||
import Tokenizer from "../../thirdparty/clip-bpe-ts/mod";
|
||||
import log from "../log";
|
||||
import { writeStream } from "../stream";
|
||||
import { ensure, wait } from "../utils/common";
|
||||
|
||||
21
desktop/src/thirdparty/clip-bpe-ts/LICENSE
vendored
21
desktop/src/thirdparty/clip-bpe-ts/LICENSE
vendored
@@ -1,21 +0,0 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2023 josephrocca
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
40
desktop/src/thirdparty/clip-bpe-ts/README.md
vendored
40
desktop/src/thirdparty/clip-bpe-ts/README.md
vendored
@@ -1,40 +0,0 @@
|
||||
# CLIP Byte Pair Encoding JavaScript Port
|
||||
|
||||
A JavaScript port of
|
||||
[OpenAI's CLIP byte-pair-encoding tokenizer](https://github.com/openai/CLIP/blob/3bee28119e6b28e75b82b811b87b56935314e6a5/clip/simple_tokenizer.py).
|
||||
|
||||
```js
|
||||
import Tokenizer from "https://deno.land/x/clip_bpe@v0.0.6/mod.js";
|
||||
let t = new Tokenizer();
|
||||
|
||||
t.encode("hello"); // [3306]
|
||||
t.encode("magnificent"); // [10724]
|
||||
t.encode("magnificently"); // [9725, 2922]
|
||||
t.decode(t.encode("HELLO")); // "hello "
|
||||
t.decode(t.encode("abc123")); // "abc 1 2 3 "
|
||||
t.decode(t.encode("let's see here")); // "let 's see here "
|
||||
t.encode("hello world!"); // [3306, 1002, 256]
|
||||
|
||||
// to encode for CLIP (trims to maximum of 77 tokens and adds start and end token, and pads with zeros if less than 77 tokens):
|
||||
t.encodeForCLIP("hello world!"); // [49406,3306,1002,256,49407,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
|
||||
```
|
||||
|
||||
This encoder/decoder behaves differently to the GPT-2/3 tokenizer
|
||||
(JavaScript version of that
|
||||
[here](https://github.com/latitudegames/GPT-3-Encoder)). For example, it doesn't
|
||||
preserve capital letters, as shown above.
|
||||
|
||||
The
|
||||
[Python version](https://github.com/openai/CLIP/blob/3bee28119e6b28e75b82b811b87b56935314e6a5/clip/simple_tokenizer.py)
|
||||
of this tokenizer uses the `ftfy` module to clean up the text before encoding
|
||||
it. I didn't include that module by default because currently the only version
|
||||
available in JavaScript is
|
||||
[this one](https://github.com/josephrocca/ftfy-pyodide), which requires
|
||||
importing a full Python runtime as a WebAssembly module. If you want the `ftfy`
|
||||
cleaning, just import it and clean your text with it before passing it to the
|
||||
`.encode()` method.
|
||||
|
||||
# License
|
||||
|
||||
To the extent that there is any original work in this repo, it is MIT Licensed,
|
||||
just like [openai/CLIP](https://github.com/openai/CLIP).
|
||||
File diff suppressed because one or more lines are too long
470
desktop/src/thirdparty/clip-bpe-ts/mod.ts
vendored
470
desktop/src/thirdparty/clip-bpe-ts/mod.ts
vendored
@@ -1,470 +0,0 @@
|
||||
/* eslint-disable */
|
||||
|
||||
import * as htmlEntities from "html-entities";
|
||||
import bpeVocabData from "./bpe_simple_vocab_16e6";
|
||||
// import ftfy from "https://deno.land/x/ftfy_pyodide@v0.1.1/mod.js";
|
||||
|
||||
/**
 * Numeric code of the first UTF-16 code unit of `c`.
 *
 * Stand-in for Python's `ord`. Yields `NaN` when `c` is empty.
 */
function ord(c: string) {
    const firstUnit = c.charCodeAt(0);
    return firstUnit;
}
|
||||
/**
 * Build a list of numbers in the manner of Python's `range`.
 *
 * @param start First value, or (when `stop` is omitted) the exclusive upper
 * bound of a sequence that begins at 0.
 * @param stop Exclusive end of the sequence.
 * @param step Increment between consecutive values; may be negative.
 * @returns The generated numbers, or an empty array when the bounds are
 * already exhausted for the direction given by `step`.
 */
function range(start: number, stop?: number, step: number = 1) {
    // With a single argument, that argument is the end and we count from 0.
    const from = stop === undefined ? 0 : start;
    const to = stop === undefined ? start : stop;
    const ascending = step > 0;

    const result: number[] = [];
    if ((ascending && from >= to) || (step < 0 && from <= to)) {
        return result;
    }

    let current = from;
    while (ascending ? current < to : current > to) {
        result.push(current);
        current += step;
    }
    return result;
}
|
||||
|
||||
/**
 * Build the byte value → single character table used by the tokenizer.
 *
 * The byte values in the ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the
 * character with the same code. Every other byte value in 0..255 is assigned,
 * in ascending byte order, the next character starting at code 256.
 *
 * @returns An object keyed by byte value whose values are one-character
 * strings.
 */
function bytesToUnicode() {
    const byteValues = [
        ...range(ord("!"), ord("~") + 1),
        ...range(ord("¡"), ord("¬") + 1),
        ...range(ord("®"), ord("ÿ") + 1),
    ];
    const charCodes = [...byteValues];

    // Number of bytes that have been given a substitute code so far.
    let substitutes = 0;
    for (const b of range(2 ** 8)) {
        if (byteValues.includes(b)) continue;
        byteValues.push(b);
        charCodes.push(2 ** 8 + substitutes);
        substitutes += 1;
    }

    const chars = charCodes.map((code) => String.fromCharCode(code));
    return Object.fromEntries(byteValues.map((b, idx) => [b, chars[idx]]));
}
|
||||
|
||||
/**
 * List every adjacent pair of symbols in `word`, in order.
 *
 * @param word A string or an array of symbols.
 * @returns `[previous, next]` tuples for each neighbouring pair; empty when
 * `word` has fewer than two symbols.
 */
function getPairs(word: string | any[]) {
    // Symbols that appear as the second member of a pair.
    const seconds = [...word.slice(1)];
    // The matching first members: the leading symbol, then each earlier second.
    const firsts = [word[0], ...seconds.slice(0, -1)];
    return seconds.map((next, idx): [string, string] => [firsts[idx], next]);
}
|
||||
|
||||
/**
 * Decode HTML entities in `text` (two passes, so doubly-escaped entities are
 * resolved too) and strip leading/trailing whitespace.
 *
 * The `ftfy.fix_text` step is left disabled, as in the original code.
 */
function basicClean(text: string) {
    // text = ftfy.fix_text(text);
    const decodedOnce = htmlEntities.decode(text);
    const decodedTwice = htmlEntities.decode(decodedOnce);
    return decodedTwice.trim();
}
|
||||
|
||||
/**
 * Collapse every run of whitespace in `text` into a single space and drop
 * any leading or trailing whitespace.
 */
function whitespaceClean(text: string) {
    const words = text.split(/\s+/).filter((piece) => piece.length > 0);
    return words.join(" ");
}
|
||||
|
||||
/**
 * Byte-pair-encoding tokenizer for CLIP text prompts (a port of OpenAI's
 * `simple_tokenizer.py`).
 *
 * Differences from the previous revision of this class:
 *
 * - The BPE `cache` has no prototype, so a token that happens to be named
 *   like an `Object.prototype` member (e.g. "constructor") is no longer
 *   mistaken for a cache hit.
 * - `encode` / `decode` go through UTF-8 bytes, as the reference tokenizer
 *   does, instead of UTF-16 code units. Output for ASCII text is unchanged.
 * - The 256 single-character vocabulary entries are derived from
 *   `byteEncoder` rather than kept as a pasted literal list.
 */
export default class {
    byteEncoder;
    byteDecoder: {
        [k: string]: number;
    };
    encoder;
    decoder: any;
    bpeRanks: any;
    cache: Record<string, string>;
    pat: RegExp;

    constructor() {
        this.byteEncoder = bytesToUnicode();
        this.byteDecoder = Object.fromEntries(
            Object.entries(this.byteEncoder).map(([k, v]) => [v, Number(k)]),
        );

        // Skip the header line and keep exactly as many merges as the CLIP
        // vocabulary has room for.
        let merges = bpeVocabData.text.split("\n");
        merges = merges.slice(1, 49152 - 256 - 2 + 1);
        const mergedMerges = merges.map((merge) => merge.split(" "));

        // The vocabulary must start with the byte characters in the order
        // Python's `bytes_to_unicode().values()` yields them. A JS object
        // iterates integer-like keys in ascending key order instead, so
        // `Object.values` alone gives the wrong sequence. Sorting the values
        // by character code restores it: bytesToUnicode assigns the directly
        // mapped bytes (codes below 256) in ascending order and then the
        // substitutes 256, 257, … in order.
        const byteChars = Object.values(this.byteEncoder)
            .filter((c): c is string => typeof c === "string")
            .sort((a, b) => a.charCodeAt(0) - b.charCodeAt(0));

        const vocab = [...byteChars, ...byteChars.map((v) => v + "</w>")];
        for (const merge of mergedMerges) {
            vocab.push(merge.join(""));
        }
        vocab.push("<|startoftext|>", "<|endoftext|>");

        this.encoder = Object.fromEntries(vocab.map((v, i) => [v, i]));
        this.decoder = Object.fromEntries(
            Object.entries(this.encoder).map(([k, v]) => [v, k]),
        );
        this.bpeRanks = Object.fromEntries(
            mergedMerges.map((v, i) => [v.join("·😎·"), i]),
        ); // ·😎· because js doesn't yet have tuples

        // Prototype-less object: lookups can only ever see keys we stored.
        const cache: Record<string, string> = Object.create(null);
        cache["<|startoftext|>"] = "<|startoftext|>";
        cache["<|endoftext|>"] = "<|endoftext|>";
        this.cache = cache;

        this.pat =
            /<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+/giu;
    }

    /**
     * Apply the BPE merges to a single (already byte-encoded) token.
     *
     * @param token The token to split.
     * @returns The resulting sub-word symbols joined by single spaces; the
     * last symbol carries the `</w>` end-of-word marker. Results for tokens
     * with more than one symbol are memoised in `cache`.
     */
    bpe(token: string) {
        const cached = this.cache[token];
        if (cached !== undefined) {
            return cached;
        }

        let word = [...token.slice(0, -1), token.slice(-1) + "</w>"];
        let pairs = getPairs(word);

        if (pairs.length === 0) {
            return token + "</w>";
        }

        for (;;) {
            // Pick the adjacent pair with the lowest (earliest learnt) rank.
            let bigram: [string, string] | null = null;
            let minRank = Infinity;
            for (const p of pairs) {
                const r = this.bpeRanks[p.join("·😎·")];
                if (r === undefined) continue;
                if (r < minRank) {
                    minRank = r;
                    bigram = p;
                }
            }

            // No known merge applies any more.
            if (bigram === null) {
                break;
            }

            const [first, second] = bigram;
            const newWord: string[] = [];
            let i = 0;
            while (i < word.length) {
                const j = word.indexOf(first, i);

                if (j === -1) {
                    newWord.push(...word.slice(i));
                    break;
                }

                newWord.push(...word.slice(i, j));
                i = j;

                // word[i] is `first` here (that is what indexOf found).
                if (i < word.length - 1 && word[i + 1] === second) {
                    newWord.push(first + second);
                    i += 2;
                } else {
                    newWord.push(first);
                    i += 1;
                }
            }

            word = newWord;
            if (word.length === 1) {
                break;
            }
            pairs = getPairs(word);
        }

        const joinedWord = word.join(" ");
        this.cache[token] = joinedWord;
        return joinedWord;
    }

    /**
     * Convert `text` into CLIP vocabulary ids.
     *
     * The text is entity-decoded, whitespace-normalised and lower-cased,
     * split with `pat`, and each piece is mapped byte-by-byte (UTF-8) through
     * `byteEncoder` before the BPE merges are applied.
     *
     * @param text The text to tokenize.
     * @returns The token ids, without start/end markers or padding.
     */
    encode(text: string) {
        const bpeTokens: number[] = [];
        const utf8 = new TextEncoder();
        text = whitespaceClean(basicClean(text)).toLowerCase();

        for (const match of text.matchAll(this.pat)) {
            const token = Array.from(
                utf8.encode(match[0]),
                (b) => this.byteEncoder[b],
            ).join("");

            for (const bpeToken of this.bpe(token).split(" ")) {
                const id = this.encoder[bpeToken];
                // Only real vocabulary ids may reach the model input.
                if (typeof id === "number") bpeTokens.push(id);
            }
        }
        return bpeTokens;
    }

    /**
     * Encode `text` as a fixed-length CLIP input: start token, at most 75
     * text tokens, end token, then zero padding up to 77 entries.
     *
     * @param text The text to tokenize.
     * @returns An array of exactly 77 token ids.
     */
    encodeForCLIP(text: string) {
        // 49406 is the start token, 49407 the end token.
        const tokens = [49406, ...this.encode(text)].slice(0, 76);
        tokens.push(49407);
        while (tokens.length < 77) tokens.push(0);
        return tokens;
    }

    /**
     * Convert token ids back to text.
     *
     * @param tokens Token ids as produced by `encode`.
     * @returns The decoded text, with every `</w>` end-of-word marker
     * rendered as a space. Byte sequences that are not valid UTF-8 decode to
     * U+FFFD.
     */
    decode(tokens: any[]) {
        const symbols = tokens
            .map((token: string | number) => this.decoder[token])
            .join("");
        const bytes = [...symbols].map((c) => this.byteDecoder[c] ?? 0);
        return new TextDecoder()
            .decode(Uint8Array.from(bytes))
            .replace(/<\/w>/g, " ");
    }
}
|
||||
11
desktop/src/types/clip-bpe-js.ts
Normal file
11
desktop/src/types/clip-bpe-js.ts
Normal file
@@ -0,0 +1,11 @@
|
||||
/**
|
||||
* @file Types for "clip-bpe-js"
|
||||
*
|
||||
* Non exhaustive, only the function we need.
|
||||
*/
|
||||
|
||||
declare module "clip-bpe-js" {
    /**
     * The tokenizer class, which is the package's default export.
     *
     * Only the member we use is declared.
     */
    class Tokenizer {
        /**
         * Tokenize `text` for CLIP.
         *
         * Per the package README: the result is trimmed to a maximum of 77
         * tokens, has the start and end tokens added, and is padded with
         * zeros if shorter than 77 tokens.
         */
        encodeForCLIP(text: string): number[];
    }

    // Consumers use `import Tokenizer from "clip-bpe-js"`, which needs an
    // explicit default export to resolve to the class.
    export default Tokenizer;
}
|
||||
@@ -928,6 +928,11 @@ cli-truncate@^2.1.0:
|
||||
slice-ansi "^3.0.0"
|
||||
string-width "^4.2.0"
|
||||
|
||||
clip-bpe-js@^0.0.6:
|
||||
version "0.0.6"
|
||||
resolved "https://registry.yarnpkg.com/clip-bpe-js/-/clip-bpe-js-0.0.6.tgz#a11c228e793fa29841f8cd4f8576fc1ff3403511"
|
||||
integrity sha512-+0n0eeacgFmg9hKhHhXanKkRDhqRwzYK7dj46VYMzePxmN8zbbDgeTQZchfvARNpbBMRi7FfWRSW3lysOUDX/Q==
|
||||
|
||||
cliui@^8.0.1:
|
||||
version "8.0.1"
|
||||
resolved "https://registry.yarnpkg.com/cliui/-/cliui-8.0.1.tgz#0c04b075db02cbfe60dc8e6cf2f5486b1a3608aa"
|
||||
|
||||
Reference in New Issue
Block a user