[web][photos] Correct clip preprocessing

This commit is contained in:
laurenspriem
2024-07-02 12:06:41 +05:30
parent b4d0eb843b
commit ff2d838b64

View File

@@ -49,73 +49,203 @@ const clipImageEmbedding_ = async (jpegFilePath: string) => {
return normalizeEmbedding(imageEmbedding);
};
/**
 * Decode the JPEG at the given path and convert it into the planar,
 * normalized float input for the image model.
 *
 * The image is scaled so that it fully covers a 224x224 square (the scale is
 * the larger of the two width/height ratios), the centered 224x224 region is
 * sampled using bicubic interpolation, and each channel is normalized as
 * `(value / 255 - mean) / std`.
 *
 * @param jpegFilePath Path of the JPEG file to read.
 * @returns A Float32Array of length 3 * 224 * 224 holding all the red values
 * first, then all the green values, then all the blue values.
 */
const getRGBData = async (
    jpegFilePath: string,
): Promise<Float32Array> => {
    const jpegData = await fs.readFile(jpegFilePath);
    // Decode as RGBA (4 bytes per pixel). The sampling below goes through
    // pixelRGBBicubic / pixelRGBA, which index the pixel buffer with a stride
    // of 4; decoding as packed RGB would make every lookup read the wrong
    // bytes.
    const rawImageData = jpeg.decode(jpegData, {
        useTArray: true,
        formatAsRGBA: true,
    }); // TODO: manav: make sure this works on all images, not just jpeg
    const pixelData = rawImageData.data;

    const requiredWidth = 224;
    const requiredHeight = 224;
    const requiredSize = 3 * requiredWidth * requiredHeight;
    // Per-channel (R, G, B) normalization constants.
    const [meanR, meanG, meanB] = [0.48145466, 0.4578275, 0.40821073] as const;
    const [stdR, stdG, stdB] = [0.26862954, 0.26130258, 0.27577711] as const;

    // Scale so that both dimensions are at least the required size, then
    // drop the excess equally from both sides (center crop).
    const scale = Math.max(
        requiredWidth / rawImageData.width,
        requiredHeight / rawImageData.height,
    );
    const scaledWidth = Math.round(rawImageData.width * scale);
    const scaledHeight = Math.round(rawImageData.height * scale);
    const widthOffset = Math.max(0, scaledWidth - requiredWidth) / 2;
    const heightOffset = Math.max(0, scaledHeight - requiredHeight) / 2;

    const processedImage = new Float32Array(requiredSize);

    // Populate the Float32Array with normalized pixel values.
    let pi = 0;
    const cOffsetG = requiredHeight * requiredWidth; // ChannelOffsetGreen
    const cOffsetB = 2 * requiredHeight * requiredWidth; // ChannelOffsetBlue
    for (let h = heightOffset; h < scaledHeight - heightOffset; h++) {
        for (let w = widthOffset; w < scaledWidth - widthOffset; w++) {
            // Map the position in the scaled image back to source
            // coordinates and sample there.
            const { r, g, b } = pixelRGBBicubic(
                w / scale,
                h / scale,
                pixelData,
                rawImageData.width,
                rawImageData.height,
            );
            processedImage[pi] = (r / 255.0 - meanR) / stdR;
            processedImage[pi + cOffsetG] = (g / 255.0 - meanG) / stdG;
            processedImage[pi + cOffsetB] = (b / 255.0 - meanB) / stdB;
            pi++;
        }
    }
    return processedImage;
};
// NOTE: adapted from the function of the same name in
// web/apps/photos/src/services/face/image.ts. This copy is not an exact
// duplicate: the neighbour directly above the sample point is guarded on the
// row index (py), not the column index (px).
/**
 * Sample the colour at the fractional position (fx, fy) of an image using
 * bicubic interpolation over the surrounding 4x4 pixel neighbourhood.
 *
 * The position is first clamped to the image bounds. Any neighbour that falls
 * outside the image is replaced by the pixel at the (truncated) sample
 * position, so edges are extended instead of blending with black.
 *
 * @param fx Horizontal sample position, in source pixel coordinates.
 * @param fy Vertical sample position, in source pixel coordinates.
 * @param imageData Pixel buffer, read via pixelRGBA (4 bytes per pixel).
 * @param imageWidth Width of the image in pixels.
 * @param imageHeight Height of the image in pixels.
 * @returns The interpolated `{ r, g, b }`, each truncated to an integer in
 * the range 0 to 255.
 */
const pixelRGBBicubic = (
    fx: number,
    fy: number,
    imageData: Uint8Array,
    imageWidth: number,
    imageHeight: number,
) => {
    // Clamp to image boundaries.
    fx = clamp(fx, 0, imageWidth - 1);
    fy = clamp(fy, 0, imageHeight - 1);

    // Integer pixel containing the sample, plus the previous (p), next (n)
    // and after-next (a) columns and rows around it.
    const x = Math.trunc(fx) - (fx >= 0.0 ? 0 : 1);
    const px = x - 1;
    const nx = x + 1;
    const ax = x + 2;
    const y = Math.trunc(fy) - (fy >= 0.0 ? 0 : 1);
    const py = y - 1;
    const ny = y + 1;
    const ay = y + 2;
    // Fractional position of the sample within that pixel.
    const dx = fx - x;
    const dy = fy - y;

    // Cubic (Catmull-Rom) interpolation between icp and inp at offset d,
    // using ipp and iap as the outer control values.
    const cubic = (
        d: number,
        ipp: number,
        icp: number,
        inp: number,
        iap: number,
    ) =>
        icp +
        0.5 *
            (d * (-ipp + inp) +
                d * d * (2 * ipp - 5 * icp + 4 * inp - iap) +
                d * d * d * (-ipp + 3 * icp - 3 * inp + iap));

    const icc = pixelRGBA(imageData, imageWidth, imageHeight, x, y);

    // Pixel at (sx, sy), falling back to the centre pixel when the
    // coordinate lies outside the image.
    const neighbour = (sx: number, sy: number) =>
        sx < 0 || sx >= imageWidth || sy < 0 || sy >= imageHeight
            ? icc
            : pixelRGBA(imageData, imageWidth, imageHeight, sx, sy);

    // Interpolate horizontally along row sy, one value per colour channel.
    const interpolateRow = (sy: number) => {
        const p = neighbour(px, sy);
        const c = neighbour(x, sy);
        const n = neighbour(nx, sy);
        const a = neighbour(ax, sy);
        return {
            r: cubic(dx, p.r!, c.r!, n.r!, a.r!),
            g: cubic(dx, p.g!, c.g!, n.g!, a.g!),
            b: cubic(dx, p.b!, c.b!, n.b!, a.b!),
        };
    };

    const rowP = interpolateRow(py);
    const rowC = interpolateRow(y);
    const rowN = interpolateRow(ny);
    const rowA = interpolateRow(ay);

    // Interpolate vertically across the four row results, then clamp and
    // truncate to a valid 8-bit channel value.
    const c0 = Math.trunc(
        clamp(cubic(dy, rowP.r, rowC.r, rowN.r, rowA.r), 0, 255),
    );
    const c1 = Math.trunc(
        clamp(cubic(dy, rowP.g, rowC.g, rowN.g, rowA.g), 0, 255),
    );
    const c2 = Math.trunc(
        clamp(cubic(dy, rowP.b, rowC.b, rowN.b, rowA.b), 0, 255),
    );

    return { r: c0, g: c1, b: c2 };
};
// NOTE: mirrors the function of the same name in
// web/apps/photos/src/services/face/image.ts
/**
 * Restrict `value` to the range from `min` to `max`.
 *
 * The lower bound is applied first and the upper bound second, so if `min`
 * is greater than `max` the result is `max`.
 */
const clamp = (value: number, min: number, max: number) => {
    const atLeastMin = Math.max(min, value);
    return Math.min(max, atLeastMin);
};
// NOTE: mirrors the function of the same name in
// web/apps/photos/src/services/face/image.ts
/**
 * Read the pixel at integer coordinates (x, y) from a buffer laid out as 4
 * consecutive bytes (r, g, b, a) per pixel, row after row.
 *
 * Coordinates outside the `width` x `height` bounds yield a pixel with all
 * four components set to 0.
 */
const pixelRGBA = (
    imageData: Uint8Array,
    width: number,
    height: number,
    x: number,
    y: number,
) => {
    const isOutside = x < 0 || y < 0 || x >= width || y >= height;
    if (isOutside) {
        return { r: 0, g: 0, b: 0, a: 0 };
    }

    // Offset of the pixel's first byte.
    const base = 4 * (y * width + x);
    const r = imageData[base];
    const g = imageData[base + 1];
    const b = imageData[base + 2];
    const a = imageData[base + 3];
    return { r, g, b, a };
};
const normalizeEmbedding = (embedding: Float32Array) => {