diff --git a/desktop/src/main/services/ml-clip.ts b/desktop/src/main/services/ml-clip.ts index 6719e191c4..92c56ccc80 100644 --- a/desktop/src/main/services/ml-clip.ts +++ b/desktop/src/main/services/ml-clip.ts @@ -49,73 +49,203 @@ const clipImageEmbedding_ = async (jpegFilePath: string) => { return normalizeEmbedding(imageEmbedding); }; -const getRGBData = async (jpegFilePath: string): Promise => { +const getRGBData = async ( + jpegFilePath: string, +): Promise => { const jpegData = await fs.readFile(jpegFilePath); const rawImageData = jpeg.decode(jpegData, { useTArray: true, formatAsRGBA: false, - }); - - const nx = rawImageData.width; - const ny = rawImageData.height; - const inputImage = rawImageData.data; - - const nx2 = 224; - const ny2 = 224; - const totalSize = 3 * nx2 * ny2; - - const result = Array(totalSize).fill(0); - const scale = Math.max(nx, ny) / 224; - - const nx3 = Math.round(nx / scale); - const ny3 = Math.round(ny / scale); + }); // TODO: manav: make sure this works on all images, not just jpeg + const pixelData = rawImageData.data; + const requiredWidth = 224; + const requiredHeight = 224; + const requiredSize = 3 * requiredWidth * requiredHeight; const mean: number[] = [0.48145466, 0.4578275, 0.40821073]; const std: number[] = [0.26862954, 0.26130258, 0.27577711]; - for (let y = 0; y < ny3; y++) { - for (let x = 0; x < nx3; x++) { - for (let c = 0; c < 3; c++) { - // Linear interpolation - const sx = (x + 0.5) * scale - 0.5; - const sy = (y + 0.5) * scale - 0.5; + const scale = Math.max( + requiredWidth / rawImageData.width, + requiredHeight / rawImageData.height, + ); + const scaledWidth = Math.round(rawImageData.width * scale); + const scaledHeight = Math.round(rawImageData.height * scale); + const widthOffset = Math.max(0, scaledWidth - requiredWidth) / 2; + const heightOffset = Math.max(0, scaledHeight - requiredHeight) / 2; - const x0 = Math.max(0, Math.floor(sx)); - const y0 = Math.max(0, Math.floor(sy)); + const processedImage = new Float32Array(requiredSize); - const x1 = Math.min(x0 + 1, nx - 1); - const y1 = Math.min(y0 + 1, ny - 1); - - const dx = sx - x0; - const dy = sy - y0; - - const j00 = 3 * (y0 * nx + x0) + c; - const j01 = 3 * (y0 * nx + x1) + c; - const j10 = 3 * (y1 * nx + x0) + c; - const j11 = 3 * (y1 * nx + x1) + c; - - const v00 = inputImage[j00] ?? 0; - const v01 = inputImage[j01] ?? 0; - const v10 = inputImage[j10] ?? 0; - const v11 = inputImage[j11] ?? 0; - - const v0 = v00 * (1 - dx) + v01 * dx; - const v1 = v10 * (1 - dx) + v11 * dx; - - const v = v0 * (1 - dy) + v1 * dy; - - const v2 = Math.min(Math.max(Math.round(v), 0), 255); - - // createTensorWithDataList is dumb compared to reshape and - // hence has to be given with one channel after another - const i = y * nx3 + x + (c % 3) * 224 * 224; - - result[i] = (v2 / 255 - (mean[c] ?? 0)) / (std[c] ?? 1); - } + // Populate the Float32Array with normalized pixel values. + let pi = 0; + const cOffsetG = requiredHeight * requiredWidth; // ChannelOffsetGreen + const cOffsetB = 2 * requiredHeight * requiredWidth; // ChannelOffsetBlue + for (let h = 0 + heightOffset; h < scaledHeight - heightOffset; h++) { + for (let w = 0 + widthOffset; w < scaledWidth - widthOffset; w++) { + const { r, g, b } = pixelRGBBicubic( + w / scale, + h / scale, + pixelData, + rawImageData.width, + rawImageData.height, + ); + processedImage[pi] = (r / 255.0 - mean[0]!) / std[0]!; + processedImage[pi + cOffsetG] = (g / 255.0 - mean[1]!) / std[1]!; + processedImage[pi + cOffsetB] = (b / 255.0 - mean[2]!) / std[2]!; + pi++; } } + return processedImage; +}; - return result; +// NOTE: exact duplicate of the function in web/apps/photos/src/services/face/image.ts +const pixelRGBBicubic = ( + fx: number, + fy: number, + imageData: Uint8Array, + imageWidth: number, + imageHeight: number, +) => { + // Clamp to image boundaries. + fx = clamp(fx, 0, imageWidth - 1); + fy = clamp(fy, 0, imageHeight - 1); + + const x = Math.trunc(fx) - (fx >= 0.0 ? 0 : 1); + const px = x - 1; + const nx = x + 1; + const ax = x + 2; + const y = Math.trunc(fy) - (fy >= 0.0 ? 0 : 1); + const py = y - 1; + const ny = y + 1; + const ay = y + 2; + const dx = fx - x; + const dy = fy - y; + + const cubic = ( + dx: number, + ipp: number, + icp: number, + inp: number, + iap: number, + ) => + icp + + 0.5 * + (dx * (-ipp + inp) + + dx * dx * (2 * ipp - 5 * icp + 4 * inp - iap) + + dx * dx * dx * (-ipp + 3 * icp - 3 * inp + iap)); + + const icc = pixelRGBA(imageData, imageWidth, imageHeight, x, y); + + const ipp = + px < 0 || py < 0 + ? icc + : pixelRGBA(imageData, imageWidth, imageHeight, px, py); + const icp = + px < 0 ? icc : pixelRGBA(imageData, imageWidth, imageHeight, x, py); + const inp = + py < 0 || nx >= imageWidth + ? icc + : pixelRGBA(imageData, imageWidth, imageHeight, nx, py); + const iap = + ax >= imageWidth || py < 0 + ? icc + : pixelRGBA(imageData, imageWidth, imageHeight, ax, py); + + const ip0 = cubic(dx, ipp.r!, icp.r!, inp.r!, iap.r!); + const ip1 = cubic(dx, ipp.g!, icp.g!, inp.g!, iap.g!); + const ip2 = cubic(dx, ipp.b!, icp.b!, inp.b!, iap.b!); + // const ip3 = cubic(dx, ipp.a, icp.a, inp.a, iap.a); + + const ipc = + px < 0 ? icc : pixelRGBA(imageData, imageWidth, imageHeight, px, y); + const inc = + nx >= imageWidth + ? icc + : pixelRGBA(imageData, imageWidth, imageHeight, nx, y); + const iac = + ax >= imageWidth + ? icc + : pixelRGBA(imageData, imageWidth, imageHeight, ax, y); + + const ic0 = cubic(dx, ipc.r!, icc.r!, inc.r!, iac.r!); + const ic1 = cubic(dx, ipc.g!, icc.g!, inc.g!, iac.g!); + const ic2 = cubic(dx, ipc.b!, icc.b!, inc.b!, iac.b!); + // const ic3 = cubic(dx, ipc.a, icc.a, inc.a, iac.a); + + const ipn = + px < 0 || ny >= imageHeight + ? icc + : pixelRGBA(imageData, imageWidth, imageHeight, px, ny); + const icn = + ny >= imageHeight + ? icc + : pixelRGBA(imageData, imageWidth, imageHeight, x, ny); + const inn = + nx >= imageWidth || ny >= imageHeight + ? icc + : pixelRGBA(imageData, imageWidth, imageHeight, nx, ny); + const ian = + ax >= imageWidth || ny >= imageHeight + ? icc + : pixelRGBA(imageData, imageWidth, imageHeight, ax, ny); + + const in0 = cubic(dx, ipn.r!, icn.r!, inn.r!, ian.r!); + const in1 = cubic(dx, ipn.g!, icn.g!, inn.g!, ian.g!); + const in2 = cubic(dx, ipn.b!, icn.b!, inn.b!, ian.b!); + // const in3 = cubic(dx, ipn.a, icn.a, inn.a, ian.a); + + const ipa = + px < 0 || ay >= imageHeight + ? icc + : pixelRGBA(imageData, imageWidth, imageHeight, px, ay); + const ica = + ay >= imageHeight + ? icc + : pixelRGBA(imageData, imageWidth, imageHeight, x, ay); + const ina = + nx >= imageWidth || ay >= imageHeight + ? icc + : pixelRGBA(imageData, imageWidth, imageHeight, nx, ay); + const iaa = + ax >= imageWidth || ay >= imageHeight + ? icc + : pixelRGBA(imageData, imageWidth, imageHeight, ax, ay); + + const ia0 = cubic(dx, ipa.r!, ica.r!, ina.r!, iaa.r!); + const ia1 = cubic(dx, ipa.g!, ica.g!, ina.g!, iaa.g!); + const ia2 = cubic(dx, ipa.b!, ica.b!, ina.b!, iaa.b!); + // const ia3 = cubic(dx, ipa.a, ica.a, ina.a, iaa.a); + + const c0 = Math.trunc(clamp(cubic(dy, ip0, ic0, in0, ia0), 0, 255)); + const c1 = Math.trunc(clamp(cubic(dy, ip1, ic1, in1, ia1), 0, 255)); + const c2 = Math.trunc(clamp(cubic(dy, ip2, ic2, in2, ia2), 0, 255)); + // const c3 = cubic(dy, ip3, ic3, in3, ia3); + + return { r: c0, g: c1, b: c2 }; +}; + +// NOTE: exact duplicate of the function in web/apps/photos/src/services/face/image.ts +const clamp = (value: number, min: number, max: number) => + Math.min(max, Math.max(min, value)); + +// NOTE: exact duplicate of the function in web/apps/photos/src/services/face/image.ts +const pixelRGBA = ( + imageData: Uint8Array, + width: number, + height: number, + x: number, + y: number, +) => { + if (x < 0 || x >= width || y < 0 || y >= height) { + return { r: 0, g: 0, b: 0, a: 0 }; + } + const index = (y * width + x) * 4; + return { + r: imageData[index], + g: imageData[index + 1], + b: imageData[index + 2], + a: imageData[index + 3], + }; }; const normalizeEmbedding = (embedding: Float32Array) => {