diff --git a/desktop/docs/dependencies.md b/desktop/docs/dependencies.md index a9e92b50d8..31ac013199 100644 --- a/desktop/docs/dependencies.md +++ b/desktop/docs/dependencies.md @@ -128,3 +128,6 @@ watcher for the watch folders functionality. [node-stream-zip](https://github.com/antelle/node-stream-zip) is used for reading of large ZIP files (e.g. during imports of Google Takeout ZIPs). + +[lru-cache](https://github.com/isaacs/node-lru-cache) is used to cache ZIP file +handles to avoid reopening them for every operation. diff --git a/desktop/package.json b/desktop/package.json index 29894c611f..286a0a9f91 100644 @@ -34,6 +34,7 @@ "ffmpeg-static": "^5.2", "html-entities": "^2.5", "jpeg-js": "^0.4", + "lru-cache": "^10.2", "next-electron-server": "^1", "node-stream-zip": "^1.15", "onnxruntime-node": "^1.18" diff --git a/desktop/src/main/services/logout.ts b/desktop/src/main/services/logout.ts index 37e73309a4..ab031b9112 100644 --- a/desktop/src/main/services/logout.ts +++ b/desktop/src/main/services/logout.ts @@ -3,6 +3,7 @@ import log from "../log"; import { clearConvertToMP4Results } from "../stream"; import { clearStores } from "./store"; import { watchReset } from "./watch"; +import { clearOpenZipCache } from "./zip"; /** * Perform the native side logout sequence. 
@@ -30,4 +31,9 @@ export const logout = (watcher: FSWatcher) => { } catch (e) { ignoreError("native stores", e); } + try { + clearOpenZipCache(); + } catch (e) { + ignoreError("zip cache", e); + } }; diff --git a/desktop/src/main/services/upload.ts b/desktop/src/main/services/upload.ts index 516fbe6dde..ecd50df779 100644 --- a/desktop/src/main/services/upload.ts +++ b/desktop/src/main/services/upload.ts @@ -1,13 +1,13 @@ -import StreamZip from "node-stream-zip"; import fs from "node:fs/promises"; import path from "node:path"; import { existsSync } from "original-fs"; import type { PendingUploads, ZipItem } from "../../types/ipc"; import log from "../log"; import { uploadStatusStore } from "../stores/upload-status"; +import { clearOpenZipCache, markClosableZip, openZip } from "./zip"; export const listZipItems = async (zipPath: string): Promise => { - const zip = new StreamZip.async({ file: zipPath }); + const zip = openZip(zipPath); const entries = await zip.entries(); const entryNames: string[] = []; @@ -21,7 +21,7 @@ export const listZipItems = async (zipPath: string): Promise => { } } - await zip.close(); + markClosableZip(zipPath); return entryNames.map((entryName) => [zipPath, entryName]); }; @@ -34,7 +34,7 @@ export const pathOrZipItemSize = async ( return stat.size; } else { const [zipPath, entryName] = pathOrZipItem; - const zip = new StreamZip.async({ file: zipPath }); + const zip = openZip(zipPath); const entry = await zip.entry(entryName); if (!entry) throw new Error( @@ -152,4 +152,7 @@ export const markUploadedZipItems = ( uploadStatusStore.set("zipItems", updated); }; -export const clearPendingUploads = () => uploadStatusStore.clear(); +export const clearPendingUploads = () => { + uploadStatusStore.clear(); + clearOpenZipCache(); +}; diff --git a/desktop/src/main/services/zip.ts b/desktop/src/main/services/zip.ts new file mode 100644 index 0000000000..5a7f4242f0 --- /dev/null +++ b/desktop/src/main/services/zip.ts @@ -0,0 +1,74 @@ +import { LRUCache } 
from "lru-cache"; +import StreamZip from "node-stream-zip"; + +/** The cache. */ +const _cache = new LRUCache({ + max: 50, + disposeAfter: (zip, zipPath) => { + if (_refCount.has(zipPath)) { + // Add it back again. + _cache.set(zipPath, zip); + } else { + void zip.close(); + } + }, +}); + +/** Reference count. */ +const _refCount = new Map(); + +/** + * Cached `StreamZip.async`s + * + * This function uses an LRU cache to cache handles to zip files indexed by + * their path. + * + * To clear the cache (which is a good idea to avoid having open file handles + * lying around), use {@link clearOpenZipCache}. + * + * Why was this needed + * ------------------- + * + * Caching the StreamZip file handles _significantly_ (hours => seconds) + * improves the performance of the metadata parsing step during import of large + * Google Takeout zips. + * + * In ad-hoc tests, it seems that beyond a certain zip size (few GBs), reopening + * the handle to a stream zip overshadows the time taken to read the individual + * JSONs. + */ +export const openZip = (zipPath: string) => { + let result = _cache.get(zipPath); + if (!result) { + result = new StreamZip.async({ file: zipPath }); + _cache.set(zipPath, result); + } + _refCount.set(zipPath, (_refCount.get(zipPath) ?? 0) + 1); + return result; +}; + +/** + * Indicate to our cache that an item we opened earlier using {@link openZip} + * can now be safely closed. + * + * @param zipPath The key that was used for opening this zip. + */ +export const markClosableZip = (zipPath: string) => { + const rc = _refCount.get(zipPath); + if (!rc) throw new Error(`Double close for ${zipPath}`); + if (rc == 1) _refCount.delete(zipPath); + else _refCount.set(zipPath, rc - 1); +}; + +/** + * Clear any entries previously cached by {@link openZip}. 
+ */ +export const clearOpenZipCache = () => { + if (_refCount.size > 0) { + const keys = JSON.stringify([..._refCount.keys()]); + throw new Error( + `Attempting to clear zip file cache when some items are still in use: ${keys}`, + ); + } + _cache.clear(); +}; diff --git a/desktop/src/main/stream.ts b/desktop/src/main/stream.ts index c11fb1121c..38feb192af 100644 --- a/desktop/src/main/stream.ts +++ b/desktop/src/main/stream.ts @@ -2,7 +2,6 @@ * @file stream data to-from renderer using a custom protocol handler. */ import { net, protocol } from "electron/main"; -import StreamZip from "node-stream-zip"; import { randomUUID } from "node:crypto"; import { createWriteStream, existsSync } from "node:fs"; import fs from "node:fs/promises"; @@ -11,6 +10,7 @@ import { ReadableStream } from "node:stream/web"; import { pathToFileURL } from "node:url"; import log from "./log"; import { ffmpegConvertToMP4 } from "./services/ffmpeg"; +import { markClosableZip, openZip } from "./services/zip"; import { ensure } from "./utils/common"; import { deleteTempFile, @@ -113,7 +113,7 @@ const handleRead = async (path: string) => { }; const handleReadZip = async (zipPath: string, entryName: string) => { - const zip = new StreamZip.async({ file: zipPath }); + const zip = openZip(zipPath); const entry = await zip.entry(entryName); if (!entry) return new Response("", { status: 404 }); @@ -130,7 +130,7 @@ const handleReadZip = async (zipPath: string, entryName: string) => { webReadableStreamAny as ReadableStream; // Close the zip handle when the underlying stream closes. - stream.on("end", () => void zip.close()); + stream.on("end", () => markClosableZip(zipPath)); // While it is documented that entry.time is the modification time, // the units are not mentioned. 
By seeing the source code, we can diff --git a/desktop/src/main/utils/temp.ts b/desktop/src/main/utils/temp.ts index 70dec844d6..912264387a 100644 --- a/desktop/src/main/utils/temp.ts +++ b/desktop/src/main/utils/temp.ts @@ -1,10 +1,10 @@ import { app } from "electron/main"; -import StreamZip from "node-stream-zip"; import { existsSync } from "node:fs"; import fs from "node:fs/promises"; import path from "node:path"; import type { ZipItem } from "../../types/ipc"; import log from "../log"; +import { markClosableZip, openZip } from "../services/zip"; import { ensure } from "./common"; /** @@ -128,9 +128,9 @@ export const makeFileForDataOrPathOrZipItem = async ( } else { writeToTemporaryFile = async () => { const [zipPath, entryName] = dataOrPathOrZipItem; - const zip = new StreamZip.async({ file: zipPath }); + const zip = openZip(zipPath); await zip.extract(entryName, path); - await zip.close(); + markClosableZip(zipPath); }; } } diff --git a/desktop/yarn.lock b/desktop/yarn.lock index 78f98106ef..b8143820e3 100644 --- a/desktop/yarn.lock +++ b/desktop/yarn.lock @@ -2259,7 +2259,7 @@ lowercase-keys@^2.0.0: resolved "https://registry.yarnpkg.com/lowercase-keys/-/lowercase-keys-2.0.0.tgz#2603e78b7b4b0006cbca2fbcc8a3202558ac9479" integrity sha512-tqNXrS78oMOE73NMxK4EMLQsQowWf8jKooH9g7xPavRT706R6bkQJ6DY2Te7QukaZsulxa30wQ7bk0pm4XiHmA== -lru-cache@^10.2.0: +lru-cache@^10.2, lru-cache@^10.2.0: version "10.2.2" resolved "https://registry.yarnpkg.com/lru-cache/-/lru-cache-10.2.2.tgz#48206bc114c1252940c41b25b41af5b545aca878" integrity sha512-9hp3Vp2/hFQUiIwKo8XCeFVnrg8Pk3TYNPIR7tJADKi5YfcF7vEaK7avFHTlSy3kOKYaJQaalfEo6YuXdceBOQ== diff --git a/web/packages/next/types/ipc.ts b/web/packages/next/types/ipc.ts index 646ec79127..21246ea660 100644 --- a/web/packages/next/types/ipc.ts +++ b/web/packages/next/types/ipc.ts @@ -567,6 +567,9 @@ export interface Electron { /** * Clear any pending uploads. 
+ * + * This is also taken by the Node.js layer as a signal to clear any cached + * state it has kept around to speed up the upload. */ clearPendingUploads: () => Promise; }