[desktop] Cache the handle to the zip files to improve metadata parsing speed (#2287)
This should _significantly_ (hours => seconds) improve the performance of the metadata parsing step when importing large Google Takeout zips, bringing it on par with the user having drag-and-dropped the unzipped folder instead. In my ad-hoc tests, it seems that beyond a certain zip size (a few GBs), reopening the handle to a stream zip overshadows the time taken to read the individual JSONs, and this effect compounds for bigger zips to the point where the metadata parsing step takes hours. Note that I've only tested this on synthetic exemplars I created; after merging it'll also need testing on more realistic, huge Takeout exports.
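The gist of the change, as a rough before/after sketch of a caller (the helper below is hypothetical and only for illustration; the real cache lives in `desktop/src/main/services/zip.ts` in the diff, and the callers there invoke `markClosableZip` directly after use rather than via `try`/`finally`):

```ts
import { markClosableZip, openZip } from "./services/zip";

// Previously each metadata read did `new StreamZip.async({ file: zipPath })`
// followed by `await zip.close()`, paying the open cost for every entry.
const readZipEntry = async (zipPath: string, entryName: string) => {
    const zip = openZip(zipPath); // cached StreamZip.async handle, keyed by path
    try {
        return await zip.entryData(entryName);
    } finally {
        // Tell the cache this caller is done; the LRU decides when to actually close.
        markClosableZip(zipPath);
    }
};
```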
@@ -128,3 +128,6 @@ watcher for the watch folders functionality.
 
 [node-stream-zip](https://github.com/antelle/node-stream-zip) is used for
 reading of large ZIP files (e.g. during imports of Google Takeout ZIPs).
+
+[lru-cache](https://github.com/isaacs/node-lru-cache) is used to cache ZIP file
+handles to avoid reopening them for every operation.
@@ -34,6 +34,7 @@
         "ffmpeg-static": "^5.2",
         "html-entities": "^2.5",
         "jpeg-js": "^0.4",
+        "lru-cache": "^10.2",
         "next-electron-server": "^1",
         "node-stream-zip": "^1.15",
         "onnxruntime-node": "^1.18"
@@ -3,6 +3,7 @@ import log from "../log";
 import { clearConvertToMP4Results } from "../stream";
 import { clearStores } from "./store";
 import { watchReset } from "./watch";
+import { clearOpenZipCache } from "./zip";
 
 /**
  * Perform the native side logout sequence.
@@ -30,4 +31,9 @@ export const logout = (watcher: FSWatcher) => {
     } catch (e) {
         ignoreError("native stores", e);
     }
+    try {
+        clearOpenZipCache();
+    } catch (e) {
+        ignoreError("zip cache", e);
+    }
 };
@@ -1,13 +1,13 @@
-import StreamZip from "node-stream-zip";
 import fs from "node:fs/promises";
 import path from "node:path";
 import { existsSync } from "original-fs";
 import type { PendingUploads, ZipItem } from "../../types/ipc";
 import log from "../log";
 import { uploadStatusStore } from "../stores/upload-status";
+import { clearOpenZipCache, markClosableZip, openZip } from "./zip";
 
 export const listZipItems = async (zipPath: string): Promise<ZipItem[]> => {
-    const zip = new StreamZip.async({ file: zipPath });
+    const zip = openZip(zipPath);
 
     const entries = await zip.entries();
     const entryNames: string[] = [];
@@ -21,7 +21,7 @@ export const listZipItems = async (zipPath: string): Promise<ZipItem[]> => {
         }
     }
 
-    await zip.close();
+    markClosableZip(zipPath);
 
     return entryNames.map((entryName) => [zipPath, entryName]);
 };
@@ -34,7 +34,7 @@ export const pathOrZipItemSize = async (
         return stat.size;
     } else {
         const [zipPath, entryName] = pathOrZipItem;
-        const zip = new StreamZip.async({ file: zipPath });
+        const zip = openZip(zipPath);
         const entry = await zip.entry(entryName);
         if (!entry)
             throw new Error(
@@ -152,4 +152,7 @@ export const markUploadedZipItems = (
     uploadStatusStore.set("zipItems", updated);
 };
 
-export const clearPendingUploads = () => uploadStatusStore.clear();
+export const clearPendingUploads = () => {
+    uploadStatusStore.clear();
+    clearOpenZipCache();
+};
desktop/src/main/services/zip.ts (new file, 74 lines)
@@ -0,0 +1,74 @@
+import { LRUCache } from "lru-cache";
+import StreamZip from "node-stream-zip";
+
+/** The cache. */
+const _cache = new LRUCache<string, StreamZip.StreamZipAsync>({
+    max: 50,
+    disposeAfter: (zip, zipPath) => {
+        if (_refCount.has(zipPath)) {
+            // Add it back again.
+            _cache.set(zipPath, zip);
+        } else {
+            void zip.close();
+        }
+    },
+});
+
+/** Reference count. */
+const _refCount = new Map<string, number>();
+
+/**
+ * Cached `StreamZip.async`s
+ *
+ * This function uses an LRU cache to cache handles to zip files indexed by
+ * their path.
+ *
+ * To clear the cache (which is a good idea to avoid having open file handles
+ * lying around), use {@link clearOpenZipCache}.
+ *
+ * Why was this needed
+ * -------------------
+ *
+ * Caching the StreamZip file handles _significantly_ (hours => seconds)
+ * improves the performance of the metadata parsing step during import of large
+ * Google Takeout zips.
+ *
+ * In ad-hoc tests, it seems that beyond a certain zip size (few GBs), reopening
+ * the handle to a stream zip overshadows the time taken to read the individual
+ * JSONs.
+ */
+export const openZip = (zipPath: string) => {
+    let result = _cache.get(zipPath);
+    if (!result) {
+        result = new StreamZip.async({ file: zipPath });
+        _cache.set(zipPath, result);
+    }
+    _refCount.set(zipPath, (_refCount.get(zipPath) ?? 0) + 1);
+    return result;
+};
+
+/**
+ * Indicate to our cache that an item we opened earlier using {@link openZip}
+ * can now be safely closed.
+ *
+ * @param zipPath The key that was used for opening this zip.
+ */
+export const markClosableZip = (zipPath: string) => {
+    const rc = _refCount.get(zipPath);
+    if (!rc) throw new Error(`Double close for ${zipPath}`);
+    if (rc == 1) _refCount.delete(zipPath);
+    else _refCount.set(zipPath, rc - 1);
+};
+
+/**
+ * Clear any entries previously cached by {@link openZip}.
+ */
+export const clearOpenZipCache = () => {
+    if (_refCount.size > 0) {
+        const keys = JSON.stringify([..._refCount.keys()]);
+        throw new Error(
+            `Attempting to clear zip file cache when some items are still in use: ${keys}`,
+        );
+    }
+    _cache.clear();
+};
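As a reviewer aid (not part of the diff), a minimal sketch of how the three exports above are meant to be paired: every `openZip` is matched by a `markClosableZip`, and `clearOpenZipCache` runs only once nothing is in flight, e.g. on logout or when pending uploads are cleared. The caller below is hypothetical:

```ts
import { clearOpenZipCache, markClosableZip, openZip } from "./services/zip";

const listEntryNames = async (zipPath: string) => {
    const zip = openZip(zipPath); // bumps the ref count for zipPath
    const entries = await zip.entries();
    markClosableZip(zipPath); // drops it; the LRU may now evict and close the handle
    return Object.keys(entries);
};

// Later, once no zip is still marked as in use (otherwise this throws):
clearOpenZipCache();
```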
@@ -2,7 +2,6 @@
  * @file stream data to-from renderer using a custom protocol handler.
  */
 import { net, protocol } from "electron/main";
-import StreamZip from "node-stream-zip";
 import { randomUUID } from "node:crypto";
 import { createWriteStream, existsSync } from "node:fs";
 import fs from "node:fs/promises";
@@ -11,6 +10,7 @@ import { ReadableStream } from "node:stream/web";
 import { pathToFileURL } from "node:url";
 import log from "./log";
 import { ffmpegConvertToMP4 } from "./services/ffmpeg";
+import { markClosableZip, openZip } from "./services/zip";
 import { ensure } from "./utils/common";
 import {
     deleteTempFile,
@@ -113,7 +113,7 @@ const handleRead = async (path: string) => {
 };
 
 const handleReadZip = async (zipPath: string, entryName: string) => {
-    const zip = new StreamZip.async({ file: zipPath });
+    const zip = openZip(zipPath);
     const entry = await zip.entry(entryName);
     if (!entry) return new Response("", { status: 404 });
 
@@ -130,7 +130,7 @@ const handleReadZip = async (zipPath: string, entryName: string) => {
         webReadableStreamAny as ReadableStream<Uint8Array>;
 
     // Close the zip handle when the underlying stream closes.
-    stream.on("end", () => void zip.close());
+    stream.on("end", () => markClosableZip(zipPath));
 
     // While it is documented that entry.time is the modification time,
     // the units are not mentioned. By seeing the source code, we can
@@ -1,10 +1,10 @@
 import { app } from "electron/main";
-import StreamZip from "node-stream-zip";
 import { existsSync } from "node:fs";
 import fs from "node:fs/promises";
 import path from "node:path";
 import type { ZipItem } from "../../types/ipc";
 import log from "../log";
+import { markClosableZip, openZip } from "../services/zip";
 import { ensure } from "./common";
 
 /**
@@ -128,9 +128,9 @@ export const makeFileForDataOrPathOrZipItem = async (
         } else {
             writeToTemporaryFile = async () => {
                 const [zipPath, entryName] = dataOrPathOrZipItem;
-                const zip = new StreamZip.async({ file: zipPath });
+                const zip = openZip(zipPath);
                 await zip.extract(entryName, path);
-                await zip.close();
+                markClosableZip(zipPath);
             };
         }
     }
@@ -2259,7 +2259,7 @@ lowercase-keys@^2.0.0:
   resolved "https://registry.yarnpkg.com/lowercase-keys/-/lowercase-keys-2.0.0.tgz#2603e78b7b4b0006cbca2fbcc8a3202558ac9479"
   integrity sha512-tqNXrS78oMOE73NMxK4EMLQsQowWf8jKooH9g7xPavRT706R6bkQJ6DY2Te7QukaZsulxa30wQ7bk0pm4XiHmA==
 
-lru-cache@^10.2.0:
+lru-cache@^10.2, lru-cache@^10.2.0:
   version "10.2.2"
   resolved "https://registry.yarnpkg.com/lru-cache/-/lru-cache-10.2.2.tgz#48206bc114c1252940c41b25b41af5b545aca878"
   integrity sha512-9hp3Vp2/hFQUiIwKo8XCeFVnrg8Pk3TYNPIR7tJADKi5YfcF7vEaK7avFHTlSy3kOKYaJQaalfEo6YuXdceBOQ==
@@ -567,6 +567,9 @@ export interface Electron {
 
     /**
      * Clear any pending uploads.
+     *
+     * This is also taken by the Node.js layer as a signal to clear any cached
+     * state it has kept around to speed up the upload.
      */
     clearPendingUploads: () => Promise<void>;
 }