[desktop] Cache the handle to the zip files to improve metadata parsing speed (#2287)

This should _significantly_ (hours => seconds) improve the performance
of the metadata parsing step during imports of large Google Takeout
zips, bringing it on par with the case where the user drags and drops
the unzipped folder instead.

In my ad-hoc tests, it seems that beyond a certain zip size (a few
GBs), reopening the handle to a stream zip for every operation
overshadows the time taken to read the individual JSONs. This effect
grows with zip size, to the point where the metadata parsing step takes
hours.

Note that I've only tested this on synthetic exemplars that I created;
after merging, it will also need testing on more realistic, huge
Takeout exports.
Manav Rathi, 2024-06-25 13:17:04 +05:30 (committed by GitHub)
9 changed files with 102 additions and 12 deletions
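
For context, the call-site pattern that the diffs below switch to: instead of constructing a fresh `StreamZip.async` and closing it after every operation, callers now obtain a (possibly cached) handle via `openZip` and release it with `markClosableZip`. A minimal sketch of that discipline; the `readEntry` helper and its import path are illustrative, not part of the commit:

```ts
import { markClosableZip, openZip } from "./services/zip";

// Illustrative helper: read one entry's bytes through the cached handle.
// `openZip` bumps a per-path reference count; `markClosableZip` drops it,
// allowing the LRU cache to actually close the handle once it is both
// evicted and no longer referenced.
const readEntry = async (zipPath: string, entryName: string) => {
    const zip = openZip(zipPath);
    try {
        return await zip.entryData(entryName);
    } finally {
        markClosableZip(zipPath);
    }
};
```

(The call sites in the diff pair the two calls directly, without try/finally; the sketch adds it only for safety.)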

View File

@@ -128,3 +128,6 @@ watcher for the watch folders functionality.
 
 [node-stream-zip](https://github.com/antelle/node-stream-zip) is used for
 reading of large ZIP files (e.g. during imports of Google Takeout ZIPs).
+
+[lru-cache](https://github.com/isaacs/node-lru-cache) is used to cache ZIP file
+handles to avoid reopening them for every operation.

View File

@@ -34,6 +34,7 @@
         "ffmpeg-static": "^5.2",
         "html-entities": "^2.5",
         "jpeg-js": "^0.4",
+        "lru-cache": "^10.2",
         "next-electron-server": "^1",
         "node-stream-zip": "^1.15",
         "onnxruntime-node": "^1.18"

View File

@@ -3,6 +3,7 @@ import log from "../log";
 import { clearConvertToMP4Results } from "../stream";
 import { clearStores } from "./store";
 import { watchReset } from "./watch";
+import { clearOpenZipCache } from "./zip";
 
 /**
  * Perform the native side logout sequence.
@@ -30,4 +31,9 @@ export const logout = (watcher: FSWatcher) => {
     } catch (e) {
         ignoreError("native stores", e);
     }
+    try {
+        clearOpenZipCache();
+    } catch (e) {
+        ignoreError("zip cache", e);
+    }
 };

View File

@@ -1,13 +1,13 @@
-import StreamZip from "node-stream-zip";
 import fs from "node:fs/promises";
 import path from "node:path";
 import { existsSync } from "original-fs";
 import type { PendingUploads, ZipItem } from "../../types/ipc";
 import log from "../log";
 import { uploadStatusStore } from "../stores/upload-status";
+import { clearOpenZipCache, markClosableZip, openZip } from "./zip";
 
 export const listZipItems = async (zipPath: string): Promise<ZipItem[]> => {
-    const zip = new StreamZip.async({ file: zipPath });
+    const zip = openZip(zipPath);
 
     const entries = await zip.entries();
     const entryNames: string[] = [];
@@ -21,7 +21,7 @@ export const listZipItems = async (zipPath: string): Promise<ZipItem[]> => {
         }
     }
 
-    await zip.close();
+    markClosableZip(zipPath);
 
     return entryNames.map((entryName) => [zipPath, entryName]);
 };
@@ -34,7 +34,7 @@ export const pathOrZipItemSize = async (
         return stat.size;
     } else {
         const [zipPath, entryName] = pathOrZipItem;
-        const zip = new StreamZip.async({ file: zipPath });
+        const zip = openZip(zipPath);
         const entry = await zip.entry(entryName);
         if (!entry)
             throw new Error(
@@ -152,4 +152,7 @@ export const markUploadedZipItems = (
     uploadStatusStore.set("zipItems", updated);
 };
 
-export const clearPendingUploads = () => uploadStatusStore.clear();
+export const clearPendingUploads = () => {
+    uploadStatusStore.clear();
+    clearOpenZipCache();
+};

View File

@@ -0,0 +1,74 @@
+import { LRUCache } from "lru-cache";
+import StreamZip from "node-stream-zip";
+
+/** The cache. */
+const _cache = new LRUCache<string, StreamZip.StreamZipAsync>({
+    max: 50,
+    disposeAfter: (zip, zipPath) => {
+        if (_refCount.has(zipPath)) {
+            // Add it back again.
+            _cache.set(zipPath, zip);
+        } else {
+            void zip.close();
+        }
+    },
+});
+
+/** Reference count. */
+const _refCount = new Map<string, number>();
+
+/**
+ * Cached `StreamZip.async`s
+ *
+ * This function uses an LRU cache to cache handles to zip files, indexed by
+ * their path.
+ *
+ * To clear the cache (which is a good idea to avoid having open file handles
+ * lying around), use {@link clearOpenZipCache}.
+ *
+ * Why was this needed
+ * -------------------
+ *
+ * Caching the StreamZip file handles _significantly_ (hours => seconds)
+ * improves the performance of the metadata parsing step during import of large
+ * Google Takeout zips.
+ *
+ * In ad-hoc tests, it seems that beyond a certain zip size (a few GBs),
+ * reopening the handle to a stream zip overshadows the time taken to read the
+ * individual JSONs.
+ */
+export const openZip = (zipPath: string) => {
+    let result = _cache.get(zipPath);
+    if (!result) {
+        result = new StreamZip.async({ file: zipPath });
+        _cache.set(zipPath, result);
+    }
+    _refCount.set(zipPath, (_refCount.get(zipPath) ?? 0) + 1);
+    return result;
+};
+
+/**
+ * Indicate to our cache that an item we opened earlier using {@link openZip}
+ * can now be safely closed.
+ *
+ * @param zipPath The key that was used for opening this zip.
+ */
+export const markClosableZip = (zipPath: string) => {
+    const rc = _refCount.get(zipPath);
+    if (!rc) throw new Error(`Double close for ${zipPath}`);
+    if (rc == 1) _refCount.delete(zipPath);
+    else _refCount.set(zipPath, rc - 1);
+};
+
+/**
+ * Clear any entries previously cached by {@link openZip}.
+ */
+export const clearOpenZipCache = () => {
+    if (_refCount.size > 0) {
+        const keys = JSON.stringify([..._refCount.keys()]);
+        throw new Error(
+            `Attempting to clear zip file cache when some items are still in use: ${keys}`,
+        );
+    }
+    _cache.clear();
+};
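
As a quick sanity check of the reference counting above, a sketch of the intended lifecycle; the path and the assertion are illustrative, not part of the commit:

```ts
import { clearOpenZipCache, markClosableZip, openZip } from "./zip";

const zipPath = "/tmp/example.zip"; // hypothetical zip on disk

// Two logical opens of the same path share one cached handle.
const a = openZip(zipPath);
const b = openZip(zipPath);
console.assert(a === b, "expected the cached handle to be reused");

markClosableZip(zipPath); // refcount 2 -> 1
// clearOpenZipCache() would throw here: the zip is still in use.
markClosableZip(zipPath); // refcount 1 -> 0; entry forgotten
clearOpenZipCache(); // evicts the handle, and disposeAfter closes it
```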

View File

@@ -2,7 +2,6 @@
  * @file stream data to-from renderer using a custom protocol handler.
  */
 import { net, protocol } from "electron/main";
-import StreamZip from "node-stream-zip";
 import { randomUUID } from "node:crypto";
 import { createWriteStream, existsSync } from "node:fs";
 import fs from "node:fs/promises";
@@ -11,6 +10,7 @@ import { ReadableStream } from "node:stream/web";
 import { pathToFileURL } from "node:url";
 import log from "./log";
 import { ffmpegConvertToMP4 } from "./services/ffmpeg";
+import { markClosableZip, openZip } from "./services/zip";
 import { ensure } from "./utils/common";
 import {
     deleteTempFile,
@@ -113,7 +113,7 @@ const handleRead = async (path: string) => {
 };
 
 const handleReadZip = async (zipPath: string, entryName: string) => {
-    const zip = new StreamZip.async({ file: zipPath });
+    const zip = openZip(zipPath);
 
     const entry = await zip.entry(entryName);
     if (!entry) return new Response("", { status: 404 });
@@ -130,7 +130,7 @@ const handleReadZip = async (zipPath: string, entryName: string) => {
         webReadableStreamAny as ReadableStream<Uint8Array>;
 
     // Close the zip handle when the underlying stream closes.
-    stream.on("end", () => void zip.close());
+    stream.on("end", () => markClosableZip(zipPath));
 
     // While it is documented that entry.time is the modification time,
     // the units are not mentioned. By seeing the source code, we can

View File

@@ -1,10 +1,10 @@
 import { app } from "electron/main";
-import StreamZip from "node-stream-zip";
 import { existsSync } from "node:fs";
 import fs from "node:fs/promises";
 import path from "node:path";
 import type { ZipItem } from "../../types/ipc";
 import log from "../log";
+import { markClosableZip, openZip } from "../services/zip";
 import { ensure } from "./common";
 
 /**
@@ -128,9 +128,9 @@ export const makeFileForDataOrPathOrZipItem = async (
         } else {
             writeToTemporaryFile = async () => {
                 const [zipPath, entryName] = dataOrPathOrZipItem;
-                const zip = new StreamZip.async({ file: zipPath });
+                const zip = openZip(zipPath);
                 await zip.extract(entryName, path);
-                await zip.close();
+                markClosableZip(zipPath);
             };
         }
     }

View File

@@ -2259,7 +2259,7 @@ lowercase-keys@^2.0.0:
   resolved "https://registry.yarnpkg.com/lowercase-keys/-/lowercase-keys-2.0.0.tgz#2603e78b7b4b0006cbca2fbcc8a3202558ac9479"
   integrity sha512-tqNXrS78oMOE73NMxK4EMLQsQowWf8jKooH9g7xPavRT706R6bkQJ6DY2Te7QukaZsulxa30wQ7bk0pm4XiHmA==
 
-lru-cache@^10.2.0:
+lru-cache@^10.2, lru-cache@^10.2.0:
   version "10.2.2"
   resolved "https://registry.yarnpkg.com/lru-cache/-/lru-cache-10.2.2.tgz#48206bc114c1252940c41b25b41af5b545aca878"
   integrity sha512-9hp3Vp2/hFQUiIwKo8XCeFVnrg8Pk3TYNPIR7tJADKi5YfcF7vEaK7avFHTlSy3kOKYaJQaalfEo6YuXdceBOQ==

View File

@@ -567,6 +567,9 @@ export interface Electron {
 
     /**
      * Clear any pending uploads.
+     *
+     * This is also taken by the Node.js layer as a signal to clear any cached
+     * state it has kept around to speed up the upload.
      */
     clearPendingUploads: () => Promise<void>;
 }
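
On the web side, a caller does not interact with the cache directly; clearing pending uploads implicitly clears it. A minimal sketch of a call site, assuming the preload-exposed `electron` bridge object that this interface describes:

```ts
// Illustrative only: the preload script exposes the bridge as a global;
// it is undefined when running in a plain browser context.
declare global {
    var electron: Electron | undefined;
}

const abandonPendingUploads = async () => {
    // Clearing pending uploads also signals the Node.js layer to drop its
    // cached zip handles (clearOpenZipCache on the main process side).
    await globalThis.electron?.clearPendingUploads();
};
```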