From 370535dac23ec0f2c61e7e59f8fedbfe9c1320fd Mon Sep 17 00:00:00 2001 From: Shiv Singh Date: Tue, 28 Jan 2025 12:36:27 +0530 Subject: [PATCH] Update wpress-extract.js Some updated to make it faster 1. Error Handling: readBlockToFile Error Handling: The readBlockToFile function doesn't handle potential errors during file writing (e.g., disk full, permissions issues). It should incorporate proper error handling, likely by wrapping the file writing logic in a try...catch block and propagating any errors to the caller. outputStream.close() in readBlockToFile: Calling outputStream.close() is unnecessary and potentially problematic. close() is used to signal the end of writing to the stream manually, but when you are piping or writing data in chunks, the end event should be used to handle the completion. Closing it prematurely can truncate the file. 2. Efficiency and Resource Management: readHeader Buffer Reuse: The readHeader function allocates a new headerChunk buffer on every call. A better approach would be to allocate this buffer once outside the loop and reuse it for each header read. Stream Handling in readBlockToFile: The current code reads the entire file content into memory using a while loop and Buffer.alloc for each chunk. This is not ideal for large files, leading to high memory consumption. A more efficient solution is to directly pipe the input stream to the output stream. Node.js streams are designed for this kind of data flow and handle buffering and memory management much better. fse.ensureDirSync in the Loop: The fse.ensureDirSync call inside the readBlockToFile loop is executed for every file. This is redundant if files share the same parent directory. It can be optimized by caching directories that have already been created. 3. Readability and Maintainability: Constants for Magic Numbers: The code uses magic numbers like 255, 269, 281, and 512. These should be replaced with named constants to improve readability and maintainability. Consistent Variable Naming: Some variables use camelCase (_inputFile), and others use snake_case (countFiles). Stick to a consistent naming convention (camelCase is generally preferred for JavaScript). Comments: While there are some comments, they can be more descriptive. Add comments to explain the purpose of complex logic and the meaning of calculations (e.g., why offset = offset + HEADER_SIZE + header.size). Function Decomposition: The wpExtract function is relatively long. Consider breaking it down into smaller, more manageable functions. For instance, the logic for handling the output directory could be extracted into a separate function. 4. readFromBuffer Optimization: The readFromBuffer function can be optimized. Instead of slicing the buffer twice, you can find the index of the null character and slice only once. --- lib/wpress-extract.js | 206 ++++++++++++++++++++++++------------------ 1 file changed, 116 insertions(+), 90 deletions(-) diff --git a/lib/wpress-extract.js b/lib/wpress-extract.js index 1a33666..9269098 100644 --- a/lib/wpress-extract.js +++ b/lib/wpress-extract.js @@ -2,119 +2,145 @@ const fse = require('fs-extra'); const fs = require('fs'); const path = require('path'); -const HEADER_SIZE = 4377; // length of the header -const HEADER_CHUNK_EOF = Buffer.alloc(HEADER_SIZE); // Empty header used for check if we reached the end +// Constants for header structure +const HEADER_SIZE = 4377; +const NAME_END = 255; +const SIZE_END = 269; +const MTIME_END = 281; +const CHUNK_SIZE = 512; + +// Empty header for EOF check +const HEADER_CHUNK_EOF = Buffer.alloc(HEADER_SIZE); + +// Reusable buffer for header reads +const headerChunk = Buffer.alloc(HEADER_SIZE); + +// Cache for created directories +const createdDirs = new Set(); function isDirEmpty(dirname) { - return fs.promises.readdir(dirname).then((files) => { - return files.length === 0; - }); + return fs.promises.readdir(dirname).then((files) => files.length === 0); } function readFromBuffer(buffer, start, end) { - const _buffer = buffer.slice(start, end); - // Trim off the empty bytes - return _buffer.slice(0, _buffer.indexOf(0x00)).toString(); + const nullIndex = buffer.indexOf(0x00, start); + const endSlice = nullIndex === -1 || nullIndex > end ? end : nullIndex; + return buffer.toString('utf8', start, endSlice); } async function readHeader(fd) { - const headerChunk = Buffer.alloc(HEADER_SIZE); - await fd.read(headerChunk, 0, HEADER_SIZE, null); - - // Reached end of file - if (Buffer.compare(headerChunk, HEADER_CHUNK_EOF) === 0) { - return null; - } - - const name = readFromBuffer(headerChunk, 0, 255); - const size = parseInt(readFromBuffer(headerChunk, 255, 269), 10); - const mTime = readFromBuffer(headerChunk, 269, 281); - const prefix = readFromBuffer(headerChunk, 281, HEADER_SIZE); - - return { - name, - size, - mTime, - prefix, - }; -} - -async function readBlockToFile(fd, header, outputPath) { - const outputFilePath = path.join(outputPath, header.prefix, header.name); - fse.ensureDirSync(path.dirname(outputFilePath)); - const outputStream = fs.createWriteStream(outputFilePath); + const { bytesRead } = await fd.read(headerChunk, 0, HEADER_SIZE, null); - let totalBytesToRead = header.size; - while (true) { - let bytesToRead = 512; - if (bytesToRead > totalBytesToRead) { - bytesToRead = totalBytesToRead; + // Check if we actually read the full header size before comparing + if (bytesRead !== HEADER_SIZE) { + return null; // Reached end of file or an unexpected error } - if (bytesToRead === 0) { - break; + if (Buffer.compare(headerChunk, HEADER_CHUNK_EOF) === 0) { + return null; // Reached end of file } - const buffer = Buffer.alloc(bytesToRead); - const data = await fd.read(buffer, 0, bytesToRead, null); - outputStream.write(buffer); + const name = readFromBuffer(headerChunk, 0, NAME_END); + const size = parseInt(readFromBuffer(headerChunk, NAME_END, SIZE_END), 10); + const mTime = readFromBuffer(headerChunk, SIZE_END, MTIME_END); + const prefix = readFromBuffer(headerChunk, MTIME_END, HEADER_SIZE); - totalBytesToRead -= data.bytesRead; - } - - outputStream.close(); + return { name, size, mTime, prefix }; } -module.exports = async function wpExtract({ - inputFile: _inputFile, - outputDir, - onStart, - onUpdate, - onFinish, - override, -}) { - if (!fs.existsSync(_inputFile)) { - throw new Error( - `Input file at location "${_inputFile}" could not be found.` - ); - } - - if (override) { - // Ensure the output dir exists and is empty - fse.emptyDirSync(outputDir); - } else { - if (fs.existsSync(outputDir) && !(await isDirEmpty(outputDir))) { - throw new Error( - `Output dir is not empty. Clear it first or use the --force option to override it.` - ); +async function readBlockToFile(fd, header, outputDir) { + const outputFilePath = path.join(outputDir, header.prefix, header.name); + const outputDirPath = path.dirname(outputFilePath); + + // Optimize directory creation + if (!createdDirs.has(outputDirPath)) { + await fse.ensureDir(outputDirPath); + createdDirs.add(outputDirPath); } - } - const inputFileStat = fs.statSync(_inputFile); - const inputFile = await fs.promises.open(_inputFile, 'r'); + const outputStream = fs.createWriteStream(outputFilePath); + + return new Promise((resolve, reject) => { + const stream = fd.createReadStream({ + start: fd.bytesRead, //Start from where the header ended + highWaterMark: CHUNK_SIZE, + }); - // Trigger onStart callback - onStart(inputFileStat.size); + stream.pipe(outputStream); - let offset = 0; - let countFiles = 0; + let totalBytesRead = 0; + stream.on('data', (chunk) => { + totalBytesRead += chunk.length; + }); - while (true) { - const header = await readHeader(inputFile); - if (!header) { - break; + stream.on('end', async () => { + if (totalBytesRead !== header.size) { + // If you want to seek to the next header position + await fd.read(Buffer.alloc(0), 0, 0, fd.bytesRead + header.size - totalBytesRead); + } + resolve(); + }); + + stream.on('error', reject); + outputStream.on('error', reject); + }); +} + +async function ensureOutputDir(outputDir, override) { + if (override) { + await fse.emptyDir(outputDir); + } else { + if (fs.existsSync(outputDir) && !(await isDirEmpty(outputDir))) { + throw new Error( + `Output dir is not empty. Clear it first or use the --force option to override it.` + ); + } } +} - await readBlockToFile(inputFile, header, outputDir); - offset = offset + HEADER_SIZE + header.size; - countFiles++; +async function wpExtract({ + inputFile: inputFilePath, + outputDir, + onStart, + onUpdate, + onFinish, + override, +}) { + if (!fs.existsSync(inputFilePath)) { + throw new Error(`Input file at location "${inputFilePath}" could not be found.`); + } - // Trigger onUpdate callback - onUpdate(offset); - } + await ensureOutputDir(outputDir, override); + + const inputFileStat = fs.statSync(inputFilePath); + const inputFile = await fs.promises.open(inputFilePath, 'r'); + + onStart(inputFileStat.size); + + let offset = 0; + let countFiles = 0; + try { + while (true) { + const header = await readHeader(inputFile); + if (!header) { + break; + } + + await readBlockToFile(inputFile, header, outputDir); + offset = offset + HEADER_SIZE + header.size; + countFiles++; + + onUpdate(offset); + } + } catch (error) { + // Handle errors during extraction + console.error("Error during extraction:", error); + throw error; // Re-throw to allow for handling at a higher level if needed + } finally { + await inputFile.close(); + } - await inputFile.close(); + onFinish(countFiles); +} - // Trigger onFinish callback - onFinish(countFiles); -}; +module.exports = wpExtract;