fix: pdf loading failures from trailing null padding and cache eviction (#57)

Mythie · web-flow · commit f8cde4a37225 · 2026-03-21T13:52:13.000+11:00
diff --git a/src/objects/pdf-name.test.ts b/src/objects/pdf-name.test.ts
@@ -50,9 +50,9 @@ describe("PdfName", () => {
     expect(PdfName.of("")).toBe(empty);
   });
 
-  describe("LRU cache", () => {
+  describe("WeakRef cache", () => {
     it("clearCache clears non-permanent names", () => {
-      const custom = PdfName.of("CustomName");
+      PdfName.of("CustomName");
       expect(PdfName.cacheSize).toBeGreaterThan(0);
 
       PdfName.clearCache();
@@ -70,5 +70,13 @@ describe("PdfName", () => {
       expect(PdfName.of("Type")).toBe(PdfName.Type);
       expect(PdfName.of("Page")).toBe(PdfName.Page);
     });
+
+    it("returns same instance while strong reference is held", () => {
+      const held = PdfName.of("HeldName");
+
+      // As long as we hold the reference, .of() returns the same instance
+      expect(PdfName.of("HeldName")).toBe(held);
+      expect(PdfName.of("HeldName")).toBe(held);
+    });
   });
 });
diff --git a/src/objects/pdf-name.ts b/src/objects/pdf-name.ts
@@ -1,6 +1,5 @@
 import { HEX_TABLE } from "#src/helpers/buffer";
 import { CHAR_HASH, DELIMITERS, WHITESPACE } from "#src/helpers/chars";
-import { LRUCache } from "#src/helpers/lru-cache";
 import type { ByteWriter } from "#src/io/byte-writer";
 
 import type { PdfPrimitive } from "./pdf-primitive";
@@ -60,37 +59,52 @@ function escapeName(name: string): string {
 }
 
 /**
- * Default cache size for PdfName interning.
- * Can be overridden via PdfName.setCacheSize().
- */
-const DEFAULT_NAME_CACHE_SIZE = 10000;
-
-/**
- * PDF name object (interned).
+ * PDF name object (interned via WeakRef).
  *
  * In PDF: `/Type`, `/Page`, `/Length`
  *
- * Names are interned using an LRU cache to prevent unbounded memory growth.
- * `PdfName.of("Type") === PdfName.of("Type")` as long as both are in cache.
- * Use `.of()` to get or create instances.
+ * Names are interned using a WeakRef cache: as long as any live object
+ * (e.g. a PdfDict key) holds a strong reference to a PdfName, calling
+ * `PdfName.of()` with the same string returns the *same instance*.
+ * Once all strong references are dropped, the GC may collect the
+ * PdfName and a FinalizationRegistry cleans up the cache entry.
+ *
+ * This avoids the correctness bug of LRU-based caching, where eviction
+ * of a still-referenced name would break Map key identity in PdfDict.
  *
- * Common PDF names (Type, Page, etc.) are pre-cached and always available.
+ * Common PDF names (Type, Page, etc.) are held as static fields and
+ * therefore never collected.
  */
 export class PdfName implements PdfPrimitive {
   get type(): "name" {
     return "name";
   }
 
-  private static cache = new LRUCache<string, PdfName>({ max: DEFAULT_NAME_CACHE_SIZE });
+  /** WeakRef cache for interning. Entries are cleaned up by the FinalizationRegistry. */
+  private static cache = new Map<string, WeakRef<PdfName>>();
+
+  /** Cleans up dead WeakRef entries from the cache when a PdfName is GC'd. */
+  private static registry = new FinalizationRegistry<string>(name => {
+    const ref = PdfName.cache.get(name);
+
+    // Only delete if the entry is actually dead — a new instance for the
+    // same name may have been inserted since the old one was collected.
+    if (ref && ref.deref() === undefined) {
+      PdfName.cache.delete(name);
+    }
+  });
 
   /**
-   * Pre-cached common names that should never be evicted.
-   * These are stored separately from the LRU cache.
+   * Pre-cached common names that are always available.
+   * These are held strongly by this map (and by the static readonly
+   * fields below), so they are never collected or evicted.
    */
   private static readonly permanentCache = new Map<string, PdfName>();
 
   // Common PDF names (pre-cached in permanent cache)
+  // -- Document structure --
   static readonly Type = PdfName.createPermanent("Type");
+  static readonly Subtype = PdfName.createPermanent("Subtype");
   static readonly Page = PdfName.createPermanent("Page");
   static readonly Pages = PdfName.createPermanent("Pages");
   static readonly Catalog = PdfName.createPermanent("Catalog");
@@ -100,9 +114,25 @@ export class PdfName implements PdfPrimitive {
   static readonly MediaBox = PdfName.createPermanent("MediaBox");
   static readonly Resources = PdfName.createPermanent("Resources");
   static readonly Contents = PdfName.createPermanent("Contents");
+  static readonly Annots = PdfName.createPermanent("Annots");
+  // -- Trailer / xref --
+  static readonly Root = PdfName.createPermanent("Root");
+  static readonly Size = PdfName.createPermanent("Size");
+  static readonly Info = PdfName.createPermanent("Info");
+  static readonly Prev = PdfName.createPermanent("Prev");
+  static readonly ID = PdfName.createPermanent("ID");
+  static readonly Encrypt = PdfName.createPermanent("Encrypt");
+  // -- Streams --
   static readonly Length = PdfName.createPermanent("Length");
   static readonly Filter = PdfName.createPermanent("Filter");
   static readonly FlateDecode = PdfName.createPermanent("FlateDecode");
+  // -- Fonts / resources --
+  static readonly Font = PdfName.createPermanent("Font");
+  static readonly BaseFont = PdfName.createPermanent("BaseFont");
+  static readonly Encoding = PdfName.createPermanent("Encoding");
+  static readonly XObject = PdfName.createPermanent("XObject");
+  // -- Name trees --
+  static readonly Names = PdfName.createPermanent("Names");
 
   /** Cached serialized form (e.g. "/Type"). Computed lazily on first toBytes(). */
   private cachedBytes: Uint8Array | null = null;
@@ -114,21 +144,31 @@ export class PdfName implements PdfPrimitive {
    * The leading `/` should NOT be included.
    */
   static of(name: string): PdfName {
-    // Check permanent cache first (common names)
+    // Check permanent cache first (common names — always alive)
     const permanent = PdfName.permanentCache.get(name);
+
     if (permanent) {
       return permanent;
     }
 
-    // Check LRU cache
-    let cached = PdfName.cache.get(name);
+    // Check WeakRef cache
+    const ref = PdfName.cache.get(name);
+
+    if (ref) {
+      const existing = ref.deref();
 
-    if (!cached) {
-      cached = new PdfName(name);
-      PdfName.cache.set(name, cached);
+      if (existing) {
+        return existing;
+      }
     }
 
-    return cached;
+    // Create new instance, store WeakRef, register for cleanup
+    const instance = new PdfName(name);
+
+    PdfName.cache.set(name, new WeakRef(instance));
+    PdfName.registry.register(instance, name);
+
+    return instance;
   }
 
   /**
@@ -144,7 +184,9 @@ export class PdfName implements PdfPrimitive {
   }
 
   /**
-   * Get the current size of the LRU cache.
+   * Get the current number of entries in the WeakRef cache.
+   * This includes entries whose targets may have been GC'd but whose
+   * FinalizationRegistry callbacks haven't run yet.
    */
   static get cacheSize(): number {
     return PdfName.cache.size;
diff --git a/src/parser/indirect-object-parser.test.ts b/src/parser/indirect-object-parser.test.ts
@@ -217,15 +217,51 @@ endobj`,
       expect(new TextDecoder().decode(stream.data)).toBe("Hello");
     });
 
-    it("throws if indirect /Length cannot be resolved", () => {
+    it("falls back to endstream scan when indirect /Length cannot be resolved", () => {
       const p = parser(`1 0 obj
 << /Length 99 0 R >>
 stream
 Hello
 endstream
 endobj`);
+      const result = p.parseObject();
 
-      expect(() => p.parseObject()).toThrow(/resolve.*length/i);
+      const stream = result.value as PdfStream;
+      expect(new TextDecoder().decode(stream.data)).toBe("Hello");
+    });
+
+    it("falls back to endstream scan when no resolver provided", () => {
+      // Build input with actual binary bytes in the stream data
+      const prefix = new TextEncoder().encode("1 0 obj\n<< /Length 99 0 R >>\nstream\n");
+      const binaryContent = new Uint8Array([0x00, 0x01, 0xff, 0xfe, 0x80]);
+      const suffix = new TextEncoder().encode("\nendstream\nendobj");
+
+      const fullBytes = new Uint8Array(prefix.length + binaryContent.length + suffix.length);
+      fullBytes.set(prefix);
+      fullBytes.set(binaryContent, prefix.length);
+      fullBytes.set(suffix, prefix.length + binaryContent.length);
+
+      const scanner = new Scanner(fullBytes);
+      const p = new IndirectObjectParser(scanner);
+      const result = p.parseObject();
+
+      const stream = result.value as PdfStream;
+      expect(stream.data.length).toBe(5);
+      expect(stream.data[0]).toBe(0x00);
+      expect(stream.data[2]).toBe(0xff);
+    });
+
+    it("falls back to endstream scan when /Length is missing", () => {
+      const p = parser(`1 0 obj
+<< /Filter /FlateDecode >>
+stream
+Hello
+endstream
+endobj`);
+      const result = p.parseObject();
+
+      const stream = result.value as PdfStream;
+      expect(new TextDecoder().decode(stream.data)).toBe("Hello");
     });
 
     it("preserves stream dict entries", () => {
@@ -281,15 +317,17 @@ endobj`);
       expect(() => p.parseObject()).toThrow(/obj/i);
     });
 
-    it("throws on missing /Length in stream", () => {
+    it("recovers stream with missing /Length via endstream scan", () => {
       const p = parser(`1 0 obj
 << /Type /XObject >>
 stream
 data
 endstream
 endobj`);
+      const result = p.parseObject();
 
-      expect(() => p.parseObject()).toThrow(/length/i);
+      const stream = result.value as PdfStream;
+      expect(new TextDecoder().decode(stream.data)).toBe("data");
     });
   });
 });
diff --git a/src/parser/indirect-object-parser.ts b/src/parser/indirect-object-parser.ts
@@ -129,13 +129,26 @@ export class IndirectObjectParser {
     // Skip EOL after "stream" (required: LF or CRLF)
     this.skipStreamEOL();
 
-    // Get the stream length
-    const length = this.resolveLength(dict);
+    const startPos = this.scanner.position;
+
+    // Try to resolve /Length from the dict. If that fails (e.g. indirect
+    // ref during brute-force recovery with no resolver), fall back to
+    // scanning for the "endstream" keyword to determine the length.
+    let length: number;
+
+    try {
+      length = this.resolveLength(dict);
+    } catch {
+      length = this.findEndStream(startPos);
+
+      if (length < 0) {
+        throw new ObjectParseError("Stream missing /Length and no endstream found");
+      }
+    }
 
     // Read exactly `length` bytes.
     // Use subarray (zero-copy view) since the underlying PDF bytes
     // are kept alive by the PDF object for the document's lifetime.
-    const startPos = this.scanner.position;
     const data = this.scanner.bytes.subarray(startPos, startPos + length);
 
     this.scanner.moveTo(startPos + length);
@@ -220,6 +233,52 @@ export class IndirectObjectParser {
     }
   }
 
+  /**
+   * Locate the first "endstream" keyword at or after `startPos`.
+   * Returns the number of stream data bytes between `startPos` and the
+   * keyword, not counting one EOL (LF, CR, or CRLF) directly before it,
+   * or -1 when the keyword does not occur in the remaining bytes.
+   */
+  private findEndStream(startPos: number): number {
+    const data = this.scanner.bytes;
+
+    // ASCII codes for the keyword "endstream".
+    const keyword = [0x65, 0x6e, 0x64, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d];
+    const lastStart = data.length - keyword.length;
+
+    for (let pos = startPos; pos <= lastStart; pos++) {
+      // Compare byte by byte; `k` stops at the first mismatch.
+      let k = 0;
+
+      while (k < keyword.length && data[pos + k] === keyword[k]) {
+        k++;
+      }
+
+      if (k < keyword.length) {
+        continue;
+      }
+
+      // Keyword found at `pos`. The EOL just before it belongs to the
+      // stream framing rather than the data (PDF spec 7.3.8.1), so
+      // leave it out of the length — but never back up past `startPos`.
+      let dataEnd = pos;
+      const endsWithLF = dataEnd > startPos && data[dataEnd - 1] === LF;
+
+      if (endsWithLF) {
+        dataEnd--;
+      }
+
+      // Covers both a lone CR and the CR half of a CRLF pair.
+      if (dataEnd > startPos && data[dataEnd - 1] === CR) {
+        dataEnd--;
+      }
+
+      return dataEnd - startPos;
+    }
+
+    return -1;
+  }
+
   /**
    * Resolve the /Length value from the stream dict.
    * Handles both direct values and indirect references.
diff --git a/src/parser/xref-parser.test.ts b/src/parser/xref-parser.test.ts
@@ -356,6 +356,48 @@ some content without startxref
 
       expect(() => p.findStartXRef()).toThrow(/startxref/i);
     });
+
+    it("skips trailing null bytes to find startxref", () => {
+      const content = `%PDF-1.4
+some content
+xref
+0 1
+0000000000 65535 f
+trailer
+<< /Size 1 /Root 1 0 R >>
+startxref
+23
+%%EOF`;
+      // Append 2048 null bytes (exceeds the 1024-byte search window)
+      const contentBytes = new TextEncoder().encode(content);
+      const padded = new Uint8Array(contentBytes.length + 2048);
+
+      padded.set(contentBytes);
+      // rest is already 0x00
+
+      const scanner = new Scanner(padded);
+      const p = new XRefParser(scanner);
+      const offset = p.findStartXRef();
+
+      expect(offset).toBe(23);
+    });
+
+    it("skips trailing whitespace mix to find startxref", () => {
+      const content = `%PDF-1.4\nstartxref\n50\n%%EOF`;
+      const contentBytes = new TextEncoder().encode(content);
+      // Append a mix of whitespace: spaces, newlines, tabs, nulls
+      const padding = new Uint8Array([0x20, 0x0a, 0x09, 0x00, 0x0d, 0x20, 0x00]);
+      const padded = new Uint8Array(contentBytes.length + padding.length);
+
+      padded.set(contentBytes);
+      padded.set(padding, contentBytes.length);
+
+      const scanner = new Scanner(padded);
+      const p = new XRefParser(scanner);
+      const offset = p.findStartXRef();
+
+      expect(offset).toBe(50);
+    });
   });
 
   describe("lenient parsing", () => {
diff --git a/src/parser/xref-parser.ts b/src/parser/xref-parser.ts
diff --git a/src/tests/issues/issue-54-name-interning-load.test.ts b/src/tests/issues/issue-54-name-interning-load.test.ts