Skip to content

Commit f8cde4a

Browse files
authored
fix: pdf loading failures from trailing null padding and cache eviction (#57)
1 parent a7b42d4 commit f8cde4a

File tree

7 files changed

+439
-37
lines changed

7 files changed

+439
-37
lines changed

src/objects/pdf-name.test.ts

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,9 @@ describe("PdfName", () => {
5050
expect(PdfName.of("")).toBe(empty);
5151
});
5252

53-
describe("LRU cache", () => {
53+
describe("WeakRef cache", () => {
5454
it("clearCache clears non-permanent names", () => {
55-
const custom = PdfName.of("CustomName");
55+
PdfName.of("CustomName");
5656
expect(PdfName.cacheSize).toBeGreaterThan(0);
5757

5858
PdfName.clearCache();
@@ -70,5 +70,13 @@ describe("PdfName", () => {
7070
expect(PdfName.of("Type")).toBe(PdfName.Type);
7171
expect(PdfName.of("Page")).toBe(PdfName.Page);
7272
});
73+
74+
it("returns same instance while strong reference is held", () => {
75+
const held = PdfName.of("HeldName");
76+
77+
// As long as we hold the reference, .of() returns the same instance
78+
expect(PdfName.of("HeldName")).toBe(held);
79+
expect(PdfName.of("HeldName")).toBe(held);
80+
});
7381
});
7482
});

src/objects/pdf-name.ts

Lines changed: 65 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import { HEX_TABLE } from "#src/helpers/buffer";
22
import { CHAR_HASH, DELIMITERS, WHITESPACE } from "#src/helpers/chars";
3-
import { LRUCache } from "#src/helpers/lru-cache";
43
import type { ByteWriter } from "#src/io/byte-writer";
54

65
import type { PdfPrimitive } from "./pdf-primitive";
@@ -60,37 +59,52 @@ function escapeName(name: string): string {
6059
}
6160

6261
/**
63-
* Default cache size for PdfName interning.
64-
* Can be overridden via PdfName.setCacheSize().
65-
*/
66-
const DEFAULT_NAME_CACHE_SIZE = 10000;
67-
68-
/**
69-
* PDF name object (interned).
62+
* PDF name object (interned via WeakRef).
7063
*
7164
* In PDF: `/Type`, `/Page`, `/Length`
7265
*
73-
* Names are interned using an LRU cache to prevent unbounded memory growth.
74-
* `PdfName.of("Type") === PdfName.of("Type")` as long as both are in cache.
75-
* Use `.of()` to get or create instances.
66+
* Names are interned using a WeakRef cache: as long as any live object
67+
* (e.g. a PdfDict key) holds a strong reference to a PdfName, calling
68+
* `PdfName.of()` with the same string returns the *same instance*.
69+
* Once all strong references are dropped, the GC may collect the
70+
* PdfName and a FinalizationRegistry cleans up the cache entry.
71+
*
72+
* This avoids the correctness bug of LRU-based caching, where eviction
73+
* of a still-referenced name would break Map key identity in PdfDict.
7674
*
77-
* Common PDF names (Type, Page, etc.) are pre-cached and always available.
75+
* Common PDF names (Type, Page, etc.) are held as static fields and
76+
* therefore never collected.
7877
*/
7978
export class PdfName implements PdfPrimitive {
8079
get type(): "name" {
8180
return "name";
8281
}
8382

84-
private static cache = new LRUCache<string, PdfName>({ max: DEFAULT_NAME_CACHE_SIZE });
83+
/** WeakRef cache for interning. Entries are cleaned up by the FinalizationRegistry. */
84+
private static cache = new Map<string, WeakRef<PdfName>>();
85+
86+
/** Cleans up dead WeakRef entries from the cache when a PdfName is GC'd. */
87+
private static registry = new FinalizationRegistry<string>(name => {
88+
const ref = PdfName.cache.get(name);
89+
90+
// Only delete if the entry is actually dead — a new instance for the
91+
// same name may have been inserted since the old one was collected.
92+
if (ref && ref.deref() === undefined) {
93+
PdfName.cache.delete(name);
94+
}
95+
});
8596

8697
/**
87-
* Pre-cached common names that should never be evicted.
88-
* These are stored separately from the LRU cache.
98+
* Pre-cached common names that are always available.
99+
* These are stored as static readonly fields, so they always have
100+
* strong references and their WeakRefs never die.
89101
*/
90102
private static readonly permanentCache = new Map<string, PdfName>();
91103

92104
// Common PDF names (pre-cached in permanent cache)
105+
// -- Document structure --
93106
static readonly Type = PdfName.createPermanent("Type");
107+
static readonly Subtype = PdfName.createPermanent("Subtype");
94108
static readonly Page = PdfName.createPermanent("Page");
95109
static readonly Pages = PdfName.createPermanent("Pages");
96110
static readonly Catalog = PdfName.createPermanent("Catalog");
@@ -100,9 +114,25 @@ export class PdfName implements PdfPrimitive {
100114
static readonly MediaBox = PdfName.createPermanent("MediaBox");
101115
static readonly Resources = PdfName.createPermanent("Resources");
102116
static readonly Contents = PdfName.createPermanent("Contents");
117+
static readonly Annots = PdfName.createPermanent("Annots");
118+
// -- Trailer / xref --
119+
static readonly Root = PdfName.createPermanent("Root");
120+
static readonly Size = PdfName.createPermanent("Size");
121+
static readonly Info = PdfName.createPermanent("Info");
122+
static readonly Prev = PdfName.createPermanent("Prev");
123+
static readonly ID = PdfName.createPermanent("ID");
124+
static readonly Encrypt = PdfName.createPermanent("Encrypt");
125+
// -- Streams --
103126
static readonly Length = PdfName.createPermanent("Length");
104127
static readonly Filter = PdfName.createPermanent("Filter");
105128
static readonly FlateDecode = PdfName.createPermanent("FlateDecode");
129+
// -- Fonts / resources --
130+
static readonly Font = PdfName.createPermanent("Font");
131+
static readonly BaseFont = PdfName.createPermanent("BaseFont");
132+
static readonly Encoding = PdfName.createPermanent("Encoding");
133+
static readonly XObject = PdfName.createPermanent("XObject");
134+
// -- Name trees --
135+
static readonly Names = PdfName.createPermanent("Names");
106136

107137
/** Cached serialized form (e.g. "/Type"). Computed lazily on first toBytes(). */
108138
private cachedBytes: Uint8Array | null = null;
@@ -114,21 +144,31 @@ export class PdfName implements PdfPrimitive {
114144
* The leading `/` should NOT be included.
115145
*/
116146
static of(name: string): PdfName {
117-
// Check permanent cache first (common names)
147+
// Check permanent cache first (common names — always alive)
118148
const permanent = PdfName.permanentCache.get(name);
149+
119150
if (permanent) {
120151
return permanent;
121152
}
122153

123-
// Check LRU cache
124-
let cached = PdfName.cache.get(name);
154+
// Check WeakRef cache
155+
const ref = PdfName.cache.get(name);
156+
157+
if (ref) {
158+
const existing = ref.deref();
125159

126-
if (!cached) {
127-
cached = new PdfName(name);
128-
PdfName.cache.set(name, cached);
160+
if (existing) {
161+
return existing;
162+
}
129163
}
130164

131-
return cached;
165+
// Create new instance, store WeakRef, register for cleanup
166+
const instance = new PdfName(name);
167+
168+
PdfName.cache.set(name, new WeakRef(instance));
169+
PdfName.registry.register(instance, name);
170+
171+
return instance;
132172
}
133173

134174
/**
@@ -144,7 +184,9 @@ export class PdfName implements PdfPrimitive {
144184
}
145185

146186
/**
147-
* Get the current size of the LRU cache.
187+
* Get the current number of entries in the WeakRef cache.
188+
* This includes entries whose targets may have been GC'd but whose
189+
* FinalizationRegistry callbacks haven't run yet.
148190
*/
149191
static get cacheSize(): number {
150192
return PdfName.cache.size;

src/parser/indirect-object-parser.test.ts

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -217,15 +217,51 @@ endobj`,
217217
expect(new TextDecoder().decode(stream.data)).toBe("Hello");
218218
});
219219

220-
it("throws if indirect /Length cannot be resolved", () => {
220+
it("falls back to endstream scan when indirect /Length cannot be resolved", () => {
221221
const p = parser(`1 0 obj
222222
<< /Length 99 0 R >>
223223
stream
224224
Hello
225225
endstream
226226
endobj`);
227+
const result = p.parseObject();
227228

228-
expect(() => p.parseObject()).toThrow(/resolve.*length/i);
229+
const stream = result.value as PdfStream;
230+
expect(new TextDecoder().decode(stream.data)).toBe("Hello");
231+
});
232+
233+
it("falls back to endstream scan when no resolver provided", () => {
234+
// Build input with actual binary bytes in the stream data
235+
const prefix = new TextEncoder().encode("1 0 obj\n<< /Length 99 0 R >>\nstream\n");
236+
const binaryContent = new Uint8Array([0x00, 0x01, 0xff, 0xfe, 0x80]);
237+
const suffix = new TextEncoder().encode("\nendstream\nendobj");
238+
239+
const fullBytes = new Uint8Array(prefix.length + binaryContent.length + suffix.length);
240+
fullBytes.set(prefix);
241+
fullBytes.set(binaryContent, prefix.length);
242+
fullBytes.set(suffix, prefix.length + binaryContent.length);
243+
244+
const scanner = new Scanner(fullBytes);
245+
const p = new IndirectObjectParser(scanner);
246+
const result = p.parseObject();
247+
248+
const stream = result.value as PdfStream;
249+
expect(stream.data.length).toBe(5);
250+
expect(stream.data[0]).toBe(0x00);
251+
expect(stream.data[2]).toBe(0xff);
252+
});
253+
254+
it("falls back to endstream scan when /Length is missing", () => {
255+
const p = parser(`1 0 obj
256+
<< /Filter /FlateDecode >>
257+
stream
258+
Hello
259+
endstream
260+
endobj`);
261+
const result = p.parseObject();
262+
263+
const stream = result.value as PdfStream;
264+
expect(new TextDecoder().decode(stream.data)).toBe("Hello");
229265
});
230266

231267
it("preserves stream dict entries", () => {
@@ -281,15 +317,17 @@ endobj`);
281317
expect(() => p.parseObject()).toThrow(/obj/i);
282318
});
283319

284-
it("throws on missing /Length in stream", () => {
320+
it("recovers stream with missing /Length via endstream scan", () => {
285321
const p = parser(`1 0 obj
286322
<< /Type /XObject >>
287323
stream
288324
data
289325
endstream
290326
endobj`);
327+
const result = p.parseObject();
291328

292-
expect(() => p.parseObject()).toThrow(/length/i);
329+
const stream = result.value as PdfStream;
330+
expect(new TextDecoder().decode(stream.data)).toBe("data");
293331
});
294332
});
295333
});

src/parser/indirect-object-parser.ts

Lines changed: 62 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -129,13 +129,26 @@ export class IndirectObjectParser {
129129
// Skip EOL after "stream" (required: LF or CRLF)
130130
this.skipStreamEOL();
131131

132-
// Get the stream length
133-
const length = this.resolveLength(dict);
132+
const startPos = this.scanner.position;
133+
134+
// Try to resolve /Length from the dict. If that fails (e.g. indirect
135+
// ref during brute-force recovery with no resolver), fall back to
136+
// scanning for the "endstream" keyword to determine the length.
137+
let length: number;
138+
139+
try {
140+
length = this.resolveLength(dict);
141+
} catch {
142+
length = this.findEndStream(startPos);
143+
144+
if (length < 0) {
145+
throw new ObjectParseError("Stream missing /Length and no endstream found");
146+
}
147+
}
134148

135149
// Read exactly `length` bytes.
136150
// Use subarray (zero-copy view) since the underlying PDF bytes
137151
// are kept alive by the PDF object for the document's lifetime.
138-
const startPos = this.scanner.position;
139152
const data = this.scanner.bytes.subarray(startPos, startPos + length);
140153

141154
this.scanner.moveTo(startPos + length);
@@ -220,6 +233,52 @@ export class IndirectObjectParser {
220233
}
221234
}
222235

236+
/**
237+
* Scan forward from startPos looking for the "endstream" keyword.
238+
* Returns the stream data length (excluding any EOL before endstream),
239+
* or -1 if not found.
240+
*/
241+
private findEndStream(startPos: number): number {
242+
const bytes = this.scanner.bytes;
243+
const len = bytes.length;
244+
245+
// "endstream" as byte values
246+
const sig = [0x65, 0x6e, 0x64, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d];
247+
const sigLen = sig.length;
248+
249+
for (let i = startPos; i <= len - sigLen; i++) {
250+
let match = true;
251+
252+
for (let j = 0; j < sigLen; j++) {
253+
if (bytes[i + j] !== sig[j]) {
254+
match = false;
255+
break;
256+
}
257+
}
258+
259+
if (match) {
260+
// Found "endstream" at position i.
261+
// Strip the optional EOL that precedes it (part of stream framing,
262+
// not stream data — per PDF spec 7.3.8.1).
263+
let end = i;
264+
265+
if (end > startPos && bytes[end - 1] === LF) {
266+
end--;
267+
268+
if (end > startPos && bytes[end - 1] === CR) {
269+
end--;
270+
}
271+
} else if (end > startPos && bytes[end - 1] === CR) {
272+
end--;
273+
}
274+
275+
return end - startPos;
276+
}
277+
}
278+
279+
return -1;
280+
}
281+
223282
/**
224283
* Resolve the /Length value from the stream dict.
225284
* Handles both direct values and indirect references.

src/parser/xref-parser.test.ts

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,48 @@ some content without startxref
356356

357357
expect(() => p.findStartXRef()).toThrow(/startxref/i);
358358
});
359+
360+
it("skips trailing null bytes to find startxref", () => {
361+
const content = `%PDF-1.4
362+
some content
363+
xref
364+
0 1
365+
0000000000 65535 f
366+
trailer
367+
<< /Size 1 /Root 1 0 R >>
368+
startxref
369+
23
370+
%%EOF`;
371+
// Append 2048 null bytes (exceeds the 1024-byte search window)
372+
const contentBytes = new TextEncoder().encode(content);
373+
const padded = new Uint8Array(contentBytes.length + 2048);
374+
375+
padded.set(contentBytes);
376+
// rest is already 0x00
377+
378+
const scanner = new Scanner(padded);
379+
const p = new XRefParser(scanner);
380+
const offset = p.findStartXRef();
381+
382+
expect(offset).toBe(23);
383+
});
384+
385+
it("skips trailing whitespace mix to find startxref", () => {
386+
const content = `%PDF-1.4\nstartxref\n50\n%%EOF`;
387+
const contentBytes = new TextEncoder().encode(content);
388+
// Append a mix of whitespace: spaces, newlines, tabs, nulls
389+
const padding = new Uint8Array([0x20, 0x0a, 0x09, 0x00, 0x0d, 0x20, 0x00]);
390+
const padded = new Uint8Array(contentBytes.length + padding.length);
391+
392+
padded.set(contentBytes);
393+
padded.set(padding, contentBytes.length);
394+
395+
const scanner = new Scanner(padded);
396+
const p = new XRefParser(scanner);
397+
const offset = p.findStartXRef();
398+
399+
expect(offset).toBe(50);
400+
});
359401
});
360402

361403
describe("lenient parsing", () => {

0 commit comments

Comments
 (0)