dslipak · BrennenWright · Mar 10, 2026 · Mar 10, 2026 · Apr 30, 2026 · Apr 30, 2026
diff --git a/go.mod b/go.mod
@@ -0,0 +1,3 @@
+module github.com/BrennenWright/pdf
+
+go 1.22.2
diff --git a/lex.go b/lex.go
@@ -121,6 +121,13 @@ func (b *buffer) readOffset() int64 {
 	return b.offset - int64(len(b.buf)) + int64(b.pos)
 }
 
+// exhausted reports whether the buffer has reached EOF and every buffered
+// byte has been consumed. readByte returns a synthetic '\n' after EOF, so
+// without this check malformed unterminated tokens can loop forever.
+func (b *buffer) exhausted() bool {
+	return b.eof && b.pos >= len(b.buf)
+}
+
 func (b *buffer) unreadByte() {
 	if b.pos > 0 {
 		b.pos--
@@ -193,26 +200,34 @@ func (b *buffer) readToken() token {
 func (b *buffer) readHexString() token {
 	tmp := b.tmp[:0]
 	for {
-	Loop:
+		if b.exhausted() {
+			break
+		}
 		c := b.readByte()
 		if c == '>' {
 			break
 		}
 		if isSpace(c) {
-			goto Loop
-		}
-	Loop2:
-		c2 := b.readByte()
-		if isSpace(c2) {
-			goto Loop2
+			continue
 		}
-		x := unhex(c)<<4 | unhex(c2)
-		if x < 0 {
-			fmt.Sprint(b.errorf("malformed hex string %c %c %s", c, c2, b.buf[b.pos:]))
+		for {
+			if b.exhausted() {
+				goto hexDone
+			}
+			c2 := b.readByte()
+			if isSpace(c2) {
+				continue
+			}
+			x := unhex(c)<<4 | unhex(c2)
+			if x < 0 {
+				fmt.Sprint(b.errorf("malformed hex string %c %c %s", c, c2, b.buf[b.pos:]))
+				goto hexDone
+			}
+			tmp = append(tmp, byte(x))
 			break
 		}
-		tmp = append(tmp, byte(x))
 	}
+hexDone:
 	b.tmp = tmp
 	return string(tmp)
 }
@@ -234,6 +249,9 @@ func (b *buffer) readLiteralString() token {
 	depth := 1
 Loop:
 	for {
+		if b.exhausted() {
+			break
+		}
 		c := b.readByte()
 		switch c {
 		default:
@@ -423,6 +441,9 @@ type objdef struct {
 
 func (b *buffer) readObject() (object, error) {
 	tok := b.readToken()
+	if tok == io.EOF {
+		return nil, errors.New("unexpected EOF parsing PDF object")
+	}
 	if kw, ok := tok.(keyword); ok {
 		switch kw {
 		case "null":
@@ -481,13 +502,13 @@ func (b *buffer) readArray() object {
 	var x array
 	for {
 		tok := b.readToken()
-		if tok == nil || tok == keyword("]") {
+		if tok == io.EOF || tok == nil || tok == keyword("]") {
 			break
 		}
 		b.unreadToken(tok)
 		res, err := b.readObject()
 		if err != nil {
-			return err
+			break
 		}
 		x = append(x, res)
 	}
@@ -498,7 +519,7 @@ func (b *buffer) readDict() object {
 	x := make(dict)
 	for {
 		tok := b.readToken()
-		if tok == nil || tok == keyword(">>") {
+		if tok == io.EOF || tok == nil || tok == keyword(">>") {
 			break
 		}
 		n, ok := tok.(name)

diff --git a/page.go b/page.go
@@ -50,6 +50,8 @@ Search:
 				num--
 			}
 		}
+		// If Count overstates actual reachable kids, avoid looping forever.
+		break
 	}
 	return Page{}
 }
@@ -393,7 +395,7 @@ func readCmap(toUnicode Value) *cmap {
 			stk.Pop().Name() // key
 			stk.Push(value)
 		default:
-			println("interp\t", op)
+			// Ignore unrecognized cmap operators.
 		}
 	})
 	if !ok {
@@ -505,7 +507,8 @@ func (p Page) GetPlainText(fonts map[string]*Font) (result string, err error) {
 			showText("\n")
 		case "Tf": // set text font and size
 			if len(args) != 2 {
-				panic("bad TL")
+		        // Skip malformed Tf; continue interpreting rest of stream
+		        return
 			}
 			if font, ok := fonts[args[0].Name()]; ok {
 				enc = font.Encoder()
@@ -712,7 +715,8 @@ func (p Page) walkTextBlocks(walker func(enc TextEncoding, x, y float64, s strin
 		case "T*": // move to start of next line
 		case "Tf": // set text font and size
 			if len(args) != 2 {
-				panic("bad TL")
+		        // Skip malformed Tf; continue interpreting rest of stream
+		        return
 			}
 
 			if font, ok := fonts[args[0].Name()]; ok {
@@ -911,7 +915,8 @@ func (p Page) readContent(strm Value) Content {
 
 		case "Tf": // set text font and size
 			if len(args) != 2 {
-				panic("bad TL")
+		        // Skip malformed Tf; continue interpreting rest of stream
+		        return
 			}
 			f := args[0].Name()
 			g.Tf = p.Font(f)
@@ -944,29 +949,24 @@ func (p Page) readContent(strm Value) Content {
 			showText(args[0].RawString())
 
 		case "TJ": // show text, allowing individual glyph positioning
-			if len(args) > 0 {	// bugfix: don't raise an exception
-				v := args[0]
-				for i := 0; i < v.Len(); i++ {
-					x := v.Index(i)
-					if x.Kind() == String {
-						if i == v.Len()-1 {
-							showText(x.RawString())
-							op = "BT"
-							continue
-						} else {
-							showText(x.RawString())
-						}
-					} else {
-						tx := -x.Float64() / 1000 * g.Tfs * g.Th
-						g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm)
-					}
+			if len(args) != 1 {
+				break
+			}
+			v := args[0]
+			for i := 0; i < v.Len(); i++ {
+				x := v.Index(i)
+				if x.Kind() == String {
+					showText(x.RawString())
+				} else {
+					tx := -x.Float64() / 1000 * g.Tfs * g.Th
+					g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm)
 				}
-				// showText("\n")
 			}
 
 		case "TL": // set text leading
 			if len(args) != 1 {
-				panic("bad TL")
+		        // Skip malformed TL; continue interpreting rest of stream
+		        return
 			}
 			g.Tl = args[0].Float64()
 

diff --git a/pdf_test.go b/pdf_test.go
@@ -2,12 +2,14 @@ package pdf
 
 import (
 	"bytes"
+	"crypto/aes"
+	"crypto/cipher"
 	"fmt"
 	"os"
+	"path/filepath"
 	"strconv"
 	"strings"
 	"testing"
-	"path/filepath"
 )
 
 var referenceFirstPage = `TEST FILE 
@@ -27,6 +29,89 @@ erat, sed diam voluptua. At vero eos et accusam et
 TEST 
 SUBTITLE`
 
+// TestCryptKeyTruncation (defined further down in this file) verifies that
+// cryptKey truncates its output to min(len(fileKey)+5, 16) bytes, as required
+// by PDF 32000-1:2008 §7.6.3.3 step 4.
+//
+// Previously cryptKey returned the full 16-byte MD5 digest regardless of the
+// file key length. For 40-bit RC4 encryption (5-byte file key) the correct
+// per-object key is 10 bytes; using all 16 yields a wrong RC4 keystream, so
+// object streams decrypt to garbage ("cannot find object in stream" panics).
+// The bug went undetected because 128-bit RC4 files need all 16 bytes
+// (min(21, 16) = 16), so only sub-128-bit encrypted PDFs were affected.
+
+// TestPDF20HeaderAccepted verifies that NewReader accepts a %PDF-2.0 header.
+// The previous check (HasPrefix("%PDF-1.")) rejected all PDF 2.0+ files with
+// "not a PDF file: invalid header" despite them being structurally valid.
+func TestPDF20HeaderAccepted(t *testing.T) {
+	// Build a minimal input: a %PDF-2.0 header followed by %%EOF. Only the
+	// header check needs to pass; the reader is expected to fail later for
+	// lack of a usable xref, which is a different error.
+	src := []byte("%PDF-2.0\n%%EOF\n")
+	r := bytes.NewReader(src)
+	_, err := NewReader(r, int64(len(src)))
+	// Any error other than the old "invalid header" rejection is acceptable —
+	// the file is intentionally not a complete PDF.
+	if err != nil && strings.Contains(err.Error(), "invalid header") {
+		t.Errorf("NewReader rejected %%PDF-2.0 header: %v", err)
+	}
+}
+
+// TestAESStringDecryption verifies that decryptString no longer panics when
+// useAES is true. Previously it contained an unimplemented stub:
+//
+//	panic("AES not implemented")
+//
+// PDF 32000-1:2008 §7.6.5 specifies AES-encrypted strings have the same
+// layout as streams: a 16-byte IV followed by AES-CBC ciphertext with PKCS#7
+// padding. V=4 R=4 PDFs (AESV2, /StrF /StdCF) encrypt all string tokens via
+// this path; the panic surfaced on the first string encountered during parsing.
+func TestAESStringDecryption(t *testing.T) {
+	// Round-trip a known AES-128-CBC ciphertext through decryptString.
+	fileKey := make([]byte, 16) // all-zero file key for test
+	ptr := objptr{id: 1, gen: 0}
+	// "hello" plus 11 bytes of PKCS#7 padding fills one 16-byte AES block; a
+	// known IV lets us assert the plaintext coming back.
+	iv := []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
+	plaintext := []byte("hello\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b") // 16 bytes with PKCS#7
+
+	perObjKey := cryptKey(fileKey, true, ptr)
+	cb, err := aes.NewCipher(perObjKey)
+	if err != nil {
+		t.Fatalf("aes.NewCipher: %v", err)
+	}
+	ciphertext := make([]byte, len(plaintext))
+	cipher.NewCBCEncrypter(cb, iv).CryptBlocks(ciphertext, plaintext)
+	input := string(append(iv, ciphertext...))
+	got := decryptString(fileKey, true, ptr, input)
+	if got != "hello" {
+		t.Errorf("decryptString AES: got %q, want %q", got, "hello")
+	}
+}
+
+func TestCryptKeyTruncation(t *testing.T) {
+	ptr := objptr{id: 7874, gen: 0}
+
+	cases := []struct {
+		fileKeyLen int
+		wantKeyLen int
+	}{
+		{5, 10},  // RC4-40:  min(5+5, 16) = 10
+		{7, 12},  // RC4-56:  min(7+5, 16) = 12
+		{10, 15}, // RC4-80:  min(10+5, 16) = 15
+		{11, 16}, // RC4-88:  min(11+5, 16) = 16 (capped)
+		{16, 16}, // RC4-128: min(16+5, 16) = 16 (capped)
+	}
+
+	for _, tc := range cases {
+		// An all-zero file key suffices: only the output length is checked.
+		key := make([]byte, tc.fileKeyLen)
+		if n := len(cryptKey(key, false, ptr)); n != tc.wantKeyLen {
+			t.Errorf("cryptKey(%d-byte key): got %d bytes, want %d", tc.fileKeyLen, n, tc.wantKeyLen)
+		}
+	}
+}
+
 //
 // this pdf has an object within stream which is handled different!
 // the original implementation calculated the stream but didn't returned the object at resolve

diff --git a/ps.go b/ps.go
@@ -109,7 +109,9 @@ Reading:
 				val := stk.Pop()
 				key, ok := stk.Pop().data.(name)
 				if !ok {
-					panic("def of non-name")
+					// Some malformed cmap streams emit "def" with a non-name key.
+					// Skip the assignment and continue so text extraction can proceed.
+					continue
 				}
 				dicts[len(dicts)-1][key] = val.data
 				continue