Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
module github.com/BrennenWright/pdf

go 1.22.2
49 changes: 35 additions & 14 deletions lex.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,13 @@ func (b *buffer) readOffset() int64 {
return b.offset - int64(len(b.buf)) + int64(b.pos)
}

// exhausted reports whether the buffer has hit EOF and every buffered byte
// has already been consumed. readByte hands back a synthetic '\n' once EOF
// is reached, so loops scanning for a terminator should consult this first;
// otherwise a malformed, unterminated token can keep them spinning forever.
func (b *buffer) exhausted() bool {
	if !b.eof {
		// More input may still arrive, so the buffer cannot be drained yet.
		return false
	}
	return b.pos >= len(b.buf)
}

func (b *buffer) unreadByte() {
if b.pos > 0 {
b.pos--
Expand Down Expand Up @@ -193,26 +200,34 @@ func (b *buffer) readToken() token {
func (b *buffer) readHexString() token {
tmp := b.tmp[:0]
for {
Loop:
if b.exhausted() {
break
}
c := b.readByte()
if c == '>' {
break
}
if isSpace(c) {
goto Loop
}
Loop2:
c2 := b.readByte()
if isSpace(c2) {
goto Loop2
continue
}
x := unhex(c)<<4 | unhex(c2)
if x < 0 {
fmt.Sprint(b.errorf("malformed hex string %c %c %s", c, c2, b.buf[b.pos:]))
for {
if b.exhausted() {
goto hexDone
}
c2 := b.readByte()
if isSpace(c2) {
continue
}
x := unhex(c)<<4 | unhex(c2)
if x < 0 {
fmt.Sprint(b.errorf("malformed hex string %c %c %s", c, c2, b.buf[b.pos:]))
goto hexDone
}
tmp = append(tmp, byte(x))
break
}
tmp = append(tmp, byte(x))
}
hexDone:
b.tmp = tmp
return string(tmp)
}
Expand All @@ -234,6 +249,9 @@ func (b *buffer) readLiteralString() token {
depth := 1
Loop:
for {
if b.exhausted() {
break
}
c := b.readByte()
switch c {
default:
Expand Down Expand Up @@ -423,6 +441,9 @@ type objdef struct {

func (b *buffer) readObject() (object, error) {
tok := b.readToken()
if tok == io.EOF {
return nil, errors.New("unexpected EOF parsing PDF object")
}
if kw, ok := tok.(keyword); ok {
switch kw {
case "null":
Expand Down Expand Up @@ -481,13 +502,13 @@ func (b *buffer) readArray() object {
var x array
for {
tok := b.readToken()
if tok == nil || tok == keyword("]") {
if tok == io.EOF || tok == nil || tok == keyword("]") {
break
}
b.unreadToken(tok)
res, err := b.readObject()
if err != nil {
return err
break
}
x = append(x, res)
}
Expand All @@ -498,7 +519,7 @@ func (b *buffer) readDict() object {
x := make(dict)
for {
tok := b.readToken()
if tok == nil || tok == keyword(">>") {
if tok == io.EOF || tok == nil || tok == keyword(">>") {
break
}
n, ok := tok.(name)
Expand Down
44 changes: 22 additions & 22 deletions page.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ Search:
num--
}
}
// If Count overstates actual reachable kids, avoid looping forever.
break
}
return Page{}
}
Expand Down Expand Up @@ -393,7 +395,7 @@ func readCmap(toUnicode Value) *cmap {
stk.Pop().Name() // key
stk.Push(value)
default:
println("interp\t", op)
// Ignore unrecognized cmap operators.
}
})
if !ok {
Expand Down Expand Up @@ -505,7 +507,8 @@ func (p Page) GetPlainText(fonts map[string]*Font) (result string, err error) {
showText("\n")
case "Tf": // set text font and size
if len(args) != 2 {
panic("bad TL")
// Skip malformed Tf; continue interpreting rest of stream
return
}
if font, ok := fonts[args[0].Name()]; ok {
enc = font.Encoder()
Expand Down Expand Up @@ -712,7 +715,8 @@ func (p Page) walkTextBlocks(walker func(enc TextEncoding, x, y float64, s strin
case "T*": // move to start of next line
case "Tf": // set text font and size
if len(args) != 2 {
panic("bad TL")
// Skip malformed Tf; continue interpreting rest of stream
return
}

if font, ok := fonts[args[0].Name()]; ok {
Expand Down Expand Up @@ -911,7 +915,8 @@ func (p Page) readContent(strm Value) Content {

case "Tf": // set text font and size
if len(args) != 2 {
panic("bad TL")
// Skip malformed Tf; continue interpreting rest of stream
return
}
f := args[0].Name()
g.Tf = p.Font(f)
Expand Down Expand Up @@ -944,29 +949,24 @@ func (p Page) readContent(strm Value) Content {
showText(args[0].RawString())

case "TJ": // show text, allowing individual glyph positioning
if len(args) > 0 { // bugfix: don't raise an exception
v := args[0]
for i := 0; i < v.Len(); i++ {
x := v.Index(i)
if x.Kind() == String {
if i == v.Len()-1 {
showText(x.RawString())
op = "BT"
continue
} else {
showText(x.RawString())
}
} else {
tx := -x.Float64() / 1000 * g.Tfs * g.Th
g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm)
}
if len(args) != 1 {
break
}
v := args[0]
for i := 0; i < v.Len(); i++ {
x := v.Index(i)
if x.Kind() == String {
showText(x.RawString())
} else {
tx := -x.Float64() / 1000 * g.Tfs * g.Th
g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm)
}
// showText("\n")
}

case "TL": // set text leading
if len(args) != 1 {
panic("bad TL")
// Skip malformed TL; continue interpreting rest of stream
return
}
g.Tl = args[0].Float64()

Expand Down
87 changes: 86 additions & 1 deletion pdf_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@ package pdf

import (
"bytes"
"crypto/aes"
"crypto/cipher"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"testing"
"path/filepath"
)

var referenceFirstPage = `TEST FILE
Expand All @@ -27,6 +29,89 @@ erat, sed diam voluptua. At vero eos et accusam et
TEST
SUBTITLE`

// TestCryptKeyTruncation verifies that cryptKey truncates its output to
// min(len(fileKey)+5, 16) bytes as required by PDF 32000-1:2008 §7.6.3.3 step 4.
//
// Previously cryptKey returned the full 16-byte MD5 digest regardless of the
// file key length. For 40-bit RC4 encryption (5-byte file key) the correct
// per-object key is 10 bytes; using 16 bytes produces a completely wrong
// RC4 keystream, causing all object stream decryptions to fail silently with
// garbage output and manifesting as "cannot find object in stream" panics.
//
// The bug went undetected because 128-bit RC4 files happen to need all 16
// bytes (min(21, 16) = 16), so only sub-128-bit encrypted PDFs were affected.
// TestPDF20HeaderAccepted verifies that NewReaderEncrypted accepts a %PDF-2.0 header.
// The previous check (HasPrefix("%PDF-1.")) rejected all PDF 2.0+ files with
// "not a PDF file: invalid header" despite them being structurally valid.
func TestPDF20HeaderAccepted(t *testing.T) {
// Build a minimal byte slice with a %PDF-2.0 header, a well-formed %%EOF,
// and a startxref. We only need the header check to pass; the reader will
// fail later when it cannot find a valid xref, but that is a different error.
src := []byte("%PDF-2.0\n%%EOF\n")
r := bytes.NewReader(src)
_, err := NewReader(r, int64(len(src)))
// Any error other than the old "invalid header" rejection is acceptable —
// the file is intentionally not a complete PDF.
if err != nil && bytes.Contains([]byte(err.Error()), []byte("invalid header")) {
t.Errorf("NewReader rejected %%PDF-2.0 header: %v", err)
}
}

// TestAESStringDecryption verifies that decryptString no longer panics when
// useAES is true. Previously it contained an unimplemented stub:
//
// panic("AES not implemented")
//
// PDF 32000-1:2008 §7.6.5 specifies AES-encrypted strings have the same
// layout as streams: a 16-byte IV followed by AES-CBC ciphertext with PKCS#7
// padding. V=4 R=4 PDFs (AESV2, /StrF /StdCF) encrypt all string tokens via
// this path; the panic surfaced on the first string encountered during parsing.
func TestAESStringDecryption(t *testing.T) {
// Construct a known AES-128-CBC ciphertext for the string "hello" and
// verify decryptString round-trips it correctly.
import_key := make([]byte, 16) // all-zero file key for test
ptr := objptr{id: 1, gen: 0}

// Encrypt "hello" + PKCS#7 padding (11 bytes pad to reach 16) with a
// known IV so we can assert the plaintext coming back.
iv := []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
plaintext := []byte("hello\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b") // 16 bytes with PKCS#7

perObjKey := cryptKey(import_key, true, ptr)
cb, _ := aes.NewCipher(perObjKey)
ciphertext := make([]byte, 16)
cipher.NewCBCEncrypter(cb, iv).CryptBlocks(ciphertext, plaintext)

input := string(append(iv, ciphertext...))
got := decryptString(import_key, true, ptr, input)
if got != "hello" {
t.Errorf("decryptString AES: got %q, want %q", got, "hello")
}
}

func TestCryptKeyTruncation(t *testing.T) {
ptr := objptr{id: 7874, gen: 0}

cases := []struct {
fileKeyLen int
wantKeyLen int
}{
{5, 10}, // RC4-40: min(5+5, 16) = 10
{7, 12}, // RC4-56: min(7+5, 16) = 12
{10, 15}, // RC4-80: min(10+5, 16) = 15
{11, 16}, // RC4-88: min(11+5, 16) = 16 (capped)
{16, 16}, // RC4-128: min(16+5, 16) = 16 (capped)
}

for _, tc := range cases {
key := make([]byte, tc.fileKeyLen)
got := cryptKey(key, false, ptr)
if len(got) != tc.wantKeyLen {
t.Errorf("cryptKey(%d-byte key): got %d bytes, want %d", tc.fileKeyLen, len(got), tc.wantKeyLen)
}
}
}

//
// this pdf has an object within stream which is handled different!
// the original implementation calculated the stream but didn't returned the object at resolve
Expand Down
4 changes: 3 additions & 1 deletion ps.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,9 @@ Reading:
val := stk.Pop()
key, ok := stk.Pop().data.(name)
if !ok {
panic("def of non-name")
// Some malformed cmap streams emit "def" with a non-name key.
// Skip the assignment and continue so text extraction can proceed.
continue
}
dicts[len(dicts)-1][key] = val.data
continue
Expand Down
Loading