diff --git a/go.mod b/go.mod new file mode 100644 index 00000000..689d8907 --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module github.com/BrennenWright/pdf + +go 1.22.2 diff --git a/lex.go b/lex.go index 652e18b0..d2b17059 100644 --- a/lex.go +++ b/lex.go @@ -121,6 +121,13 @@ func (b *buffer) readOffset() int64 { return b.offset - int64(len(b.buf)) + int64(b.pos) } +// exhausted reports that the buffer reached EOF and has no bytes left. +// readByte returns a synthetic '\n' after EOF, which can otherwise allow +// malformed unterminated tokens to loop forever. +func (b *buffer) exhausted() bool { + return b.eof && b.pos >= len(b.buf) +} + func (b *buffer) unreadByte() { if b.pos > 0 { b.pos-- @@ -193,26 +200,34 @@ func (b *buffer) readToken() token { func (b *buffer) readHexString() token { tmp := b.tmp[:0] for { - Loop: + if b.exhausted() { + break + } c := b.readByte() if c == '>' { break } if isSpace(c) { - goto Loop - } - Loop2: - c2 := b.readByte() - if isSpace(c2) { - goto Loop2 + continue } - x := unhex(c)<<4 | unhex(c2) - if x < 0 { - fmt.Sprint(b.errorf("malformed hex string %c %c %s", c, c2, b.buf[b.pos:])) + for { + if b.exhausted() { + goto hexDone + } + c2 := b.readByte() + if isSpace(c2) { + continue + } + x := unhex(c)<<4 | unhex(c2) + if x < 0 { + fmt.Sprint(b.errorf("malformed hex string %c %c %s", c, c2, b.buf[b.pos:])) + goto hexDone + } + tmp = append(tmp, byte(x)) break } - tmp = append(tmp, byte(x)) } +hexDone: b.tmp = tmp return string(tmp) } @@ -234,6 +249,9 @@ func (b *buffer) readLiteralString() token { depth := 1 Loop: for { + if b.exhausted() { + break + } c := b.readByte() switch c { default: @@ -423,6 +441,9 @@ type objdef struct { func (b *buffer) readObject() (object, error) { tok := b.readToken() + if tok == io.EOF { + return nil, errors.New("unexpected EOF parsing PDF object") + } if kw, ok := tok.(keyword); ok { switch kw { case "null": @@ -481,13 +502,13 @@ func (b *buffer) readArray() object { var x array for { tok := b.readToken() - if tok == nil || tok == keyword("]") { + if tok == io.EOF || tok == nil || tok == keyword("]") { break } b.unreadToken(tok) res, err := b.readObject() if err != nil { - return err + break } x = append(x, res) } @@ -498,7 +519,7 @@ func (b *buffer) readDict() object { x := make(dict) for { tok := b.readToken() - if tok == nil || tok == keyword(">>") { + if tok == io.EOF || tok == nil || tok == keyword(">>") { break } n, ok := tok.(name) diff --git a/page.go b/page.go index 96473570..1154b5a1 100644 --- a/page.go +++ b/page.go @@ -50,6 +50,8 @@ Search: num-- } } + // If Count overstates actual reachable kids, avoid looping forever. + break } return Page{} } @@ -393,7 +395,7 @@ func readCmap(toUnicode Value) *cmap { stk.Pop().Name() // key stk.Push(value) default: - println("interp\t", op) + // Ignore unrecognized cmap operators. } }) if !ok { @@ -505,7 +507,8 @@ func (p Page) GetPlainText(fonts map[string]*Font) (result string, err error) { showText("\n") case "Tf": // set text font and size if len(args) != 2 { - panic("bad TL") + // Skip malformed Tf; continue interpreting rest of stream + return } if font, ok := fonts[args[0].Name()]; ok { enc = font.Encoder() @@ -712,7 +715,8 @@ func (p Page) walkTextBlocks(walker func(enc TextEncoding, x, y float64, s strin case "T*": // move to start of next line case "Tf": // set text font and size if len(args) != 2 { - panic("bad TL") + // Skip malformed Tf; continue interpreting rest of stream + return } if font, ok := fonts[args[0].Name()]; ok { @@ -911,7 +915,8 @@ func (p Page) readContent(strm Value) Content { case "Tf": // set text font and size if len(args) != 2 { - panic("bad TL") + // Skip malformed Tf; continue interpreting rest of stream + return } f := args[0].Name() g.Tf = p.Font(f) @@ -944,29 +949,24 @@ func (p Page) readContent(strm Value) Content { showText(args[0].RawString()) case "TJ": // show text, allowing individual glyph positioning - if len(args) > 0 { // bugfix: don't raise an exception - v := args[0] - for i := 0; i < v.Len(); i++ { - x := v.Index(i) - if x.Kind() == String { - if i == v.Len()-1 { - showText(x.RawString()) - op = "BT" - continue - } else { - showText(x.RawString()) - } - } else { - tx := -x.Float64() / 1000 * g.Tfs * g.Th - g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm) - } + if len(args) != 1 { + break + } + v := args[0] + for i := 0; i < v.Len(); i++ { + x := v.Index(i) + if x.Kind() == String { + showText(x.RawString()) + } else { + tx := -x.Float64() / 1000 * g.Tfs * g.Th + g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm) } - // showText("\n") } case "TL": // set text leading if len(args) != 1 { - panic("bad TL") + // Skip malformed TL; continue interpreting rest of stream + return } g.Tl = args[0].Float64() diff --git a/pdf_test.go b/pdf_test.go index 0681b32c..b02b3c73 100644 --- a/pdf_test.go +++ b/pdf_test.go @@ -2,12 +2,14 @@ package pdf import ( "bytes" + "crypto/aes" + "crypto/cipher" "fmt" "os" + "path/filepath" "strconv" "strings" "testing" - "path/filepath" ) var referenceFirstPage = `TEST FILE @@ -27,6 +29,89 @@ erat, sed diam voluptua. At vero eos et accusam et TEST SUBTITLE` +// TestCryptKeyTruncation verifies that cryptKey truncates its output to +// min(len(fileKey)+5, 16) bytes as required by PDF 32000-1:2008 §7.6.3.3 step 4. +// +// Previously cryptKey returned the full 16-byte MD5 digest regardless of the +// file key length. For 40-bit RC4 encryption (5-byte file key) the correct +// per-object key is 10 bytes; using 16 bytes produces a completely wrong +// RC4 keystream, causing all object stream decryptions to fail silently with +// garbage output and manifesting as "cannot find object in stream" panics. +// +// The bug went undetected because 128-bit RC4 files happen to need all 16 +// bytes (min(21, 16) = 16), so only sub-128-bit encrypted PDFs were affected. +// TestPDF20HeaderAccepted verifies that NewReaderEncrypted accepts a %PDF-2.0 header. +// The previous check (HasPrefix("%PDF-1.")) rejected all PDF 2.0+ files with +// "not a PDF file: invalid header" despite them being structurally valid. +func TestPDF20HeaderAccepted(t *testing.T) { + // Build a minimal byte slice with a %PDF-2.0 header, a well-formed %%EOF, + // and a startxref. We only need the header check to pass; the reader will + // fail later when it cannot find a valid xref, but that is a different error. + src := []byte("%PDF-2.0\n%%EOF\n") + r := bytes.NewReader(src) + _, err := NewReader(r, int64(len(src))) + // Any error other than the old "invalid header" rejection is acceptable — + // the file is intentionally not a complete PDF. + if err != nil && bytes.Contains([]byte(err.Error()), []byte("invalid header")) { + t.Errorf("NewReader rejected %%PDF-2.0 header: %v", err) + } +} + +// TestAESStringDecryption verifies that decryptString no longer panics when +// useAES is true. Previously it contained an unimplemented stub: +// +// panic("AES not implemented") +// +// PDF 32000-1:2008 §7.6.5 specifies AES-encrypted strings have the same +// layout as streams: a 16-byte IV followed by AES-CBC ciphertext with PKCS#7 +// padding. V=4 R=4 PDFs (AESV2, /StrF /StdCF) encrypt all string tokens via +// this path; the panic surfaced on the first string encountered during parsing. +func TestAESStringDecryption(t *testing.T) { + // Construct a known AES-128-CBC ciphertext for the string "hello" and + // verify decryptString round-trips it correctly. + import_key := make([]byte, 16) // all-zero file key for test + ptr := objptr{id: 1, gen: 0} + + // Encrypt "hello" + PKCS#7 padding (11 bytes pad to reach 16) with a + // known IV so we can assert the plaintext coming back. + iv := []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + plaintext := []byte("hello\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b") // 16 bytes with PKCS#7 + + perObjKey := cryptKey(import_key, true, ptr) + cb, _ := aes.NewCipher(perObjKey) + ciphertext := make([]byte, 16) + cipher.NewCBCEncrypter(cb, iv).CryptBlocks(ciphertext, plaintext) + + input := string(append(iv, ciphertext...)) + got := decryptString(import_key, true, ptr, input) + if got != "hello" { + t.Errorf("decryptString AES: got %q, want %q", got, "hello") + } +} + +func TestCryptKeyTruncation(t *testing.T) { + ptr := objptr{id: 7874, gen: 0} + + cases := []struct { + fileKeyLen int + wantKeyLen int + }{ + {5, 10}, // RC4-40: min(5+5, 16) = 10 + {7, 12}, // RC4-56: min(7+5, 16) = 12 + {10, 15}, // RC4-80: min(10+5, 16) = 15 + {11, 16}, // RC4-88: min(11+5, 16) = 16 (capped) + {16, 16}, // RC4-128: min(16+5, 16) = 16 (capped) + } + + for _, tc := range cases { + key := make([]byte, tc.fileKeyLen) + got := cryptKey(key, false, ptr) + if len(got) != tc.wantKeyLen { + t.Errorf("cryptKey(%d-byte key): got %d bytes, want %d", tc.fileKeyLen, len(got), tc.wantKeyLen) + } + } +} + // // this pdf has an object within stream which is handled different! // the original implementation calculated the stream but didn't returned the object at resolve diff --git a/ps.go b/ps.go index c7ec20e3..ff1b746d 100644 --- a/ps.go +++ b/ps.go @@ -109,7 +109,9 @@ Reading: val := stk.Pop() key, ok := stk.Pop().data.(name) if !ok { - panic("def of non-name") + // Some malformed cmap streams emit "def" with a non-name key. + // Skip the assignment and continue so text extraction can proceed. + continue } dicts[len(dicts)-1][key] = val.data continue diff --git a/read.go b/read.go index 04480ecc..cb0b26d2 100644 --- a/read.go +++ b/read.go @@ -124,9 +124,25 @@ func NewReader(f io.ReaderAt, size int64) (*Reader, error) { // to try. If pw returns the empty string, NewReaderEncrypted stops trying to decrypt // the file and returns an error. func NewReaderEncrypted(f io.ReaderAt, size int64, pw func() string) (*Reader, error) { - buf := make([]byte, 10) - f.ReadAt(buf, 0) - if !bytes.HasPrefix(buf, []byte("%PDF-1.")) || buf[7] < '0' || buf[7] > '7' || buf[8] != '\r' && buf[8] != '\n' { + // Be permissive about optional whitespace/comments after the version token. + // Some producers emit "%PDF-1.x \r" and are still valid for parsing. + buf := make([]byte, 1024) + n, _ := f.ReadAt(buf, 0) + if n < 8 { + return nil, fmt.Errorf("not a PDF file: invalid header") + } + buf = buf[:n] + if i := bytes.IndexByte(buf, '\r'); i >= 0 { + buf = buf[:i] + } + if i := bytes.IndexByte(buf, '\n'); i >= 0 { + buf = buf[:i] + } + // Accept any %PDF-M.m version header. The PDF content stream format and + // cross-reference structure have remained stable across all published + // versions (1.0–1.7, 2.0), so restricting to %PDF-1.x incorrectly rejects + // structurally valid PDF 2.0 and any future revision files. + if !bytes.HasPrefix(buf, []byte("%PDF-")) { return nil, fmt.Errorf("not a PDF file: invalid header") } end := size @@ -275,6 +291,61 @@ func readXrefStream(r *Reader, b *buffer) ([]xref, objptr, dict, error) { return table, strmptr, strm.hdr, nil } +// normalizeDecodeParmsColumns ensures PNG/TIFF predictor streams use /Columns equal to the +// xref row width (sum of the /W widths). Many generators omit /Columns or set it wrong, +// which makes pngPredictorReader read the wrong row size and hit io.EOF before the xref +// table is fully consumed. +func normalizeDecodeParmsColumns(dp object, rowBytes int64) object { + if dp == nil || rowBytes <= 0 { + return dp + } + switch x := dp.(type) { + case dict: + return normalizeOneDecodeParmsDict(x, rowBytes) + case array: + out := make(array, len(x)) + for i, o := range x { + if d, ok := o.(dict); ok { + out[i] = normalizeOneDecodeParmsDict(d, rowBytes) + } else { + out[i] = o + } + } + return out + default: + return dp + } +} + +func normalizeOneDecodeParmsDict(d dict, rowBytes int64) dict { + if d == nil || rowBytes <= 0 { + return d + } + out := make(dict) + for k, v := range d { + out[k] = v + } + pred, ok := out[name("Predictor")].(int64) + if !ok || pred < 10 || pred > 15 { + return out + } + out[name("Columns")] = rowBytes + return out +} + +func streamReaderForXrefRows(r *Reader, strm stream, rowBytes int) io.ReadCloser { + if rowBytes <= 0 { + return Value{r, objptr{}, strm}.Reader() + } + hdr := make(dict) + for k, v := range strm.hdr { + hdr[k] = v + } + hdr[name("DecodeParms")] = normalizeDecodeParmsColumns(strm.hdr[name("DecodeParms")], int64(rowBytes)) + fixed := stream{hdr: hdr, ptr: strm.ptr, offset: strm.offset} + return Value{r, objptr{}, fixed}.Reader() +} + func readXrefStreamData(r *Reader, strm stream, table []xref, size int64) ([]xref, error) { index, _ := strm.hdr["Index"].(array) if index == nil { @@ -300,13 +371,13 @@ func readXrefStreamData(r *Reader, strm stream, table []xref, size int64) ([]xre return nil, fmt.Errorf("invalid W array %v", objfmt(ww)) } - v := Value{r, objptr{}, strm} wtotal := 0 for _, wid := range w { wtotal += wid } buf := make([]byte, wtotal) - data := v.Reader() + data := streamReaderForXrefRows(r, strm, wtotal) + defer data.Close() for len(index) > 0 { start, ok1 := index[0].(int64) n, ok2 := index[1].(int64) @@ -864,19 +935,29 @@ func applyFilter(rd io.Reader, name string, param Value) io.Reader { case "FlateDecode": zr, err := zlib.NewReader(rd) if err != nil { - panic(err) + // Malformed compressed stream: return empty content for this stream + // so other streams/pages can still be extracted. + return bytes.NewReader(nil) } pred := param.Key("Predictor") if pred.Kind() == Null { return zr } columns := param.Key("Columns").Int64() + if columns <= 0 { + return zr + } switch pred.Int64() { default: fmt.Println("unknown predictor", pred) panic("pred") - case 12: - return &pngUpReader{r: zr, hist: make([]byte, 1+columns), tmp: make([]byte, 1+columns)} + case 10, 11, 12, 13, 14, 15: + return &pngPredictorReader{ + r: zr, + tmp: make([]byte, 1+columns), + prev: make([]byte, columns), + curr: make([]byte, columns), + } } case "ASCII85Decode": cleanASCII85 := newAlphaReader(rd) @@ -891,14 +972,15 @@ func applyFilter(rd io.Reader, name string, param Value) io.Reader { } } -type pngUpReader struct { +type pngPredictorReader struct { r io.Reader - hist []byte tmp []byte + prev []byte + curr []byte pend []byte } -func (r *pngUpReader) Read(b []byte) (int, error) { +func (r *pngPredictorReader) Read(b []byte) (int, error) { n := 0 for len(b) > 0 { if len(r.pend) > 0 { @@ -912,17 +994,77 @@ func (r *pngUpReader) Read(b []byte) (int, error) { if err != nil { return n, err } - if r.tmp[0] != 2 { - return n, fmt.Errorf("malformed PNG-Up encoding") - } - for i, b := range r.tmp { - r.hist[i] += b + + filter := r.tmp[0] + src := r.tmp[1:] + switch filter { + case 0: // None + copy(r.curr, src) + case 1: // Sub + for i := range src { + left := byte(0) + if i > 0 { + left = r.curr[i-1] + } + r.curr[i] = src[i] + left + } + case 2: // Up + for i := range src { + r.curr[i] = src[i] + r.prev[i] + } + case 3: // Average + for i := range src { + left := byte(0) + if i > 0 { + left = r.curr[i-1] + } + up := r.prev[i] + r.curr[i] = src[i] + byte((int(left)+int(up))/2) + } + case 4: // Paeth + for i := range src { + left := byte(0) + upLeft := byte(0) + if i > 0 { + left = r.curr[i-1] + upLeft = r.prev[i-1] + } + up := r.prev[i] + r.curr[i] = src[i] + paeth(left, up, upLeft) + } + default: + // Some malformed PDFs store unexpected filter bytes in predictor streams. + // Keep extraction moving by treating the row as unfiltered data. + copy(r.curr, src) } - r.pend = r.hist[1:] + copy(r.prev, r.curr) + r.pend = r.curr } return n, nil } +func paeth(a, b, c byte) byte { + ai, bi, ci := int(a), int(b), int(c) + p := ai + bi - ci + pa := absInt(p - ai) + pb := absInt(p - bi) + pc := absInt(p - ci) + if pa <= pb && pa <= pc { + return a + } + if pb <= pc { + return b + } + return c +} + +func absInt(v int) int { + if v < 0 { + return -v + } + return v +} + var passwordPad = []byte{ 0x28, 0xBF, 0x4E, 0x5E, 0x4E, 0x75, 0x8A, 0x41, 0x64, 0x00, 0x4E, 0x56, 0xFF, 0xFA, 0x01, 0x08, 0x2E, 0x2E, 0x00, 0xB6, 0xD0, 0x68, 0x3E, 0x80, 0x2F, 0x0C, 0xA9, 0xFE, 0x64, 0x53, 0x69, 0x7A, @@ -1072,20 +1214,43 @@ func cryptKey(key []byte, useAES bool, ptr objptr) []byte { if useAES { h.Write([]byte("sAlT")) } - return h.Sum(nil) + // PDF 32000-1:2008 §7.6.3.3 step 4: use the first min(n/8+5, 16) bytes. + // For RC4-40 (n=40) that is 10; for RC4-128 (n=128) it is 16. Without this + // truncation the library uses all 16 MD5 bytes regardless of the file key + // length, which decrypts every stream incorrectly for sub-128-bit key sizes. + n := len(key) + 5 + if n > 16 { + n = 16 + } + return h.Sum(nil)[:n] } func decryptString(key []byte, useAES bool, ptr objptr, x string) string { key = cryptKey(key, useAES, ptr) if useAES { - panic("AES not implemented") - } else { - c, _ := rc4.NewCipher(key) + // PDF 32000-1:2008 §7.6.5: AES-encrypted strings have the same layout + // as AES-encrypted streams — a 16-byte IV prepended to the AES-CBC + // ciphertext, followed by PKCS#7 padding. data := []byte(x) - c.XORKeyStream(data, data) - x = string(data) + if len(data) < 32 || len(data[16:])%16 != 0 { + return x // malformed ciphertext; return raw rather than panic + } + cb, err := aes.NewCipher(key) + if err != nil { + return x + } + plain := make([]byte, len(data)-16) + cipher.NewCBCDecrypter(cb, data[:16]).CryptBlocks(plain, data[16:]) + // Strip PKCS#7 padding. + if pad := int(plain[len(plain)-1]); pad > 0 && pad <= 16 && pad <= len(plain) { + plain = plain[:len(plain)-pad] + } + return string(plain) } - return x + c, _ := rc4.NewCipher(key) + data := []byte(x) + c.XORKeyStream(data, data) + return string(data) } func decryptStream(key []byte, useAES bool, ptr objptr, rd io.Reader) io.Reader {