Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions base/display/canvas.js
Original file line number Diff line number Diff line change
Expand Up @@ -1239,15 +1239,19 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {

//MQZ. Feb.20.2013. Disable character based painting, make it a string
// this.paintChar(character, scaledX, scaledY);
str += glyph.unicode || character;
if (accent) {
scaledAccentX = scaledX + accent.offset.x / fontSizeScale;
scaledAccentY = scaledY - accent.offset.y / fontSizeScale;
//MQZ. Feb.20.2013. Disable character based painting, make it a string
// this.paintChar(accent.fontChar, scaledAccentX, scaledAccentY);
// str += accent.fontChar;
}
}

// Always extract text for pdf2json, even if glyph is disabled for rendering (fixes issue #385)
str += glyph.unicode || character;
if (accent) {
// str += accent.fontChar; // Accent characters handled above
}

x += charWidth;

Expand Down
4 changes: 2 additions & 2 deletions lib/parserstream.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ export class ParserStream extends Transform {
}

static createOutputStream(outputPath, resolve, reject) {
const outputStream = fs.createWriteStream(outputPath);
const outputStream = fs.createWriteStream(outputPath, { encoding: 'utf8' });
outputStream.on('finish', () => resolve(outputPath));
outputStream.on('error', err => reject(err) );
return outputStream;
Expand Down Expand Up @@ -71,7 +71,7 @@ export class StringifyStream extends Transform {
}

_transform(obj, encoding, callback){
this.push(JSON.stringify(obj));
this.push(JSON.stringify(obj), 'utf8');
callback();
}
}
8 changes: 4 additions & 4 deletions lib/pdf.js
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ export default class PDFJSClass extends EventEmitter {
const isDup = j > 0 && PDFFont.areDuplicateBlocks(page.Texts[j - 1], t);
if (isDup) {
PJS.info(
`skipped: dup text block: ${decodeURIComponent(t.R[0].T)}`
`skipped: dup text block: ${t.R[0].T}`
);
}
return !isDup;
Expand All @@ -380,14 +380,14 @@ export default class PDFJSClass extends EventEmitter {
PDFFont.areAdjacentBlocks(prevText, text) &&
PDFFont.haveSameStyle(prevText, text)
) {
const preT = decodeURIComponent(prevText.R[0].T);
const curT = decodeURIComponent(text.R[0].T);
const preT = prevText.R[0].T;
const curT = text.R[0].T;

prevText.R[0].T += text.R[0].T;
prevText.w += text.w;
text.merged = true;

const mergedText = decodeURIComponent(prevText.R[0].T);
const mergedText = prevText.R[0].T;
PJS.info(
`merged text block: ${preT} + ${curT} => ${mergedText}`
);
Expand Down
33 changes: 20 additions & 13 deletions lib/pdffont.js
Original file line number Diff line number Diff line change
Expand Up @@ -558,22 +558,29 @@ export default class PDFFont {
}

/**
* Encode text for output
* Encode text for output - preserves UTF-8 multi-byte characters
* NOTE: Breaking change in v3.3.0 - removed URI encoding to fix issue #385
* Chinese/Japanese/Korean and other multi-byte characters now output as UTF-8
* @param {string} str - The string to encode
* @returns {string} - The encoded string
* @returns {string} - The encoded string with legacy character replacements
*/
flashEncode(str) {
let retVal = encodeURIComponent(str);
retVal = retVal.replace('%C2%96', '-');
retVal = retVal.replace('%C2%91', '%27');
retVal = retVal.replace('%C2%92', '%27');
retVal = retVal.replace('%C2%82', '%27');
retVal = retVal.replace('%C2%93', '%22');
retVal = retVal.replace('%C2%94', '%22');
retVal = retVal.replace('%C2%84', '%22');
retVal = retVal.replace('%C2%8B', '%C2%AB');
retVal = retVal.replace('%C2%9B', '%C2%BB');

if (!str) return str;

let retVal = str;

// Apply legacy Flash-specific character replacements
// These handle problematic characters from old PDF encodings
retVal = retVal.replace(/\u0096/g, '-'); // En dash
retVal = retVal.replace(/\u0091/g, "'"); // Left single quote
retVal = retVal.replace(/\u0092/g, "'"); // Right single quote
retVal = retVal.replace(/\u0082/g, "'"); // Low single quote
retVal = retVal.replace(/\u0093/g, '"'); // Left double quote
retVal = retVal.replace(/\u0094/g, '"'); // Right double quote
retVal = retVal.replace(/\u0084/g, '"'); // Low double quote
retVal = retVal.replace(/\u008B/g, '«'); // Left guillemet
retVal = retVal.replace(/\u009B/g, '»'); // Right guillemet

return retVal;
}

Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "pdf2json",
"version": "3.2.3",
"version": "3.3.0",
"description": "PDF file parser that converts PDF binaries to JSON and text, powered by porting a fork of PDF.JS to Node.js",
"keywords": [
"pdf",
Expand Down
4 changes: 2 additions & 2 deletions src/cli/p2jcli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ class PDFProcessor {
);
}

const outputStream = fs.createWriteStream(this.outputPath);
const outputStream = fs.createWriteStream(this.outputPath, { encoding: 'utf8' });
outputStream.on("finish", () => this.onPrimarySuccess(resolve, reject));
outputStream.on("error", (err) => this.onPrimaryError(err, reject));

Expand All @@ -163,7 +163,7 @@ class PDFProcessor {
}

this.pdfParser.on("pdfParser_dataReady", (evtData: PDFParserData) => {
fs.writeFile(this.outputPath, JSON.stringify(evtData), (err) => {
fs.writeFile(this.outputPath, JSON.stringify(evtData), 'utf8', (err) => {
if (err) {
this.onPrimaryError(err, reject);
} else {
Expand Down
19 changes: 9 additions & 10 deletions test/_test_type3glyph.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -32,28 +32,27 @@ describe('Type3 Glyph Font Tests', () => {
expect(pdfData).toBeDefined();
expect(pdfData.Pages).toBeDefined();
expect(pdfData.Pages.length).toBe(1);

const page = pdfData.Pages[0];
expect(page.Texts).toBeDefined();
expect(page.Texts.length).toBe(2); // Should have both Type3 and regular text

// Check for Type3 text "CONTENT"
const type3Text = page.Texts.find(text =>
text.R && text.R[0] && decodeURIComponent(text.R[0].T) === 'CONTENT'
text.R && text.R[0] && text.R[0].T === 'CONTENT'
);
expect(type3Text).toBeDefined();
expect(type3Text.R[0].T).toBe('CONTENT');
expect((type3Text.R[0].T)).toBe('CONTENT');

// Check for regular text "Added Text from Acrobat"
const regularText = page.Texts.find(text =>
text.R && text.R[0] && decodeURIComponent(text.R[0].T) === 'Added Text from Acrobat'
text.R && text.R[0] && text.R[0].T === 'Added Text from Acrobat'
);
expect(regularText).toBeDefined();
expect(decodeURIComponent(regularText.R[0].T)).toBe('Added Text from Acrobat');
expect(regularText.R[0].T).toBe('Added Text from Acrobat');

console.log('✓ Type3 glyph font parsing successful');
console.log(`✓ Found Type3 text: "${decodeURIComponent(type3Text.R[0].T)}"`);
console.log(`✓ Found regular text: "${decodeURIComponent(regularText.R[0].T)}"`);
console.log(`✓ Found Type3 text: "${type3Text.R[0].T}"`);
console.log(`✓ Found regular text: "${regularText.R[0].T}"`);

resolve();
} catch (error) {
Expand Down Expand Up @@ -89,7 +88,7 @@ describe('Type3 Glyph Font Tests', () => {
page.Texts.forEach(text => {
if (text.R) {
text.R.forEach(run => {
contentOutput += decodeURIComponent(run.T) + '\n';
contentOutput += run.T + '\n';
});
}
});
Expand All @@ -104,7 +103,7 @@ describe('Type3 Glyph Font Tests', () => {

expect(parsedJson.Pages[0].Texts.length).toBe(2);
expect(jsonContent).toContain('CONTENT');
expect(jsonContent).toContain('Added%20Text%20from%20Acrobat');
expect(jsonContent).toContain('Added Text from Acrobat');

// Verify content file exists and contains both texts
expect(fs.existsSync(contentOutputPath)).toBe(true);
Expand Down Expand Up @@ -139,7 +138,7 @@ describe('Type3 Glyph Font Tests', () => {

// Find Type3 text
const type3Text = page.Texts.find(text =>
text.R && text.R[0] && decodeURIComponent(text.R[0].T) === 'CONTENT'
text.R && text.R[0] && text.R[0].T === 'CONTENT'
);

// Verify Type3 text has proper positioning
Expand Down
Loading