Skip to content

Commit 7302857

Browse files
committed
Merge branch 'v2'
2 parents 7ca0bef + 19e06a7 commit 7302857

133 files changed

Lines changed: 3676 additions & 19504 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,4 @@ tests/MiniPdf.Benchmark/__pycache__/
3737
tests/MiniPdf.Scripts/pdf_test_single/docx_classic01_single_paragraph.pdf
3838
MiniPdf.Web/publish.zip
3939
/tests/MiniPdf.Scripts/pdf_output_baseline
40+
/tests/MiniPdf.Benchmark/minipdf_docx

_latest_bench.txt

Lines changed: 1 addition & 8597 deletions
Large diffs are not rendered by default.

src/MiniPdf/DocxReader.cs

Lines changed: 322 additions & 32 deletions
Large diffs are not rendered by default.

src/MiniPdf/DocxToPdfConverter.cs

Lines changed: 471 additions & 76 deletions
Large diffs are not rendered by default.

src/MiniPdf/PdfPage.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,9 +90,9 @@ internal PdfPage(float width, float height)
9090
/// <param name="fontSize">Font size in points (default: 12).</param>
9191
/// <param name="color">Text color (default: black).</param>
9292
/// <returns>The current page for chaining.</returns>
93-
public PdfPage AddText(string text, float x, float y, float fontSize = 12, PdfColor? color = null, (float, float, float, float)? clipRect = null, float? maxWidth = null)
93+
public PdfPage AddText(string text, float x, float y, float fontSize = 12, PdfColor? color = null, (float, float, float, float)? clipRect = null, float? maxWidth = null, bool bold = false, bool underline = false, float charSpacing = 0)
9494
{
95-
_textBlocks.Add(new PdfTextBlock(text, x, y, fontSize, color, clipRect, maxWidth));
95+
_textBlocks.Add(new PdfTextBlock(text, x, y, fontSize, color, clipRect, maxWidth, bold, underline, charSpacing));
9696
return this;
9797
}
9898

src/MiniPdf/PdfTextBlock.cs

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,22 @@ internal sealed class PdfTextBlock
4747
/// </summary>
4848
public float? MaxWidth { get; }
4949

50-
internal PdfTextBlock(string text, float x, float y, float fontSize, PdfColor? color = null, (float, float, float, float)? clipRect = null, float? maxWidth = null)
50+
/// <summary>
51+
/// Whether to render text using the bold font variant.
52+
/// </summary>
53+
public bool Bold { get; }
54+
55+
/// <summary>
56+
/// Whether to render an underline below the text.
57+
/// </summary>
58+
public bool Underline { get; }
59+
60+
/// <summary>
61+
/// Character spacing in points (PDF Tc operator). 0 means default.
62+
/// </summary>
63+
public float CharSpacing { get; }
64+
65+
internal PdfTextBlock(string text, float x, float y, float fontSize, PdfColor? color = null, (float, float, float, float)? clipRect = null, float? maxWidth = null, bool bold = false, bool underline = false, float charSpacing = 0)
5166
{
5267
Text = text;
5368
X = x;
@@ -56,5 +71,8 @@ internal PdfTextBlock(string text, float x, float y, float fontSize, PdfColor? c
5671
Color = color ?? PdfColor.Black;
5772
ClipRect = clipRect;
5873
MaxWidth = maxWidth;
74+
Bold = bold;
75+
Underline = underline;
76+
CharSpacing = charSpacing;
5977
}
6078
}

src/MiniPdf/PdfWriter.cs

Lines changed: 48 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -215,10 +215,10 @@ internal void Write(PdfDocument document)
215215
contentStreams.Add(Encoding.Latin1.GetBytes(BuildContentStream(pages[i], embeddedFonts.Count > 0, cpToFontSlot, embeddedFonts)));
216216

217217
// Allocate object numbers.
218-
// 1 = Catalog, 2 = Pages, 3 = Font F1 (Helvetica/WinAnsi)
218+
// 1 = Catalog, 2 = Pages, 3 = Font F1 (Helvetica/WinAnsi), 4 = Font F1B (Helvetica-Bold/WinAnsi)
219219
// Per embedded font: 6 objects (ToUnicode, Descriptor, CIDFont, Type0, FontFile2, CIDToGIDMap)
220220
// Per page: content stream obj, N image XObject objs, page obj
221-
var nextObj = 4;
221+
var nextObj = 5;
222222

223223
// Allocate font objects
224224
foreach (var ef in embeddedFonts)
@@ -263,6 +263,10 @@ internal void Write(PdfDocument document)
263263
_objectOffsets[3] = Position;
264264
WriteRaw("3 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica /Encoding /WinAnsiEncoding >>\nendobj\n");
265265

266+
// ── Object 4: Font F1B (Helvetica-Bold, built-in WinAnsiEncoding) ──
267+
_objectOffsets[4] = Position;
268+
WriteRaw("4 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding >>\nendobj\n");
269+
266270
// ── Per-font objects (F2, F3, …) ───────────────────────────────────
267271
for (var fi = 0; fi < embeddedFonts.Count; fi++)
268272
{
@@ -356,8 +360,8 @@ internal void Write(PdfDocument document)
356360
WriteRaw($"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 {w} {h}]\n");
357361
WriteRaw($"/Contents {contentObjNums[i]} 0 R\n");
358362
WriteRaw("/Resources <<\n");
359-
// Font dictionary: F1 + Fn for each embedded font
360-
WriteRaw("/Font << /F1 3 0 R");
363+
// Font dictionary: F1, F1B + Fn for each embedded font
364+
WriteRaw("/Font << /F1 3 0 R /F1B 4 0 R");
361365
for (var fi = 0; fi < embeddedFonts.Count; fi++)
362366
WriteRaw($" /F{fi + 2} {embeddedFonts[fi].Type0Obj} 0 R");
363367
WriteRaw(" >>\n");
@@ -514,15 +518,19 @@ private static string BuildContentStream(PdfPage page, bool hasUnicodeFont, Dict
514518

515519
if (!hasUnicodeFont || !block.Text.Any(c => !IsWinAnsiHandled(c)))
516520
{
517-
// Pure Latin-1 text — use F1 (Helvetica) as before
521+
// Pure Latin-1 text — use F1 (Helvetica) or F1B (Helvetica-Bold)
522+
var fontName = block.Bold ? "F1B" : "F1";
518523
var escapedText = EscapePdfString(block.Text);
519524
sb.Append("BT\n");
520525
sb.Append(colorCmd);
521-
sb.Append($"/F1 {fontSize} Tf\n");
526+
sb.Append($"/{fontName} {fontSize} Tf\n");
527+
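// Apply character spacing (Tc). NOTE(review): Tc is text state and persists after ET; blocks with zero spacing emit no Tc, so confirm the value is reset (or the state restored) before later blocks.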
528+
if (block.CharSpacing != 0)
529+
sb.Append($"{block.CharSpacing.ToString("F2", CultureInfo.InvariantCulture)} Tc\n");
522530
// Apply horizontal scaling if text overflows MaxWidth
523531
if (block.MaxWidth.HasValue)
524532
{
525-
var naturalWidth = MeasureTextWidth(block.Text, block.FontSize);
533+
var naturalWidth = MeasureTextWidth(block.Text, block.FontSize, block.CharSpacing);
526534
if (naturalWidth > block.MaxWidth.Value && naturalWidth > 0)
527535
{
528536
var tzPercent = (block.MaxWidth.Value / naturalWidth) * 100.0;
@@ -544,10 +552,13 @@ private static string BuildContentStream(PdfPage page, bool hasUnicodeFont, Dict
544552
// emit each run with the appropriate Fn, using Td to advance.
545553
sb.Append("BT\n");
546554
sb.Append(colorCmd);
555+
// Apply character spacing (Tc)
556+
if (block.CharSpacing != 0)
557+
sb.Append($"{block.CharSpacing.ToString("F2", CultureInfo.InvariantCulture)} Tc\n");
547558
// Apply horizontal scaling if text overflows MaxWidth
548559
if (block.MaxWidth.HasValue)
549560
{
550-
var naturalWidth = MeasureTextWidth(block.Text, block.FontSize);
561+
var naturalWidth = MeasureTextWidth(block.Text, block.FontSize, block.CharSpacing);
551562
if (naturalWidth > block.MaxWidth.Value && naturalWidth > 0)
552563
{
553564
var tzPercent = (block.MaxWidth.Value / naturalWidth) * 100.0;
@@ -594,6 +605,25 @@ private static string BuildContentStream(PdfPage page, bool hasUnicodeFont, Dict
594605
// Restore graphics state after clipping
595606
if (hasClip)
596607
sb.Append("Q\n");
608+
609+
// Render underline as a stroked line below the text. It is drawn after the clip state is restored, so ClipRect does not clip the underline.
610+
if (block.Underline)
611+
{
612+
var textWidth = MeasureTextWidth(block.Text, block.FontSize, block.CharSpacing);
613+
if (block.MaxWidth.HasValue && textWidth > block.MaxWidth.Value)
614+
textWidth = block.MaxWidth.Value;
615+
var ulY = block.Y - block.FontSize * 0.15f; // position below baseline
616+
var ulThickness = Math.Max(0.5f, block.FontSize * 0.05f);
617+
var x1 = block.X.ToString("F3", CultureInfo.InvariantCulture);
618+
var y1 = ulY.ToString("F3", CultureInfo.InvariantCulture);
619+
var x2 = (block.X + textWidth).ToString("F3", CultureInfo.InvariantCulture);
620+
var lw = ulThickness.ToString("F3", CultureInfo.InvariantCulture);
621+
sb.Append($"{block.Color.R.ToString("F3", CultureInfo.InvariantCulture)} " +
622+
$"{block.Color.G.ToString("F3", CultureInfo.InvariantCulture)} " +
623+
$"{block.Color.B.ToString("F3", CultureInfo.InvariantCulture)} RG\n");
624+
sb.Append($"{lw} w\n");
625+
sb.Append($"{x1} {y1} m {x2} {y1} l S\n");
626+
}
597627
}
598628

599629
return sb.ToString();
@@ -603,7 +633,7 @@ private static string BuildContentStream(PdfPage page, bool hasUnicodeFont, Dict
603633
/// Measures the natural rendering width of text in Helvetica at the given font size, including any character spacing between glyphs.
604634
/// Uses the standard Helvetica character width table; there is no bold parameter, so bold (F1B) text is measured with the same table.
605635
/// </summary>
606-
private static double MeasureTextWidth(string text, float fontSize)
636+
private static double MeasureTextWidth(string text, float fontSize, float charSpacing = 0)
607637
{
608638
double total = 0;
609639
foreach (var ch in text)
@@ -633,7 +663,11 @@ private static double MeasureTextWidth(string text, float fontSize)
633663
};
634664
total += w;
635665
}
636-
return total * fontSize / 1000.0;
666+
var result = total * fontSize / 1000.0;
667+
// Tc advances by charSpacing points after every glyph; the gap after the last glyph adds no visible width, so only Length - 1 gaps are counted
668+
if (charSpacing != 0 && text.Length > 1)
669+
result += charSpacing * (text.Length - 1);
670+
return result;
637671
}
638672

639673
/// <summary>
@@ -806,12 +840,12 @@ private static string EscapePdfString(string text)
806840
'\u20AC' => (char)0x80, // euro sign
807841
'\u00A0' => ' ', // non-breaking space
808842
'\u0060' => '\'', // backtick → apostrophe
809-
'\u00B7' => '*', // middle dot → asterisk
810-
'\u00D7' => 'x', // multiplication sign
811-
'\u00F7' => '/', // division sign
843+
'\u00B7' => '\u00B7', // middle dot (already in WinAnsi)
844+
'\u00D7' => '\u00D7', // multiplication sign (already in WinAnsi)
845+
'\u00F7' => '\u00F7', // division sign (already in WinAnsi)
812846
'\u2264' => "<=", // ≤
813847
'\u2265' => ">=", // ≥
814-
'\u00B0' => " deg", // degree sign
848+
'\u00B0' => '\u00B0', // degree sign (already in WinAnsi)
815849
'\u00AE' => (char)0xAE, // registered trademark (already in WinAnsi)
816850
'\u00A3' => '\u00A3', // pound sign (already in WinAnsi)
817851
'\u00A5' => '\u00A5', // yen sign (already in WinAnsi)
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
"""Investigate the spacing issue around checkmarks in SA8000"""
2+
import fitz
3+
4+
mini = fitz.open(r'D:\git\MiniPdf-v2\tests\Issue_Files\minipdf_docx\SA8000 ch sample.pdf')
5+
6+
# Find the line with item 1 checkmark
7+
td = mini[0].get_text("dict")
8+
for block in td["blocks"]:
9+
if "lines" not in block:
10+
continue
11+
for line in block["lines"]:
12+
text = "".join(s["text"] for s in line["spans"])
13+
if "签定劳动合同" in text:
14+
print(f"Line text: {repr(text)}")
15+
for i, span in enumerate(line["spans"]):
16+
print(f" Span {i}: x={span['origin'][0]:.1f} y={span['origin'][1]:.1f} "
17+
f"size={span['size']:.1f} text={repr(span['text'][:40])}")
18+
# Also show raw character positions
19+
chars = mini[0].get_text("rawdict")
20+
for rb in chars["blocks"]:
21+
if "lines" not in rb:
22+
continue
23+
for rl in rb["lines"]:
24+
rt = "".join(s["text"] for s in rl["spans"])
25+
if "签定劳动合同" in rt:
26+
for ri, rs in enumerate(rl["spans"]):
27+
if "(" in rs["text"] or "√" in rs["text"] or ")" in rs["text"]:
28+
print(f"\n Raw span {ri}: text={repr(rs['text'][:30])}")
29+
if "chars" in rs:
30+
for ci, ch in enumerate(rs["chars"][:20]):
31+
print(f" char[{ci}]: '{ch['c']}' x={ch['origin'][0]:.1f} bbox={tuple(round(v,1) for v in ch['bbox'])}")
32+
break

tests/Issue_Files/_check_coords.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import fitz
2+
3+
# Check Y coordinates of text spans for paragraph 1 on page 1
4+
doc = fitz.open('minipdf_docx/SA8000 ch sample.pdf')
5+
page = doc[0]
6+
blocks = page.get_text('dict')['blocks']
7+
for b in blocks:
8+
if 'lines' not in b:
9+
continue
10+
for line in b['lines']:
11+
for span in line['spans']:
12+
t = span['text']
13+
if '签定' in t or t.strip() in ['√', ')', '(']:
14+
bbox = span['bbox']
15+
print(f"y0={bbox[1]:.1f} y1={bbox[3]:.1f} x0={bbox[0]:.1f} x1={bbox[2]:.1f} fs={span['size']:.1f} text={repr(t)}")
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
"""Compare extracted text for first few lines of SA8000 MiniPdf vs Reference"""
2+
import fitz # PyMuPDF
3+
4+
mini = fitz.open(r'D:\git\MiniPdf-v2\tests\Issue_Files\minipdf_docx\SA8000 ch sample.pdf')
5+
ref = fitz.open(r'D:\git\MiniPdf-v2\tests\Issue_Files\reference_docx\SA8000 ch sample.pdf')
6+
7+
print("=== MiniPdf Page 1 first 500 chars ===")
8+
mt = mini[0].get_text()
9+
print(repr(mt[:500]))
10+
11+
print("\n=== Reference Page 1 first 500 chars ===")
12+
rt = ref[0].get_text()
13+
print(repr(rt[:500]))
14+
15+
# Also check raw text spans for first block
16+
print("\n=== MiniPdf Page 1 text blocks (first 5) ===")
17+
for i, block in enumerate(mini[0].get_text("blocks")[:5]):
18+
print(f" B{i}: bbox={block[:4]}: {repr(block[4][:100])}")
19+
20+
print("\n=== Reference Page 1 text blocks (first 5) ===")
21+
for i, block in enumerate(ref[0].get_text("blocks")[:5]):
22+
print(f" B{i}: bbox={block[:4]}: {repr(block[4][:100])}")
23+
24+
# Check text spans for P1 line
25+
print("\n=== MiniPdf Page 1 text dict (first 3 blocks) ===")
26+
td = mini[0].get_text("dict")
27+
for i, block in enumerate(td["blocks"][:3]):
28+
if "lines" in block:
29+
for j, line in enumerate(block["lines"]):
30+
for k, span in enumerate(line["spans"]):
31+
print(f" B{i}L{j}S{k}: origin={span['origin']}, size={span['size']:.1f}, text={repr(span['text'][:80])}")

0 commit comments

Comments
 (0)