tables = Utils.GetTables(
+ page,
+ clip: page.Rect,
+ vertical_strategy: "lines_strict",
+ horizontal_strategy: "lines_strict");
+
+ Console.WriteLine($"Found {tables.Count} table(s) on page 0");
+
+ if (tables.Count > 0)
+ {
+ for (int i = 0; i < tables.Count; i++)
+ {
+ Table table = tables[i];
+ Console.WriteLine($"\n Table {i + 1}:");
+ Console.WriteLine($" Rows: {table.row_count}");
+ Console.WriteLine($" Columns: {table.col_count}");
+ if (table.bbox != null)
+ {
+ Console.WriteLine($" BBox: ({table.bbox.X0:F2}, {table.bbox.Y0:F2}, {table.bbox.X1:F2}, {table.bbox.Y1:F2})");
+ }
+
+ // Display header information
+ if (table.header != null)
+ {
+ Console.WriteLine($" Header:");
+ Console.WriteLine($" External: {table.header.external}");
+ if (table.header.names != null && table.header.names.Count > 0)
+ {
+ Console.WriteLine($" Column names: {string.Join(", ", table.header.names)}");
+ }
+ }
+
+ // Extract table data
+ Console.WriteLine($"\n Extracting table data...");
+ List> tableData = table.Extract();
+ if (tableData != null && tableData.Count > 0)
+ {
+ Console.WriteLine($" Extracted {tableData.Count} row(s) of data");
+ // Show first few rows as preview
+ int previewRows = Math.Min(3, tableData.Count);
+ for (int row = 0; row < previewRows; row++)
+ {
+ var rowData = tableData[row];
+ if (rowData != null)
+ {
+ Console.WriteLine($" Row {row + 1}: {string.Join(" | ", rowData.Take(5))}"); // Show first 5 columns
+ }
+ }
+ if (tableData.Count > previewRows)
+ {
+ Console.WriteLine($" ... and {tableData.Count - previewRows} more row(s)");
+ }
+ }
+
+ // Convert to markdown
+ Console.WriteLine($"\n Converting to Markdown...");
+ try
+ {
+ string markdown = table.ToMarkdown(clean: false, fillEmpty: true);
+ if (!string.IsNullOrEmpty(markdown))
+ {
+ Console.WriteLine($" Markdown length: {markdown.Length} characters");
+ // Save markdown to file
+ string markdownFile = $"table_{i + 1}_page0.md";
+ File.WriteAllText(markdownFile, markdown, Encoding.UTF8);
+ Console.WriteLine($" Markdown saved to: {markdownFile}");
+
+ // Show preview
+ int previewLength = Math.Min(200, markdown.Length);
+ Console.WriteLine($" Preview (first {previewLength} chars):");
+ Console.WriteLine($" {markdown.Substring(0, previewLength)}...");
+ }
+ }
+ catch (Exception ex)
+ {
+ Console.WriteLine($" Error converting to markdown: {ex.Message}");
+ }
+ }
+ }
+ else
+ {
+ Console.WriteLine("No tables found. Trying with 'lines' strategy...");
+
+ // Test 2: Try with 'lines' strategy (less strict)
+ Console.WriteLine("\n--- Test 2: Get tables with 'lines' strategy ---");
+ tables = Utils.GetTables(
+ page,
+ clip: page.Rect,
+ vertical_strategy: "lines",
+ horizontal_strategy: "lines");
+
+ Console.WriteLine($"Found {tables.Count} table(s) with 'lines' strategy");
+ }
+
+ // Test 3: Try with 'text' strategy
+ Console.WriteLine("\n--- Test 3: Get tables with 'text' strategy ---");
+ List
textTables = Utils.GetTables(
+ page,
+ clip: page.Rect,
+ vertical_strategy: "text",
+ horizontal_strategy: "text");
+
+ Console.WriteLine($"Found {textTables.Count} table(s) with 'text' strategy");
+
+ // Test 4: Get tables from all pages
+ Console.WriteLine("\n--- Test 4: Get tables from all pages ---");
+ int totalTables = 0;
+ for (int pageNum = 0; pageNum < doc.PageCount; pageNum++)
+ {
+ Page currentPage = doc[pageNum];
+ List
pageTables = Utils.GetTables(
+ currentPage,
+ clip: currentPage.Rect,
+ vertical_strategy: "lines_strict",
+ horizontal_strategy: "lines_strict");
+
+ if (pageTables.Count > 0)
+ {
+ Console.WriteLine($" Page {pageNum}: {pageTables.Count} table(s)");
+ totalTables += pageTables.Count;
+ }
+ currentPage.Dispose();
+ }
+ Console.WriteLine($"Total tables found across all pages: {totalTables}");
+
+ page.Dispose();
+ doc.Close();
+
+ Console.WriteLine("\n=== TestTable completed successfully ===");
+ }
+ catch (Exception ex)
+ {
+ Console.WriteLine($"Error in TestTable: {ex.Message}");
+ Console.WriteLine($"Stack trace: {ex.StackTrace}");
+ throw;
+ }
+ }
+
+ /// <summary>
+ /// Converts the national-capitals sample PDF to Markdown with MuPdfRag.ToMarkdown in five
+ /// configurations (all options spelled out, IdentifyHeaders, TocHeaders, page separators,
+ /// progress bar) and writes each result to its own .md file in the working directory.
+ /// Every configuration runs in its own try/catch, so one failure is logged and the
+ /// remaining configurations still run. Unexpected errors are logged, not rethrown.
+ /// </summary>
+ static void TestPyMuPdfRagToMarkdown()
+ {
+ Console.WriteLine("\n=== TestPyMuPdfRagToMarkdown =======================");
+
+ try
+ {
+ // Sample PDF; the relative path is resolved against the current working directory
+ string testFilePath = Path.GetFullPath("../../../TestDocuments/national-capitals.pdf");
+
+ Document doc = new Document(testFilePath);
+ try
+ {
+ Console.WriteLine($"Document loaded: {doc.PageCount} page(s)");
+ Console.WriteLine($"Document name: {doc.Name}");
+
+ // Test 1: Basic ToMarkdown with every option passed explicitly
+ Console.WriteLine("\n--- Test 1: Basic ToMarkdown (default settings) ---");
+ try
+ {
+ string markdown = MuPdfRag.ToMarkdown(
+ doc,
+ pages: null, // All pages
+ hdrInfo: null, // Auto-detect headers
+ writeImages: false,
+ embedImages: false,
+ ignoreImages: false,
+ ignoreGraphics: false,
+ detectBgColor: true,
+ imagePath: "",
+ imageFormat: "png",
+ imageSizeLimit: 0.05f,
+ filename: testFilePath,
+ forceText: true,
+ pageChunks: false,
+ pageSeparators: false,
+ margins: null,
+ dpi: 150,
+ pageWidth: 612,
+ pageHeight: null,
+ tableStrategy: "lines_strict",
+ graphicsLimit: null,
+ fontsizeLimit: 3.0f,
+ ignoreCode: false,
+ extractWords: false,
+ showProgress: false,
+ useGlyphs: false,
+ ignoreAlpha: false
+ );
+
+ string markdownFile = "TestPyMuPdfRag_Output.md";
+ File.WriteAllText(markdownFile, markdown, Encoding.UTF8);
+ Console.WriteLine($"Markdown output saved to: {markdownFile}");
+ Console.WriteLine($"Markdown length: {markdown.Length} characters");
+ if (markdown.Length > 0)
+ {
+ // Print at most the first 300 characters as a preview
+ int previewLength = Math.Min(300, markdown.Length);
+ Console.WriteLine($"Preview (first {previewLength} chars):\n{markdown.Substring(0, previewLength)}...");
+ }
+ }
+ catch (Exception ex)
+ {
+ Console.WriteLine($"Error in basic ToMarkdown: {ex.Message}");
+ }
+
+ // Test 2: ToMarkdown with IdentifyHeaders
+ Console.WriteLine("\n--- Test 2: ToMarkdown with IdentifyHeaders ---");
+ try
+ {
+ var identifyHeaders = new IdentifyHeaders(doc, pages: null, bodyLimit: 12.0f, maxLevels: 6);
+ string markdown = MuPdfRag.ToMarkdown(
+ doc,
+ pages: new List<int> { 0 }, // First page only
+ hdrInfo: identifyHeaders,
+ writeImages: false,
+ embedImages: false,
+ ignoreImages: false,
+ filename: testFilePath,
+ forceText: true,
+ showProgress: false
+ );
+
+ string markdownFile = "TestPyMuPdfRag_WithHeaders.md";
+ File.WriteAllText(markdownFile, markdown, Encoding.UTF8);
+ Console.WriteLine($"Markdown with headers saved to: {markdownFile}");
+ Console.WriteLine($"Markdown length: {markdown.Length} characters");
+ }
+ catch (Exception ex)
+ {
+ Console.WriteLine($"Error in ToMarkdown with IdentifyHeaders: {ex.Message}");
+ }
+
+ // Test 3: ToMarkdown with TocHeaders
+ Console.WriteLine("\n--- Test 3: ToMarkdown with TocHeaders ---");
+ try
+ {
+ var tocHeaders = new TocHeaders(doc);
+ string markdown = MuPdfRag.ToMarkdown(
+ doc,
+ pages: new List<int> { 0 }, // First page only
+ hdrInfo: tocHeaders,
+ writeImages: false,
+ embedImages: false,
+ ignoreImages: false,
+ filename: testFilePath,
+ forceText: true,
+ showProgress: false
+ );
+
+ string markdownFile = "TestPyMuPdfRag_WithToc.md";
+ File.WriteAllText(markdownFile, markdown, Encoding.UTF8);
+ Console.WriteLine($"Markdown with TOC headers saved to: {markdownFile}");
+ Console.WriteLine($"Markdown length: {markdown.Length} characters");
+ }
+ catch (Exception ex)
+ {
+ Console.WriteLine($"Error in ToMarkdown with TocHeaders: {ex.Message}");
+ }
+
+ // Test 4: ToMarkdown with page separators
+ Console.WriteLine("\n--- Test 4: ToMarkdown with page separators ---");
+ try
+ {
+ string markdown = MuPdfRag.ToMarkdown(
+ doc,
+ pages: null, // All pages
+ hdrInfo: null,
+ writeImages: false,
+ embedImages: false,
+ ignoreImages: false,
+ filename: testFilePath,
+ forceText: true,
+ pageSeparators: true, // Add page separators
+ showProgress: false
+ );
+
+ string markdownFile = "TestPyMuPdfRag_WithSeparators.md";
+ File.WriteAllText(markdownFile, markdown, Encoding.UTF8);
+ Console.WriteLine($"Markdown with page separators saved to: {markdownFile}");
+ Console.WriteLine($"Markdown length: {markdown.Length} characters");
+ }
+ catch (Exception ex)
+ {
+ Console.WriteLine($"Error in ToMarkdown with page separators: {ex.Message}");
+ }
+
+ // Test 5: ToMarkdown with progress bar
+ Console.WriteLine("\n--- Test 5: ToMarkdown with progress bar ---");
+ try
+ {
+ string markdown = MuPdfRag.ToMarkdown(
+ doc,
+ pages: null, // All pages
+ hdrInfo: null,
+ writeImages: false,
+ embedImages: false,
+ ignoreImages: false,
+ filename: testFilePath,
+ forceText: true,
+ showProgress: true, // Show progress bar
+ pageSeparators: false
+ );
+
+ string markdownFile = "TestPyMuPdfRag_WithProgress.md";
+ File.WriteAllText(markdownFile, markdown, Encoding.UTF8);
+ Console.WriteLine($"\nMarkdown with progress saved to: {markdownFile}");
+ Console.WriteLine($"Markdown length: {markdown.Length} characters");
+ }
+ catch (Exception ex)
+ {
+ Console.WriteLine($"Error in ToMarkdown with progress: {ex.Message}");
+ }
+ }
+ finally
+ {
+ // Close the document even when a statement outside the per-test try blocks throws
+ doc.Close();
+ }
+ }
+ catch (Exception ex)
+ {
+ Console.WriteLine($"An unexpected error occurred during PyMuPdfRag test: {ex.Message}");
+ Console.WriteLine($"Stack trace: {ex.StackTrace}");
+ }
+
+ Console.WriteLine("\n=== TestPyMuPdfRagToMarkdown Completed =======================");
+ }
+
+ /// <summary>
+ /// Prints the MuPDF4LLM version information, then opens a sample PDF and reports its page count.
+ /// The test is skipped with a message when neither sample file exists. Errors are logged, not rethrown.
+ /// </summary>
+ static void TestLLM()
+ {
+ Console.WriteLine("\n=== TestLLM =======================");
+
+ try
+ {
+ // Display version information
+ Console.WriteLine($"MuPDF.NET4LLM Version: {MuPDF4LLM.Version}");
+ var versionTuple = MuPDF4LLM.VersionTuple;
+ Console.WriteLine($"Version Tuple: ({versionTuple.major}, {versionTuple.minor}, {versionTuple.patch})");
+
+ // Preferred sample PDF; the relative path is resolved against the current working directory
+ string testFilePath = Path.GetFullPath("../../../TestDocuments/national-capitals.pdf");
+
+ // Fall back to Widget.pdf when the preferred sample is missing
+ if (!File.Exists(testFilePath))
+ {
+ testFilePath = Path.GetFullPath("../../../TestDocuments/Widget.pdf");
+ }
+
+ if (!File.Exists(testFilePath))
+ {
+ Console.WriteLine($"Test PDF file not found. Skipping LLM test.");
+ return;
+ }
+
+ Console.WriteLine($"\nTesting with PDF: {testFilePath}");
+
+ Document doc = new Document(testFilePath);
+ try
+ {
+ Console.WriteLine($"Document loaded: {doc.PageCount} page(s)");
+ }
+ finally
+ {
+ // Close the document even if reading its page count throws
+ doc.Close();
+ }
+ Console.WriteLine("\nLLM test completed successfully.");
+ }
+ catch (Exception ex)
+ {
+ Console.WriteLine($"Error in TestLLM: {ex.Message}");
+ Console.WriteLine($"Stack trace: {ex.StackTrace}");
+ }
+ }
+
static void TestIssue1880()
{
Console.WriteLine("\n=== TestIssue1880 =======================");
diff --git a/Demo/TestDocuments/Magazine.pdf b/Demo/TestDocuments/Magazine.pdf
new file mode 100644
index 0000000..c8e166e
Binary files /dev/null and b/Demo/TestDocuments/Magazine.pdf differ
diff --git a/Demo/TestDocuments/national-capitals.pdf b/Demo/TestDocuments/national-capitals.pdf
new file mode 100644
index 0000000..d2b4721
Binary files /dev/null and b/Demo/TestDocuments/national-capitals.pdf differ
diff --git a/Demo/annotations-freetext2.cs b/Demo/annotations-freetext2.cs
index fa5493c..fb3a0e3 100644
--- a/Demo/annotations-freetext2.cs
+++ b/Demo/annotations-freetext2.cs
@@ -25,7 +25,7 @@ public static void Run(string[] args)
// the annotation text with HTML and styling syntax
string text = $@"
-PyMuPDF འདི་ ཡིག་ཆ་བཀྲམ་སྤེལ་གྱི་དོན་ལུ་ པའི་ཐོན་ཐུམ་སྒྲིལ་དྲག་ཤོས་དང་མགྱོགས་ཤོས་ཅིག་ཨིན།
+MuPDF.NET འདི་ ཡིག་ཆ་བཀྲམ་སྤེལ་གྱི་དོན་ལུ་ པའི་ཐོན་ཐུམ་སྒྲིལ་དྲག་ཤོས་དང་མགྱོགས་ཤོས་ཅིག་ཨིན།Here is some bold and italic text, followed by bold-italic. Text-based check boxes: {bullet}.
";
diff --git a/MuPDF.NET.Test/AnnotTest.cs b/MuPDF.NET.Test/AnnotTest.cs
index f34a372..9573c38 100644
--- a/MuPDF.NET.Test/AnnotTest.cs
+++ b/MuPDF.NET.Test/AnnotTest.cs
@@ -334,7 +334,7 @@ public void TestRichText()
string bullet = "\u2610\u2611\u2612"; // Output: ☐☑☒;
string text = $@"
-PyMuPDF འདི་ ཡིག་ཆ་བཀྲམ་སྤེལ་གྱི་དོན་ལུ་ པའི་ཐོན་ཐུམ་སྒྲིལ་དྲག་ཤོས་དང་མགྱོགས་ཤོས་ཅིག་ཨིན།
+MuPDF.NET འདི་ ཡིག་ཆ་བཀྲམ་སྤེལ་གྱི་དོན་ལུ་ པའི་ཐོན་ཐུམ་སྒྲིལ་དྲག་ཤོས་དང་མགྱོགས་ཤོས་ཅིག་ཨིན།Here is some bold and italic text, followed by bold-italic. Text-based check boxes: {bullet}.
";
diff --git a/MuPDF.NET.Test/GeneralTest.cs b/MuPDF.NET.Test/GeneralTest.cs
index 2c63deb..82dfe02 100644
--- a/MuPDF.NET.Test/GeneralTest.cs
+++ b/MuPDF.NET.Test/GeneralTest.cs
@@ -486,14 +486,6 @@ assert repr(ee) == expected, f'Expected {expected=} but got {repr(ee)=}.'
{
Console.WriteLine($"test_2548(): {Utils.MUPDF_WARNINGS_STORE[i]}");
}
-
- // This checks that PyMuPDF 1.23.7 fixes this bug, and also that earlier
- // versions with updated MuPDF also fix the bug.
- //rebased = hasattr(pymupdf, 'mupdf')
- //expected = 'format error: cycle in structure tree\nstructure tree broken, assume tree is missing'
- //if rebased:
- // assert wt == expected, f'expected:\n {expected!r}\nwt:\n {wt!r}\n'
- //assert not e
}
[Test]
diff --git a/MuPDF.NET.Test/resources/test_1645_expected.pdf b/MuPDF.NET.Test/resources/test_1645_expected.pdf
index 1196788..55f59f4 100644
Binary files a/MuPDF.NET.Test/resources/test_1645_expected.pdf and b/MuPDF.NET.Test/resources/test_1645_expected.pdf differ
diff --git a/MuPDF.NET.sln b/MuPDF.NET.sln
index 3a66542..755bc8f 100644
--- a/MuPDF.NET.sln
+++ b/MuPDF.NET.sln
@@ -1,4 +1,4 @@
-
+
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.14.36511.14
@@ -9,6 +9,10 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MuPDF.NET.Test", "MuPDF.NET
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Demo", "Demo\Demo.csproj", "{D1CCB24F-A868-F185-9228-8CC249247C79}"
EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MuPDF.NET4LLM", "MuPDF.NET4LLM\MuPDF.NET4LLM.csproj", "{9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "MuPDF.NET4LLM.Test", "MuPDF.NET4LLM.Test\MuPDF.NET4LLM.Test.csproj", "{5498436C-E1C0-418D-9DA3-0460A3C15953}"
+EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -55,6 +59,30 @@ Global
{D1CCB24F-A868-F185-9228-8CC249247C79}.Release|x64.Build.0 = Release|x64
{D1CCB24F-A868-F185-9228-8CC249247C79}.Release|x86.ActiveCfg = Release|x86
{D1CCB24F-A868-F185-9228-8CC249247C79}.Release|x86.Build.0 = Release|x86
+ {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Debug|x64.ActiveCfg = Debug|x64
+ {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Debug|x64.Build.0 = Debug|x64
+ {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Debug|x86.ActiveCfg = Debug|x86
+ {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Debug|x86.Build.0 = Debug|x86
+ {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Release|Any CPU.Build.0 = Release|Any CPU
+ {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Release|x64.ActiveCfg = Release|x64
+ {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Release|x64.Build.0 = Release|x64
+ {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Release|x86.ActiveCfg = Release|x86
+ {9EC37CFF-ACB3-4212-B6C3-0DAC013BADDA}.Release|x86.Build.0 = Release|x86
+ {5498436C-E1C0-418D-9DA3-0460A3C15953}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {5498436C-E1C0-418D-9DA3-0460A3C15953}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {5498436C-E1C0-418D-9DA3-0460A3C15953}.Debug|x64.ActiveCfg = Debug|Any CPU
+ {5498436C-E1C0-418D-9DA3-0460A3C15953}.Debug|x64.Build.0 = Debug|Any CPU
+ {5498436C-E1C0-418D-9DA3-0460A3C15953}.Debug|x86.ActiveCfg = Debug|Any CPU
+ {5498436C-E1C0-418D-9DA3-0460A3C15953}.Debug|x86.Build.0 = Debug|Any CPU
+ {5498436C-E1C0-418D-9DA3-0460A3C15953}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {5498436C-E1C0-418D-9DA3-0460A3C15953}.Release|Any CPU.Build.0 = Release|Any CPU
+ {5498436C-E1C0-418D-9DA3-0460A3C15953}.Release|x64.ActiveCfg = Release|Any CPU
+ {5498436C-E1C0-418D-9DA3-0460A3C15953}.Release|x64.Build.0 = Release|Any CPU
+ {5498436C-E1C0-418D-9DA3-0460A3C15953}.Release|x86.ActiveCfg = Release|Any CPU
+ {5498436C-E1C0-418D-9DA3-0460A3C15953}.Release|x86.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
diff --git a/MuPDF.NET/Document.cs b/MuPDF.NET/Document.cs
index 0622056..864e4d0 100644
--- a/MuPDF.NET/Document.cs
+++ b/MuPDF.NET/Document.cs
@@ -1685,7 +1685,7 @@ private void _DeletePage(int pno)
/// Create a table of contents.
///
/// a bool to control output.
- /// Returns a list, where each entry consists of outline level, title, page number and link destination (if simple = False). For details see PyMuPDF's documentation.
+ /// Returns a list, where each entry consists of outline level, title, page number and link destination (if simple = False). For details see MuPDF's documentation.
///
public List GetToc(bool simple = true)
{
@@ -5950,6 +5950,23 @@ public void Bake(bool annots = true, bool widgets = true)
pdf.Dispose();
}
+ /// <summary>
+ /// Releases the native document: disposes the outline (if any), calls ResetPageRefs(),
+ /// marks the document closed, replaces GraftMaps with an empty dictionary and disposes
+ /// the native handle. Calling it on an already-closed document is a no-op.
+ /// </summary>
+ public void Dispose()
+ {
+ // Dispose must be safe to call repeatedly and must not throw, so a closed document is ignored
+ if (IsClosed)
+ return;
+
+ if (Outline != null)
+ {
+ Outline.Dispose();
+ Outline = null;
+ }
+ ResetPageRefs();
+ // Flag the document as closed before the native handle is released
+ IsClosed = true;
+ GraftMaps = new Dictionary();
+ _nativeDocument.Dispose();
+ _nativeDocument = null;
+ }
+
+
public void Close()
{
if (IsClosed)
@@ -6000,7 +6017,7 @@ public int AddOcg(
PdfObj useFor = ocg.pdf_dict_put_dict(new PdfObj("Usage"), 3);
PdfObj ciName = mupdf.mupdf.pdf_new_name("CreatorInfo");
PdfObj creInfo = useFor.pdf_dict_put_dict(ciName, 2);
- creInfo.pdf_dict_put_text_string(new PdfObj("Creator"), "PyMuPDF");
+ creInfo.pdf_dict_put_text_string(new PdfObj("Creator"), "MuPDF");
if (!string.IsNullOrEmpty(usage))
creInfo.pdf_dict_put_name(new PdfObj("Subtype"), usage);
diff --git a/MuPDF.NET/Page.cs b/MuPDF.NET/Page.cs
index 1f192d9..83bf2cd 100644
--- a/MuPDF.NET/Page.cs
+++ b/MuPDF.NET/Page.cs
@@ -1,4 +1,4 @@
-using mupdf;
+using mupdf;
using SkiaSharp;
using System;
using System.Collections;
@@ -10,7 +10,6 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Text;
-using static MuPDF.NET.Global;
using static System.Net.Mime.MediaTypeNames;
namespace MuPDF.NET
diff --git a/MuPDF.NET/Table.cs b/MuPDF.NET/Table.cs
index 5fbcbd3..3d432f2 100644
--- a/MuPDF.NET/Table.cs
+++ b/MuPDF.NET/Table.cs
@@ -1,2007 +1,2296 @@
-using System;
+/*
+Copyright (C) 2023 Artifex Software, Inc.
+
+This file is part of MuPDF.NET.
+
+MuPDF.NET is free software: you can redistribute it and/or modify it under the
+terms of the GNU Affero General Public License as published by the Free
+Software Foundation, either version 3 of the License, or (at your option)
+any later version.
+
+MuPDF.NET is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+details.
+
+You should have received a copy of the GNU Affero General Public License
+along with MuPDF. If not, see https://www.gnu.org/licenses/agpl-3.0.en.html
+
+Alternative licensing terms are available from the licensor.
+For commercial licensing, see https://www.artifex.com/ or contact
+Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
+CA 94129, USA, for further information.
+
+---------------------------------------------------------------------
+Portions of this code have been ported from pdfplumber, see
+https://pypi.org/project/pdfplumber/.
+
+The ported code is under the following MIT license:
+
+---------------------------------------------------------------------
+The MIT License (MIT)
+
+Copyright (c) 2015, Jeremy Singer-Vine
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+---------------------------------------------------------------------
+Also see here: https://github.com/jsvine/pdfplumber/blob/stable/LICENSE.txt
+---------------------------------------------------------------------
+
+The porting mainly pertains to files "table.py" and relevant parts of
+"utils/text.py" within pdfplumber's repository on Github.
+With respect to "text.py", we have removed functions or features that are not
+used by table processing. Examples are:
+
+* the text search function
+* simple text extraction
+* text extraction by lines
+
+The original pdfplumber code neither detects nor identifies table headers.
+This MuPDF.NET port adds respective code to the 'Table' class as method '_get_header'.
+This is implemented as new class TableHeader with the properties:
+* bbox: A tuple for the header's bbox
+* cells: A tuple for each bbox of a column header
+* names: A list of strings with column header text
+* external: A bool indicating whether the header is outside the table cells.
+
+*/
+
+using mupdf;
+using System;
using System.Collections.Generic;
-using System.Data;
+using System.Drawing;
using System.Linq;
-using System.Net;
-using System.Reflection;
using System.Text;
-using static MuPDF.NET.Global;
+using System.Text.RegularExpressions;
namespace MuPDF.NET
{
- public class Global
+ // Global state for table processing: shared static buffers plus the MuPDF text-extraction flag constants used by the table code
+ internal static class TableGlobals
{
- public class Edge
- {
- public float x0;
- public float y0;
- public float x1;
- public float y1;
- public float width;
- public float height;
- public Point[] pts;
- public float linewidth;
- public bool stroke;
- public bool fill;
- public bool evenodd;
- public float[] stroking_color;
- public float[] non_stroking_color;
- public string object_type;
- public int page_number;
- public object stroking_pattern;
- public object non_stroking_pattern;
- public float top;
- public float bottom;
- public float doctop;
- public string orientation;
- }
-
- public class Character
- {
- public float adv;
- public float bottom;
- public float doctop;
- public string fontname;
- public float height;
- public Matrix matrix;
- public string ncs;
- public int non_stroking_color;
- public object non_stroking_pattern;
- public string object_type;
- public int page_number;
- public float size;
- public int stroking_color;
- public object stroking_pattern;
- public string text;
- public float top;
- public bool upright;
- public int direction;
- public int rotation;
- public float width;
- public float x0;
- public float x1;
- public float y0;
- public float y1;
- }
-
- // Function to check if the extracted text contains only whitespace characters
- public static bool whiteSpaces_issuperset(string text)
- {
- HashSet whiteSpaces = new HashSet(new[] {
- ' ', '\t', '\n', '\r', '\v', '\f'
- });
- // Check if all characters in the extracted text are whitespace characters
- return text.All(c => whiteSpaces.Contains(c));
- }
+ internal static List EDGES = new List(); // vector graphics from MuPDF; NOTE(review): mutable static with no locking visible here - confirm callers process one page at a time
+ internal static List CHARS = new List(); // text characters from MuPDF; NOTE(review): same shared-state caveat as EDGES
+ internal static TextPage TEXTPAGE = null; // textpage for cell text extraction; null until some caller assigns it (assignment not shown in this class)
+
+ // Constants matching Python implementation from __init__.py
+ internal static readonly HashSet WHITE_SPACES = new HashSet { ' ', '\t', '\n', '\r', '\f', '\v' }; // space, tab, line feed, carriage return, form feed, vertical tab
+ // From __init__.py: TEXT_FONT_BOLD = 16, but for char flags use FZ_STEXT_BOLD
+ internal static readonly int TEXT_BOLD = (int)mupdf.mupdf.FZ_STEXT_BOLD;
+ // From __init__.py: TEXT_STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT
+ internal static readonly int TEXT_STRIKEOUT = (int)mupdf.mupdf.FZ_STEXT_STRIKEOUT;
+ // From __init__.py: TEXT_COLLECT_STYLES = mupdf.FZ_STEXT_COLLECT_STYLES
+ internal static readonly int TEXT_COLLECT_STYLES = (int)mupdf.mupdf.FZ_STEXT_COLLECT_STYLES;
+ // From __init__.py: TEXT_COLLECT_VECTORS = mupdf.FZ_STEXT_COLLECT_VECTORS
+ internal static readonly int TEXT_COLLECT_VECTORS = (int)mupdf.mupdf.FZ_STEXT_COLLECT_VECTORS;
+ // From __init__.py: TEXT_SEGMENT = mupdf.FZ_STEXT_SEGMENT
+ internal static readonly int TEXT_SEGMENT = (int)mupdf.mupdf.FZ_STEXT_SEGMENT;
+ // From table.py FLAGS: TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES | TEXT_ACCURATE_BBOXES | TEXT_MEDIABOX_CLIP (bitwise OR of the four flag values)
+ internal static readonly int FLAGS =
+ (int)TextFlagsExtension.TEXTFLAGS_TEXT |
+ TEXT_COLLECT_STYLES | // must stay declared above FLAGS: static field initializers run in textual order
+ (int)TextFlags.TEXT_ACCURATE_BBOXES |
+ (int)TextFlags.TEXT_MEDIABOX_CLIP;
+ // From table.py TABLE_DETECTOR_FLAGS: TEXT_ACCURATE_BBOXES | TEXT_SEGMENT | TEXT_COLLECT_VECTORS | TEXT_MEDIABOX_CLIP (bitwise OR of the four flag values)
+ internal static readonly int TABLE_DETECTOR_FLAGS =
+ (int)TextFlags.TEXT_ACCURATE_BBOXES |
+ TEXT_SEGMENT | // likewise depends on TEXT_SEGMENT being initialized earlier in this class
+ TEXT_COLLECT_VECTORS | // likewise depends on TEXT_COLLECT_VECTORS being initialized earlier
+ (int)TextFlags.TEXT_MEDIABOX_CLIP;
+ }
- public class BBox
+ // Constants: names of numeric table settings and the Latin-ligature expansion table
+ internal static class TableConstants
+ {
+ // Setting names; presumably each must be validated as >= 0 by the settings code (usage not shown here - confirm)
+ internal static readonly string[] NON_NEGATIVE_SETTINGS = {
+ "snap_tolerance", "snap_x_tolerance", "snap_y_tolerance",
+ "join_tolerance", "join_x_tolerance", "join_y_tolerance",
+ "edge_min_length", "min_words_vertical", "min_words_horizontal",
+ "intersection_tolerance", "intersection_x_tolerance", "intersection_y_tolerance"
+ };
+
+ // Maps each Latin ligature code point (U+FB00..U+FB06) to its plain-letter expansion.
+ // The keys are written as \u escapes so they cannot be flattened to ASCII by an editor or
+ // text filter, which would turn every entry into a useless identity mapping ("ff" -> "ff").
+ internal static readonly Dictionary<string, string> LIGATURES = new Dictionary<string, string>
{
- public float x0 { get; set; }
- public float top { get; set; }
- public float x1 { get; set; }
- public float bottom { get; set; }
+ { "\uFB00", "ff" },
+ { "\uFB03", "ffi" },
+ { "\uFB04", "ffl" },
+ { "\uFB01", "fi" },
+ { "\uFB02", "fl" },
+ { "\uFB06", "st" },
+ { "\uFB05", "st" }
+ };
+ }
- public BBox(float x0, float top, float x1, float bottom)
- {
- this.x0 = x0;
- this.top = top;
- this.x1 = x1;
- this.bottom = bottom;
- }
+ // Character dictionary structure matching Python implementation: one instance describes one text character; presumably mirrors the keys of pdfplumber's char dict - confirm
+ internal class CharDict
+ {
+ public float adv { get; set; }
+ public float bottom { get; set; }
+ public float doctop { get; set; }
+ public string fontname { get; set; }
+ public float height { get; set; }
+ public Tuple matrix { get; set; }
+ public string ncs { get; set; }
+ public Tuple non_stroking_color { get; set; }
+ public object non_stroking_pattern { get; set; }
+ public string object_type { get; set; }
+ public int page_number { get; set; }
+ public float size { get; set; }
+ public Tuple stroking_color { get; set; }
+ public object stroking_pattern { get; set; }
+ public bool bold { get; set; } // NOTE(review): presumably derived from the TEXT_BOLD char flag - confirm where it is set
+ public string text { get; set; }
+ public float top { get; set; }
+ public bool upright { get; set; }
+ public float width { get; set; }
+ public float x0 { get; set; } // compared against Rect.X0 by TableHelpers.CharsInRect
+ public float x1 { get; set; } // compared against Rect.X1 by TableHelpers.CharsInRect
+ public float y0 { get; set; } // compared against Rect.Y0 by TableHelpers.CharsInRect
+ public float y1 { get; set; } // compared against Rect.Y1 by TableHelpers.CharsInRect
+ }
- // Union method: Combine two rectangles into one that covers both.
- public BBox Union(BBox other)
- {
- float newX0 = Math.Min(this.x0, other.x0);
- float newTop = Math.Min(this.top, other.top);
- float newX1 = Math.Max(this.x1, other.x1);
- float newBottom = Math.Max(this.bottom, other.bottom);
+ // Edge structure for table detection: a plain data holder for one line segment, with no behavior of its own
+ public class Edge
+ {
+ public float x0 { get; set; }
+ public float x1 { get; set; }
+ public float top { get; set; }
+ public float bottom { get; set; }
+ public float width { get; set; } // NOTE(review): presumably x1 - x0; not computed in this class - confirm at the construction site
+ public float height { get; set; } // NOTE(review): presumably bottom - top; not computed in this class - confirm at the construction site
+ public string orientation { get; set; } // "h" or "v"
+ public string object_type { get; set; }
+ public float doctop { get; set; }
+ public int page_number { get; set; }
+ public float y0 { get; set; }
+ public float y1 { get; set; }
+ }
- return new BBox(newX0, newTop, newX1, newBottom);
- }
+ // Helper functions
+ internal static class TableHelpers
+ {
+ // rect_in_rect - True when 'inner' lies completely within 'outer'; shared borders count as inside
+ internal static bool RectInRect(Rect inner, Rect outer)
+ {
+ bool fitsHorizontally = outer.X0 <= inner.X0 && inner.X1 <= outer.X1;
+ bool fitsVertically = outer.Y0 <= inner.Y0 && inner.Y1 <= outer.Y1;
+ return fitsHorizontally && fitsVertically;
+ }
- // Overload the |= operator to union two rectangles.
- public static BBox operator |(BBox r1, BBox r2)
- {
- return r1.Union(r2);
- }
+ // chars_in_rect - True when at least one character's box (x0, y0, x1, y1) lies within 'rect', borders included
+ internal static bool CharsInRect(List chars, Rect rect)
+ {
+ return chars.Any(ch =>
+ {
+ bool insideX = rect.X0 <= ch.x0 && ch.x1 <= rect.X1;
+ bool insideY = rect.Y0 <= ch.y0 && ch.y1 <= rect.Y1;
+ return insideX && insideY;
+ });
+ }
- public bool IsEmpty()
- {
- if (x0 == 0 && top == 0 && x1 == 0 && bottom == 0)
- return true;
- return false;
- }
+ // _iou - Intersection over union: overlap area divided by the combined area of both rectangles.
+ // Returns 0 when the rectangles do not overlap or only touch.
+ internal static float Iou(Rect r1, Rect r2)
+ {
+ // Overlap extent along each axis, clamped at zero for disjoint rectangles
+ float overlapWidth = Math.Max(0, Math.Min(r1.X1, r2.X1) - Math.Max(r1.X0, r2.X0));
+ float overlapHeight = Math.Max(0, Math.Min(r1.Y1, r2.Y1) - Math.Max(r1.Y0, r2.Y0));
+ float overlap = overlapWidth * overlapHeight;
+ if (overlap == 0)
+ {
+ return 0;
+ }
+ float firstArea = (r1.X1 - r1.X0) * (r1.Y1 - r1.Y0);
+ float secondArea = (r2.X1 - r2.X0) * (r2.Y1 - r2.Y0);
+ // Union area = sum of both areas minus the part counted twice
+ return overlap / (firstArea + secondArea - overlap);
+ }
- // Override Equals and GetHashCode for Distinct to work correctly
- public override bool Equals(object obj)
- {
- return obj is BBox bbox &&
- x0 == bbox.x0 &&
- top == bbox.top &&
- x1 == bbox.x1 &&
- bottom == bbox.bottom;
- }
+ // intersects_words_h - True when the horizontal line at 'y' passes strictly through the interior
+ // of some word rectangle that lies inside 'bbox'
+ internal static bool IntersectsWordsH(Rect bbox, float y, List wordRects)
+ {
+ return wordRects.Any(word =>
+ {
+ bool insideBbox = RectInRect(word, bbox);
+ return insideBbox && word.Y0 < y && y < word.Y1;
+ });
+ }
- public static int CombineHashCodes(float x0, float top, float x1, float bottom)
- {
- // Start with a prime number to mix in the values.
- int hash = 17;
+ // get_table_dict_from_rect - Placeholder for MuPDF's table structure lookup.
+ // The Python version calls pymupdf.extra.make_table_dict(textpage.this.m_internal, table_dict, rect),
+ // which relies on the native fz_find_table_within_bounds; the equivalent P/Invoke or native
+ // wrapper still has to be written for this port.
+ internal static Dictionary GetTableDictFromRect(TextPage textpage, Rect rect)
+ {
+ // TODO: Implement native interop call to MuPDF's table detection function.
+ // The result feeds make_table_from_bbox, which is called when layout_information finds tables;
+ // for now the dictionary is returned empty.
+ return new Dictionary();
+ }
- // Combine each hash code using XOR and a prime number multiplier.
- hash = hash * 31 + x0.GetHashCode();
- hash = hash * 31 + top.GetHashCode();
- hash = hash * 31 + x1.GetHashCode();
- hash = hash * 31 + bottom.GetHashCode();
+ // make_table_from_bbox - Detect table structure within a given rectangle
+ internal static List MakeTableFromBbox(TextPage textpage, List wordRects, Rect rect)
+ {
+ var cells = new List();
+ var block = GetTableDictFromRect(textpage, rect);
+
+ if (!block.ContainsKey("type") || Convert.ToInt32(block["type"]) != mupdf.mupdf.FZ_STEXT_BLOCK_GRID)
+ return cells;
+
+ var bboxList = block["bbox"] as List